Skip to content

Commit adfc577

Browse files
[OpenMP][CodeExtractor]Add align metadata to load instructions (llvm#131131)
Moving code to another function can lead to missed optimization opportunities, because function passes operate on smaller chunks of code and cannot see all of the details. One example of information that is lost after code extraction is pointer alignment. The instruction combine pass adds information about pointer alignment to LLVM intrinsic memcpy calls if it can deduce it from the code or if align metadata is present. If this information is not present, then further optimization passes can generate inefficient code. If we add align metadata to extracted pointers, then the instruction combine pass can add the align attribute to the LLVM intrinsic memcpy call and unblock further optimization. Scope of changes: 1. Analyze MLIR map operations. Add information about the alignment of objects that are passed by reference to OpenMP GPU kernels. 2. Propagate alignment information to the helper functions outlined by `CodeExtractor`.
1 parent b122956 commit adfc577

File tree

3 files changed

+176
-5
lines changed

3 files changed

+176
-5
lines changed

llvm/lib/Transforms/Utils/CodeExtractor.cpp

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,21 @@ buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
248248
return Result;
249249
}
250250

251+
/// isAlignmentPreservedForAddrCast - Return true if the cast operation
252+
/// for specified target preserves original alignment
253+
static bool isAlignmentPreservedForAddrCast(const Triple &TargetTriple) {
254+
switch (TargetTriple.getArch()) {
255+
case Triple::ArchType::amdgcn:
256+
case Triple::ArchType::r600:
257+
return true;
258+
// TODO: Add other architectures for which we are certain that alignment
259+
// is preserved during address space cast operations.
260+
default:
261+
return false;
262+
}
263+
return false;
264+
}
265+
251266
CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
252267
bool AggregateArgs, BlockFrequencyInfo *BFI,
253268
BranchProbabilityInfo *BPI, AssumptionCache *AC,
@@ -1612,8 +1627,42 @@ void CodeExtractor::emitFunctionBody(
16121627
Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx);
16131628
GetElementPtrInst *GEP = GetElementPtrInst::Create(
16141629
StructArgTy, AggArg, Idx, "gep_" + inputs[i]->getName(), newFuncRoot);
1615-
RewriteVal = new LoadInst(StructArgTy->getElementType(aggIdx), GEP,
1616-
"loadgep_" + inputs[i]->getName(), newFuncRoot);
1630+
LoadInst *LoadGEP =
1631+
new LoadInst(StructArgTy->getElementType(aggIdx), GEP,
1632+
"loadgep_" + inputs[i]->getName(), newFuncRoot);
1633+
// If we load a pointer, we can add optional !align metadata.
1634+
// The existence of the !align metadata on the instruction tells
1635+
// the optimizer that the value loaded is known to be aligned to
1636+
// a boundary specified by the integer value in the metadata node.
1637+
// Example:
1638+
// %res = load ptr, ptr %input, align 8, !align !align_md_node
1639+
// ^ ^
1640+
// | |
1641+
// alignment of %input address |
1642+
// |
1643+
// alignment of %res object
1644+
if (StructArgTy->getElementType(aggIdx)->isPointerTy()) {
1645+
unsigned AlignmentValue;
1646+
const Triple &TargetTriple =
1647+
newFunction->getParent()->getTargetTriple();
1648+
const DataLayout &DL = header->getDataLayout();
1649+
// Pointers without casting can provide more information about
1650+
// alignment. Use pointers without casts if given target preserves
1651+
// alignment information for the cast operation.
1652+
if (isAlignmentPreservedForAddrCast(TargetTriple))
1653+
AlignmentValue =
1654+
inputs[i]->stripPointerCasts()->getPointerAlignment(DL).value();
1655+
else
1656+
AlignmentValue = inputs[i]->getPointerAlignment(DL).value();
1657+
MDBuilder MDB(header->getContext());
1658+
LoadGEP->setMetadata(
1659+
LLVMContext::MD_align,
1660+
MDNode::get(
1661+
header->getContext(),
1662+
MDB.createConstant(ConstantInt::get(
1663+
Type::getInt64Ty(header->getContext()), AlignmentValue))));
1664+
}
1665+
RewriteVal = LoadGEP;
16171666
++aggIdx;
16181667
} else
16191668
RewriteVal = &*ScalarAI++;

mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "llvm/IR/DebugInfoMetadata.h"
3434
#include "llvm/IR/DerivedTypes.h"
3535
#include "llvm/IR/IRBuilder.h"
36+
#include "llvm/IR/MDBuilder.h"
3637
#include "llvm/IR/ReplaceConstant.h"
3738
#include "llvm/Support/FileSystem.h"
3839
#include "llvm/TargetParser/Triple.h"
@@ -4534,13 +4535,17 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
45344535
builder.restoreIP(allocaIP);
45354536

45364537
omp::VariableCaptureKind capture = omp::VariableCaptureKind::ByRef;
4537-
4538+
LLVM::TypeToLLVMIRTranslator typeToLLVMIRTranslator(
4539+
ompBuilder.M.getContext());
4540+
unsigned alignmentValue = 0;
45384541
// Find the associated MapInfoData entry for the current input
45394542
for (size_t i = 0; i < mapData.MapClause.size(); ++i)
45404543
if (mapData.OriginalValue[i] == input) {
45414544
auto mapOp = cast<omp::MapInfoOp>(mapData.MapClause[i]);
45424545
capture = mapOp.getMapCaptureType();
4543-
4546+
// Get the preferred alignment of the mapped object.
4547+
alignmentValue = typeToLLVMIRTranslator.getPreferredAlignment(
4548+
mapOp.getVarType(), ompBuilder.M.getDataLayout());
45444549
break;
45454550
}
45464551

@@ -4564,9 +4569,34 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
45644569
break;
45654570
}
45664571
case omp::VariableCaptureKind::ByRef: {
4567-
retVal = builder.CreateAlignedLoad(
4572+
llvm::LoadInst *loadInst = builder.CreateAlignedLoad(
45684573
v->getType(), v,
45694574
ompBuilder.M.getDataLayout().getPrefTypeAlign(v->getType()));
4575+
// CreateAlignedLoad creates LLVM IR similar to:
4576+
// %res = load ptr, ptr %input, align 8
4577+
// This LLVM IR does not contain information about alignment
4578+
// of the loaded value. We need to add !align metadata to unblock
4579+
// optimizer. The existence of the !align metadata on the instruction
4580+
// tells the optimizer that the value loaded is known to be aligned to
4581+
// a boundary specified by the integer value in the metadata node.
4582+
// Example:
4583+
// %res = load ptr, ptr %input, align 8, !align !align_md_node
4584+
// ^ ^
4585+
// | |
4586+
// alignment of %input address |
4587+
// |
4588+
// alignment of %res object
4589+
if (v->getType()->isPointerTy() && alignmentValue) {
4590+
llvm::MDBuilder MDB(builder.getContext());
4591+
loadInst->setMetadata(
4592+
llvm::LLVMContext::MD_align,
4593+
llvm::MDNode::get(builder.getContext(),
4594+
MDB.createConstant(llvm::ConstantInt::get(
4595+
llvm::Type::getInt64Ty(builder.getContext()),
4596+
alignmentValue))));
4597+
}
4598+
retVal = loadInst;
4599+
45704600
break;
45714601
}
45724602
case omp::VariableCaptureKind::This:
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
2+
3+
// The aim of this test is to verify that information about the
4+
// alignment of loaded objects is passed to outlined
5+
// functions.
6+
7+
module attributes {llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
8+
omp.private {type = private} @_QFEk_private_i32 : i32
9+
llvm.func @_QQmain() {
10+
%0 = llvm.mlir.constant(1 : i32) : i32
11+
%7 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
12+
%8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr
13+
%12 = llvm.mlir.constant(1 : i64) : i64
14+
%13 = llvm.alloca %12 x i32 {bindc_name = "k"} : (i64) -> !llvm.ptr<5>
15+
%14 = llvm.addrspacecast %13 : !llvm.ptr<5> to !llvm.ptr
16+
%15 = llvm.mlir.constant(1 : i64) : i64
17+
%16 = llvm.alloca %15 x i32 {bindc_name = "b"} : (i64) -> !llvm.ptr<5>
18+
%17 = llvm.addrspacecast %16 : !llvm.ptr<5> to !llvm.ptr
19+
%19 = llvm.mlir.constant(1 : index) : i64
20+
%20 = llvm.mlir.constant(0 : index) : i64
21+
%22 = llvm.mlir.addressof @_QFEa : !llvm.ptr
22+
%25 = llvm.mlir.addressof @_QFECnz : !llvm.ptr
23+
%60 = llvm.getelementptr %8[0, 7, %20, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
24+
%61 = llvm.load %60 : !llvm.ptr -> i64
25+
%62 = llvm.getelementptr %8[0, 7, %20, 1] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
26+
%63 = llvm.load %62 : !llvm.ptr -> i64
27+
%64 = llvm.getelementptr %8[0, 7, %20, 2] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
28+
%65 = llvm.load %64 : !llvm.ptr -> i64
29+
%66 = llvm.sub %63, %19 : i64
30+
%67 = omp.map.bounds lower_bound(%20 : i64) upper_bound(%66 : i64) extent(%63 : i64) stride(%65 : i64) start_idx(%61 : i64) {stride_in_bytes = true}
31+
%68 = llvm.getelementptr %22[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
32+
%69 = omp.map.info var_ptr(%22 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%68 : !llvm.ptr) bounds(%67) -> !llvm.ptr {name = ""}
33+
%70 = omp.map.info var_ptr(%22 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(to) capture(ByRef) members(%69 : [0] : !llvm.ptr) -> !llvm.ptr {name = "a"}
34+
%71 = omp.map.info var_ptr(%17 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "b"}
35+
%72 = omp.map.info var_ptr(%14 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "k"}
36+
%73 = omp.map.info var_ptr(%25 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "nz"}
37+
omp.target map_entries(%70 -> %arg0, %71 -> %arg1, %72 -> %arg2, %73 -> %arg3, %69 -> %arg4 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
38+
%106 = llvm.mlir.constant(0 : index) : i64
39+
%107 = llvm.mlir.constant(13 : i32) : i32
40+
%108 = llvm.mlir.constant(1000 : i32) : i32
41+
%109 = llvm.mlir.constant(1 : i32) : i32
42+
omp.teams {
43+
omp.parallel private(@_QFEk_private_i32 %arg2 -> %arg5 : !llvm.ptr) {
44+
%110 = llvm.mlir.constant(1 : i32) : i32
45+
%111 = llvm.alloca %110 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
46+
%112 = llvm.addrspacecast %111 : !llvm.ptr<5> to !llvm.ptr
47+
omp.distribute {
48+
omp.wsloop {
49+
omp.loop_nest (%arg6) : i32 = (%109) to (%108) inclusive step (%109) {
50+
llvm.store %arg6, %arg5 : i32, !llvm.ptr
51+
%115 = llvm.mlir.constant(48 : i32) : i32
52+
"llvm.intr.memcpy"(%112, %arg0, %115) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
53+
omp.yield
54+
}
55+
} {omp.composite}
56+
} {omp.composite}
57+
omp.terminator
58+
} {omp.composite}
59+
omp.terminator
60+
}
61+
omp.terminator
62+
}
63+
llvm.return
64+
}
65+
llvm.mlir.global internal @_QFEa() {addr_space = 0 : i32} : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {
66+
%6 = llvm.mlir.undef : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
67+
llvm.return %6 : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
68+
}
69+
llvm.mlir.global internal constant @_QFECnz() {addr_space = 0 : i32} : i32 {
70+
%0 = llvm.mlir.constant(1000 : i32) : i32
71+
llvm.return %0 : i32
72+
}
73+
}
74+
75+
// CHECK: call void @__kmpc_distribute_for_static_loop_4u(
76+
// CHECK-SAME: ptr addrspacecast (ptr addrspace(1) @[[GLOB:[0-9]+]] to ptr),
77+
// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOOP_BODY_FUNC_ARG:.*]],
78+
// CHEKC-SAME i32 1000, i32 %1, i32 0, i32 0)
79+
80+
81+
// CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS:[0-9]+]] {
82+
// CHECK: %[[GEP_PTR_0:.*]] = getelementptr { ptr, ptr, ptr }, ptr %[[LOOP_BODY_ARG_PTR]], i32 0, i32 0
83+
// CHECK: %[[INT_PTR:.*]] = load ptr, ptr %[[GEP_PTR_0]], align 8, !align ![[ALIGN_INT:[0-9]+]]
84+
// CHECK: %[[GEP_PTR_1:.*]] = getelementptr { ptr, ptr, ptr }, ptr %[[LOOP_BODY_ARG_PTR]], i32 0, i32 1
85+
// CHECK: %[[STRUCT_PTR_0:.*]] = load ptr, ptr %[[GEP_PTR_1]], align 8, !align ![[ALIGN_STRUCT:[0-9]+]]
86+
// CHECK: %[[GEP_PTR_2:.*]] = getelementptr { ptr, ptr, ptr }, ptr %[[LOOP_BODY_ARG_PTR]], i32 0, i32 2
87+
// CHECK: %[[STRUCT_PTR_1:.*]] = load ptr, ptr %[[GEP_PTR_2]], align 8, !align ![[ALIGN_STRUCT:[0-9]+]]
88+
// CHECK: store i32 %[[DATA_INT:.*]], ptr %[[INT_PTR]], align 4
89+
// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr %[[STRUCT_PTR_0]], ptr %[[STRUCT_PTR_1]], i32 48, i1 false)
90+
91+
// CHECK: ![[ALIGN_STRUCT]] = !{i64 8}
92+
// CHECK: ![[ALIGN_INT]] = !{i64 4}

0 commit comments

Comments
 (0)