Skip to content

Commit 6c84bba

Browse files
authored
[LowerMemIntrinsics] Use correct alignment in residual loop for variable llvm.memcpy (#97998)
Memcpy intrinsics whose copy size is not statically known are lowered to two load/store loops: a main loop whose access width is specified by the target, and a residual loop that copies the remaining bytes individually. As the residual loop operates byte-wise, its accesses are only 1-aligned. However, we currently use the alignment that is optimal for the main loop in both loops, which is unsound: it can claim a stronger alignment for the residual accesses than actually holds. With this patch, we use the correct alignment in the residual loop. The lowering of memcpy with a static size already handles alignments for the residual correctly.
1 parent 4267219 commit 6c84bba

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp

+6-3
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,9 @@ void llvm::createMemCpyLoopUnknownSize(
262262
assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
263263
"Store size is expected to match type size");
264264

265+
Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
266+
Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));
267+
265268
Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
266269
CILoopOpSize, LoopOpSize);
267270
Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
@@ -303,16 +306,16 @@ void llvm::createMemCpyLoopUnknownSize(
303306
Value *SrcGEP =
304307
ResBuilder.CreateInBoundsGEP(ResLoopOpType, SrcAddr, FullOffset);
305308
LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
306-
PartSrcAlign, SrcIsVolatile);
309+
ResSrcAlign, SrcIsVolatile);
307310
if (!CanOverlap) {
308311
// Set alias scope for loads.
309312
Load->setMetadata(LLVMContext::MD_alias_scope,
310313
MDNode::get(Ctx, NewScope));
311314
}
312315
Value *DstGEP =
313316
ResBuilder.CreateInBoundsGEP(ResLoopOpType, DstAddr, FullOffset);
314-
StoreInst *Store = ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
315-
DstIsVolatile);
317+
StoreInst *Store =
318+
ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
316319
if (!CanOverlap) {
317320
// Indicate that stores don't overlap loads.
318321
Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));

llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

+12-12
Original file line numberDiff line numberDiff line change
@@ -930,9 +930,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
930930
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
931931
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
932932
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
933-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
933+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
934934
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
935-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
935+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
936936
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
937937
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
938938
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -966,9 +966,9 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
966966
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
967967
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
968968
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
969-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 2
969+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
970970
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
971-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 2
971+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
972972
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
973973
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
974974
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1038,9 +1038,9 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
10381038
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
10391039
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
10401040
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
1041-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
1041+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
10421042
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
1043-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
1043+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
10441044
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
10451045
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
10461046
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1074,9 +1074,9 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
10741074
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
10751075
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
10761076
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
1077-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 2
1077+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
10781078
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
1079-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 2
1079+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
10801080
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
10811081
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
10821082
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1146,9 +1146,9 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
11461146
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
11471147
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
11481148
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]]
1149-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
1149+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
11501150
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
1151-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
1151+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
11521152
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
11531153
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
11541154
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1182,9 +1182,9 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp
11821182
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
11831183
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
11841184
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
1185-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
1185+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
11861186
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
1187-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1187+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
11881188
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
11891189
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
11901190
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]

0 commit comments

Comments
 (0)