Skip to content

Commit cbc96b9

Browse files
authored
Reapply "[LowerMemIntrinsics] Use correct alignment in residual loop for variable llvm.memcpy" (#98482)
Reverts #98295, which reverted #97998 The failure in the "InOneWeekend" test of the HIP test suite on clang-hip-vega20 (https://lab.llvm.org/buildbot/#/builders/123/builds/1498) seems to be unrelated; I observed it (and a similar failure for the "TheNextWeek" test in the same suite) intermittently on my system, with and without the patch applied. (It occurred in 2 out of 50 repeated runs without the patch and in 1 out of 50 runs with the patch.)
1 parent d601ca3 commit cbc96b9

File tree

2 files changed

+18
-15
lines changed

2 files changed

+18
-15
lines changed

llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,9 @@ void llvm::createMemCpyLoopUnknownSize(
262262
assert((ResLoopOpSize == AtomicElementSize ? *AtomicElementSize : 1) &&
263263
"Store size is expected to match type size");
264264

265+
Align ResSrcAlign(commonAlignment(PartSrcAlign, ResLoopOpSize));
266+
Align ResDstAlign(commonAlignment(PartDstAlign, ResLoopOpSize));
267+
265268
Value *RuntimeResidual = getRuntimeLoopRemainder(DL, PLBuilder, CopyLen,
266269
CILoopOpSize, LoopOpSize);
267270
Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
@@ -303,16 +306,16 @@ void llvm::createMemCpyLoopUnknownSize(
303306
Value *SrcGEP =
304307
ResBuilder.CreateInBoundsGEP(ResLoopOpType, SrcAddr, FullOffset);
305308
LoadInst *Load = ResBuilder.CreateAlignedLoad(ResLoopOpType, SrcGEP,
306-
PartSrcAlign, SrcIsVolatile);
309+
ResSrcAlign, SrcIsVolatile);
307310
if (!CanOverlap) {
308311
// Set alias scope for loads.
309312
Load->setMetadata(LLVMContext::MD_alias_scope,
310313
MDNode::get(Ctx, NewScope));
311314
}
312315
Value *DstGEP =
313316
ResBuilder.CreateInBoundsGEP(ResLoopOpType, DstAddr, FullOffset);
314-
StoreInst *Store = ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign,
315-
DstIsVolatile);
317+
StoreInst *Store =
318+
ResBuilder.CreateAlignedStore(Load, DstGEP, ResDstAlign, DstIsVolatile);
316319
if (!CanOverlap) {
317320
// Indicate that stores don't overlap loads.
318321
Store->setMetadata(LLVMContext::MD_noalias, MDNode::get(Ctx, NewScope));

llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -930,9 +930,9 @@ define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrs
930930
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
931931
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
932932
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
933-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
933+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
934934
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
935-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
935+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
936936
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
937937
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
938938
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -966,9 +966,9 @@ define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrs
966966
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
967967
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
968968
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
969-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 2
969+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
970970
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
971-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 2
971+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
972972
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
973973
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
974974
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1038,9 +1038,9 @@ define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspa
10381038
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
10391039
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
10401040
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
1041-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
1041+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
10421042
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
1043-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
1043+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
10441044
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
10451045
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
10461046
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1074,9 +1074,9 @@ define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspa
10741074
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
10751075
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
10761076
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
1077-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 2
1077+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
10781078
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
1079-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 2
1079+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
10801080
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
10811081
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
10821082
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1146,9 +1146,9 @@ define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrsp
11461146
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
11471147
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
11481148
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]]
1149-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 4
1149+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
11501150
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
1151-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 4
1151+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
11521152
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
11531153
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
11541154
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
@@ -1182,9 +1182,9 @@ define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrsp
11821182
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
11831183
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
11841184
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
1185-
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 4
1185+
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
11861186
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
1187-
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1187+
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
11881188
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
11891189
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
11901190
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]

0 commit comments

Comments
 (0)