llvm · jcogan-nv · Mar 19, 2025 · Mar 24, 2025 · jcogan-nv · Apr 1, 2025
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -3111,21 +3111,39 @@ bool IRTranslator::translateAlloca(const User &U,
       getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, DL->getTypeAllocSize(Ty)));
   MIRBuilder.buildMul(AllocSize, NumElts, TySize);
 
-  // Round the size of the allocation up to the stack alignment size
-  // by add SA-1 to the size. This doesn't overflow because we're computing
-  // an address inside an alloca.
-  Align StackAlign = MF->getSubtarget().getFrameLowering()->getStackAlign();
-  auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign.value() - 1);
-  auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne,
-                                      MachineInstr::NoUWrap);
-  auto AlignCst =
-      MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign.value() - 1));
-  auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
-
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  Align StackAlign = TFI->getStackAlign();
   Align Alignment = std::max(AI.getAlign(), DL->getPrefTypeAlign(Ty));
-  if (Alignment <= StackAlign)
+
+  // If the stack alignment is at least as strict as the alloca's alignment,
+  // ignore the alloca's alignment. We will align the size of the alloca to the
+  // stack alignment, which guarantees that the alloca's alignment is satisfied.
+  bool IsUnderAligned = Alignment <= StackAlign;
+  if (IsUnderAligned)
     Alignment = Align(1);
-  MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Alignment);
+
+  // If the stack grows up, adding the alloca's size to SP without padding may
+  // leave SP not aligned (to the stack alignment) after the alloca because we
+  // align SP (to the stack align or alloca align) *before* adding the alloca
+  // size. On the other hand, if the stack grows down, we will align SP *after*
+  // decrementing it, so there is no need to pad the size.
+  if (TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ||
+      IsUnderAligned) {
+    // Round the size of the allocation up to a multiple of the stack
+    // alignment by adding SA-1 and masking off the low bits. The add doesn't
+    // overflow because we're computing an address inside an alloca.
+    auto SAMinusOne =
+        MIRBuilder.buildConstant(IntPtrTy, StackAlign.value() - 1);
+    auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne,
+                                        MachineInstr::NoUWrap);
+    auto AlignCst =
+        MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign.value() - 1));
+    auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
+
+    MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Alignment);
+  } else {
+    MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AllocSize, Alignment);
+  }
 
   MF->getFrameInfo().CreateVariableSizedObject(Alignment, &AI);
   assert(MF->getFrameInfo().hasVarSizedObjects());

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll b/llvm/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll
@@ -28,11 +28,7 @@ define ptr @test_aligned_alloca(i32 %numelts) {
   ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
   ; CHECK:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
   ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-  ; CHECK:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-  ; CHECK:   [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-  ; CHECK:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-  ; CHECK:   [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-  ; CHECK:   [[DYN_STACKALLOC:%[0-9]+]]:_(p0) = G_DYN_STACKALLOC [[AND]](s64), 32
+  ; CHECK:   [[DYN_STACKALLOC:%[0-9]+]]:_(p0) = G_DYN_STACKALLOC [[MUL]](s64), 32
   ; CHECK:   $x0 = COPY [[DYN_STACKALLOC]](p0)
   ; CHECK:   RET_ReallyLR implicit $x0
   %addr = alloca i8, i32 %numelts, align 32

diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs                                   | FileCheck %s
-; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs                                   | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; Dynamically-sized allocation, needs a loop which can handle any size at
 ; runtime. The final iteration of the loop will temporarily put SP below the
@@ -107,13 +107,20 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    sub x9, sp, #32
 ; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
-; CHECK-NEXT:    add x9, x0, #15
-; CHECK-NEXT:    mov x8, sp
-; CHECK-DAG:     str xzr, [sp]
-; CHECK-DAG:     and x9, x9, #0xfffffffffffffff0
-; CHECK-NOT:     INVALID_TO_BREAK_UP_CHECK_DAG
-; CHECK-DAG:     mov x19, sp
-; CHECK-DAG:     sub x8, x8, x9
+;
+; CHECK-SD-NEXT: add x9, x0, #15
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-DAG:  str xzr, [sp]
+; CHECK-SD-DAG:  and x9, x9, #0xfffffffffffffff0
+; CHECK-SD-NOT:  INVALID_TO_BREAK_UP_CHECK_DAG
+; CHECK-SD-DAG:  mov x19, sp
+; CHECK-SD-DAG:  sub x8, x8, x9
+;
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: str xzr, [sp]
+; CHECK-GI-DAG:  mov x19, sp
+; CHECK-GI-DAG:  sub x8, x8, x0
+;
 ; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
 ; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
@@ -167,13 +174,20 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_3:
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    add x9, x0, #15
-; CHECK-NEXT:    mov x8, sp
-; CHECK-DAG:     ldr xzr, [sp]
-; CHECK-DAG:     and x9, x9, #0xfffffffffffffff0
-; CHECK-NOT:     INVALID_TO_BREAK_UP_CHECK_DAG
-; CHECK-DAG:     mov x19, sp
-; CHECK-DAG:     sub x8, x8, x9
+;
+; CHECK-SD-NEXT: add x9, x0, #15
+; CHECK-SD-NEXT: mov x8, sp
+; CHECK-SD-DAG:  ldr xzr, [sp]
+; CHECK-SD-DAG:  and x9, x9, #0xfffffffffffffff0
+; CHECK-SD-NOT:  INVALID_TO_BREAK_UP_CHECK_DAG
+; CHECK-SD-DAG:  mov x19, sp
+; CHECK-SD-DAG:  sub x8, x8, x9
+;
+; CHECK-GI-NEXT: mov x8, sp
+; CHECK-GI-NEXT: ldr xzr, [sp]
+; CHECK-GI-DAG:  mov x19, sp
+; CHECK-GI-DAG:  sub x8, x8, x0
+;
 ; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
 ; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096