[X86][MS] Fix the alignment mismatch of vector variable arguments on Win32

phoebewang · phoebewang · commit 9d7d34c7691a · 2021-09-08T09:26:44.000+08:00
The alignment of vector variable arguments on the callee side is 4, which matches MSVC. But the caller aligns them to the size of the vector arguments, which results in runtime failures. This patch fixes the problem by trimming the alignment to 4 bytes for variable arguments on Win32. Fixed vector arguments are passed by pointer on Win32, so they don't have the problem. I couldn't find documentation on MSDN for this calling convention, so I ran several experiments here: https://godbolt.org/z/n1zn1Gx1z Reviewed By: rnk Differential Revision: https://reviews.llvm.org/D108887
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
@@ -23,6 +23,13 @@ class CCIfNotSubtarget<string F, CCAction A>
                        "(State.getMachineFunction().getSubtarget()).", F),
            A>;
 
+/// CCIfIsVarArgOnWin - Match vararg calls in a Windows MSVC environment.
+class CCIfIsVarArgOnWin<CCAction A>
+    : CCIf<"State.isVarArg() && "
+           "State.getMachineFunction().getSubtarget().getTargetTriple()."
+           "isWindowsMSVCEnvironment()",
+           A>;
+
 // Register classes for RegCall
 class RC_X86_RegCall {
   list<Register> GPR_8 = [];
@@ -771,6 +778,22 @@ def CC_X86_32_Vector_Common : CallingConv<[
            CCAssignToStack<64, 64>>
 ]>;
 
+/// CC_X86_Win32_Vector - In X86 Win32 calling conventions, extra vector
+/// values are spilled on the stack.
+def CC_X86_Win32_Vector : CallingConv<[
+  // 128-bit SSE vectors get 16-byte stack slots that are 4-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+           CCAssignToStack<16, 4>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
+           CCAssignToStack<32, 4>>,
+
+  // 512-bit AVX-512 vectors get 64-byte stack slots that are 4-byte aligned.
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
+           CCAssignToStack<64, 4>>
+]>;
+
 // CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in
 // vector registers
 def CC_X86_32_Vector_Standard : CallingConv<[
@@ -787,6 +810,7 @@ def CC_X86_32_Vector_Standard : CallingConv<[
   CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
                 CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>,
 
+  CCIfIsVarArgOnWin<CCDelegateTo<CC_X86_Win32_Vector>>,
   CCDelegateTo<CC_X86_32_Vector_Common>
 ]>;
 
diff --git a/llvm/test/CodeGen/X86/vaargs-win32.ll b/llvm/test/CodeGen/X86/vaargs-win32.ll
@@ -0,0 +1,36 @@
+; RUN: llc -mcpu=generic -mtriple=i686-pc-windows-msvc -mattr=+sse < %s | FileCheck %s --check-prefix=MSVC
+; RUN: llc -mcpu=generic -mtriple=i686-pc-mingw32 -mattr=+sse < %s | FileCheck %s --check-prefix=MINGW
+
+@a = external dso_local global <4 x float>, align 16
+
+define dso_local void @testPastArguments() nounwind {
+; MSVC-LABEL: testPastArguments:
+; MSVC:       # %bb.0: # %entry
+; MSVC-NEXT:    subl $20, %esp
+; MSVC-NEXT:    movaps _a, %xmm0
+; MSVC-NEXT:    movups %xmm0, 4(%esp)
+; MSVC-NEXT:    movl $1, (%esp)
+; MSVC-NEXT:    calll _testm128
+; MSVC-NEXT:    addl $20, %esp
+; MSVC-NEXT:    retl
+;
+; MINGW-LABEL: testPastArguments:
+; MINGW:       # %bb.0: # %entry
+; MINGW-NEXT:    pushl %ebp
+; MINGW-NEXT:    movl %esp, %ebp
+; MINGW-NEXT:    andl $-16, %esp
+; MINGW-NEXT:    subl $48, %esp
+; MINGW-NEXT:    movaps _a, %xmm0
+; MINGW-NEXT:    movaps %xmm0, 16(%esp)
+; MINGW-NEXT:    movl $1, (%esp)
+; MINGW-NEXT:    calll _testm128
+; MINGW-NEXT:    movl %ebp, %esp
+; MINGW-NEXT:    popl %ebp
+; MINGW-NEXT:    retl
+entry:
+  %0 = load <4 x float>, <4 x float>* @a, align 16
+  %call = tail call i32 (i32, ...) @testm128(i32 1, <4 x float> inreg %0)
+  ret void
+}
+
+declare i32 @testm128(i32, ...) nounwind
diff --git a/llvm/test/CodeGen/X86/win32-spill-xmm.ll b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
@@ -20,7 +20,7 @@ declare void @bar(<16 x float> %a, i32 %b)
 ; Check that proper alignment of spilled vector does not affect vargs
 
 ; CHECK-LABEL: vargs_not_affected
-; CHECK: movl 28(%ebp), %eax
+; CHECK: movl 28(%esp), %eax
 define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
 entry:
   %ap = alloca i8*, align 4