Skip to content

Commit 8516f54

Browse files
[AMDGPU] Implement variadic functions by IR lowering (#93362)
This is a mostly-target-independent variadic function optimisation and lowering pass. It is only enabled for AMDGPU in this initial commit. The purpose is to make C-style variadic functions a zero-cost abstraction. They are lowered to equivalent IR which is then amenable to other optimisations. This is inherently slightly target-specific, but much less so than one might expect - the C varargs interface heavily constrains the ABI design divergence. The pass is primarily tested from WebAssembly. This is because wasm has a straightforward variadic lowering strategy, which coincides exactly with what this pass transforms code into, and a struct passing convention with few cases to check. Adding further target conventions is straightforward and is elided from this patch primarily to simplify the review. Implemented in other branches are Linux X86, AMD64, AArch64 and NVPTX. Testing for targets that have existing lowering for va_arg from clang is most efficiently done by checking that clang | opt completely elides the variadic syntax from test cases. The lowering produces a struct for each call site which can be inspected to check that the various alignments and indirections are correct. AMDGPU presently has no variadic support other than some ad hoc printf handling. Combined with the pass being inactive on all other targets, landing this represents a strict increase in capability with zero risk. Testing and refining will continue post-commit. In addition to the compiler tests included here, a self-contained x64 clang/musl toolchain was constructed using the "lowering" instead of the System V ABI and used to build various C programs like lua and libxml2.
1 parent 4c6dd70 commit 8516f54

26 files changed

+4572
-22
lines changed

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,11 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
120120

121121
Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
122122
QualType Ty) const {
123-
llvm_unreachable("AMDGPU does not support varargs");
123+
const bool IsIndirect = false;
124+
const bool AllowHigherAlign = false;
125+
return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
126+
getContext().getTypeInfoInChars(Ty),
127+
CharUnits::fromQuantity(4), AllowHigherAlign);
124128
}
125129

126130
ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {

clang/test/CodeGen/voidptr-vaarg.c

Lines changed: 478 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
2+
// REQUIRES: webassembly-registered-target
3+
4+
// Simple calls to known variadic functions that are completely elided when
5+
// optimisations are on. This is a functional check that the expand-variadics pass
6+
// is consistent with clang's va_arg handling
7+
8+
// When expand-variadics is added to the default pipeline, clang -O1 will
9+
// suffice here. -Wno-varargs avoids warning second argument to 'va_start' is not
10+
// the last named parameter
11+
12+
// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -Wno-varargs -O1 -emit-llvm -o - | opt - -S --passes='module(expand-variadics,default<O1>)' --expand-variadics-override=optimize -o - | FileCheck %s
13+
14+
#include <stdarg.h>
15+
#include <stdint.h>
16+
17+
template <typename X, typename Y> static X first(...) {
18+
va_list va;
19+
__builtin_va_start(va, 0);
20+
X r = va_arg(va, X);
21+
va_end(va);
22+
return r;
23+
}
24+
25+
template <typename X, typename Y> static Y second(...) {
26+
va_list va;
27+
__builtin_va_start(va, 0);
28+
va_arg(va, X);
29+
Y r = va_arg(va, Y);
30+
va_end(va);
31+
return r;
32+
}
33+
34+
extern "C" {
35+
36+
// CHECK-LABEL: define {{[^@]+}}@first_pair_i32
37+
// CHECK-SAME: (i32 noundef returned [[X:%.*]], i32 noundef [[Y:%.*]])
38+
// CHECK-NEXT: entry:
39+
// CHECK-NEXT: ret i32 [[X]]
40+
//
41+
int first_pair_i32(int x, int y) { return first<int, int>(x, y); }
42+
43+
// CHECK-LABEL: define {{[^@]+}}@second_pair_i32
44+
// CHECK-SAME: (i32 noundef [[X:%.*]], i32 noundef returned [[Y:%.*]])
45+
// CHECK-NEXT: entry:
46+
// CHECK-NEXT: ret i32 [[Y]]
47+
//
48+
int second_pair_i32(int x, int y) { return second<int, int>(x, y); }
49+
50+
// CHECK-LABEL: define {{[^@]+}}@first_pair_f64
51+
// CHECK-SAME: (double noundef returned [[X:%.*]], double noundef [[Y:%.*]])
52+
// CHECK-NEXT: entry:
53+
// CHECK-NEXT: ret double [[X]]
54+
//
55+
double first_pair_f64(double x, double y) {
56+
return first<double, double>(x, y);
57+
}
58+
59+
// CHECK-LABEL: define {{[^@]+}}@second_pair_f64
60+
// CHECK-SAME: (double noundef [[X:%.*]], double noundef returned [[Y:%.*]])
61+
// CHECK-NEXT: entry:
62+
// CHECK-NEXT: ret double [[Y]]
63+
//
64+
double second_pair_f64(double x, double y) {
65+
return second<double, double>(x, y);
66+
}
67+
}
68+
69+
extern "C" {
70+
71+
// CHECK-LABEL: define {{[^@]+}}@first_i32_f64
72+
// CHECK-SAME: (i32 noundef returned [[X:%.*]], double noundef [[Y:%.*]])
73+
// CHECK-NEXT: entry:
74+
// CHECK-NEXT: ret i32 [[X]]
75+
//
76+
int first_i32_f64(int x, double y) { return first<int, double>(x, y); }
77+
78+
// CHECK-LABEL: define {{[^@]+}}@second_i32_f64
79+
// CHECK-SAME: (i32 noundef [[X:%.*]], double noundef returned [[Y:%.*]])
80+
// CHECK-NEXT: entry:
81+
// CHECK-NEXT: ret double [[Y]]
82+
//
83+
double second_i32_f64(int x, double y) { return second<int, double>(x, y); }
84+
85+
// CHECK-LABEL: define {{[^@]+}}@first_f64_i32
86+
// CHECK-SAME: (double noundef returned [[X:%.*]], i32 noundef [[Y:%.*]])
87+
// CHECK-NEXT: entry:
88+
// CHECK-NEXT: ret double [[X]]
89+
//
90+
double first_f64_i32(double x, int y) { return first<double, int>(x, y); }
91+
92+
// CHECK-LABEL: define {{[^@]+}}@second_f64_i32
93+
// CHECK-SAME: (double noundef [[X:%.*]], i32 noundef returned [[Y:%.*]])
94+
// CHECK-NEXT: entry:
95+
// CHECK-NEXT: ret i32 [[Y]]
96+
//
97+
int second_f64_i32(double x, int y) { return second<double, int>(x, y); }
98+
}
99+
100+
extern "C" {
101+
typedef uint64_t ulong2 __attribute__((__vector_size__(16), __aligned__(16)));
102+
103+
// CHECK-LABEL: define {{[^@]+}}@first_i32_ulong2
104+
// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]])
105+
// CHECK-NEXT: entry:
106+
// CHECK-NEXT: ret i32 [[X]]
107+
//
108+
int first_i32_ulong2(int x, ulong2 *y) { return first<int, ulong2>(x, *y); }
109+
110+
// CHECK-LABEL: define {{[^@]+}}@second_i32_ulong2
111+
// CHECK-SAME: (i32 noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
112+
// CHECK-NEXT: entry:
113+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]]
114+
// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[TBAA2]]
115+
// CHECK-NEXT: ret void
116+
//
117+
void second_i32_ulong2(int x, ulong2 *y, ulong2 *r) {
118+
*r = second<int, ulong2>(x, *y);
119+
}
120+
121+
// CHECK-LABEL: define {{[^@]+}}@first_ulong2_i32
122+
// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1]] {
123+
// CHECK-NEXT: entry:
124+
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X]], align 16, !tbaa [[TBAA2]]
125+
// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[TBAA2]]
126+
// CHECK-NEXT: ret void
127+
//
128+
void first_ulong2_i32(ulong2 *x, int y, ulong2 *r) {
129+
*r = first<ulong2, int>(*x, y);
130+
}
131+
132+
// CHECK-LABEL: define {{[^@]+}}@second_ulong2_i32
133+
// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]])
134+
// CHECK-NEXT: entry:
135+
// CHECK-NEXT: ret i32 [[Y]]
136+
//
137+
int second_ulong2_i32(ulong2 *x, int y) { return second<ulong2, int>(*x, y); }
138+
}
139+
140+
// ascending alignment
141+
typedef struct {
142+
char c;
143+
short s;
144+
int i;
145+
long l;
146+
float f;
147+
double d;
148+
} asc;
149+
150+
extern "C" {
151+
152+
// CHECK-LABEL: define {{[^@]+}}@first_i32_asc
153+
// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]])
154+
// CHECK-NEXT: entry:
155+
// CHECK-NEXT: ret i32 [[X]]
156+
//
157+
int first_i32_asc(int x, asc *y) { return first<int, asc>(x, *y); }
158+
159+
// CHECK-LABEL: define {{[^@]+}}@second_i32_asc
160+
// CHECK-SAME: (i32 noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1]] {
161+
// CHECK-NEXT: entry:
162+
// CHECK-NEXT: tail call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(24) [[R]], ptr noundef nonnull align 1 dereferenceable(24) [[Y]], i32 24, i1 false)
163+
// CHECK-NEXT: ret void
164+
//
165+
void second_i32_asc(int x, asc *y, asc *r) { *r = second<int, asc>(x, *y); }
166+
167+
// CHECK-LABEL: define {{[^@]+}}@first_asc_i32
168+
// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1]] {
169+
// CHECK-NEXT: entry:
170+
// CHECK-NEXT: tail call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(24) [[R]], ptr noundef nonnull align 1 dereferenceable(24) [[X]], i32 24, i1 false)
171+
// CHECK-NEXT: ret void
172+
//
173+
void first_asc_i32(asc *x, int y, asc *r) { *r = first<asc, int>(*x, y); }
174+
175+
// CHECK-LABEL: define {{[^@]+}}@second_asc_i32
176+
// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]])
177+
// CHECK-NEXT: entry:
178+
// CHECK-NEXT: ret i32 [[Y]]
179+
//
180+
int second_asc_i32(asc *x, int y) { return second<asc, int>(*x, y); }
181+
}

libc/config/gpu/entrypoints.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,10 @@ set(TARGET_LIBC_ENTRYPOINTS
181181
libc.src.stdio.fflush
182182
libc.src.stdio.ftell
183183
libc.src.stdio.clearerr
184+
libc.src.stdio.sprintf
185+
libc.src.stdio.snprintf
186+
libc.src.stdio.vsprintf
187+
libc.src.stdio.vsnprintf
184188
libc.src.stdio.puts
185189
libc.src.stdio.fopen
186190
libc.src.stdio.fclose

libc/test/src/__support/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ add_libc_test(
8686
libc.src.__support.uint128
8787
)
8888

89-
# The GPU does not support varargs currently.
90-
if(NOT LIBC_TARGET_OS_IS_GPU)
89+
# NVPTX does not support varargs currently.
90+
if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
9191
add_libc_test(
9292
arg_list_test
9393
SUITE

llvm/include/llvm/IR/InstrTypes.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2128,6 +2128,15 @@ class CallBase : public Instruction {
21282128
return Attrs.getParamStackAlignment(ArgNo);
21292129
}
21302130

2131+
/// Extract the byref type for a call or parameter.
2132+
Type *getParamByRefType(unsigned ArgNo) const {
2133+
if (auto *Ty = Attrs.getParamByRefType(ArgNo))
2134+
return Ty;
2135+
if (const Function *F = getCalledFunction())
2136+
return F->getAttributes().getParamByRefType(ArgNo);
2137+
return nullptr;
2138+
}
2139+
21312140
/// Extract the byval type for a call or parameter.
21322141
Type *getParamByValType(unsigned ArgNo) const {
21332142
if (auto *Ty = Attrs.getParamByValType(ArgNo))

llvm/include/llvm/InitializePasses.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
106106
void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
107107
void initializeExpandPostRAPass(PassRegistry&);
108108
void initializeExpandReductionsPass(PassRegistry&);
109+
void initializeExpandVariadicsPass(PassRegistry &);
109110
void initializeExpandVectorPredicationPass(PassRegistry &);
110111
void initializeExternalAAWrapperPassPass(PassRegistry&);
111112
void initializeFEntryInserterPass(PassRegistry&);
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//===- ExpandVariadics.h - expand variadic functions ------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#ifndef LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
9+
#define LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
10+
11+
#include "llvm/IR/PassManager.h"
12+
13+
namespace llvm {
14+
15+
class Module;
16+
class ModulePass;
17+
class OptimizationLevel;
18+
19+
enum class ExpandVariadicsMode {
20+
Unspecified, // Use the implementation defaults
21+
Disable, // Disable the pass entirely
22+
Optimize, // Optimise without changing ABI
23+
Lowering, // Change variadic calling convention
24+
};
25+
26+
class ExpandVariadicsPass : public PassInfoMixin<ExpandVariadicsPass> {
27+
const ExpandVariadicsMode Mode;
28+
29+
public:
30+
// Operates under passed mode unless overridden on commandline
31+
ExpandVariadicsPass(ExpandVariadicsMode Mode);
32+
33+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
34+
};
35+
36+
ModulePass *createExpandVariadicsPass(ExpandVariadicsMode);
37+
38+
} // end namespace llvm
39+
40+
#endif // LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@
139139
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
140140
#include "llvm/Transforms/IPO/ElimAvailExtern.h"
141141
#include "llvm/Transforms/IPO/EmbedBitcodePass.h"
142+
#include "llvm/Transforms/IPO/ExpandVariadics.h"
142143
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
143144
#include "llvm/Transforms/IPO/FunctionAttrs.h"
144145
#include "llvm/Transforms/IPO/FunctionImport.h"

llvm/lib/Passes/PassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
5959
MODULE_PASS("dxil-upgrade", DXILUpgradePass())
6060
MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
6161
MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
62+
MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable))
6263
MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
6364
MODULE_PASS("function-import", FunctionImportPass())
6465
MODULE_PASS("globalopt", GlobalOptPass())

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
5858
#include "llvm/Transforms/IPO.h"
5959
#include "llvm/Transforms/IPO/AlwaysInliner.h"
60+
#include "llvm/Transforms/IPO/ExpandVariadics.h"
6061
#include "llvm/Transforms/IPO/GlobalDCE.h"
6162
#include "llvm/Transforms/IPO/Internalize.h"
6263
#include "llvm/Transforms/Scalar.h"
@@ -992,6 +993,10 @@ void AMDGPUPassConfig::addIRPasses() {
992993
if (isPassEnabled(EnableImageIntrinsicOptimizer))
993994
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
994995

996+
// This can be disabled by passing ::Disable here or on the command line
997+
// with --expand-variadics-override=disable.
998+
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
999+
9951000
// Function calls are not supported, so make sure we inline everything.
9961001
addPass(createAMDGPUAlwaysInlinePass());
9971002
addPass(createAlwaysInlinerLegacyPass());

llvm/lib/Transforms/IPO/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo
1212
DeadArgumentElimination.cpp
1313
ElimAvailExtern.cpp
1414
EmbedBitcodePass.cpp
15+
ExpandVariadics.cpp
1516
ExtractGV.cpp
1617
ForceFunctionAttrs.cpp
1718
FunctionAttrs.cpp

0 commit comments

Comments
 (0)