Skip to content

Commit 1a53d4b

Browse files
tstellaraaupovpetrhosek
authored
[clang][cmake] Apply bolt optimizations as part of the clang target (#119896)
This change removes the need to call the clang-bolt target in order to apply bolt optimizations to clang. Now running `ninja clang` will build a clang with bolt optimizations, and `ninja check-clang` and `ninja install-clang` will test and install the bolt-optimized clang too. The clang-bolt target has been kept for compatibility reasons, but it is now just an alias for the clang target. Also, this new design for applying the bolt optimizations to clang will be easier to generalize and use to optimize other binaries/libraries in the project. --------- Co-authored-by: Amir Ayupov <[email protected]> Co-authored-by: Petr Hosek <[email protected]>
1 parent 52bffdf commit 1a53d4b

File tree

4 files changed

+151
-77
lines changed

4 files changed

+151
-77
lines changed

clang/CMakeLists.txt

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -872,58 +872,6 @@ if (CLANG_ENABLE_BOOTSTRAP)
872872
endforeach()
873873
endif()
874874

875-
set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
876-
May be specified as Instrument or Perf or LBR to use a particular profiling \
877-
mechanism.")
878-
string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
879-
880-
if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
881-
set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
882-
set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED})
883-
set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata)
884-
885-
# Pass extra flag in no-LBR mode
886-
if (CLANG_BOLT STREQUAL "PERF")
887-
set(BOLT_NO_LBR "-nl")
888-
endif()
889-
890-
if (CLANG_BOLT STREQUAL "INSTRUMENT")
891-
# Instrument clang with BOLT
892-
add_custom_target(clang-instrumented
893-
DEPENDS ${CLANG_INSTRUMENTED}
894-
)
895-
add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
896-
DEPENDS clang llvm-bolt
897-
COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
898-
-instrument --instrumentation-file-append-pid
899-
--instrumentation-file=${BOLT_FDATA}
900-
COMMENT "Instrumenting clang binary with BOLT"
901-
USES_TERMINAL
902-
VERBATIM
903-
)
904-
add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented)
905-
else() # perf or LBR
906-
add_custom_target(clang-bolt-training-deps DEPENDS clang)
907-
endif()
908-
909-
# Optimize original (pre-bolt) Clang using the collected profile
910-
add_custom_target(clang-bolt
911-
DEPENDS clang-bolt-profile
912-
COMMAND ${CMAKE_COMMAND} -E rename $<TARGET_FILE:clang> ${CLANG_PATH}-prebolt
913-
COMMAND ${CMAKE_COMMAND} -E create_symlink ${CLANG_PATH}-prebolt ${CLANG_PATH}++-prebolt
914-
COMMAND llvm-bolt ${CLANG_PATH}-prebolt
915-
-o $<TARGET_FILE:clang>
916-
-data ${BOLT_FDATA}
917-
-reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions
918-
-split-all-cold -split-eh -dyno-stats -use-gnu-stack
919-
-update-debug-sections
920-
${BOLT_NO_LBR}
921-
COMMENT "Optimizing Clang with BOLT"
922-
USES_TERMINAL
923-
VERBATIM
924-
)
925-
endif()
926-
927875
if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
928876
add_subdirectory(utils/ClangVisualizers)
929877
endif()

clang/tools/driver/CMakeLists.txt

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@ if(CLANG_PLUGIN_SUPPORT)
2323
set(support_plugins SUPPORT_PLUGINS)
2424
endif()
2525

26+
# CLANG_BOLT selects whether, and with which profiling mechanism, the clang
# binary is optimized with BOLT. The value is upper-cased below so that it can
# be compared case-insensitively; the default OFF (or any other false value)
# disables BOLT.
set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
May be specified as Instrument or Perf or LBR to use a particular profiling \
mechanism.")
string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)

if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
  # The value is forwarded as `--method` to `perf-helper.py bolt-optimize`,
  # which accepts only these three names. Reject anything else (for example a
  # bare ON) here, at configure time, instead of letting the clang target fail
  # in its post-build step with an argument-parsing error.
  if (NOT CLANG_BOLT MATCHES "^(INSTRUMENT|PERF|LBR)$")
    message(FATAL_ERROR
      "Unsupported CLANG_BOLT value '${CLANG_BOLT}'. "
      "Expected one of Instrument, Perf or LBR (or OFF to disable BOLT).")
  endif()

  # Extra targets that must be built before clang so that the BOLT step has its
  # tools available and starts from a clean profile directory.
  set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj)
  if (NOT CLANG_BOLT STREQUAL "INSTRUMENT")
    # Perf and LBR modes additionally clear old perf data.
    list(APPEND CLANG_BOLT_DEPS clear-perf-data)
  endif()
endif()
37+
2638
add_clang_tool(clang
2739
driver.cpp
2840
cc1_main.cpp
@@ -35,6 +47,7 @@ add_clang_tool(clang
3547
ARMTargetParserTableGen
3648
AArch64TargetParserTableGen
3749
${support_plugins}
50+
${CLANG_BOLT_DEPS}
3851
GENERATE_DRIVER
3952
)
4053

@@ -134,3 +147,42 @@ if(CLANG_ORDER_FILE AND
134147
set_target_properties(clang PROPERTIES LINK_DEPENDS ${CLANG_ORDER_FILE})
135148
endif()
136149
endif()
150+
151+
# When BOLT optimization is requested and this is not an instrumented build,
# attach a post-build step to the clang target that runs
# `perf-helper.py bolt-optimize` on the freshly linked binary.
if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
152+
# Add a clang-bolt target for backwards compatibility. It has no commands of
# its own; building it just builds clang.
153+
add_custom_target(clang-bolt DEPENDS clang)
154+
155+
# File name (not a full path) of the BOLT-instrumented copy of clang.
set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING
156+
"Name of BOLT-instrumented Clang binary")
157+
# Full path of the instrumented binary; passed as --instrumented-output below.
set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED})
158+
# NOTE(review): presumably the binary directory of clang/utils/perf-training,
# reached relative to this directory's binary dir -- confirm the layout.
set(PERF_TRAINING_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../utils/perf-training)
159+
# Profile file handed to the helper as --fdata.
set(BOLT_FDATA ${PERF_TRAINING_BINARY_DIR}/prof.fdata)
160+
# get_llvm_lit_path is defined outside this file; its two outputs are joined
# into the lit path that is passed as --lit.
get_llvm_lit_path(
161+
lit_base_dir
162+
lit_file_name
163+
ALLOW_EXTERNAL
164+
)
165+
set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
166+
167+
# This POST_BUILD command is executed unconditionally even if the clang target
168+
# is already built. We need to wrap the whole bolt optimization process in
169+
# a single python wrapper, so that we can first check if the binary has
170+
# already been optimized and then exit early with a 0 status if it has.
171+
# --method receives the upper-cased CLANG_BOLT value; the helper accepts only
# INSTRUMENT, PERF or LBR.
add_custom_command(
172+
TARGET clang POST_BUILD
173+
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
174+
bolt-optimize
175+
--method ${CLANG_BOLT}
176+
--input $<TARGET_FILE:clang>
177+
--instrumented-output ${CLANG_INSTRUMENTED}
178+
--fdata ${BOLT_FDATA}
179+
--perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
180+
--readelf $<TARGET_FILE:llvm-readobj>
181+
--bolt $<TARGET_FILE:llvm-bolt>
182+
--lit "${LIT_COMMAND}"
183+
--merge-fdata $<TARGET_FILE:merge-fdata>
184+
COMMENT "Optimizing Clang with BOLT"
185+
USES_TERMINAL
186+
VERBATIM
187+
)
188+
endif()

clang/utils/perf-training/CMakeLists.txt

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,6 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD)
8383
endif()
8484

8585
if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
86-
set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING
87-
"Name of BOLT-instrumented Clang binary")
8886
configure_lit_site_cfg(
8987
${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in
9088
${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg
@@ -93,7 +91,7 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
9391
add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang"
9492
${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/
9593
EXCLUDE_FROM_CHECK_ALL
96-
DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data
94+
DEPENDS clear-bolt-fdata clear-perf-data
9795
)
9896

9997
add_custom_target(clear-bolt-fdata
@@ -104,26 +102,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
104102
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
105103
COMMENT "Clearing old perf data")
106104

107-
string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
108-
if (CLANG_BOLT STREQUAL "LBR")
109-
set(BOLT_LBR "--lbr")
110-
endif()
111-
112-
add_custom_target(merge-fdata-deps)
113-
if (CLANG_BOLT STREQUAL "INSTRUMENT")
114-
add_dependencies(merge-fdata-deps generate-bolt-fdata)
115-
else()
116-
# Convert perf profiles into fdata
117-
add_custom_target(convert-perf-fdata
118-
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $<TARGET_FILE:llvm-bolt> ${CMAKE_CURRENT_BINARY_DIR} $<TARGET_FILE:clang> ${BOLT_LBR}
119-
COMMENT "Converting perf files to BOLT fdata"
120-
DEPENDS llvm-bolt generate-bolt-fdata)
121-
add_dependencies(merge-fdata-deps convert-perf-fdata)
122-
endif()
123-
124-
# Merge profiles into one using merge-fdata
125-
add_custom_target(clang-bolt-profile
126-
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR}
127-
COMMENT "Merging BOLT fdata"
128-
DEPENDS merge-fdata merge-fdata-deps)
129105
endif()

clang/utils/perf-training/perf-helper.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import bisect
1717
import shlex
1818
import tempfile
19+
import re
20+
import shutil
1921

2022
test_env = {"PATH": os.environ["PATH"]}
2123

@@ -558,7 +560,103 @@ def genOrderFile(args):
558560
return 0
559561

560562

563+
def _run_and_echo(command):
    """Run ``command``, echo its argv and captured output, and fail loudly.

    stderr is folded into stdout so the output appears in one stream. Raises
    ``subprocess.CalledProcessError`` if the command exits with a non-zero
    status (after its output has been echoed).
    """
    process = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    print(process.args)
    # With text=True, stdout is a single str; write it in one go instead of
    # iterating over it character by character.
    sys.stdout.write(process.stdout)
    process.check_returncode()


def bolt_optimize(args):
    """Optimize a binary in place with BOLT, collecting a profile first.

    ``args`` is the argument list for the ``bolt-optimize`` sub-command:

    --method                    INSTRUMENT, PERF or LBR
    --input                     binary to optimize (rewritten in place)
    --instrumented-output       where to write the instrumented binary
                                (used for INSTRUMENT only)
    --fdata                     path of the merged profile
    --perf-training-binary-dir  directory containing the ``bolt-fdata`` lit
                                suite and the collected profiles
    --readelf, --bolt, --lit, --merge-fdata
                                paths of the tools to run

    Steps: exit early if the input already carries a ``.bolt.org.text``
    section; instrument the input (INSTRUMENT); run the ``bolt-fdata`` lit
    suite; convert perf data (PERF/LBR); merge the profiles; keep a copy of
    the input as ``<input>-prebolt`` and write the optimized binary over the
    input.

    Returns 0 on success. Raises ``subprocess.CalledProcessError`` when one of
    the tools fails.
    """
    parser = argparse.ArgumentParser(prog="perf-helper bolt-optimize")
    parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"])
    parser.add_argument("--input")
    parser.add_argument("--instrumented-output")
    parser.add_argument("--fdata")
    parser.add_argument("--perf-training-binary-dir")
    parser.add_argument("--readelf")
    parser.add_argument("--bolt")
    parser.add_argument("--lit")
    parser.add_argument("--merge-fdata")

    opts = parser.parse_args(args)

    section_headers = subprocess.check_output(
        [opts.readelf, "-WS", opts.input], universal_newlines=True
    )

    # This binary has already been bolt-optimized, so skip further processing.
    if re.search(r"\.bolt\.org\.text", section_headers, re.MULTILINE):
        return 0

    if opts.method == "INSTRUMENT":
        _run_and_echo(
            [
                opts.bolt,
                opts.input,
                "-o",
                opts.instrumented_output,
                "-instrument",
                "--instrumentation-file-append-pid",
                f"--instrumentation-file={opts.fdata}",
            ]
        )

    # Run the training workload that produces the profile data.
    _run_and_echo(
        [
            sys.executable,
            opts.lit,
            os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
        ]
    )

    if opts.method in ["PERF", "LBR"]:
        perf2bolt_args = [opts.bolt, opts.perf_training_binary_dir, opts.input]
        if opts.method == "LBR":
            # The CMake rule this function replaces passed --lbr to the
            # perf2bolt sub-command in LBR mode; keep doing so.
            # NOTE(review): perf2bolt is defined elsewhere in this file and is
            # assumed to still accept --lbr -- confirm.
            perf2bolt_args.append("--lbr")
        perf2bolt(perf2bolt_args)

    merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])

    # Keep the unoptimized binary around; BOLT reads it and overwrites the
    # original path with the optimized result.
    prebolt = f"{opts.input}-prebolt"
    shutil.copy(opts.input, prebolt)

    bolt_command = [
        opts.bolt,
        prebolt,
        "-o",
        opts.input,
        "-data",
        opts.fdata,
        "-reorder-blocks=ext-tsp",
        "-reorder-functions=cdsort",
        "-split-functions",
        "-split-all-cold",
        "-split-eh",
        "-dyno-stats",
        "-use-gnu-stack",
        "-update-debug-sections",
    ]
    if opts.method == "PERF":
        # Extra flag for no-LBR perf profiles. Only append it when needed so
        # that no empty-string argument is ever handed to llvm-bolt.
        bolt_command.append("-nl")
    _run_and_echo(bolt_command)

    return 0
656+
657+
561658
commands = {
659+
"bolt-optimize": bolt_optimize,
562660
"clean": clean,
563661
"merge": merge,
564662
"dtrace": dtrace,

0 commit comments

Comments
 (0)