Skip to content

[Clang][CMake] Support perf, LBR, and Instrument CLANG_BOLT options #69133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 30 additions & 14 deletions clang/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -850,23 +850,38 @@ if (CLANG_ENABLE_BOOTSTRAP)
endforeach()
endif()

if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. \
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wild guess: as written, CLANG_BOLT defaults to "INSTRUMENT", which turns BOLT on for every build. Shouldn't it default to an OFF value instead?

May be specified as Instrument or Perf or LBR to use a particular profiling \
mechanism.")
string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)

if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED})
set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata)

# Instrument clang with BOLT
add_custom_target(clang-instrumented
DEPENDS ${CLANG_INSTRUMENTED}
)
add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
DEPENDS clang llvm-bolt
COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
-instrument --instrumentation-file-append-pid
--instrumentation-file=${BOLT_FDATA}
COMMENT "Instrumenting clang binary with BOLT"
VERBATIM
)
# Pass extra flag in no-LBR mode
if (CLANG_BOLT STREQUAL "PERF")
set(BOLT_NO_LBR "-nl")
endif()

if (CLANG_BOLT STREQUAL "INSTRUMENT")
# Instrument clang with BOLT
add_custom_target(clang-instrumented
DEPENDS ${CLANG_INSTRUMENTED}
)
add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
DEPENDS clang llvm-bolt
COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
-instrument --instrumentation-file-append-pid
--instrumentation-file=${BOLT_FDATA}
COMMENT "Instrumenting clang binary with BOLT"
VERBATIM
)
add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented)
else() # perf or LBR
add_custom_target(clang-bolt-training-deps DEPENDS clang)
endif()

# Optimize original (pre-bolt) Clang using the collected profile
set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt)
Expand All @@ -880,6 +895,7 @@ if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
-data ${BOLT_FDATA}
-reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
-split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
${BOLT_NO_LBR}
COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $<TARGET_FILE:clang>
COMMENT "Optimizing Clang with BOLT"
VERBATIM
Expand Down
2 changes: 1 addition & 1 deletion clang/cmake/caches/BOLT.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
set(CLANG_BOLT "INSTRUMENT" CACHE STRING "")
set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")

set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
Expand Down
29 changes: 26 additions & 3 deletions clang/utils/perf-training/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD)
DEPENDS generate-dtrace-logs)
endif()

if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING
"Name of BOLT-instrumented Clang binary")
configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in
${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg
Expand All @@ -71,16 +73,37 @@ if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang"
${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/
EXCLUDE_FROM_CHECK_ALL
DEPENDS clang-instrumented clear-bolt-fdata
DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data
)

add_custom_target(clear-bolt-fdata
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata
COMMENT "Clearing old BOLT fdata")

add_custom_target(clear-perf-data
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
COMMENT "Clearing old perf data")

string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
if (CLANG_BOLT STREQUAL "LBR")
set(BOLT_LBR "--lbr")
endif()

add_custom_target(merge-fdata-deps)
if (CLANG_BOLT STREQUAL "INSTRUMENT")
add_dependencies(merge-fdata-deps generate-bolt-fdata)
else()
# Convert perf profiles into fdata
add_custom_target(convert-perf-fdata
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $<TARGET_FILE:llvm-bolt> ${CMAKE_CURRENT_BINARY_DIR} $<TARGET_FILE:clang> ${BOLT_LBR}
COMMENT "Converting perf files to BOLT fdata"
DEPENDS llvm-bolt generate-bolt-fdata)
add_dependencies(merge-fdata-deps convert-perf-fdata)
endif()

# Merge profiles into one using merge-fdata
add_custom_target(clang-bolt-profile
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR}
COMMENT "Merging BOLT fdata"
DEPENDS merge-fdata generate-bolt-fdata)
DEPENDS merge-fdata merge-fdata-deps)
endif()
47 changes: 39 additions & 8 deletions clang/utils/perf-training/bolt.lit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,46 @@ import lit.util
import os
import subprocess

config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/')
clang_bolt_mode = config.clang_bolt_mode.lower()
clang_binary = "clang"
perf_wrapper = f"{config.python_exe} {config.perf_helper_dir}/perf-helper.py perf "

config.name = 'Clang Perf Training'
config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test']
if clang_bolt_mode == "instrument":
perf_wrapper = ""
clang_binary = config.clang_bolt_name
elif clang_bolt_mode == "lbr":
perf_wrapper += " --lbr -- "
elif clang_bolt_mode == "perf":
perf_wrapper += " -- "
else:
assert 0, "Unsupported CLANG_BOLT_MODE variable"

config.clang = perf_wrapper + os.path.realpath(
lit.util.which(clang_binary, config.clang_tools_dir)
).replace("\\", "/")

config.name = "Clang Perf Training"
config.suffixes = [
".c",
".cc",
".cpp",
".m",
".mm",
".cu",
".ll",
".cl",
".s",
".S",
".modulemap",
".test",
]

use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL")
config.test_format = lit.formats.ShTest(use_lit_shell == "0")
config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang)))
config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang)))
config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang)))
config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) )
config.substitutions.append( ('%test_root', config.test_exec_root ) )
config.substitutions.append(
("%clang_cpp_skip_driver", f" {config.clang} --driver-mode=g++ ")
)
config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ "))
config.substitutions.append(("%clang_skip_driver", config.clang))
config.substitutions.append(("%clang", config.clang))
config.substitutions.append(("%test_root", config.test_exec_root))
2 changes: 2 additions & 0 deletions clang/utils/perf-training/bolt.lit.site.cfg.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ config.test_source_root = "@CLANG_PGO_TRAINING_DATA@"
config.target_triple = "@LLVM_TARGET_TRIPLE@"
config.python_exe = "@Python3_EXECUTABLE@"
config.clang_obj_root = path(r"@CLANG_BINARY_DIR@")
config.clang_bolt_mode = "@CLANG_BOLT@"
config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@"

# Let the main config do the real work.
lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg")
58 changes: 58 additions & 0 deletions clang/utils/perf-training/perf-helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,62 @@ def merge_fdata(args):
return 0


def perf(args):
    """Run a command under ``perf record`` to collect a profile for BOLT.

    ``args`` is the argument list following the ``perf`` sub-command:
    an optional ``--lbr`` flag, an optional ``--`` separator, and then the
    command line to profile.

    The profile is written to ``<pid>.perf.data`` in the current working
    directory, where ``<pid>`` is the PID of this helper process.

    Returns 0 on success. ``subprocess.CalledProcessError`` propagates if
    ``perf record`` exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(
        prog="perf-helper perf", description="perf wrapper for BOLT profile collection"
    )
    parser.add_argument(
        "--lbr", action="store_true", help="Use perf with branch stacks"
    )
    parser.add_argument("cmd", nargs=argparse.REMAINDER, help="")

    opts = parser.parse_args(args)
    cmd = list(opts.cmd)
    # argparse.REMAINDER keeps the "--" separator as the first element.
    # Strip it only when it is actually there, so that a caller who omits
    # the separator does not silently lose the program name.
    if cmd and cmd[0] == "--":
        cmd = cmd[1:]

    perf_args = [
        "perf",
        "record",
        "--event=cycles:u",
        "--freq=max",
        "--output=%d.perf.data" % os.getpid(),
    ]
    if opts.lbr:
        # Branch-stack sampling is only requested in LBR mode.
        perf_args += ["--branch-filter=any,u"]
    perf_args.extend(cmd)

    start_time = time.time()
    subprocess.check_call(perf_args)

    elapsed = time.time() - start_time
    print("... data collection took %.4fs" % elapsed)
    return 0


def perf2bolt(args):
    """Convert every ``perf.data`` file under a directory into BOLT fdata.

    ``args`` is the argument list following the ``perf2bolt`` sub-command:
    the path to llvm-bolt, the directory to search for ``perf.data`` files,
    the input binary, and an optional ``--lbr`` flag.

    llvm-bolt is invoked once per file found, writing its output next to the
    input as ``<file>.fdata``. Without ``--lbr`` the extra ``-nl`` flag is
    passed to llvm-bolt.

    Returns 0 on success. ``subprocess.CalledProcessError`` propagates if any
    llvm-bolt invocation fails.
    """
    cli = argparse.ArgumentParser(
        prog="perf-helper perf2bolt",
        description="perf2bolt conversion wrapper for perf.data files",
    )
    cli.add_argument("bolt", help="Path to llvm-bolt")
    cli.add_argument("path", help="Path containing perf.data files")
    cli.add_argument("binary", help="Input binary")
    cli.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode")
    parsed = cli.parse_args(args)

    # Arguments shared by every invocation; the per-file ones are appended
    # inside the loop below.
    base_cmd = [parsed.bolt, parsed.binary]
    base_cmd.extend(["--aggregate-only", "--profile-format=yaml"])
    if not parsed.lbr:
        base_cmd.append("-nl")
    # "-p" must come last: the perf.data path is placed right after it.
    base_cmd.append("-p")

    for perf_data in findFilesWithExtension(parsed.path, "perf.data"):
        fdata_out = perf_data + ".fdata"
        subprocess.check_call(base_cmd + [perf_data, "-o", fdata_out])
    return 0


def dtrace(args):
parser = argparse.ArgumentParser(
prog="perf-helper dtrace",
Expand Down Expand Up @@ -507,6 +563,8 @@ def genOrderFile(args):
"cc1": cc1,
"gen-order-file": genOrderFile,
"merge-fdata": merge_fdata,
"perf": perf,
"perf2bolt": perf2bolt,
}


Expand Down