Skip to content

Commit a387bce

Browse files
[MLGO] Upstream the corpus extraction tooling (#72319)
This patch upstreams some of the MLGO utilities, particularly the corpus extraction tooling, into LLVM proper. The motivation for this patch is available in the RFC. https://discourse.llvm.org/t/rfc-upstreaming-elements-of-the-mlgo-tooling/74939
1 parent bd3838f commit a387bce

17 files changed

+1413
-0
lines changed

llvm/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ if( LLVM_INCLUDE_UTILS )
11971197
add_subdirectory(utils/UnicodeData)
11981198
add_subdirectory(utils/yaml-bench)
11991199
add_subdirectory(utils/split-file)
1200+
add_subdirectory(utils/mlgo-utils)
12001201
if( LLVM_INCLUDE_TESTS )
12011202
add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
12021203
endif()

llvm/utils/mlgo-utils/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Configure the lit site config: the template tests/lit.site.cfg.in from the
# source tree is the input, lit.site.cfg in the current binary dir the output.
configure_lit_site_cfg(
2+
"${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
3+
"${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
4+
)
5+
6+
# Register the check-mlgo-utils lit test suite over the current binary dir.
# It depends on the FileCheck, not and count targets.
add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
7+
${CMAKE_CURRENT_BINARY_DIR}
8+
DEPENDS "FileCheck" "not" "count"
9+
)
10+
11+
# Set the FOLDER property so the target is grouped under "Tests".
set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests")

llvm/utils/mlgo-utils/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# MLGO Python Utilities
2+
3+
This folder contains MLGO Python utilities, particularly infrastructure
4+
to help enable ML applications within LLVM, especially tooling to extract
5+
corpora that can be used in downstream projects to train ML models and perform
6+
other tasks that benefit from having a large amount of data.
7+
8+
### Python Versioning
9+
10+
Due to type annotations, the MLGO tooling currently only supports a Python
11+
version greater than 3.8, deviating from the current LLVM project-wide
12+
minimum supported version of Python 3.6.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
5+
# Version components as a tuple of integers.
__versioninfo__ = (18, 0, 0)
# Dotted version string built from the tuple, with a "dev" suffix appended.
__version__ = ".".join(map(str, __versioninfo__)) + "dev"
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
r"""Combine multiple training corpus into a single training corpus.
5+
6+
Currently only support the case that multiple corpus share the same
7+
configurables except the "modules" field.
8+
9+
Usage: we'd like to combine training corpus corpus1 and corpus2 into
10+
combinedcorpus; we first structure the files as follows:
11+
12+
combinedcorpus
13+
combinedcorpus/corpus1
14+
combinedcorpus/corpus2
15+
16+
Running this script with
17+
18+
python3 \
19+
compiler_opt/tools/combine_training_corpus.py \
20+
--root_dir=$PATH_TO_combinedcorpus
21+
22+
generates combinedcorpus/corpus_description.json file. In this way corpus1
23+
and corpus2 are combined into combinedcorpus.
24+
"""
25+
26+
from absl import app
27+
from absl import flags
28+
29+
from mlgo.corpus import combine_training_corpus_lib
30+
31+
flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")

FLAGS = flags.FLAGS


def main(argv):
    """Combine the corpora found under --root_dir.

    Args:
      argv: Argument list handed over by app.run; anything beyond the first
        entry is rejected.

    Raises:
      app.UsageError: If more than one entry is present in argv.
    """
    extra_args = argv[1:]
    if extra_args:
        raise app.UsageError("Too many command-line arguments.")

    combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)


def entrypoint():
    """Run main through app.run."""
    app.run(main)


if __name__ == "__main__":
    entrypoint()
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
"""Library for combining training corpora."""
5+
6+
import os
7+
import json
8+
import glob
9+
10+
from absl import logging
11+
12+
_FILE_NAME = "corpus_description.json"


def combine_corpus(root_dir: str) -> None:
    """Merge the corpus descriptions of all sub-corpora into one description.

    Every `<root_dir>/*/corpus_description.json` is loaded. The entries of its
    "modules" list are prefixed with the name of the sub-directory they came
    from and collected; all remaining keys must be identical across the
    sub-corpora. The merged description is written to
    `<root_dir>/corpus_description.json`.

    Args:
      root_dir: Directory whose immediate sub-directories hold the corpora to
        combine.

    Raises:
      ValueError: If two corpus descriptions differ in anything other than
        their "modules" list.
      KeyError: If a corpus description has no "modules" key.
    """
    module_names = []
    # None (rather than an empty dict) marks "no corpus seen yet": a corpus
    # whose description holds nothing but "modules" is a legitimate baseline
    # and must still be compared against the corpora that follow it.
    output_corpus_description = None

    corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
    # glob returns matches in arbitrary order; sort them so the combined
    # module list is reproducible from run to run.
    for corpus_description_path in sorted(glob.glob(corpus_description_glob)):
        logging.info("processing %s", corpus_description_path)

        with open(corpus_description_path, encoding="utf-8") as f:
            corpus_description = json.load(f)
        sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
        # Removing "modules" leaves only the shared configuration behind.
        module_names.extend(
            os.path.join(sub_dir, name) for name in corpus_description.pop("modules")
        )
        if output_corpus_description is None:
            output_corpus_description = corpus_description
        elif corpus_description != output_corpus_description:
            raise ValueError("Input corpora differ by more than modules.")

    if output_corpus_description is None:
        # No sub-corpus found: still emit a description with an empty list.
        output_corpus_description = {}
    output_corpus_description["modules"] = module_names

    # Write with the same encoding that is used for reading the inputs.
    with open(os.path.join(root_dir, _FILE_NAME), "w", encoding="utf-8") as f:
        json.dump(output_corpus_description, f, indent=2)
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
"""Extract IR for training.
5+
6+
Extract IR for training, either from a compile_commands.json file produced by
7+
cmake, or a linker parameter list file.
8+
9+
Only run with
10+
'python compiler_opt/tools/extract_ir.py ...'
11+
12+
The compilation is assumed to have been performed with clang, using
13+
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
14+
15+
In a distributed ThinLTO case, the compilation is assumed to have been performed
16+
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
17+
18+
In a local ThinLTO case, the compilation is assumed to have been performed
19+
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
20+
21+
To change the logging verbosity, pass an integer representing the desired
22+
verbosity to the --verbosity flag. Use 0 for all logs, status information,
23+
and detailed debug information, -1 for solely warnings, and -2 to not produce
24+
any output.
25+
"""
26+
27+
import json
28+
import multiprocessing
29+
30+
from absl import app
31+
from absl import flags
32+
from absl import logging
33+
34+
from mlgo.corpus import extract_ir_lib
35+
36+
# Command-line flags of the IR extraction tool. Help strings are assembled
# from adjacent string literals, so every fragment that is continued on the
# next line has to end in a space.
flags.DEFINE_string(
    "input",
    None,
    "Input file or directory - either compile_commands.json, a linker parameter "
    "list, or a path to a directory containing object files.",
)
flags.DEFINE_enum(
    "input_type",
    "json",
    ["json", "params", "directory"],
    "Input file type - json, params, or directory. params latter refers to lld "
    "params.",
)
flags.DEFINE_string("output_dir", None, "Output directory")
flags.DEFINE_integer(
    "num_workers",
    None,
    "Number of parallel workers for objcopy. `None` for maximum available.",
)
flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
flags.DEFINE_string(
    "obj_base_dir",
    "",
    "Base directory for object files. Defaults to current working dir.",
)
# The only values --thinlto_build accepts are "distributed" and "local"; the
# lld-based flow is the "local" one, so the help text names that value.
flags.DEFINE_string(
    "cmd_filter",
    None,
    "Include only those modules with a command line matching this regexp. "
    "Setting it to None for not filtering. Note that the regexp is applied "
    "independently for each separate command line option. For example, ^-Oz$ "
    "will match Oz - built binaries. Does not work with thinlto_build=local.",
)
flags.DEFINE_enum(
    "thinlto_build",
    None,
    ["distributed", "local"],
    "Set if the build was performed with either 'distributed' or "
    "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
    "The build is assumed to have had "
    "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
    "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
    "passed in the local case.",
)
flags.DEFINE_string(
    "cmd_section_name",
    ".llvmcmd",
    "The section name passed to llvm-objcopy. For ELF object files, the "
    "default .llvmcmd is correct. For Mach-O object files, one should use "
    "something like __LLVM,__cmdline",
)
flags.DEFINE_string(
    "bitcode_section_name",
    ".llvmbc",
    "The section name passed to llvm-objcopy. For ELF object files, the "
    "default .llvmbc is correct. For Mach-O object files, one should use "
    "__LLVM,__bitcode",
)

flags.mark_flag_as_required("output_dir")

FLAGS = flags.FLAGS
98+
99+
100+
def main(argv):
    """Load the list of objects selected by the flags and run the extraction.

    The loader from extract_ir_lib is chosen from --input, --input_type and
    --thinlto_build. The loaded objects are passed to
    extract_ir_lib.run_extraction, the returned paths are handed to
    extract_ir_lib.write_corpus_manifest, and a summary line is logged.

    Args:
      argv: Argument list handed over by app.run; more than one entry is
        rejected.

    Raises:
      app.UsageError: If argv holds more than one entry.
      ValueError: If --input is combined with --thinlto_build=local, or if
        neither --input nor --thinlto_build=local is given.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    objs = []
    if FLAGS.input is not None and FLAGS.thinlto_build == "local":
        raise ValueError("--thinlto_build=local cannot be run with --input")
    if FLAGS.input is None:
        if FLAGS.thinlto_build != "local":
            raise ValueError("--input or --thinlto_build=local must be provided")
        objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
    elif FLAGS.input_type == "json":
        with open(FLAGS.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_compile_commands(
                json.load(f), FLAGS.output_dir
            )
    elif FLAGS.input_type == "params":
        if not FLAGS.obj_base_dir:
            # Adjacent literals are concatenated verbatim, so each fragment
            # carries its own trailing space.
            logging.info(
                "-obj_base_dir is unspecified, assuming current directory. "
                "If no objects are found, use this option to specify the root "
                "directory for the object file paths in the input file."
            )
        with open(FLAGS.input, encoding="utf-8") as f:
            objs = extract_ir_lib.load_from_lld_params(
                [line.strip() for line in f], FLAGS.obj_base_dir, FLAGS.output_dir
            )
    elif FLAGS.input_type == "directory":
        logging.warning(
            "Using the directory input is only recommended if the build system "
            "your project uses does not support any structured output that "
            "ml-compiler-opt understands. If your build system provides a "
            "structured compilation database, use that instead"
        )
        objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
    else:
        # Execution continues with the empty object list after logging.
        logging.error("Unknown input type: %s", FLAGS.input_type)

    relative_output_paths = extract_ir_lib.run_extraction(
        objs,
        FLAGS.num_workers,
        FLAGS.llvm_objcopy_path,
        FLAGS.cmd_filter,
        FLAGS.thinlto_build,
        FLAGS.cmd_section_name,
        FLAGS.bitcode_section_name,
    )

    extract_ir_lib.write_corpus_manifest(
        FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
    )

    # None entries are subtracted from the total to get the converted count
    # (presumably they mark objects that were not converted - confirm against
    # extract_ir_lib.run_extraction).
    logging.info(
        "Converted %d files out of %d",
        len(objs) - relative_output_paths.count(None),
        len(objs),
    )
157+
158+
159+
def entrypoint():
    """Select the "fork" multiprocessing start method, then run main via app.run."""
    start_method = "fork"
    multiprocessing.set_start_method(start_method)
    app.run(main)


if __name__ == "__main__":
    entrypoint()

0 commit comments

Comments
 (0)