[MLGO] Upstream the corpus extraction tooling #72319

Merged
1 change: 1 addition & 0 deletions llvm/CMakeLists.txt
@@ -1195,6 +1195,7 @@ if( LLVM_INCLUDE_UTILS )
add_subdirectory(utils/UnicodeData)
add_subdirectory(utils/yaml-bench)
add_subdirectory(utils/split-file)
add_subdirectory(utils/mlgo-utils)
if( LLVM_INCLUDE_TESTS )
add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
endif()
11 changes: 11 additions & 0 deletions llvm/utils/mlgo-utils/CMakeLists.txt
@@ -0,0 +1,11 @@
configure_lit_site_cfg(
"${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
"${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
)

add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
${CMAKE_CURRENT_BINARY_DIR}
DEPENDS "FileCheck" "not" "count"
)

set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests")
12 changes: 12 additions & 0 deletions llvm/utils/mlgo-utils/README.md
@@ -0,0 +1,12 @@
# MLGO Python Utilities

This folder contains the MLGO Python utilities: infrastructure that helps
enable ML applications within LLVM, in particular tooling to extract corpora
that downstream projects can use to train ML models and perform other tasks
that benefit from large amounts of data.

### Python Versioning

Due to its use of type annotations, the MLGO tooling currently only supports
Python versions greater than 3.8, deviating from the current LLVM project-wide
minimum supported version of Python 3.6.
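
The utilities are laid out as a regular Python package named `mlgo`, so a
downstream script needs that package importable, e.g. by putting
`llvm/utils/mlgo-utils` on the import path. A minimal sketch of that setup;
the `LLVM_ROOT` checkout location below is purely illustrative:

```python
import sys
from pathlib import Path

# Hypothetical checkout location; adjust for your environment.
LLVM_ROOT = Path.home() / "llvm-project"

# Make the in-tree `mlgo` package importable.
sys.path.insert(0, str(LLVM_ROOT / "llvm" / "utils" / "mlgo-utils"))

from mlgo.corpus import combine_training_corpus_lib, extract_ir_lib  # noqa: E402
```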
6 changes: 6 additions & 0 deletions llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -0,0 +1,6 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

__versioninfo__ = (18, 0, 0)
__version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
48 changes: 48 additions & 0 deletions llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,48 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
r"""Combine multiple training corpus into a single training corpus.

Currently only support the case that multiple corpus share the same
configurables except the "modules" field.

Usage: say we'd like to combine the training corpora corpus1 and corpus2 into
combinedcorpus; we first structure the files as follows:

combinedcorpus
combinedcorpus/corpus1
combinedcorpus/corpus2

Running this script with

python3 \
compiler_opt/tools/combine_training_corpus.py \
--root_dir=$PATH_TO_combinedcorpus

generates the combinedcorpus/corpus_description.json file. In this way corpus1
and corpus2 are combined into combinedcorpus.
"""

from absl import app
from absl import flags

from mlgo.corpus import combine_training_corpus_lib

flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")

FLAGS = flags.FLAGS


def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")

combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)


def entrypoint():
app.run(main)


if __name__ == "__main__":
entrypoint()
38 changes: 38 additions & 0 deletions llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
@@ -0,0 +1,38 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library for combining training corpora."""

import os
import json
import glob

from absl import logging

_FILE_NAME = "corpus_description.json"


def combine_corpus(root_dir: str) -> None:
module_names = []
output_corpus_description = {}

corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
for corpus_description_path in glob.glob(corpus_description_glob):
logging.info("processing %s", corpus_description_path)

with open(corpus_description_path, encoding="utf-8") as f:
corpus_description = json.load(f)
sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
module_names.extend(
[os.path.join(sub_dir, name) for name in corpus_description["modules"]]
)
del corpus_description["modules"]
if len(output_corpus_description) == 0:
output_corpus_description = corpus_description
elif corpus_description != output_corpus_description:
raise ValueError("Input corpora differ by more than modules.")

output_corpus_description["modules"] = module_names

with open(os.path.join(root_dir, _FILE_NAME), "w") as f:
json.dump(output_corpus_description, f, indent=2)
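
To make the merge concrete, the sketch below shows illustrative
`corpus_description.json` contents for the layout described in the script's
docstring, and the combined description that `combine_corpus` writes. The
`has_thinlto` key is only a hypothetical stand-in for the shared
configurables; the library treats every field other than `modules` as opaque
data that must be identical across sub-corpora.

```python
# combinedcorpus/corpus1/corpus_description.json (illustrative; "has_thinlto"
# is a hypothetical shared configurable - only "modules" is treated specially):
corpus1 = {"has_thinlto": False, "modules": ["foo.o", "bar.o"]}

# combinedcorpus/corpus2/corpus_description.json:
corpus2 = {"has_thinlto": False, "modules": ["baz.o"]}

# After combine_corpus("combinedcorpus"), the top-level
# combinedcorpus/corpus_description.json holds the shared configurables plus
# every module path prefixed with its sub-corpus directory:
combined = {
    "has_thinlto": False,
    "modules": ["corpus1/foo.o", "corpus1/bar.o", "corpus2/baz.o"],
}
```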
165 changes: 165 additions & 0 deletions llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,165 @@
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Extract IR for training.

Extract IR for training, either from a compile_commands.json file produced by
cmake, or a linker parameter list file.

Only run with
'python compiler_opt/tools/extract_ir.py ...'

The compilation is assumed to have been performed with clang, using
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all).

In a distributed ThinLTO case, the compilation is assumed to have been performed
specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.

In a local ThinLTO case, the compilation is assumed to have been performed
specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files.

To change the logging verbosity, pass an integer representing the desired
verbosity to the --verbosity flag. Use 0 for all logs, status information,
and detailed debug information, -1 for solely warnings, and -2 to not produce
any output.
"""

import json
import multiprocessing

from absl import app
from absl import flags
from absl import logging

from mlgo.corpus import extract_ir_lib

flags.DEFINE_string(
"input",
None,
"Input file or directory - either compile_commands.json, a linker parameter"
"list, or a path to a directory containing object files.",
)
flags.DEFINE_enum(
"input_type",
"json",
["json", "params", "directory"],
"Input file type - json, params, or directory. params latter refers to lld"
"params.",
)
flags.DEFINE_string("output_dir", None, "Output directory")
flags.DEFINE_integer(
"num_workers",
None,
"Number of parallel workers for objcopy. `None` for maximum available.",
)
flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
flags.DEFINE_string(
"obj_base_dir",
"",
"Base directory for object files. Defaults to current working dir.",
)
flags.DEFINE_string(
"cmd_filter",
None,
"Include only those modules with a command line matching this regexp. "
"Setting it to None for not filtering. Note that the regexp is applied "
"independently for each separate command line option. For example, ^-Oz$ "
"will match Oz - built binaries. Does not work with thinlto_build=lld.",
)
flags.DEFINE_enum(
"thinlto_build",
None,
["distributed", "local"],
"Set if the build was performed with either 'distributed' or "
"'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
"The build is assumed to have had "
"-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
"case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
"passed in the local case.",
)
flags.DEFINE_string(
"cmd_section_name",
".llvmcmd",
"The section name passed to llvm-objcopy. For ELF object files, the "
"default .llvmcmd is correct. For Mach-O object files, one should use "
"something like __LLVM,__cmdline",
)
flags.DEFINE_string(
"bitcode_section_name",
".llvmbc",
"The section name passed to llvm-objcopy. For ELF object files, the "
"default .llvmbc is correct. For Mach-O object files, one should use "
"__LLVM,__bitcode",
)

flags.mark_flag_as_required("output_dir")

FLAGS = flags.FLAGS


def main(argv):
if len(argv) > 1:
raise app.UsageError("Too many command-line arguments.")

objs = []
if FLAGS.input is not None and FLAGS.thinlto_build == "local":
raise ValueError("--thinlto_build=local cannot be run with --input")
if FLAGS.input is None:
if FLAGS.thinlto_build != "local":
raise ValueError("--input or --thinlto_build=local must be provided")
objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
elif FLAGS.input_type == "json":
with open(FLAGS.input, encoding="utf-8") as f:
objs = extract_ir_lib.load_from_compile_commands(
json.load(f), FLAGS.output_dir
)
elif FLAGS.input_type == "params":
if not FLAGS.obj_base_dir:
logging.info(
"-obj_base_dir is unspecified, assuming current directory."
"If no objects are found, use this option to specify the root"
"directory for the object file paths in the input file."
)
with open(FLAGS.input, encoding="utf-8") as f:
objs = extract_ir_lib.load_from_lld_params(
[l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
)
elif FLAGS.input_type == "directory":
logging.warning(
"Using the directory input is only recommended if the build system"
"your project uses does not support any structured output that"
"ml-compiler-opt understands. If your build system provides a"
"structured compilation database, use that instead"
)
objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
else:
logging.error("Unknown input type: %s", FLAGS.input_type)

relative_output_paths = extract_ir_lib.run_extraction(
objs,
FLAGS.num_workers,
FLAGS.llvm_objcopy_path,
FLAGS.cmd_filter,
FLAGS.thinlto_build,
FLAGS.cmd_section_name,
FLAGS.bitcode_section_name,
)

extract_ir_lib.write_corpus_manifest(
FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
)

logging.info(
"Converted %d files out of %d",
len(objs) - relative_output_paths.count(None),
len(objs),
)


def entrypoint():
multiprocessing.set_start_method("fork")
app.run(main)


if __name__ == "__main__":
entrypoint()
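
The heavy lifting happens in `extract_ir_lib`. As a rough sketch of the
per-object mechanism (not the library's actual implementation, and with a
purely illustrative helper name), the embedded command line and bitcode can be
dumped with `llvm-objcopy --dump-section`, using the section names configured
by the flags above:

```python
import subprocess


def dump_embedded_sections(
    obj_path: str,
    out_prefix: str,
    cmd_section: str = ".llvmcmd",
    bitcode_section: str = ".llvmbc",
) -> None:
    """Rough sketch: dump one object's embedded command line and bitcode.

    Assumes an ELF object built with -fembed-bitcode=all. The real tooling in
    extract_ir_lib additionally handles ThinLTO index files, Mach-O section
    names, command-line filtering, and parallel extraction across objects.
    """
    subprocess.run(
        [
            "llvm-objcopy",
            f"--dump-section={cmd_section}={out_prefix}.cmd",
            f"--dump-section={bitcode_section}={out_prefix}.bc",
            obj_path,
            "/dev/null",  # discard the rewritten object; only the dumps matter
        ],
        check=True,
    )
```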