
TensorFlow with CUDA or Python might rebuild more than necessary instead of re-using the bazel cache #16585

Closed
@lissyx

Description


Context: for DeepSpeech, we perform TensorFlow builds and then keep the cache in a tar archive (capturing the whole home directory of the build user). We then untar it, and the DeepSpeech build run through bazel build picks up the proper cached items, so it does not rebuild anything.

Recently, we started to see increased (2.5x) build times on CUDA-enabled builds. Debugging with Bazel showed that it was rebuilding because the actionKey computed for stream_executor_impl was different. By instrumenting Bazel to get more information, I could track the different actionKey down to its cause: the ordering of the CUDA includes had changed. The list itself contained exactly the same content, just in a different order.
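To make that concrete, here is a toy Python sketch (a stand-in for illustration only, not Bazel's actual actionKey computation; the paths are made up) showing how digesting the same list of includes in a different order yields a different key:

import hashlib

def toy_action_key(include_paths):
    # Stand-in for Bazel's actionKey: digest the inputs in the order they
    # are listed. Identical content in a different order gives a new key.
    h = hashlib.sha256()
    for path in include_paths:
        h.update(path.encode("utf-8"))
    return h.hexdigest()

includes = ["cuda/include/cublas_v2.h", "cuda/include/cuda.h",
            "cuda/include/cudnn.h"]
print(toy_action_key(includes))
print(toy_action_key(list(reversed(includes))))  # same files, different key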

Those includes are symlinks, and they are generated by genrules. This is all taken care of by _create_local_cuda_repository:

def _create_local_cuda_repository(repository_ctx):
  """Creates the repository containing files set up to build with CUDA."""
  cuda_config = _get_cuda_config(repository_ctx)
  cudnn_header_dir = _find_cudnn_header_dir(repository_ctx,
                                            cuda_config.cudnn_install_basedir)

  # Set up symbolic links for the cuda toolkit by creating genrules to do
  # symlinking. We create one genrule for each directory we want to track under
  # cuda_toolkit_path
  cuda_toolkit_path = cuda_config.cuda_toolkit_path
  cuda_include_path = cuda_toolkit_path + "/include"
  genrules = [symlink_genrule_for_dir(repository_ctx,
      cuda_include_path, "cuda/include", "cuda-include")]
  genrules.append(symlink_genrule_for_dir(repository_ctx,
      cuda_toolkit_path + "/nvvm", "cuda/nvvm", "cuda-nvvm"))
  genrules.append(symlink_genrule_for_dir(repository_ctx,
      cuda_toolkit_path + "/extras/CUPTI/include",
      "cuda/extras/CUPTI/include", "cuda-extras"))

  cuda_libs = _find_libs(repository_ctx, cuda_config)
  cuda_lib_src = []
  cuda_lib_dest = []
  for lib in cuda_libs.values():
    cuda_lib_src.append(lib.path)
    cuda_lib_dest.append("cuda/lib/" + lib.file_name)
  genrules.append(symlink_genrule_for_dir(repository_ctx, None, "", "cuda-lib",
                                          cuda_lib_src, cuda_lib_dest))

  # Set up the symbolic links for cudnn if cudnn was not installed to
  # CUDA_TOOLKIT_PATH.
  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
      cuda_include_path, '').splitlines()
  if '/cudnn.h' not in included_files:
    genrules.append(symlink_genrule_for_dir(repository_ctx, None,
        "cuda/include/", "cudnn-include", [cudnn_header_dir + "/cudnn.h"],
        ["cudnn.h"]))
  else:
    genrules.append(
        'filegroup(\n' +
        '    name = "cudnn-include",\n' +
        '    srcs = [],\n' +
        ')\n'
    )

  # Set up BUILD file for cuda/
  _tpl(repository_ctx, "cuda:build_defs.bzl",
       {
           "%{cuda_is_configured}": "True",
           "%{cuda_extra_copts}": _compute_cuda_extra_copts(
               repository_ctx, cuda_config.compute_capabilities),
       })
  _tpl(repository_ctx, "cuda:BUILD",
       {
           "%{cuda_driver_lib}": cuda_libs["cuda"].file_name,
           "%{cudart_static_lib}": cuda_libs["cudart_static"].file_name,
           "%{cudart_static_linkopt}": _cudart_static_linkopt(
               cuda_config.cpu_value),
           "%{cudart_lib}": cuda_libs["cudart"].file_name,
           "%{cublas_lib}": cuda_libs["cublas"].file_name,
           "%{cusolver_lib}": cuda_libs["cusolver"].file_name,
           "%{cudnn_lib}": cuda_libs["cudnn"].file_name,
           "%{cufft_lib}": cuda_libs["cufft"].file_name,
           "%{curand_lib}": cuda_libs["curand"].file_name,
           "%{cupti_lib}": cuda_libs["cupti"].file_name,
           "%{cuda_include_genrules}": "\n".join(genrules),
           "%{cuda_headers}": ('":cuda-include",\n' +
                               '        ":cudnn-include",')
       })

  is_cuda_clang = _use_cuda_clang(repository_ctx)
  should_download_clang = is_cuda_clang and _flag_enabled(
      repository_ctx, _TF_DOWNLOAD_CLANG)
  if should_download_clang:
    download_clang(repository_ctx, "crosstool/extra_tools")

  # Set up crosstool/
  cc = find_cc(repository_ctx)
  cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
  host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
  cuda_defines = {
      "%{cuda_include_path}": _cuda_include_path(repository_ctx,
                                                 cuda_config),
      "%{host_compiler_includes}": host_compiler_includes,
  }
  if is_cuda_clang:
    cuda_defines["%{clang_path}"] = cc
    _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty"})
    _tpl(repository_ctx, "crosstool:CROSSTOOL_clang", cuda_defines,
         out="crosstool/CROSSTOOL")
    repository_ctx.file(
        "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
  else:
    nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" %
        (cuda_config.cuda_toolkit_path,
         ".exe" if cuda_config.cpu_value == "Windows" else "")))
    _tpl(repository_ctx, "crosstool:BUILD",
         {"%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc"})
    _tpl(repository_ctx, "crosstool:CROSSTOOL_nvcc", cuda_defines,
         out="crosstool/CROSSTOOL")
    _tpl(repository_ctx,
         "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
         {
             "%{cpu_compiler}": str(cc),
             "%{cuda_version}": cuda_config.cuda_version,
             "%{nvcc_path}": nvcc_path,
             "%{gcc_host_compiler_path}": str(cc),
             "%{cuda_compute_capabilities}": ", ".join(
                 ["\"%s\"" % c for c in cuda_config.compute_capabilities]),
         })

  # Set up cuda_config.h, which is used by
  # tensorflow/stream_executor/dso_loader.cc.
  _tpl(repository_ctx, "cuda:cuda_config.h",
       {
           "%{cuda_version}": cuda_config.cuda_version,
           "%{cudnn_version}": cuda_config.cudnn_version,
           "%{cuda_compute_capabilities}": ",".join(
               ["CudaVersion(\"%s\")" % c
                for c in cuda_config.compute_capabilities]),
           "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
       }, "cuda/cuda/cuda_config.h")
This generates the shell scripts for the genrules that actually perform the symlinking. Checking those shell scripts revealed exactly the same content, but in a different order.
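For illustration, a generated genrule looks roughly like the following (hypothetical paths, heavily abbreviated, and the exact command text may differ from what symlink_genrule_for_dir really emits):

genrule(
    name = "cuda-include",
    outs = [
        "cuda/include/cublas_v2.h",
        "cuda/include/cuda.h",
    ],
    cmd = """ln -s "/usr/local/cuda/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && \
ln -s "/usr/local/cuda/include/cuda.h" "$(@D)/cuda/include/cuda.h" """,
)

Both the outs list and the chained cmd are emitted in the order the headers were discovered, so a reordering changes the generated BUILD text, and with it the actionKey, even though the resulting tree is identical.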

Checking more carefully, one will see that the headers are discovered by the _read_dir function:

  find_result = _execute(
      repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
      empty_stdout_fine=True)
  result = find_result.stdout

It directly takes the output of find as-is, which is dependent on the ordering provided by the readdir syscall.
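This behavior is easy to reproduce outside Bazel, for instance with Python (the path here is hypothetical):

import os

# Like `find`, os.listdir returns entries in raw readdir order, which is
# filesystem-dependent and not guaranteed stable across machines or
# across a tar/untar round-trip.
entries = os.listdir("/usr/local/cuda/include")
print(entries)          # order may vary
print(sorted(entries))  # deterministic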

In our case, the ordering on the filesystem before creating the tar archive and after untarring it would differ.

One simple fix is to force an ordering on the list of headers; this way we are sure the order is always the same and we do not depend on whatever readdir happens to return.
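A minimal sketch of that fix, applied to the _read_dir snippet above (keeping the find invocation unchanged):

  find_result = _execute(
      repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
      empty_stdout_fine=True)
  # Sort find's output so the genrule outs/cmd, and therefore the
  # actionKey, no longer depend on readdir ordering.
  result = "\n".join(sorted(find_result.stdout.splitlines()))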

In the past, Bazel forced an ordering on the elements considered when computing the actionKey, which might have kept this issue hidden; that behavior was removed in 0.3.0: bazelbuild/bazel@9dc3211
