
Introduce a minimum CGU size in non-incremental builds. #112448

Merged (3 commits, Jun 14, 2023)
2 changes: 1 addition & 1 deletion compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
@@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>(
let trait_def_id = trait_ref_self.def_id();
let trait_vis = cx.tcx.visibility(trait_def_id);

let cgus = cx.sess().codegen_units();
let cgus = cx.sess().codegen_units().as_usize();
let single_cgu = cgus == 1;

let lto = cx.sess().lto();
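The `.as_usize()` calls above (and the `CodegenUnits::Default(_)` match further down in partitioning.rs) indicate that `Session::codegen_units()` now returns a small wrapper type from `rustc_session` instead of a bare `usize`, so callers can tell a user-specified count from a compiler-chosen default. The `rustc_session` side of the change is not part of this diff; the following is only a plausible sketch of what such a type could look like:

```rust
// Sketch only -- the real definition lives in rustc_session and is not shown
// in this diff. The two variants record where the CGU count came from.
#[derive(Clone, Copy, Debug)]
pub enum CodegenUnits {
    /// Set explicitly by the user (e.g. `-C codegen-units=N`); observed as
    /// closely as possible by partitioning.
    User(usize),
    /// Chosen by the compiler; partitioning is free to merge below this count.
    Default(usize),
}

impl CodegenUnits {
    pub fn as_usize(self) -> usize {
        match self {
            CodegenUnits::User(n) | CodegenUnits::Default(n) => n,
        }
    }
}

fn main() {
    // Callers that only need the raw count, like the ones in this diff,
    // go through `as_usize()`.
    assert_eq!(CodegenUnits::Default(16).as_usize(), 16);
}
```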
6 changes: 3 additions & 3 deletions compiler/rustc_codegen_ssa/src/back/write.rs
@@ -646,10 +646,10 @@ fn produce_final_output_artifacts(
// rlib.
let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe);

let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1;
let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1;

let keep_numbered_objects =
needs_crate_object || (user_wants_objects && sess.codegen_units() > 1);
needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1);

for module in compiled_modules.modules.iter() {
if let Some(ref path) = module.object {
@@ -1923,7 +1923,7 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {

// FIXME: time_llvm_passes support - does this use a global context or
// something?
if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes {
if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes {
self.backend.print_pass_timings()
}

202 changes: 168 additions & 34 deletions compiler/rustc_monomorphize/src/partitioning.rs
@@ -113,6 +113,7 @@ use rustc_middle::query::Providers;
use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt};
use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
use rustc_session::CodegenUnits;
use rustc_span::symbol::Symbol;

use crate::collector::UsageMap;
@@ -121,7 +122,6 @@ use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollec

struct PartitioningCx<'a, 'tcx> {
tcx: TyCtxt<'tcx>,
target_cgu_count: usize,
usage_map: &'a UsageMap<'tcx>,
}

@@ -130,26 +130,30 @@ struct PlacedRootMonoItems<'tcx> {
codegen_units: Vec<CodegenUnit<'tcx>>,

internalization_candidates: FxHashSet<MonoItem<'tcx>>,

/// These must be obtained when the iterator in `partition` runs. They
/// can't be obtained later because some inlined functions might not be
/// reachable.
unique_inlined_stats: (usize, usize),
}

// The output CGUs are sorted by name.
fn partition<'tcx, I>(
tcx: TyCtxt<'tcx>,
mono_items: I,
max_cgu_count: usize,
usage_map: &UsageMap<'tcx>,
) -> Vec<CodegenUnit<'tcx>>
where
I: Iterator<Item = MonoItem<'tcx>>,
{
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning");

let cx = &PartitioningCx { tcx, target_cgu_count: max_cgu_count, usage_map };
let cx = &PartitioningCx { tcx, usage_map };

// In the first step, we place all regular monomorphizations into their
// respective 'home' codegen unit. Regular monomorphizations are all
// functions and statics defined in the local crate.
let PlacedRootMonoItems { mut codegen_units, internalization_candidates } = {
let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = {
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots");
place_root_mono_items(cx, mono_items)
};
@@ -158,15 +162,15 @@ where
cgu.create_size_estimate(tcx);
}

debug_dump(tcx, "INITIAL PARTITIONING", &codegen_units);
debug_dump(tcx, "ROOTS", &codegen_units, unique_inlined_stats);

// Merge until we have at most `max_cgu_count` codegen units.
// `merge_codegen_units` is responsible for updating the CGU size
// estimates.
{
let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
merge_codegen_units(cx, &mut codegen_units);
debug_dump(tcx, "POST MERGING", &codegen_units);
debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
}

// In the next step, we use the inlining map to determine which additional
@@ -182,7 +186,7 @@ where
cgu.create_size_estimate(tcx);
}

debug_dump(tcx, "POST INLINING", &codegen_units);
debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);

// Next we try to make as many symbols "internal" as possible, so LLVM has
// more freedom to optimize.
@@ -226,7 +230,7 @@ where
// Ensure CGUs are sorted by name, so that we get deterministic results.
assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));

debug_dump(tcx, "FINAL", &codegen_units);
debug_dump(tcx, "FINAL", &codegen_units, unique_inlined_stats);

codegen_units
}
@@ -252,10 +256,16 @@ where
let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
let cgu_name_cache = &mut FxHashMap::default();

let mut num_unique_inlined_items = 0;
let mut unique_inlined_items_size = 0;
for mono_item in mono_items {
match mono_item.instantiation_mode(cx.tcx) {
InstantiationMode::GloballyShared { .. } => {}
InstantiationMode::LocalCopy => continue,
InstantiationMode::LocalCopy => {
num_unique_inlined_items += 1;
unique_inlined_items_size += mono_item.size_estimate(cx.tcx);
continue;
}
}

let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item);
@@ -300,7 +310,11 @@ where
let mut codegen_units: Vec<_> = codegen_units.into_values().collect();
codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str()));

PlacedRootMonoItems { codegen_units, internalization_candidates }
PlacedRootMonoItems {
codegen_units,
internalization_candidates,
unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size),
}
}

// This function requires the CGUs to be sorted by name on input, and ensures
Expand All @@ -309,7 +323,7 @@ fn merge_codegen_units<'tcx>(
cx: &PartitioningCx<'_, 'tcx>,
codegen_units: &mut Vec<CodegenUnit<'tcx>>,
) {
assert!(cx.target_cgu_count >= 1);
assert!(cx.tcx.sess.codegen_units().as_usize() >= 1);

// A sorted order here ensures merging is deterministic.
assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
@@ -318,11 +332,32 @@
let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();

// Merge the two smallest codegen units until the target size is
// reached.
while codegen_units.len() > cx.target_cgu_count {
// Sort small cgus to the back
// Having multiple CGUs can drastically speed up compilation. But for
// non-incremental builds, tiny CGUs slow down compilation *and* result in
// worse generated code. So we don't allow CGUs smaller than this (unless
// there is just one CGU, of course). Note that CGU sizes of 100,000+ are
// common in larger programs, so this isn't all that large.
const NON_INCR_MIN_CGU_SIZE: usize = 1000;

// Repeatedly merge the two smallest codegen units as long as:
// - we have more CGUs than the upper limit, or
// - (Non-incremental builds only) the user didn't specify a CGU count, and
// there are multiple CGUs, and some are below the minimum size.
//
// The "didn't specify a CGU count" condition is because when an explicit
// count is requested we observe it as closely as possible. For example,
// the `compiler_builtins` crate sets `codegen-units = 10000` and it's
// critical they aren't merged. Also, some tests use explicit small values
// and likewise won't work if small CGUs are merged.
while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
|| (cx.tcx.sess.opts.incremental.is_none()
&& matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
&& codegen_units.len() > 1
&& codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
{
// Sort small cgus to the back.
codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));

let mut smallest = codegen_units.pop().unwrap();
let second_smallest = codegen_units.last_mut().unwrap();

@@ -814,47 +849,147 @@ fn default_visibility(tcx: TyCtxt<'_>, id: DefId, is_generic: bool) -> Visibilit
}
}

fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<'tcx>]) {
fn debug_dump<'a, 'tcx: 'a>(
tcx: TyCtxt<'tcx>,
label: &str,
cgus: &[CodegenUnit<'tcx>],
(unique_inlined_items, unique_inlined_size): (usize, usize),
) {
let dump = move || {
use std::fmt::Write;

let num_cgus = cgus.len();
let num_items: usize = cgus.iter().map(|cgu| cgu.items().len()).sum();
let total_size: usize = cgus.iter().map(|cgu| cgu.size_estimate()).sum();
let max_size = cgus.iter().map(|cgu| cgu.size_estimate()).max().unwrap();
let min_size = cgus.iter().map(|cgu| cgu.size_estimate()).min().unwrap();
let max_min_size_ratio = max_size as f64 / min_size as f64;
let mut num_cgus = 0;
let mut all_cgu_sizes = Vec::new();

// Note: every unique root item is placed exactly once, so the number
// of unique root items always equals the number of placed root items.

let mut root_items = 0;
// unique_inlined_items is passed in above.
let mut placed_inlined_items = 0;

let mut root_size = 0;
// unique_inlined_size is passed in above.
let mut placed_inlined_size = 0;

for cgu in cgus.iter() {
num_cgus += 1;
all_cgu_sizes.push(cgu.size_estimate());

for (item, _) in cgu.items() {
match item.instantiation_mode(tcx) {
InstantiationMode::GloballyShared { .. } => {
root_items += 1;
root_size += item.size_estimate(tcx);
}
InstantiationMode::LocalCopy => {
placed_inlined_items += 1;
placed_inlined_size += item.size_estimate(tcx);
}
}
}
}

all_cgu_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));

let unique_items = root_items + unique_inlined_items;
let placed_items = root_items + placed_inlined_items;
let items_ratio = placed_items as f64 / unique_items as f64;

let unique_size = root_size + unique_inlined_size;
let placed_size = root_size + placed_inlined_size;
let size_ratio = placed_size as f64 / unique_size as f64;

let mean_cgu_size = placed_size as f64 / num_cgus as f64;

assert_eq!(placed_size, all_cgu_sizes.iter().sum::<usize>());

let s = &mut String::new();
let _ = writeln!(s, "{label}");
let _ = writeln!(
s,
"{label} ({num_items} items, total_size={total_size}; {num_cgus} CGUs, \
max_size={max_size}, min_size={min_size}, max_size/min_size={max_min_size_ratio:.1}):"
"- unique items: {unique_items} ({root_items} root + {unique_inlined_items} inlined), \
unique size: {unique_size} ({root_size} root + {unique_inlined_size} inlined)\n\
- placed items: {placed_items} ({root_items} root + {placed_inlined_items} inlined), \
placed size: {placed_size} ({root_size} root + {placed_inlined_size} inlined)\n\
- placed/unique items ratio: {items_ratio:.2}, \
placed/unique size ratio: {size_ratio:.2}\n\
- CGUs: {num_cgus}, mean size: {mean_cgu_size:.1}, sizes: {}",
list(&all_cgu_sizes),
);
let _ = writeln!(s);

for (i, cgu) in cgus.iter().enumerate() {
let name = cgu.name();
let size = cgu.size_estimate();
let num_items = cgu.items().len();
let _ = writeln!(
s,
"- CGU[{i}] {} ({num_items} items, size={}):",
cgu.name(),
cgu.size_estimate()
);
let mean_size = size as f64 / num_items as f64;

let mut placed_item_sizes: Vec<_> =
cgu.items().iter().map(|(item, _)| item.size_estimate(tcx)).collect();
placed_item_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
let sizes = list(&placed_item_sizes);

let _ = writeln!(s, "- CGU[{i}]");
let _ = writeln!(s, " - {name}, size: {size}");
let _ =
writeln!(s, " - items: {num_items}, mean size: {mean_size:.1}, sizes: {sizes}",);

for (item, linkage) in cgu.items_in_deterministic_order(tcx) {
let symbol_name = item.symbol_name(tcx).name;
let symbol_hash_start = symbol_name.rfind('h');
let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
let size = item.size_estimate(tcx);
let kind = match item.instantiation_mode(tcx) {
InstantiationMode::GloballyShared { .. } => "root",
InstantiationMode::LocalCopy => "inlined",
};
let _ = with_no_trimmed_paths!(writeln!(
s,
" - {item} [{linkage:?}] [{symbol_hash}] (size={size})"
" - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})"
));
}

let _ = writeln!(s);
}

std::mem::take(s)
return std::mem::take(s);

// Converts a slice to a string, capturing repetitions to save space.
// E.g. `[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]` -> "[4 (x3), 3, 2, 1 (x5)]".
fn list(ns: &[usize]) -> String {
let mut v = Vec::new();
if ns.is_empty() {
return "[]".to_string();
}

let mut elem = |curr, curr_count| {
if curr_count == 1 {
v.push(format!("{curr}"));
} else {
v.push(format!("{curr} (x{curr_count})"));
}
};

let mut curr = ns[0];
let mut curr_count = 1;

for &n in &ns[1..] {
if n != curr {
elem(curr, curr_count);
curr = n;
curr_count = 1;
} else {
curr_count += 1;
}
}
elem(curr, curr_count);

let mut s = "[".to_string();
s.push_str(&v.join(", "));
s.push_str("]");
s
}
};

debug!("{}", dump());
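The nested `list` helper above is what compresses runs of equal sizes in the dump. For illustration, a standalone restatement of it with the example input from its comment, runnable outside the compiler:

```rust
// Run-length formatting of a size list, as used by debug_dump.
fn list(ns: &[usize]) -> String {
    if ns.is_empty() {
        return "[]".to_string();
    }
    let mut v = Vec::new();
    let mut elem = |curr: usize, curr_count: usize| {
        if curr_count == 1 {
            v.push(format!("{curr}"));
        } else {
            v.push(format!("{curr} (x{curr_count})"));
        }
    };
    let mut curr = ns[0];
    let mut curr_count = 1;
    for &n in &ns[1..] {
        if n != curr {
            elem(curr, curr_count);
            curr = n;
            curr_count = 1;
        } else {
            curr_count += 1;
        }
    }
    elem(curr, curr_count);
    format!("[{}]", v.join(", "))
}

fn main() {
    // Prints: [4 (x3), 3, 2, 1 (x5)]
    println!("{}", list(&[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]));
}
```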
@@ -922,8 +1057,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
let (codegen_units, _) = tcx.sess.time("partition_and_assert_distinct_symbols", || {
sync::join(
|| {
let mut codegen_units =
partition(tcx, items.iter().copied(), tcx.sess.codegen_units(), &usage_map);
let mut codegen_units = partition(tcx, items.iter().copied(), &usage_map);
codegen_units[0].make_primary();
&*tcx.arena.alloc_from_iter(codegen_units)
},