Skip to content

Commit c6db014

Browse files
Implement intrinsic for swapping values
This allows moving target- and backend-specific optimizations from library code to codegen. It should also make const eval/Miri evaluation simpler. The main optimization implemented in this PR makes the backend generate swaps without using allocas, removing unnecessary memory writes and reads and reducing stack usage. One of the main optimizations is using larger integer chunks for swapping on x86_64 by utilizing unaligned reads/writes. This reduces code size (especially for debug builds) and prevents cases of ineffective vectorization like `load <4 x i8>` (LLVM doesn't vectorize it further despite vectorizing `load i32`). Also added more tests.
1 parent 81c02da commit c6db014

File tree

22 files changed

+1309
-113
lines changed

22 files changed

+1309
-113
lines changed

compiler/rustc_codegen_cranelift/src/driver/jit.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ fn dep_symbol_lookup_fn(
325325
Linkage::NotLinked | Linkage::IncludedFromDylib => {}
326326
Linkage::Static => {
327327
let name = crate_info.crate_name[&cnum];
328-
let mut err = sess.struct_err(&format!("Can't load static lib {}", name));
328+
let mut err = sess.struct_err(format!("Can't load static lib {}", name));
329329
err.note("rustc_codegen_cranelift can only load dylibs in JIT mode.");
330330
err.emit();
331331
}

compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs

+124
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,130 @@ fn codegen_regular_intrinsic_call<'tcx>(
567567
// FIXME use emit_small_memset
568568
fx.bcx.call_memset(fx.target_config, dst_ptr, val, count);
569569
}
570+
571+
sym::swap_nonoverlapping_single => {
    // Swaps the pointees of `x_ptr` and `y_ptr` through a temporary stack slot:
    // tmp <- *x, *x <- *y, *y <- tmp.
    intrinsic_args!(fx, args => (x_ptr, y_ptr); intrinsic);
    let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty;
    let pointee_layout = fx.layout_of(pointee_ty);

    // Swapping ZSTs is a no-op.
    if pointee_layout.size != Size::ZERO {
        // Probably, it would be better to have a dedicated method for this in
        // `cranelift_frontend::FunctionBuilder`
        // with optimizations based on size and alignment of values.

        let x_ptr_val = x_ptr.load_scalar(fx);
        let y_ptr_val = y_ptr.load_scalar(fx);

        let tmp_place = CPlace::new_stack_slot(fx, pointee_layout);
        let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx);

        let size_bytes = pointee_layout.size.bytes();
        // `emit_small_memory_copy` takes alignments as `u8`. Alignments are powers of
        // two, so the largest one that fits is 128; claiming a smaller alignment than
        // the real one is always valid, whereas unwrapping the conversion would panic
        // for over-aligned types (e.g. `#[repr(align(4096))]`).
        let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap_or(128);

        // Each pair is `(destination, source)`; the order implements the swap.
        let copies = [
            (tmp_ptr_val, x_ptr_val),
            (x_ptr_val, y_ptr_val),
            (y_ptr_val, tmp_ptr_val),
        ];
        for (dst_ptr_val, src_ptr_val) in copies {
            fx.bcx.emit_small_memory_copy(
                fx.target_config,
                dst_ptr_val,
                src_ptr_val,
                size_bytes,
                align_bytes,
                align_bytes,
                true,
                MemFlags::trusted(),
            );
        }
    }
}
622+
623+
sym::swap_nonoverlapping_many => {
    // Swaps `count` consecutive elements starting at `x_ptr` with the ones starting
    // at `y_ptr`, one element per loop iteration, through a temporary stack slot.
    intrinsic_args!(fx, args => (x_ptr, y_ptr, count); intrinsic);
    let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty;
    let pointee_layout = fx.layout_of(pointee_ty);

    // Swapping ZSTs is a no-op.
    if pointee_layout.size != Size::ZERO {
        let x_ptr_val = x_ptr.load_scalar(fx);
        let y_ptr_val = y_ptr.load_scalar(fx);

        let count = count.load_scalar(fx);

        let tmp_place = CPlace::new_stack_slot(fx, pointee_layout);
        let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx);

        let elem_size_bytes = pointee_layout.size.bytes();
        let elem_stride =
            i64::try_from(elem_size_bytes).expect("layout size must fit in i64");
        // `emit_small_memory_copy` takes alignments as `u8`. Alignments are powers of
        // two, so the largest one that fits is 128; claiming a smaller alignment than
        // the real one is always valid, whereas unwrapping the conversion would panic
        // for over-aligned types (e.g. `#[repr(align(4096))]`).
        let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap_or(128);

        let loop_header = fx.bcx.create_block();
        let loop_body = fx.bcx.create_block();
        let loop_done = fx.bcx.create_block();

        // The element index is carried around the loop as a block parameter of the header.
        let index = fx.bcx.append_block_param(loop_header, fx.pointer_type);
        let zero = fx.bcx.ins().iconst(fx.pointer_type, 0);
        fx.bcx.ins().jump(loop_header, &[zero]);

        fx.bcx.switch_to_block(loop_header);
        let is_done = fx.bcx.ins().icmp(IntCC::Equal, index, count);
        fx.bcx.ins().brif(is_done, loop_done, &[], loop_body, &[]);

        fx.bcx.switch_to_block(loop_body);
        // `index` counts elements, while the pointers are byte addresses: scale the
        // index by the element size before offsetting. Adding the raw index would
        // swap overlapping, misaligned windows for any element larger than one byte.
        let byte_offset = fx.bcx.ins().imul_imm(index, elem_stride);
        let curr_x_ptr_val = fx.bcx.ins().iadd(x_ptr_val, byte_offset);
        let curr_y_ptr_val = fx.bcx.ins().iadd(y_ptr_val, byte_offset);

        // Each pair is `(destination, source)`; the order implements the swap.
        let copies = [
            (tmp_ptr_val, curr_x_ptr_val),
            (curr_x_ptr_val, curr_y_ptr_val),
            (curr_y_ptr_val, tmp_ptr_val),
        ];
        for (dst_ptr_val, src_ptr_val) in copies {
            fx.bcx.emit_small_memory_copy(
                fx.target_config,
                dst_ptr_val,
                src_ptr_val,
                elem_size_bytes,
                align_bytes,
                align_bytes,
                true,
                MemFlags::trusted(),
            );
        }
        let next_index = fx.bcx.ins().iadd_imm(index, 1);
        fx.bcx.ins().jump(loop_header, &[next_index]);

        fx.bcx.switch_to_block(loop_done);
        fx.bcx.ins().nop();
    }
}
693+
570694
sym::ctlz | sym::ctlz_nonzero => {
571695
intrinsic_args!(fx, args => (arg); intrinsic);
572696
let val = arg.load_scalar(fx);

compiler/rustc_codegen_gcc/src/builder.rs

+49
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,55 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
10701070
self.block.add_eval(None, self.context.new_call(None, memset, &[ptr, fill_byte, size]));
10711071
}
10721072

1073+
fn make_memory_loop<BodyPtrsVisitor, const VAR_COUNT: usize>(
1074+
&mut self,
1075+
loop_name: &str,
1076+
start_ptrs: [Self::Value; VAR_COUNT],
1077+
steps: [Size; VAR_COUNT],
1078+
iterations: Self::Value,
1079+
body_visitor: BodyPtrsVisitor,
1080+
) where
1081+
BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]),
1082+
{
1083+
assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero.");
1084+
1085+
for step in steps {
1086+
assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected.");
1087+
}
1088+
1089+
let header_bb = self.append_sibling_block(&format!("{}_header", loop_name));
1090+
let body_bb = self.append_sibling_block(&format!("{}_body", loop_name));
1091+
let next_bb = self.append_sibling_block(&format!("{}_next", loop_name));
1092+
1093+
let zero = self.const_usize(0);
1094+
let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes()));
1095+
1096+
let loop_i = self.llbb().get_function().new_local(None, self.type_size_t(), "loop_i");
1097+
self.assign(loop_i, zero);
1098+
let loop_i_val = loop_i.to_rvalue();
1099+
1100+
self.br(header_bb);
1101+
1102+
self.switch_to_block(header_bb);
1103+
let keep_going = self.icmp(IntPredicate::IntNE, loop_i_val, iterations);
1104+
self.cond_br(keep_going, body_bb, next_bb);
1105+
1106+
self.switch_to_block(body_bb);
1107+
let current_ptrs: [Self::Value; VAR_COUNT] = core::array::from_fn(
1108+
|i|{
1109+
let start = self.pointercast(start_ptrs[i], self.type_i8p());
1110+
let offset = self.unchecked_umul(additions[i], loop_i_val);
1111+
self.inbounds_gep(self.type_i8(), start, &[offset])
1112+
}
1113+
);
1114+
body_visitor(self, &current_ptrs);
1115+
let next_i = self.unchecked_uadd(loop_i_val, self.const_usize(1));
1116+
self.assign(loop_i, next_i);
1117+
self.br(header_bb);
1118+
1119+
self.switch_to_block(next_bb);
1120+
}
1121+
10731122
fn select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, mut else_val: RValue<'gcc>) -> RValue<'gcc> {
10741123
let func = self.current_func();
10751124
let variable = func.new_local(None, then_val.get_type(), "selectVar");

compiler/rustc_codegen_llvm/src/builder.rs

+48
Original file line numberDiff line numberDiff line change
@@ -935,6 +935,54 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
935935
}
936936
}
937937

938+
fn make_memory_loop<BodyPtrsVisitor, const VAR_COUNT: usize>(
939+
&mut self,
940+
loop_name: &str,
941+
start_ptrs: [Self::Value; VAR_COUNT],
942+
steps: [Size; VAR_COUNT],
943+
iterations: Self::Value,
944+
body_visitor: BodyPtrsVisitor,
945+
) where
946+
BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]),
947+
{
948+
const {
949+
assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero.");
950+
}
951+
for step in steps {
952+
assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected.");
953+
}
954+
955+
let zero = self.const_usize(0);
956+
let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes()));
957+
958+
let header_bb = self.append_sibling_block(&format!("{}_header", loop_name));
959+
let body_bb = self.append_sibling_block(&format!("{}_body", loop_name));
960+
let next_bb = self.append_sibling_block(&format!("{}_next", loop_name));
961+
self.br(header_bb);
962+
963+
let mut header_bx = Builder::build(self.cx, header_bb);
964+
// Use integer for iteration instead of pointers because LLVM canonicalize loop into indexed anyway.
965+
let loop_i = header_bx.phi(self.type_isize(), &[zero], &[self.llbb()]);
966+
let keep_going = header_bx.icmp(IntPredicate::IntNE, loop_i, iterations);
967+
header_bx.cond_br(keep_going, body_bb, next_bb);
968+
969+
let mut body_bx = Builder::build(self.cx, body_bb);
970+
let current_ptrs: [Self::Value; VAR_COUNT] = std::array::from_fn(|i| {
971+
let start = start_ptrs[i];
972+
// FIXME: Remove pointercast after dropping supporting of LLVM 14.
973+
let start = self.pointercast(start, self.type_i8p());
974+
let addition = additions[i];
975+
let offset = body_bx.unchecked_umul(loop_i, addition);
976+
body_bx.inbounds_gep(body_bx.type_i8(), start, &[offset])
977+
});
978+
body_visitor(&mut body_bx, &current_ptrs);
979+
let next_i = body_bx.unchecked_uadd(loop_i, body_bx.const_usize(1));
980+
header_bx.add_incoming_to_phi(loop_i, next_i, body_bb);
981+
body_bx.br(header_bb);
982+
983+
*self = Builder::build(self.cx, next_bb);
984+
}
985+
938986
fn select(
939987
&mut self,
940988
cond: &'ll Value,

compiler/rustc_codegen_llvm/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")]
88
#![feature(extern_types)]
99
#![feature(hash_raw_entry)]
10+
#![feature(inline_const)]
1011
#![feature(iter_intersperse)]
1112
#![feature(let_chains)]
1213
#![feature(never_type)]

0 commit comments

Comments
 (0)