[X86] For minsize memset/memcpy, use byte or double-word accesses #87003

Merged: 3 commits, Oct 6, 2024
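To illustrate the intent of the change (a sketch, not part of the PR): for a constant-size copy in a minsize (-Oz) function, the backend now emits a single rep movsb instead of tail-calling memcpy or using a wider rep movsl/movsq plus leftover stores, as the test updates at the bottom of the diff show. A minimal C++ reproducer, assuming clang targeting x86-64:

```cpp
// Hypothetical reproducer (not taken from the PR): a fixed 64-byte copy that,
// when built with -Oz for x86-64, should now lower to a single "rep movsb".
#include <cstring>

struct Blob { char bytes[64]; };

void copy_blob(Blob *dst, const Blob *src) {
  std::memcpy(dst, src, sizeof(Blob)); // constant size, minsize function
}
```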
248 changes: 148 additions & 100 deletions llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -28,6 +28,23 @@ static cl::opt<bool>
UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
cl::desc("Use fast short rep mov in memcpy lowering"));

/// Returns the best type to use with repmovs/repstos depending on alignment.
static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
uint64_t Align = Alignment.value();
assert((Align != 0) && "Align is normalized");
assert(isPowerOf2_64(Align) && "Align is a power of 2");
switch (Align) {
case 1:
return MVT::i8;
case 2:
return MVT::i16;
case 4:
return MVT::i32;
default:
return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
}
}

bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
// We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -44,102 +61,127 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
}

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256)
return SDValue();
/// Emit a single REP STOS{B,W,D,Q} instruction.
static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, SDValue Chain, SDValue Dst,
SDValue Val, SDValue Size, MVT AVT) {
const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
unsigned AX = X86::AL;
switch (AVT.getSizeInBits()) {
case 8:
AX = X86::AL;
break;
case 16:
AX = X86::AX;
break;
case 32:
AX = X86::EAX;
break;
default:
AX = X86::RAX;
break;
}

// If the base register might conflict with our physical registers, bail out.
const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
X86::ECX, X86::EAX, X86::EDI};
if (isBaseRegConflictPossible(DAG, ClobberSet))
return SDValue();
const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;

ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
const X86Subtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<X86Subtarget>();
SDValue InGlue;
Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
InGlue = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
InGlue = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
InGlue = Chain.getValue(1);

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
}

/// Emit a single REP STOSB instruction for a particular constant size.
static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, SDValue Chain, SDValue Dst,
SDValue Val, uint64_t Size) {
return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns a REP STOS instruction, possibly with a few load/stores to implement
/// a constant size memory set. In some cases where we know REP STOS is
/// inefficient, we return an empty SDValue so the calling code can either
/// generate a store sequence or call the runtime memset function.
static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &dl, SDValue Chain,
SDValue Dst, SDValue Val, uint64_t Size,
EVT SizeVT, Align Alignment,
bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) {
/// When optimizing for size, use repstosb even if it is less efficient, so we
/// avoid the extra loads/stores for the leftover bytes.
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
// Special case 0 because otherwise we get large literals,
// which cause a larger encoding.
if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
MVT BlockType = MVT::i32;
const uint64_t BlockBits = BlockType.getSizeInBits();
const uint64_t BlockBytes = BlockBits / 8;
const uint64_t BlockCount = Size / BlockBytes;

Val = DAG.getConstant(0, dl, BlockType);
// repstosd is the same size as repstosb
return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
}
}
return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
}

if (Size > Subtarget.getMaxInlineSizeThreshold())
return SDValue();

// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
if (Alignment < Align(4) || !ConstantSize ||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
if (Alignment < Align(4))
return SDValue();

uint64_t SizeVal = ConstantSize->getZExtValue();
SDValue InGlue;
EVT AVT;
SDValue Count;
unsigned BytesLeft = 0;
MVT BlockType = MVT::i8;
uint64_t BlockCount = Size;
uint64_t BytesLeft = 0;
if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
unsigned ValReg;
uint64_t Val = ValC->getZExtValue() & 255;

// If the value is a constant, then we can potentially use larger sets.
if (Alignment >= Align(4)) {
// DWORD aligned
AVT = MVT::i32;
ValReg = X86::EAX;
Val = (Val << 8) | Val;
Val = (Val << 16) | Val;
if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
AVT = MVT::i64;
ValReg = X86::RAX;
Val = (Val << 32) | Val;
}
} else if (Alignment == Align(2)) {
// WORD aligned
AVT = MVT::i16;
ValReg = X86::AX;
Val = (Val << 8) | Val;
} else {
// Byte aligned
AVT = MVT::i8;
ValReg = X86::AL;
Count = DAG.getIntPtrConstant(SizeVal, dl);
}
BlockType = getOptimalRepType(Subtarget, Alignment);
uint64_t Value = ValC->getZExtValue() & 255;
const uint64_t BlockBits = BlockType.getSizeInBits();

if (AVT.bitsGT(MVT::i8)) {
unsigned UBytes = AVT.getSizeInBits() / 8;
Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
BytesLeft = SizeVal % UBytes;
}
if (BlockBits >= 16)
Value = (Value << 8) | Value;

Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
InGlue);
InGlue = Chain.getValue(1);
} else {
AVT = MVT::i8;
Count = DAG.getIntPtrConstant(SizeVal, dl);
Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
InGlue = Chain.getValue(1);
}
if (BlockBits >= 32)
Value = (Value << 16) | Value;

bool Use64BitRegs = Subtarget.isTarget64BitLP64();
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
Count, InGlue);
InGlue = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
Dst, InGlue);
InGlue = Chain.getValue(1);
if (BlockBits >= 64)
Value = (Value << 32) | Value;

SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
const uint64_t BlockBytes = BlockBits / 8;
BlockCount = Size / BlockBytes;
BytesLeft = Size % BlockBytes;
Val = DAG.getConstant(Value, dl, BlockType);
}

SDValue RepStos =
emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
/// RepStos can process the whole length.
if (BytesLeft == 0)
return RepStos;

// Handle the last 1 - 7 bytes.
SmallVector<SDValue, 4> Results;
Results.push_back(RepStos);
unsigned Offset = SizeVal - BytesLeft;
unsigned Offset = Size - BytesLeft;
EVT AddrVT = Dst.getValueType();
EVT SizeVT = Size.getValueType();

Results.push_back(
DAG.getMemset(Chain, dl,
@@ -152,6 +194,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
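A small, self-contained sketch of the block-splitting arithmetic used by emitConstantSizeRepstos above; the 1003-byte, 8-byte-aligned memset is an invented example, not a value from the PR:

```cpp
// Illustrative-only model of the BlockCount/BytesLeft computation.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Size = 1003;                    // constant memset length in bytes
  const uint64_t BlockBits = 64;                 // getOptimalRepType -> i64 for Align >= 8 on x86-64
  const uint64_t BlockBytes = BlockBits / 8;     // 8 bytes stored per REP STOSQ iteration
  const uint64_t BlockCount = Size / BlockBytes; // 125 iterations
  const uint64_t BytesLeft = Size % BlockBytes;  // 3 trailing bytes, handled by DAG.getMemset
  assert(BlockCount == 125 && BytesLeft == 3);
  return 0;
}
```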

SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo) const {
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256)
return SDValue();

// If the base register might conflict with our physical registers, bail out.
const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
X86::ECX, X86::EAX, X86::EDI};
if (isBaseRegConflictPossible(DAG, ClobberSet))
return SDValue();

ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
return SDValue();

const X86Subtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<X86Subtarget>();
return emitConstantSizeRepstos(
DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
}

/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +249,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

/// Returns the best type to use with repmovs depending on alignment.
static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
Align Alignment) {
uint64_t Align = Alignment.value();
assert((Align != 0) && "Align is normalized");
assert(isPowerOf2_64(Align) && "Align is a power of 2");
switch (Align) {
case 1:
return MVT::i8;
case 2:
return MVT::i16;
case 4:
return MVT::i32;
default:
return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
}
}

/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
/// a constant size memory copy. In some cases where we know REP MOVS is
/// inefficient we return an empty SDValue so the calling code can either
@@ -209,6 +258,10 @@ static SDValue emitConstantSizeRepmov(
SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
/// When optimizing for size, use repmovsb even if it is less efficient, so we
/// avoid the extra loads/stores for the leftover bytes.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

/// TODO: Revisit the next line: big copies with ERMSB on march >= haswell are
/// very efficient.
@@ -222,10 +275,10 @@ static SDValue emitConstantSizeRepmov(
assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
/// We assume runtime memcpy will do a better job for unaligned copies when
/// ERMS is not present.
if (!AlwaysInline && (Alignment.value() & 3) != 0)
if (!AlwaysInline && (Alignment < Align(4)))
return SDValue();

const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
const uint64_t BlockCount = Size / BlockBytes;
const uint64_t BytesLeft = Size % BlockBytes;
@@ -239,11 +292,6 @@ static SDValue emitConstantSizeRepmov(

assert(BytesLeft && "We have leftover at this point");

/// In case we optimize for size we use repmovsb even if it's less efficient
/// so we can save the loads/stores of the leftover.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

// Handle the last 1 - 7 bytes.
SmallVector<SDValue, 4> Results;
Results.push_back(RepMovs);
@@ -282,7 +330,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
if (UseFSRMForMemcpy && Subtarget.hasFSRM())
return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

/// Handle constant sizes,
/// Handle constant sizes
if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
ConstantSize->getZExtValue(),
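A condensed model of the minsize memset decision implemented in the file above; the names and the enum are simplifications for illustration, not the actual LLVM API, and the subtarget size-threshold check from the real code is omitted:

```cpp
// Illustrative pseudo-logic only; the real lowering operates on SelectionDAG nodes.
#include <cstdint>

enum class MemsetLowering { RepStosB, RepStosD, RepWide, LibCallOrStores };

MemsetLowering chooseMemsetLowering(bool hasMinSize, uint64_t size,
                                    uint8_t value, uint64_t align) {
  if (hasMinSize) {
    // Zero fills whose size is a multiple of 32 use rep stosd: a zero block
    // value avoids a large splat immediate, and the encoding is no bigger
    // than rep stosb.
    if (value == 0 && size % 32 == 0)
      return MemsetLowering::RepStosD;
    return MemsetLowering::RepStosB; // smallest code, no leftover handling
  }
  if (align < 4)
    return MemsetLowering::LibCallOrStores; // defer to generic lowering / libc
  return MemsetLowering::RepWide; // widest type the alignment allows, plus leftovers
}
```

This mirrors emitConstantSizeRepstos: a minsize function always inlines a rep stos so no leftover loads/stores are needed, while the default path still prefers the widest element type the alignment allows.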
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
@@ -78,9 +78,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
; NOFAST32-NEXT: pushl %esi
; NOFAST32-NEXT: subl $4100, %esp # imm = 0x1004
; NOFAST32-NEXT: movl {{[0-9]+}}(%esp), %esi
; NOFAST32-NEXT: movl $1024, %ecx # imm = 0x400
; NOFAST32-NEXT: movl $4096, %ecx # imm = 0x1000
; NOFAST32-NEXT: movl %esp, %edi
; NOFAST32-NEXT: rep;movsl (%esi), %es:(%edi)
; NOFAST32-NEXT: rep;movsb (%esi), %es:(%edi)
; NOFAST32-NEXT: calll foo@PLT
; NOFAST32-NEXT: addl $4100, %esp # imm = 0x1004
; NOFAST32-NEXT: popl %esi
@@ -106,9 +106,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
; NOFAST: # %bb.0:
; NOFAST-NEXT: subq $4104, %rsp # imm = 0x1008
; NOFAST-NEXT: movq %rdi, %rsi
; NOFAST-NEXT: movl $512, %ecx # imm = 0x200
; NOFAST-NEXT: movl $4096, %ecx # imm = 0x1000
; NOFAST-NEXT: movq %rsp, %rdi
; NOFAST-NEXT: rep;movsq (%rsi), %es:(%rdi)
; NOFAST-NEXT: rep;movsb (%rsi), %es:(%rdi)
; NOFAST-NEXT: callq foo@PLT
; NOFAST-NEXT: addq $4104, %rsp # imm = 0x1008
; NOFAST-NEXT: retq
20 changes: 12 additions & 8 deletions llvm/test/CodeGen/X86/memcpy.ll
@@ -202,14 +202,16 @@ define void @test3_minsize(ptr nocapture %A, ptr nocapture %B) nounwind minsize
; DARWIN-LABEL: test3_minsize:
; DARWIN: ## %bb.0:
; DARWIN-NEXT: pushq $64
; DARWIN-NEXT: popq %rdx
; DARWIN-NEXT: jmp _memcpy ## TAILCALL
; DARWIN-NEXT: popq %rcx
; DARWIN-NEXT: rep;movsb (%rsi), %es:(%rdi)
; DARWIN-NEXT: retq
;
; LINUX-LABEL: test3_minsize:
; LINUX: # %bb.0:
; LINUX-NEXT: pushq $64
; LINUX-NEXT: popq %rdx
; LINUX-NEXT: jmp memcpy@PLT # TAILCALL
; LINUX-NEXT: popq %rcx
; LINUX-NEXT: rep;movsb (%rsi), %es:(%rdi)
; LINUX-NEXT: retq
;
; LINUX-SKL-LABEL: test3_minsize:
; LINUX-SKL: # %bb.0:
@@ -249,14 +251,16 @@ define void @test3_minsize_optsize(ptr nocapture %A, ptr nocapture %B) nounwind
; DARWIN-LABEL: test3_minsize_optsize:
; DARWIN: ## %bb.0:
; DARWIN-NEXT: pushq $64
; DARWIN-NEXT: popq %rdx
; DARWIN-NEXT: jmp _memcpy ## TAILCALL
; DARWIN-NEXT: popq %rcx
; DARWIN-NEXT: rep;movsb (%rsi), %es:(%rdi)
; DARWIN-NEXT: retq
;
; LINUX-LABEL: test3_minsize_optsize:
; LINUX: # %bb.0:
; LINUX-NEXT: pushq $64
; LINUX-NEXT: popq %rdx
; LINUX-NEXT: jmp memcpy@PLT # TAILCALL
; LINUX-NEXT: popq %rcx
; LINUX-NEXT: rep;movsb (%rsi), %es:(%rdi)
; LINUX-NEXT: retq
;
; LINUX-SKL-LABEL: test3_minsize_optsize:
; LINUX-SKL: # %bb.0: