Unroll Buffer.Memmove for arm64 #83740

Merged: 14 commits, Mar 25, 2023
137 changes: 136 additions & 1 deletion src/coreclr/jit/codegenarmarch.cpp
@@ -3050,6 +3050,132 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
}
}

//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
// ignore the fact that dst and src might overlap if we save the whole
// src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
//
// ldr q16, [x0]
// ldr q17, [x0, #0x0E]
// str q16, [x1]
// str q17, [x1, #0x0E]
//
// Arguments:
// tree - GenTreeBlk node
//
void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
{
#ifdef TARGET_ARM64
// TODO-CQ: Support addressing modes, for now we don't use them
GenTreeIndir* srcIndir = tree->Data()->AsIndir();
assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());

regNumber dst = genConsumeReg(tree->Addr());
regNumber src = genConsumeReg(srcIndir->Addr());
unsigned size = tree->Size();

auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
var_types memType;
switch (regSize)
{
case 1:
memType = TYP_UBYTE;
break;
case 2:
memType = TYP_USHORT;
break;
case 4:
memType = TYP_INT;
break;
case 8:
memType = TYP_LONG;
break;
case 16:
memType = TYP_SIMD16;
break;
default:
unreached();
}
if (load)
{
GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
}
else
{
GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
}
};

// Eventually we want to emit CPYP+CPYM+CPYE on armv9 for large sizes

// TODO-CQ: Emit stp/ldp (32 bytes at once).
unsigned simdSize = FP_REGSIZE_BYTES;
if (size >= simdSize)
{
// Number of SIMD regs needed to save the whole src to regs.
const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);

// Pop all temp regs into a local array; currently this impl is limited by LSRA's MaxInternalCount
regNumber tempRegs[LinearScan::MaxInternalCount] = {};
for (unsigned i = 0; i < numberOfSimdRegs; i++)
{
tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
}

auto emitSimdLoadStore = [&](bool load) {
unsigned offset = 0;
int regIndex = 0;
do
{
emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
offset += simdSize;
if (size == offset)
{
break;
}
if ((size - offset) < simdSize)
{
// Overlap with the previously processed data. For simplicity, we always use SIMD for the remainder.
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
offset = size - simdSize;
}
} while (true);
};

// load everything from SRC to temp regs
emitSimdLoadStore(/* load */ true);
// store them to DST
emitSimdLoadStore(/* load */ false);
}
else
{
// Here we work with size 1..15
assert((size > 0) && (size < FP_REGSIZE_BYTES));

// Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
if (loadStoreSize == size)
{
const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
}
else
{
assert(tree->AvailableTempRegCount() == 2);
const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
}
}
#else // TARGET_ARM64
unreached();
#endif
}
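To see why issuing all loads before any store makes the copy overlap-safe, here is a stand-alone C++ sketch of the same 30-byte case from the header comment. This is an illustration only, not JIT code: the Block struct and memmove30_unrolled are invented for the example, with memcpy into locals standing in for the q-register loads and stores.

#include <cstdio>
#include <cstring>

struct Block { unsigned char bytes[16]; }; // stands in for a 16-byte q-register

// memmove(dst, src, 30) with all loads issued before any store
void memmove30_unrolled(unsigned char* dst, const unsigned char* src)
{
    Block t0, t1;
    std::memcpy(&t0, src, 16);      // ldr q16, [x0]
    std::memcpy(&t1, src + 14, 16); // ldr q17, [x0, #0x0E]; overlaps t0 by 2 bytes so 2*16 covers 30
    std::memcpy(dst, &t0, 16);      // str q16, [x1]
    std::memcpy(dst + 14, &t1, 16); // str q17, [x1, #0x0E]
}

int main()
{
    unsigned char buf[40];
    for (int i = 0; i < 40; i++) buf[i] = (unsigned char)i;
    memmove30_unrolled(buf + 5, buf);        // overlapping ranges: [5, 35) <- [0, 30)
    std::printf("%d %d\n", buf[5], buf[34]); // prints "0 29"
    return 0;
}

Because every byte of the source sits in a temporary before the first store executes, the overlapping destination writes cannot corrupt data that is still to be read.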

//------------------------------------------------------------------------
// genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
//
@@ -4370,13 +4496,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
break;

case GenTreeBlk::BlkOpKindUnroll:
case GenTreeBlk::BlkOpKindUnrollMemmove:
if (isCopyBlk)
{
if (blkOp->gtBlkOpGcUnsafe)
{
GetEmitter()->emitDisableGC();
}
if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
{
genCodeForCpBlkUnroll(blkOp);
}
else
{
assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
genCodeForMemmove(blkOp);
}
if (blkOp->gtBlkOpGcUnsafe)
{
GetEmitter()->emitEnableGC();
25 changes: 16 additions & 9 deletions src/coreclr/jit/compiler.h
@@ -8941,22 +8941,24 @@
//
unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
{
unsigned maxRegSize = REGSIZE_BYTES;
unsigned threshold = maxRegSize;

#if defined(FEATURE_SIMD)
if (canUseSimd)
{
maxRegSize = maxSIMDStructBytes();
#if defined(TARGET_XARCH)
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
threshold = maxRegSize;
#elif defined(TARGET_ARM64)
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
//
// ldp q0, q1, [x1]
// stp q0, q1, [x0]
//
threshold = maxRegSize * 2;
#endif
}
#if defined(TARGET_XARCH)
@@ -8991,8 +8993,13 @@
//
threshold *= 4;

if (type == UnrollKind::Memmove)
{
// NOTE: Memmove's unrolling is currently limited by LSRA -
// up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
threshold = maxRegSize * 4;
}

return threshold;
}
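Plugging arm64 values into the function above gives concrete limits. A hedged recap, assuming REGSIZE_BYTES = 8, maxSIMDStructBytes() = 16, and the multipliers shown in the code:

// maxRegSize = 16                    // one 16-byte SIMD (q) register
// threshold  = maxRegSize * 2 = 32   // ldp/stp move two vectors at once
// threshold *= 4  ->  128 bytes      // final memcpy/memset unroll limit
// Memmove: threshold = maxRegSize * 4 = 64 bytes, safely under LSRA's 5 * 16 = 80-byte temp-reg cap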

30 changes: 30 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -1603,6 +1603,36 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
return cur;
}

//---------------------------------------------------------------
// GetUserArgByIndex: Get an argument with the specified index.
// Unlike GetArgByIndex, this function ignores non-user args
// like r2r cells.
//
// Parameters:
// index - The index of the argument to find.
//
// Returns:
// A pointer to the argument.
//
// Remarks:
// This function assumes enough arguments exist.
//
CallArg* CallArgs::GetUserArgByIndex(unsigned index)
{
CallArg* cur = m_head;
for (unsigned i = 0; i < index || cur->IsArgAddedLate();)
{
if (!cur->IsArgAddedLate())
{
i++;
}
assert((cur != nullptr) && "Not enough arguments in GetUserArgByIndex");
cur = cur->GetNext();
}

return cur;
}

Review thread on this function:

Member: Should this use cur->GetWellKnownArg() != WellKnownArg::None instead? (And then the Remarks "The current implementation doesn't..." can be removed?)

Member: It would ignore the 'this' arg, and ShiftLow/ShiftHigh for the x86 shift helpers are arguably "user args" (in that they manifest in the IL). Maybe we should add an IsUserArg() or IsILArg() that has these special cases and use it here.

Member (author): Added IsUserArg()
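For symmetry, here is a possible shape for the CountUserArgs() helper declared in gentree.h below. This is a sketch using the same IsArgAddedLate() filter as the loop above; per the review thread, the merged code switched to an IsUserArg() predicate, so the committed version may differ:

unsigned CallArgs::CountUserArgs()
{
    unsigned count = 0;
    for (CallArg& arg : Args())
    {
        // Count only args that originate in IL, skipping late-added ones such as r2r cells
        if (!arg.IsArgAddedLate())
        {
            count++;
        }
    }
    return count;
}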

//---------------------------------------------------------------
// GetIndex: Get the index for the specified argument.
//
2 changes: 2 additions & 0 deletions src/coreclr/jit/gentree.h
@@ -4704,6 +4704,7 @@ class CallArgs
CallArg* GetThisArg();
CallArg* GetRetBufferArg();
CallArg* GetArgByIndex(unsigned index);
CallArg* GetUserArgByIndex(unsigned index);
unsigned GetIndex(CallArg* arg);

bool IsEmpty() const
@@ -4772,6 +4773,7 @@ class CallArgs
unsigned OutgoingArgsStackSize() const;

unsigned CountArgs();
unsigned CountUserArgs();

template <CallArg* (CallArg::*Next)()>
class CallArgIterator
41 changes: 36 additions & 5 deletions src/coreclr/jit/lower.cpp
@@ -1794,18 +1794,30 @@ GenTree* Lowering::AddrGen(void* addr)
//
GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
{
JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);

assert(call->gtArgs.CountUserArgs() == 3);

if (comp->info.compHasNextCallRetAddr)
{
JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
return nullptr;
}

GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
if (lengthArg->IsIntegralConst())
{
ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
JITDUMP("Size=%ld.. ", (LONG)cnsSize);
// TODO-CQ: drop the whole thing in case of 0
if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove)))
{
JITDUMP("Accepted for unrolling!\nOld tree:\n")
DISPTREE(call);

GenTree* dstAddr = call->gtArgs.GetUserArgByIndex(0)->GetNode();
GenTree* srcAddr = call->gtArgs.GetUserArgByIndex(1)->GetNode();

// TODO-CQ: Try to create an addressing mode
GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr);
@@ -1825,8 +1837,27 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
BlockRange().Remove(lengthArg);
BlockRange().Remove(call);

// Remove all non-user args (e.g. r2r cell)
for (CallArg& arg : call->gtArgs.Args())
{
if (arg.IsArgAddedLate())
{
BlockRange().Remove(arg.GetNode());
}
}

JITDUMP("\nNew tree:\n")
DISPTREE(storeBlk);
return storeBlk;
}
else
{
JITDUMP("Size is either 0 or too big to unroll.\n")
}
}
else
{
JITDUMP("size is not a constant.\n")
}
return nullptr;
}
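For a concrete picture of the rewrite, the sketch below shows roughly what lowering produces for a call with a constant length of 30. The shapes are illustrative (node names follow the code above; this is not a verbatim JIT dump):

CALL Buffer.Memmove                    STORE_BLK<30> (BlkOpKindUnrollMemmove)
  dstAddr   (user arg 0)        =>       dstAddr
  srcAddr   (user arg 1)                 IND<struct>(srcAddr)  // contained
  30        (user arg 2)
  r2r cell  (late arg, removed)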
@@ -1851,7 +1882,7 @@ GenTree* Lowering::LowerCall(GenTree* node)

if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
{
#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove)
{
GenTree* newNode = LowerCallMemmove(call);
47 changes: 47 additions & 0 deletions src/coreclr/jit/lsraarmarch.cpp
@@ -730,6 +730,53 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
}
break;

case GenTreeBlk::BlkOpKindUnrollMemmove:
{
#ifdef TARGET_ARM64

// Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea is that
// we can ignore the fact that dst and src might overlap if we save the whole src
// to temp regs in advance, e.g. for memmove(dst, src, 60):

// Lowering was expected to get rid of memmove in case of zero
assert(size > 0);

const unsigned simdSize = FP_REGSIZE_BYTES;
if (size >= simdSize)
{
unsigned simdRegs = size / simdSize;
if ((size % simdSize) != 0)
{
// TODO-CQ: Consider using GPR load/store here if the remainder is 1, 2, 4 or 8
simdRegs++;
}
for (unsigned i = 0; i < simdRegs; i++)
{
// It's too late to revert the unrolling here, so we rely on having enough SIMD regs -
// no more than MaxInternalCount. Currently, this is ensured by getUnrollThreshold(Memmove).
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}
}
else
{
if (isPow2(size))
{
// Single GPR for 1,2,4,8
buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
}
else
{
// Any size from 3 to 15 can be handled via two GPRs
buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
}
}
#else // TARGET_ARM64
unreached();
#endif
}
break;

case GenTreeBlk::BlkOpKindHelper:
dstAddrRegMask = RBM_ARG_0;
if (srcAddrOrFill != nullptr)
Expand Down
Loading