From 2816ebbbd25077f57284f895329a7f92e3a99ec2 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Tue, 21 Mar 2023 18:37:55 +0100
Subject: [PATCH 01/11] Unroll Buffer.Memmove for arm64

---
 src/coreclr/jit/codegenarmarch.cpp | 137 ++++++++++++++++++++++++++++-
 src/coreclr/jit/compiler.h         |  25 ++++--
 src/coreclr/jit/lower.cpp          |   2 +-
 src/coreclr/jit/lsraarmarch.cpp    |  47 ++++++++++
 4 files changed, 200 insertions(+), 11 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 8d3628038fc78..31cd7a4c0d3e2 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3050,6 +3050,132 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
+// ignore the fact that dst and src might overlap if we save the whole
+// dst to temp regs in advance, e.g. for memmove(rax, rcx, 30):
+//
+//       ldr   q16, [x0]
+//       ldr   q17, [x0, #0x0E]
+//       str   q16, [x1]
+//       str   q17, [x1, #0x0E]
+//
+// Arguments:
+//    tree - GenTreeBlk node
+//
+void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
+{
+#ifdef TARGET_ARM64
+    // TODO-CQ: Support addressing modes; for now we don't use them
+    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
+    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());
+
+    regNumber dst  = genConsumeReg(tree->Addr());
+    regNumber src  = genConsumeReg(srcIndir->Addr());
+    unsigned  size = tree->Size();
+
+    auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
+        var_types memType;
+        switch (regSize)
+        {
+            case 1:
+                memType = TYP_UBYTE;
+                break;
+            case 2:
+                memType = TYP_USHORT;
+                break;
+            case 4:
+                memType = TYP_INT;
+                break;
+            case 8:
+                memType = TYP_LONG;
+                break;
+            case 16:
+                memType = TYP_SIMD16;
+                break;
+            default:
+                unreached();
+        }
+        if (load)
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
+        }
+    };
+
+    // Eventually we want to emit CPYP+CPYM+CPYE on armv9 for large sizes
+
+    // TODO-CQ: Emit stp/ldp (32 bytes at once).
+    unsigned simdSize = FP_REGSIZE_BYTES;
+    if (size >= simdSize)
+    {
+        // Number of SIMD regs needed to save the whole src to regs.
+        const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
+
+        // Pop all temp regs to a local array; currently, this impl is limited by LSRA's MaxInternalCount
+        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
+        for (unsigned i = 0; i < numberOfSimdRegs; i++)
+        {
+            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
+        auto emitSimdLoadStore = [&](bool load) {
+            unsigned offset   = 0;
+            int      regIndex = 0;
+            do
+            {
+                emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
+                offset += simdSize;
+                if (size == offset)
+                {
+                    break;
+                }
+                if ((size - offset) < simdSize)
+                {
+                    // Overlap with the previously processed data. We'll always use SIMD for that for simplicity
+                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
+                    offset = size - simdSize;
+                }
+            } while (true);
+        };
+
+        // load everything from SRC to temp regs
+        emitSimdLoadStore(/* load */ true);
+        // store them to DST
+        emitSimdLoadStore(/* load */ false);
+    }
+    else
+    {
+        // Here we work with size 1..15
+        assert((size > 0) && (size < FP_REGSIZE_BYTES));
+
+        // Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
+        const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
+        if (loadStoreSize == size)
+        {
+            const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
+        }
+        else
+        {
+            assert(tree->AvailableTempRegCount() == 2);
+            const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
+            const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
+        }
+    }
+#else // TARGET_ARM64
+    unreached();
+#endif
+}
+
 //------------------------------------------------------------------------
 // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
 //
@@ -4370,13 +4496,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             break;
 
         case GenTreeBlk::BlkOpKindUnroll:
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
             if (isCopyBlk)
             {
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitDisableGC();
                 }
-                genCodeForCpBlkUnroll(blkOp);
+                if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+                {
+                    genCodeForCpBlkUnroll(blkOp);
+                }
+                else
+                {
+                    assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                    genCodeForMemmove(blkOp);
+                }
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitEnableGC();
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 94f14f0d67d0f..8e3d8ceff59eb 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8941,22 +8941,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     //
     unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
     {
-        unsigned threshold = TARGET_POINTER_SIZE;
+        unsigned maxRegSize = REGSIZE_BYTES;
+        unsigned threshold  = maxRegSize;
 
 #if defined(FEATURE_SIMD)
         if (canUseSimd)
         {
-            threshold = maxSIMDStructBytes();
-#if defined(TARGET_ARM64)
+            maxRegSize = maxSIMDStructBytes();
+#if defined(TARGET_XARCH)
+            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+            maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
+            threshold  = maxRegSize;
+#elif defined(TARGET_ARM64)
             // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
             //
             //   ldp   q0, q1, [x1]
             //   stp   q0, q1, [x0]
             //
-            threshold *= 2;
-#elif defined(TARGET_XARCH)
-            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-            threshold = min(threshold, YMM_REGSIZE_BYTES);
+            threshold = maxRegSize * 2;
 #endif
         }
 
 #if defined(TARGET_XARCH)
@@ -8991,8 +8993,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         //
         threshold *= 4;
 
-        // NOTE: Memmove's unrolling is currently limitted with LSRA -
-        // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
+        if (type == UnrollKind::Memmove)
+        {
+            // NOTE: Memmove's unrolling is currently limitted with LSRA -
+            // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
+            threshold = maxRegSize * 4;
+        }
+
         return threshold;
     }
 
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 7d01790ddf2b4..de76a7c516e1c 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1851,7 +1851,7 @@ GenTree* Lowering::LowerCall(GenTree* node)
 
     if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC)
     {
-#ifdef TARGET_AMD64
+#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
         if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove)
         {
             GenTree* newNode = LowerCallMemmove(call);
diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp
index bc3cf04c23594..5451841be15a2 100644
--- a/src/coreclr/jit/lsraarmarch.cpp
+++ b/src/coreclr/jit/lsraarmarch.cpp
@@ -730,6 +730,53 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
             }
             break;
 
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
+        {
+#ifdef TARGET_ARM64
+
+            // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea is that
+            // we can ignore the fact that dst and src might overlap if we save the whole dst
+            // to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+
+            // Lowering was expected to get rid of memmove in case of zero
+            assert(size > 0);
+
+            const unsigned simdSize = FP_REGSIZE_BYTES;
+            if (size >= simdSize)
+            {
+                unsigned simdRegs = size / simdSize;
+                if ((size % simdSize) != 0)
+                {
+                    // TODO-CQ: Consider using GPR load/store here if the remainder is 1,2,4 or 8
+                    simdRegs++;
+                }
+                for (unsigned i = 0; i < simdRegs; i++)
+                {
+                    // It's too late to revert the unrolling, so we hope we'll have enough SIMD regs -
+                    // no more than MaxInternalCount. Currently, it's controlled by getUnrollThreshold(memmove)
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
+                }
+            }
+            else
+            {
+                if (isPow2(size))
+                {
+                    // Single GPR for 1,2,4,8
+                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                }
+                else
+                {
+                    // Any size from 3 to 15 can be handled via two GPRs
+                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                }
+            }
+#else // TARGET_ARM64
+            unreached();
+#endif
+        }
+        break;
+
         case GenTreeBlk::BlkOpKindHelper:
            dstAddrRegMask = RBM_ARG_0;
            if (srcAddrOrFill != nullptr)
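[Illustration, not part of the patch series: a minimal standalone C++ sketch of the copy strategy patch 01 unrolls. The helper name, the five-buffer bound, and the use of memcpy in place of register loads/stores are assumptions made for this example; the JIT emits straight-line loads and stores into temp registers, with no loop and no such helper at runtime. The point it demonstrates is the key invariant: every load is issued before any store, so overlap between src and dst cannot corrupt the data.]

#include <cassert>
#include <cstdint>
#include <cstring>

void unrolled_memmove_sketch(uint8_t* dst, const uint8_t* src, size_t size)
{
    const size_t kSimdSize = 16;               // FP_REGSIZE_BYTES on arm64
    assert(size > 0 && size <= 5 * kSimdSize); // bounded by LSRA's MaxInternalCount temp regs

    if (size >= kSimdSize)
    {
        // "Temp registers": enough 16-byte buffers to hold the entire source.
        uint8_t temps[5][16];
        size_t  offsets[5];
        size_t  count = 0;
        for (size_t offset = 0;;)
        {
            memcpy(temps[count], src + offset, kSimdSize); // the "load" pass
            offsets[count++] = offset;
            offset += kSimdSize;
            if (offset == size)
                break;
            if (size - offset < kSimdSize)
                offset = size - kSimdSize; // tail overlaps data already loaded
        }
        for (size_t i = 0; i < count; i++) // the "store" pass
            memcpy(dst + offsets[i], temps[i], kSimdSize);
    }
    else
    {
        // 1..15 bytes: one power-of-two-sized move, or two overlapping ones;
        // e.g. size == 9 copies bytes [0..8) and [1..9) via two 8-byte moves.
        size_t chunk = 1;
        while (chunk * 2 <= size)
            chunk *= 2; // 1 << Log2(size)
        uint8_t lo[8], hi[8];
        memcpy(lo, src, chunk);
        memcpy(hi, src + size - chunk, chunk); // overlaps lo unless size is a power of two
        memcpy(dst, lo, chunk);
        memcpy(dst + size - chunk, hi, chunk);
    }
}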
From 6c5c1dc3e9280fd8b5e029fc9c9732e7a94fd31f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Petryka?= <35800402+MichalPetryka@users.noreply.github.com>
Date: Tue, 21 Mar 2023 19:12:10 +0100
Subject: [PATCH 02/11] Add 3xSIMD unrolling tests

---
 src/tests/JIT/opt/Vectorization/BufferMemmove.cs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
index f6c71fa00f8f6..c8757a3cd1d41 100644
--- a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
@@ -46,15 +46,24 @@ static int Main()
 
         // Some large simds
         TestMemmove((dst, src) => src.AsSpan(0, 33).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(33)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 47).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(47)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 48).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(48)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 49).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(49)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 63).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(63)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 64).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(64)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 65).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(65)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 95).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(95)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 96).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(96)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 97).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(97)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 127).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(127)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 128).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(128)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 129).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(129)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 159).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(159)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 160).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(160)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 161).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(161)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 191).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(191)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 192).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(192)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 193).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(193)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 255).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(255)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 256).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(256)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 257).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(257)).CopyTo(dst));

From 4f65ea532abaccc1f1199f43e3378149cbe9cc9c Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Tue, 21 Mar 2023 20:07:09 +0100
Subject: [PATCH 03/11] fix assert

---
 src/coreclr/jit/lower.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index de76a7c516e1c..366b4c343a509 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1795,7 +1795,9 @@ GenTree* Lowering::AddrGen(void* addr)
 GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
 {
     assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
-    assert(call->gtArgs.CountArgs() == 3);
+
+    // R2R adds r2r cell arg on arm64
+    assert(comp->opts.IsReadyToRun() || (call->gtArgs.CountArgs() == 3));
 
     GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
     if (lengthArg->IsIntegralConst())

From 5bacc34f8ff0ec5b0d93f5ff472016be4c3c5fdf Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Wed, 22 Mar 2023 00:00:49 +0100
Subject: [PATCH 04/11] Address feedback

---
 src/coreclr/jit/gentree.cpp | 30 ++++++++++++++++++++++++++++
 src/coreclr/jit/gentree.h   |  2 ++
 src/coreclr/jit/lower.cpp   | 39 ++++++++++++++++++++++++++++++-----
 src/coreclr/jit/morph.cpp   | 18 +++++++++++++++++
 4 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index e6347725f57a2..469b9d12c4df0 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1603,6 +1603,36 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
     return cur;
 }
 
+//---------------------------------------------------------------
+// GetUserArgByIndex: Get an argument with the specified index.
+//                    Unlike GetArgByIndex, this function ignores non-user args
+//                    like r2r cells.
+//
+// Parameters:
+//   index - The index of the argument to find.
+//
+// Returns:
+//   A pointer to the argument.
+//
+// Remarks:
+//   This function assumes enough arguments exist.
+//
+CallArg* CallArgs::GetUserArgByIndex(unsigned index)
+{
+    CallArg* cur = m_head;
+    for (unsigned i = 0; i < index || cur->IsArgAddedLate();)
+    {
+        if (!cur->IsArgAddedLate())
+        {
+            i++;
+        }
+        assert((cur != nullptr) && "Not enough arguments in GetArgByIndex");
+        cur = cur->GetNext();
+    }
+
+    return cur;
+}
+
 //---------------------------------------------------------------
 // GetIndex: Get the index for the specified argument.
 //
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 263e21fe556fe..40560de283325 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -4704,6 +4704,7 @@ class CallArgs
     CallArg* GetThisArg();
     CallArg* GetRetBufferArg();
     CallArg* GetArgByIndex(unsigned index);
+    CallArg* GetUserArgByIndex(unsigned index);
     unsigned GetIndex(CallArg* arg);
 
     bool IsEmpty() const
@@ -4772,6 +4773,7 @@ class CallArgs
     unsigned OutgoingArgsStackSize() const;
 
     unsigned CountArgs();
+    unsigned CountUserArgs();
 
     template <CallArg* (CallArg::*Next)()>
     class CallArgIterator
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 366b4c343a509..89879a52304b7 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1794,20 +1794,30 @@ GenTree* Lowering::AddrGen(void* addr)
 //
 GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
 {
+    JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
     assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
 
-    // R2R adds r2r cell arg on arm64
-    assert(comp->opts.IsReadyToRun() || (call->gtArgs.CountArgs() == 3));
+    assert(call->gtArgs.CountUserArgs() == 3);
 
-    GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
+    if (comp->info.compHasNextCallRetAddr)
+    {
+        JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
+        return nullptr;
+    }
+
+    GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
     if (lengthArg->IsIntegralConst())
    {
         ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
+        JITDUMP("Size=%ld.. ", (LONG)cnsSize);
         // TODO-CQ: drop the whole thing in case of 0
         if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove)))
         {
+            JITDUMP("Accepted for unrolling!\nOld tree:\n")
+            DISPTREE(call);
+
-            GenTree* dstAddr = call->gtArgs.GetArgByIndex(0)->GetNode();
-            GenTree* srcAddr = call->gtArgs.GetArgByIndex(1)->GetNode();
+            GenTree* dstAddr = call->gtArgs.GetUserArgByIndex(0)->GetNode();
+            GenTree* srcAddr = call->gtArgs.GetUserArgByIndex(1)->GetNode();
 
             // TODO-CQ: Try to create an addressing mode
             GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr);
@@ -1827,8 +1837,27 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
             BlockRange().Remove(lengthArg);
             BlockRange().Remove(call);
 
+            // Remove all non-user args (e.g. r2r cell)
+            for (CallArg& arg : call->gtArgs.Args())
+            {
+                if (arg.IsArgAddedLate())
+                {
+                    BlockRange().Remove(arg.GetNode());
+                }
+            }
+
+            JITDUMP("\nNew tree:\n")
+            DISPTREE(storeBlk);
             return storeBlk;
         }
+        else
+        {
+            JITDUMP("Size is either 0 or too big to unroll.\n")
+        }
+    }
+    else
+    {
+        JITDUMP("Size is not a constant.\n")
     }
     return nullptr;
 }
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index c623f9d10ec35..059e4a13259c0 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3054,6 +3054,24 @@ unsigned CallArgs::CountArgs()
     return numArgs;
 }
 
+//------------------------------------------------------------------------
+// CountUserArgs: Count the number of arguments ignoring non-user ones, e.g.
+//                r2r cell argument in a user function.
+//
+unsigned CallArgs::CountUserArgs()
+{
+    unsigned numArgs = 0;
+    for (CallArg& arg : Args())
+    {
+        if (!arg.IsArgAddedLate())
+        {
+            numArgs++;
+        }
+    }
+
+    return numArgs;
+}
+
 //------------------------------------------------------------------------
 // fgMorphArgs: Walk and transform (morph) the arguments of a call
 //

From e81269b3729033b972d9ae7d487fa0dc12fb8d8e Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Wed, 22 Mar 2023 00:31:15 +0100
Subject: [PATCH 05/11] Add IL tests

---
 .../Vectorization/BufferMemmoveTailCall.il    | 95 +++++++++++++++++++
 .../BufferMemmoveTailCall.ilproj              |  9 ++
 2 files changed, 104 insertions(+)
 create mode 100644 src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
 create mode 100644 src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj

diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
new file mode 100644
index 0000000000000..508516234efd5
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
@@ -0,0 +1,95 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+.assembly extern System.Runtime {}
+.assembly extern System.Runtime.Extensions {}
+.assembly TailCallBufferMemmoveTests {
+    // Allow access to private members of System.Private.CoreLib
+    .custom instance void System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::.ctor(string) = (
+        01 00 16 53 79 73 74 65 6d 2e 50 72 69 76 61 74
+        65 2e 43 6f 72 65 4c 69 62 00 00
+    )
+}
+
+.class public abstract auto ansi sealed beforefieldinit TailCallBufferMemmove
+    extends [System.Runtime]System.Object
+{
+    .method private hidebysig static int32 Main() cil managed
+    {
+        .maxstack 8
+        .entrypoint
+
+        // C#:
+        // byte[] src = new byte[32];
+        // Test(ref src[0]);
+
+        ldc.i4.s 32
+        newarr [System.Runtime]System.Byte
+        ldc.i4.0
+        ldelema [System.Runtime]System.Byte
+        call void TailCallBufferMemmove::Test(uint8&)
+
+        // return 100;
+        ldc.i4.s 100
+        ret
+    }
+
+    .method private hidebysig static void Test (uint8& src) cil managed noinlining
+    {
+        .maxstack 3
+
+        // C#:
+        // byte* data = stackalloc byte[64]; // to trigger slow helper-based tail calls
+        // Buffer.Memmove(ref Unsafe.AsRef<byte>(data), ref src, 64);
+
+        ldc.i4.s 64
+        conv.u
+        localloc
+        call !!0& [System.Runtime]System.Runtime.CompilerServices.Unsafe::AsRef<uint8>(void*)
+        ldarg.0
+        ldc.i4.s 64
+        conv.i
+        tail. call void [System.Runtime]System.Buffer::Memmove(uint8&, uint8&, native uint)
+        ret
+    }
+}
+
+// C#:
+// namespace System.Runtime.CompilerServices
+// {
+//     public class IgnoresAccessChecksToAttribute : Attribute
+//     {
+//         public IgnoresAccessChecksToAttribute(string assemblyName)
+//         {
+//             AssemblyName = assemblyName;
+//         }
+//         public string AssemblyName { get; }
+//     }
+// }
+//
+.class public auto ansi beforefieldinit System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute
+    extends [System.Runtime]System.Attribute
+{
+    .field private initonly string '<AssemblyName>k__BackingField'
+    .method public hidebysig specialname rtspecialname instance void .ctor (string assemblyName) cil managed
+    {
+        .maxstack 8
+        ldarg.0
+        call instance void [System.Runtime]System.Attribute::.ctor()
+        ldarg.0
+        ldarg.1
+        stfld string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::'<AssemblyName>k__BackingField'
+        ret
+    }
+    .method public hidebysig specialname instance string get_AssemblyName () cil managed
+    {
+        .maxstack 8
+        ldarg.0
+        ldfld string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::'<AssemblyName>k__BackingField'
+        ret
+    }
+    .property instance string AssemblyName()
+    {
+        .get instance string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::get_AssemblyName()
+    }
+}
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj
new file mode 100644
index 0000000000000..67f9d9446f732
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk.IL">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).il" />
+  </ItemGroup>
+</Project>

From 95e0e3de79e7969a2f09c1a37eefa6c786ecb0b2 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Thu, 23 Mar 2023 17:10:27 +0100
Subject: [PATCH 06/11] Address feedback

---
 src/coreclr/jit/gentree.cpp | 6 ++++--
 src/coreclr/jit/lower.cpp   | 2 +-
 src/coreclr/jit/morph.cpp   | 3 +++
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 2acd11ccec5e5..8e8a25d0879c9 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1615,18 +1615,20 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
 //   A pointer to the argument.
 //
 // Remarks:
-//   This function assumes enough arguments exist.
+//   This function assumes enough arguments exist. The current implementation
+//   doesn't ignore return buffers and type arguments.
 //
 CallArg* CallArgs::GetUserArgByIndex(unsigned index)
 {
     CallArg* cur = m_head;
+    assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
     for (unsigned i = 0; i < index || cur->IsArgAddedLate();)
     {
         if (!cur->IsArgAddedLate())
         {
             i++;
         }
-        assert((cur != nullptr) && "Not enough arguments in GetArgByIndex");
+        assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
         cur = cur->GetNext();
     }
 
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 89879a52304b7..0b416dfd8c2b5 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1842,7 +1842,7 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
         {
             if (arg.IsArgAddedLate())
             {
-                BlockRange().Remove(arg.GetNode());
+                arg.GetNode()->SetUnusedValue();
             }
         }
 
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 059e4a13259c0..5ba5a1577011e 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3058,6 +3058,9 @@ unsigned CallArgs::CountArgs()
 //------------------------------------------------------------------------
 // CountUserArgs: Count the number of arguments ignoring non-user ones, e.g.
 //                r2r cell argument in a user function.
 //
+// Remarks:
+//    The current implementation doesn't ignore return buffers and type arguments.
+//
 unsigned CallArgs::CountUserArgs()
 {

From a53635a0b9bb509ab98003b420b36193bfd2521a Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Thu, 23 Mar 2023 17:47:44 +0100
Subject: [PATCH 07/11] fix nullref

---
 src/coreclr/jit/gentree.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 8e8a25d0879c9..999797a6c6d33 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1628,8 +1628,8 @@ CallArg* CallArgs::GetUserArgByIndex(unsigned index)
         {
             i++;
         }
-        assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
         cur = cur->GetNext();
+        assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
     }
 
     return cur;

From 9d030b588c6127621cfd6ff12812189adbd8cbcf Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Fri, 24 Mar 2023 13:09:00 +0100
Subject: [PATCH 08/11] Address feedback: typos, add IsUserArg

---
 src/coreclr/jit/codegenarmarch.cpp |  2 +-
 src/coreclr/jit/codegenxarch.cpp   |  8 ++++----
 src/coreclr/jit/compiler.h         |  4 ++--
 src/coreclr/jit/gentree.cpp        | 30 +++++++++++++++++++++++++-----
 src/coreclr/jit/gentree.h          |  2 ++
 src/coreclr/jit/lower.cpp          |  3 +++
 src/coreclr/jit/lsraarmarch.cpp    | 23 ++++++++++-------------
 src/coreclr/jit/lsraxarch.cpp      | 23 ++++++++++-------------
 src/coreclr/jit/morph.cpp          |  5 ++---
 9 files changed, 59 insertions(+), 41 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 31cd7a4c0d3e2..736f1acd17443 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3135,7 +3135,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
                 }
                 if ((size - offset) < simdSize)
                 {
-                    // Overlap with the previously processed data. We'll always use SIMD for that for simplicity
+                    // Overlap with the previously processed data. We'll always use SIMD for simplicity
                     // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index e3a55a3a417dc..3a66a96452840 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2598,7 +2598,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
         // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
         assert(numberOfSimdRegs * simdSize >= size);
 
-        // Pop all temp regs to a local array, currently, this impl is limitted with LSRA's MaxInternalCount
+        // Pop all temp regs to a local array; currently, this impl is limited by LSRA's MaxInternalCount
         regNumber tempRegs[LinearScan::MaxInternalCount] = {};
         for (unsigned i = 0; i < numberOfSimdRegs; i++)
         {
@@ -2630,7 +2630,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
             assert(size > offset);
             if ((size - offset) < simdSize)
             {
-                // Overlap with the previosly processed data. We'll always use SIMD for that for simplicity
+                // Overlap with the previously processed data. We'll always use SIMD for simplicity
                 // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
                 offset = size - simdSize;
             }
@@ -3285,7 +3285,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
         size -= bytesWritten;
 
-        // Handle the remainder by overlapping with previosly processed data (only for zeroing)
+        // Handle the remainder by overlapping with previously processed data (only for zeroing)
         if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
         {
             if (isPow2(size) && (size <= REGSIZE_BYTES))
@@ -3550,7 +3550,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
     assert((size >= 0) && (size < regSize));
 
-    // Handle the remainder by overlapping with previosly processed data
+    // Handle the remainder by overlapping with previously processed data
     if ((size > 0) && (size < regSize))
     {
         assert(regSize >= XMM_REGSIZE_BYTES);
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 3c887e2ae20ba..0e3fd4773e5b2 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8989,13 +8989,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         //   | arm         |  32  | 16 | no SIMD support
         //   | loongarch64 |  64  | 32 | no SIMD support
         //
-        // We might want to use a different multiplier for trully hot/cold blocks based on PGO data
+        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
        //
         threshold *= 4;
 
         if (type == UnrollKind::Memmove)
         {
-            // NOTE: Memmove's unrolling is currently limitted with LSRA -
+            // NOTE: Memmove's unrolling is currently limited by LSRA -
             // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
             threshold = maxRegSize * 4;
         }
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 999797a6c6d33..9ccc7d4cb922c 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1419,6 +1419,27 @@ bool CallArg::IsArgAddedLate() const
     }
 }
 
+//---------------------------------------------------------------
+// IsUserArg: Check if this is an argument that can be treated as
+//            user-defined (in IL).
+//
+// Remarks:
+//   "this" and ShiftLow/ShiftHigt are recognized as user-defined
+//
+bool CallArg::IsUserArg() const
+{
+    switch (static_cast<WellKnownArg>(m_wellKnownArg))
+    {
+        case WellKnownArg::None:
+        case WellKnownArg::ShiftLow:
+        case WellKnownArg::ShiftHigh:
+        case WellKnownArg::ThisPointer:
+            return true;
+        default:
+            return false;
+    }
+}
+
 #ifdef DEBUG
 //---------------------------------------------------------------
 // CheckIsStruct: Verify that the struct ABI information is consistent with the IR node.
@@ -1615,23 +1636,22 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
 //   A pointer to the argument.
 //
 // Remarks:
-//   This function assumes enough arguments exist. The current implementation
-//   doesn't ignore return buffers and type arguments.
+//   This function assumes enough arguments exist. Also, see IsUserArg's
+//   comments.
 //
 CallArg* CallArgs::GetUserArgByIndex(unsigned index)
 {
     CallArg* cur = m_head;
     assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
-    for (unsigned i = 0; i < index || cur->IsArgAddedLate();)
+    for (unsigned i = 0; i < index || !cur->IsUserArg();)
     {
-        if (!cur->IsArgAddedLate())
+        if (cur->IsUserArg())
         {
             i++;
         }
         cur = cur->GetNext();
         assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
     }
-
     return cur;
 }
 
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 40560de283325..314cba6023d10 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -4644,6 +4644,8 @@ class CallArg
 
     bool IsArgAddedLate() const;
 
+    bool IsUserArg() const;
+
 #ifdef DEBUG
     void Dump(Compiler* comp);
     // Check that the value of 'AbiInfo.IsStruct' is consistent.
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 0b416dfd8c2b5..4ccc92dfbabe6 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1792,6 +1792,9 @@ GenTree* Lowering::AddrGen(void* addr)
 // Arguments:
 //    tree - GenTreeCall node to replace with STORE_BLK
 //
+// Return Value:
+//    nullptr if no changes were made
+//
 GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
 {
     JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp
index 5451841be15a2..2955b457971a1 100644
--- a/src/coreclr/jit/lsraarmarch.cpp
+++ b/src/coreclr/jit/lsraarmarch.cpp
@@ -735,8 +735,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
 #ifdef TARGET_ARM64
 
             // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea is that
-            // we can ignore the fact that dst and src might overlap if we save the whole dst
-            // to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+            // we can ignore the fact that src and dst might overlap if we save the whole src
+            // to temp regs in advance.
 
             // Lowering was expected to get rid of memmove in case of zero
             assert(size > 0);
@@ -757,19 +757,16 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
                 }
             }
+            else if (isPow2(size))
+            {
+                // Single GPR for 1,2,4,8
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+            }
             else
             {
-                if (isPow2(size))
-                {
-                    // Single GPR for 1,2,4,8
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
-                else
-                {
-                    // Any size from 3 to 15 can be handled via two GPRs
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
+                // Any size from 3 to 15 can be handled via two GPRs
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
             }
 #else // TARGET_ARM64
             unreached();
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 053dd1f1850fa..f9c288455af80 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1512,8 +1512,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         case GenTreeBlk::BlkOpKindUnrollMemmove:
         {
             // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea is that
-            // we can ignore the fact that dst and src might overlap if we save the whole dst
-            // to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+            // we can ignore the fact that src and dst might overlap if we save the whole src
+            // to temp regs in advance, e.g. for memmove(rcx, rax, 120):
             //
             //       vmovdqu  ymm0, ymmword ptr[rax +  0]
             //       vmovdqu  ymm1, ymmword ptr[rax + 32]
@@ -1554,19 +1554,16 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                 }
                 SetContainsAVXFlags();
             }
+            else if (isPow2(size))
+            {
+                // Single GPR for 1,2,4,8
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+            }
             else
             {
-                if (isPow2(size))
-                {
-                    // Single GPR for 1,2,4,8
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
-                else
-                {
-                    // Any size from 3 to 15 can be handled via two GPRs
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
+                // Any size from 3 to 15 can be handled via two GPRs
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
             }
         }
         break;
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index 5ba5a1577011e..3a7c4e72826d9 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3059,19 +3059,18 @@ unsigned CallArgs::CountArgs()
 //                r2r cell argument in a user function.
 //
 // Remarks:
-//    The current implementation doesn't ignore return buffers and type arguments.
+//    See IsUserArg's comments.
 //
 unsigned CallArgs::CountUserArgs()
 {
     unsigned numArgs = 0;
     for (CallArg& arg : Args())
     {
-        if (!arg.IsArgAddedLate())
+        if (arg.IsUserArg())
         {
             numArgs++;
         }
     }
-
     return numArgs;
 }

From dd71671a1b884e0595056bc7cbb377527b2447b7 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Fri, 24 Mar 2023 13:13:12 +0100
Subject: [PATCH 09/11] Clean up

---
 src/coreclr/jit/codegenarmarch.cpp | 4 ++--
 src/coreclr/jit/codegenxarch.cpp   | 4 ++--
 src/coreclr/jit/lsraxarch.cpp      | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 736f1acd17443..1fe8d7347b834 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3052,8 +3052,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
 //------------------------------------------------------------------------
 // genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
-// ignore the fact that dst and src might overlap if we save the whole
-// dst to temp regs in advance, e.g. for memmove(rax, rcx, 30):
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
 //
 //       ldr   q16, [x0]
 //       ldr   q17, [x0, #0x0E]
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 3a66a96452840..4cc73f78de953 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2556,8 +2556,8 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
 
 //------------------------------------------------------------------------
 // genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
-// ignore the fact that dst and src might overlap if we save the whole
-// dst to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
 //
 //       vmovdqu  ymm0, ymmword ptr[rax +  0]
 //       vmovdqu  ymm1, ymmword ptr[rax + 32]
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index f9c288455af80..4b3fcd86e0e95 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1513,7 +1513,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         {
             // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea is that
             // we can ignore the fact that src and dst might overlap if we save the whole src
-            // to temp regs in advance, e.g. for memmove(rcx, rax, 120):
+            // to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
             //
             //       vmovdqu  ymm0, ymmword ptr[rax +  0]
             //       vmovdqu  ymm1, ymmword ptr[rax + 32]

From 9a069244dbc7203a85f4695b694fa47afef0e805 Mon Sep 17 00:00:00 2001
From: EgorBo
Date: Fri, 24 Mar 2023 13:16:45 +0100
Subject: [PATCH 10/11] Clean up

---
 src/coreclr/jit/codegenarmarch.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 1fe8d7347b834..a93183c31f2cc 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3106,9 +3106,10 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
         }
     };
 
-    // Eventually we want to emit CPYP+CPYM+CPYE on armv9 for large sizes
+    // Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.
 
-    // TODO-CQ: Emit stp/ldp (32 bytes at once).
+    // Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
+    // ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
     unsigned simdSize = FP_REGSIZE_BYTES;
     if (size >= simdSize)
     {

From b67d52ba873a5f950251811f41039ef08e1ad604 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Fri, 24 Mar 2023 19:38:27 +0100
Subject: [PATCH 11/11] Update src/coreclr/jit/gentree.cpp

Co-authored-by: Bruce Forstall
---
 src/coreclr/jit/gentree.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 9ccc7d4cb922c..a2a1fd3d8c7c8 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1424,7 +1424,7 @@ bool CallArg::IsArgAddedLate() const
 //            user-defined (in IL).
 //
 // Remarks:
-//   "this" and ShiftLow/ShiftHigt are recognized as user-defined
+//   "this" and ShiftLow/ShiftHigh are recognized as user-defined
 //
 bool CallArg::IsUserArg() const
 {
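[Illustration, not part of the patch series: a minimal standalone C++ sketch of the "user args" accessors that patches 04 through 08 converge on. The Arg type and userArg field are assumptions standing in for the real CallArg and WellKnownArg machinery; the loop shape, including the assert placement from the "fix nullref" commit, mirrors the final GetUserArgByIndex.]

#include <cassert>

struct Arg
{
    Arg* next;
    bool userArg; // false for late-added args such as an R2R cell
};

// Count only the arguments that exist in IL (mirrors CountUserArgs).
int CountUserArgs(Arg* head)
{
    int n = 0;
    for (Arg* a = head; a != nullptr; a = a->next)
        if (a->userArg)
            n++;
    return n;
}

// Return the index-th user argument, skipping non-user args along the way
// (mirrors GetUserArgByIndex: advance first, then assert the list did not run out).
Arg* GetUserArgByIndex(Arg* head, int index)
{
    Arg* cur = head;
    assert(cur != nullptr && "Not enough user arguments");
    for (int i = 0; i < index || !cur->userArg;)
    {
        if (cur->userArg)
            i++;
        cur = cur->next;
        assert(cur != nullptr && "Not enough user arguments");
    }
    return cur;
}

The combined condition (i < index || !cur->userArg) keeps the loop running while cur points at a late-added argument, so the function always returns a user argument even when non-user args sit between, or after, the user ones.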