Skip to content

Commit

Permalink
Merge pull request arcaneframework#961 from arcaneframework/dev/gg-ad…
Browse files Browse the repository at this point in the history
…d-accelerator-async-memcopy

Add accelerator asynchronous memory copy for NumArray
  • Loading branch information
grospelliergilles authored Oct 29, 2023
2 parents 2ed032a + eb3211d commit 560a15b
Show file tree
Hide file tree
Showing 10 changed files with 181 additions and 98 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "arcane/accelerator/core/RunCommandImpl.h"
#include "arcane/accelerator/core/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"

#include <iostream>

Expand Down Expand Up @@ -468,8 +469,12 @@ class CudaMemoryCopier
: public IMemoryCopier
{
void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem) override
MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem, RunQueue* queue) override
{
if (queue){
queue->copyMemory(MemoryCopyArgs(to.bytes(),from.bytes()).addAsync(queue->isAsync()));
return;
}
// 'cudaMemcpyDefault' sait automatiquement ce qu'il faut faire en tenant
// uniquement compte de la valeur des pointeurs. Il faudrait voir si
// utiliser \a from_mem et \a to_mem peut améliorer les performances.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "arcane/accelerator/core/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/RunCommandImpl.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/RunQueue.h"

#include <iostream>

Expand Down Expand Up @@ -379,8 +380,12 @@ class HipMemoryCopier
: public IMemoryCopier
{
void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem) override
MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem, RunQueue* queue) override
{
if (queue){
queue->copyMemory(MemoryCopyArgs(to.bytes(),from.bytes()).addAsync(queue->isAsync()));
return;
}
// 'hipMemcpyDefault' sait automatiquement ce qu'il faut faire en tenant
// uniquement compte de la valeur des pointeurs. Il faudrait voir si
// utiliser \a from_mem et \a to_mem peut améliorer les performances.
Expand Down
23 changes: 13 additions & 10 deletions arcane/src/arcane/tests/accelerator/NumArrayUnitTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ class NumArrayUnitTest
}

template <typename NumArrayType> double
_doSum(const NumArrayType& values, std::array<Int32,NumArrayType::rank()> bounds)
_doSum(const NumArrayType& values, std::array<Int32,NumArrayType::rank()> bounds, RunQueue* queue = nullptr)
{
if (queue)
queue->barrier();
constexpr int Rank = NumArrayType::rank();
double total = 0.0;
SimpleForLoopRanges<Rank> lb(bounds);
Expand Down Expand Up @@ -595,6 +597,7 @@ _executeTest4(eMemoryRessource mem_kind)
info() << "Execute Test4 memory_ressource=" << mem_kind;

auto queue = makeQueue(m_runner);
queue.setAsync(true);

// Ne pas changer les dimensions du tableau sinon
// il faut aussi changer le calcul des sommes
Expand All @@ -620,14 +623,14 @@ _executeTest4(eMemoryRessource mem_kind)
command << RUNCOMMAND_LOOP1(iter, n1)
{
auto [i] = iter();
if ((i%2)==0)
if ((i % 2) == 0)
out_t1(i) = _getValue(i);
else
out_t1[i] = _getValue(i);
};
NumArray<double, MDDim1> host_t1(eMemoryRessource::Host);
host_t1.copy(_toMDSpan(t1));
double s1 = _doSum(host_t1, { n1 });
host_t1.copy(_toMDSpan(t1), &queue);
double s1 = _doSum(host_t1, { n1 }, &queue);
info() << "SUM1 = " << s1;
vc.areEqual(s1, expected_sum1, "SUM1");
}
Expand All @@ -645,15 +648,15 @@ _executeTest4(eMemoryRessource mem_kind)
};

NumArray<double, MDDim1> host_t2(eMemoryRessource::Host);
host_t2.copy(_toMDSpan(t2));
double s2 = _doSum(host_t2, { n1 });
host_t2.copy(_toMDSpan(t2), &queue);
double s2 = _doSum(host_t2, { n1 }, &queue);
info() << "SUM1_2 = " << s2;
vc.areEqual(s2, expected_sum1, "SUM1_2");
}
{
auto command = makeCommand(queue);
auto in_t1 = viewIn(command,t1);
auto out_t3 = viewOut(command,t3);
auto in_t1 = viewIn(command, t1);
auto out_t3 = viewOut(command, t3);

command << RUNCOMMAND_LOOP1(iter, n1)
{
Expand All @@ -662,8 +665,8 @@ _executeTest4(eMemoryRessource mem_kind)
};

NumArray<double, MDDim1> host_t3(eMemoryRessource::Host);
host_t3.copy(_toMDSpan(t3));
double s3 = _doSum(host_t3, { n1 });
host_t3.copy(_toMDSpan(t3), &queue);
double s3 = _doSum(host_t3, { n1 }, &queue);
info() << "SUM1_3 = " << s3;
vc.areEqual(s3, expected_sum1, "SUM1_3");
}
Expand Down
78 changes: 42 additions & 36 deletions arcane/src/arcane/utils/MemoryRessourceMng.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace Arcane

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace
{
const char* _toName(eMemoryRessource r)
Expand All @@ -46,6 +47,16 @@ namespace
return "Invalid";
}

/*!
 * \brief Tells whether memory of kind \a r is treated as reachable from the host.
 *
 * Returns true for Host, UnifiedMemory and HostPinned. Unknown is accepted
 * as well: when the location is not known, the memory is assumed to be
 * accessible from the host. Every other value yields false.
 */
inline bool _isHost(eMemoryRessource r)
{
  const bool is_unknown = (r == eMemoryRessource::Unknown);
  const bool is_host_kind = (r == eMemoryRessource::Host) ||
                            (r == eMemoryRessource::UnifiedMemory) ||
                            (r == eMemoryRessource::HostPinned);
  return is_unknown || is_host_kind;
}

} // namespace

extern "C++" ARCANE_UTILS_EXPORT std::ostream&
Expand All @@ -58,8 +69,36 @@ operator<<(std::ostream& o, eMemoryRessource r)
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
 * \brief Memory copier restricted to buffers that are accessible from the host.
 *
 * Both \a from_mem and \a to_mem must be accepted by _isHost(); otherwise
 * the copy is rejected through ARCANE_FATAL. The \a queue argument is not used.
 */
class DefaultHostMemoryCopier
: public IMemoryCopier
{
 public:

  void copy(ConstMemoryView from, eMemoryRessource from_mem,
            MutableMemoryView to, eMemoryRessource to_mem, [[maybe_unused]] RunQueue* queue) override
  {
    // Without accelerator support, a plain host copy is only possible when
    // the memory is accessible from the CPU.

    // The source is checked first, then the destination.
    const bool is_source_on_host = _isHost(from_mem);
    if (!is_source_on_host) {
      ARCANE_FATAL("Source buffer is not accessible from host and no copier provided (location={0})",
                   from_mem);
    }

    const bool is_destination_on_host = _isHost(to_mem);
    if (!is_destination_on_host) {
      ARCANE_FATAL("Destination buffer is not accessible from host and no copier provided (location={0})",
                   to_mem);
    }

    to.copyHost(from);
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

MemoryRessourceMng::
MemoryRessourceMng()
: m_default_memory_copier(new DefaultHostMemoryCopier())
, m_copier(m_default_memory_copier.get())
{
std::fill(m_allocators.begin(), m_allocators.end(), nullptr);
// Par défaut on utilise l'allocateur CPU. Les allocateurs spécifiques pour
Expand Down Expand Up @@ -117,49 +156,16 @@ setAllocator(eMemoryRessource r, IMemoryAllocator* allocator)
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace
{
/*!
 * \brief Tells whether memory of kind \a r is treated as reachable from the host.
 *
 * Unknown is accepted on purpose: when the location is not known, the
 * memory is assumed to be accessible from the host.
 */
inline bool _isHost(eMemoryRessource r)
{
  switch (r) {
  case eMemoryRessource::Unknown:
  case eMemoryRessource::Host:
  case eMemoryRessource::UnifiedMemory:
  case eMemoryRessource::HostPinned:
    return true;
  default:
    return false;
  }
}
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
* \brief Copies the bytes of \a from into \a to.
*
* Stops with ARCANE_FATAL when the source holds more bytes than the
* destination. \a from_mem and \a to_mem are the memory kinds of the two
* buffers; \a queue is handed to the copier as-is.
*
* NOTE(review): this span comes from a rendered diff and shows both the
* removed and the added lines (two signature tails, the former host-only
* fallback followed by the new forwarding call to m_copier) - confirm
* against the actual file.
*/
void MemoryRessourceMng::
copy(ConstMemoryView from, eMemoryRessource from_mem,
MutableMemoryView to, eMemoryRessource to_mem)
MutableMemoryView to, eMemoryRessource to_mem, RunQueue* queue)
{
// Sizes are compared in bytes: the destination must be at least as large as the source.
Int64 from_size = from.bytes().size();
Int64 to_size = to.bytes().size();
if (from_size > to_size)
ARCANE_FATAL("Destination copy is too small (to_size={0} from_size={1})", to_size, from_size);

// Use the specific copier instance if one is available
if (m_copier) {
m_copier->copy(from, from_mem, to, to_mem);
return;
}

// Otherwise, a plain 'memcpy' is only possible if the memory is accessible
// from the CPU

if (!_isHost(from_mem))
ARCANE_FATAL("Source buffer is not accessible from host and no copier provided (location={0})",
from_mem);

if (!_isHost(to_mem))
ARCANE_FATAL("Destination buffer is not accessible from host and no copier provided (location={0})",
to_mem);

to.copyHost(from);
// Forward the copy, including the queue, to the current copier.
m_copier->copy(from, from_mem, to, to_mem, queue);
}

/*---------------------------------------------------------------------------*/
Expand All @@ -170,7 +176,7 @@ genericCopy(ConstMemoryView from, MutableMemoryView to)
{
IMemoryRessourceMng* mrm = platform::getDataMemoryRessourceMng();
eMemoryRessource mem_type = eMemoryRessource::Unknown;
mrm->_internal()->copy(from, mem_type, to, mem_type);
mrm->_internal()->copy(from, mem_type, to, mem_type, nullptr);
}

/*---------------------------------------------------------------------------*/
Expand Down
4 changes: 2 additions & 2 deletions arcane/src/arcane/utils/NumArray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,10 @@ _checkHost(eMemoryRessource r)

/*!
* \brief Copies the bytes of \a from into \a to through the data memory resource manager.
*
* The spans are wrapped in ConstMemoryView / MutableMemoryView and passed,
* together with their memory kinds and \a queue, to the internal copy()
* of the manager returned by platform::getDataMemoryRessourceMng().
*
* NOTE(review): this span comes from a rendered diff and shows both the
* removed and the added variants of the signature tail and of the call -
* confirm against the actual file.
*/
void NumArrayBaseCommon::
_memoryAwareCopy(Span<const std::byte> from, eMemoryRessource from_mem,
Span<std::byte> to, eMemoryRessource to_mem)
Span<std::byte> to, eMemoryRessource to_mem, RunQueue* queue)
{
IMemoryRessourceMng* mrm = platform::getDataMemoryRessourceMng();
mrm->_internal()->copy(ConstMemoryView(from), from_mem, MutableMemoryView(to), to_mem);
mrm->_internal()->copy(ConstMemoryView(from), from_mem, MutableMemoryView(to), to_mem, queue);
}

/*---------------------------------------------------------------------------*/
Expand Down
76 changes: 49 additions & 27 deletions arcane/src/arcane/utils/NumArray.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ namespace Arcane::impl
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
* \internal
* \brief Implémentation commune pour NumArray.
*/
class ARCANE_UTILS_EXPORT NumArrayBaseCommon
Expand All @@ -55,10 +56,13 @@ class ARCANE_UTILS_EXPORT NumArrayBaseCommon
static IMemoryAllocator* _getDefaultAllocator(eMemoryRessource r);
static void _checkHost(eMemoryRessource r);
static void _memoryAwareCopy(Span<const std::byte> from, eMemoryRessource from_mem,
Span<std::byte> to, eMemoryRessource to_mem);
Span<std::byte> to, eMemoryRessource to_mem, RunQueue* queue);
};

// Wrapper de Arccore::Array pour la classe NumArray
/*!
* \internal
* \brief Wrapper de Arccore::Array pour la classe NumArray.
*/
template <typename DataType>
class NumArrayContainer
: private Arccore::Array<DataType>
Expand Down Expand Up @@ -148,32 +152,27 @@ class NumArrayContainer
}
}

//! Copies the values of \a v into this instance by forwarding to _memoryAwareCopy().
void copyOnly(const ThatClass& v) { _memoryAwareCopy(v); }

void copyOnly(const Span<const DataType>& v)
/*!
* \brief Copies the values of \a v into this instance.
*
* \a input_ressource gives the origin of the memory area (or eMemoryRessource::Unknown if not known).
* \a queue defaults to nullptr and is passed on to _memoryAwareCopy().
*
* NOTE(review): this span comes from a rendered diff; the body shows both
* the removed one-argument call and the added three-argument call -
* confirm against the actual file.
*/
void copyOnly(const Span<const DataType>& v, eMemoryRessource input_ressource, RunQueue* queue = nullptr)
{
_memoryAwareCopy(v);
_memoryAwareCopy(v, input_ressource, queue);
}

private:

// Copies the bytes of the input values into this container's storage by
// calling NumArrayBaseCommon::_memoryAwareCopy() with the input viewed as
// bytes, its memory kind, the writable bytes of to1DSpan() and
// m_memory_ressource.
//
// NOTE(review): this span comes from a rendered diff and interleaves the
// removed overloads (taking a ThatClass, or a Span with an Unknown origin)
// with the added overload taking the origin and a queue - confirm against
// the actual file.
void _memoryAwareCopy(const ThatClass& v)
void _memoryAwareCopy(const Span<const DataType>& v, eMemoryRessource input_ressource, RunQueue* queue)
{
NumArrayBaseCommon::_memoryAwareCopy(asBytes(v.to1DSpan()), v.m_memory_ressource,
asWritableBytes(to1DSpan()), m_memory_ressource);
}
void _memoryAwareCopy(const Span<const DataType>& v)
{
NumArrayBaseCommon::_memoryAwareCopy(asBytes(v), eMemoryRessource::Unknown,
asWritableBytes(to1DSpan()), m_memory_ressource);
NumArrayBaseCommon::_memoryAwareCopy(asBytes(v), input_ressource,
asWritableBytes(to1DSpan()), m_memory_ressource, queue);
}
// Resizes this container to the number of elements of \a v (via
// _resizeNoInit()), then copies the values of \a v into it.
//
// NOTE(review): this span comes from a rendered diff; it shows both the
// removed call `_memoryAwareCopy(v)` and the added call that passes the
// memory kind of \a v and a null queue - confirm against the actual file.
void _resizeAndCopy(const ThatClass& v)
{
this->_resizeNoInit(v.to1DSpan().size());
_memoryAwareCopy(v);
_memoryAwareCopy(v, v.memoryRessource(), nullptr);
}

private:
Expand Down Expand Up @@ -360,24 +359,40 @@ class NumArrayBase
* Cette opération est valide quelle que soit la mémoire associée
* à l'instance.
*/
void copy(ConstSpanType rhs)
void copy(ConstSpanType rhs) { copy(rhs, nullptr); }

/*!
 * \brief Copies the values of \a rhs into this instance.
 *
 * This operation is valid whatever memory is associated with the instance.
 * It forwards to the two-argument overload with a null queue.
 */
void copy(const ThatClass& rhs)
{
  copy(rhs, nullptr);
}

/*!
* \brief Copies the values of \a rhs into this instance through the queue \a queue.
*
* This operation is valid whatever memory is associated with the instance.
* \a queue may be null. If the queue is asynchronous, it must be
* synchronized before the instance can be used.
*
* NOTE(review): this span comes from a rendered diff; the body shows the
* removed resize/copyOnly/update sequence followed by the added call to
* _resizeAndCopy() with an Unknown origin - confirm against the actual file.
*/
void copy(ConstSpanType rhs, RunQueue* queue)
{
this->resize(rhs.extents().dynamicExtents());
m_data.copyOnly(rhs.to1DSpan());
_updateSpanPointerFromData();
_resizeAndCopy(rhs, eMemoryRessource::Unknown, queue);
}

/*!
* \brief Copies the values of \a rhs into this instance through the queue \a queue.
*
* This operation is valid whatever memory is associated with the instance.
* \a queue may be null. If the queue is asynchronous, it must be
* synchronized before the instance can be used.
*
* NOTE(review): this span comes from a rendered diff; it shows both the
* removed signature (without queue) and the added one, and the body shows
* the removed resize/copyOnly/update sequence followed by the added call to
* _resizeAndCopy() with the memory kind of \a rhs - confirm against the
* actual file.
*/
void copy(const ThatClass& rhs)
void copy(const ThatClass& rhs, RunQueue* queue)
{
this->resize(rhs.extents().dynamicExtents());
m_data.copyOnly(rhs.m_data);
_updateSpanPointerFromData();
_resizeAndCopy(rhs.constSpan(), rhs.memoryRessource(), queue);
}

//! Référence constante pour l'élément \a idx
Expand Down Expand Up @@ -434,6 +449,13 @@ class NumArrayBase
{
m_span.m_ptr = m_data.to1DSpan().data();
}

/*!
 * \brief Resizes this instance to the extents of \a rhs, then copies its values.
 *
 * \a input_ressource is the memory kind of \a rhs and \a queue is passed on
 * to the container copy. The span pointer is refreshed from the container
 * afterwards.
 */
void _resizeAndCopy(ConstSpanType rhs, eMemoryRessource input_ressource, RunQueue* queue)
{
  // Match the dynamic extents of the source before copying into the storage.
  const auto source_extents = rhs.extents().dynamicExtents();
  this->resize(source_extents);

  const auto source_values = rhs.to1DSpan();
  m_data.copyOnly(source_values, input_ressource, queue);

  _updateSpanPointerFromData();
}
};

/*---------------------------------------------------------------------------*/
Expand Down
Loading

0 comments on commit 560a15b

Please sign in to comment.