From eef7fbd98e7bca65cb10c001ef69222e46502237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ga=C5=82ecki?= Date: Wed, 22 Nov 2023 10:30:42 +1000 Subject: [PATCH] Local node permutation for arb lattice --- src/ArbConnectivity.hpp | 1 + src/ArbLattice.cpp | 77 +++++++++++++++++++++++++++++++++++++---- src/ArbLattice.hpp | 39 ++++++++++++++++----- src/Lists.h.Rt | 2 +- 4 files changed, 103 insertions(+), 16 deletions(-) diff --git a/src/ArbConnectivity.hpp b/src/ArbConnectivity.hpp index 27e31915a..dffb15404 100644 --- a/src/ArbConnectivity.hpp +++ b/src/ArbConnectivity.hpp @@ -29,6 +29,7 @@ struct ArbLatticeConnectivity { } size_t getLocalSize() const { return chunk_end - chunk_begin; } + bool isGhost(Index nbr) const { return nbr != -1 && (nbr < static_cast(chunk_begin) || nbr >= static_cast(chunk_end)); } double& coord(size_t dim, size_t local_node_ind) { return coords[local_node_ind + dim * getLocalSize()]; } double coord(size_t dim, size_t local_node_ind) const { return coords[local_node_ind + dim * getLocalSize()]; } diff --git a/src/ArbLattice.cpp b/src/ArbLattice.cpp index 511b6132a..841b3b459 100644 --- a/src/ArbLattice.cpp +++ b/src/ArbLattice.cpp @@ -1,10 +1,27 @@ #include "ArbLattice.hpp" +#include #include +#include +#include #include +#include #include +#include #include "PartitionArbLattice.hpp" +#include "UtilTypes.hpp" +#include "mpitools.hpp" + +ArbLattice::ArbLattice(size_t num_snaps_, const UnitEnv& units_, const std::map& zone_map, const std::string& cxn_path, MPI_Comm comm_) : LatticeBase(ZONESETTINGS, ZONE_MAX, units_), num_snaps(num_snaps_), comm(comm_) { + readFromCxn(zone_map, cxn_path); + global_node_dist = computeInitialNodeDist(connect.num_nodes_global, mpitools::MPI_Size(comm)); + partition(); + if (connect.getLocalSize() == 0) throw std::runtime_error{"At least one MPI rank has an empty partition, please use fewer MPI ranks"}; // Realistically, this should never happen + computeGhostNodes(); + computeLocalPermutation(); + local_bounding_box = getLocalBoundingBox(); +} std::unordered_map ArbLattice::makeGroupZoneMap(const std::map& zone_map) const { std::unordered_map retval; @@ -14,15 +31,13 @@ std::unordered_map ArbLattice::makeGroupZoneMap(const std::map return retval; } -void ArbLattice::readFromCxn(const std::map& zone_map, const std::string& cxn_path, MPI_Comm comm) { +void ArbLattice::readFromCxn(const std::map& zone_map, const std::string& cxn_path) { using namespace std::string_literals; using namespace std::string_view_literals; const auto gz_map = makeGroupZoneMap(zone_map); - int comm_rank{}, comm_size{}; - MPI_Comm_rank(comm, &comm_rank); - MPI_Comm_size(comm, &comm_size); + const int comm_rank = mpitools::MPI_Rank(comm), comm_size = mpitools::MPI_Size(comm); // Open file + utils for error reporting std::fstream file(cxn_path, std::ios_base::in); @@ -168,12 +183,14 @@ void ArbLattice::readFromCxn(const std::map& zone_map, const s check_file_ok("Failed to read node data"); } }); +} - if (comm_size == 1) return; +void ArbLattice::partition() { + if (mpitools::MPI_Size(comm) == 1) return; - const auto zero_dir_ind = std::distance(Model_m::offset_directions.cbegin(), std::find(Model_m::offset_directions.cbegin(), Model_m::offset_directions.cend(), std::array{0, 0, 0})); // Note: the behavior is still correct even if (0,0,0) is not an offset direction + const auto zero_dir_ind = std::distance(Model_m::offset_directions.cbegin(), std::find(Model_m::offset_directions.cbegin(), Model_m::offset_directions.cend(), OffsetDir{0, 0, 0})); // Note: the behavior is still correct even if (0,0,0) is not an offset direction const auto offset_dir_wgts = std::vector(Model_m::offset_direction_weights.begin(), Model_m::offset_direction_weights.end()); - const auto [dist, log] = partitionArbLattice(connect, offset_dir_wgts, zero_dir_ind, comm); + auto [dist, log] = partitionArbLattice(connect, offset_dir_wgts, zero_dir_ind, comm); for (const auto& [type, msg] : log) switch (type) { case PartOutput::MsgType::Notice: NOTICE(msg.c_str()); @@ -184,4 +201,50 @@ void ArbLattice::readFromCxn(const std::map& zone_map, const s case PartOutput::MsgType::Error: throw std::runtime_error(msg); } + global_node_dist = std::move(dist); +} + +void ArbLattice::computeGhostNodes() { + std::unordered_set ghosts; + const Span all_nbrs(connect.nbrs.get(), Q * connect.getLocalSize()); + for (auto nbr : all_nbrs) + if (connect.isGhost(nbr)) ghosts.insert(nbr); + ghost_nodes.reserve(ghosts.size()); + std::copy(ghosts.cbegin(), ghosts.cend(), std::back_inserter(ghost_nodes)); + std::sort(ghost_nodes.begin(), ghost_nodes.end()); +} + +void ArbLattice::computeLocalPermutation() { + local_permutation.resize(connect.getLocalSize()); + std::iota(local_permutation.begin(), local_permutation.end(), 0); + const auto is_border_node = [&](int lid) { + for (size_t q = 0; q != Q; ++q) { + const auto nbr = connect.neighbor(q, lid); + if (connect.isGhost(nbr)) return true; + } + return false; + }; + const auto interior_begin = std::stable_partition(local_permutation.begin(), local_permutation.end(), is_border_node); + num_border_nodes = static_cast(std::distance(local_permutation.begin(), interior_begin)); + const auto by_zyx = [&](int lid1, int lid2) { // Sort by z, then y, then x -> this should help with coalesced memory access + const auto get_zyx = [&](int lid) { return std::array{connect.coord(2, lid), connect.coord(1, lid), connect.coord(0, lid)}; }; + return get_zyx(lid1) < get_zyx(lid2); + }; + std::sort(local_permutation.begin(), interior_begin, by_zyx); + std::sort(interior_begin, local_permutation.end(), by_zyx); +} + +int ArbLattice::fullLatticePos(double pos) const { + const auto retval = std::lround(pos / connect.grid_size - .5); + assert(retval <= std::numeric_limits::max() && retval >= std::numeric_limits::min()); + return static_cast(retval); +} + +lbRegion ArbLattice::getLocalBoundingBox() const { + const auto x = Span(connect.coords.get(), connect.getLocalSize()), y = Span(std::next(connect.coords.get(), connect.getLocalSize()), connect.getLocalSize()), z = Span(std::next(connect.coords.get(), 2 * connect.getLocalSize()), connect.getLocalSize()); + const auto [minx_it, maxx_it] = std::minmax_element(x.begin(), x.end()); + const auto [miny_it, maxy_it] = std::minmax_element(y.begin(), y.end()); + const auto [minz_it, maxz_it] = std::minmax_element(z.begin(), z.end()); + const double x_min = *minx_it, x_max = *maxx_it, y_min = *miny_it, y_max = *maxy_it, z_min = *minz_it, z_max = *maxz_it; + return lbRegion(x_min, y_min, z_min, x_max - x_min, y_max - y_min, z_max - z_min); } diff --git a/src/ArbLattice.hpp b/src/ArbLattice.hpp index 90c1831c6..00625d272 100644 --- a/src/ArbLattice.hpp +++ b/src/ArbLattice.hpp @@ -1,23 +1,41 @@ #ifndef ARBLATTICE_HPP #define ARBLATTICE_HPP +#include #include #include #include "ArbConnectivity.hpp" #include "LatticeBase.hpp" #include "Lists.h" +#include "Region.h" + +// Note on node numbering: We have 2 indexing schemes - a global and a local one. +// Globally, nodes are numbered such that consecutive ranks own monotonically increasing intervals, (e.g. rank 0 owns nodes [0, n_0), rank 1 owns nodes [n_0, n_1), etc.) This is required for ParMETIS, but it is also a convenient scheme in general. +// When communicating between ranks, only this numbering is used. +// Locally, nodes are permuted such that: +// - -1 is a valid index, signifying that a neighbor is not present at a given direction +// - Owned boundary nodes (i.e. nodes which have ghost nodes among their neighbors) occupy the initial B indices, starting at 0. Their numbering within [0, B) is arbitrary w.r.t. the global scheme +// - Owned interior nodes occupy the indices [B, B + I), where I is the number of interior nodes. Again, their order within that interval is arbitrary +// - Ghost nodes (i.e. nodes neighboring owned nodes, but not owned by the current rank) occupy indices [B + I, B + I + G), where G is the number of ghost nodes. **Their numbering within that interval corresponds to the global numbering scheme (important for optimal packing)** +// Note that the GPU is only aware of the local numbering scheme, which uses 32b indexing, saving memory bandwidth. Local indexing can additionally put "similar" nodes next to each other, minimizing branching and promoting coalesced memory access. +// Naming: GID == global ID; LID == local ID class ArbLattice : public LatticeBase { - public: - static constexpr size_t Q = Model_m::offset_directions.size(); - private: - ArbLatticeConnectivity connect; - size_t num_snaps; + ArbLatticeConnectivity connect; /// Lattice connectivity info + std::vector global_node_dist; /// Node distribution (in the ParMETIS sense), describing the GID node intervals owned by each process (identical in all ranks) + std::vector ghost_nodes; /// Sorted GIDs of ghost nodes + size_t num_border_nodes; /// Number of border nodes, i.e., nodes which have a ghost neighbor + lbRegion local_bounding_box; /// The bounding box of the local region (the arbitrary lattice is a subset of a full Cartesian lattice, we can use this for some optimizations) + size_t num_snaps; /// Number of snaps to hold + MPI_Comm comm; /// Communicator associated with the lattice + std::vector local_permutation; /// The permutation of owned nodes w.r.t. the global indexing scheme, see comment at the top public: - ArbLattice(size_t num_snaps_, const UnitEnv& units, const std::map& zone_map, const std::string& cxn_path, MPI_Comm comm) : LatticeBase(ZONESETTINGS, ZONE_MAX, units), num_snaps(num_snaps_) { readFromCxn(zone_map, cxn_path, comm); } + static constexpr size_t Q = Model_m::offset_directions.size(); + + ArbLattice(size_t num_snaps_, const UnitEnv& units_, const std::map& zone_map, const std::string& cxn_path, MPI_Comm comm_); size_t getLocalSize() const final { return connect.chunk_end - connect.chunk_begin; } size_t getGlobalSize() const final { return connect.num_nodes_global; } @@ -26,8 +44,13 @@ class ArbLattice : public LatticeBase { void IterateTill(int, int) final {} private: - std::unordered_map makeGroupZoneMap(const std::map& zone_map) const; - void readFromCxn(const std::map& zone_map, const std::string& cxn_path, MPI_Comm comm); + std::unordered_map makeGroupZoneMap(const std::map& zone_map) const; /// Translate the groups and zone into a single indexing scheme for the purposes of uniquely mapping read IDs from the .cxn file + void readFromCxn(const std::map& zone_map, const std::string& cxn_path); /// Read the lattice info from a .cxn file + void partition(); /// Repartition the lattice, if ParMETIS is not present this is a noop + void computeLocalPermutation(); /// Compute the local permutation, see comment at the top + void computeGhostNodes(); /// Retreive GIDs of ghost nodes from the connectivity info structure + int fullLatticePos(double pos) const; /// Compute the position (in terms of lattice offsets) of a node, assuming the arbitrary lattice is a subset of a Cartesian lattice + lbRegion getLocalBoundingBox() const; /// Compute local bounding box, assuming the arbitrary lattice is a subset of a Cartesian lattice }; #endif // ARBLATTICE_HPP diff --git a/src/Lists.h.Rt b/src/Lists.h.Rt index 86c48b1c5..9a19f1e73 100644 --- a/src/Lists.h.Rt +++ b/src/Lists.h.Rt @@ -217,7 +217,7 @@ field_ind = 0 for (f in rows(Fields)) { dirs = expand.grid(dx=f$minx:f$maxx, dy=f$miny:f$maxy, dz=f$minz:f$maxz) for (d in rows(dirs)) { - cat(paste0(" set_field_dir(", field_ind, ", ", d$dx, ", ", d$dy,", ", d$dz, ");\n")) + cat(paste0(" set_field_dir(", field_ind, ", ", d$dx, ", ", d$dy,", ", d$dz, ");\n")) } field_ind = field_ind + 1 }