From f2bb5be241077ec66d7cb5047b29d06543b6fff6 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 13:46:53 +0200 Subject: [PATCH 01/15] const and replace outdated todo by comment --- src/parallel/GeneralDomainDecomposition.h | 2 +- src/particleContainer/TraversalTuner.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel/GeneralDomainDecomposition.h b/src/parallel/GeneralDomainDecomposition.h index 7adbde1a76..f1900e720f 100644 --- a/src/parallel/GeneralDomainDecomposition.h +++ b/src/parallel/GeneralDomainDecomposition.h @@ -172,7 +172,7 @@ class GeneralDomainDecomposition : public DomainDecompMPIBase { std::pair, std::array> latchToGridSize(std::array boxMin, std::array boxMax) { for (size_t ind = 0; ind < 3; ++ind) { - double currentGridSize = (*_gridSize)[ind]; + const double currentGridSize = (*_gridSize)[ind]; // For boxmin, the lower domain boundary is 0, so that's always fine! boxMin[ind] = std::round(boxMin[ind] / currentGridSize) * currentGridSize; // update boxmax only if it isn't at the very top of the domain! diff --git a/src/particleContainer/TraversalTuner.h b/src/particleContainer/TraversalTuner.h index b4f0f27a65..5c1b66368b 100644 --- a/src/particleContainer/TraversalTuner.h +++ b/src/particleContainer/TraversalTuner.h @@ -132,7 +132,8 @@ TraversalTuner::~TraversalTuner() { template void TraversalTuner::findOptimalTraversal() { - // TODO implement autotuning here! At the moment the traversal is chosen via readXML! + // ls1 always uses the traversal selected via the XML + // If you want auto tuning activate AutoPas via CMake _optimalTraversal = _traversals[selectedTraversal].first; From c8b30341ecee68c0c5fe8e28cded6c8cf9330818 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 13:49:26 +0200 Subject: [PATCH 02/15] Change log level from info to debug for low level information --- .../OriginalCellPairTraversal.h | 2 +- src/particleContainer/LinkedCells.cpp | 8 ++++---- src/particleContainer/TraversalTuner.h | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h b/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h index c54e9045fc..16324a213c 100644 --- a/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h +++ b/src/particleContainer/LinkedCellTraversals/OriginalCellPairTraversal.h @@ -123,7 +123,7 @@ inline void OriginalCellPairTraversal::computeNeighbourOffsets() { mardyn_assert(forwardNeighbourIndex == 13); mardyn_assert(backwardNeighbourIndex == 13); - Log::global_log->info() << "Neighbour offsets are bounded by " + Log::global_log->debug() << "Neighbour offsets are bounded by " << minNeighbourOffset << ", " << maxNeighbourOffset << std::endl; } diff --git a/src/particleContainer/LinkedCells.cpp b/src/particleContainer/LinkedCells.cpp index d214872be7..bb03396c28 100644 --- a/src/particleContainer/LinkedCells.cpp +++ b/src/particleContainer/LinkedCells.cpp @@ -131,7 +131,7 @@ void LinkedCells::readXML(XMLfileUnits& xmlconfig) { } bool LinkedCells::rebuild(double bBoxMin[3], double bBoxMax[3]) { - Log::global_log->info() << "REBUILD OF LinkedCells" << std::endl; + Log::global_log->debug() << "REBUILD OF LinkedCells" << std::endl; for (int i = 0; i < 3; i++) { this->_boundingBoxMin[i] = bBoxMin[i]; @@ -139,13 +139,13 @@ bool LinkedCells::rebuild(double bBoxMin[3], double bBoxMax[3]) { // _haloWidthInNumCells[i] = ::ceil(_cellsInCutoff); _haloWidthInNumCells[i] = _cellsInCutoff; } - Log::global_log->info() << "Bounding box: " << "[" << bBoxMin[0] << ", " << bBoxMax[0] << "]" << " x " << "[" + Log::global_log->debug() << "Bounding box: " << "[" << bBoxMin[0] << ", " << bBoxMax[0] << "]" << " x " << "[" << bBoxMin[1] << ", " << bBoxMax[1] << "]" << " x " << "[" << bBoxMin[2] << ", " << bBoxMax[2] << "]" << std::endl; int numberOfCells = 1; - Log::global_log->info() << "Using " << _cellsInCutoff << " cells in cutoff." << std::endl; + Log::global_log->debug() << "Using " << _cellsInCutoff << " cells in cutoff." << std::endl; float rc = (_cutoffRadius / _cellsInCutoff); for (int dim = 0; dim < 3; dim++) { @@ -171,7 +171,7 @@ bool LinkedCells::rebuild(double bBoxMin[3], double bBoxMax[3]) { _haloBoundingBoxMax[dim] = _boundingBoxMax[dim] + _haloLength[dim]; } - Log::global_log->info() << "Cells per dimension (incl. halo): " << _cellsPerDimension[0] << " x " + Log::global_log->debug() << "Cells per dimension (incl. halo): " << _cellsPerDimension[0] << " x " << _cellsPerDimension[1] << " x " << _cellsPerDimension[2] << std::endl; diff --git a/src/particleContainer/TraversalTuner.h b/src/particleContainer/TraversalTuner.h index 5c1b66368b..85578bc52a 100644 --- a/src/particleContainer/TraversalTuner.h +++ b/src/particleContainer/TraversalTuner.h @@ -139,27 +139,27 @@ void TraversalTuner::findOptimalTraversal() { // log traversal if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using HalfShellTraversal." << std::endl; + Log::global_log->debug() << "Using HalfShellTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using OriginalCellPairTraversal." << std::endl; + Log::global_log->debug() << "Using OriginalCellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using C08CellPairTraversal without eighthShell." << std::endl; + Log::global_log->debug() << "Using C08CellPairTraversal without eighthShell." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using C08CellPairTraversal with eighthShell." << std::endl; + Log::global_log->debug() << "Using C08CellPairTraversal with eighthShell." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using C04CellPairTraversal." << std::endl; + Log::global_log->debug() << "Using C04CellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using MidpointTraversal." << std::endl; + Log::global_log->debug() << "Using MidpointTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using NeutralTerritoryTraversal." << std::endl; + Log::global_log->debug() << "Using NeutralTerritoryTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) { - Log::global_log->info() << "Using QuickschedTraversal." << std::endl; + Log::global_log->debug() << "Using QuickschedTraversal." << std::endl; #ifndef QUICKSCHED Log::global_log->error() << "MarDyn was compiled without Quicksched Support. Aborting!" << std::endl; mardyn_exit(1); #endif } else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->info() << "Using SlicedCellPairTraversal." << std::endl; + Log::global_log->debug() << "Using SlicedCellPairTraversal." << std::endl; else Log::global_log->warning() << "Using unknown traversal." << std::endl; From 84dbe7afdfae5ec88fddce61e96b1fdda80298f2 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 13:53:39 +0200 Subject: [PATCH 03/15] Reorder cases based on likelihood to avoid unnecessary dynamic casts --- src/particleContainer/TraversalTuner.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/particleContainer/TraversalTuner.h b/src/particleContainer/TraversalTuner.h index 85578bc52a..65b072512a 100644 --- a/src/particleContainer/TraversalTuner.h +++ b/src/particleContainer/TraversalTuner.h @@ -138,16 +138,18 @@ void TraversalTuner::findOptimalTraversal() { _optimalTraversal = _traversals[selectedTraversal].first; // log traversal - if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using HalfShellTraversal." << std::endl; - else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using OriginalCellPairTraversal." << std::endl; + if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using SlicedCellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) Log::global_log->debug() << "Using C08CellPairTraversal without eighthShell." << std::endl; - else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using C08CellPairTraversal with eighthShell." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) Log::global_log->debug() << "Using C04CellPairTraversal." << std::endl; + else if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using C08CellPairTraversal with eighthShell." << std::endl; + else if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using HalfShellTraversal." << std::endl; + else if (dynamic_cast *>(_optimalTraversal)) + Log::global_log->debug() << "Using OriginalCellPairTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) Log::global_log->debug() << "Using MidpointTraversal." << std::endl; else if (dynamic_cast *>(_optimalTraversal)) @@ -158,9 +160,7 @@ void TraversalTuner::findOptimalTraversal() { Log::global_log->error() << "MarDyn was compiled without Quicksched Support. Aborting!" << std::endl; mardyn_exit(1); #endif - } else if (dynamic_cast *>(_optimalTraversal)) - Log::global_log->debug() << "Using SlicedCellPairTraversal." << std::endl; - else + } else Log::global_log->warning() << "Using unknown traversal." << std::endl; if (_cellsInCutoff > _optimalTraversal->maxCellsInCutoff()) { From 65a0ce179346df9188e8733c9266d18bd08c5535 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 14:08:42 +0200 Subject: [PATCH 04/15] fix incomplete output --- src/parallel/NeighbourCommunicationScheme.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel/NeighbourCommunicationScheme.cpp b/src/parallel/NeighbourCommunicationScheme.cpp index 7eaa8e94ea..cef75c7ff0 100644 --- a/src/parallel/NeighbourCommunicationScheme.cpp +++ b/src/parallel/NeighbourCommunicationScheme.cpp @@ -525,9 +525,10 @@ void IndirectNeighbourCommunicationScheme::initExchangeMoleculesMPI1D(ParticleCo const int numNeighbours = (*_neighbours)[d].size(); std::vector dummy; for (int i = 0; i < numNeighbours; ++i) { - Log::global_log->debug() << "Rank " << domainDecomp->getRank() << " is initiating communication to" << std::endl; + Log::global_log->debug() << "Rank " << domainDecomp->getRank() + << " is initiating communication to " << (*_neighbours)[d][i].getRank() << "\n"; (*_neighbours)[d][i].initSend(moleculeContainer, domainDecomp->getCommunicator(), - domainDecomp->getMPIParticleType(), msgType, dummy, false, true/*do halo position change*/); + domainDecomp->getMPIParticleType(), msgType, dummy, false, true/*do halo position check*/); } } From 0fa3e008975911572c39aafd1188af38e63725f3 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:20:22 +0200 Subject: [PATCH 05/15] Refactor getCoversWholeDomain to return const reference --- src/parallel/ALLLoadBalancer.h | 2 +- src/parallel/GeneralDomainDecomposition.cpp | 2 +- src/parallel/LoadBalancer.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parallel/ALLLoadBalancer.h b/src/parallel/ALLLoadBalancer.h index 9284dcfcf2..c9f0eb9969 100644 --- a/src/parallel/ALLLoadBalancer.h +++ b/src/parallel/ALLLoadBalancer.h @@ -21,7 +21,7 @@ class ALLLoadBalancer : public LoadBalancer { // nothing yet. } - std::array getCoversWholeDomain() override { return _coversWholeDomain; } + const std::array& getCoversWholeDomain() const override { return _coversWholeDomain; } private: ALL _all; diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index fbae56f0e9..daf73bb47b 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -250,7 +250,7 @@ void GeneralDomainDecomposition::migrateParticles(Domain* domain, ParticleContai void GeneralDomainDecomposition::initCommPartners(ParticleContainer* moleculeContainer, Domain* domain) { // init communication partners - auto coversWholeDomain = _loadBalancer->getCoversWholeDomain(); + const auto coversWholeDomain = _loadBalancer->getCoversWholeDomain(); for (int d = 0; d < DIMgeom; ++d) { // this needs to be updated for proper initialization of the neighbours _neighbourCommunicationScheme->setCoverWholeDomain(d, coversWholeDomain[d]); diff --git a/src/parallel/LoadBalancer.h b/src/parallel/LoadBalancer.h index 2b11b02498..f6b4e2faf5 100644 --- a/src/parallel/LoadBalancer.h +++ b/src/parallel/LoadBalancer.h @@ -40,5 +40,5 @@ class LoadBalancer { * Indicates if the current process / MPI rank spans the full length of a dimension. * @return Array of bools, for each dimension one value: true, iff the process spans the entire domain along this dimension. */ - virtual std::array getCoversWholeDomain() = 0; + virtual const std::array& getCoversWholeDomain() const = 0; }; From 0cfabdd3ab09060fd4b434352a7a178126a529b9 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:30:58 +0200 Subject: [PATCH 06/15] rename GeneralDomainDecomposition::gridSize -> _latchGridSize for clarity --- src/parallel/GeneralDomainDecomposition.cpp | 16 ++++++++-------- src/parallel/GeneralDomainDecomposition.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index daf73bb47b..549633177f 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -36,22 +36,22 @@ void GeneralDomainDecomposition::initializeALL() { Log::global_log->info() << "gridSize:" << gridSize[0] << ", " << gridSize[1] << ", " << gridSize[2] << std::endl; Log::global_log->info() << "gridCoords:" << gridCoords[0] << ", " << gridCoords[1] << ", " << gridCoords[2] << std::endl; std::tie(_boxMin, _boxMax) = initializeRegularGrid(_domainLength, gridSize, gridCoords); - if (_forceLatchingToLinkedCellsGrid and not _gridSize.has_value()) { + if (_forceLatchingToLinkedCellsGrid and not _latchGridSize.has_value()) { std::array forcedGridSize{}; for(size_t dim = 0; dim < 3; ++dim){ size_t numCells = _domainLength[dim] / _interactionLength; forcedGridSize[dim] = _domainLength[dim] / numCells; } - _gridSize = forcedGridSize; + _latchGridSize = forcedGridSize; } - if (_gridSize.has_value()) { + if (_latchGridSize.has_value()) { std::tie(_boxMin, _boxMax) = latchToGridSize(_boxMin, _boxMax); } #ifdef ENABLE_ALLLBL // Increased slightly to prevent rounding errors. const double safetyFactor = 1. + 1.e-10; const std::array minimalDomainSize = - _gridSize.has_value() ? *_gridSize + _latchGridSize.has_value() ? *_latchGridSize : std::array{_interactionLength * safetyFactor, _interactionLength * safetyFactor, _interactionLength * safetyFactor}; @@ -102,7 +102,7 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo Log::global_log->debug() << "work:" << lastTraversalTime << std::endl; Log::global_log->set_mpi_output_root(0); auto [newBoxMin, newBoxMax] = _loadBalancer->rebalance(lastTraversalTime); - if (_gridSize.has_value()) { + if (_latchGridSize.has_value()) { std::tie(newBoxMin, newBoxMax) = latchToGridSize(newBoxMin, newBoxMax); } // migrate the particles, this will rebuild the moleculeContainer! @@ -292,12 +292,12 @@ void GeneralDomainDecomposition::readXML(XMLfileUnits& xmlconfig) { << strings.size() << "!" << std::endl; mardyn_exit(8134); } - _gridSize = {std::stod(strings[0]), std::stod(strings[1]), std::stod(strings[2])}; + _latchGridSize = {std::stod(strings[0]), std::stod(strings[1]), std::stod(strings[2])}; } else { double gridSize = std::stod(gridSizeString); - _gridSize = {gridSize, gridSize, gridSize}; + _latchGridSize = {gridSize, gridSize, gridSize}; } - for (auto gridSize : *_gridSize) { + for (auto gridSize : *_latchGridSize) { if (gridSize < _interactionLength) { Log::global_log->error() << "GeneralDomainDecomposition's gridSize (" << gridSize << ") is smaller than the interactionLength (" << _interactionLength diff --git a/src/parallel/GeneralDomainDecomposition.h b/src/parallel/GeneralDomainDecomposition.h index f1900e720f..12dd37f026 100644 --- a/src/parallel/GeneralDomainDecomposition.h +++ b/src/parallel/GeneralDomainDecomposition.h @@ -172,7 +172,7 @@ class GeneralDomainDecomposition : public DomainDecompMPIBase { std::pair, std::array> latchToGridSize(std::array boxMin, std::array boxMax) { for (size_t ind = 0; ind < 3; ++ind) { - const double currentGridSize = (*_gridSize)[ind]; + const double currentGridSize = (*_latchGridSize)[ind]; // For boxmin, the lower domain boundary is 0, so that's always fine! boxMin[ind] = std::round(boxMin[ind] / currentGridSize) * currentGridSize; // update boxmax only if it isn't at the very top of the domain! @@ -197,10 +197,10 @@ class GeneralDomainDecomposition : public DomainDecompMPIBase { size_t _initFrequency{500}; /** - * Optionally safe a given grid size on which the process boundaries are bound/latched. + * Optionally, give a grid size (=3D size of one grid cell) on which the process boundaries are bound/latched. * If no value is given, it is not used. */ - std::optional> _gridSize{}; + std::optional> _latchGridSize{}; /** * Bool that indicates whether a grid should be forced even if no gridSize is set. From 39d51ba51385a5be3f8a7a7bb594994e2cd12577 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:43:12 +0200 Subject: [PATCH 07/15] Refactor ALLLoadBalancer constructor to use const references Updated ALLLoadBalancer constructor parameters to use `const` references for efficiency and alignment with expected input types for the ALL library. This change improves code clarity and ensures proper handling of input arguments by maintaining consistency in data types. --- src/parallel/ALLLoadBalancer.cpp | 30 ++++++++++++++++-------------- src/parallel/ALLLoadBalancer.h | 6 +++--- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/parallel/ALLLoadBalancer.cpp b/src/parallel/ALLLoadBalancer.cpp index d6ff1c95ea..b58965216a 100644 --- a/src/parallel/ALLLoadBalancer.cpp +++ b/src/parallel/ALLLoadBalancer.cpp @@ -5,24 +5,26 @@ */ #include "ALLLoadBalancer.h" -ALLLoadBalancer::ALLLoadBalancer(std::array boxMin, std::array boxMax, double gamma, - MPI_Comm comm, std::array globalSize, - std::array localCoordinates, std::array minimalPartitionSize) - : _all(3 /*dim*/, gamma) { - std::vector points; - points.emplace_back(3, boxMin.data()); - points.emplace_back(3, boxMax.data()); +ALLLoadBalancer::ALLLoadBalancer(const std::array &boxMin, const std::array &boxMax, double gamma, + MPI_Comm comm, const std::array &globalSize, + const std::array &localCoordinates, + const std::array &minimalPartitionSize) + : _all(3 /*dim*/, gamma), _minimalPartitionSize(minimalPartitionSize) { + // convert input into non-const vector because that is what ALL expects + std::vector points { + {3, boxMin.data()}, + {3, boxMax.data()}, + }; _all.set_vertices(points); - std::array global_size{static_cast(globalSize[0]), static_cast(globalSize[1]), + // convert input into non-const int arrays because that is what ALL expects + std::array globalSizeIntArray{static_cast(globalSize[0]), static_cast(globalSize[1]), static_cast(globalSize[2])}; std::array coords{static_cast(localCoordinates[0]), static_cast(localCoordinates[1]), static_cast(localCoordinates[2])}; - _all.set_proc_grid_params(coords.data(), global_size.data()); + _all.set_proc_grid_params(coords.data(), globalSizeIntArray.data()); _all.set_communicator(comm); - _coversWholeDomain = {globalSize[0] == 1, global_size[1] == 1, global_size[2] == 1}; - - _minimalPartitionSize = minimalPartitionSize; + _coversWholeDomain = {globalSizeIntArray[0] == 1, globalSizeIntArray[1] == 1, globalSizeIntArray[2] == 1}; } std::tuple, std::array> ALLLoadBalancer::rebalance(double work) { _all.set_work(work); @@ -30,8 +32,8 @@ std::tuple, std::array> ALLLoadBalancer::rebala _all.set_min_domain_size(ALL_LB_t::STAGGERED, _minimalPartitionSize.data()); _all.balance(ALL_LB_t::STAGGERED); auto resultVertices = _all.get_result_vertices(); - std::array boxMin{resultVertices[0].x(0), resultVertices[0].x(1), resultVertices[0].x(2)}; - std::array boxMax{resultVertices[1].x(0), resultVertices[1].x(1), resultVertices[1].x(2)}; _all.set_vertices(resultVertices); + const std::array boxMin{resultVertices[0].x(0), resultVertices[0].x(1), resultVertices[0].x(2)}; + const std::array boxMax{resultVertices[1].x(0), resultVertices[1].x(1), resultVertices[1].x(2)}; return std::make_tuple(boxMin, boxMax); } diff --git a/src/parallel/ALLLoadBalancer.h b/src/parallel/ALLLoadBalancer.h index c9f0eb9969..b70562ef14 100644 --- a/src/parallel/ALLLoadBalancer.h +++ b/src/parallel/ALLLoadBalancer.h @@ -11,9 +11,9 @@ class ALLLoadBalancer : public LoadBalancer { public: - ALLLoadBalancer(std::array boxMin, std::array boxMax, double gamma, MPI_Comm comm, - std::array globalSize, std::array localCoordinates, - std::array minimalPartitionSize); + ALLLoadBalancer(const std::array &boxMin, const std::array &boxMax, double gamma, MPI_Comm comm, + const std::array& globalSize, const std::array& localCoordinates, + const std::array& minimalPartitionSize); ~ALLLoadBalancer() override = default; std::tuple, std::array> rebalance(double work) override; From 8b4291904f7b5c81da000dc335be2fff6a107644 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 17:46:14 +0200 Subject: [PATCH 08/15] fix automatic grid size calculation --- src/parallel/GeneralDomainDecomposition.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index 549633177f..946157a12a 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -39,7 +39,8 @@ void GeneralDomainDecomposition::initializeALL() { if (_forceLatchingToLinkedCellsGrid and not _latchGridSize.has_value()) { std::array forcedGridSize{}; for(size_t dim = 0; dim < 3; ++dim){ - size_t numCells = _domainLength[dim] / _interactionLength; + // if we calculate 3.5 cells per dim there is only space for 3 -> floor + const auto numCells = std::floor(_domainLength[dim] / _interactionLength); forcedGridSize[dim] = _domainLength[dim] / numCells; } _latchGridSize = forcedGridSize; From 117ac00df1bab40d6975f08afae2db3897d63f20 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 18:07:25 +0200 Subject: [PATCH 09/15] use non-flushing linebreaks --- src/parallel/DomainDecomposition.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parallel/DomainDecomposition.cpp b/src/parallel/DomainDecomposition.cpp index 4aa51be05a..bf2a87437a 100644 --- a/src/parallel/DomainDecomposition.cpp +++ b/src/parallel/DomainDecomposition.cpp @@ -87,15 +87,15 @@ bool DomainDecomposition::queryBalanceAndExchangeNonBlocking(bool /*forceRebalan void DomainDecomposition::balanceAndExchange(double /*lastTraversalTime*/, bool /*forceRebalancing*/, ParticleContainer* moleculeContainer, Domain* domain) { if (sendLeavingWithCopies()) { - Log::global_log->debug() << "DD: Sending Leaving and Halos." << std::endl; + Log::global_log->debug() << "DD: Sending Leaving and Halos.\n"; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, LEAVING_AND_HALO_COPIES); } else { - Log::global_log->debug() << "DD: Sending Leaving." << std::endl; + Log::global_log->debug() << "DD: Sending Leaving.\n"; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, LEAVING_ONLY); #ifndef MARDYN_AUTOPAS moleculeContainer->deleteOuterParticles(); #endif - Log::global_log->debug() << "DD: Sending Halos." << std::endl; + Log::global_log->debug() << "DD: Sending Halos.\n"; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, HALO_COPIES); } } From 47180f4dfa9da8085b92de440ac0e985fadcca9a Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Wed, 4 Sep 2024 18:37:39 +0200 Subject: [PATCH 10/15] const, rename for clarity and use std::array instead of pointer where possible --- src/parallel/NeighbourCommunicationScheme.cpp | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/parallel/NeighbourCommunicationScheme.cpp b/src/parallel/NeighbourCommunicationScheme.cpp index cef75c7ff0..ab4190d750 100644 --- a/src/parallel/NeighbourCommunicationScheme.cpp +++ b/src/parallel/NeighbourCommunicationScheme.cpp @@ -434,16 +434,17 @@ void NeighbourCommunicationScheme::selectNeighbours(MessageType msgType, bool im void DirectNeighbourCommunicationScheme::initCommunicationPartners(double cutoffRadius, Domain * domain, DomainDecompMPIBase* domainDecomp, ParticleContainer* moleculeContainer) { // corners of the process-specific domain - double rmin[DIMgeom]; // lower corner - double rmax[DIMgeom]; // higher corner - - for (int d = 0; d < DIMgeom; d++) { - rmin[d] = domainDecomp->getBoundingBoxMin(d, domain); - rmax[d] = domainDecomp->getBoundingBoxMax(d, domain); - - // TODO: this should be safe, as long as molecules don't start flying around - // at the speed of one cutoffRadius per time step - } + static_assert(DIMgeom == 3); // The initialization here assumes 3 dimensions! + const std::array localLowerCorner{ + domainDecomp->getBoundingBoxMin(0, domain), + domainDecomp->getBoundingBoxMin(1, domain), + domainDecomp->getBoundingBoxMin(2, domain), + }; + const std::array localUpperCorner{ + domainDecomp->getBoundingBoxMax(0, domain), + domainDecomp->getBoundingBoxMax(1, domain), + domainDecomp->getBoundingBoxMax(2, domain), + }; if (_pushPull) { for (unsigned int d = 0; d < _commDimms; d++) { // why free? @@ -458,18 +459,17 @@ void DirectNeighbourCommunicationScheme::initCommunicationPartners(double cutoff } } - HaloRegion ownRegion = {rmin[0], rmin[1], rmin[2], rmax[0], rmax[1], rmax[2], 0, 0, 0, cutoffRadius}; + HaloRegion ownRegion = {localLowerCorner[0], localLowerCorner[1], localLowerCorner[2], localUpperCorner[0], localUpperCorner[1], localUpperCorner[2], 0, 0, 0, cutoffRadius}; if (_pushPull) { - double* cellLength = moleculeContainer->getHaloSize(); + double* const cellLength = moleculeContainer->getHaloSize(); // halo/force regions std::vector haloOrForceRegions = _zonalMethod->getHaloImportForceExportRegions(ownRegion, cutoffRadius, _coversWholeDomain, cellLength); std::vector leavingRegions = - _zonalMethod->getLeavingExportRegions(ownRegion, cutoffRadius, - _coversWholeDomain); + _zonalMethod->getLeavingExportRegions(ownRegion, cutoffRadius, _coversWholeDomain); - std::array globalDomainLength{domain->getGlobalLength(0), domain->getGlobalLength(1), + const std::array globalDomainLength{domain->getGlobalLength(0), domain->getGlobalLength(1), domain->getGlobalLength(2)}; // assuming p1 sends regions to p2 std::tie((*_haloImportForceExportNeighbours)[0], (*_haloExportForceImportNeighbours)[0]) = From 46dc52c2532ebd3be25545af165becd01b1d55d5 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Thu, 5 Sep 2024 13:58:25 +0200 Subject: [PATCH 11/15] Refactor NeighborAcquirer.cpp for readability and efficiency - const - constexpr - rename variables - move declarations to usages - limit scope - reserve before push_back --- src/parallel/NeighborAcquirer.cpp | 151 +++++++++++++++--------------- src/parallel/NeighborAcquirer.h | 2 +- 2 files changed, 78 insertions(+), 75 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 86d69fe757..0f3bc58441 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -18,49 +18,50 @@ * saved in partners01. */ std::tuple, std::vector> NeighborAcquirer::acquireNeighbors( - const std::array &globalDomainLength, HaloRegion *ownRegion, std::vector &desiredRegions, + const std::array &globalDomainLength, HaloRegion *ownRegion, const std::vector &desiredRegions, const MPI_Comm &comm, bool excludeOwnRank) { - int my_rank; // my rank + int my_rank{}; // my rank MPI_Comm_rank(comm, &my_rank); - int num_processes; // the number of processes in comm + int num_processes{}; // the number of processes in comm MPI_Comm_size(comm, &num_processes); - int num_regions = desiredRegions.size(); // the number of regions I would like to acquire from other processes + const auto num_regions = desiredRegions.size(); // the number of regions I would like to acquire from other processes // tell the other processes how much you are going to send - int num_bytes_send = - sizeof(int) * 2 + (sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) * 1) * - num_regions; // how many bytes am I going to send to all the other processes? - std::vector num_bytes_receive_vec(num_processes, 0); // vector of number of bytes I am going to receive - // MPI_Allreduce(&num_bytes_send, &num_bytes_receive, 1, MPI_INT, MPI_SUM, comm); - MPI_Allgather(&num_bytes_send, 1, MPI_INT, num_bytes_receive_vec.data(), 1, MPI_INT, comm); + // how many bytes am I going to send to all the other processes + const int num_bytes_send = + sizeof(int) * 2 + (sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) * 1) * num_regions; - // create byte buffer + // create byte send buffer std::vector outgoingDesiredRegionsVector(num_bytes_send); // outgoing byte buffer - int i = 0; - int p = 0; // msg format: rank | number_of_regions | region_01 | region_02 | ... - - memcpy(outgoingDesiredRegionsVector.data() + i, &my_rank, sizeof(int)); - i += sizeof(int); - memcpy(outgoingDesiredRegionsVector.data() + i, &num_regions, sizeof(int)); - i += sizeof(int); + // fill the buffer + int bufferPosition = 0; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, &my_rank, sizeof(int)); + bufferPosition += sizeof(int); + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, &num_regions, sizeof(int)); + bufferPosition += sizeof(int); for (auto ®ion : desiredRegions) { // filling up the outgoing byte buffer - memcpy(outgoingDesiredRegionsVector.data() + i, region.rmin, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(outgoingDesiredRegionsVector.data() + i, region.rmax, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(outgoingDesiredRegionsVector.data() + i, region.offset, sizeof(int) * 3); - i += sizeof(int) * 3; - memcpy(outgoingDesiredRegionsVector.data() + i, ®ion.width, sizeof(double)); - i += sizeof(double); + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, region.rmin, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, region.rmax, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, region.offset, sizeof(int) * 3); + bufferPosition += sizeof(int) * 3; + memcpy(outgoingDesiredRegionsVector.data() + bufferPosition, ®ion.width, sizeof(double)); + bufferPosition += sizeof(double); } + // set up structure information data for the Allgatherv operation + // vector of number of bytes I am going to receive + std::vector num_bytes_receive_vec(num_processes, 0); + MPI_Allgather(&num_bytes_send, 1, MPI_INT, num_bytes_receive_vec.data(), 1, MPI_INT, comm); + // vector of offsets (=displacement in MPI) in the receive buffer + std::vector num_bytes_displacements(num_processes, 0); int num_bytes_receive = 0; - std::vector num_bytes_displacements(num_processes, 0); // vector of number of bytes I am going to receive for (int j = 0; j < num_processes; j++) { num_bytes_displacements[j] = num_bytes_receive; num_bytes_receive += num_bytes_receive_vec[j]; @@ -74,38 +75,40 @@ std::tuple, std::vector> std::vector numberOfRegionsToSendToRank(num_processes, 0); // outgoing row - int bytesOneRegion = + constexpr int bytesOneRegion = sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) + sizeof(double) * 3; - std::vector>> sendingList(num_processes); // the regions I own and want to send + // the regions I own and want to send: ranks> + std::vector>> sendingList(num_processes); std::vector comm_partners02; - i = 0; - while (i != num_bytes_receive) { - int rank; - int regions; + bufferPosition = 0; + while (bufferPosition < num_bytes_receive /*== buffer length*/) { - memcpy(&rank, incomingDesiredRegionsVector.data() + i, sizeof(int)); - i += sizeof(int); // 4 - memcpy(®ions, incomingDesiredRegionsVector.data() + i, sizeof(int)); - i += sizeof(int); // 4 + int rank{}; + memcpy(&rank, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(int)); + bufferPosition += sizeof(int); // 4 + int regions{}; + memcpy(®ions, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(int)); + bufferPosition += sizeof(int); // 4 - for (int j = 0; j < regions; j++) { + for (int regionId = 0; regionId < regions; ++regionId) { HaloRegion unshiftedRegion{}; - memcpy(unshiftedRegion.rmin, incomingDesiredRegionsVector.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; // 24 - memcpy(unshiftedRegion.rmax, incomingDesiredRegionsVector.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; // 24 - memcpy(unshiftedRegion.offset, incomingDesiredRegionsVector.data() + i, sizeof(int) * 3); - i += sizeof(int) * 3; // 12 - memcpy(&unshiftedRegion.width, incomingDesiredRegionsVector.data() + i, sizeof(double)); - i += sizeof(double); // 4 + memcpy(unshiftedRegion.rmin, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; // 24 + memcpy(unshiftedRegion.rmax, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; // 24 + memcpy(unshiftedRegion.offset, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(int) * 3); + bufferPosition += sizeof(int) * 3; // 12 + memcpy(&unshiftedRegion.width, incomingDesiredRegionsVector.data() + bufferPosition, sizeof(double)); + bufferPosition += sizeof(double); // 4 // msg format one region: rmin | rmax | offset | width | shift - auto shiftedRegionShiftPair = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); - - std::vector regionsToTest = shiftedRegionShiftPair.first; - std::vector> shifts = shiftedRegionShiftPair.second; - + auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); + // Before every set of push_backs make sure there is enough space for this set + all remaining. + // Work with the assumption that the others are of the same size as the current ones. + // This is potentially an overestimate but avoids a large number of resizes. + sendingList.reserve(sendingList.size() + ((regions - regionId) * regionsToTest.size())); + comm_partners02.reserve(comm_partners02.size() + ((regions - regionId) * regionsToTest.size())); for(size_t regionIndex = 0; regionIndex < regionsToTest.size(); ++regionIndex){ auto regionToTest = regionsToTest[regionIndex]; if ((not excludeOwnRank or rank != my_rank) and isIncluded(ownRegion, ®ionToTest)) { @@ -113,10 +116,10 @@ std::tuple, std::vector> numberOfRegionsToSendToRank[rank]++; // this is a region I will send to rank - auto overlappedRegion = overlap(*ownRegion, regionToTest); // different shift for the overlap? + const auto overlappedRegion = overlap(*ownRegion, regionToTest); // different shift for the overlap? // make a note in partners02 - don't forget to squeeze partners02 - bool enlarged[3][2] = {{false}}; + constexpr bool enlarged[3][2] = {{false}}; for (int k = 0; k < 3; k++) currentShift[k] *= -1; comm_partners02.emplace_back(rank, overlappedRegion.rmin, overlappedRegion.rmax, overlappedRegion.rmin, @@ -143,7 +146,7 @@ std::tuple, std::vector> std::vector singleRegion(bytesOneRegion); - p = 0; + int p = 0; memcpy(&singleRegion[p], unshiftedOverlappedRegion.rmin, sizeof(double) * 3); p += sizeof(double) * 3; memcpy(&singleRegion[p], unshiftedOverlappedRegion.rmax, sizeof(double) * 3); @@ -155,7 +158,7 @@ std::tuple, std::vector> memcpy(&singleRegion[p], currentShift.data(), sizeof(double) * 3); //p += sizeof(double) * 3; - sendingList[rank].push_back(std::move(singleRegion)); + sendingList[rank].emplace_back(std::move(singleRegion)); } } } @@ -218,19 +221,17 @@ std::tuple, std::vector> std::vector comm_partners01; // the communication partners // receive data (blocking) - int byte_counter = 0; - /** * We now receive as many regions as we previously determined that we will receive. * For that we keep track, how many regions we received and increase this according to the number of regions * received per MPI operation. */ - while (byte_counter < numberOfRegionsToReceive[my_rank] * bytesOneRegion) { + for (int byte_counter = 0; byte_counter < numberOfRegionsToReceive[my_rank] * bytesOneRegion; ) { // MPI_PROBE MPI_Probe(MPI_ANY_SOURCE, 1, comm, &probe_status); // interpret probe - int source = probe_status.MPI_SOURCE; - int bytes; + const auto source = probe_status.MPI_SOURCE; + int bytes{}; MPI_Get_count(&probe_status, MPI_BYTE, &bytes); // we have receive `bytes` bytes. So we increase the byte_counter. byte_counter += bytes; @@ -238,24 +239,26 @@ std::tuple, std::vector> std::vector raw_neighbours(bytes); MPI_Recv(raw_neighbours.data(), bytes, MPI_BYTE, source, 1, comm, &rec_status); // Interpret Buffer and add neighbours - for (int k = 0; k < (bytes / bytesOneRegion); k++) { // number of regions from this process + const auto numRegionsToReceive = bytes / bytesOneRegion; + comm_partners01.reserve(std::max(comm_partners01.size(), static_cast(numberOfRegionsToReceive[my_rank] * numRegionsToReceive))); + for (int regionId = 0; regionId < numRegionsToReceive; ++regionId) { // number of regions from this process HaloRegion region{}; - double shift[3]; - i = k * bytesOneRegion; + bufferPosition = regionId * bytesOneRegion; - memcpy(region.rmin, raw_neighbours.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(region.rmax, raw_neighbours.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; - memcpy(region.offset, raw_neighbours.data() + i, sizeof(int) * 3); - i += sizeof(int) * 3; - memcpy(®ion.width, raw_neighbours.data() + i, sizeof(double)); - i += sizeof(double); + memcpy(region.rmin, raw_neighbours.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(region.rmax, raw_neighbours.data() + bufferPosition, sizeof(double) * 3); + bufferPosition += sizeof(double) * 3; + memcpy(region.offset, raw_neighbours.data() + bufferPosition, sizeof(int) * 3); + bufferPosition += sizeof(int) * 3; + memcpy(®ion.width, raw_neighbours.data() + bufferPosition, sizeof(double)); + bufferPosition += sizeof(double); - memcpy(shift, raw_neighbours.data() + i, sizeof(double) * 3); - i += sizeof(double) * 3; + double shift[3]; + memcpy(shift, raw_neighbours.data() + bufferPosition, sizeof(double) * 3); + // bufferPosition += sizeof(double) * 3; - bool enlarged[3][2] = {{false}}; + constexpr bool enlarged[3][2] = {{false}}; comm_partners01.emplace_back(source, region.rmin, region.rmax, region.rmin, region.rmax, shift, region.offset, enlarged); diff --git a/src/parallel/NeighborAcquirer.h b/src/parallel/NeighborAcquirer.h index 5fb9cba960..0f92e35cee 100644 --- a/src/parallel/NeighborAcquirer.h +++ b/src/parallel/NeighborAcquirer.h @@ -29,7 +29,7 @@ class NeighborAcquirer { * second vector will own the particles. */ static std::tuple, std::vector> acquireNeighbors( - const std::array& globalDomainLength, HaloRegion* ownRegion, std::vector& desiredRegions, + const std::array& globalDomainLength, HaloRegion* ownRegion, const std::vector& desiredRegions, const MPI_Comm& comm, bool excludeOwnRank=true); static std::vector squeezePartners(const std::vector& partners); From da8e980cd99d712321510d38d5c912dbc7f2537b Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Thu, 5 Sep 2024 14:07:29 +0200 Subject: [PATCH 12/15] change low level info logs to debug --- src/parallel/GeneralDomainDecomposition.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/parallel/GeneralDomainDecomposition.cpp b/src/parallel/GeneralDomainDecomposition.cpp index 946157a12a..efd6aa750f 100644 --- a/src/parallel/GeneralDomainDecomposition.cpp +++ b/src/parallel/GeneralDomainDecomposition.cpp @@ -97,7 +97,7 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo moleculeContainer->deleteOuterParticles(); // rebalance - Log::global_log->info() << "rebalancing..." << std::endl; + Log::global_log->debug() << "rebalancing..." << std::endl; Log::global_log->set_mpi_output_all(); Log::global_log->debug() << "work:" << lastTraversalTime << std::endl; @@ -107,7 +107,7 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo std::tie(newBoxMin, newBoxMax) = latchToGridSize(newBoxMin, newBoxMax); } // migrate the particles, this will rebuild the moleculeContainer! - Log::global_log->info() << "migrating particles" << std::endl; + Log::global_log->debug() << "migrating particles" << std::endl; migrateParticles(domain, moleculeContainer, newBoxMin, newBoxMax); #ifndef MARDYN_AUTOPAS @@ -120,9 +120,9 @@ void GeneralDomainDecomposition::balanceAndExchange(double lastTraversalTime, bo _boxMax = newBoxMax; // init communication partners - Log::global_log->info() << "updating communication partners" << std::endl; + Log::global_log->debug() << "updating communication partners" << std::endl; initCommPartners(moleculeContainer, domain); - Log::global_log->info() << "rebalancing finished" << std::endl; + Log::global_log->debug() << "rebalancing finished" << std::endl; DomainDecompMPIBase::exchangeMoleculesMPI(moleculeContainer, domain, HALO_COPIES); } else { if (sendLeavingWithCopies()) { From 3d8dab716987119082d20f516b08d4f222c8dfba Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Thu, 5 Sep 2024 15:45:30 +0200 Subject: [PATCH 13/15] const + default inits + comments --- src/parallel/NeighborAcquirer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 0f3bc58441..7f1d007840 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -75,12 +75,13 @@ std::tuple, std::vector> std::vector numberOfRegionsToSendToRank(num_processes, 0); // outgoing row + // parse / deserialize received data constexpr int bytesOneRegion = sizeof(double) * 3 + sizeof(double) * 3 + sizeof(int) * 3 + sizeof(double) + sizeof(double) * 3; // the regions I own and want to send: ranks> std::vector>> sendingList(num_processes); - std::vector comm_partners02; + std::vector comm_partners02{}; bufferPosition = 0; while (bufferPosition < num_bytes_receive /*== buffer length*/) { @@ -103,7 +104,7 @@ std::tuple, std::vector> bufferPosition += sizeof(double); // 4 // msg format one region: rmin | rmax | offset | width | shift - auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); + const auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); // Before every set of push_backs make sure there is enough space for this set + all remaining. // Work with the assumption that the others are of the same size as the current ones. // This is potentially an overestimate but avoids a large number of resizes. From c3176486e5f1b1fda3dad88c339af76c6d568669 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Mon, 7 Oct 2024 15:06:10 +0200 Subject: [PATCH 14/15] Reserve the correct subvector of sendingList + clarify doc --- src/parallel/NeighborAcquirer.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 7f1d007840..4dba32754d 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -106,9 +106,10 @@ std::tuple, std::vector> // msg format one region: rmin | rmax | offset | width | shift const auto [regionsToTest, shifts] = getPotentiallyShiftedRegions(globalDomainLength, unshiftedRegion); // Before every set of push_backs make sure there is enough space for this set + all remaining. - // Work with the assumption that the others are of the same size as the current ones. - // This is potentially an overestimate but avoids a large number of resizes. - sendingList.reserve(sendingList.size() + ((regions - regionId) * regionsToTest.size())); + // This guarantees that there is enough space for the current set of push_backs, and, if subsequent sets + // are smaller, further reallocations can be avoided. This potentially leads to an overestimate but comes + // with the advantage of fewer resizes. + sendingList[rank].reserve(sendingList[rank].size() + ((regions - regionId) * regionsToTest.size())); comm_partners02.reserve(comm_partners02.size() + ((regions - regionId) * regionsToTest.size())); for(size_t regionIndex = 0; regionIndex < regionsToTest.size(); ++regionIndex){ auto regionToTest = regionsToTest[regionIndex]; From 0bb2a7c3ff9ebd27ff12d1955b693d6bd839a950 Mon Sep 17 00:00:00 2001 From: FG-TUM Date: Mon, 7 Oct 2024 15:07:20 +0200 Subject: [PATCH 15/15] formatting --- src/parallel/NeighborAcquirer.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/parallel/NeighborAcquirer.cpp b/src/parallel/NeighborAcquirer.cpp index 4dba32754d..db4ae4078b 100644 --- a/src/parallel/NeighborAcquirer.cpp +++ b/src/parallel/NeighborAcquirer.cpp @@ -122,12 +122,16 @@ std::tuple, std::vector> // make a note in partners02 - don't forget to squeeze partners02 constexpr bool enlarged[3][2] = {{false}}; - for (int k = 0; k < 3; k++) currentShift[k] *= -1; + for (int k = 0; k < 3; k++) { + currentShift[k] *= -1; + } comm_partners02.emplace_back(rank, overlappedRegion.rmin, overlappedRegion.rmax, overlappedRegion.rmin, overlappedRegion.rmax, currentShift.data(), overlappedRegion.offset, enlarged); - for (int k = 0; k < 3; k++) currentShift[k] *= -1; + for (int k = 0; k < 3; k++) { + currentShift[k] *= -1; + } // Undo the shift. So it is again in the perspective of the rank we got this region from. // We cannot use unshiftedRegion, as it is not overlapped and thus potentially too big.