MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_RepartitionHeuristicFactory_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
11#define PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
12
13#include <algorithm>
14#include <iostream>
15#include <sstream>
16
17#ifdef HAVE_MPI
18#include <Teuchos_DefaultMpiComm.hpp>
19#include <Teuchos_CommHelpers.hpp>
20
21//#include <Xpetra_Map.hpp>
22#include <Xpetra_Matrix.hpp>
23
24#include "MueLu_RAPFactory.hpp"
25#include "MueLu_BlockedRAPFactory.hpp"
26#include "MueLu_SubBlockAFactory.hpp"
27#include "MueLu_Level.hpp"
28#include "MueLu_MasterList.hpp"
29#include "MueLu_Monitor.hpp"
30
32
33namespace MueLu {
34
35template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
37
38template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
40
41template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
43 RCP<ParameterList> validParamList = rcp(new ParameterList());
44
45#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
46 SET_VALID_ENTRY("repartition: start level");
47 SET_VALID_ENTRY("repartition: use map");
48 SET_VALID_ENTRY("repartition: node repartition level");
49 SET_VALID_ENTRY("repartition: min rows per proc");
50 SET_VALID_ENTRY("repartition: target rows per proc");
51 SET_VALID_ENTRY("repartition: min rows per thread");
52 SET_VALID_ENTRY("repartition: target rows per thread");
53 SET_VALID_ENTRY("repartition: max imbalance");
54#undef SET_VALID_ENTRY
55
56 validParamList->set<RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
57 validParamList->set<RCP<const FactoryBase> >("Map", Teuchos::null, "Factory of the map Map");
58 validParamList->set<RCP<const FactoryBase> >("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
59
60 return validParamList;
61}
62
63template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
65 const Teuchos::ParameterList& pL = GetParameterList();
66 if (pL.isParameter("repartition: use map")) {
67 const bool useMap = pL.get<bool>("repartition: use map");
68 if (useMap)
69 Input(currentLevel, "Map");
70 else
71 Input(currentLevel, "A");
72 } else
73 Input(currentLevel, "A");
74 if (pL.isParameter("repartition: node repartition level")) {
75 const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
76 if (currentLevel.GetLevelID() == nodeRepartLevel) {
77 Input(currentLevel, "Node Comm");
78 }
79 }
80}
81
82template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
84 FactoryMonitor m(*this, "Build", currentLevel);
85
86 const Teuchos::ParameterList& pL = GetParameterList();
87 // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
88 // TODO (JG): I don't really know if we want to do this.
89 const int startLevel = pL.get<int>("repartition: start level");
90 const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
91 LO minRowsPerProcess = pL.get<LO>("repartition: min rows per proc");
92 LO targetRowsPerProcess = pL.get<LO>("repartition: target rows per proc");
93 LO minRowsPerThread = pL.get<LO>("repartition: min rows per thread");
94 LO targetRowsPerThread = pL.get<LO>("repartition: target rows per thread");
95 const double nonzeroImbalance = pL.get<double>("repartition: max imbalance");
96 const bool useMap = pL.get<bool>("repartition: use map");
97
98 int thread_per_mpi_rank = 1;
99#if defined(KOKKOS_ENABLE_OPENMP)
100 using execution_space = typename Node::device_type::execution_space;
101 if (std::is_same<execution_space, Kokkos::OpenMP>::value)
102 thread_per_mpi_rank = execution_space().concurrency();
103#endif
104
105 if (minRowsPerThread > 0)
106 // We ignore the value given by minRowsPerProcess and repartition based on threads instead
107 minRowsPerProcess = minRowsPerThread * thread_per_mpi_rank;
108
109 if (targetRowsPerThread == 0)
110 targetRowsPerThread = minRowsPerThread;
111
112 if (targetRowsPerThread > 0)
113 // We ignore the value given by targetRowsPerProcess and repartition based on threads instead
114 targetRowsPerProcess = targetRowsPerThread * thread_per_mpi_rank;
115
116 if (targetRowsPerProcess == 0)
117 targetRowsPerProcess = minRowsPerProcess;
118
119 // Stick this on the level so Zoltan2Interface can use this later
120 Set<LO>(currentLevel, "repartition: heuristic target rows per process", targetRowsPerProcess);
121
122 // Check for validity of the node repartition option
123 TEUCHOS_TEST_FOR_EXCEPTION(nodeRepartLevel >= startLevel, Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): If 'repartition: node repartition level' is set, it must be less than or equal to 'repartition: start level'");
124
125 RCP<Matrix> A;
126 RCP<const FactoryBase> Afact;
127 RCP<const Map> map;
128 if (!useMap) {
129 Afact = GetFactory("A");
130 if (!Afact.is_null() && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) == Teuchos::null &&
131 Teuchos::rcp_dynamic_cast<const BlockedRAPFactory>(Afact) == Teuchos::null &&
132 Teuchos::rcp_dynamic_cast<const SubBlockAFactory>(Afact) == Teuchos::null) {
133 GetOStream(Warnings) << "MueLu::RepartitionHeuristicFactory::Build: The generation factory for A must "
134 "be a RAPFactory or a SubBlockAFactory providing the non-rebalanced matrix information! "
135 "It specifically must not be of type Rebalance(Blocked)AcFactory or similar. "
136 "Please check the input. Make also sure that \"number of partitions\" is provided to "
137 "the Interface class and the RepartitionFactory instance. Instead, we have a "
138 << Afact->description() << std::endl;
139 }
140 // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
141 A = Get<RCP<Matrix> >(currentLevel, "A");
142 map = A->getRowMap();
143 } else
144 map = Get<RCP<const Map> >(currentLevel, "Map");
145
146 // ======================================================================================================
147 // Determine whether partitioning is needed
148 // ======================================================================================================
149 // NOTE: most tests include some global communication, which is why we currently only do tests until we make
150 // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
151 // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.
152
153 // Test0: Should we do node repartitioning?
154 if (currentLevel.GetLevelID() == nodeRepartLevel && map->getComm()->getSize() > 1) {
155 RCP<const Teuchos::Comm<int> > NodeComm = Get<RCP<const Teuchos::Comm<int> > >(currentLevel, "Node Comm");
156 TEUCHOS_TEST_FOR_EXCEPTION(NodeComm.is_null(), Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): NodeComm is null.");
157
158 // If we only have one node, then we don't want to pop down to one rank
159 if (NodeComm()->getSize() != map->getComm()->getSize()) {
160 GetOStream(Statistics1) << "Repartitioning? YES: \n Within node only" << std::endl;
161 int nodeRank = NodeComm->getRank();
162
163 // Do a reduction to get the total number of nodes
164 int isZero = (nodeRank == 0);
165 int numNodes = 0;
166 Teuchos::reduceAll(*map->getComm(), Teuchos::REDUCE_SUM, isZero, Teuchos::outArg(numNodes));
167 Set(currentLevel, "number of partitions", numNodes);
168 return;
169 }
170 }
171
172 // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
173 if (currentLevel.GetLevelID() < startLevel) {
174 GetOStream(Statistics1) << "Repartitioning? NO:"
175 << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;
176
177 // a negative number of processors means: no repartitioning
178 Set(currentLevel, "number of partitions", -1);
179
180 return;
181 }
182
183 RCP<const Teuchos::Comm<int> > origComm = map->getComm();
184 RCP<const Teuchos::Comm<int> > comm = origComm;
185
186 // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
187 // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
188 // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
189
190 // TODO: The block transfer factories do not check correctly whether or not repartitioning actually took place.
191 {
192 if (comm->getSize() == 1 && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) != Teuchos::null) {
193 GetOStream(Statistics1) << "Repartitioning? NO:"
194 << "\n comm size = 1" << std::endl;
195
196 Set(currentLevel, "number of partitions", -1);
197 return;
198 }
199
200 int numActiveProcesses = 0;
201 MueLu_sumAll(comm, Teuchos::as<int>((map->getLocalNumElements() > 0) ? 1 : 0), numActiveProcesses);
202
203 if (numActiveProcesses == 1) {
204 GetOStream(Statistics1) << "Repartitioning? NO:"
205 << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;
206
207 Set(currentLevel, "number of partitions", 1);
208 return;
209 }
210 }
211
212 bool test3 = false, test4 = false;
213 std::string msg3, msg4;
214
215 // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
216 // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3)
217 if (minRowsPerProcess > 0) {
218 LO numMyRows = Teuchos::as<LO>(map->getLocalNumElements()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
219 LO haveFewRows = (numMyRows < minRowsPerProcess ? 1 : 0), numWithFewRows = 0;
220 MueLu_sumAll(comm, haveFewRows, numWithFewRows);
221 MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);
222
223 // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
224 // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
225 // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
226 if (numWithFewRows > 0)
227 test3 = true;
228
229 msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcess);
230 }
231
232 // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
233 if (!test3) {
234 if (useMap)
235 msg4 = "";
236 else {
237 GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getLocalNumEntries());
238 MueLu_maxAll(comm, numMyNnz, maxNnz);
239 MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
240 double imbalance = Teuchos::as<double>(maxNnz) / minNnz;
241
242 if (imbalance > nonzeroImbalance)
243 test4 = true;
244
245 msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
246 }
247 }
248
249 if (!test3 && !test4) {
250 GetOStream(Statistics1) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;
251
252 // A negative number of partitions means: no repartitioning
253 Set(currentLevel, "number of partitions", -1);
254 return;
255 }
256
257 GetOStream(Statistics1) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;
258
259 // ======================================================================================================
260 // Calculate number of partitions
261 // ======================================================================================================
262 // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
263 // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
264
265 // The number of partitions is calculated by the RepartitionFactory and stored in "number of partitions" variable on
266 // the current level. If this variable is already set (e.g., by another instance of RepartitionFactory) then this number
267 // is used. The "number of partitions" variable serves as basic communication between the RepartitionFactory (which
268 // requests a certain number of partitions) and the *Interface classes which call the underlying partitioning algorithms
269 // and produce the "Partition" array with the requested number of partitions.
270 const auto globalNumRows = Teuchos::as<GO>(map->getGlobalNumElements());
271 int numPartitions = 1;
272 if (globalNumRows >= targetRowsPerProcess) {
273 // Make sure that each CPU thread has approximately targetRowsPerProcess
274 numPartitions = std::max(Teuchos::as<int>(globalNumRows / targetRowsPerProcess), 1);
275 }
276 numPartitions = std::min(numPartitions, comm->getSize());
277
278 Set(currentLevel, "number of partitions", numPartitions);
279
280 GetOStream(Statistics1) << "Number of partitions to use = " << numPartitions << std::endl;
281} // Build
282} // namespace MueLu
283
284#endif // ifdef HAVE_MPI
285#endif /* PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_ */
#define SET_VALID_ENTRY(name)
#define MueLu_maxAll(rcpComm, in, out)
#define MueLu_sumAll(rcpComm, in, out)
#define MueLu_minAll(rcpComm, in, out)
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
void Input(Level &level, const std::string &varName) const
T Get(Level &level, const std::string &varName) const
void Set(Level &level, const std::string &varName, const T &data) const
const RCP< const FactoryBase > GetFactory(const std::string &varName) const
Default implementation of FactoryAcceptor::GetFactory().
Class that holds all level-specific information.
int GetLevelID() const
Return level number.
virtual const Teuchos::ParameterList & GetParameterList() const
void DeclareInput(Level &currentLevel) const
Determines the data that RepartitionHeuristicFactory needs, and the factories that generate that data...
virtual ~RepartitionHeuristicFactory()
Destructor.
void Build(Level &currentLevel) const
Build an object with this factory.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Teuchos::FancyOStream & GetOStream(MsgType type, int thisProcRankOnly=0) const
Get an output stream for outputting the input message type.
Namespace for MueLu classes and methods.
@ Warnings
Print all warning messages.
@ Statistics1
Print more statistics.