MueLu Version of the Day
Loading...
Searching...
No Matches
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// MueLu: A package for multigrid based preconditioning
4//
5// Copyright 2012 NTESS and the MueLu contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
11#define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
12
13#include "Kokkos_UnorderedMap.hpp"
14#include "Xpetra_CrsGraphFactory.hpp"
15
17
18#include "MueLu_Aggregates.hpp"
19#include "MueLu_AmalgamationInfo.hpp"
20
21#include "MueLu_MasterList.hpp"
22#include "MueLu_PerfUtils.hpp"
23#include "MueLu_Monitor.hpp"
24
25#include "Xpetra_IO.hpp"
26
27namespace MueLu {
28
29namespace { // anonymous
30
31template <class LocalOrdinal, class View>
32class ReduceMaxFunctor {
33 public:
34 ReduceMaxFunctor(View view)
35 : view_(view) {}
36
37 KOKKOS_INLINE_FUNCTION
38 void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
39 if (vmax < view_(i))
40 vmax = view_(i);
41 }
42
43 KOKKOS_INLINE_FUNCTION
44 void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
45 if (dst < src) {
46 dst = src;
47 }
48 }
49
50 KOKKOS_INLINE_FUNCTION
51 void init(LocalOrdinal& dst) const {
52 dst = 0;
53 }
54
55 private:
56 View view_;
57};
58
59// local QR decomposition
60template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
61class LocalQRDecompFunctor {
62 private:
63 typedef LOType LO;
64 typedef GOType GO;
65 typedef SCType SC;
66
67 typedef typename DeviceType::execution_space execution_space;
68 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
69 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
70 typedef typename impl_ATS::magnitudeType Magnitude;
71
72 typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
73 typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
74
75 private:
76 NspType fineNS;
77 NspType coarseNS;
78 aggRowsType aggRows;
79 maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
80 agg2RowMapLOType agg2RowMapLO;
81 statusType statusAtomic;
82 rowsType rows;
83 rowsAuxType rowsAux;
84 colsAuxType colsAux;
85 valsAuxType valsAux;
86 bool doQRStep;
87
88 public:
89 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
90 : fineNS(fineNS_)
91 , coarseNS(coarseNS_)
92 , aggRows(aggRows_)
93 , maxAggDofSize(maxAggDofSize_)
94 , agg2RowMapLO(agg2RowMapLO_)
95 , statusAtomic(statusAtomic_)
96 , rows(rows_)
97 , rowsAux(rowsAux_)
98 , colsAux(colsAux_)
99 , valsAux(valsAux_)
100 , doQRStep(doQRStep_) {}
101
102 KOKKOS_INLINE_FUNCTION
103 void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
104 auto agg = thread.league_rank();
105
106 // size of aggregate: number of DOFs in aggregate
107 auto aggSize = aggRows(agg + 1) - aggRows(agg);
108
109 const impl_SC one = impl_ATS::one();
110 const impl_SC two = one + one;
111 const impl_SC zero = impl_ATS::zero();
112 const auto zeroM = impl_ATS::magnitude(zero);
113
114 int m = aggSize;
115 int n = fineNS.extent(1);
116
117 // calculate row offset for coarse nullspace
118 Xpetra::global_size_t offset = agg * n;
119
120 if (doQRStep) {
121 // Extract the piece of the nullspace corresponding to the aggregate
122 shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
123 for (int j = 0; j < n; j++)
124 for (int k = 0; k < m; k++)
125 r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
126#if 0
127 printf("A\n");
128 for (int i = 0; i < m; i++) {
129 for (int j = 0; j < n; j++)
130 printf(" %5.3lf ", r(i,j));
131 printf("\n");
132 }
133#endif
134
135 // Calculate QR decomposition (standard)
136 shared_matrix q(thread.team_shmem(), m, m); // Q
137 if (m >= n) {
138 bool isSingular = false;
139
140 // Initialize Q^T
141 auto qt = q;
142 for (int i = 0; i < m; i++) {
143 for (int j = 0; j < m; j++)
144 qt(i, j) = zero;
145 qt(i, i) = one;
146 }
147
148 for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
149 // FIXME_KOKKOS: use team
150 Magnitude s = zeroM, norm, norm_x;
151 for (int i = k + 1; i < m; i++)
152 s += pow(impl_ATS::magnitude(r(i, k)), 2);
153 norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
154
155 if (norm == zero) {
156 isSingular = true;
157 break;
158 }
159
160 r(k, k) -= norm * one;
161
162 norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
163 if (norm_x == zeroM) {
164 // We have a single diagonal element in the column.
165 // No reflections required. Just need to restor r(k,k).
166 r(k, k) = norm * one;
167 continue;
168 }
169
170 // FIXME_KOKKOS: use team
171 for (int i = k; i < m; i++)
172 r(i, k) /= norm_x;
173
174 // Update R(k:m,k+1:n)
175 for (int j = k + 1; j < n; j++) {
176 // FIXME_KOKKOS: use team in the loops
177 impl_SC si = zero;
178 for (int i = k; i < m; i++)
179 si += r(i, k) * r(i, j);
180 for (int i = k; i < m; i++)
181 r(i, j) -= two * si * r(i, k);
182 }
183
184 // Update Q^T (k:m,k:m)
185 for (int j = k; j < m; j++) {
186 // FIXME_KOKKOS: use team in the loops
187 impl_SC si = zero;
188 for (int i = k; i < m; i++)
189 si += r(i, k) * qt(i, j);
190 for (int i = k; i < m; i++)
191 qt(i, j) -= two * si * r(i, k);
192 }
193
194 // Fix R(k:m,k)
195 r(k, k) = norm * one;
196 for (int i = k + 1; i < m; i++)
197 r(i, k) = zero;
198 }
199
200#if 0
201 // Q = (Q^T)^T
202 for (int i = 0; i < m; i++)
203 for (int j = 0; j < i; j++) {
204 impl_SC tmp = qt(i,j);
205 qt(i,j) = qt(j,i);
206 qt(j,i) = tmp;
207 }
208#endif
209
210 // Build coarse nullspace using the upper triangular part of R
211 for (int j = 0; j < n; j++)
212 for (int k = 0; k <= j; k++)
213 coarseNS(offset + k, j) = r(k, j);
214
215 if (isSingular) {
216 statusAtomic(1) = true;
217 return;
218 }
219
220 } else {
221 // Special handling for m < n (i.e. single node aggregates in structural mechanics)
222
223 // The local QR decomposition is not possible in the "overconstrained"
224 // case (i.e. number of columns in qr > number of rowsAux), which
225 // corresponds to #DOFs in Aggregate < n. For usual problems this
226 // is only possible for single node aggregates in structural mechanics.
227 // (Similar problems may arise in discontinuous Galerkin problems...)
228 // We bypass the QR decomposition and use an identity block in the
229 // tentative prolongator for the single node aggregate and transfer the
230 // corresponding fine level null space information 1-to-1 to the coarse
231 // level null space part.
232
233 // NOTE: The resulting tentative prolongation operator has
234 // (m*DofsPerNode-n) zero columns leading to a singular
235 // coarse level operator A. To deal with that one has the following
236 // options:
237 // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
238 // false) to add some identity block to the diagonal of the zero rowsAux
239 // in the coarse level operator A, such that standard level smoothers
240 // can be used again.
241 // - Use special (projection-based) level smoothers, which can deal
242 // with singular matrices (very application specific)
243 // - Adapt the code below to avoid zero columns. However, we do not
244 // support a variable number of DOFs per node in MueLu/Xpetra which
245 // makes the implementation really hard.
246 //
247 // FIXME: do we need to check for singularity here somehow? Zero
248 // columns would be easy but linear dependency would require proper QR.
249
250 // R = extended (by adding identity rowsAux) qr
251 for (int j = 0; j < n; j++)
252 for (int k = 0; k < n; k++)
253 if (k < m)
254 coarseNS(offset + k, j) = r(k, j);
255 else
256 coarseNS(offset + k, j) = (k == j ? one : zero);
257
258 // Q = I (rectangular)
259 for (int i = 0; i < m; i++)
260 for (int j = 0; j < n; j++)
261 q(i, j) = (j == i ? one : zero);
262 }
263
264 // Process each row in the local Q factor and fill helper arrays to assemble P
265 for (int j = 0; j < m; j++) {
266 LO localRow = agg2RowMapLO(aggRows(agg) + j);
267 size_t rowStart = rowsAux(localRow);
268 size_t lnnz = 0;
269 for (int k = 0; k < n; k++) {
270 // skip zeros
271 if (q(j, k) != zero) {
272 colsAux(rowStart + lnnz) = offset + k;
273 valsAux(rowStart + lnnz) = q(j, k);
274 lnnz++;
275 }
276 }
277 rows(localRow + 1) = lnnz;
278 nnz += lnnz;
279 }
280
281#if 0
282 printf("R\n");
283 for (int i = 0; i < m; i++) {
284 for (int j = 0; j < n; j++)
285 printf(" %5.3lf ", coarseNS(i,j));
286 printf("\n");
287 }
288
289 printf("Q\n");
290 for (int i = 0; i < aggSize; i++) {
291 for (int j = 0; j < aggSize; j++)
292 printf(" %5.3lf ", q(i,j));
293 printf("\n");
294 }
295#endif
296 } else {
298 // "no-QR" option //
300 // Local Q factor is just the fine nullspace support over the current aggregate.
301 // Local R factor is the identity.
302 // TODO I have not implemented any special handling for aggregates that are too
303 // TODO small to locally support the nullspace, as is done in the standard QR
304 // TODO case above.
305
306 for (int j = 0; j < m; j++) {
307 LO localRow = agg2RowMapLO(aggRows(agg) + j);
308 size_t rowStart = rowsAux(localRow);
309 size_t lnnz = 0;
310 for (int k = 0; k < n; k++) {
311 const impl_SC qr_jk = fineNS(localRow, k);
312 // skip zeros
313 if (qr_jk != zero) {
314 colsAux(rowStart + lnnz) = offset + k;
315 valsAux(rowStart + lnnz) = qr_jk;
316 lnnz++;
317 }
318 }
319 rows(localRow + 1) = lnnz;
320 nnz += lnnz;
321 }
322
323 for (int j = 0; j < n; j++)
324 coarseNS(offset + j, j) = one;
325 }
326 }
327
328 // amount of shared memory
329 size_t team_shmem_size(int /* team_size */) const {
330 if (doQRStep) {
331 int m = maxAggDofSize;
332 int n = fineNS.extent(1);
333 return shared_matrix::shmem_size(m, n) + // r
334 shared_matrix::shmem_size(m, m); // q
335 } else
336 return 0;
337 }
338};
339
340} // namespace
341
342template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
344 RCP<ParameterList> validParamList = rcp(new ParameterList());
345
346#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
347 SET_VALID_ENTRY("tentative: calculate qr");
348 SET_VALID_ENTRY("tentative: build coarse coordinates");
349#undef SET_VALID_ENTRY
350
351 validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
352 validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
353 validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
354 validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
355 validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
356 validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
357 validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
358 validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
359
360 // Make sure we don't recursively validate options for the matrixmatrix kernels
361 ParameterList norecurse;
362 norecurse.disableRecursiveValidation();
363 validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
364
365 return validParamList;
366}
367
368template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
370 const ParameterList& pL = GetParameterList();
371 // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
372 std::string nspName = "Nullspace";
373 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
374
375 Input(fineLevel, "A");
376 Input(fineLevel, "Aggregates");
377 Input(fineLevel, nspName);
378 Input(fineLevel, "UnAmalgamationInfo");
379 Input(fineLevel, "CoarseMap");
380 if (fineLevel.GetLevelID() == 0 &&
381 fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
382 pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
383 bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
384 Input(fineLevel, "Coordinates");
385 } else if (bTransferCoordinates_) {
386 Input(fineLevel, "Coordinates");
387 }
388}
389
390template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
392 return BuildP(fineLevel, coarseLevel);
393}
394
395template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
397 FactoryMonitor m(*this, "Build", coarseLevel);
398
399 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
400 typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
401 const ParameterList& pL = GetParameterList();
402 std::string nspName = "Nullspace";
403 if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
404
405 auto A = Get<RCP<Matrix>>(fineLevel, "A");
406 auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
407 auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
408 auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
409 auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
410 RCP<RealValuedMultiVector> fineCoords;
412 fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
413 }
414
415 RCP<Matrix> Ptentative;
416 // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
417 // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
418 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
419 Ptentative = Teuchos::null;
420 Set(coarseLevel, "P", Ptentative);
421 return;
422 }
423 RCP<MultiVector> coarseNullspace;
424 RCP<RealValuedMultiVector> coarseCoords;
425
427 RCP<const Map> coarseCoordMap;
428 using array_type = typename Map::global_indices_array_device_type;
429
430 LO blkSize = 1;
431 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
432 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
433
434 if (blkSize == 1) {
435 // Scalar system
436 // No amalgamation required, we can use the coarseMap
437 coarseCoordMap = coarseMap;
438 } else {
439 // Vector system
440 // NOTE: There could be further optimizations here where we detect contiguous maps and then
441 // create a contiguous amalgamated maps, which bypasses the expense of the getMyGlobalIndicesDevice()
442 // call (which is free for non-contiguous maps, but costs something if the map is contiguous).
443 using range_policy = Kokkos::RangePolicy<typename Node::execution_space>;
444 array_type elementAList = coarseMap->getMyGlobalIndicesDevice();
445 GO indexBase = coarseMap->getIndexBase();
446 auto numElements = elementAList.size() / blkSize;
447 typename array_type::non_const_type elementList_nc("elementList", numElements);
448
449 // Amalgamate the map
450 Kokkos::parallel_for(
451 "Amalgamate Element List", range_policy(0, numElements), KOKKOS_LAMBDA(LO i) {
452 elementList_nc[i] = (elementAList[i * blkSize] - indexBase) / blkSize + indexBase;
453 });
454 array_type elementList = elementList_nc;
455 coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
456 elementList, indexBase, coarseMap->getComm());
457 }
458
459 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
460
461 // Create overlapped fine coordinates to reduce global communication
462 auto uniqueMap = fineCoords->getMap();
463 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
464 if (aggregates->AggregatesCrossProcessors()) {
465 auto nonUniqueMap = aggregates->GetMap();
466 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
467
468 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
469 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
470 }
471
472 // The good new is that his graph has already been constructed for the
473 // TentativePFactory and was cached in Aggregates. So this is a no-op.
474 auto aggGraph = aggregates->GetGraph();
475 auto numAggs = aggGraph.numRows();
476
477 auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
478 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
479
480 // Fill in coarse coordinates
481 {
482 SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
483
484 const auto dim = fineCoords->getNumVectors();
485
486 typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
487 for (size_t j = 0; j < dim; j++) {
488 Kokkos::parallel_for(
489 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
490 KOKKOS_LAMBDA(const LO i) {
491 // A row in this graph represents all node ids in the aggregate
492 // Therefore, averaging is very easy
493
494 auto aggregate = aggGraph.rowConst(i);
495
496 coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
497 for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
498 sum += fineCoordsRandomView(aggregate(colID), j);
499
500 coarseCoordsView(i, j) = sum / aggregate.length;
501 });
502 }
503 }
504 }
505
506 if (!aggregates->AggregatesCrossProcessors()) {
507 if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
508 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
509 coarseLevel.GetLevelID());
510 } else {
511 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
512 }
513 } else
514 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
515
516 // If available, use striding information of fine level matrix A for range
517 // map and coarseMap as domain map; otherwise use plain range map of
518 // Ptent = plain range map of A for range map and coarseMap as domain map.
519 // NOTE:
520 // The latter is not really safe, since there is no striding information
521 // for the range map. This is not really a problem, since striding
522 // information is always available on the intermedium levels and the
523 // coarsest levels.
524 if (A->IsView("stridedMaps") == true)
525 Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
526 else
527 Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
528
530 Set(coarseLevel, "Coordinates", coarseCoords);
531 }
532
533 // FIXME: We should remove the NodeComm on levels past the threshold
534 if (fineLevel.IsAvailable("Node Comm")) {
535 RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
536 Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
537 }
538
539 Set(coarseLevel, "Nullspace", coarseNullspace);
540 Set(coarseLevel, "P", Ptentative);
541
542 if (IsPrint(Statistics2)) {
543 RCP<ParameterList> params = rcp(new ParameterList());
544 params->set("printLoadBalancingInfo", true);
545 GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
546 }
547}
548
549template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
551 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
552 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
553 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
554 RCP<MultiVector>& coarseNullspace, const int levelID) const {
555 auto rowMap = A->getRowMap();
556 auto colMap = A->getColMap();
557
558 const size_t numRows = rowMap->getLocalNumElements();
559 const size_t NSDim = fineNullspace->getNumVectors();
560
561 typedef Kokkos::ArithTraits<SC> ATS;
562 using impl_SC = typename ATS::val_type;
563 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
564 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
565
566 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
567
568 typename Aggregates::local_graph_type aggGraph;
569 {
570 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
571 aggGraph = aggregates->GetGraph();
572 }
573 auto aggRows = aggGraph.row_map;
574 auto aggCols = aggGraph.entries;
575
576 // Aggregates map is based on the amalgamated column map
577 // We can skip global-to-local conversion if LIDs in row map are
578 // same as LIDs in column map
579 bool goodMap;
580 {
581 SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
582 goodMap = isGoodMap(*rowMap, *colMap);
583 }
584 // FIXME_KOKKOS: need to proofread later code for bad maps
585 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
586 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
587 "(i.e. \"matching\" row and column maps)");
588
589 // STEP 1: do unamalgamation
590 // The non-kokkos version uses member functions from the AmalgamationInfo
591 // container class to unamalgamate the data. In contrast, the kokkos
592 // version of TentativePFactory does the unamalgamation here and only uses
593 // the data of the AmalgamationInfo container class
594
595 // Extract information for unamalgamation
596 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
597 GO indexBase;
598 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
599 GO globalOffset = amalgInfo->GlobalOffset();
600
601 // Extract aggregation info (already in Kokkos host views)
602 auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
603 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
604 const size_t numAggregates = aggregates->GetNumAggregates();
605
606 int myPID = aggregates->GetMap()->getComm()->getRank();
607
608 // Create Kokkos::View (on the device) to store the aggreate dof sizes
609 // Later used to get aggregate dof offsets
610 // NOTE: This zeros itself on construction
611 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
612 AggSizeType aggDofSizes;
613
614 if (stridedBlockSize == 1) {
615 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
616
617 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
618 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
619
620 auto sizesConst = aggregates->ComputeAggregateSizes();
621 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
622
623 } else {
624 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
625
626 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
627 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
628
629 auto nodeMap = aggregates->GetMap()->getLocalMap();
630 auto dofMap = colMap->getLocalMap();
631
632 Kokkos::parallel_for(
633 "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
634 KOKKOS_LAMBDA(const LO agg) {
635 auto aggRowView = aggGraph.rowConst(agg);
636
637 size_t size = 0;
638 for (LO colID = 0; colID < aggRowView.length; colID++) {
639 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
640
641 for (LO k = 0; k < stridedBlockSize; k++) {
642 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
643
644 if (dofMap.getLocalElement(dofGID) != INVALID)
645 size++;
646 }
647 }
648 aggDofSizes(agg + 1) = size;
649 });
650 }
651
652 // Find maximum dof size for aggregates
653 // Later used to reserve enough scratch space for local QR decompositions
654 LO maxAggSize = 0;
655 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
656 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
657
658 // parallel_scan (exclusive)
659 // The aggDofSizes View then contains the aggregate dof offsets
660 Kokkos::parallel_scan(
661 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
662 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
663 update += aggDofSizes(i);
664 if (final_pass)
665 aggDofSizes(i) = update;
666 });
667
668 // Create Kokkos::View on the device to store mapping
669 // between (local) aggregate id and row map ids (LIDs)
670 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
671 {
672 SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
673
674 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
675 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
676
677 Kokkos::parallel_for(
678 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
679 KOKKOS_LAMBDA(const LO lnode) {
680 if (procWinner(lnode, 0) == myPID) {
681 // No need for atomics, it's one-to-one
682 auto aggID = vertex2AggId(lnode, 0);
683
684 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
685 // FIXME: I think this may be wrong
686 // We unconditionally add the whole block here. When we calculated
687 // aggDofSizes, we did the isLocalElement check. Something's fishy.
688 for (LO k = 0; k < stridedBlockSize; k++)
689 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
690 }
691 });
692 }
693
694 // STEP 2: prepare local QR decomposition
695 // Reserve memory for tentative prolongation operator
696 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
697
698 // Pull out the nullspace vectors so that we can have random access (on the device)
699 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
700 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
701
702 size_t nnz = 0; // actual number of nnz
703
704 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
705 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
706 typedef typename local_matrix_type::index_type::non_const_type cols_type;
707 typedef typename local_matrix_type::values_type::non_const_type vals_type;
708
709 // Device View for status (error messages...)
710 typedef Kokkos::View<int[10], DeviceType> status_type;
711 status_type status("status");
712
713 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
714 typename AppendTrait<status_type, Kokkos::Atomic>::type statusAtomic = status;
715
716 const ParameterList& pL = GetParameterList();
717 const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
718 if (!doQRStep) {
719 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
720 if (NSDim > 1)
721 GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
722 }
723
724 size_t nnzEstimate = numRows * NSDim;
725 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
726 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
727 vals_type valsAux("Ptent_aux_vals", nnzEstimate);
728 rows_type rows("Ptent_rows", numRows + 1);
729 {
730 // Stage 0: fill in views.
731 SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
732
733 // The main thing to notice is initialization of vals with INVALID. These
734 // values will later be used to compress the arrays
735 Kokkos::parallel_for(
736 "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
737 KOKKOS_LAMBDA(const LO row) {
738 rowsAux(row) = row * NSDim;
739 });
740 Kokkos::parallel_for(
741 "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
742 KOKKOS_LAMBDA(const LO j) {
743 colsAux(j) = INVALID;
744 });
745 }
746
747 if (NSDim == 1) {
748 // 1D is special, as it is the easiest. We don't even need to the QR,
749 // just normalize an array. Plus, no worries abot small aggregates. In
750 // addition, we do not worry about compression. It is unlikely that
751 // nullspace will have zeros. If it does, a prolongator row would be
752 // zero and we'll get singularity anyway.
753 SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
754
755 // Set up team policy with numAggregates teams and one thread per team.
756 // Each team handles a slice of the data associated with one aggregate
757 // and performs a local QR decomposition (in this case real QR is
758 // unnecessary).
759 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
760
761 if (doQRStep) {
762 Kokkos::parallel_for(
763 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
764 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
765 auto agg = thread.league_rank();
766
767 // size of the aggregate (number of DOFs in aggregate)
768 LO aggSize = aggRows(agg + 1) - aggRows(agg);
769
770 // Extract the piece of the nullspace corresponding to the aggregate, and
771 // put it in the flat array, "localQR" (in column major format) for the
772 // QR routine. Trivial in 1D.
773 auto norm = impl_ATS::magnitude(zero);
774
775 // Calculate QR by hand
776 // FIXME: shouldn't there be stridedblock here?
777 // FIXME_KOKKOS: shouldn't there be stridedblock here?
778 for (decltype(aggSize) k = 0; k < aggSize; k++) {
779 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
780 norm += dnorm * dnorm;
781 }
782 norm = sqrt(norm);
783
784 if (norm == zero) {
785 // zero column; terminate the execution
786 statusAtomic(1) = true;
787 return;
788 }
789
790 // R = norm
791 coarseNS(agg, 0) = norm;
792
793 // Q = localQR(:,0)/norm
794 for (decltype(aggSize) k = 0; k < aggSize; k++) {
795 LO localRow = agg2RowMapLO(aggRows(agg) + k);
796 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
797
798 rows(localRow + 1) = 1;
799 colsAux(localRow) = agg;
800 valsAux(localRow) = localVal;
801 }
802 });
803
804 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
805 Kokkos::deep_copy(statusHost, status);
806 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
807 if (statusHost(i)) {
808 std::ostringstream oss;
809 oss << "MueLu::TentativePFactory::MakeTentative: ";
810 switch (i) {
811 case 0: oss << "!goodMap is not implemented"; break;
812 case 1: oss << "fine level NS part has a zero column"; break;
813 }
814 throw Exceptions::RuntimeError(oss.str());
815 }
816
817 } else {
818 Kokkos::parallel_for(
819 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
820 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
821 auto agg = thread.league_rank();
822
823 // size of the aggregate (number of DOFs in aggregate)
824 LO aggSize = aggRows(agg + 1) - aggRows(agg);
825
826 // R = norm
827 coarseNS(agg, 0) = one;
828
829 // Q = localQR(:,0)/norm
830 for (decltype(aggSize) k = 0; k < aggSize; k++) {
831 LO localRow = agg2RowMapLO(aggRows(agg) + k);
832 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
833
834 rows(localRow + 1) = 1;
835 colsAux(localRow) = agg;
836 valsAux(localRow) = localVal;
837 }
838 });
839 }
840
841 Kokkos::parallel_reduce(
842 "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
843 KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
844 nnz_count += rows(i);
845 },
846 nnz);
847
848 } else { // NSdim > 1
849 // FIXME_KOKKOS: This code branch is completely unoptimized.
850 // Work to do:
851 // - Optimize QR decomposition
852 // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
853 // packing new values in the beginning of each row
854 // We do use auxilary view in this case, so keep a second rows view for
855 // counting nonzeros in rows
856
857 {
858 SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
859 // Set up team policy with numAggregates teams and one thread per team.
860 // Each team handles a slice of the data associated with one aggregate
861 // and performs a local QR decomposition
862 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
863 LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
864 decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
865 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
866 decltype(valsAux)>
867 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
868 rows, rowsAux, colsAux, valsAux, doQRStep);
869 Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
870 }
871
872 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
873 Kokkos::deep_copy(statusHost, status);
874 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
875 if (statusHost(i)) {
876 std::ostringstream oss;
877 oss << "MueLu::TentativePFactory::MakeTentative: ";
878 switch (i) {
879 case 0: oss << "!goodMap is not implemented"; break;
880 case 1: oss << "fine level NS part has a zero column"; break;
881 }
882 throw Exceptions::RuntimeError(oss.str());
883 }
884 }
885
886 // Compress the cols and vals by ignoring INVALID column entries that correspond
887 // to 0 in QR.
888
889 // The real cols and vals are constructed using calculated (not estimated) nnz
890 cols_type cols;
891 vals_type vals;
892
893 if (nnz != nnzEstimate) {
894 {
895 // Stage 2: compress the arrays
896 SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
897
898 Kokkos::parallel_scan(
899 "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
900 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
901 upd += rows(i);
902 if (final)
903 rows(i) = upd;
904 });
905 }
906
907 {
908 SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
909
910 cols = cols_type("Ptent_cols", nnz);
911 vals = vals_type("Ptent_vals", nnz);
912
913 // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
914 // to the beginning of rows. See CoalesceDropFactory_kokkos for
915 // example.
916 Kokkos::parallel_for(
917 "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
918 KOKKOS_LAMBDA(const LO i) {
919 LO rowStart = rows(i);
920
921 size_t lnnz = 0;
922 for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
923 if (colsAux(j) != INVALID) {
924 cols(rowStart + lnnz) = colsAux(j);
925 vals(rowStart + lnnz) = valsAux(j);
926 lnnz++;
927 }
928 });
929 }
930
931 } else {
932 rows = rowsAux;
933 cols = colsAux;
934 vals = valsAux;
935 }
936
937 GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
938
939 {
940 // Stage 3: construct Xpetra::Matrix
941 SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
942
943 local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
944
945 // Managing labels & constants for ESFC
946 RCP<ParameterList> FCparams;
947 if (pL.isSublist("matrixmatrix: kernel params"))
948 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
949 else
950 FCparams = rcp(new ParameterList);
951
952 // By default, we don't need global constants for TentativeP
953 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
954 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
955
956 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
957 Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
958 }
959}
960
961template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
963 BuildPuncoupledBlockCrs(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
964 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
965 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
966 RCP<MultiVector>& coarseNullspace, const int levelID) const {
967 /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
968 be generalized later, if we ever need to do so:
969 1) Null space dimension === block size of matrix: So no elasticity right now
970 2) QR is not supported: Under assumption #1, this shouldn't cause problems.
971 3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
972
973 These assumptions keep our code way simpler and still support the use cases we actually care about.
974 */
975
976 RCP<const Map> rowMap = A->getRowMap();
977 RCP<const Map> rangeMap = A->getRangeMap();
978 RCP<const Map> colMap = A->getColMap();
979 // const size_t numFinePointRows = rangeMap->getLocalNumElements();
980 const size_t numFineBlockRows = rowMap->getLocalNumElements();
981
982 // typedef Teuchos::ScalarTraits<SC> STS;
983 // typedef typename STS::magnitudeType Magnitude;
984 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
985
986 typedef Kokkos::ArithTraits<SC> ATS;
987 using impl_SC = typename ATS::val_type;
988 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
989 const impl_SC one = impl_ATS::one();
990
991 // const GO numAggs = aggregates->GetNumAggregates();
992 const size_t NSDim = fineNullspace->getNumVectors();
993 auto aggSizes = aggregates->ComputeAggregateSizes();
994
995 typename Aggregates::local_graph_type aggGraph;
996 {
997 SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
998 aggGraph = aggregates->GetGraph();
999 }
1000 auto aggRows = aggGraph.row_map;
1001 auto aggCols = aggGraph.entries;
1002
1003 // Need to generate the coarse block map
1004 // NOTE: We assume NSDim == block size here
1005 // NOTE: We also assume that coarseMap has contiguous GIDs
1006 // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1007 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1008 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1009 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1010 numCoarseBlockRows,
1011 coarsePointMap->getIndexBase(),
1012 coarsePointMap->getComm());
1013 // Sanity checking
1014 const ParameterList& pL = GetParameterList();
1015 // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1016
1017 // The aggregates use the amalgamated column map, which in this case is what we want
1018
1019 // Aggregates map is based on the amalgamated column map
1020 // We can skip global-to-local conversion if LIDs in row map are
1021 // same as LIDs in column map
1022 bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1023 TEUCHOS_TEST_FOR_EXCEPTION(!goodMap, Exceptions::RuntimeError,
1024 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1025 "(i.e. \"matching\" row and column maps)");
1026
1027 // STEP 1: do unamalgamation
1028 // The non-kokkos version uses member functions from the AmalgamationInfo
1029 // container class to unamalgamate the data. In contrast, the kokkos
1030 // version of TentativePFactory does the unamalgamation here and only uses
1031 // the data of the AmalgamationInfo container class
1032
1033 // Extract information for unamalgamation
1034 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1035 GO indexBase;
1036 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1037 // GO globalOffset = amalgInfo->GlobalOffset();
1038
1039 // Extract aggregation info (already in Kokkos host views)
1040 auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1041 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1042 const size_t numAggregates = aggregates->GetNumAggregates();
1043
1044 int myPID = aggregates->GetMap()->getComm()->getRank();
1045
1046 // Create Kokkos::View (on the device) to store the aggreate dof sizes
1047 // Later used to get aggregate dof offsets
1048 // NOTE: This zeros itself on construction
1049 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1050 AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1051
1052 {
1053 SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1054
1055 // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1056 aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1057
1058 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1059 }
1060
1061 // Find maximum dof size for aggregates
1062 // Later used to reserve enough scratch space for local QR decompositions
1063 LO maxAggSize = 0;
1064 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1065 Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1066
1067 // parallel_scan (exclusive)
1068 // The aggDofSizes View then contains the aggregate dof offsets
1069 Kokkos::parallel_scan(
1070 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1071 KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1072 update += aggDofSizes(i);
1073 if (final_pass)
1074 aggDofSizes(i) = update;
1075 });
1076
1077 // Create Kokkos::View on the device to store mapping
1078 // between (local) aggregate id and row map ids (LIDs)
1079 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1080 {
1081 SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1082
1083 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1084 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1085
1086 Kokkos::parallel_for(
1087 "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1088 KOKKOS_LAMBDA(const LO lnode) {
1089 if (procWinner(lnode, 0) == myPID) {
1090 // No need for atomics, it's one-to-one
1091 auto aggID = vertex2AggId(lnode, 0);
1092
1093 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1094 // FIXME: I think this may be wrong
1095 // We unconditionally add the whole block here. When we calculated
1096 // aggDofSizes, we did the isLocalElement check. Something's fishy.
1097 for (LO k = 0; k < stridedBlockSize; k++)
1098 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1099 }
1100 });
1101 }
1102
1103 // STEP 2: prepare local QR decomposition
1104 // Reserve memory for tentative prolongation operator
1105 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1106
1107 // Pull out the nullspace vectors so that we can have random access (on the device)
1108 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1109 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1110
1111 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1112 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1113 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1114 // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1115
1116 // Device View for status (error messages...)
1117 typedef Kokkos::View<int[10], DeviceType> status_type;
1118 status_type status("status");
1119
1120 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1121 typename AppendTrait<status_type, Kokkos::Atomic>::type statusAtomic = status;
1122
1123 // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1124 GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1125
1126 // BlockCrs requires that we build the (block) graph first, so let's do that...
1127
1128 // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1129 // block non-zero per row in the matrix;
1130 rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1131 cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1132
1133 Kokkos::parallel_for(
1134 "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1135 KOKKOS_LAMBDA(const LO j) {
1136 ia[j] = j;
1137 ja[j] = INVALID;
1138
1139 if (j == (LO)numFineBlockRows - 1)
1140 ia[numFineBlockRows] = numFineBlockRows;
1141 });
1142
1143 // Fill Graph
1144 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1145 Kokkos::parallel_for(
1146 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1147 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1148 auto agg = thread.league_rank();
1149 Xpetra::global_size_t offset = agg;
1150
1151 // size of the aggregate (number of DOFs in aggregate)
1152 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1153
1154 for (LO j = 0; j < aggSize; j++) {
1155 // FIXME: Allow for bad maps
1156 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1157 const size_t rowStart = ia[localRow];
1158 ja[rowStart] = offset;
1159 }
1160 });
1161
1162 // Compress storage (remove all INVALID, which happen when we skip zeros)
1163 // We do that in-place
1164 {
1165 // Stage 2: compress the arrays
1166 SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1167 // Fill i_temp with the correct row starts
1168 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1169 LO nnz = 0;
1170 Kokkos::parallel_scan(
1171 "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1172 KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1173 if (final)
1174 i_temp[i] = upd;
1175 for (auto j = ia[i]; j < ia[i + 1]; j++)
1176 if (ja[j] != INVALID)
1177 upd++;
1178 if (final && i == (LO)numFineBlockRows - 1)
1179 i_temp[numFineBlockRows] = upd;
1180 },
1181 nnz);
1182
1183 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1184
1185 Kokkos::parallel_for(
1186 "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1187 KOKKOS_LAMBDA(const LO i) {
1188 size_t rowStart = i_temp[i];
1189 size_t lnnz = 0;
1190 for (auto j = ia[i]; j < ia[i + 1]; j++)
1191 if (ja[j] != INVALID) {
1192 j_temp[rowStart + lnnz] = ja[j];
1193 lnnz++;
1194 }
1195 });
1196
1197 ia = i_temp;
1198 ja = j_temp;
1199 }
1200
1201 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1202
1203 // Managing labels & constants for ESFC
1204 {
1205 RCP<ParameterList> FCparams;
1206 if (pL.isSublist("matrixmatrix: kernel params"))
1207 FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1208 else
1209 FCparams = rcp(new ParameterList);
1210 // By default, we don't need global constants for TentativeP
1211 FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1212 std::string levelIDs = toString(levelID);
1213 FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1214 RCP<const Export> dummy_e;
1215 RCP<const Import> dummy_i;
1216 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1217 }
1218
1219 // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1220 // we clear them here
1221 ia = rows_type();
1222 ja = cols_type();
1223
1224 // Now let's make a BlockCrs Matrix
1225 // NOTE: Assumes block size== NSDim
1226 RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1227 RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1228 if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1229 RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1230
1231 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1232 const LO stride = NSDim * NSDim;
1233
1234 Kokkos::parallel_for(
1235 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1236 KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1237 auto agg = thread.league_rank();
1238
1239 // size of the aggregate (number of DOFs in aggregate)
1240 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1241 Xpetra::global_size_t offset = agg * NSDim;
1242
1243 // Q = localQR(:,0)/norm
1244 for (LO j = 0; j < aggSize; j++) {
1245 LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1246 LO rowStart = localBlockRow * stride;
1247 for (LO r = 0; r < (LO)NSDim; r++) {
1248 LO localPointRow = localBlockRow * NSDim + r;
1249 for (LO c = 0; c < (LO)NSDim; c++) {
1250 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1251 }
1252 }
1253 }
1254
1255 // R = norm
1256 for (LO j = 0; j < (LO)NSDim; j++)
1257 coarseNS(offset + j, j) = one;
1258 });
1259
1260 Ptentative = P_wrap;
1261}
1262
1263template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1265 BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1266 RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1267 RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1268 RCP<MultiVector>& /* coarseNullspace */) const {
1269 throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1270}
1271
1272template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1274 isGoodMap(const Map& rowMap, const Map& colMap) const {
1275 auto rowLocalMap = rowMap.getLocalMap();
1276 auto colLocalMap = colMap.getLocalMap();
1277
1278 const size_t numRows = rowLocalMap.getLocalNumElements();
1279 const size_t numCols = colLocalMap.getLocalNumElements();
1280
1281 if (numCols < numRows)
1282 return false;
1283
1284 size_t numDiff = 0;
1285 Kokkos::parallel_reduce(
1286 "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1287 KOKKOS_LAMBDA(const LO i, size_t& diff) {
1288 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1289 },
1290 numDiff);
1291
1292 return (numDiff == 0);
1293}
1294
1295} // namespace MueLu
1296
1297#define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1298#endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
View
#define SET_VALID_ENTRY(name)
MueLu::DefaultLocalOrdinal LocalOrdinal
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
typename LWGraph_kokkos::local_graph_type local_graph_type
Exception throws to report errors in the internal logical of the program.
Timer to be used in factories. Similar to Monitor but with additional timers.
void Input(Level &level, const std::string &varName) const
T Get(Level &level, const std::string &varName) const
void Set(Level &level, const std::string &varName, const T &data) const
Class that holds all level-specific information.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.
int GetLevelID() const
Return level number.
static const NoFactory * get()
virtual const Teuchos::ParameterList & GetParameterList() const
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
bool isGoodMap(const Map &rowMap, const Map &colMap) const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
static bool MapsAreNested(const Xpetra::Map< DefaultLocalOrdinal, DefaultGlobalOrdinal, DefaultNode > &rowMap, const Xpetra::Map< DefaultLocalOrdinal, DefaultGlobalOrdinal, DefaultNode > &colMap)
Teuchos::FancyOStream & GetOStream(MsgType type, int thisProcRankOnly=0) const
Get an output stream for outputting the input message type.
bool IsPrint(MsgType type, int thisProcRankOnly=-1) const
Find out whether we need to print out information for a specific message type.
Namespace for MueLu classes and methods.
@ Warnings0
Important warning messages (one line).
@ Statistics2
Print even more statistics.
@ Runtime1
Description of what is happening (more verbose).
std::string toString(const T &what)
Little helper function to convert non-string types to strings.