Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_DistributionLowerTriangularBlock.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef __TPETRA_DISTRIBUTORLOWERTRIANGULARBLOCK_HPP
11#define __TPETRA_DISTRIBUTORLOWERTRIANGULARBLOCK_HPP
12
13// Needed by DistributionLowerTriangularBlock
14#include "Tpetra_Distributor.hpp"
15
16// Needed by LowerTriangularBlock operator
17#include "Tpetra_Core.hpp"
18#include "Tpetra_Map.hpp"
19#include "Tpetra_Operator.hpp"
20#include "Tpetra_Vector.hpp"
21#include "Tpetra_CrsMatrix.hpp"
22
23namespace Tpetra
24{
25
27template <typename gno_t, typename scalar_t>
28class DistributionLowerTriangularBlock : public Distribution<gno_t,scalar_t> {
29// Seher Acer's lower-triangular block decomposition for triangle counting
30// See also: LowerTriangularBlockOperator below that allows this distribution
31// to be used in Tpetra SpMV.
32//
33// Requirements:
34// Matrix must be square (undirected graph)
35// Number of processors np = q(q+1)/2 for some q.
36//
37// Only the lower triangular portion of the matrix is stored.
38// Processors are arranged logically as follows:
39// 0
40// 1 2
41// 3 4 5
42// ...
43//
44// The lower triangular part of the matrix is stored in a 2D distribution.
45// For example, the dense 7x7 lower triangular matrix below would be assigned
46// to processors according to numbers shown as follows:
47// 0 | |
48// 00| |
49// ---------
50// 11|2 |
51// 11|22 |
52// 11|222|
53// ---------
54// 33|444|5
55// 33|444|55
56// ...
57// (Note that we expect the matrix to be sparse. For dense matrices,
58// CrsMatrix is the wrong tool.)
59//
60// Matrix rows are assigned to processor rows greedily to roughly balance
61// (# nonzeros in processor row / # processors in processor row)
62// across processor rows.
63// The same cuts are used to divide rows and columns among processors
64// (that is, all processors have a square block).
65//
66// The lower triangular algorithm:
67// 1. distribute all matrix entries via 1D linear distribution
68// (this initial distribution is needed to avoid storing the entire
69// matrix on one processor, while providing info about the nonzeros per row
70// needed in step 2.
71// 2. (optional) sort rows in decreasing order wrt the number of nonzeros
72// per row
73// 3. find "chunk cuts": divisions in row assignments such that
74// (# nonzeros in processor row / # processors in processor row) is
75// roughly equal for all processor rows
76// 4. send nonzeros to their new processor assignment
77//
78// Known issues: (TODO)
79// - The sorting in Step 2 and computation of chunk cuts in step 3 are
80// currently done in serial and requires O(number of rows) storage each
81// processor. More effort could parallelize this computation, but parallel
82// load balancing algorithms are more appropriate in Zoltan2 than Tpetra.
83// - The sorting in Step 2 renumbers the rows (assigns new Global Ordinals to
84// the rows) to make them contiguous, as needed in Acer's triangle counting
85// algorithm.
86// (Acer's algorithm relies on local indexing from the chunk boundaries to
87// find neighbors needed for communication.)
88// The class currently provides a permutation matrix P describing the
89// reordering. Thus, the matrix stored in the lower triangular block
90// distribution is actually P A P -- the row and column permutation of
91// matrix A in the Matrix Market file.
92// The fact that a permuted matrix is stored complicates use of the matrix
93// in algorithms other than Acer's triangle counting. For SpMV with the
94// vector numbered according to the MatrixMarket numbering, for example,
95// P^T must be applied to the vector before SpMV, and P^T must be applied to
96// the result of SpMV. See LowerTriangularBlockOperator to see how this
97// permutation matrix is used.
98//
99// Before addressing these issues, we will decide (TODO)
100// - Is this Distribution general enough to be in Tpetra?
101// - Should we, instead, have a separate package for distributions (that could
102// use Zoltan2 and Tpetra without circular dependence)?
103// - Or should we allow users (such as the triangle counting algorithm) to
104// provide their own distributions (e.g., LowerTriangularBlock) that
105// inherit from Tpetra's Distribution class?
106// For now, we will push this Distribution into Tpetra, but we will revisit
107// this decision.
108
109public:
110 using Distribution<gno_t,scalar_t>::me;
111 using Distribution<gno_t,scalar_t>::np;
112 using Distribution<gno_t,scalar_t>::comm;
113 using Distribution<gno_t,scalar_t>::nrows;
114 using typename Distribution<gno_t,scalar_t>::NZindex_t;
115 using typename Distribution<gno_t,scalar_t>::LocalNZmap_t;
116
117 using map_t = Tpetra::Map<>;
118 using matrix_t = Tpetra::CrsMatrix<scalar_t>;
119
120 DistributionLowerTriangularBlock(size_t nrows_,
121 const Teuchos::RCP<const Teuchos::Comm<int> > &comm_,
122 const Teuchos::ParameterList &params) :
123 Distribution<gno_t,scalar_t>(nrows_, comm_, params),
124 initialDist(nrows_, comm_, params),
125 sortByDegree(false), permMatrix(Teuchos::null),
126 redistributed(false), chunksComputed(false), nChunks(0)
127 {
128 int npnp = 2 * np;
129 nChunks = int(std::sqrt(float(npnp)));
130 while (nChunks * (nChunks + 1) < npnp) nChunks++;
131
132 TEUCHOS_TEST_FOR_EXCEPTION(nChunks * (nChunks+1) != npnp, std::logic_error,
133 "Number of processors np = " << np <<
134 " must satisfy np = q(q+1)/2 for some q" <<
135 " for LowerTriangularBlock distribution");
136 nChunksPerRow = double(nChunks) / double(nrows);
137
138 const Teuchos::ParameterEntry *pe = params.getEntryPtr("sortByDegree");
139 if (pe != NULL) sortByDegree = pe->getValue<bool>(&sortByDegree);
140
141 pe = params.getEntryPtr("readPerProcess");
142 if (pe != NULL) redistributed = pe->getValue<bool>(&redistributed);
143
144 if (me == 0) std::cout << "\n LowerTriangularBlock Distribution: "
145 << "\n np = " << np
146 << "\n nChunks = " << nChunks
147 << std::endl;
148 }
149
150 enum DistributionType DistType() { return LowerTriangularBlock; }
151
152 bool areChunksComputed() {return chunksComputed; }
153
154 Teuchos::Array<gno_t> getChunkCuts() {
155 if(chunksComputed)
156 return chunkCuts;
157 else {
158 throw std::runtime_error("Error: Requested chunk cuts have not been computed yet.");
159 }
160 }
161
162 // Return whether this rank owns vector entry i.
163 // TODO: for now, use same vector dist as 1DLinear;
164 // TODO: think about best distribution of Vectors
165 inline bool VecMine(gno_t i) { return initialDist.VecMine(i); }
166
167 // Return whether this rank owns nonzero (i,j)
168 // Vector map and row map are the same in 1D distribution.
169 // But keep only the lower Triangular entries
170 bool Mine(gno_t i, gno_t j) {
171 if (redistributed) {
172 if (j > i) return false; // Don't keep any upper triangular entries
173 else return (procFromChunks(i,j) == me);
174 }
175 else
176 return initialDist.Mine(i,j);
177 }
178
179 inline bool Mine(gno_t i, gno_t j, int p) {return Mine(i,j);}
180
181 // How to redistribute according to chunk-based row distribution
182 void Redistribute(LocalNZmap_t &localNZ)
183 {
184 // Going to do chunking and sorting serially for now;
185 // need to gather per-row information from each processor
186 // TODO: think about a parallel implementation
187
188 gno_t myFirstRow = initialDist.getFirstRow(me);
189 gno_t nMyRows = initialDist.getNumRow(me);
190 Teuchos::Array<gno_t> nnzPerRow(nMyRows, 0);
191
192 Teuchos::Array<int> rcvcnt(np);
193 Teuchos::Array<int> disp(np);
194 for (int sum = 0, p = 0; p < np; p++) {
195 int prows = initialDist.getNumRow(p);
196 rcvcnt[p] = prows;
197 disp[p] = sum;
198 sum += prows;
199 }
200
201 // If desire sortByDegree, first need to sort with respect to ALL entries
202 // in matrix (not lower-triangular entries);
203 // decreasing sort by number of entries per row in global matrix.
204 // Generate permuteIndex for the sorted rows
205
206 Teuchos::Array<gno_t> permuteIndex; // This is the inverse permutation
207 Teuchos::Array<gno_t> sortedOrder; // This is the original permutation
208
209 Teuchos::Array<gno_t> globalRowBuf;
210 // TODO Dunno why there isn't a Teuchos::gatherAllv;
211 // TODO for now, compute and broadcast
212 if (me == 0) {
213 globalRowBuf.resize(nrows, 0); // TODO: Ick! Need parallel
214 }
215
216 if (sortByDegree) {
217 // Compute nnzPerRow; distribution is currently 1D and includes all nz
218 for (auto it = localNZ.begin(); it != localNZ.end(); it++) {
219 gno_t I = it->first.first;
220 nnzPerRow[I-myFirstRow]++;
221 }
222
223 Teuchos::gatherv<int,gno_t>(nnzPerRow.getRawPtr(), nMyRows,
224 globalRowBuf.getRawPtr(),
225 rcvcnt.getRawPtr(), disp.getRawPtr(),
226 0, *comm);
227
228 permuteIndex.resize(nrows); // TODO: Ick! Need parallel
229 sortedOrder.resize(nrows); // TODO: Ick! Need parallel
230
231 if (me == 0) { // TODO: do on all procs once have allgatherv
232
233 for (size_t i = 0 ; i != nrows; i++) sortedOrder[i] = i;
234
235 std::sort(sortedOrder.begin(), sortedOrder.end(),
236 [&](const size_t& a, const size_t& b) {
237 return (globalRowBuf[a] > globalRowBuf[b]);
238 }
239 );
240
241 // Compute inverse permutation; it is more useful for our needs
242 for (size_t i = 0; i < nrows; i++) {
243 permuteIndex[sortedOrder[i]] = i;
244 }
245 }
246
247 Teuchos::broadcast<int,gno_t>(*comm, 0, permuteIndex(0,nrows));
248 // Ick! Use a directory TODO
249
250 // Sorting is changing the global IDs associated
251 // with rows/columns. To make this distribution applicable beyond
252 // triangle counting (e.g., in a Tpetra operator), we need a way
253 // to map from the original global IDs and back again.
254 // Create a permutation matrix for use in the operator; use
255 // default Tpetra layout.
256 Teuchos::Array<gno_t> myRows;
257 for (size_t i = 0; i < nrows; i++) {
258 if (VecMine(i)) myRows.push_back(i);
259 }
260
261 Tpetra::global_size_t dummy =
262 Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid();
263 Teuchos::RCP<const map_t> permMap =
264 rcp(new map_t(dummy, myRows(), 0, comm));
265
266 permMatrix = rcp(new matrix_t(permMap, 1)); // one nz / row in permMatrix
267
268 Teuchos::Array<gno_t> cols(1);
269 Teuchos::Array<scalar_t> vals(1); vals[0] = 1.;
270
271 for (size_t i = 0; i < permMap->getLocalNumElements(); i++) {
272 gno_t gid = permMap->getGlobalElement(i);
273 cols[0] = permuteIndex[gid];
274 permMatrix->insertGlobalValues(gid, cols(), vals());
275 }
276
277 permMatrix->fillComplete(permMap, permMap);
278 }
279
280 // Now, to determine the chunks, we care only about the number of
281 // nonzeros in the lower triangular matrix.
282 // Compute nnzPerRow; distribution is currently 1D
283 nnzPerRow.assign(nMyRows, 0);
284 size_t nnz = 0;
285 for (auto it = localNZ.begin(); it != localNZ.end(); it++) {
286 gno_t I = (sortByDegree ? permuteIndex[it->first.first]
287 : it->first.first);
288 gno_t J = (sortByDegree ? permuteIndex[it->first.second]
289 : it->first.second);
290 if (J <= I) {// Lower-triangular part
291 nnzPerRow[it->first.first - myFirstRow]++;
292 nnz++;
293 }
294 }
295
296 // TODO Dunno why there isn't a Teuchos::gatherAllv;
297 // TODO for now, compute and broadcast
298
299 Teuchos::gatherv<int,gno_t>(nnzPerRow.getRawPtr(), nMyRows,
300 globalRowBuf.getRawPtr(),
301 rcvcnt.getRawPtr(), disp.getRawPtr(),
302 0, *comm);
303
304 Teuchos::Array<int>().swap(rcvcnt); // no longer needed
305 Teuchos::Array<int>().swap(disp); // no longer needed
306
307 size_t gNnz;
308 Teuchos::reduceAll(*comm, Teuchos::REDUCE_SUM, 1, &nnz, &gNnz);
309
310 chunkCuts.resize(nChunks+1, 0);
311
312
313 if (me == 0) { // TODO: when have allgatherv, can do on all procs
314 // TODO: or better, implement parallel version
315
316 // Determine chunk cuts
317 size_t target = gNnz / np; // target nnz per processor
318 size_t targetRunningTotal = 0;
319 size_t currentRunningTotal = 0;
320 gno_t I = gno_t(0);
321 for (int chunkCnt = 0; chunkCnt < nChunks; chunkCnt++) {
322 targetRunningTotal = (target * (chunkCnt+1));
323 currentRunningTotal = 0;
324 while (I < static_cast<gno_t>(nrows)) {
325 size_t nextNnz = (sortByDegree ? globalRowBuf[sortedOrder[I]]
326 : globalRowBuf[I]);
327 if (currentRunningTotal + nextNnz <= targetRunningTotal) {
328 currentRunningTotal += nextNnz;
329 I++;
330 }
331 else
332 break;
333 }
334 chunkCuts[chunkCnt+1] = I;
335 }
336 chunkCuts[nChunks] = static_cast<gno_t>(nrows);
337 }
338
339 // Free memory associated with globalRowBuf
340 Teuchos::Array<gno_t>().swap(globalRowBuf);
341
342 Teuchos::broadcast<int,gno_t>(*comm, 0, chunkCuts(0,nChunks+1));
343 chunksComputed = true;
344
345 // Determine new owner of each nonzero; buffer for sending
346 Kokkos::View<gno_t*, Kokkos::HostSpace> iOut("iOut", localNZ.size());
347 Kokkos::View<gno_t*, Kokkos::HostSpace> jOut("jOut", localNZ.size());
348 Kokkos::View<scalar_t*, Kokkos::HostSpace> vOut("vOut", localNZ.size());
349 Teuchos::Array<int> pOut(localNZ.size());
350
351 size_t sendCnt = 0;
352 for (auto it = localNZ.begin(); it != localNZ.end(); it++) {
353 iOut[sendCnt] = (sortByDegree ? permuteIndex[it->first.first]
354 : it->first.first);
355 jOut[sendCnt] = (sortByDegree ? permuteIndex[it->first.second]
356 : it->first.second);
357 if (jOut[sendCnt] <= iOut[sendCnt]) { // keep only lower diagonal entries
358 vOut[sendCnt] = it->second;
359 pOut[sendCnt] = procFromChunks(iOut[sendCnt], jOut[sendCnt]);
360
361 sendCnt++;
362 }
363 }
364
365 // Free memory associated with localNZ and permuteIndex
366 LocalNZmap_t().swap(localNZ);
367 if (sortByDegree) Teuchos::Array<gno_t>().swap(permuteIndex);
368
369 // Use a Distributor to send nonzeros to new processors.
370 Tpetra::Distributor plan(comm);
371 size_t nrecvs = plan.createFromSends(pOut(0,sendCnt));
372 Kokkos::View<gno_t*, Kokkos::HostSpace> iIn("iIn", nrecvs);
373 Kokkos::View<gno_t*, Kokkos::HostSpace> jIn("jIn", nrecvs);
374 Kokkos::View<scalar_t*, Kokkos::HostSpace> vIn("vIn", nrecvs);
375
376 // TODO: With more clever packing, could do only one round of communication
377 auto sendIndices = std::make_pair(static_cast<size_t>(0), sendCnt);
378 plan.doPostsAndWaits(Kokkos::subview(iOut, sendIndices), 1, iIn);
379 plan.doPostsAndWaits(Kokkos::subview(jOut, sendIndices), 1, jIn);
380 plan.doPostsAndWaits(Kokkos::subview(vOut, sendIndices), 1, vIn);
381
382 // Put received nonzeros in map
383 for (size_t n = 0; n < nrecvs; n++) {
384 NZindex_t nz(iIn[n], jIn[n]);
385 localNZ[nz] = vIn[n];
386 }
387
388 redistributed = true;
389 }
390
391 Teuchos::RCP<matrix_t> getPermutationMatrix() const { return permMatrix; }
392
393private:
394 // Initially distribute nonzeros with a 1D linear distribution
395 Distribution1DLinear<gno_t,scalar_t> initialDist;
396
397 // Flag indicating whether matrix should be reordered and renumbered
398 // in decreasing sort order of number of nonzeros per row in full matrix
399 bool sortByDegree;
400
401 // Column permutation matrix built only when sortByDegree = true;
402 Teuchos::RCP<matrix_t> permMatrix;
403
404 // Flag whether redistribution has occurred yet
405 // This is true
406 // i) after Tpetra performs the redistribution or
407 // ii) when Tpetra reads already-distributed nonzeros by readPerProcess function
408 bool redistributed;
409
410 // If we read the already-distributed nonzeros from per-process files,
411 // this will remain false until a triangle counting code actually computes
412 // the chunks when the need arises.
413 bool chunksComputed;
414
415 int nChunks; // in np = q(q+1)/2 layout, nChunks = q
416 double nChunksPerRow;
417 Teuchos::Array<gno_t> chunkCuts;
418
419 int findIdxInChunks(gno_t I) {
420 int m = I * nChunksPerRow;
421 while (I < chunkCuts[m]) m--;
422 while (I >= chunkCuts[m+1]) m++;
423 return m;
424 }
425
426 int procFromChunks(gno_t I, gno_t J) {
427 int m = findIdxInChunks(I);
428 int n = findIdxInChunks(J);
429 int p = m*(m+1)/2 + n;
430 return p;
431 }
432};
433
434
436// Tpetra::Operator that works with the DistributionLowerTriangularBlock
437
438template <typename scalar_t,
439 class Node = ::Tpetra::Details::DefaultTypes::node_type>
440class LowerTriangularBlockOperator :
441 public Tpetra::Operator<scalar_t, Tpetra::Map<>::local_ordinal_type,
442 Tpetra::Map<>::global_ordinal_type,
443 Node>
444{
445public:
448 using map_t = Tpetra::Map<>;
449 using import_t = Tpetra::Import<>;
450 using export_t = Tpetra::Export<>;
451 using vector_t = Tpetra::Vector<scalar_t>;
452 using mvector_t = Tpetra::MultiVector<scalar_t>;
453 using matrix_t = Tpetra::CrsMatrix<scalar_t, lno_t, gno_t, Node>;
454 using dist_t = Tpetra::DistributionLowerTriangularBlock<gno_t, scalar_t>;
455
456 LowerTriangularBlockOperator(
457 const Teuchos::RCP<const matrix_t> &lowerTriangularMatrix_,
458 const dist_t &dist)
459 : lowerTriangularMatrix(lowerTriangularMatrix_),
460 permMatrix(dist.getPermutationMatrix())
461 {
462 // LowerTriangularBlockOperator requires the range map and domain map
463 // to be the same. Check it here.
464 TEUCHOS_TEST_FOR_EXCEPTION(
465 !lowerTriangularMatrix->getRangeMap()->isSameAs(
466 *lowerTriangularMatrix->getDomainMap()),
467 std::logic_error,
468 "The Domain and Range maps of the LowerTriangularBlock matrix "
469 "must be the same");
470
471 // Extract diagonals
472
473 vector_t diagByRowMap(lowerTriangularMatrix->getRowMap());
474 lowerTriangularMatrix->getLocalDiagCopy(diagByRowMap);
475 diag = Teuchos::rcp(new vector_t(lowerTriangularMatrix->getRangeMap()));
476 Tpetra::Export<> exporter(lowerTriangularMatrix->getRowMap(),
477 lowerTriangularMatrix->getRangeMap());
478 diag->doExport(diagByRowMap, exporter, Tpetra::ADD);
479 }
480
481 void apply(const mvector_t &x, mvector_t &y, Teuchos::ETransp mode,
482 scalar_t alpha, scalar_t beta) const
483 {
484 scalar_t ZERO = Teuchos::ScalarTraits<scalar_t>::zero();
485 scalar_t ONE = Teuchos::ScalarTraits<scalar_t>::one();
486 if (alpha == ZERO) {
487 if (beta == ZERO) y.putScalar(ZERO);
488 else y.scale(beta);
489 return;
490 }
491
492 if (permMatrix == Teuchos::null) {
493
494 // Multiply lower triangular
495 lowerTriangularMatrix->apply(x, y, Teuchos::NO_TRANS, alpha, beta);
496
497 // Multiply upper triangular
498 lowerTriangularMatrix->apply(x, y, Teuchos::TRANS, alpha, ONE);
499
500 // Subtract out duplicate diagonal terms
501 y.elementWiseMultiply(-alpha, *diag, x, ONE);
502 }
503 else {
504
505 // With sorting, the LowerTriangularBlock distribution stores (P^T A P)
506 // in the CrsMatrix, for permutation matrix P.
507 // Thus, apply must compute
508 // y = P (beta (P^T y) + alpha (P^T A P) (P^T x))
509
510 vector_t xtmp(x.getMap(), x.getNumVectors());
511 vector_t ytmp(y.getMap(), y.getNumVectors());
512
513 permMatrix->apply(x, xtmp, Teuchos::TRANS);
514 if (beta != ZERO) permMatrix->apply(y, ytmp, Teuchos::TRANS);
515
516 // Multiply lower triangular
517 lowerTriangularMatrix->apply(xtmp, ytmp, Teuchos::NO_TRANS, alpha, beta);
518
519 // Multiply upper triangular
520 lowerTriangularMatrix->apply(xtmp, ytmp, Teuchos::TRANS, alpha, ONE);
521
522 // Subtract out duplicate diagonal terms
523 ytmp.elementWiseMultiply(-alpha, *diag, xtmp, ONE);
524
525 permMatrix->apply(ytmp, y, Teuchos::NO_TRANS);
526 }
527 }
528
529 Teuchos::RCP<const map_t> getDomainMap() const {
530 return lowerTriangularMatrix->getDomainMap();
531 }
532
533 Teuchos::RCP<const map_t> getRangeMap() const {
534 return lowerTriangularMatrix->getRangeMap();
535 }
536
537 bool hasTransposeApply() const {return true;} // Symmetric matrix
538
539private:
540 const Teuchos::RCP<const matrix_t > lowerTriangularMatrix;
541 const Teuchos::RCP<const matrix_t > permMatrix;
542 Teuchos::RCP<vector_t> diag;
543};
544
545
546}
547#endif
Functions for initializing and finalizing Tpetra.
GlobalOrdinal global_ordinal_type
The type of global indices.
LocalOrdinal local_ordinal_type
The type of local indices.
virtual Teuchos::RCP< const Map< Tpetra::Map<>::local_ordinal_type, Tpetra::Map<>::global_ordinal_type, ::Tpetra::Details::DefaultTypes::node_type > > getRangeMap() const=0
virtual Teuchos::RCP< const Map< Tpetra::Map<>::local_ordinal_type, Tpetra::Map<>::global_ordinal_type, ::Tpetra::Details::DefaultTypes::node_type > > getDomainMap() const=0
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.
@ ADD
Sum new values.
@ ZERO
Replace old values with zero.