Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11#define TPETRA_MATRIXMATRIX_DEF_HPP
13#include "KokkosSparse_Utils.hpp"
14#include "Tpetra_ConfigDefs.hpp"
16#include "Teuchos_VerboseObject.hpp"
17#include "Teuchos_Array.hpp"
18#include "Tpetra_Util.hpp"
19#include "Tpetra_CrsMatrix.hpp"
20#include "Tpetra_BlockCrsMatrix.hpp"
22#include "Tpetra_RowMatrixTransposer.hpp"
25#include "Tpetra_Details_makeColMap.hpp"
26#include "Tpetra_ConfigDefs.hpp"
27#include "Tpetra_Map.hpp"
28#include "Tpetra_Export.hpp"
31#include <algorithm>
32#include <type_traits>
33#include "Teuchos_FancyOStream.hpp"
34
35#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
37
38#include "KokkosSparse_spgemm.hpp"
39#include "KokkosSparse_spadd.hpp"
40#include "Kokkos_Bitset.hpp"
41
43
48
49
50
51/*********************************************************************************************************/
52// Include the architecture-specific kernel partial specializations here
53// NOTE: This needs to be outside all namespaces
54#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
55#include "TpetraExt_MatrixMatrix_Cuda.hpp"
56#include "TpetraExt_MatrixMatrix_HIP.hpp"
57#include "TpetraExt_MatrixMatrix_SYCL.hpp"
58
59namespace Tpetra {
60
61namespace MatrixMatrix{
62
63//
64// This method forms the matrix-matrix product C = op(A) * op(B), where
65// op(A) == A if transposeA is false,
66// op(A) == A^T if transposeA is true,
67// and similarly for op(B).
68//
69template <class Scalar,
70 class LocalOrdinal,
71 class GlobalOrdinal,
72 class Node>
75 bool transposeA,
77 bool transposeB,
79 bool call_FillComplete_on_result,
80 const std::string& label,
81 const Teuchos::RCP<Teuchos::ParameterList>& params)
82{
83 using Teuchos::null;
84 using Teuchos::RCP;
85 using Teuchos::rcp;
86 typedef Scalar SC;
87 typedef LocalOrdinal LO;
88 typedef GlobalOrdinal GO;
89 typedef Node NO;
90 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
92 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
93 typedef Map<LO,GO,NO> map_type;
94 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
95
96#ifdef HAVE_TPETRA_MMM_TIMINGS
97 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
98 using Teuchos::TimeMonitor;
99 //MM is used to time setup, and then multiply.
100
101 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
102#endif
103
104 const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
105
106 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
107
108 // The input matrices A and B must both be fillComplete.
109 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
110 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
111
112 // If transposeA is true, then Aprime will be the transpose of A
113 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
114 // will just be a pointer to A.
115 RCP<const crs_matrix_type> Aprime = null;
116 // If transposeB is true, then Bprime will be the transpose of B
117 // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
118 // will just be a pointer to B.
119 RCP<const crs_matrix_type> Bprime = null;
120
121 // Is this a "clean" matrix?
122 //
123 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
124 // locally nor globally indexed, then it was empty. I don't like
125 // this, because the most straightforward implementation presumes
126 // lazy allocation of indices. However, historical precedent
127 // demands that we keep around this predicate as a way to test
128 // whether the matrix is empty.
129 const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
130
131 bool use_optimized_ATB = false;
132 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
133 use_optimized_ATB = true;
134
135#ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
136 use_optimized_ATB = false;
137#endif
138
139 using Teuchos::ParameterList;
140 RCP<ParameterList> transposeParams (new ParameterList);
141 transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
142
143 if (!use_optimized_ATB && transposeA) {
144 transposer_type transposer (rcpFromRef (A));
145 Aprime = transposer.createTranspose (transposeParams);
146 }
147 else {
148 Aprime = rcpFromRef(A);
149 }
150
151 if (transposeB) {
152 transposer_type transposer (rcpFromRef (B));
153 Bprime = transposer.createTranspose (transposeParams);
154 }
155 else {
156 Bprime = rcpFromRef(B);
157 }
158
159 // Check size compatibility
160 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
161 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
162 global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
163 global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
164 global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
165 global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
166 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
167 prefix << "ERROR, inner dimensions of op(A) and op(B) "
168 "must match for matrix-matrix product. op(A) is "
169 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
170
171 // The result matrix C must at least have a row-map that reflects the correct
172 // row-size. Don't check the number of columns because rectangular matrices
173 // which were constructed with only one map can still end up having the
174 // correct capacity and dimensions when filled.
175 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
176 prefix << "ERROR, dimensions of result C must "
177 "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
178 << " rows, should have at least " << Aouter << std::endl);
179
180 // It doesn't matter whether C is already Filled or not. If it is already
181 // Filled, it must have space allocated for the positions that will be
182 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
183 // we'll error out later when trying to store result values.
184
185 // CGB: However, matrix must be in active-fill
186 if (!C.isFillActive()) C.resumeFill();
187
188 // We're going to need to import remotely-owned sections of A and/or B if
189 // more than one processor is performing this run, depending on the scenario.
190 int numProcs = A.getComm()->getSize();
191
192 // Declare a couple of structs that will be used to hold views of the data
193 // of A and B, to be used for fast access during the matrix-multiplication.
194 crs_matrix_struct_type Aview;
195 crs_matrix_struct_type Bview;
196
197 RCP<const map_type> targetMap_A = Aprime->getRowMap();
198 RCP<const map_type> targetMap_B = Bprime->getRowMap();
199
200#ifdef HAVE_TPETRA_MMM_TIMINGS
201 {
202 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
203#endif
204
205 // Now import any needed remote rows and populate the Aview struct
206 // NOTE: We assert that an import isn't needed --- since we do the transpose
207 // above to handle that.
208 if (!use_optimized_ATB) {
209 RCP<const import_type> dummyImporter;
210 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
211 }
212
213 // We will also need local access to all rows of B that correspond to the
214 // column-map of op(A).
215 if (numProcs > 1)
216 targetMap_B = Aprime->getColMap();
217
218 // Import any needed remote rows and populate the Bview struct.
219 if (!use_optimized_ATB)
220 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
221
222#ifdef HAVE_TPETRA_MMM_TIMINGS
223 } //stop MM_importExtract here
224 //stop the setup timer, and start the multiply timer
225 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
226#endif
227
228 // Call the appropriate method to perform the actual multiplication.
229 if (use_optimized_ATB) {
230 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
231
232 } else if (call_FillComplete_on_result && newFlag) {
233 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
234
235 } else if (call_FillComplete_on_result) {
236 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
237
238 } else {
239 // mfh 27 Sep 2016: Is this the "slow" case? This
240 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
241 // thread-parallel inserts, but that may take some effort.
242 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
243
244 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
245 }
246
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
249#endif
250 if (call_FillComplete_on_result && !C.isFillComplete()) {
251 // We'll call FillComplete on the C matrix before we exit, and give it a
252 // domain-map and a range-map.
253 // The domain-map will be the domain-map of B, unless
254 // op(B)==transpose(B), in which case the range-map of B will be used.
255 // The range-map will be the range-map of A, unless op(A)==transpose(A),
256 // in which case the domain-map of A will be used.
257 C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
258 }
259}
260
261//
262// This method forms the matrix-matrix product C = op(A) * op(B), where
263// op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
264//
265template <class Scalar,
266 class LocalOrdinal,
267 class GlobalOrdinal,
268 class Node>
271 bool transposeA,
273 bool transposeB,
275 const std::string& label)
276{
277 using Teuchos::null;
278 using Teuchos::RCP;
279 using Teuchos::rcp;
280 typedef Scalar SC;
281 typedef LocalOrdinal LO;
282 typedef GlobalOrdinal GO;
283 typedef Node NO;
284 typedef BlockCrsMatrixStruct<SC,LO,GO,NO> blockcrs_matrix_struct_type;
285 typedef Map<LO,GO,NO> map_type;
287
288 std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
289
290 TEUCHOS_TEST_FOR_EXCEPTION(transposeA==true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
291 TEUCHOS_TEST_FOR_EXCEPTION(transposeB==true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
292
293 // Check size compatibility
294 global_size_t numACols = A->getGlobalNumCols();
295 global_size_t numBCols = B->getGlobalNumCols();
296 global_size_t numARows = A->getGlobalNumRows();
297 global_size_t numBRows = B->getGlobalNumRows();
298
299 global_size_t Aouter = numARows;
300 global_size_t Bouter = numBCols;
301 global_size_t Ainner = numACols;
302 global_size_t Binner = numBRows;
303 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
304 prefix << "ERROR, inner dimensions of op(A) and op(B) "
305 "must match for matrix-matrix product. op(A) is "
306 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
307
308 // We're going to need to import remotely-owned sections of A and/or B if
309 // more than one processor is performing this run, depending on the scenario.
310 int numProcs = A->getComm()->getSize();
311
312 const LO blocksize = A->getBlockSize();
313 TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
314 prefix << "ERROR, Blocksizes do not match. A.blocksize = " <<
315 blocksize << ", B.blocksize = " << B->getBlockSize() );
316
317 // Declare a couple of structs that will be used to hold views of the data
318 // of A and B, to be used for fast access during the matrix-multiplication.
319 blockcrs_matrix_struct_type Aview(blocksize);
320 blockcrs_matrix_struct_type Bview(blocksize);
321
322 RCP<const map_type> targetMap_A = A->getRowMap();
323 RCP<const map_type> targetMap_B = B->getRowMap();
324
325 // Populate the Aview struct. No remotes are needed.
326 RCP<const import_type> dummyImporter;
327 MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
328
329 // We will also need local access to all rows of B that correspond to the
330 // column-map of op(A).
331 if (numProcs > 1)
332 targetMap_B = A->getColMap();
333
334 // Import any needed remote rows and populate the Bview struct.
335 MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
336 A->getGraph()->getImporter().is_null());
337
338 // Call the appropriate method to perform the actual multiplication.
339 MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
340}
341
342template <class Scalar,
343 class LocalOrdinal,
344 class GlobalOrdinal,
345 class Node>
346void Jacobi(Scalar omega,
351 bool call_FillComplete_on_result,
352 const std::string& label,
353 const Teuchos::RCP<Teuchos::ParameterList>& params)
354{
355 using Teuchos::RCP;
356 typedef Scalar SC;
357 typedef LocalOrdinal LO;
358 typedef GlobalOrdinal GO;
359 typedef Node NO;
361 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
362 typedef Map<LO,GO,NO> map_type;
363 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
364
365#ifdef HAVE_TPETRA_MMM_TIMINGS
366 std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
367 using Teuchos::TimeMonitor;
368 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
369#endif
370
371 const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
372
373 // A and B should already be Filled.
374 // Should we go ahead and call FillComplete() on them if necessary or error
375 // out? For now, we choose to error out.
376 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
377 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
378
379 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
380 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
381
382 // Now check size compatibility
383 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
384 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
385 global_size_t Aouter = A.getGlobalNumRows();
386 global_size_t Bouter = numBCols;
387 global_size_t Ainner = numACols;
388 global_size_t Binner = B.getGlobalNumRows();
389 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
390 prefix << "ERROR, inner dimensions of op(A) and op(B) "
391 "must match for matrix-matrix product. op(A) is "
392 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
393
394 // The result matrix C must at least have a row-map that reflects the correct
395 // row-size. Don't check the number of columns because rectangular matrices
396 // which were constructed with only one map can still end up having the
397 // correct capacity and dimensions when filled.
398 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
399 prefix << "ERROR, dimensions of result C must "
400 "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
401 << " rows, should have at least "<< Aouter << std::endl);
402
403 // It doesn't matter whether C is already Filled or not. If it is already
404 // Filled, it must have space allocated for the positions that will be
405 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
406 // we'll error out later when trying to store result values.
407
408 // CGB: However, matrix must be in active-fill
409 TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
410
411 // We're going to need to import remotely-owned sections of A and/or B if
412 // more than one processor is performing this run, depending on the scenario.
413 int numProcs = A.getComm()->getSize();
414
415 // Declare a couple of structs that will be used to hold views of the data of
416 // A and B, to be used for fast access during the matrix-multiplication.
417 crs_matrix_struct_type Aview;
418 crs_matrix_struct_type Bview;
419
420 RCP<const map_type> targetMap_A = Aprime->getRowMap();
421 RCP<const map_type> targetMap_B = Bprime->getRowMap();
422
423#ifdef HAVE_TPETRA_MMM_TIMINGS
424 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
425 {
426#endif
427
428 // Enable globalConstants by default
429 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
430 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
431 if(!params.is_null()) {
432 importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
433 int mm_optimization_core_count=0;
434 auto slist = params->sublist("matrixmatrix: kernel params",false);
435 mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
436 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
437 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
438 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
439 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
440 auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
441 ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
442 ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
443 ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
444 }
445
446 //Now import any needed remote rows and populate the Aview struct.
447 RCP<const import_type> dummyImporter;
448 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
449
450 // We will also need local access to all rows of B that correspond to the
451 // column-map of op(A).
452 if (numProcs > 1)
453 targetMap_B = Aprime->getColMap();
454
455 // Now import any needed remote rows and populate the Bview struct.
456 // Enable globalConstants by default
457 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
458 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
459 if(!params.is_null()) {
460 importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
461
462 auto slist = params->sublist("matrixmatrix: kernel params",false);
463 int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
464 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
465 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
466 auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
467 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
468 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
469 ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
470 ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
471 ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
472 }
473
474 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
475
476#ifdef HAVE_TPETRA_MMM_TIMINGS
477 }
478 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
479#endif
480
481 // Now call the appropriate method to perform the actual multiplication.
482 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
483
484 // Is this a "clean" matrix
485 bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
486
487 if (call_FillComplete_on_result && newFlag) {
488 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
489
490 } else if (call_FillComplete_on_result) {
491 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
492
493 } else {
494 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
495 }
496
497 if(!params.is_null()) {
498 bool removeZeroEntries = params->get("remove zeros", false);
499 if (removeZeroEntries) {
500 typedef Teuchos::ScalarTraits<Scalar> STS;
501 typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
502 removeCrsMatrixZeros(C, threshold);
503 }
504 }
505}
506
507
508template <class Scalar,
509 class LocalOrdinal,
510 class GlobalOrdinal,
511 class Node>
512void Add(
514 bool transposeA,
515 Scalar scalarA,
517 Scalar scalarB )
518{
519 using Teuchos::Array;
520 using Teuchos::RCP;
521 using Teuchos::null;
522 typedef Scalar SC;
523 typedef LocalOrdinal LO;
524 typedef GlobalOrdinal GO;
525 typedef Node NO;
526 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
527 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
528
529 const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
530
531 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
532 prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
533 "(Result matrix B is not required to be isFillComplete()).");
534 TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
535 prefix << "ERROR, input matrix B must not be fill complete!");
536 TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
537 prefix << "ERROR, input matrix B must not have static graph!");
538 TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
539 prefix << "ERROR, input matrix B must not be locally indexed!");
540
541 using Teuchos::ParameterList;
542 RCP<ParameterList> transposeParams (new ParameterList);
543 transposeParams->set ("sort", false);
544
545 RCP<const crs_matrix_type> Aprime = null;
546 if (transposeA) {
547 transposer_type transposer (rcpFromRef (A));
548 Aprime = transposer.createTranspose (transposeParams);
549 }
550 else {
551 Aprime = rcpFromRef(A);
552 }
553
554 size_t a_numEntries;
555 typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getLocalMaxNumRowEntries());
556 typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getLocalMaxNumRowEntries());
557 GO row;
558
559 if (scalarB != Teuchos::ScalarTraits<SC>::one())
560 B.scale(scalarB);
561
562 size_t numMyRows = B.getLocalNumRows();
563 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
564 for (LO i = 0; (size_t)i < numMyRows; ++i) {
565 row = B.getRowMap()->getGlobalElement(i);
566 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
567
568 if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
569 for (size_t j = 0; j < a_numEntries; ++j)
570 a_vals[j] *= scalarA;
571 }
572 B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
573 }
574 }
575}
576
577template <class Scalar,
578 class LocalOrdinal,
579 class GlobalOrdinal,
580 class Node>
581Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
582add (const Scalar& alpha,
583 const bool transposeA,
585 const Scalar& beta,
586 const bool transposeB,
588 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
589 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
590 const Teuchos::RCP<Teuchos::ParameterList>& params)
591{
592 using Teuchos::RCP;
593 using Teuchos::rcpFromRef;
594 using Teuchos::rcp;
595 using Teuchos::ParameterList;
597 if(!params.is_null())
598 {
599 TEUCHOS_TEST_FOR_EXCEPTION(
600 params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
601 std::invalid_argument,
602 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
603 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
604 params->set("Call fillComplete", true);
605 }
606 //If transposeB, must compute B's explicit transpose to
607 //get the correct row map for C.
608 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
609 if(transposeB)
610 {
612 Brcp = transposer.createTranspose();
613 }
614 //Check that A,B are fillComplete before getting B's column map
615 TEUCHOS_TEST_FOR_EXCEPTION
616 (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
617 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
618 RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
619 //this version of add() always fill completes the result, no matter what is in params on input
620 add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
621 return C;
622}
623
624//This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
625//but since the spadd() output is always packed there is no need for a separate
626//numRowEntries here.
627//
628template<class LO, class GO, class LOView, class GOView, class LocalMap>
629struct ConvertGlobalToLocalFunctor
630{
631 ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
632 : lids(lids_), gids(gids_), localColMap(localColMap_)
633 {}
634
635 KOKKOS_FUNCTION void operator() (const GO i) const
636 {
637 lids(i) = localColMap.getLocalElement(gids(i));
638 }
639
640 LOView lids;
641 const GOView gids;
642 const LocalMap localColMap;
643};
644
645template <class Scalar,
646 class LocalOrdinal,
647 class GlobalOrdinal,
648 class Node>
649void
650add (const Scalar& alpha,
651 const bool transposeA,
653 const Scalar& beta,
654 const bool transposeB,
657 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
658 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
659 const Teuchos::RCP<Teuchos::ParameterList>& params)
660{
661 using Teuchos::RCP;
662 using Teuchos::rcp;
663 using Teuchos::rcpFromRef;
664 using Teuchos::rcp_implicit_cast;
665 using Teuchos::rcp_dynamic_cast;
666 using Teuchos::TimeMonitor;
667 using SC = Scalar;
668 using LO = LocalOrdinal;
669 using GO = GlobalOrdinal;
670 using NO = Node;
671 using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
673 using map_type = Map<LO,GO,NO>;
674 using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
677 using exec_space = typename crs_graph_type::execution_space;
678 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
679 const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
680 constexpr bool debug = false;
681
682#ifdef HAVE_TPETRA_MMM_TIMINGS
683 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
684#endif
685
686 if (debug) {
687 std::ostringstream os;
688 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
689 << "TpetraExt::MatrixMatrix::add" << std::endl;
690 std::cerr << os.str ();
691 }
692
693 TEUCHOS_TEST_FOR_EXCEPTION
694 (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
695 prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
696 TEUCHOS_TEST_FOR_EXCEPTION
697 (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
698 prefix_mmm << "A and B must both be fill complete.");
699#ifdef HAVE_TPETRA_DEBUG
700 // The matrices don't have domain or range Maps unless they are fill complete.
701 if (A.isFillComplete () && B.isFillComplete ()) {
702 const bool domainMapsSame =
703 (! transposeA && ! transposeB &&
704 ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
705 (! transposeA && transposeB &&
706 ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
707 ( transposeA && ! transposeB &&
708 ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
709 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
710 prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
711
712 const bool rangeMapsSame =
713 (! transposeA && ! transposeB &&
714 ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
715 (! transposeA && transposeB &&
716 ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
717 ( transposeA && ! transposeB &&
718 ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
719 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
720 prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
721 }
722#endif // HAVE_TPETRA_DEBUG
723
724 using Teuchos::ParameterList;
725 // Form the explicit transpose of A if necessary.
726 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
727 if (transposeA) {
728 transposer_type transposer (Aprime);
729 Aprime = transposer.createTranspose ();
730 }
731
732#ifdef HAVE_TPETRA_DEBUG
733 TEUCHOS_TEST_FOR_EXCEPTION
734 (Aprime.is_null (), std::logic_error,
735 prefix_mmm << "Failed to compute Op(A). "
736 "Please report this bug to the Tpetra developers.");
737#endif // HAVE_TPETRA_DEBUG
738
739 // Form the explicit transpose of B if necessary.
740 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
741 if (transposeB) {
742 if (debug) {
743 std::ostringstream os;
744 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
745 << "Form explicit xpose of B" << std::endl;
746 std::cerr << os.str ();
747 }
748 transposer_type transposer (Bprime);
749 Bprime = transposer.createTranspose ();
750 }
751#ifdef HAVE_TPETRA_DEBUG
752 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
753 prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
754 TEUCHOS_TEST_FOR_EXCEPTION(
755 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
756 prefix_mmm << "Aprime and Bprime must both be fill complete. "
757 "Please report this bug to the Tpetra developers.");
758#endif // HAVE_TPETRA_DEBUG
759 RCP<const map_type> CDomainMap = domainMap;
760 RCP<const map_type> CRangeMap = rangeMap;
761 if(CDomainMap.is_null())
762 {
763 CDomainMap = Bprime->getDomainMap();
764 }
765 if(CRangeMap.is_null())
766 {
767 CRangeMap = Bprime->getRangeMap();
768 }
769 assert(!(CDomainMap.is_null()));
770 assert(!(CRangeMap.is_null()));
771 typedef typename AddKern::values_array values_array;
772 typedef typename AddKern::row_ptrs_array row_ptrs_array;
773 typedef typename AddKern::col_inds_array col_inds_array;
774 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
775 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
776 values_array vals;
777 row_ptrs_array rowptrs;
778 col_inds_array colinds;
779#ifdef HAVE_TPETRA_MMM_TIMINGS
780 MM = Teuchos::null;
781 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
782#endif
783 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
784 {
785 //import Aprime into Bprime's row map so the local matrices have same # of rows
786 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
787 // cbl do not set
788 // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
789 // this import _may_ take the form of a transfer. In practice it would be unlikely,
790 // but the general case is not so forgiving.
791 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
792 }
793 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
794 bool sorted = AGraphSorted && BGraphSorted;
795 RCP<const import_type> Cimport = Teuchos::null;
796 RCP<export_type> Cexport = Teuchos::null;
797 bool doFillComplete = true;
798 if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
799 {
800 doFillComplete = params->get<bool>("Call fillComplete");
801 }
802 auto Alocal = Aprime->getLocalMatrixDevice();
803 auto Blocal = Bprime->getLocalMatrixDevice();
804 LO numLocalRows = Alocal.numRows();
805 if(numLocalRows == 0)
806 {
807 //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
808 //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
809 //Handle this case now
810 //(without interfering with collective operations, since it's possible for
811 //some ranks to have 0 local rows and others not).
812 rowptrs = row_ptrs_array("C rowptrs", 0);
813 }
814 auto Acolmap = Aprime->getColMap();
815 auto Bcolmap = Bprime->getColMap();
816 if(!matchingColMaps)
817 {
818 using global_col_inds_array = typename AddKern::global_col_inds_array;
819#ifdef HAVE_TPETRA_MMM_TIMINGS
820 MM = Teuchos::null;
821 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
822#endif
823 //use kernel that converts col indices in both A and B to common domain map before adding
824 auto AlocalColmap = Acolmap->getLocalMap();
825 auto BlocalColmap = Bcolmap->getLocalMap();
826 global_col_inds_array globalColinds;
827 if (debug) {
828 std::ostringstream os;
829 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
830 << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
831 std::cerr << os.str ();
832 }
833 AddKern::convertToGlobalAndAdd(
834 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
835 vals, rowptrs, globalColinds);
836 if (debug) {
837 std::ostringstream os;
838 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
839 << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
840 std::cerr << os.str ();
841 }
842#ifdef HAVE_TPETRA_MMM_TIMINGS
843 MM = Teuchos::null;
844 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
845#endif
846 RCP<const map_type> CcolMap;
848 (CcolMap, CDomainMap, globalColinds);
849 C.replaceColMap(CcolMap);
850 col_inds_array localColinds("C colinds", globalColinds.extent(0));
851 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
852 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
853 col_inds_array, global_col_inds_array,
854 typename map_type::local_map_type>
855 (localColinds, globalColinds, CcolMap->getLocalMap()));
856 KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
857 C.setAllValues(rowptrs, localColinds, vals);
858 C.fillComplete(CDomainMap, CRangeMap, params);
859 if(!doFillComplete)
860 C.resumeFill();
861 }
862 else
863 {
864 //Aprime, Bprime and C all have the same column maps
865 auto Avals = Alocal.values;
866 auto Bvals = Blocal.values;
867 auto Arowptrs = Alocal.graph.row_map;
868 auto Browptrs = Blocal.graph.row_map;
869 auto Acolinds = Alocal.graph.entries;
870 auto Bcolinds = Blocal.graph.entries;
871 if(sorted)
872 {
873 //use sorted kernel
874#ifdef HAVE_TPETRA_MMM_TIMINGS
875 MM = Teuchos::null;
876 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
877#endif
878 if (debug) {
879 std::ostringstream os;
880 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
881 << "Call AddKern::addSorted(...)" << std::endl;
882 std::cerr << os.str ();
883 }
884#if KOKKOSKERNELS_VERSION >= 40299
885 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
886#else
887 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
888#endif
889 }
890 else
891 {
892 //use unsorted kernel
893#ifdef HAVE_TPETRA_MMM_TIMINGS
894 MM = Teuchos::null;
895 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
896#endif
897 if (debug) {
898 std::ostringstream os;
899 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
900 << "Call AddKern::addUnsorted(...)" << std::endl;
901 std::cerr << os.str ();
902 }
903 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
904 }
905 //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
906 RCP<const map_type> Ccolmap = Bcolmap;
907 C.replaceColMap(Ccolmap);
908 C.setAllValues(rowptrs, colinds, vals);
909 if(doFillComplete)
910 {
911 #ifdef HAVE_TPETRA_MMM_TIMINGS
912 MM = Teuchos::null;
913 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
914 #endif
915 if(!CDomainMap->isSameAs(*Ccolmap))
916 {
917 if (debug) {
918 std::ostringstream os;
919 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
920 << "Create Cimport" << std::endl;
921 std::cerr << os.str ();
922 }
923 Cimport = rcp(new import_type(CDomainMap, Ccolmap));
924 }
925 if(!C.getRowMap()->isSameAs(*CRangeMap))
926 {
927 if (debug) {
928 std::ostringstream os;
929 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
930 << "Create Cexport" << std::endl;
931 std::cerr << os.str ();
932 }
933 Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
934 }
935
936 if (debug) {
937 std::ostringstream os;
938 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
939 << "Call C->expertStaticFillComplete(...)" << std::endl;
940 std::cerr << os.str ();
941 }
942 C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
943 }
944 }
945}
946
947// This version of Add takes C as RCP&, so C may be null on input (in this case,
948// it is allocated and constructed in this function).
949template <class Scalar,
950 class LocalOrdinal,
951 class GlobalOrdinal,
952 class Node>
953void Add(
955 bool transposeA,
956 Scalar scalarA,
958 bool transposeB,
959 Scalar scalarB,
961{
962 using Teuchos::Array;
963 using Teuchos::ArrayRCP;
964 using Teuchos::ArrayView;
965 using Teuchos::RCP;
966 using Teuchos::rcp;
967 using Teuchos::rcp_dynamic_cast;
968 using Teuchos::rcpFromRef;
969 using Teuchos::tuple;
970 using std::endl;
971 // typedef typename ArrayView<const Scalar>::size_type size_type;
972 typedef Teuchos::ScalarTraits<Scalar> STS;
974 // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
975 // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
976 // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
979
980 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
981
982 TEUCHOS_TEST_FOR_EXCEPTION(
983 ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
984 prefix << "A and B must both be fill complete before calling this function.");
985
986 if(C.is_null()) {
987 TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
988 prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
989 "Please report this bug to the Tpetra developers.");
990 TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
991 prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
992 "Please report this bug to the Tpetra developers.");
993 }
994
995#ifdef HAVE_TPETRA_DEBUG
996 {
997 const bool domainMapsSame =
998 (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
999 (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
1000 (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
1001 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1002 prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
1003
1004 const bool rangeMapsSame =
1005 (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
1006 (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
1007 (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
1008 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1009 prefix << "The range Maps of Op(A) and Op(B) are not the same.");
1010 }
1011#endif // HAVE_TPETRA_DEBUG
1012
1013 using Teuchos::ParameterList;
1014 RCP<ParameterList> transposeParams (new ParameterList);
1015 transposeParams->set ("sort", false);
1016
1017 // Form the explicit transpose of A if necessary.
1018 RCP<const crs_matrix_type> Aprime;
1019 if (transposeA) {
1020 transposer_type theTransposer (rcpFromRef (A));
1021 Aprime = theTransposer.createTranspose (transposeParams);
1022 }
1023 else {
1024 Aprime = rcpFromRef (A);
1025 }
1026
1027#ifdef HAVE_TPETRA_DEBUG
1028 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1029 prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1030#endif // HAVE_TPETRA_DEBUG
1031
1032 // Form the explicit transpose of B if necessary.
1033 RCP<const crs_matrix_type> Bprime;
1034 if (transposeB) {
1035 transposer_type theTransposer (rcpFromRef (B));
1036 Bprime = theTransposer.createTranspose (transposeParams);
1037 }
1038 else {
1039 Bprime = rcpFromRef (B);
1040 }
1041
1042#ifdef HAVE_TPETRA_DEBUG
1043 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1044 prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1045#endif // HAVE_TPETRA_DEBUG
1046
1047 bool CwasFillComplete = false;
1048
1049 // Allocate or zero the entries of the result matrix.
1050 if (! C.is_null ()) {
1051 CwasFillComplete = C->isFillComplete();
1052 if(CwasFillComplete)
1053 C->resumeFill();
1054 C->setAllToScalar (STS::zero ());
1055 } else {
1056 // FIXME (mfh 08 May 2013) When I first looked at this method, I
1057 // noticed that C was being given the row Map of Aprime (the
1058 // possibly transposed version of A). Is this what we want?
1059
1060 // It is a precondition that Aprime and Bprime have the same domain and range maps.
1061 // However, they may have different row maps. In this case, it's difficult to
1062 // get a precise upper bound on the number of entries in each local row of C, so
1063 // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
1064 if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1065 LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1066 Array<size_t> CmaxEntriesPerRow(numLocalRows);
1067 for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1068 CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1069 }
1070 C = rcp (new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1071 }
1072 else {
1073 // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1074 C = rcp (new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1075 }
1076 }
1077
1078#ifdef HAVE_TPETRA_DEBUG
1079 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1080 prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1081 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1082 prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1083 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1084 prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1085#endif // HAVE_TPETRA_DEBUG
1086
1087 Array<RCP<const crs_matrix_type> > Mat =
1088 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1089 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1090
1091 // do a loop over each matrix to add: A reordering might be more efficient
1092 for (int k = 0; k < 2; ++k) {
1093 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1094 typename crs_matrix_type::nonconst_values_host_view_type Values;
1095
1096 // Loop over each locally owned row of the current matrix (either
1097 // Aprime or Bprime), and sum its entries into the corresponding
1098 // row of C. This works regardless of whether Aprime or Bprime
1099 // has the same row Map as C, because both sumIntoGlobalValues and
1100 // insertGlobalValues allow summing resp. inserting into nonowned
1101 // rows of C.
1102#ifdef HAVE_TPETRA_DEBUG
1103 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1104 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1105#endif // HAVE_TPETRA_DEBUG
1106 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1107#ifdef HAVE_TPETRA_DEBUG
1108 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1109 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1110#endif // HAVE_TPETRA_DEBUG
1111
1112 const size_t localNumRows = Mat[k]->getLocalNumRows ();
1113 for (size_t i = 0; i < localNumRows; ++i) {
1114 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1115 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1116 if (numEntries > 0) {
1117 if(numEntries > Indices.extent(0)) {
1118 Kokkos::resize(Indices, numEntries);
1119 Kokkos::resize(Values, numEntries);
1120 }
1121 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1122
1123 if (scalar[k] != STS::one ()) {
1124 for (size_t j = 0; j < numEntries; ++j) {
1125 Values[j] *= scalar[k];
1126 }
1127 }
1128
1129 if (CwasFillComplete) {
1130 size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1131 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1132 TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1133 prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1134 } else {
1135 C->insertGlobalValues (globalRow, numEntries,
1136 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1137 }
1138 }
1139 }
1140 }
1141 if(CwasFillComplete) {
1142 C->fillComplete(C->getDomainMap (),
1143 C->getRangeMap ());
1144 }
1145}
1146
1147// This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1148// to the above version where C is RCP&.
1149template <class Scalar,
1150 class LocalOrdinal,
1151 class GlobalOrdinal,
1152 class Node>
1153void Add(
1155 bool transposeA,
1156 Scalar scalarA,
1158 bool transposeB,
1159 Scalar scalarB,
1161{
1162 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1163
1164 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1165 prefix << "C must not be null");
1166
1167 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1168 Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1169}
1170
1171} //End namespace MatrixMatrix
1172
1173namespace MMdetails{
1174
1175/*********************************************************************************************************/
1177//template <class TransferType>
1178//void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1179// if (Transfer.is_null())
1180// return;
1181//
1182// const Distributor & Distor = Transfer->getDistributor();
1183// Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1184//
1185// size_t rows_send = Transfer->getNumExportIDs();
1186// size_t rows_recv = Transfer->getNumRemoteIDs();
1187//
1188// size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1189// size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1190// size_t num_send_neighbors = Distor.getNumSends();
1191// size_t num_recv_neighbors = Distor.getNumReceives();
1192// size_t round2_send, round2_recv;
1193// Distor.getLastDoStatistics(round2_send,round2_recv);
1194//
1195// int myPID = Comm->getRank();
1196// int NumProcs = Comm->getSize();
1197//
1198// // Processor by processor statistics
1199// // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1200// // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1201//
1202// // Global statistics
1203// size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1204// size_t gstats_min[8], gstats_max[8];
1205//
1206// double lstats_avg[8], gstats_avg[8];
1207// for(int i=0; i<8; i++)
1208// lstats_avg[i] = ((double)lstats[i])/NumProcs;
1209//
1210// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1211// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1212// Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1213//
1214// if(!myPID) {
1215// printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1216// (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1217// (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1218// printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1219// (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1220// (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1221// }
1222//}
1223
1224// Kernel method for computing the local portion of C = A*B
1225template<class Scalar,
1226 class LocalOrdinal,
1227 class GlobalOrdinal,
1228 class Node>
1229void mult_AT_B_newmatrix(
1230 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1231 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1232 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1233 const std::string & label,
1234 const Teuchos::RCP<Teuchos::ParameterList>& params)
1235{
1236 using Teuchos::RCP;
1237 using Teuchos::rcp;
1238 typedef Scalar SC;
1239 typedef LocalOrdinal LO;
1240 typedef GlobalOrdinal GO;
1241 typedef Node NO;
1242 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1243 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1244
1245#ifdef HAVE_TPETRA_MMM_TIMINGS
1246 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1247 using Teuchos::TimeMonitor;
1248 RCP<TimeMonitor> MM = rcp (new TimeMonitor
1249 (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1250#endif
1251
1252 /*************************************************************/
1253 /* 1) Local Transpose of A */
1254 /*************************************************************/
1255 transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1256
1257 using Teuchos::ParameterList;
1258 RCP<ParameterList> transposeParams (new ParameterList);
1259 transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1260 if(! params.is_null ()) {
1261 transposeParams->set ("compute global constants",
1262 params->get ("compute global constants: temporaries",
1263 false));
1264 }
1265 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1266 transposer.createTransposeLocal (transposeParams);
1267
1268 /*************************************************************/
1269 /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1270 /*************************************************************/
1271#ifdef HAVE_TPETRA_MMM_TIMINGS
1272 MM = Teuchos::null;
1273 MM = rcp (new TimeMonitor
1274 (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1275#endif
1276
1277 // Get views, asserting that no import is required to speed up computation
1278 crs_matrix_struct_type Aview;
1279 crs_matrix_struct_type Bview;
1280 RCP<const Import<LO, GO, NO> > dummyImporter;
1281
1282 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1283 RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1284 if (! params.is_null ()) {
1285 importParams1->set ("compute global constants",
1286 params->get ("compute global constants: temporaries",
1287 false));
1288 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1289 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1290 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1291 int mm_optimization_core_count =
1293 mm_optimization_core_count =
1294 slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1295 int mm_optimization_core_count2 =
1296 params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1297 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1298 mm_optimization_core_count = mm_optimization_core_count2;
1299 }
1300 auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1301 sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1302 sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1303 sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1304 }
1305
1306 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1307 Aview, dummyImporter, true,
1308 label, importParams1);
1309
1310 RCP<ParameterList> importParams2 (new ParameterList);
1311 if (! params.is_null ()) {
1312 importParams2->set ("compute global constants",
1313 params->get ("compute global constants: temporaries",
1314 false));
1315 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1316 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1317 bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1318 int mm_optimization_core_count =
1320 mm_optimization_core_count =
1321 slist.get ("MM_TAFC_OptimizationCoreCount",
1322 mm_optimization_core_count);
1323 int mm_optimization_core_count2 =
1324 params->get ("MM_TAFC_OptimizationCoreCount",
1325 mm_optimization_core_count);
1326 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1327 mm_optimization_core_count = mm_optimization_core_count2;
1328 }
1329 auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1330 sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1331 sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1332 sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1333 }
1334
1335 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1336 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1337 }
1338 else {
1339 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1340 }
1341
1342#ifdef HAVE_TPETRA_MMM_TIMINGS
1343 MM = Teuchos::null;
1344 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1345#endif
1346
1347 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1348
1349 // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1350 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1351 if (needs_final_export) {
1352 Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1353 }
1354 else {
1355 Ctemp = rcp (&C, false);
1356 }
1357
1358 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1359
1360 /*************************************************************/
1361 /* 4) exportAndFillComplete matrix */
1362 /*************************************************************/
1363#ifdef HAVE_TPETRA_MMM_TIMINGS
1364 MM = Teuchos::null;
1365 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1366#endif
1367
1368 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1369
1370 if (needs_final_export) {
1371 ParameterList labelList;
1372 labelList.set("Timer Label", label);
1373 if(!params.is_null()) {
1374 ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1375 ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1376 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1377 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1378 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1379 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1380 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1381 bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1382 bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1383
1384 labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1385 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1386 labelList.set("compute global constants",params->get("compute global constants",true));
1387 labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1388 }
1389 Ctemp->exportAndFillComplete (Crcp,
1390 *Ctemp->getGraph ()->getExporter (),
1391 B.getDomainMap (),
1392 A.getDomainMap (),
1393 rcp (&labelList, false));
1394 }
1395#ifdef HAVE_TPETRA_MMM_STATISTICS
1396 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1397#endif
1398}
1399
1400/*********************************************************************************************************/
1401// Kernel method for computing the local portion of C = A*B
1402template<class Scalar,
1403 class LocalOrdinal,
1404 class GlobalOrdinal,
1405 class Node>
1406void mult_A_B(
1407 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1408 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1409 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1410 const std::string& /* label */,
1411 const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1412{
1413 using Teuchos::Array;
1414 using Teuchos::ArrayRCP;
1415 using Teuchos::ArrayView;
1416 using Teuchos::OrdinalTraits;
1417 using Teuchos::null;
1418
1419 typedef Teuchos::ScalarTraits<Scalar> STS;
1420 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1421 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1422 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1423
1424 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1425 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1426
1427 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1428 ArrayView<const GlobalOrdinal> bcols_import = null;
1429 if (Bview.importColMap != null) {
1430 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1431 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1432
1433 bcols_import = Bview.importColMap->getLocalElementList();
1434 }
1435
1436 size_t C_numCols = C_lastCol - C_firstCol +
1437 OrdinalTraits<LocalOrdinal>::one();
1438 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1439 OrdinalTraits<LocalOrdinal>::one();
1440
1441 if (C_numCols_import > C_numCols)
1442 C_numCols = C_numCols_import;
1443
1444 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1445 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1446 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1447
1448 Array<Scalar> C_row_i = dwork;
1449 Array<GlobalOrdinal> C_cols = iwork;
1450 Array<size_t> c_index = iwork2;
1451 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1452 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1453
1454 size_t C_row_i_length, j, k, last_index;
1455
1456 // Run through all the hash table lookups once and for all
1457 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1458 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1459 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1460 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1461 // Maps are the same: Use local IDs as the hash
1462 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1463 Aview.colMap->getMaxLocalIndex(); i++)
1464 Acol2Brow[i]=i;
1465 }
1466 else {
1467 // Maps are not the same: Use the map's hash
1468 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1469 Aview.colMap->getMaxLocalIndex(); i++) {
1470 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1471 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1472 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1473 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1474 }
1475 }
1476
1477 // To form C = A*B we're going to execute this expression:
1478 //
1479 // C(i,j) = sum_k( A(i,k)*B(k,j) )
1480 //
1481 // Our goal, of course, is to navigate the data in A and B once, without
1482 // performing searches for column-indices, etc.
1483 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1484 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1485 auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1486 auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1487 auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1488 auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1489 decltype(Browptr) Irowptr;
1490 decltype(Bcolind) Icolind;
1491 decltype(Bvals) Ivals;
1492 if(!Bview.importMatrix.is_null()) {
1493 Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1494 Icolind = Bview.importMatrix->getLocalIndicesHost();
1495 Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1496 }
1497
1498 bool C_filled = C.isFillComplete();
1499
1500 for (size_t i = 0; i < C_numCols; i++)
1501 c_index[i] = OrdinalTraits<size_t>::invalid();
1502
1503 // Loop over the rows of A.
1504 size_t Arows = Aview.rowMap->getLocalNumElements();
1505 for(size_t i=0; i<Arows; ++i) {
1506
1507 // Only navigate the local portion of Aview... which is, thankfully, all of
1508 // A since this routine doesn't do transpose modes
1509 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1510
1511 // Loop across the i-th row of A and for each corresponding row in B, loop
1512 // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1513 // quantities C_row_i. In other words, as we stride across B(k,:) we're
1514 // calculating updates for row i of the result matrix C.
1515 C_row_i_length = OrdinalTraits<size_t>::zero();
1516
1517 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1518 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1519 const Scalar Aval = Avals[k];
1520 if (Aval == STS::zero())
1521 continue;
1522
1523 if (Ak == LO_INVALID)
1524 continue;
1525
1526 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1527 LocalOrdinal col = Bcolind[j];
1528 //assert(col >= 0 && col < C_numCols);
1529
1530 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1531 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1532 // This has to be a += so insertGlobalValue goes out
1533 C_row_i[C_row_i_length] = Aval*Bvals[j];
1534 C_cols[C_row_i_length] = col;
1535 c_index[col] = C_row_i_length;
1536 C_row_i_length++;
1537
1538 } else {
1539 // static cast from impl_scalar_type to Scalar needed for complex
1540 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1541 }
1542 }
1543 }
1544
1545 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1546 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1547 C_cols[ii] = bcols[C_cols[ii]];
1548 combined_index[ii] = C_cols[ii];
1549 combined_values[ii] = C_row_i[ii];
1550 }
1551 last_index = C_row_i_length;
1552
1553 //
1554 //Now put the C_row_i values into C.
1555 //
1556 // We might have to revamp this later.
1557 C_row_i_length = OrdinalTraits<size_t>::zero();
1558
1559 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1560 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1561 const Scalar Aval = Avals[k];
1562 if (Aval == STS::zero())
1563 continue;
1564
1565 if (Ak!=LO_INVALID) continue;
1566
1567 Ak = Acol2Irow[Acolind[k]];
1568 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1569 LocalOrdinal col = Icolind[j];
1570 //assert(col >= 0 && col < C_numCols);
1571
1572 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1573 //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1574 // This has to be a += so insertGlobalValue goes out
1575 C_row_i[C_row_i_length] = Aval*Ivals[j];
1576 C_cols[C_row_i_length] = col;
1577 c_index[col] = C_row_i_length;
1578 C_row_i_length++;
1579
1580 } else {
1581 // This has to be a += so insertGlobalValue goes out
1582 // static cast from impl_scalar_type to Scalar needed for complex
1583 C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1584 }
1585 }
1586 }
1587
1588 for (size_t ii = 0; ii < C_row_i_length; ii++) {
1589 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1590 C_cols[ii] = bcols_import[C_cols[ii]];
1591 combined_index[last_index] = C_cols[ii];
1592 combined_values[last_index] = C_row_i[ii];
1593 last_index++;
1594 }
1595
1596 // Now put the C_row_i values into C.
1597 // We might have to revamp this later.
1598 C_filled ?
1599 C.sumIntoGlobalValues(
1600 global_row,
1601 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1602 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1603 :
1604 C.insertGlobalValues(
1605 global_row,
1606 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1607 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1608
1609 }
1610}
1611
1612/*********************************************************************************************************/
1613template<class Scalar,
1614 class LocalOrdinal,
1615 class GlobalOrdinal,
1616 class Node>
1617void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1618 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1619 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1620
1621 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1622 Mview.maxNumRowEntries = Mview.indices[0].size();
1623
1624 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1625 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1626 Mview.maxNumRowEntries = Mview.indices[i].size();
1627 }
1628}
1629
1630/*********************************************************************************************************/
1631template<class CrsMatrixType>
1632size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1633 // Follows the NZ estimate in ML's ml_matmatmult.c
1634 size_t Aest = 100, Best=100;
1635 if (A.getLocalNumEntries() >= A.getLocalNumRows())
1636 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1637 if (B.getLocalNumEntries() >= B.getLocalNumRows())
1638 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1639
1640 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1641 nnzperrow *= nnzperrow;
1642
1643 return (size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1644}
1645
1646
1647/*********************************************************************************************************/
1648// Kernel method for computing the local portion of C = A*B for CrsMatrix
1649//
1650// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1651// function, so this is probably the function we want to
1652// thread-parallelize.
1653template<class Scalar,
1654 class LocalOrdinal,
1655 class GlobalOrdinal,
1656 class Node>
1657void mult_A_B_newmatrix(
1658 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1659 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1660 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1661 const std::string& label,
1662 const Teuchos::RCP<Teuchos::ParameterList>& params)
1663{
1664 using Teuchos::Array;
1665 using Teuchos::ArrayRCP;
1666 using Teuchos::ArrayView;
1667 using Teuchos::RCP;
1668 using Teuchos::rcp;
1669
1670 // Tpetra typedefs
1671 typedef LocalOrdinal LO;
1672 typedef GlobalOrdinal GO;
1673 typedef Node NO;
1674 typedef Import<LO,GO,NO> import_type;
1675 typedef Map<LO,GO,NO> map_type;
1676
1677 // Kokkos typedefs
1678 typedef typename map_type::local_map_type local_map_type;
1680 typedef typename KCRS::StaticCrsGraphType graph_t;
1681 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1682 typedef typename NO::execution_space execution_space;
1683 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1684 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1685
1686#ifdef HAVE_TPETRA_MMM_TIMINGS
1687 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1688 using Teuchos::TimeMonitor;
1689 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1690#endif
1691 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1692
1693 // Build the final importer / column map, hash table lookups for C
1694 RCP<const import_type> Cimport;
1695 RCP<const map_type> Ccolmap;
1696 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1697 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1698 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1699 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1700 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1701 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1702 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1703 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1704
1705 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1706 // indices of B, to local column indices of C. (B and C have the
1707 // same number of columns.) The kernel uses this, instead of
1708 // copying the entire input matrix B and converting its column
1709 // indices to those of C.
1710 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1711
1712 if (Bview.importMatrix.is_null()) {
1713 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1714 Cimport = Bimport;
1715 Ccolmap = Bview.colMap;
1716 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1717 // Bcol2Ccol is trivial
1718 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1719 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1720 KOKKOS_LAMBDA(const LO i) {
1721 Bcol2Ccol(i) = i;
1722 });
1723 }
1724 else {
1725 // mfh 27 Sep 2016: B has "remotes," so we need to build the
1726 // column Map of C, as well as C's Import object (from its domain
1727 // Map to its column Map). C's column Map is the union of the
1728 // column Maps of (the local part of) B, and the "remote" part of
1729 // B. Ditto for the Import. We have optimized this "setUnion"
1730 // operation on Import objects and Maps.
1731
1732 // Choose the right variant of setUnion
1733 if (!Bimport.is_null() && !Iimport.is_null()) {
1734 Cimport = Bimport->setUnion(*Iimport);
1735 }
1736 else if (!Bimport.is_null() && Iimport.is_null()) {
1737 Cimport = Bimport->setUnion();
1738 }
1739 else if (Bimport.is_null() && !Iimport.is_null()) {
1740 Cimport = Iimport->setUnion();
1741 }
1742 else {
1743 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1744 }
1745 Ccolmap = Cimport->getTargetMap();
1746
1747 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1748 // in general. We should get rid of it in order to reduce
1749 // communication costs of sparse matrix-matrix multiply.
1750 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1751 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1752
1753 // NOTE: This is not efficient and should be folded into setUnion
1754 //
1755 // mfh 27 Sep 2016: What the above comment means, is that the
1756 // setUnion operation on Import objects could also compute these
1757 // local index - to - local index look-up tables.
1758 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1759 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1760 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1761 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1762 });
1763 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1764 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1765 });
1766
1767 }
1768
1769 // Replace the column map
1770 //
1771 // mfh 27 Sep 2016: We do this because C was originally created
1772 // without a column Map. Now we have its column Map.
1773 C.replaceColMap(Ccolmap);
1774
1775 // mfh 27 Sep 2016: Construct tables that map from local column
1776 // indices of A, to local row indices of either B_local (the locally
1777 // owned part of B), or B_remote (the "imported" remote part of B).
1778 //
1779 // For column index Aik in row i of A, if the corresponding row of B
1780 // exists in the local part of B ("orig") (which I'll call B_local),
1781 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1782 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1783 //
1784 // For column index Aik in row i of A, if the corresponding row of B
1785 // exists in the remote part of B ("Import") (which I'll call
1786 // B_remote), then targetMapToImportRow[Aik] is the local index of
1787 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1788 // (a flag value).
1789
1790 // Run through all the hash table lookups once and for all
1791 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1792 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1793
1794 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1795 GO aidx = Acolmap_local.getGlobalElement(i);
1796 LO B_LID = Browmap_local.getLocalElement(aidx);
1797 if (B_LID != LO_INVALID) {
1798 targetMapToOrigRow(i) = B_LID;
1799 targetMapToImportRow(i) = LO_INVALID;
1800 } else {
1801 LO I_LID = Irowmap_local.getLocalElement(aidx);
1802 targetMapToOrigRow(i) = LO_INVALID;
1803 targetMapToImportRow(i) = I_LID;
1804
1805 }
1806 });
1807
1808 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1809 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1810 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1811
1812}
1813
1814/*********************************************************************************************************/
1815// Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1816template<class Scalar,
1817 class LocalOrdinal,
1818 class GlobalOrdinal,
1819 class Node>
1820void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1821 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1822 Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1823{
1824 using Teuchos::null;
1825 using Teuchos::Array;
1826 using Teuchos::ArrayRCP;
1827 using Teuchos::ArrayView;
1828 using Teuchos::RCP;
1829 using Teuchos::rcp;
1830
1831 // Tpetra typedefs
1832 typedef LocalOrdinal LO;
1833 typedef GlobalOrdinal GO;
1834 typedef Node NO;
1835 typedef Import<LO,GO,NO> import_type;
1836 typedef Map<LO,GO,NO> map_type;
1837 typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1838 typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1839
1840 // Kokkos typedefs
1841 typedef typename map_type::local_map_type local_map_type;
1842 typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1843 typedef typename KBSR::device_type device_t;
1844 typedef typename KBSR::StaticCrsGraphType static_graph_t;
1845 typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1846 typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1847 typedef typename KBSR::values_type::non_const_type scalar_view_t;
1848 typedef typename NO::execution_space execution_space;
1849 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1850 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1851
1852 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1853
1854 // Build the final importer / column map, hash table lookups for C
1855 RCP<const import_type> Cimport;
1856 RCP<const map_type> Ccolmap;
1857 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1858 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1859 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1860 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1861 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1862 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1863 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1864 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1865
1866 // Bcol2Ccol is a table that maps from local column
1867 // indices of B, to local column indices of C. (B and C have the
1868 // same number of columns.) The kernel uses this, instead of
1869 // copying the entire input matrix B and converting its column
1870 // indices to those of C.
1871 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1872
1873 if (Bview.importMatrix.is_null()) {
1874 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1875 Cimport = Bimport;
1876 Ccolmap = Bview.colMap;
1877 const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1878 // Bcol2Ccol is trivial
1879 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1880 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1881 KOKKOS_LAMBDA(const LO i) {
1882 Bcol2Ccol(i) = i;
1883 });
1884 }
1885 else {
1886 // B has "remotes," so we need to build the
1887 // column Map of C, as well as C's Import object (from its domain
1888 // Map to its column Map). C's column Map is the union of the
1889 // column Maps of (the local part of) B, and the "remote" part of
1890 // B. Ditto for the Import. We have optimized this "setUnion"
1891 // operation on Import objects and Maps.
1892
1893 // Choose the right variant of setUnion
1894 if (!Bimport.is_null() && !Iimport.is_null()) {
1895 Cimport = Bimport->setUnion(*Iimport);
1896 }
1897 else if (!Bimport.is_null() && Iimport.is_null()) {
1898 Cimport = Bimport->setUnion();
1899 }
1900 else if (Bimport.is_null() && !Iimport.is_null()) {
1901 Cimport = Iimport->setUnion();
1902 }
1903 else {
1904 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1905 }
1906 Ccolmap = Cimport->getTargetMap();
1907
1908 // NOTE: This is not efficient and should be folded into setUnion
1909 //
1910 // What the above comment means, is that the
1911 // setUnion operation on Import objects could also compute these
1912 // local index - to - local index look-up tables.
1913 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1914 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1915 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1916 range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1917 KOKKOS_LAMBDA(const LO i) {
1918 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1919 });
1920 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1921 range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1922 KOKKOS_LAMBDA(const LO i) {
1923 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1924 });
1925 }
1926
1927 // Construct tables that map from local column
1928 // indices of A, to local row indices of either B_local (the locally
1929 // owned part of B), or B_remote (the "imported" remote part of B).
1930 //
1931 // For column index Aik in row i of A, if the corresponding row of B
1932 // exists in the local part of B ("orig") (which I'll call B_local),
1933 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1934 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1935 //
1936 // For column index Aik in row i of A, if the corresponding row of B
1937 // exists in the remote part of B ("Import") (which I'll call
1938 // B_remote), then targetMapToImportRow[Aik] is the local index of
1939 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1940 // (a flag value).
1941
1942 // Run through all the hash table lookups once and for all
1943 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1944 Aview.colMap->getLocalNumElements());
1945 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1946 Aview.colMap->getLocalNumElements());
1947
1948 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",
1949 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1950 KOKKOS_LAMBDA(const LO i) {
1951 GO aidx = Acolmap_local.getGlobalElement(i);
1952 LO B_LID = Browmap_local.getLocalElement(aidx);
1953 if (B_LID != LO_INVALID) {
1954 targetMapToOrigRow(i) = B_LID;
1955 targetMapToImportRow(i) = LO_INVALID;
1956 } else {
1957 LO I_LID = Irowmap_local.getLocalElement(aidx);
1958 targetMapToOrigRow(i) = LO_INVALID;
1959 targetMapToImportRow(i) = I_LID;
1960 }
1961 });
1962
1963 // Create the KernelHandle
1964 using KernelHandle =
1965 KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1966 typename lno_nnz_view_t::const_value_type,
1967 typename scalar_view_t::const_value_type,
1968 typename device_t::execution_space,
1969 typename device_t::memory_space,
1970 typename device_t::memory_space>;
1971 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1972 std::string myalg("SPGEMM_KK_MEMORY");
1973 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1974
1975 KernelHandle kh;
1976 kh.create_spgemm_handle(alg_enum);
1977 kh.set_team_work_size(team_work_size);
1978
1979 // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1980 const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1981 const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
1982 targetMapToOrigRow,targetMapToImportRow,
1983 Bcol2Ccol,Icol2Ccol,
1984 Ccolmap.getConst()->getLocalNumElements());
1985
1986 RCP<graph_t> graphC;
1987 typename KBSR::values_type values;
1988 {
1989 // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1990 // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1991 KBSR Cmat;
1992 KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1993 KokkosSparse::block_spgemm_numeric (kh, Amat, false, Bmerged, false, Cmat);
1994 kh.destroy_spgemm_handle();
1995
1996 // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1997 graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1998 values = Cmat.values;
1999 }
2000 C = rcp (new block_crs_matrix_type (*graphC, values, Aview.blocksize));
2001
2002}
2003
2004/*********************************************************************************************************/
2005// AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
2006template<class Scalar,
2007 class LocalOrdinal,
2008 class GlobalOrdinal,
2009 class Node,
2010 class LocalOrdinalViewType>
2011void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2012 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2013 const LocalOrdinalViewType & targetMapToOrigRow,
2014 const LocalOrdinalViewType & targetMapToImportRow,
2015 const LocalOrdinalViewType & Bcol2Ccol,
2016 const LocalOrdinalViewType & Icol2Ccol,
2017 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2018 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2019 const std::string& label,
2020 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2021#ifdef HAVE_TPETRA_MMM_TIMINGS
2022 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2023 using Teuchos::TimeMonitor;
2024 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
2025#endif
2026
2027 using Teuchos::Array;
2028 using Teuchos::ArrayRCP;
2029 using Teuchos::ArrayView;
2030 using Teuchos::RCP;
2031 using Teuchos::rcp;
2032
2033 // Lots and lots of typedefs
2034 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2035 typedef typename KCRS::StaticCrsGraphType graph_t;
2036 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2037 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2038 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2039 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2040
2041 typedef Scalar SC;
2042 typedef LocalOrdinal LO;
2043 typedef GlobalOrdinal GO;
2044 typedef Node NO;
2045 typedef Map<LO,GO,NO> map_type;
2046 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2047 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2048 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2049
2050 // Sizes
2051 RCP<const map_type> Ccolmap = C.getColMap();
2052 size_t m = Aview.origMatrix->getLocalNumRows();
2053 size_t n = Ccolmap->getLocalNumElements();
2054 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2055
2056 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2057 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2058 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2059
2060 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2061 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2062 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2063
2064 c_lno_view_t Irowptr;
2065 lno_nnz_view_t Icolind;
2066 scalar_view_t Ivals;
2067 if(!Bview.importMatrix.is_null()) {
2068 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2069 Irowptr = lclB.graph.row_map;
2070 Icolind = lclB.graph.entries;
2071 Ivals = lclB.values;
2072 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2073 }
2074
2075#ifdef HAVE_TPETRA_MMM_TIMINGS
2076 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2077#endif
2078
2079 // Classic csr assembly (low memory edition)
2080 //
2081 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2082 // The method loops over rows of A, and may resize after processing
2083 // each row. Chris Siefert says that this reflects experience in
2084 // ML; for the non-threaded case, ML found it faster to spend less
2085 // effort on estimation and risk an occasional reallocation.
2086 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2087 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2088 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2089 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2090
2091 // mfh 27 Sep 2016: The c_status array is an implementation detail
2092 // of the local sparse matrix-matrix multiply routine.
2093
2094 // The status array will contain the index into colind where this entry was last deposited.
2095 // c_status[i] < CSR_ip - not in the row yet
2096 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2097 // We start with this filled with INVALID's indicating that there are no entries yet.
2098 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2099 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2100 std::vector<size_t> c_status(n, ST_INVALID);
2101
2102 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2103 // routine. The routine computes C := A * (B_local + B_remote).
2104 //
2105 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2106 // you whether the corresponding row of B belongs to B_local
2107 // ("orig") or B_remote ("Import").
2108
2109 // For each row of A/C
2110 size_t CSR_ip = 0, OLD_ip = 0;
2111 for (size_t i = 0; i < m; i++) {
2112 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2113 // on the calling process.
2114 Crowptr[i] = CSR_ip;
2115
2116 // mfh 27 Sep 2016: For each entry of A in the current row of A
2117 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2118 LO Aik = Acolind[k]; // local column index of current entry of A
2119 const SC Aval = Avals[k]; // value of current entry of A
2120 if (Aval == SC_ZERO)
2121 continue; // skip explicitly stored zero values in A
2122
2123 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2124 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2125 // corresponding to the current entry of A is populated, then
2126 // the corresponding row of B is in B_local (i.e., it lives on
2127 // the calling process).
2128
2129 // Local matrix
2130 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2131
2132 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2133 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2134 LO Bkj = Bcolind[j];
2135 LO Cij = Bcol2Ccol[Bkj];
2136
2137 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2138 // New entry
2139 c_status[Cij] = CSR_ip;
2140 Ccolind[CSR_ip] = Cij;
2141 Cvals[CSR_ip] = Aval*Bvals[j];
2142 CSR_ip++;
2143
2144 } else {
2145 Cvals[c_status[Cij]] += Aval*Bvals[j];
2146 }
2147 }
2148
2149 } else {
2150 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2151 // corresponding to the current entry of A NOT populated (has
2152 // a flag "invalid" value), then the corresponding row of B is
2153 // in B_local (i.e., it lives on the calling process).
2154
2155 // Remote matrix
2156 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2157 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2158 LO Ikj = Icolind[j];
2159 LO Cij = Icol2Ccol[Ikj];
2160
2161 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2162 // New entry
2163 c_status[Cij] = CSR_ip;
2164 Ccolind[CSR_ip] = Cij;
2165 Cvals[CSR_ip] = Aval*Ivals[j];
2166 CSR_ip++;
2167 } else {
2168 Cvals[c_status[Cij]] += Aval*Ivals[j];
2169 }
2170 }
2171 }
2172 }
2173
2174 // Resize for next pass if needed
2175 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2176 CSR_alloc *= 2;
2177 Kokkos::resize(Ccolind,CSR_alloc);
2178 Kokkos::resize(Cvals,CSR_alloc);
2179 }
2180 OLD_ip = CSR_ip;
2181 }
2182
2183 Crowptr[m] = CSR_ip;
2184
2185 // Downward resize
2186 Kokkos::resize(Ccolind,CSR_ip);
2187 Kokkos::resize(Cvals,CSR_ip);
2188
2189#ifdef HAVE_TPETRA_MMM_TIMINGS
2190 {
2191 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2192#endif
2193
2194 // Final sort & set of CRS arrays
2195 if (params.is_null() || params->get("sort entries",true))
2196 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2197 C.setAllValues(Crowptr,Ccolind, Cvals);
2198
2199
2200#ifdef HAVE_TPETRA_MMM_TIMINGS
2201 }
2202 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2203 {
2204#endif
2205
2206 // Final FillComplete
2207 //
2208 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2209 // Import (from domain Map to column Map) construction (which costs
2210 // lots of communication) by taking the previously constructed
2211 // Import object. We should be able to do this without interfering
2212 // with the implementation of the local part of sparse matrix-matrix
2213 // multply above.
2214 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2215 labelList->set("Timer Label",label);
2216 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2217 RCP<const Export<LO,GO,NO> > dummyExport;
2218 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2219#ifdef HAVE_TPETRA_MMM_TIMINGS
2220 }
2221 MM2 = Teuchos::null;
2222#endif
2223
2224}
2225/*********************************************************************************************************/
2226// Kernel method for computing the local portion of C = A*B (reuse)
2227template<class Scalar,
2228 class LocalOrdinal,
2229 class GlobalOrdinal,
2230 class Node>
2231void mult_A_B_reuse(
2232 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2233 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2234 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2235 const std::string& label,
2236 const Teuchos::RCP<Teuchos::ParameterList>& params)
2237{
2238 using Teuchos::Array;
2239 using Teuchos::ArrayRCP;
2240 using Teuchos::ArrayView;
2241 using Teuchos::RCP;
2242 using Teuchos::rcp;
2243
2244 // Tpetra typedefs
2245 typedef LocalOrdinal LO;
2246 typedef GlobalOrdinal GO;
2247 typedef Node NO;
2248 typedef Import<LO,GO,NO> import_type;
2249 typedef Map<LO,GO,NO> map_type;
2250
2251 // Kokkos typedefs
2252 typedef typename map_type::local_map_type local_map_type;
2254 typedef typename KCRS::StaticCrsGraphType graph_t;
2255 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2256 typedef typename NO::execution_space execution_space;
2257 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2258 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2259
2260#ifdef HAVE_TPETRA_MMM_TIMINGS
2261 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2262 using Teuchos::TimeMonitor;
2263 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2264#endif
2265 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2266
2267 // Grab all the maps
2268 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2269 RCP<const map_type> Ccolmap = C.getColMap();
2270 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2271 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2272 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2273 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2274 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2275 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276
2277 // Build the final importer / column map, hash table lookups for C
2278 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2279 {
2280 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2281 // So, column map of C may be a strict subset of the column map of B
2282 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2283 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2284 });
2285
2286 if (!Bview.importMatrix.is_null()) {
2287 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2288 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2289
2290 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2291 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2292 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2293 });
2294 }
2295 }
2296
2297 // Run through all the hash table lookups once and for all
2298 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2299 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2300 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2301 GO aidx = Acolmap_local.getGlobalElement(i);
2302 LO B_LID = Browmap_local.getLocalElement(aidx);
2303 if (B_LID != LO_INVALID) {
2304 targetMapToOrigRow(i) = B_LID;
2305 targetMapToImportRow(i) = LO_INVALID;
2306 } else {
2307 LO I_LID = Irowmap_local.getLocalElement(aidx);
2308 targetMapToOrigRow(i) = LO_INVALID;
2309 targetMapToImportRow(i) = I_LID;
2310
2311 }
2312 });
2313
2314 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2315 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2316 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2317}
2318
2319/*********************************************************************************************************/
2320template<class Scalar,
2321 class LocalOrdinal,
2322 class GlobalOrdinal,
2323 class Node,
2324 class LocalOrdinalViewType>
2325void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2326 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2327 const LocalOrdinalViewType & targetMapToOrigRow,
2328 const LocalOrdinalViewType & targetMapToImportRow,
2329 const LocalOrdinalViewType & Bcol2Ccol,
2330 const LocalOrdinalViewType & Icol2Ccol,
2331 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2332 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2333 const std::string& label,
2334 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2335#ifdef HAVE_TPETRA_MMM_TIMINGS
2336 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2337 using Teuchos::TimeMonitor;
2338 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2339 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2340#else
2341 (void)label;
2342#endif
2343 using Teuchos::RCP;
2344 using Teuchos::rcp;
2345
2346
2347 // Lots and lots of typedefs
2348 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2349 typedef typename KCRS::StaticCrsGraphType graph_t;
2350 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2351 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2352 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2353
2354 typedef Scalar SC;
2355 typedef LocalOrdinal LO;
2356 typedef GlobalOrdinal GO;
2357 typedef Node NO;
2358 typedef Map<LO,GO,NO> map_type;
2359 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2360 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2361 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2362
2363 // Sizes
2364 RCP<const map_type> Ccolmap = C.getColMap();
2365 size_t m = Aview.origMatrix->getLocalNumRows();
2366 size_t n = Ccolmap->getLocalNumElements();
2367
2368 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2369 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2370 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2371 const KCRS & Cmat = C.getLocalMatrixHost();
2372
2373 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2374 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2375 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2376 scalar_view_t Cvals = Cmat.values;
2377
2378 c_lno_view_t Irowptr;
2379 lno_nnz_view_t Icolind;
2380 scalar_view_t Ivals;
2381 if(!Bview.importMatrix.is_null()) {
2382 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2383 Irowptr = lclB.graph.row_map;
2384 Icolind = lclB.graph.entries;
2385 Ivals = lclB.values;
2386 }
2387
2388#ifdef HAVE_TPETRA_MMM_TIMINGS
2389 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2390#endif
2391
2392 // Classic csr assembly (low memory edition)
2393 // mfh 27 Sep 2016: The c_status array is an implementation detail
2394 // of the local sparse matrix-matrix multiply routine.
2395
2396 // The status array will contain the index into colind where this entry was last deposited.
2397 // c_status[i] < CSR_ip - not in the row yet
2398 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2399 // We start with this filled with INVALID's indicating that there are no entries yet.
2400 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2401 std::vector<size_t> c_status(n, ST_INVALID);
2402
2403 // For each row of A/C
2404 size_t CSR_ip = 0, OLD_ip = 0;
2405 for (size_t i = 0; i < m; i++) {
2406 // First fill the c_status array w/ locations where we're allowed to
2407 // generate nonzeros for this row
2408 OLD_ip = Crowptr[i];
2409 CSR_ip = Crowptr[i+1];
2410 for (size_t k = OLD_ip; k < CSR_ip; k++) {
2411 c_status[Ccolind[k]] = k;
2412
2413 // Reset values in the row of C
2414 Cvals[k] = SC_ZERO;
2415 }
2416
2417 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2418 LO Aik = Acolind[k];
2419 const SC Aval = Avals[k];
2420 if (Aval == SC_ZERO)
2421 continue;
2422
2423 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2424 // Local matrix
2425 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2426
2427 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2428 LO Bkj = Bcolind[j];
2429 LO Cij = Bcol2Ccol[Bkj];
2430
2431 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2432 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2433 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2434
2435 Cvals[c_status[Cij]] += Aval * Bvals[j];
2436 }
2437
2438 } else {
2439 // Remote matrix
2440 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2441 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2442 LO Ikj = Icolind[j];
2443 LO Cij = Icol2Ccol[Ikj];
2444
2445 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2446 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2447 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2448
2449 Cvals[c_status[Cij]] += Aval * Ivals[j];
2450 }
2451 }
2452 }
2453 }
2454
2455#ifdef HAVE_TPETRA_MMM_TIMINGS
2456 auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2457#endif
2458
2459 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2460}
2461
2462
2463/*********************************************************************************************************/
2464// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2465template<class Scalar,
2466 class LocalOrdinal,
2467 class GlobalOrdinal,
2468 class Node>
2469void jacobi_A_B_newmatrix(
2470 Scalar omega,
2471 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2472 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2473 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2474 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2475 const std::string& label,
2476 const Teuchos::RCP<Teuchos::ParameterList>& params)
2477{
2478 using Teuchos::Array;
2479 using Teuchos::ArrayRCP;
2480 using Teuchos::ArrayView;
2481 using Teuchos::RCP;
2482 using Teuchos::rcp;
2483 // typedef Scalar SC;
2484 typedef LocalOrdinal LO;
2485 typedef GlobalOrdinal GO;
2486 typedef Node NO;
2487
2488 typedef Import<LO,GO,NO> import_type;
2489 typedef Map<LO,GO,NO> map_type;
2490 typedef typename map_type::local_map_type local_map_type;
2491
2492 // All of the Kokkos typedefs
2494 typedef typename KCRS::StaticCrsGraphType graph_t;
2495 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2496 typedef typename NO::execution_space execution_space;
2497 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2498 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2499
2500
2501#ifdef HAVE_TPETRA_MMM_TIMINGS
2502 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2503 using Teuchos::TimeMonitor;
2504 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2505#endif
2506 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2507
2508 // Build the final importer / column map, hash table lookups for C
2509 RCP<const import_type> Cimport;
2510 RCP<const map_type> Ccolmap;
2511 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2512 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2513 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2514 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2515 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2516 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2517 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2518 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2519
2520 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2521 // indices of B, to local column indices of C. (B and C have the
2522 // same number of columns.) The kernel uses this, instead of
2523 // copying the entire input matrix B and converting its column
2524 // indices to those of C.
2525 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2526
2527 if (Bview.importMatrix.is_null()) {
2528 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2529 Cimport = Bimport;
2530 Ccolmap = Bview.colMap;
2531 // Bcol2Ccol is trivial
2532 // Bcol2Ccol is trivial
2533
2534 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2535 Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2536 Bcol2Ccol(i) = static_cast<LO> (i);
2537 });
2538 } else {
2539 // mfh 27 Sep 2016: B has "remotes," so we need to build the
2540 // column Map of C, as well as C's Import object (from its domain
2541 // Map to its column Map). C's column Map is the union of the
2542 // column Maps of (the local part of) B, and the "remote" part of
2543 // B. Ditto for the Import. We have optimized this "setUnion"
2544 // operation on Import objects and Maps.
2545
2546 // Choose the right variant of setUnion
2547 if (!Bimport.is_null() && !Iimport.is_null()){
2548 Cimport = Bimport->setUnion(*Iimport);
2549 Ccolmap = Cimport->getTargetMap();
2550
2551 } else if (!Bimport.is_null() && Iimport.is_null()) {
2552 Cimport = Bimport->setUnion();
2553
2554 } else if(Bimport.is_null() && !Iimport.is_null()) {
2555 Cimport = Iimport->setUnion();
2556
2557 } else
2558 throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2559
2560 Ccolmap = Cimport->getTargetMap();
2561
2562 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2563 std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2564
2565 // NOTE: This is not efficient and should be folded into setUnion
2566 //
2567 // mfh 27 Sep 2016: What the above comment means, is that the
2568 // setUnion operation on Import objects could also compute these
2569 // local index - to - local index look-up tables.
2570 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2571 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2572 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2573 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2574 });
2575 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2576 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2577 });
2578
2579 }
2580
2581 // Replace the column map
2582 //
2583 // mfh 27 Sep 2016: We do this because C was originally created
2584 // without a column Map. Now we have its column Map.
2585 C.replaceColMap(Ccolmap);
2586
2587 // mfh 27 Sep 2016: Construct tables that map from local column
2588 // indices of A, to local row indices of either B_local (the locally
2589 // owned part of B), or B_remote (the "imported" remote part of B).
2590 //
2591 // For column index Aik in row i of A, if the corresponding row of B
2592 // exists in the local part of B ("orig") (which I'll call B_local),
2593 // then targetMapToOrigRow[Aik] is the local index of that row of B.
2594 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2595 //
2596 // For column index Aik in row i of A, if the corresponding row of B
2597 // exists in the remote part of B ("Import") (which I'll call
2598 // B_remote), then targetMapToImportRow[Aik] is the local index of
2599 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2600 // (a flag value).
2601
2602 // Run through all the hash table lookups once and for all
2603 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2604 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2605 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2606 GO aidx = Acolmap_local.getGlobalElement(i);
2607 LO B_LID = Browmap_local.getLocalElement(aidx);
2608 if (B_LID != LO_INVALID) {
2609 targetMapToOrigRow(i) = B_LID;
2610 targetMapToImportRow(i) = LO_INVALID;
2611 } else {
2612 LO I_LID = Irowmap_local.getLocalElement(aidx);
2613 targetMapToOrigRow(i) = LO_INVALID;
2614 targetMapToImportRow(i) = I_LID;
2615
2616 }
2617 });
2618
2619 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2620 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2621 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2622
2623}
2624
2625
2626/*********************************************************************************************************/
2627// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2628// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2629
2630template<class Scalar,
2631 class LocalOrdinal,
2632 class GlobalOrdinal,
2633 class Node,
2634 class LocalOrdinalViewType>
2635void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2636 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2637 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2638 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2639 const LocalOrdinalViewType & targetMapToOrigRow,
2640 const LocalOrdinalViewType & targetMapToImportRow,
2641 const LocalOrdinalViewType & Bcol2Ccol,
2642 const LocalOrdinalViewType & Icol2Ccol,
2643 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2644 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2645 const std::string& label,
2646 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2647
2648#ifdef HAVE_TPETRA_MMM_TIMINGS
2649 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2650 using Teuchos::TimeMonitor;
2651 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2652#endif
2653
2654 using Teuchos::Array;
2655 using Teuchos::ArrayRCP;
2656 using Teuchos::ArrayView;
2657 using Teuchos::RCP;
2658 using Teuchos::rcp;
2659
2660 // Lots and lots of typedefs
2661 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2662 typedef typename KCRS::StaticCrsGraphType graph_t;
2663 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2664 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2665 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2666 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2667
2668 // Jacobi-specific
2669 typedef typename scalar_view_t::memory_space scalar_memory_space;
2670
2671 typedef Scalar SC;
2672 typedef LocalOrdinal LO;
2673 typedef GlobalOrdinal GO;
2674 typedef Node NO;
2675
2676 typedef Map<LO,GO,NO> map_type;
2677 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2678 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2679
2680 // Sizes
2681 RCP<const map_type> Ccolmap = C.getColMap();
2682 size_t m = Aview.origMatrix->getLocalNumRows();
2683 size_t n = Ccolmap->getLocalNumElements();
2684 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2685
2686 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2687 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2688 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2689
2690 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2691 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2692 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2693
2694 c_lno_view_t Irowptr;
2695 lno_nnz_view_t Icolind;
2696 scalar_view_t Ivals;
2697 if(!Bview.importMatrix.is_null()) {
2698 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2699 Irowptr = lclB.graph.row_map;
2700 Icolind = lclB.graph.entries;
2701 Ivals = lclB.values;
2702 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2703 }
2704
2705 // Jacobi-specific inner stuff
2706 auto Dvals =
2707 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2708
2709 // Teuchos::ArrayView::operator[].
2710 // The status array will contain the index into colind where this entry was last deposited.
2711 // c_status[i] < CSR_ip - not in the row yet.
2712 // c_status[i] >= CSR_ip, this is the entry where you can find the data
2713 // We start with this filled with INVALID's indicating that there are no entries yet.
2714 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2715 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2716 Array<size_t> c_status(n, ST_INVALID);
2717
2718 // Classic csr assembly (low memory edition)
2719 //
2720 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2721 // The method loops over rows of A, and may resize after processing
2722 // each row. Chris Siefert says that this reflects experience in
2723 // ML; for the non-threaded case, ML found it faster to spend less
2724 // effort on estimation and risk an occasional reallocation.
2725 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2726 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2727 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2728 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2729 size_t CSR_ip = 0, OLD_ip = 0;
2730
2731 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2732
2733 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2734 // routine. The routine computes
2735 //
2736 // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2737 //
2738 // This corresponds to one sweep of (weighted) Jacobi.
2739 //
2740 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2741 // you whether the corresponding row of B belongs to B_local
2742 // ("orig") or B_remote ("Import").
2743
2744 // For each row of A/C
2745 for (size_t i = 0; i < m; i++) {
2746 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2747 // on the calling process.
2748 Crowptr[i] = CSR_ip;
2749 SC minusOmegaDval = -omega*Dvals(i,0);
2750
2751 // Entries of B
2752 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2753 Scalar Bval = Bvals[j];
2754 if (Bval == SC_ZERO)
2755 continue;
2756 LO Bij = Bcolind[j];
2757 LO Cij = Bcol2Ccol[Bij];
2758
2759 // Assume no repeated entries in B
2760 c_status[Cij] = CSR_ip;
2761 Ccolind[CSR_ip] = Cij;
2762 Cvals[CSR_ip] = Bvals[j];
2763 CSR_ip++;
2764 }
2765
2766 // Entries of -omega * Dinv * A * B
2767 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2768 LO Aik = Acolind[k];
2769 const SC Aval = Avals[k];
2770 if (Aval == SC_ZERO)
2771 continue;
2772
2773 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2774 // Local matrix
2775 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2776
2777 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2778 LO Bkj = Bcolind[j];
2779 LO Cij = Bcol2Ccol[Bkj];
2780
2781 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2782 // New entry
2783 c_status[Cij] = CSR_ip;
2784 Ccolind[CSR_ip] = Cij;
2785 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2786 CSR_ip++;
2787
2788 } else {
2789 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2790 }
2791 }
2792
2793 } else {
2794 // Remote matrix
2795 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2796 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2797 LO Ikj = Icolind[j];
2798 LO Cij = Icol2Ccol[Ikj];
2799
2800 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2801 // New entry
2802 c_status[Cij] = CSR_ip;
2803 Ccolind[CSR_ip] = Cij;
2804 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2805 CSR_ip++;
2806 } else {
2807 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2808 }
2809 }
2810 }
2811 }
2812
2813 // Resize for next pass if needed
2814 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2815 CSR_alloc *= 2;
2816 Kokkos::resize(Ccolind,CSR_alloc);
2817 Kokkos::resize(Cvals,CSR_alloc);
2818 }
2819 OLD_ip = CSR_ip;
2820 }
2821 Crowptr[m] = CSR_ip;
2822
2823 // Downward resize
2824 Kokkos::resize(Ccolind,CSR_ip);
2825 Kokkos::resize(Cvals,CSR_ip);
2826
2827 {
2828#ifdef HAVE_TPETRA_MMM_TIMINGS
2829 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2830#endif
2831
2832 // Replace the column map
2833 //
2834 // mfh 27 Sep 2016: We do this because C was originally created
2835 // without a column Map. Now we have its column Map.
2836 C.replaceColMap(Ccolmap);
2837
2838 // Final sort & set of CRS arrays
2839 //
2840 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2841 // matrix-matrix multiply routine sort the entries for us?
2842 // Final sort & set of CRS arrays
2843 if (params.is_null() || params->get("sort entries",true))
2844 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2845 C.setAllValues(Crowptr,Ccolind, Cvals);
2846 }
2847 {
2848#ifdef HAVE_TPETRA_MMM_TIMINGS
2849 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2850#endif
2851
2852 // Final FillComplete
2853 //
2854 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2855 // Import (from domain Map to column Map) construction (which costs
2856 // lots of communication) by taking the previously constructed
2857 // Import object. We should be able to do this without interfering
2858 // with the implementation of the local part of sparse matrix-matrix
2859 // multply above
2860 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2861 labelList->set("Timer Label",label);
2862 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2863 RCP<const Export<LO,GO,NO> > dummyExport;
2864 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2865
2866 }
2867}
2868
2869
2870/*********************************************************************************************************/
2871// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2872template<class Scalar,
2873 class LocalOrdinal,
2874 class GlobalOrdinal,
2875 class Node>
2876void jacobi_A_B_reuse(
2877 Scalar omega,
2878 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2879 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2880 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2881 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2882 const std::string& label,
2883 const Teuchos::RCP<Teuchos::ParameterList>& params)
2884{
2885 using Teuchos::Array;
2886 using Teuchos::ArrayRCP;
2887 using Teuchos::ArrayView;
2888 using Teuchos::RCP;
2889 using Teuchos::rcp;
2890
2891 typedef LocalOrdinal LO;
2892 typedef GlobalOrdinal GO;
2893 typedef Node NO;
2894
2895 typedef Import<LO,GO,NO> import_type;
2896 typedef Map<LO,GO,NO> map_type;
2897
2898 // Kokkos typedefs
2899 typedef typename map_type::local_map_type local_map_type;
2901 typedef typename KCRS::StaticCrsGraphType graph_t;
2902 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2903 typedef typename NO::execution_space execution_space;
2904 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2905 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2906
2907#ifdef HAVE_TPETRA_MMM_TIMINGS
2908 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2909 using Teuchos::TimeMonitor;
2910 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2911#endif
2912 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2913
2914 // Grab all the maps
2915 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2916 RCP<const map_type> Ccolmap = C.getColMap();
2917 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2918 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2919 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2920 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2921 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2922 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2923
2924 // Build the final importer / column map, hash table lookups for C
2925 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2926 {
2927 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2928 // So, column map of C may be a strict subset of the column map of B
2929 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2930 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2931 });
2932
2933 if (!Bview.importMatrix.is_null()) {
2934 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2935 std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2936
2937 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2938 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2939 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2940 });
2941 }
2942 }
2943
2944 // Run through all the hash table lookups once and for all
2945 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2946 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2947 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2948 GO aidx = Acolmap_local.getGlobalElement(i);
2949 LO B_LID = Browmap_local.getLocalElement(aidx);
2950 if (B_LID != LO_INVALID) {
2951 targetMapToOrigRow(i) = B_LID;
2952 targetMapToImportRow(i) = LO_INVALID;
2953 } else {
2954 LO I_LID = Irowmap_local.getLocalElement(aidx);
2955 targetMapToOrigRow(i) = LO_INVALID;
2956 targetMapToImportRow(i) = I_LID;
2957
2958 }
2959 });
2960
2961#ifdef HAVE_TPETRA_MMM_TIMINGS
2962 MM = Teuchos::null;
2963#endif
2964
2965 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2966 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2967 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2968}
2969
2970
2971
2972/*********************************************************************************************************/
2973template<class Scalar,
2974 class LocalOrdinal,
2975 class GlobalOrdinal,
2976 class Node,
2977 class LocalOrdinalViewType>
2978void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2979 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2980 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2981 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2982 const LocalOrdinalViewType & targetMapToOrigRow,
2983 const LocalOrdinalViewType & targetMapToImportRow,
2984 const LocalOrdinalViewType & Bcol2Ccol,
2985 const LocalOrdinalViewType & Icol2Ccol,
2986 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2987 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2988 const std::string& label,
2989 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2990#ifdef HAVE_TPETRA_MMM_TIMINGS
2991 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2992 using Teuchos::TimeMonitor;
2993 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2994 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2995#else
2996 (void)label;
2997#endif
2998 using Teuchos::RCP;
2999 using Teuchos::rcp;
3000
3001 // Lots and lots of typedefs
3002 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
3003 typedef typename KCRS::StaticCrsGraphType graph_t;
3004 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3005 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3006 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3007 typedef typename scalar_view_t::memory_space scalar_memory_space;
3008
3009 typedef Scalar SC;
3010 typedef LocalOrdinal LO;
3011 typedef GlobalOrdinal GO;
3012 typedef Node NO;
3013 typedef Map<LO,GO,NO> map_type;
3014 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3015 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3016 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3017
3018 // Sizes
3019 RCP<const map_type> Ccolmap = C.getColMap();
3020 size_t m = Aview.origMatrix->getLocalNumRows();
3021 size_t n = Ccolmap->getLocalNumElements();
3022
3023 // Grab the Kokkos::SparseCrsMatrices & inner stuff
3024 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
3025 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
3026 const KCRS & Cmat = C.getLocalMatrixHost();
3027
3028 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3029 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3030 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3031 scalar_view_t Cvals = Cmat.values;
3032
3033 c_lno_view_t Irowptr;
3034 lno_nnz_view_t Icolind;
3035 scalar_view_t Ivals;
3036 if(!Bview.importMatrix.is_null()) {
3037 auto lclB = Bview.importMatrix->getLocalMatrixHost();
3038 Irowptr = lclB.graph.row_map;
3039 Icolind = lclB.graph.entries;
3040 Ivals = lclB.values;
3041 }
3042
3043 // Jacobi-specific inner stuff
3044 auto Dvals =
3045 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3046
3047#ifdef HAVE_TPETRA_MMM_TIMINGS
3048 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3049#endif
3050
3051 // The status array will contain the index into colind where this entry was last deposited.
3052 // c_status[i] < CSR_ip - not in the row yet
3053 // c_status[i] >= CSR_ip - this is the entry where you can find the data
3054 // We start with this filled with INVALID's indicating that there are no entries yet.
3055 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3056 std::vector<size_t> c_status(n, ST_INVALID);
3057
3058 // For each row of A/C
3059 size_t CSR_ip = 0, OLD_ip = 0;
3060 for (size_t i = 0; i < m; i++) {
3061
3062 // First fill the c_status array w/ locations where we're allowed to
3063 // generate nonzeros for this row
3064 OLD_ip = Crowptr[i];
3065 CSR_ip = Crowptr[i+1];
3066 for (size_t k = OLD_ip; k < CSR_ip; k++) {
3067 c_status[Ccolind[k]] = k;
3068
3069 // Reset values in the row of C
3070 Cvals[k] = SC_ZERO;
3071 }
3072
3073 SC minusOmegaDval = -omega*Dvals(i,0);
3074
3075 // Entries of B
3076 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3077 Scalar Bval = Bvals[j];
3078 if (Bval == SC_ZERO)
3079 continue;
3080 LO Bij = Bcolind[j];
3081 LO Cij = Bcol2Ccol[Bij];
3082
3083 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3084 std::runtime_error, "Trying to insert a new entry into a static graph");
3085
3086 Cvals[c_status[Cij]] = Bvals[j];
3087 }
3088
3089 // Entries of -omega * Dinv * A * B
3090 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3091 LO Aik = Acolind[k];
3092 const SC Aval = Avals[k];
3093 if (Aval == SC_ZERO)
3094 continue;
3095
3096 if (targetMapToOrigRow[Aik] != LO_INVALID) {
3097 // Local matrix
3098 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3099
3100 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3101 LO Bkj = Bcolind[j];
3102 LO Cij = Bcol2Ccol[Bkj];
3103
3104 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3105 std::runtime_error, "Trying to insert a new entry into a static graph");
3106
3107 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3108 }
3109
3110 } else {
3111 // Remote matrix
3112 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3113 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3114 LO Ikj = Icolind[j];
3115 LO Cij = Icol2Ccol[Ikj];
3116
3117 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3118 std::runtime_error, "Trying to insert a new entry into a static graph");
3119
3120 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3121 }
3122 }
3123 }
3124 }
3125
3126#ifdef HAVE_TPETRA_MMM_TIMINGS
3127 MM2 = Teuchos::null;
3128 MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3129#endif
3130
3131 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3132#ifdef HAVE_TPETRA_MMM_TIMINGS
3133 MM2 = Teuchos::null;
3134 MM = Teuchos::null;
3135#endif
3136
3137}
3138
3139
3140
3141/*********************************************************************************************************/
3142template<class Scalar,
3143 class LocalOrdinal,
3144 class GlobalOrdinal,
3145 class Node>
3146void import_and_extract_views(
3147 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3148 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3149 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3150 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3151 bool userAssertsThereAreNoRemotes,
3152 const std::string& label,
3153 const Teuchos::RCP<Teuchos::ParameterList>& params)
3154{
3155 using Teuchos::Array;
3156 using Teuchos::ArrayView;
3157 using Teuchos::RCP;
3158 using Teuchos::rcp;
3159 using Teuchos::null;
3160
3161 typedef Scalar SC;
3162 typedef LocalOrdinal LO;
3163 typedef GlobalOrdinal GO;
3164 typedef Node NO;
3165
3166 typedef Map<LO,GO,NO> map_type;
3167 typedef Import<LO,GO,NO> import_type;
3168 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3169
3170#ifdef HAVE_TPETRA_MMM_TIMINGS
3171 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3172 using Teuchos::TimeMonitor;
3173 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3174#endif
3175 // The goal of this method is to populate the 'Aview' struct with views of the
3176 // rows of A, including all rows that correspond to elements in 'targetMap'.
3177 //
3178 // If targetMap includes local elements that correspond to remotely-owned rows
3179 // of A, then those remotely-owned rows will be imported into
3180 // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3181 Aview.deleteContents();
3182
3183 Aview.origMatrix = rcp(&A, false);
3184 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3185 Aview.origMatrix->getApplyHelper();
3186 Aview.origRowMap = A.getRowMap();
3187 Aview.rowMap = targetMap;
3188 Aview.colMap = A.getColMap();
3189 Aview.domainMap = A.getDomainMap();
3190 Aview.importColMap = null;
3191 RCP<const map_type> rowMap = A.getRowMap();
3192 const int numProcs = rowMap->getComm()->getSize();
3193
3194 // Short circuit if the user swears there are no remotes (or if we're in serial)
3195 if (userAssertsThereAreNoRemotes || numProcs < 2)
3196 return;
3197
3198 RCP<const import_type> importer;
3199 if (params != null && params->isParameter("importer")) {
3200 importer = params->get<RCP<const import_type> >("importer");
3201
3202 } else {
3203#ifdef HAVE_TPETRA_MMM_TIMINGS
3204 MM = Teuchos::null;
3205 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3206#endif
3207
3208 // Mark each row in targetMap as local or remote, and go ahead and get a view
3209 // for the local rows
3210 RCP<const map_type> remoteRowMap;
3211 size_t numRemote = 0;
3212 int mode = 0;
3213 if (!prototypeImporter.is_null() &&
3214 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3215 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3216 // We have a valid prototype importer --- ask it for the remotes
3217#ifdef HAVE_TPETRA_MMM_TIMINGS
3218 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
3219#endif
3220 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3221 numRemote = prototypeImporter->getNumRemoteIDs();
3222
3223 Array<GO> remoteRows(numRemote);
3224 for (size_t i = 0; i < numRemote; i++)
3225 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3226
3227 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3228 rowMap->getIndexBase(), rowMap->getComm()));
3229 mode = 1;
3230
3231 } else if (prototypeImporter.is_null()) {
3232 // No prototype importer --- count the remotes the hard way
3233#ifdef HAVE_TPETRA_MMM_TIMINGS
3234 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
3235#endif
3236 ArrayView<const GO> rows = targetMap->getLocalElementList();
3237 size_t numRows = targetMap->getLocalNumElements();
3238
3239 Array<GO> remoteRows(numRows);
3240 for(size_t i = 0; i < numRows; ++i) {
3241 const LO mlid = rowMap->getLocalElement(rows[i]);
3242
3243 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3244 remoteRows[numRemote++] = rows[i];
3245 }
3246 remoteRows.resize(numRemote);
3247 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3248 rowMap->getIndexBase(), rowMap->getComm()));
3249 mode = 2;
3250
3251 } else {
3252 // PrototypeImporter is bad. But if we're in serial that's OK.
3253 mode = 3;
3254 }
3255
3256 if (numProcs < 2) {
3257 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3258 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3259 // If only one processor we don't need to import any remote rows, so return.
3260 return;
3261 }
3262
3263 //
3264 // Now we will import the needed remote rows of A, if the global maximum
3265 // value of numRemote is greater than 0.
3266 //
3267#ifdef HAVE_TPETRA_MMM_TIMINGS
3268 MM = Teuchos::null;
3269 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3270#endif
3271
3272 global_size_t globalMaxNumRemote = 0;
3273 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3274
3275 if (globalMaxNumRemote > 0) {
3276#ifdef HAVE_TPETRA_MMM_TIMINGS
3277 MM = Teuchos::null;
3278 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3279#endif
3280 // Create an importer with target-map remoteRowMap and source-map rowMap.
3281 if (mode == 1)
3282 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3283 else if (mode == 2)
3284 importer = rcp(new import_type(rowMap, remoteRowMap));
3285 else
3286 throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3287 }
3288
3289 if (params != null)
3290 params->set("importer", importer);
3291 }
3292
3293 if (importer != null) {
3294#ifdef HAVE_TPETRA_MMM_TIMINGS
3295 MM = Teuchos::null;
3296 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3297#endif
3298
3299 // Now create a new matrix into which we can import the remote rows of A that we need.
3300 Teuchos::ParameterList labelList;
3301 labelList.set("Timer Label", label);
3302 auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3303
3304 bool isMM = true;
3305 bool overrideAllreduce = false;
3306 int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3307 // Minor speedup tweak - avoid computing the global constants
3308 Teuchos::ParameterList params_sublist;
3309 if(!params.is_null()) {
3310 labelList.set("compute global constants", params->get("compute global constants",false));
3311 params_sublist = params->sublist("matrixmatrix: kernel params",false);
3312 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3313 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3314 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3315 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3316 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3317 }
3318 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3319 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3320 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3321
3322 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3323 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3324 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3325 Aview.importMatrix->getApplyHelper();
3326
3327#if 0
3328 // Disabled code for dumping input matrices
3329 static int count=0;
3330 char str[80];
3331 sprintf(str,"import_matrix.%d.dat",count);
3333 count++;
3334#endif
3335
3336#ifdef HAVE_TPETRA_MMM_STATISTICS
3337 printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3338#endif
3339
3340
3341#ifdef HAVE_TPETRA_MMM_TIMINGS
3342 MM = Teuchos::null;
3343 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3344#endif
3345
3346 // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3347 Aview.importColMap = Aview.importMatrix->getColMap();
3348#ifdef HAVE_TPETRA_MMM_TIMINGS
3349 MM = Teuchos::null;
3350#endif
3351 }
3352}
3353
3354/*********************************************************************************************************/
3355template<class Scalar,
3356 class LocalOrdinal,
3357 class GlobalOrdinal,
3358 class Node>
3359void import_and_extract_views(
3360 const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3361 Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3362 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3363 Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3364 bool userAssertsThereAreNoRemotes)
3365{
3366 using Teuchos::Array;
3367 using Teuchos::ArrayView;
3368 using Teuchos::RCP;
3369 using Teuchos::rcp;
3370 using Teuchos::null;
3371
3372 typedef Scalar SC;
3373 typedef LocalOrdinal LO;
3374 typedef GlobalOrdinal GO;
3375 typedef Node NO;
3376
3377 typedef Map<LO,GO,NO> map_type;
3378 typedef Import<LO,GO,NO> import_type;
3379 typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3380
3381 // The goal of this method is to populate the 'Mview' struct with views of the
3382 // rows of M, including all rows that correspond to elements in 'targetMap'.
3383 //
3384 // If targetMap includes local elements that correspond to remotely-owned rows
3385 // of M, then those remotely-owned rows will be imported into
3386 // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3387 Mview.deleteContents();
3388
3389 Mview.origMatrix = rcp(&M, false);
3390 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3391 Mview.origMatrix->getApplyHelper();
3392 Mview.origRowMap = M.getRowMap();
3393 Mview.rowMap = targetMap;
3394 Mview.colMap = M.getColMap();
3395 Mview.importColMap = null;
3396 RCP<const map_type> rowMap = M.getRowMap();
3397 const int numProcs = rowMap->getComm()->getSize();
3398
3399 // Short circuit if the user swears there are no remotes (or if we're in serial)
3400 if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3401
3402 // Mark each row in targetMap as local or remote, and go ahead and get a view
3403 // for the local rows
3404 RCP<const map_type> remoteRowMap;
3405 size_t numRemote = 0;
3406 int mode = 0;
3407 if (!prototypeImporter.is_null() &&
3408 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3409 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3410
3411 // We have a valid prototype importer --- ask it for the remotes
3412 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3413 numRemote = prototypeImporter->getNumRemoteIDs();
3414
3415 Array<GO> remoteRows(numRemote);
3416 for (size_t i = 0; i < numRemote; i++)
3417 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3418
3419 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3420 rowMap->getIndexBase(), rowMap->getComm()));
3421 mode = 1;
3422
3423 } else if (prototypeImporter.is_null()) {
3424
3425 // No prototype importer --- count the remotes the hard way
3426 ArrayView<const GO> rows = targetMap->getLocalElementList();
3427 size_t numRows = targetMap->getLocalNumElements();
3428
3429 Array<GO> remoteRows(numRows);
3430 for(size_t i = 0; i < numRows; ++i) {
3431 const LO mlid = rowMap->getLocalElement(rows[i]);
3432
3433 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3434 remoteRows[numRemote++] = rows[i];
3435 }
3436 remoteRows.resize(numRemote);
3437 remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3438 rowMap->getIndexBase(), rowMap->getComm()));
3439 mode = 2;
3440
3441 } else {
3442 // PrototypeImporter is bad. But if we're in serial that's OK.
3443 mode = 3;
3444 }
3445
3446 if (numProcs < 2) {
3447 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3448 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3449 // If only one processor we don't need to import any remote rows, so return.
3450 return;
3451 }
3452
3453 // Now we will import the needed remote rows of M, if the global maximum
3454 // value of numRemote is greater than 0.
3455 global_size_t globalMaxNumRemote = 0;
3456 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3457
3458 RCP<const import_type> importer;
3459
3460 if (globalMaxNumRemote > 0) {
3461 // Create an importer with target-map remoteRowMap and source-map rowMap.
3462 if (mode == 1)
3463 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3464 else if (mode == 2)
3465 importer = rcp(new import_type(rowMap, remoteRowMap));
3466 else
3467 throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3468 }
3469
3470 if (importer != null) {
3471 // Get import matrix
3472 // TODO: create the int-typed row-pointer here
3473 Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3474 // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3475 Mview.importMatrix->getApplyHelper();
3476 // Save the column map of the imported matrix, so that we can convert indices
3477 // back to global for arithmetic later
3478 Mview.importColMap = Mview.importMatrix->getColMap();
3479 }
3480}
3481
3482/*********************************************************************************************************/
3483 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3484template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3486merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3487 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3488 const LocalOrdinalViewType & Acol2Brow,
3489 const LocalOrdinalViewType & Acol2Irow,
3490 const LocalOrdinalViewType & Bcol2Ccol,
3491 const LocalOrdinalViewType & Icol2Ccol,
3492 const size_t mergedNodeNumCols) {
3493
3494 using Teuchos::RCP;
3496 typedef typename KCRS::StaticCrsGraphType graph_t;
3497 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3498 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3499 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3500 // Grab the Kokkos::SparseCrsMatrices
3501 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3502 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3503
3504 // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3505 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3506 // We do have a Bimport
3507 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3508 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3509 RCP<const KCRS> Ik_;
3510 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3511 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3512 KCRS Iks;
3513 if(Ik!=0) Iks = *Ik;
3514 size_t merge_numrows = Ak.numCols();
3515 // The last entry of this at least, need to be initialized
3516 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3517
3518 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3519
3520 // Use a Kokkos::parallel_scan to build the rowptr
3521 typedef typename Node::execution_space execution_space;
3522 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3523 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3524 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3525 if(final) Mrowptr(i) = update;
3526 // Get the row count
3527 size_t ct=0;
3528 if(Acol2Brow(i)!=LO_INVALID)
3529 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3530 else
3531 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3532 update+=ct;
3533
3534 if(final && i+1==merge_numrows)
3535 Mrowptr(i+1)=update;
3536 });
3537
3538 // Allocate nnz
3539 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3540 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3541 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3542
3543 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3544 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3545 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3546 if(Acol2Brow(i)!=LO_INVALID) {
3547 size_t row = Acol2Brow(i);
3548 size_t start = Bk.graph.row_map(row);
3549 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3550 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3551 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3552 }
3553 }
3554 else {
3555 size_t row = Acol2Irow(i);
3556 size_t start = Iks.graph.row_map(row);
3557 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3558 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3559 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3560 }
3561 }
3562 });
3563
3564 KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3565 return newmat;
3566 }
3567 else {
3568 // We don't have a Bimport (the easy case)
3569 return Bk;
3570 }
3571}//end merge_matrices
3572
3573/*********************************************************************************************************/
3574 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3575template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3576const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3577merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3578 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3579 const LocalOrdinalViewType & Acol2Brow,
3580 const LocalOrdinalViewType & Acol2Irow,
3581 const LocalOrdinalViewType & Bcol2Ccol,
3582 const LocalOrdinalViewType & Icol2Ccol,
3583 const size_t mergedNodeNumCols)
3584{
3585 using Teuchos::RCP;
3586 typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3587 typedef typename KBCRS::StaticCrsGraphType graph_t;
3588 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3589 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3590 typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3591
3592 // Grab the KokkosSparse::BsrMatrix
3593 const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3594 const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3595
3596 // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3597 if(!Bview.importMatrix.is_null() ||
3598 (Bview.importMatrix.is_null() &&
3599 (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3600
3601 // We do have a Bimport
3602 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3603 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3604 RCP<const KBCRS> Ik_;
3605 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3606 const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3607 KBCRS Iks;
3608 if(Ik!=0) Iks = *Ik;
3609 size_t merge_numrows = Ak.numCols();
3610
3611 // The last entry of this at least, need to be initialized
3612 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3613
3614 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3615
3616 // Use a Kokkos::parallel_scan to build the rowptr
3617 typedef typename Node::execution_space execution_space;
3618 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3619 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3620 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3621 if(final) Mrowptr(i) = update;
3622 // Get the row count
3623 size_t ct=0;
3624 if(Acol2Brow(i)!=LO_INVALID)
3625 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3626 else
3627 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3628 update+=ct;
3629
3630 if(final && i+1==merge_numrows)
3631 Mrowptr(i+1)=update;
3632 });
3633
3634 // Allocate nnz
3635 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3636 const int blocksize = Ak.blockDim();
3637 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3638 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz*blocksize*blocksize);
3639
3640 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3641 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3642 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3643 if(Acol2Brow(i)!=LO_INVALID) {
3644 size_t row = Acol2Brow(i);
3645 size_t start = Bk.graph.row_map(row);
3646 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3647 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3648
3649 for (int b=0; b<blocksize*blocksize; ++b) {
3650 const int val_indx = j*blocksize*blocksize + b;
3651 const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3652 Mvalues(val_indx) = Bk.values(b_val_indx);
3653 }
3654 }
3655 }
3656 else {
3657 size_t row = Acol2Irow(i);
3658 size_t start = Iks.graph.row_map(row);
3659 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3660 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3661
3662 for (int b=0; b<blocksize*blocksize; ++b) {
3663 const int val_indx = j*blocksize*blocksize + b;
3664 const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3665 Mvalues(val_indx) = Iks.values(b_val_indx);
3666 }
3667 }
3668 }
3669 });
3670
3671 // Build and return merged KokkosSparse matrix
3672 KBCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3673 return newmat;
3674 }
3675 else {
3676 // We don't have a Bimport (the easy case)
3677 return Bk;
3678 }
3679}//end merge_matrices
3680
3681/*********************************************************************************************************/
3682template<typename SC, typename LO, typename GO, typename NO>
3683void AddKernels<SC, LO, GO, NO>::
3684addSorted(
3685 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3686 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3687 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3688 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3689 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3690 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3691 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3692 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3693#if KOKKOSKERNELS_VERSION >= 40299
3694 GO numGlobalCols,
3695#endif
3696 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3697 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3698 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3699{
3700 using Teuchos::TimeMonitor;
3701 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3702 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3703 auto nrows = Arowptrs.extent(0) - 1;
3704 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3705 typename AddKern::KKH handle;
3706 handle.create_spadd_handle(true);
3707 auto addHandle = handle.get_spadd_handle();
3708#ifdef HAVE_TPETRA_MMM_TIMINGS
3709 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3710#endif
3711 KokkosSparse::Experimental::spadd_symbolic
3712 (&handle,
3713#if KOKKOSKERNELS_VERSION >= 40299
3714 nrows, numGlobalCols,
3715#endif
3716 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3717 //KokkosKernels requires values to be zeroed
3718 Cvals = values_array("C values", addHandle->get_c_nnz());
3719 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3720#ifdef HAVE_TPETRA_MMM_TIMINGS
3721 MM = Teuchos::null;
3722 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3723#endif
3724 KokkosSparse::Experimental::spadd_numeric(&handle,
3725#if KOKKOSKERNELS_VERSION >= 40299
3726 nrows, numGlobalCols,
3727#endif
3728 Arowptrs, Acolinds, Avals, scalarA,
3729 Browptrs, Bcolinds, Bvals, scalarB,
3730 Crowptrs, Ccolinds, Cvals);
3731}
3732
3733template<typename SC, typename LO, typename GO, typename NO>
3734void AddKernels<SC, LO, GO, NO>::
3735addUnsorted(
3736 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3737 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3738 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3739 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3740 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3741 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3742 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3743 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3744#if KOKKOSKERNELS_VERSION >= 40299
3745 GO numGlobalCols,
3746#else
3747 GO /* numGlobalCols */,
3748#endif
3749 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3750 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3751 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3752{
3753 using Teuchos::TimeMonitor;
3754 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3755 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3756 auto nrows = Arowptrs.extent(0) - 1;
3757 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3758 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3759 typename AddKern::KKH handle;
3760 handle.create_spadd_handle(false);
3761 auto addHandle = handle.get_spadd_handle();
3762#ifdef HAVE_TPETRA_MMM_TIMINGS
3763 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3764#endif
3765 KokkosSparse::Experimental::spadd_symbolic
3766 (&handle,
3767#if KOKKOSKERNELS_VERSION >= 40299
3768 nrows, numGlobalCols,
3769#endif
3770 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3771 //Cvals must be zeroed out
3772 Cvals = values_array("C values", addHandle->get_c_nnz());
3773 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3774#ifdef HAVE_TPETRA_MMM_TIMINGS
3775 MM = Teuchos::null;
3776 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3777#endif
3778 KokkosSparse::Experimental::spadd_numeric(&handle,
3779#if KOKKOSKERNELS_VERSION >= 40299
3780 nrows, numGlobalCols,
3781#endif
3782 Arowptrs, Acolinds, Avals, scalarA,
3783 Browptrs, Bcolinds, Bvals, scalarB,
3784 Crowptrs, Ccolinds, Cvals);
3785}
3786
3787template<typename GO,
3788 typename LocalIndicesType,
3789 typename GlobalIndicesType,
3790 typename ColMapType>
3791struct ConvertLocalToGlobalFunctor
3792{
3793 ConvertLocalToGlobalFunctor(
3794 const LocalIndicesType& colindsOrig_,
3795 const GlobalIndicesType& colindsConverted_,
3796 const ColMapType& colmap_) :
3797 colindsOrig (colindsOrig_),
3798 colindsConverted (colindsConverted_),
3799 colmap (colmap_)
3800 {}
3801 KOKKOS_INLINE_FUNCTION void
3802 operator() (const GO i) const
3803 {
3804 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3805 }
3806 LocalIndicesType colindsOrig;
3807 GlobalIndicesType colindsConverted;
3808 ColMapType colmap;
3809};
3810
3811template<typename SC, typename LO, typename GO, typename NO>
3812void MMdetails::AddKernels<SC, LO, GO, NO>::
3813convertToGlobalAndAdd(
3814 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3815 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3816 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3817 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3818 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3819 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3820 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3821 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3822 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3823{
3824 using Teuchos::TimeMonitor;
3825 //Need to use a different KokkosKernelsHandle type than other versions,
3826 //since the ordinals are now GO
3827 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3828 typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3829
3830 const values_array Avals = A.values;
3831 const values_array Bvals = B.values;
3832 const col_inds_array Acolinds = A.graph.entries;
3833 const col_inds_array Bcolinds = B.graph.entries;
3834 auto Arowptrs = A.graph.row_map;
3835 auto Browptrs = B.graph.row_map;
3836 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3837 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3838#ifdef HAVE_TPETRA_MMM_TIMINGS
3839 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3840#endif
3841 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3842 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3843 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3844 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3845 KKH_GO handle;
3846 handle.create_spadd_handle(false);
3847 auto addHandle = handle.get_spadd_handle();
3848#ifdef HAVE_TPETRA_MMM_TIMINGS
3849 MM = Teuchos::null;
3850 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3851#endif
3852 auto nrows = Arowptrs.extent(0) - 1;
3853 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3854 KokkosSparse::Experimental::spadd_symbolic
3855 (&handle,
3856#if KOKKOSKERNELS_VERSION >= 40299
3857 nrows, A.numCols(),
3858#endif
3859 Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3860 Cvals = values_array("C values", addHandle->get_c_nnz());
3861 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3862#ifdef HAVE_TPETRA_MMM_TIMINGS
3863 MM = Teuchos::null;
3864 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3865#endif
3866 KokkosSparse::Experimental::spadd_numeric(&handle,
3867#if KOKKOSKERNELS_VERSION >= 40299
3868 nrows, A.numCols(),
3869#endif
3870 Arowptrs, AcolindsConverted, Avals, scalarA,
3871 Browptrs, BcolindsConverted, Bvals, scalarB,
3872 Crowptrs, Ccolinds, Cvals);
3873}
3874
3875
3876} //End namepsace MMdetails
3877
3878} //End namespace Tpetra
3879
3880/*********************************************************************************************************/
3881//
3882// Explicit instantiation macro
3883//
3884// Must be expanded from within the Tpetra namespace!
3885//
3886namespace Tpetra {
3887
3888#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3889template \
3890 void MatrixMatrix::Multiply( \
3891 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3892 bool transposeA, \
3893 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3894 bool transposeB, \
3895 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3896 bool call_FillComplete_on_result, \
3897 const std::string & label, \
3898 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3899\
3900template \
3901 void MatrixMatrix::Multiply( \
3902 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3903 bool transposeA, \
3904 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3905 bool transposeB, \
3906 Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3907 const std::string & label); \
3908\
3909template \
3910 void MatrixMatrix::Jacobi( \
3911 SCALAR omega, \
3912 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3913 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3914 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3915 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3916 bool call_FillComplete_on_result, \
3917 const std::string & label, \
3918 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3919\
3920 template \
3921 void MatrixMatrix::Add( \
3922 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3923 bool transposeA, \
3924 SCALAR scalarA, \
3925 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3926 bool transposeB, \
3927 SCALAR scalarB, \
3928 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3929\
3930 template \
3931 void MatrixMatrix::Add( \
3932 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3933 bool transposeA, \
3934 SCALAR scalarA, \
3935 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3936 bool transposeB, \
3937 SCALAR scalarB, \
3938 const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3939\
3940 template \
3941 void MatrixMatrix::Add( \
3942 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3943 bool transposeA, \
3944 SCALAR scalarA, \
3945 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3946 SCALAR scalarB ); \
3947\
3948 template \
3949 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3950 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3951 (const SCALAR & alpha, \
3952 const bool transposeA, \
3953 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3954 const SCALAR & beta, \
3955 const bool transposeB, \
3956 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3957 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3958 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3959 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3960\
3961 template \
3962 void \
3963 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3964 (const SCALAR & alpha, \
3965 const bool transposeA, \
3966 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3967 const SCALAR& beta, \
3968 const bool transposeB, \
3969 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3970 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3971 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3972 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3973 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3974\
3975 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3976\
3977 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3978 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3979 CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3980 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3981 bool userAssertsThereAreNoRemotes, \
3982 const std::string& label, \
3983 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3984\
3985 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3986 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3987 BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3988 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3989 bool userAssertsThereAreNoRemotes);
3990} //End namespace Tpetra
3991
3992#endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Forward declaration of some Tpetra Matrix Matrix objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a BlockCrsMatrix.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed dense vector.
void start()
Start the deep_copy counter.
int makeColMap(Teuchos::RCP< const Tpetra::Map< LO, GO, NT > > &colMap, Teuchos::Array< int > &remotePIDs, const Teuchos::RCP< const Tpetra::Map< LO, GO, NT > > &domMap, const RowGraph< LO, GO, NT > &graph, const bool sortEachProcsGids=true, std::ostream *errStrm=NULL)
Make the graph's column Map.
Distributed sparse matrix-matrix multiply and add.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
size_t global_size_t
Global size_t object.
Teuchos::RCP< CrsMatrixType > importAndFillCompleteCrsMatrix(const Teuchos::RCP< const CrsMatrixType > &sourceMatrix, const Import< typename CrsMatrixType::local_ordinal_type, typename CrsMatrixType::global_ordinal_type, typename CrsMatrixType::node_type > &importer, const Teuchos::RCP< const Map< typename CrsMatrixType::local_ordinal_type, typename CrsMatrixType::global_ordinal_type, typename CrsMatrixType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsMatrixType::local_ordinal_type, typename CrsMatrixType::global_ordinal_type, typename CrsMatrixType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember CrsMatrix constructor that fuses Import and fillComplete().