Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_TripleMatrixMultiply_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
11#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
12
14#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
15#include "Teuchos_VerboseObject.hpp"
16#include "Teuchos_Array.hpp"
17#include "Tpetra_Util.hpp"
18#include "Tpetra_ConfigDefs.hpp"
19#include "Tpetra_CrsMatrix.hpp"
21#include "Tpetra_RowMatrixTransposer.hpp"
22#include "Tpetra_ConfigDefs.hpp"
23#include "Tpetra_Map.hpp"
24#include "Tpetra_Export.hpp"
27#include <algorithm>
28#include <cmath>
29#include "Teuchos_FancyOStream.hpp"
30// #include "KokkosSparse_spgemm.hpp"
31
32
37
38
39
40/*********************************************************************************************************/
41// Include the architecture-specific kernel partial specializations here
42// NOTE: This needs to be outside all namespaces
43#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
44#include "TpetraExt_MatrixMatrix_Cuda.hpp"
45#include "TpetraExt_MatrixMatrix_HIP.hpp"
46#include "TpetraExt_MatrixMatrix_SYCL.hpp"
47
48namespace Tpetra {
49
50 namespace TripleMatrixMultiply{
51
52 //
53 // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
54 // op(A) == A if transposeA is false,
55 // op(A) == A^T if transposeA is true,
56 // and similarly for op(R) and op(P).
57 //
58 template <class Scalar,
59 class LocalOrdinal,
60 class GlobalOrdinal,
61 class Node>
64 bool transposeR,
66 bool transposeA,
68 bool transposeP,
70 bool call_FillComplete_on_result,
71 const std::string& label,
72 const Teuchos::RCP<Teuchos::ParameterList>& params)
73 {
74 using Teuchos::null;
75 using Teuchos::RCP;
76 typedef Scalar SC;
77 typedef LocalOrdinal LO;
78 typedef GlobalOrdinal GO;
79 typedef Node NO;
80 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
83 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
84 typedef Map<LO,GO,NO> map_type;
85 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
86
87#ifdef HAVE_TPETRA_MMM_TIMINGS
88 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
89 using Teuchos::TimeMonitor;
90 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
91#endif
92
93 const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
94
95 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
96
97 // The input matrices R, A and P must both be fillComplete.
98 TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
99 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
100 TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
101
102 // If transposeA is true, then Rprime will be the transpose of R
103 // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
104 // will just be a pointer to R.
105 RCP<const crs_matrix_type> Rprime = null;
106 // If transposeA is true, then Aprime will be the transpose of A
107 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
108 // will just be a pointer to A.
109 RCP<const crs_matrix_type> Aprime = null;
110 // If transposeB is true, then Pprime will be the transpose of P
111 // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
112 // will just be a pointer to P.
113 RCP<const crs_matrix_type> Pprime = null;
114
115 // Is this a "clean" matrix?
116 //
117 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
118 // locally nor globally indexed, then it was empty. I don't like
119 // this, because the most straightforward implementation presumes
120 // lazy allocation of indices. However, historical precedent
121 // demands that we keep around this predicate as a way to test
122 // whether the matrix is empty.
123 const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
124
125 using Teuchos::ParameterList;
126 RCP<ParameterList> transposeParams (new ParameterList);
127 transposeParams->set ("sort", false);
128
129 if (transposeR && &R != &P) {
130 transposer_type transposer(rcpFromRef (R));
131 Rprime = transposer.createTranspose (transposeParams);
132 } else {
133 Rprime = rcpFromRef(R);
134 }
135
136 if (transposeA) {
137 transposer_type transposer(rcpFromRef (A));
138 Aprime = transposer.createTranspose (transposeParams);
139 } else {
140 Aprime = rcpFromRef(A);
141 }
142
143 if (transposeP) {
144 transposer_type transposer(rcpFromRef (P));
145 Pprime = transposer.createTranspose (transposeParams);
146 } else {
147 Pprime = rcpFromRef(P);
148 }
149
150 // Check size compatibility
151 global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
152 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
153 global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
154 global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
155 global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
156 global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
157 global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
158 global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
159 global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
160 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
161 prefix << "ERROR, inner dimensions of op(R) and op(A) "
162 "must match for matrix-matrix product. op(R) is "
163 << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);
164
165 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
166 prefix << "ERROR, inner dimensions of op(A) and op(P) "
167 "must match for matrix-matrix product. op(A) is "
168 << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
169
170 // The result matrix Ac must at least have a row-map that reflects the correct
171 // row-size. Don't check the number of columns because rectangular matrices
172 // which were constructed with only one map can still end up having the
173 // correct capacity and dimensions when filled.
174 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
175 prefix << "ERROR, dimensions of result Ac must "
176 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
177 << " rows, should have at least " << Rleft << std::endl);
178
179 // It doesn't matter whether Ac is already Filled or not. If it is already
180 // Filled, it must have space allocated for the positions that will be
181 // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
182 // we'll error out later when trying to store result values.
183
184 // CGB: However, matrix must be in active-fill
185 TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );
186
187 // We're going to need to import remotely-owned sections of P if
188 // more than one processor is performing this run, depending on the scenario.
189 int numProcs = P.getComm()->getSize();
190
191 // Declare a couple of structs that will be used to hold views of the data
192 // of R, A and P, to be used for fast access during the matrix-multiplication.
193 crs_matrix_struct_type Rview;
194 crs_matrix_struct_type Aview;
195 crs_matrix_struct_type Pview;
196
197 RCP<const map_type> targetMap_R = Rprime->getRowMap();
198 RCP<const map_type> targetMap_A = Aprime->getRowMap();
199 RCP<const map_type> targetMap_P = Pprime->getRowMap();
200
201#ifdef HAVE_TPETRA_MMM_TIMINGS
202 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
203#endif
204
205 // Now import any needed remote rows and populate the Aview struct
206 // NOTE: We assert that an import isn't needed --- since we do the transpose
207 // above to handle that.
208 RCP<const import_type> dummyImporter;
209
210 if (!(transposeR && &R == &P))
211 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
212
213 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
214
215 // We will also need local access to all rows of P that correspond to the
216 // column-map of op(A).
217 if (numProcs > 1)
218 targetMap_P = Aprime->getColMap();
219
220 // Import any needed remote rows and populate the Pview struct.
221 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
222
223
224 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
225
226 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
227 if (needs_final_export)
228 Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
229 else
230 Actemp = rcp(&Ac,false);// don't allow deallocation
231
232#ifdef HAVE_TPETRA_MMM_TIMINGS
233 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
234#endif
235
236 // Call the appropriate method to perform the actual multiplication.
237 if (call_FillComplete_on_result && newFlag) {
238 if (transposeR && &R == &P)
239 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
240 else
241 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
242 } else if (call_FillComplete_on_result) {
243 if (transposeR && &R == &P)
244 MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
245 else
246 MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
247 } else {
248 // mfh 27 Sep 2016: Is this the "slow" case? This
249 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
250 // thread-parallel inserts, but that may take some effort.
251 // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
252
253 // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
254
255 // #ifdef HAVE_TPETRA_MMM_TIMINGS
256 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
257 // #endif
258 // if (call_FillComplete_on_result) {
259 // // We'll call FillComplete on the C matrix before we exit, and give it a
260 // // domain-map and a range-map.
261 // // The domain-map will be the domain-map of B, unless
262 // // op(B)==transpose(B), in which case the range-map of B will be used.
263 // // The range-map will be the range-map of A, unless op(A)==transpose(A),
264 // // in which case the domain-map of A will be used.
265 // if (!C.isFillComplete())
266 // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
267 // }
268 // Not implemented
269 if (transposeR && &R == &P)
270 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
271 else
272 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
273 }
274
275 if (needs_final_export) {
276#ifdef HAVE_TPETRA_MMM_TIMINGS
277 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
278#endif
279 Teuchos::ParameterList labelList;
280 labelList.set("Timer Label", label);
281 Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
282
283 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
284 bool isMM = true;
285 bool overrideAllreduce = false;
286 int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
287 if(!params.is_null()) {
288 Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
289 mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
290 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
291 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
292 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
293 isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
294 overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
295
296 labelList.set("compute global constants",params->get("compute global constants",true));
297 }
298 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
299
300 labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
301 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
302 labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
303
304 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
305 Actemp->exportAndFillComplete(Acprime,
306 exporter,
307 Acprime->getDomainMap(),
308 Acprime->getRangeMap(),
309 rcp(&labelList,false));
310
311 }
312#ifdef HAVE_TPETRA_MMM_STATISTICS
313 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
314#endif
315
316 }
317
318
319 } //End namespace TripleMatrixMultiply
320
321 namespace MMdetails{
322
323 // Kernel method for computing the local portion of Ac = R*A*P
324 template<class Scalar,
325 class LocalOrdinal,
326 class GlobalOrdinal,
327 class Node>
328 void mult_R_A_P_newmatrix(
329 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
330 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
331 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
332 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
333 const std::string& label,
334 const Teuchos::RCP<Teuchos::ParameterList>& params)
335 {
336 using Teuchos::Array;
337 using Teuchos::ArrayRCP;
338 using Teuchos::ArrayView;
339 using Teuchos::RCP;
340 using Teuchos::rcp;
341
342 //typedef Scalar SC; // unused
343 typedef LocalOrdinal LO;
344 typedef GlobalOrdinal GO;
345 typedef Node NO;
346
347 typedef Import<LO,GO,NO> import_type;
348 typedef Map<LO,GO,NO> map_type;
349
350 // Kokkos typedefs
351 typedef typename map_type::local_map_type local_map_type;
353 typedef typename KCRS::StaticCrsGraphType graph_t;
354 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
355 typedef typename NO::execution_space execution_space;
356 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
357 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
358
359#ifdef HAVE_TPETRA_MMM_TIMINGS
360 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
361 using Teuchos::TimeMonitor;
362 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
363#endif
364 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
365
366 // Build the final importer / column map, hash table lookups for Ac
367 RCP<const import_type> Cimport;
368 RCP<const map_type> Ccolmap;
369 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
370 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
371 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
372 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
373 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
374 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
375 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
376
377
378 // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
379 // indices of B, to local column indices of Ac. (B and Ac have the
380 // same number of columns.) The kernel uses this, instead of
381 // copying the entire input matrix B and converting its column
382 // indices to those of C.
383 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
384
385 if (Pview.importMatrix.is_null()) {
386 // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
387 Cimport = Pimport;
388 Ccolmap = Pview.colMap;
389 const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
390 // Pcol2Ccol is trivial
391 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
392 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
393 KOKKOS_LAMBDA(const LO i) {
394 Pcol2Ccol(i) = i;
395 });
396 }
397 else {
398 // mfh 27 Sep 2016: P has "remotes," so we need to build the
399 // column Map of C, as well as C's Import object (from its domain
400 // Map to its column Map). C's column Map is the union of the
401 // column Maps of (the local part of) P, and the "remote" part of
402 // P. Ditto for the Import. We have optimized this "setUnion"
403 // operation on Import objects and Maps.
404
405 // Choose the right variant of setUnion
406 if (!Pimport.is_null() && !Iimport.is_null()) {
407 Cimport = Pimport->setUnion(*Iimport);
408 }
409 else if (!Pimport.is_null() && Iimport.is_null()) {
410 Cimport = Pimport->setUnion();
411 }
412 else if (Pimport.is_null() && !Iimport.is_null()) {
413 Cimport = Iimport->setUnion();
414 }
415 else {
416 throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
417 }
418 Ccolmap = Cimport->getTargetMap();
419
420 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
421 // in general. We should get rid of it in order to reduce
422 // communication costs of sparse matrix-matrix multiply.
423 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
424 std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
425
426 // NOTE: This is not efficient and should be folded into setUnion
427 //
428 // mfh 27 Sep 2016: What the above comment means, is that the
429 // setUnion operation on Import objects could also compute these
430 // local index - to - local index look-up tables.
431 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
432 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
433 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
434 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
435 });
436 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
437 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
438 });
439
440 }
441
442 // Replace the column map
443 //
444 // mfh 27 Sep 2016: We do this because C was originally created
445 // without a column Map. Now we have its column Map.
446 Ac.replaceColMap(Ccolmap);
447
448 // mfh 27 Sep 2016: Construct tables that map from local column
449 // indices of A, to local row indices of either B_local (the locally
450 // owned part of B), or B_remote (the "imported" remote part of B).
451 //
452 // For column index Aik in row i of A, if the corresponding row of B
453 // exists in the local part of B ("orig") (which I'll call B_local),
454 // then targetMapToOrigRow[Aik] is the local index of that row of B.
455 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
456 //
457 // For column index Aik in row i of A, if the corresponding row of B
458 // exists in the remote part of B ("Import") (which I'll call
459 // B_remote), then targetMapToImportRow[Aik] is the local index of
460 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
461 // (a flag value).
462
463 // Run through all the hash table lookups once and for all
464 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
465 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
466 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
467 GO aidx = Acolmap_local.getGlobalElement(i);
468 LO P_LID = Prowmap_local.getLocalElement(aidx);
469 if (P_LID != LO_INVALID) {
470 targetMapToOrigRow(i) = P_LID;
471 targetMapToImportRow(i) = LO_INVALID;
472 } else {
473 LO I_LID = Irowmap_local.getLocalElement(aidx);
474 targetMapToOrigRow(i) = LO_INVALID;
475 targetMapToImportRow(i) = I_LID;
476 }
477 });
478
479 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
480 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
481 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
482 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
483 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
484 Ac, Cimport, label, params);
485 }
486
487
488 // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
489 template<class Scalar,
490 class LocalOrdinal,
491 class GlobalOrdinal,
492 class Node>
493 void mult_R_A_P_reuse(
494 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
495 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
496 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
497 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
498 const std::string& label,
499 const Teuchos::RCP<Teuchos::ParameterList>& params)
500 {
501 using Teuchos::Array;
502 using Teuchos::ArrayRCP;
503 using Teuchos::ArrayView;
504 using Teuchos::RCP;
505 using Teuchos::rcp;
506
507 //typedef Scalar SC; // unused
508 typedef LocalOrdinal LO;
509 typedef GlobalOrdinal GO;
510 typedef Node NO;
511
512 typedef Import<LO,GO,NO> import_type;
513 typedef Map<LO,GO,NO> map_type;
514
515 // Kokkos typedefs
516 typedef typename map_type::local_map_type local_map_type;
518 typedef typename KCRS::StaticCrsGraphType graph_t;
519 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
520 typedef typename NO::execution_space execution_space;
521 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
522 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
523
524#ifdef HAVE_TPETRA_MMM_TIMINGS
525 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
526 using Teuchos::TimeMonitor;
527 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
528#endif
529 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
530
531 // Build the final importer / column map, hash table lookups for Ac
532 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
533 RCP<const map_type> Ccolmap = Ac.getColMap();
534 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
535 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
536 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
537 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
538 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
539 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
540 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
541 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
542
543 // Build the final importer / column map, hash table lookups for C
544 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
545 {
546 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
547 // So, column map of C may be a strict subset of the column map of B
548 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
549 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
550 });
551
552 if (!Pview.importMatrix.is_null()) {
553 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
554 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
555
556 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
557 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
558 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
559 });
560 }
561 }
562
563 // Run through all the hash table lookups once and for all
564 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
565 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
566 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
567 GO aidx = Acolmap_local.getGlobalElement(i);
568 LO B_LID = Prowmap_local.getLocalElement(aidx);
569 if (B_LID != LO_INVALID) {
570 targetMapToOrigRow(i) = B_LID;
571 targetMapToImportRow(i) = LO_INVALID;
572 } else {
573 LO I_LID = Irowmap_local.getLocalElement(aidx);
574 targetMapToOrigRow(i) = LO_INVALID;
575 targetMapToImportRow(i) = I_LID;
576
577 }
578 });
579
580 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
581 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
582 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
583 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
584 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
585 Ac, Cimport, label, params);
586 }
587
588
589 // Kernel method for computing the local portion of Ac = R*A*P
590 template<class Scalar,
591 class LocalOrdinal,
592 class GlobalOrdinal,
593 class Node>
594 void mult_PT_A_P_newmatrix(
595 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
596 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
597 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
598 const std::string& label,
599 const Teuchos::RCP<Teuchos::ParameterList>& params)
600 {
601 using Teuchos::Array;
602 using Teuchos::ArrayRCP;
603 using Teuchos::ArrayView;
604 using Teuchos::RCP;
605 using Teuchos::rcp;
606
607 //typedef Scalar SC; // unused
608 typedef LocalOrdinal LO;
609 typedef GlobalOrdinal GO;
610 typedef Node NO;
611
612 typedef Import<LO,GO,NO> import_type;
613 typedef Map<LO,GO,NO> map_type;
614
615 // Kokkos typedefs
616 typedef typename map_type::local_map_type local_map_type;
618 typedef typename KCRS::StaticCrsGraphType graph_t;
619 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
620 typedef typename NO::execution_space execution_space;
621 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
622 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
623
624#ifdef HAVE_TPETRA_MMM_TIMINGS
625 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
626 using Teuchos::TimeMonitor;
627 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
628#endif
629 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
630
631 // Build the final importer / column map, hash table lookups for Ac
632 RCP<const import_type> Cimport;
633 RCP<const map_type> Ccolmap;
634 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
635 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
636 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
637 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
638 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
639 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
640 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
641
642
643 // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
644 // indices of B, to local column indices of Ac. (B and Ac have the
645 // same number of columns.) The kernel uses this, instead of
646 // copying the entire input matrix B and converting its column
647 // indices to those of C.
648 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
649
650 if (Pview.importMatrix.is_null()) {
651 // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
652 Cimport = Pimport;
653 Ccolmap = Pview.colMap;
654 const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
655 // Pcol2Ccol is trivial
656 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
657 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
658 KOKKOS_LAMBDA(const LO i) {
659 Pcol2Ccol(i) = i;
660 });
661 }
662 else {
663 // mfh 27 Sep 2016: P has "remotes," so we need to build the
664 // column Map of C, as well as C's Import object (from its domain
665 // Map to its column Map). C's column Map is the union of the
666 // column Maps of (the local part of) P, and the "remote" part of
667 // P. Ditto for the Import. We have optimized this "setUnion"
668 // operation on Import objects and Maps.
669
670 // Choose the right variant of setUnion
671 if (!Pimport.is_null() && !Iimport.is_null()) {
672 Cimport = Pimport->setUnion(*Iimport);
673 }
674 else if (!Pimport.is_null() && Iimport.is_null()) {
675 Cimport = Pimport->setUnion();
676 }
677 else if (Pimport.is_null() && !Iimport.is_null()) {
678 Cimport = Iimport->setUnion();
679 }
680 else {
681 throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
682 }
683 Ccolmap = Cimport->getTargetMap();
684
685 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
686 // in general. We should get rid of it in order to reduce
687 // communication costs of sparse matrix-matrix multiply.
688 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
689 std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
690
691 // NOTE: This is not efficient and should be folded into setUnion
692 //
693 // mfh 27 Sep 2016: What the above comment means, is that the
694 // setUnion operation on Import objects could also compute these
695 // local index - to - local index look-up tables.
696 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
697 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
698 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
699 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
700 });
701 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
702 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
703 });
704
705 }
706
707 // Replace the column map
708 //
709 // mfh 27 Sep 2016: We do this because C was originally created
710 // without a column Map. Now we have its column Map.
711 Ac.replaceColMap(Ccolmap);
712
713 // mfh 27 Sep 2016: Construct tables that map from local column
714 // indices of A, to local row indices of either B_local (the locally
715 // owned part of B), or B_remote (the "imported" remote part of B).
716 //
717 // For column index Aik in row i of A, if the corresponding row of B
718 // exists in the local part of B ("orig") (which I'll call B_local),
719 // then targetMapToOrigRow[Aik] is the local index of that row of B.
720 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
721 //
722 // For column index Aik in row i of A, if the corresponding row of B
723 // exists in the remote part of B ("Import") (which I'll call
724 // B_remote), then targetMapToImportRow[Aik] is the local index of
725 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
726 // (a flag value).
727
728 // Run through all the hash table lookups once and for all
729 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
730 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
731
732 Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
733 GO aidx = Acolmap_local.getGlobalElement(i);
734 LO P_LID = Prowmap_local.getLocalElement(aidx);
735 if (P_LID != LO_INVALID) {
736 targetMapToOrigRow(i) = P_LID;
737 targetMapToImportRow(i) = LO_INVALID;
738 } else {
739 LO I_LID = Irowmap_local.getLocalElement(aidx);
740 targetMapToOrigRow(i) = LO_INVALID;
741 targetMapToImportRow(i) = I_LID;
742 }
743 });
744
745 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
746 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
747 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
748 mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
749 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
750 Ac, Cimport, label, params);
751 }
752
753 // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
754 template<class Scalar,
755 class LocalOrdinal,
756 class GlobalOrdinal,
757 class Node>
758 void mult_PT_A_P_reuse(
759 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
760 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
761 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
762 const std::string& label,
763 const Teuchos::RCP<Teuchos::ParameterList>& params)
764 {
765 using Teuchos::Array;
766 using Teuchos::ArrayRCP;
767 using Teuchos::ArrayView;
768 using Teuchos::RCP;
769 using Teuchos::rcp;
770
771 //typedef Scalar SC; // unused
772 typedef LocalOrdinal LO;
773 typedef GlobalOrdinal GO;
774 typedef Node NO;
775
776 typedef Import<LO,GO,NO> import_type;
777 typedef Map<LO,GO,NO> map_type;
778
779 // Kokkos typedefs
780 typedef typename map_type::local_map_type local_map_type;
782 typedef typename KCRS::StaticCrsGraphType graph_t;
783 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
784 typedef typename NO::execution_space execution_space;
785 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
786 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
787
788#ifdef HAVE_TPETRA_MMM_TIMINGS
789 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
790 using Teuchos::TimeMonitor;
791 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
792#endif
793 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
794
795 // Build the final importer / column map, hash table lookups for Ac
796 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
797 RCP<const map_type> Ccolmap = Ac.getColMap();
798 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
799 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
800 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
801 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
802 local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
803 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
804 local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
805 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
806
807 // Build the final importer / column map, hash table lookups for C
808 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
809 {
810 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
811 // So, column map of C may be a strict subset of the column map of B
812 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
813 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
814 });
815
816 if (!Pview.importMatrix.is_null()) {
817 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
818 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
819
820 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
821 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
822 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
823 });
824 }
825 }
826
827 // Run through all the hash table lookups once and for all
828 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
829 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
830 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
831 GO aidx = Acolmap_local.getGlobalElement(i);
832 LO B_LID = Prowmap_local.getLocalElement(aidx);
833 if (B_LID != LO_INVALID) {
834 targetMapToOrigRow(i) = B_LID;
835 targetMapToImportRow(i) = LO_INVALID;
836 } else {
837 LO I_LID = Irowmap_local.getLocalElement(aidx);
838 targetMapToOrigRow(i) = LO_INVALID;
839 targetMapToImportRow(i) = I_LID;
840
841 }
842 });
843
844 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
845 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
846 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
847 mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
848 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
849 Ac, Cimport, label, params);
850 }
851
852
853 /*********************************************************************************************************/
854 // RAP NewMatrix Kernel wrappers (Default non-threaded version)
855 // Computes R * A * P -> Ac using classic Gustavson approach
856 template<class Scalar,
857 class LocalOrdinal,
858 class GlobalOrdinal,
859 class Node,
860 class LocalOrdinalViewType>
861 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
862 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
863 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
864 const LocalOrdinalViewType & Acol2Prow_dev,
865 const LocalOrdinalViewType & Acol2PIrow_dev,
866 const LocalOrdinalViewType & Pcol2Accol_dev,
867 const LocalOrdinalViewType & PIcol2Accol_dev,
868 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
869 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
870 const std::string& label,
871 const Teuchos::RCP<Teuchos::ParameterList>& params) {
872#ifdef HAVE_TPETRA_MMM_TIMINGS
873 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
874 using Teuchos::TimeMonitor;
875 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
876#endif
877
878 using Teuchos::Array;
879 using Teuchos::ArrayRCP;
880 using Teuchos::ArrayView;
881 using Teuchos::RCP;
882 using Teuchos::rcp;
883
884 // Lots and lots of typedefs
886 typedef typename KCRS::StaticCrsGraphType graph_t;
887 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
888 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
889 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
890 typedef typename KCRS::values_type::non_const_type scalar_view_t;
891
892 typedef Scalar SC;
893 typedef LocalOrdinal LO;
894 typedef GlobalOrdinal GO;
895 typedef Node NO;
896 typedef Map<LO,GO,NO> map_type;
897 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
898 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
899 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
900
901 // Sizes
902 RCP<const map_type> Accolmap = Ac.getColMap();
903 size_t m = Rview.origMatrix->getLocalNumRows();
904 size_t n = Accolmap->getLocalNumElements();
905 size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
906
907 // Routine runs on host; have to put arguments on host, too
908 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
909 Acol2Prow_dev);
910 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
911 Acol2PIrow_dev);
912 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
913 Pcol2Accol_dev);
914 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
915 PIcol2Accol_dev);
916
917 // Grab the Kokkos::SparseCrsMatrices & inner stuff
918 const auto Amat = Aview.origMatrix->getLocalMatrixHost();
919 const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
920 const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
921
922 auto Arowptr = Amat.graph.row_map;
923 auto Prowptr = Pmat.graph.row_map;
924 auto Rrowptr = Rmat.graph.row_map;
925 const auto Acolind = Amat.graph.entries;
926 const auto Pcolind = Pmat.graph.entries;
927 const auto Rcolind = Rmat.graph.entries;
928 const auto Avals = Amat.values;
929 const auto Pvals = Pmat.values;
930 const auto Rvals = Rmat.values;
931
932 typename c_lno_view_t::HostMirror::const_type Irowptr;
933 typename lno_nnz_view_t::HostMirror Icolind;
934 typename scalar_view_t::HostMirror Ivals;
935 if(!Pview.importMatrix.is_null()) {
936 auto lclP = Pview.importMatrix->getLocalMatrixHost();
937 Irowptr = lclP.graph.row_map;
938 Icolind = lclP.graph.entries;
939 Ivals = lclP.values;
940 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
941 }
942
943#ifdef HAVE_TPETRA_MMM_TIMINGS
944 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
945#endif
946
947 // Classic csr assembly (low memory edition)
948 //
949 // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
950 // The method loops over rows of R, and may resize after processing
951 // each row. Chris Siefert says that this reflects experience in
952 // ML; for the non-threaded case, ML found it faster to spend less
953 // effort on estimation and risk an occasional reallocation.
954 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
955 typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
956 typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
957 typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
958
959 // mfh 27 Sep 2016: The ac_status array is an implementation detail
960 // of the local sparse matrix-matrix multiply routine.
961
962 // The status array will contain the index into colind where this entry was last deposited.
963 // ac_status[i] < nnz - not in the row yet
964 // ac_status[i] >= nnz - this is the entry where you can find the data
965 // We start with this filled with INVALID's indicating that there are no entries yet.
966 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
967 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
968 Array<size_t> ac_status(n, ST_INVALID);
969
970 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
971 // routine. The routine computes Ac := R * A * (P_local + P_remote).
972 //
973 // For column index Aik in row i of A, Acol2Prow[Aik] tells
974 // you whether the corresponding row of P belongs to P_local
975 // ("orig") or P_remote ("Import").
976
977 // For each row of R
978 size_t nnz = 0, nnz_old = 0;
979 for (size_t i = 0; i < m; i++) {
980 // mfh 27 Sep 2016: m is the number of rows in the input matrix R
981 // on the calling process.
982 Crowptr[i] = nnz;
983
984 // mfh 27 Sep 2016: For each entry of R in the current row of R
985 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
986 LO k = Rcolind[kk]; // local column index of current entry of R
987 const SC Rik = Rvals[kk]; // value of current entry of R
988 if (Rik == SC_ZERO)
989 continue; // skip explicitly stored zero values in R
990 // For each entry of A in the current row of A
991 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
992 LO l = Acolind[ll]; // local column index of current entry of A
993 const SC Akl = Avals[ll]; // value of current entry of A
994 if (Akl == SC_ZERO)
995 continue; // skip explicitly stored zero values in A
996
997
998 if (Acol2Prow[l] != LO_INVALID) {
999 // mfh 27 Sep 2016: If the entry of Acol2Prow
1000 // corresponding to the current entry of A is populated, then
1001 // the corresponding row of P is in P_local (i.e., it lives on
1002 // the calling process).
1003
1004 // Local matrix
1005 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1006
1007 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1008 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1009 LO j = Pcolind[jj];
1010 LO Acj = Pcol2Accol[j];
1011 SC Plj = Pvals[jj];
1012
1013 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1014#ifdef HAVE_TPETRA_DEBUG
1015 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1016 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1017 std::runtime_error,
1018 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1019#endif
1020 // New entry
1021 ac_status[Acj] = nnz;
1022 Ccolind[nnz] = Acj;
1023 Cvals[nnz] = Rik*Akl*Plj;
1024 nnz++;
1025 } else {
1026 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1027 }
1028 }
1029 } else {
1030 // mfh 27 Sep 2016: If the entry of Acol2PRow
1031 // corresponding to the current entry of A is NOT populated (has
1032 // a flag "invalid" value), then the corresponding row of P is
1033 // in P_remote (i.e., it does not live on the calling process).
1034
1035 // Remote matrix
1036 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1037 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1038 LO j = Icolind[jj];
1039 LO Acj = PIcol2Accol[j];
1040 SC Plj = Ivals[jj];
1041
1042 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1043#ifdef HAVE_TPETRA_DEBUG
1044 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1045 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1046 std::runtime_error,
1047 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1048#endif
1049 // New entry
1050 ac_status[Acj] = nnz;
1051 Ccolind[nnz] = Acj;
1052 Cvals[nnz] = Rik*Akl*Plj;
1053 nnz++;
1054 } else {
1055 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1056 }
1057 }
1058 }
1059 }
1060 }
1061 // Resize for next pass if needed
1062 if (nnz + n > CSR_alloc) {
1063 CSR_alloc *= 2;
1064 Kokkos::resize(Ccolind,CSR_alloc);
1065 Kokkos::resize(Cvals,CSR_alloc);
1066 }
1067 nnz_old = nnz;
1068 }
1069
1070 Crowptr[m] = nnz;
1071
1072 // Downward resize
1073 Kokkos::resize(Ccolind,nnz);
1074 Kokkos::resize(Cvals,nnz);
1075
1076#ifdef HAVE_TPETRA_MMM_TIMINGS
1077 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1078#endif
1079 auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1080 typename KCRS::device_type(), Crowptr);
1081 auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1082 typename KCRS::device_type(), Ccolind);
1083 auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1084 typename KCRS::device_type(), Cvals);
1085
1086 // Final sort & set of CRS arrays
1087 if (params.is_null() || params->get("sort entries",true))
1088 Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1089 Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
1090
1091#ifdef HAVE_TPETRA_MMM_TIMINGS
1092 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1093#endif
1094
1095 // Final FillComplete
1096 //
1097 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1098 // Import (from domain Map to column Map) construction (which costs
1099 // lots of communication) by taking the previously constructed
1100 // Import object. We should be able to do this without interfering
1101 // with the implementation of the local part of sparse matrix-matrix
1102 // multply above.
1103 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1104 labelList->set("Timer Label",label);
1105 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1106 RCP<const Export<LO,GO,NO> > dummyExport;
1107 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1108 Rview.origMatrix->getRangeMap(),
1109 Acimport,
1110 dummyExport,
1111 labelList);
1112
1113 }
1114
1115 /*********************************************************************************************************/
1116 // RAP Reuse Kernel wrappers (Default non-threaded version)
1117 // Computes R * A * P -> Ac using reuse Gustavson
1118 template<class Scalar,
1119 class LocalOrdinal,
1120 class GlobalOrdinal,
1121 class Node,
1122 class LocalOrdinalViewType>
1123 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1124 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1125 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1126 const LocalOrdinalViewType & Acol2Prow_dev,
1127 const LocalOrdinalViewType & Acol2PIrow_dev,
1128 const LocalOrdinalViewType & Pcol2Accol_dev,
1129 const LocalOrdinalViewType & PIcol2Accol_dev,
1130 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1131 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1132 const std::string& label,
1133 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1134#ifdef HAVE_TPETRA_MMM_TIMINGS
1135 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1136 using Teuchos::TimeMonitor;
1137 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
1138#endif
1139
1140 using Teuchos::Array;
1141 using Teuchos::ArrayRCP;
1142 using Teuchos::ArrayView;
1143 using Teuchos::RCP;
1144 using Teuchos::rcp;
1145
1146 // Lots and lots of typedefs
1147 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1148 typedef typename KCRS::StaticCrsGraphType graph_t;
1149 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1150 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1151 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1152
1153 typedef Scalar SC;
1154 typedef LocalOrdinal LO;
1155 typedef GlobalOrdinal GO;
1156 typedef Node NO;
1157 typedef Map<LO,GO,NO> map_type;
1158 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1159 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1160 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1161
1162 // Sizes
1163 RCP<const map_type> Accolmap = Ac.getColMap();
1164 size_t m = Rview.origMatrix->getLocalNumRows();
1165 size_t n = Accolmap->getLocalNumElements();
1166 size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
1167
1168 // Routine runs on host; have to put arguments on host, too
1169 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1170 Acol2Prow_dev);
1171 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1172 Acol2PIrow_dev);
1173 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1174 Pcol2Accol_dev);
1175 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1176 PIcol2Accol_dev);
1177
1178 // Grab the Kokkos::SparseCrsMatrices & inner stuff
1179 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1180 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
1181 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
1182 const KCRS & Cmat = Ac.getLocalMatrixHost();
1183
1184 c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1185 const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1186 const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1187 scalar_view_t Cvals = Cmat.values;
1188
1189 c_lno_view_t Irowptr;
1190 lno_nnz_view_t Icolind;
1191 scalar_view_t Ivals;
1192 if(!Pview.importMatrix.is_null()) {
1193 auto lclP = Pview.importMatrix->getLocalMatrixHost();
1194 Irowptr = lclP.graph.row_map;
1195 Icolind = lclP.graph.entries;
1196 Ivals = lclP.values;
1197 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
1198 }
1199
1200#ifdef HAVE_TPETRA_MMM_TIMINGS
1201 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
1202#endif
1203
1204 // mfh 27 Sep 2016: The ac_status array is an implementation detail
1205 // of the local sparse matrix-matrix multiply routine.
1206
1207 // The status array will contain the index into colind where this entry was last deposited.
1208 // ac_status[i] < nnz - not in the row yet
1209 // ac_status[i] >= nnz - this is the entry where you can find the data
1210 // We start with this filled with INVALID's indicating that there are no entries yet.
1211 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1212 Array<size_t> ac_status(n, ST_INVALID);
1213
1214 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1215 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1216 //
1217 // For column index Aik in row i of A, Acol2Prow[Aik] tells
1218 // you whether the corresponding row of P belongs to P_local
1219 // ("orig") or P_remote ("Import").
1220
1221 // Necessary until following UVM host accesses are changed - for example Crowptr
1222 // Also probably needed in mult_R_A_P_newmatrix_kernel_wrapper - did not demonstrate this in test failure yet
1223 Kokkos::fence();
1224
1225 // For each row of R
1226 size_t OLD_ip = 0, CSR_ip = 0;
1227 for (size_t i = 0; i < m; i++) {
1228 // First fill the c_status array w/ locations where we're allowed to
1229 // generate nonzeros for this row
1230 OLD_ip = Crowptr[i];
1231 CSR_ip = Crowptr[i+1];
1232 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1233 ac_status[Ccolind[k]] = k;
1234
1235 // Reset values in the row of C
1236 Cvals[k] = SC_ZERO;
1237 }
1238
1239 // mfh 27 Sep 2016: For each entry of R in the current row of R
1240 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1241 LO k = Rcolind[kk]; // local column index of current entry of R
1242 const SC Rik = Rvals[kk]; // value of current entry of R
1243 if (Rik == SC_ZERO)
1244 continue; // skip explicitly stored zero values in R
1245 // For each entry of A in the current row of A
1246 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1247 LO l = Acolind[ll]; // local column index of current entry of A
1248 const SC Akl = Avals[ll]; // value of current entry of A
1249 if (Akl == SC_ZERO)
1250 continue; // skip explicitly stored zero values in A
1251
1252
1253 if (Acol2Prow[l] != LO_INVALID) {
1254 // mfh 27 Sep 2016: If the entry of Acol2Prow
1255 // corresponding to the current entry of A is populated, then
1256 // the corresponding row of P is in P_local (i.e., it lives on
1257 // the calling process).
1258
1259 // Local matrix
1260 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1261
1262 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1263 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1264 LO j = Pcolind[jj];
1265 LO Cij = Pcol2Accol[j];
1266 SC Plj = Pvals[jj];
1267
1268 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1269 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1270 "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1271
1272 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1273 }
1274 } else {
1275 // mfh 27 Sep 2016: If the entry of Acol2PRow
1276 // corresponding to the current entry of A is NOT populated (has
1277 // a flag "invalid" value), then the corresponding row of P is
1278 // in P_remote (i.e., it does not live on the calling process).
1279
1280 // Remote matrix
1281 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1282 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1283 LO j = Icolind[jj];
1284 LO Cij = PIcol2Accol[j];
1285 SC Plj = Ivals[jj];
1286
1287 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1288 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1289 "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1290
1291 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1292 }
1293 }
1294 }
1295 }
1296 }
1297
1298#ifdef HAVE_TPETRA_MMM_TIMINGS
1299 auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1300#endif
1301
1302 Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1303
1304}
1305
1306
1307 /*********************************************************************************************************/
1308 // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1309 // Computes P.T * A * P -> Ac
1310 template<class Scalar,
1311 class LocalOrdinal,
1312 class GlobalOrdinal,
1313 class Node,
1314 class LocalOrdinalViewType>
1315 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1316 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1317 const LocalOrdinalViewType & Acol2Prow,
1318 const LocalOrdinalViewType & Acol2PIrow,
1319 const LocalOrdinalViewType & Pcol2Accol,
1320 const LocalOrdinalViewType & PIcol2Accol,
1321 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1322 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1323 const std::string& label,
1324 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1325#ifdef HAVE_TPETRA_MMM_TIMINGS
1326 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1327 using Teuchos::TimeMonitor;
1328 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1329#endif
1330
1331 // We don't need a kernel-level PTAP, we just transpose here
1332 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1333 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1334
1335 using Teuchos::ParameterList;
1336 using Teuchos::RCP;
1337 RCP<ParameterList> transposeParams (new ParameterList);
1338 transposeParams->set ("sort", false);
1339
1340 if (! params.is_null ()) {
1341 transposeParams->set ("compute global constants",
1342 params->get ("compute global constants: temporaries",
1343 false));
1344 }
1345 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1346 transposer.createTransposeLocal (transposeParams);
1347 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1348 Rview.origMatrix = Ptrans;
1349
1350 mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1351 }
1352
1353 /*********************************************************************************************************/
1354 // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1355 // Computes P.T * A * P -> Ac
1356 template<class Scalar,
1357 class LocalOrdinal,
1358 class GlobalOrdinal,
1359 class Node,
1360 class LocalOrdinalViewType>
1361 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1362 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1363 const LocalOrdinalViewType & Acol2Prow,
1364 const LocalOrdinalViewType & Acol2PIrow,
1365 const LocalOrdinalViewType & Pcol2Accol,
1366 const LocalOrdinalViewType & PIcol2Accol,
1367 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1368 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1369 const std::string& label,
1370 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1371#ifdef HAVE_TPETRA_MMM_TIMINGS
1372 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1373 using Teuchos::TimeMonitor;
1374 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1375#endif
1376
1377 // We don't need a kernel-level PTAP, we just transpose here
1378 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1379 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1380
1381 using Teuchos::ParameterList;
1382 using Teuchos::RCP;
1383 RCP<ParameterList> transposeParams (new ParameterList);
1384 transposeParams->set ("sort", false);
1385
1386 if (! params.is_null ()) {
1387 transposeParams->set ("compute global constants",
1388 params->get ("compute global constants: temporaries",
1389 false));
1390 }
1391 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1392 transposer.createTransposeLocal (transposeParams);
1393 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1394 Rview.origMatrix = Ptrans;
1395
1396 mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1397 }
1398
1399 /*********************************************************************************************************/
1400 // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1401 // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1402 // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1403 // Currently, this implementation never gets called.
1404 template<class Scalar,
1405 class LocalOrdinal,
1406 class GlobalOrdinal,
1407 class Node>
1408 void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1409 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1410 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1411 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1412 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1413 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1414 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1415 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1416 const std::string& label,
1417 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1418#ifdef HAVE_TPETRA_MMM_TIMINGS
1419 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1420 using Teuchos::TimeMonitor;
1421 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1422#endif
1423
1424 using Teuchos::Array;
1425 using Teuchos::ArrayRCP;
1426 using Teuchos::ArrayView;
1427 using Teuchos::RCP;
1428 using Teuchos::rcp;
1429
1430 typedef Scalar SC;
1431 typedef LocalOrdinal LO;
1432 typedef GlobalOrdinal GO;
1433 typedef Node NO;
1434 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1435 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1436 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1437
1438 // number of rows on the process of the fine matrix
1439 // size_t m = Pview.origMatrix->getLocalNumRows();
1440 // number of rows on the process of the coarse matrix
1441 size_t n = Ac.getRowMap()->getLocalNumElements();
1442 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1443
1444 // Get Data Pointers
1445 ArrayRCP<size_t> Acrowptr_RCP;
1446 ArrayRCP<LO> Accolind_RCP;
1447 ArrayRCP<SC> Acvals_RCP;
1448
1449 // mfh 27 Sep 2016: get the three CSR arrays
1450 // out of the CrsMatrix. This code computes R * A * (P_local +
1451 // P_remote), where P_local contains the locally owned rows of P,
1452 // and P_remote the (previously Import'ed) remote rows of P.
1453
1454 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1455 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1456 auto Avals = Aview.origMatrix->getLocalValuesHost(
1457 Tpetra::Access::ReadOnly);
1458 auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1459 auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1460 auto Pvals = Pview.origMatrix->getLocalValuesHost(
1461 Tpetra::Access::ReadOnly);
1462 decltype(Prowptr) Irowptr;
1463 decltype(Pcolind) Icolind;
1464 decltype(Pvals) Ivals;
1465
1466 if (!Pview.importMatrix.is_null()) {
1467 Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1468 Icolind = Pview.importMatrix->getLocalIndicesHost();
1469 Ivals = Pview.importMatrix->getLocalValuesHost(
1470 Tpetra::Access::ReadOnly);
1471 }
1472
1473 // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1474 // where Teuchos::ArrayRCP::operator[] may be slower than
1475 // Teuchos::ArrayView::operator[].
1476
1477 // For efficiency
1478 ArrayView<size_t> Acrowptr;
1479 ArrayView<LO> Accolind;
1480 ArrayView<SC> Acvals;
1481
1483 // In a first pass, determine the graph of Ac.
1485
1487 // Get the graph of Ac. This gets the local transpose of P,
1488 // then loops over R, A, P to get the graph of Ac.
1490
1491#ifdef HAVE_TPETRA_MMM_TIMINGS
1492 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1493#endif
1494
1496 // Get the local transpose of the graph of P by locally transposing
1497 // all of P
1498
1499 transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1500
1501 using Teuchos::ParameterList;
1502 RCP<ParameterList> transposeParams (new ParameterList);
1503 transposeParams->set ("sort", false);
1504 if (! params.is_null ()) {
1505 transposeParams->set ("compute global constants",
1506 params->get ("compute global constants: temporaries",
1507 false));
1508 }
1509 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1510 transposer.createTransposeLocal (transposeParams);
1511
1512 auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1513 auto Rcolind = Ptrans->getLocalIndicesHost();
1514 auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1515
1517 // Construct graph
1518
1519 #ifdef HAVE_TPETRA_MMM_TIMINGS
1520 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1521 #endif
1522
1523 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1524 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1525
1526 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1527 size_t nnzPerRowA = 100;
1528 if (Aview.origMatrix->getLocalNumEntries() > 0)
1529 nnzPerRowA = Aview.origMatrix->getLocalNumEntries()/Aview.origMatrix->getLocalNumRows();
1530 Acrowptr_RCP.resize(n+1);
1531 Acrowptr = Acrowptr_RCP();
1532 Accolind_RCP.resize(nnz_alloc);
1533 Accolind = Accolind_RCP();
1534
1535 size_t nnz = 0, nnz_old = 0;
1536 for (size_t i = 0; i < n; i++) {
1537 // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1538 // on the calling process.
1539 Acrowptr[i] = nnz;
1540
1541 // mfh 27 Sep 2016: For each entry of R in the current row of R
1542 for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1543 LO k = Rcolind[kk]; // local column index of current entry of R
1544 // For each entry of A in the current row of A
1545 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1546 LO l = Acolind[ll]; // local column index of current entry of A
1547
1548 if (Acol2PRow[l] != LO_INVALID) {
1549 // mfh 27 Sep 2016: If the entry of Acol2PRow
1550 // corresponding to the current entry of A is populated, then
1551 // the corresponding row of P is in P_local (i.e., it lives on
1552 // the calling process).
1553
1554 // Local matrix
1555 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1556
1557 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1558 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1559 LO j = Pcolind[jj];
1560 LO Acj = Pcol2Accol[j];
1561
1562 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1563 // New entry
1564 ac_status[Acj] = nnz;
1565 Accolind[nnz] = Acj;
1566 nnz++;
1567 }
1568 }
1569 } else {
1570 // mfh 27 Sep 2016: If the entry of Acol2PRow
1571 // corresponding to the current entry of A is NOT populated (has
1572 // a flag "invalid" value), then the corresponding row of P is
1573 // in P_remote (i.e., it does not live on the calling process).
1574
1575 // Remote matrix
1576 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1577 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1578 LO j = Icolind[jj];
1579 LO Acj = PIcol2Accol[j];
1580
1581 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1582 // New entry
1583 ac_status[Acj] = nnz;
1584 Accolind[nnz] = Acj;
1585 nnz++;
1586 }
1587 }
1588 }
1589 }
1590 }
1591 // Resize for next pass if needed
1592 // cag: Maybe we can do something more subtle here, and not double
1593 // the size right away.
1594 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1595 nnz_alloc *= 2;
1596 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1597 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1598 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1599 }
1600 nnz_old = nnz;
1601 }
1602 Acrowptr[n] = nnz;
1603
1604 // Downward resize
1605 Accolind_RCP.resize(nnz);
1606 Accolind = Accolind_RCP();
1607
1608 // Allocate Acvals
1609 Acvals_RCP.resize(nnz, SC_ZERO);
1610 Acvals = Acvals_RCP();
1611
1612
1614 // In a second pass, enter the values into Acvals.
1616
1617 #ifdef HAVE_TPETRA_MMM_TIMINGS
1618 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1619 #endif
1620
1621
1622 for (size_t k = 0; k < n; k++) {
1623 for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1624 LO i = Pcolind[ii];
1625 const SC Pki = Pvals[ii];
1626 for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1627 LO l = Acolind[ll];
1628 const SC Akl = Avals[ll];
1629 if (Akl == 0.)
1630 continue;
1631 if (Acol2PRow[l] != LO_INVALID) {
1632 // mfh 27 Sep 2016: If the entry of Acol2PRow
1633 // corresponding to the current entry of A is populated, then
1634 // the corresponding row of P is in P_local (i.e., it lives on
1635 // the calling process).
1636
1637 // Local matrix
1638 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1639 for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1640 LO j = Pcolind[jj];
1641 LO Acj = Pcol2Accol[j];
1642 size_t pp;
1643 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1644 if (Accolind[pp] == Acj)
1645 break;
1646 // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1647 // std::runtime_error, "problem with Ac column indices");
1648 Acvals[pp] += Pki*Akl*Pvals[jj];
1649 }
1650 } else {
1651 // mfh 27 Sep 2016: If the entry of Acol2PRow
1652 // corresponding to the current entry of A NOT populated (has
1653 // a flag "invalid" value), then the corresponding row of P is
1654 // in P_remote (i.e., it does not live on the calling process).
1655
1656 // Remote matrix
1657 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1658 for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1659 LO j = Icolind[jj];
1660 LO Acj = PIcol2Accol[j];
1661 size_t pp;
1662 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1663 if (Accolind[pp] == Acj)
1664 break;
1665 // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1666 // std::runtime_error, "problem with Ac column indices");
1667 Acvals[pp] += Pki*Akl*Ivals[jj];
1668 }
1669 }
1670 }
1671 }
1672 }
1673
1674
1675#ifdef HAVE_TPETRA_MMM_TIMINGS
1676 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1677#endif
1678
1679 // Final sort & set of CRS arrays
1680 //
1681 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1682 // matrix-matrix multiply routine sort the entries for us?
1683 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1684
1685 // mfh 27 Sep 2016: This just sets pointers.
1686 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1687
1688#ifdef HAVE_TPETRA_MMM_TIMINGS
1689 MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1690#endif
1691
1692 // Final FillComplete
1693 //
1694 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1695 // Import (from domain Map to column Map) construction (which costs
1696 // lots of communication) by taking the previously constructed
1697 // Import object. We should be able to do this without interfering
1698 // with the implementation of the local part of sparse matrix-matrix
1699 // multply above.
1700 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1701 labelList->set("Timer Label",label);
1702 // labelList->set("Sort column Map ghost GIDs")
1703 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1704 RCP<const Export<LO,GO,NO> > dummyExport;
1705 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1706 Pview.origMatrix->getDomainMap(),
1707 Acimport,
1708 dummyExport, labelList);
1709 }
1710
1711
1712
1713 } //End namepsace MMdetails
1714
1715} //End namespace Tpetra
1716//
1717// Explicit instantiation macro
1718//
1719// Must be expanded from within the Tpetra namespace!
1720//
1721
1722#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1723 \
1724 template \
1725 void TripleMatrixMultiply::MultiplyRAP( \
1726 const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1727 bool transposeR, \
1728 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1729 bool transposeA, \
1730 const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1731 bool transposeP, \
1732 CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1733 bool call_FillComplete_on_result, \
1734 const std::string & label, \
1735 const Teuchos::RCP<Teuchos::ParameterList>& params); \
1736 \
1737
1738
1739#endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillComplete() const override
Whether the matrix is fill complete.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Distributed sparse triple matrix product.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.