Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11#define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13#include "Tpetra_RowMatrixTransposer.hpp"
14
15namespace Tpetra {
16
17namespace MatrixMatrix{
18
19namespace ExtraKernels{
20
21
22// Double product version
23template<class CrsMatrixType>
24size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
25 // Follows the NZ estimate in ML's ml_matmatmult.c
26 size_t Aest = 100, Best=100;
27 if (A.getLocalNumEntries() > 0)
28 Aest = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
29 if (B.getLocalNumEntries() > 0)
30 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
31
32 size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
33 nnzperrow *= nnzperrow;
34
35 return nnzperrow;
36}
37
38
39// Triple product version
40template<class CrsMatrixType>
41size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
42 size_t nnzPerRowA = 100, Pcols = 100;
43 if (A.getLocalNumEntries() > 0)
44 nnzPerRowA = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 9;
45 if (P.getLocalNumEntries() > 0)
46 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
47 return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
48}
49
50#if defined (HAVE_TPETRA_INST_OPENMP)
51/*********************************************************************************************************/
52template<class Scalar,
53 class LocalOrdinal,
54 class GlobalOrdinal,
55 class LocalOrdinalViewType>
56void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
58 const LocalOrdinalViewType & targetMapToOrigRow,
59 const LocalOrdinalViewType & targetMapToImportRow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
64 const std::string& label,
65 const Teuchos::RCP<Teuchos::ParameterList>& params) {
66#ifdef HAVE_TPETRA_MMM_TIMINGS
67 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
68 using Teuchos::TimeMonitor;
69 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
70#endif
71 using Teuchos::Array;
72 using Teuchos::ArrayRCP;
73 using Teuchos::ArrayView;
74 using Teuchos::RCP;
75 using Teuchos::rcp;
76
77
78 // Lots and lots of typedefs
80 typedef typename KCRS::device_type device_t;
81 typedef typename KCRS::StaticCrsGraphType graph_t;
82 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
83 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
84 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
85 typedef typename KCRS::values_type::non_const_type scalar_view_t;
86
87 // Unmanaged versions of the above
88 //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
89 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
90 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
91
92 typedef Scalar SC;
93 typedef LocalOrdinal LO;
94 typedef GlobalOrdinal GO;
95 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
96 typedef Map<LO,GO,NO> map_type;
97
98 // NOTE (mfh 15 Sep 2017) This is specifically only for
99 // execution_space = Kokkos::OpenMP, so we neither need nor want
100 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
101 typedef NO::execution_space execution_space;
102 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
103
104 // All of the invalid guys
105 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
106 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
107 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
108
109 // Grab the Kokkos::SparseCrsMatrices & inner stuff
110 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
111 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
112
113 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
114 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
115 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
116 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
117
118 c_lno_view_t Irowptr;
119 lno_nnz_view_t Icolind;
120 scalar_view_t Ivals;
121 if(!Bview.importMatrix.is_null()) {
122 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
123 Irowptr = lclB.graph.row_map;
124 Icolind = lclB.graph.entries;
125 Ivals = lclB.values;
126 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
127 }
128
129 // Sizes
130 RCP<const map_type> Ccolmap = C.getColMap();
131 size_t m = Aview.origMatrix->getLocalNumRows();
132 size_t n = Ccolmap->getLocalNumElements();
133 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
134
135 // Get my node / thread info (right from openmp or parameter list)
136 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
137 if(!params.is_null()) {
138 if(params->isParameter("openmp: ltg thread max"))
139 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
140 }
141
142 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
143 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
144 // we will not touch these until the final copyout step
145 lno_nnz_view_t entriesC;
146 scalar_view_t valuesC;
147
148 // add this, since we do the rowptr in place, we could figure this out
149 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
150 lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
151
152 // Thread-local memory
153 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind",thread_max);
154 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values",thread_max);
155
156 double thread_chunk = (double)(m) / thread_max;
157
158 // Run chunks of the matrix independently
159 Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
160 {
161 // Thread coordination stuff
162 size_t my_thread_start = tid * thread_chunk;
163 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
164 size_t my_thread_m = my_thread_stop - my_thread_start;
165
166 // Size estimate
167 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
168
169 // Allocations
170 std::vector<size_t> c_status(n,INVALID);
171
172 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
173 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
174
175 // For each row of A/C
176 size_t CSR_ip = 0, OLD_ip = 0;
177 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
178 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
179 // on the calling process.
180 // JJE 10 Apr 2019 index directly into the rowptr
181 row_mapC(i) = CSR_ip;
182
183 // mfh 27 Sep 2016: For each entry of A in the current row of A
184 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
185 LO Aik = Acolind(k); // local column index of current entry of A
186 const SC Aval = Avals(k); // value of current entry of A
187 if (Aval == SC_ZERO)
188 continue; // skip explicitly stored zero values in A
189
190 if (targetMapToOrigRow(Aik) != LO_INVALID) {
191 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
192 // corresponding to the current entry of A is populated, then
193 // the corresponding row of B is in B_local (i.e., it lives on
194 // the calling process).
195
196 // Local matrix
197 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
198
199 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
200 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
201 LO Bkj = Bcolind(j);
202 LO Cij = Bcol2Ccol(Bkj);
203
204 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
205 // New entry
206 c_status[Cij] = CSR_ip;
207 Ccolind(CSR_ip) = Cij;
208 Cvals(CSR_ip) = Aval*Bvals(j);
209 CSR_ip++;
210
211 } else {
212 Cvals(c_status[Cij]) += Aval*Bvals(j);
213 }
214 }
215
216 } else {
217 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
218 // corresponding to the current entry of A NOT populated (has
219 // a flag "invalid" value), then the corresponding row of B is
220 // in B_local (i.e., it lives on the calling process).
221
222 // Remote matrix
223 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
224 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
225 LO Ikj = Icolind(j);
226 LO Cij = Icol2Ccol(Ikj);
227
228 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
229 // New entry
230 c_status[Cij] = CSR_ip;
231 Ccolind(CSR_ip) = Cij;
232 Cvals(CSR_ip) = Aval*Ivals(j);
233 CSR_ip++;
234
235 } else {
236 Cvals(c_status[Cij]) += Aval*Ivals(j);
237 }
238 }
239 }
240 }
241
242 // Resize for next pass if needed
243 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
244 CSR_alloc *= 2;
245 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
246 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*) Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
247 }
248 OLD_ip = CSR_ip;
249 }
250 thread_total_nnz(tid) = CSR_ip;
251 tl_colind(tid) = Ccolind;
252 tl_values(tid) = Cvals;
253 });
254
255 // Do the copy out
256 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
257
258#ifdef HAVE_TPETRA_MMM_TIMINGS
259 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
260#endif
261 // Sort & set values
262 if (params.is_null() || params->get("sort entries",true))
263 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
264 C.setAllValues(row_mapC,entriesC,valuesC);
265
266}
267
268/*********************************************************************************************************/
269template<class Scalar,
270 class LocalOrdinal,
271 class GlobalOrdinal,
272 class LocalOrdinalViewType>
273void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
274 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
275 const LocalOrdinalViewType & targetMapToOrigRow,
276 const LocalOrdinalViewType & targetMapToImportRow,
277 const LocalOrdinalViewType & Bcol2Ccol,
278 const LocalOrdinalViewType & Icol2Ccol,
279 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
280 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
281 const std::string& label,
282 const Teuchos::RCP<Teuchos::ParameterList>& params) {
283#ifdef HAVE_TPETRA_MMM_TIMINGS
284 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
285 using Teuchos::TimeMonitor;
286 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
287#endif
288
289 using Teuchos::Array;
290 using Teuchos::ArrayRCP;
291 using Teuchos::ArrayView;
292 using Teuchos::RCP;
293 using Teuchos::rcp;
294
295 // Lots and lots of typedefs
297 // typedef typename KCRS::device_type device_t;
298 typedef typename KCRS::StaticCrsGraphType graph_t;
299 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
300 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
301 typedef typename KCRS::values_type::non_const_type scalar_view_t;
302
303 typedef Scalar SC;
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
306 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
307 typedef Map<LO,GO,NO> map_type;
308
309 // NOTE (mfh 15 Sep 2017) This is specifically only for
310 // execution_space = Kokkos::OpenMP, so we neither need nor want
311 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
312 typedef NO::execution_space execution_space;
313 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
314
315 // All of the invalid guys
316 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
317 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
318 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
319
320 // Grab the Kokkos::SparseCrsMatrices & inner stuff
321 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
322 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
323 const KCRS & Cmat = C.getLocalMatrixDevice();
324
325 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
326 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
327 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
328 scalar_view_t Cvals = Cmat.values;
329
330 c_lno_view_t Irowptr;
331 c_lno_nnz_view_t Icolind;
332 scalar_view_t Ivals;
333 if(!Bview.importMatrix.is_null()) {
334 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
335 Irowptr = lclB.graph.row_map;
336 Icolind = lclB.graph.entries;
337 Ivals = lclB.values;
338 }
339
340 // Sizes
341 RCP<const map_type> Ccolmap = C.getColMap();
342 size_t m = Aview.origMatrix->getLocalNumRows();
343 size_t n = Ccolmap->getLocalNumElements();
344
345 // Get my node / thread info (right from openmp or parameter list)
346 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
347 if(!params.is_null()) {
348 if(params->isParameter("openmp: ltg thread max"))
349 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
350 }
351
352 double thread_chunk = (double)(m) / thread_max;
353
354 // Run chunks of the matrix independently
355 Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
356 {
357 // Thread coordination stuff
358 size_t my_thread_start = tid * thread_chunk;
359 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
360
361 // Allocations
362 std::vector<size_t> c_status(n,INVALID);
363
364 // For each row of A/C
365 size_t CSR_ip = 0, OLD_ip = 0;
366 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
367 // First fill the c_status array w/ locations where we're allowed to
368 // generate nonzeros for this row
369 OLD_ip = Crowptr(i);
370 CSR_ip = Crowptr(i+1);
371 for (size_t k = OLD_ip; k < CSR_ip; k++) {
372 c_status[Ccolind(k)] = k;
373 // Reset values in the row of C
374 Cvals(k) = SC_ZERO;
375 }
376
377 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
378 LO Aik = Acolind(k);
379 const SC Aval = Avals(k);
380 if (Aval == SC_ZERO)
381 continue;
382
383 if (targetMapToOrigRow(Aik) != LO_INVALID) {
384 // Local matrix
385 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
386
387 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
388 LO Bkj = Bcolind(j);
389 LO Cij = Bcol2Ccol(Bkj);
390
391 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
392 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
393 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
394
395 Cvals(c_status[Cij]) += Aval * Bvals(j);
396 }
397 } else {
398 // Remote matrix
399 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
400 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
401 LO Ikj = Icolind(j);
402 LO Cij = Icol2Ccol(Ikj);
403
404 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
405 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
406 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
407
408 Cvals(c_status[Cij]) += Aval * Ivals(j);
409 }
410 }
411 }
412 }
413 });
414
415 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
416}
417
418/*********************************************************************************************************/
419template<class Scalar,
420 class LocalOrdinal,
421 class GlobalOrdinal,
422 class LocalOrdinalViewType>
423void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
424 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
425 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
426 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
427 const LocalOrdinalViewType & targetMapToOrigRow,
428 const LocalOrdinalViewType & targetMapToImportRow,
429 const LocalOrdinalViewType & Bcol2Ccol,
430 const LocalOrdinalViewType & Icol2Ccol,
431 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
432 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
433 const std::string& label,
434 const Teuchos::RCP<Teuchos::ParameterList>& params) {
435#ifdef HAVE_TPETRA_MMM_TIMINGS
436 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
437 using Teuchos::TimeMonitor;
438 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
439#endif
440
441 using Teuchos::Array;
442 using Teuchos::ArrayRCP;
443 using Teuchos::ArrayView;
444 using Teuchos::RCP;
445 using Teuchos::rcp;
446
447 // Lots and lots of typedefs
448 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
450 typedef typename KCRS::device_type device_t;
451 typedef typename KCRS::StaticCrsGraphType graph_t;
452 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
453 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
454 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
455 typedef typename KCRS::values_type::non_const_type scalar_view_t;
456
457 // Unmanaged versions of the above
458 //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
459 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
460 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
461
462 // Jacobi-specific
463 typedef typename scalar_view_t::memory_space scalar_memory_space;
464
465 typedef Scalar SC;
466 typedef LocalOrdinal LO;
467 typedef GlobalOrdinal GO;
468 typedef Node NO;
469 typedef Map<LO,GO,NO> map_type;
470
471 // NOTE (mfh 15 Sep 2017) This is specifically only for
472 // execution_space = Kokkos::OpenMP, so we neither need nor want
473 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
474 typedef NO::execution_space execution_space;
475 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
476
477 // All of the invalid guys
478 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
479 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
480 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
481
482 // Grab the Kokkos::SparseCrsMatrices & inner stuff
483 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
484 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
485
486 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
487 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
488 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
489 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
490
491 c_lno_view_t Irowptr;
492 lno_nnz_view_t Icolind;
493 scalar_view_t Ivals;
494 if(!Bview.importMatrix.is_null()) {
495 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
496 Irowptr = lclB.graph.row_map;
497 Icolind = lclB.graph.entries;
498 Ivals = lclB.values;
499 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
500 }
501
502 // Jacobi-specific inner stuff
503 auto Dvals =
504 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
505
506 // Sizes
507 RCP<const map_type> Ccolmap = C.getColMap();
508 size_t m = Aview.origMatrix->getLocalNumRows();
509 size_t n = Ccolmap->getLocalNumElements();
510 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
511
512 // Get my node / thread info (right from openmp)
513 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
514 if(!params.is_null()) {
515 if(params->isParameter("openmp: ltg thread max"))
516 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
517 }
518
519 // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
520 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
521 // we will not touch these until the final copyout step
522 lno_nnz_view_t entriesC;
523 scalar_view_t valuesC;
524
525 // add this, since we do the rowptr in place, we could figure this out
526 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
527 lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
528
529 // Thread-local memory
530 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind",thread_max);
531 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values",thread_max);
532
533 double thread_chunk = (double)(m) / thread_max;
534
535 // Run chunks of the matrix independently
536 Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
537 {
538 // Thread coordination stuff
539 size_t my_thread_start = tid * thread_chunk;
540 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
541 size_t my_thread_m = my_thread_stop - my_thread_start;
542
543 // Size estimate
544 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
545
546 // Allocations
547 std::vector<size_t> c_status(n,INVALID);
548
549 u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
550 u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
551
552 // For each row of A/C
553 size_t CSR_ip = 0, OLD_ip = 0;
554 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
555 // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
556 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
557 // on the calling process.
558 // JJE: directly access the rowptr here (indexed using our loop var)
559 row_mapC(i) = CSR_ip;
560 // NOTE: Vector::getLocalView returns a rank 2 view here
561 SC minusOmegaDval = -omega*Dvals(i,0);
562
563 // Entries of B
564 for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
565 const SC Bval = Bvals(j);
566 if (Bval == SC_ZERO)
567 continue;
568 LO Bij = Bcolind(j);
569 LO Cij = Bcol2Ccol(Bij);
570
571 // Assume no repeated entries in B
572 c_status[Cij] = CSR_ip;
573 Ccolind(CSR_ip) = Cij;
574 Cvals(CSR_ip) = Bvals[j];
575 CSR_ip++;
576 }
577
578 // Entries of -omega * Dinv * A * B
579 // mfh 27 Sep 2016: For each entry of A in the current row of A
580 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
581 LO Aik = Acolind(k); // local column index of current entry of A
582 const SC Aval = Avals(k); // value of current entry of A
583 if (Aval == SC_ZERO)
584 continue; // skip explicitly stored zero values in A
585
586 if (targetMapToOrigRow(Aik) != LO_INVALID) {
587 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
588 // corresponding to the current entry of A is populated, then
589 // the corresponding row of B is in B_local (i.e., it lives on
590 // the calling process).
591
592 // Local matrix
593 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
594
595 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
596 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
597 LO Bkj = Bcolind(j);
598 LO Cij = Bcol2Ccol(Bkj);
599
600 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
601 // New entry
602 c_status[Cij] = CSR_ip;
603 Ccolind(CSR_ip) = Cij;
604 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
605 CSR_ip++;
606
607 } else {
608 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
609 }
610 }
611
612 } else {
613 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
614 // corresponding to the current entry of A NOT populated (has
615 // a flag "invalid" value), then the corresponding row of B is
616 // in B_local (i.e., it lives on the calling process).
617
618 // Remote matrix
619 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
620 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
621 LO Ikj = Icolind(j);
622 LO Cij = Icol2Ccol(Ikj);
623
624 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
625 // New entry
626 c_status[Cij] = CSR_ip;
627 Ccolind(CSR_ip) = Cij;
628 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
629 CSR_ip++;
630
631 } else {
632 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
633 }
634 }
635 }
636 }
637
638 // Resize for next pass if needed
639 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
640 CSR_alloc *= 2;
641 Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
642 Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*) Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
643 }
644 OLD_ip = CSR_ip;
645 }
646 thread_total_nnz(tid) = CSR_ip;
647 tl_colind(tid) = Ccolind;
648 tl_values(tid) = Cvals;
649 });
650
651
652 // Do the copy out (removed the tl_rowptr!)
653 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
654
655
656#ifdef HAVE_TPETRA_MMM_TIMINGS
657 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
658#endif
659 // Sort & set values
660 if (params.is_null() || params->get("sort entries",true))
661 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
662 C.setAllValues(row_mapC,entriesC,valuesC);
663
664}
665
666
667
668/*********************************************************************************************************/
669template<class Scalar,
670 class LocalOrdinal,
671 class GlobalOrdinal,
672 class LocalOrdinalViewType>
673void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
674 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
675 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
676 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
677 const LocalOrdinalViewType & targetMapToOrigRow,
678 const LocalOrdinalViewType & targetMapToImportRow,
679 const LocalOrdinalViewType & Bcol2Ccol,
680 const LocalOrdinalViewType & Icol2Ccol,
681 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
682 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
683 const std::string& label,
684 const Teuchos::RCP<Teuchos::ParameterList>& params) {
685#ifdef HAVE_TPETRA_MMM_TIMINGS
686 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
687 using Teuchos::TimeMonitor;
688 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
689#endif
690 using Teuchos::Array;
691 using Teuchos::ArrayRCP;
692 using Teuchos::ArrayView;
693 using Teuchos::RCP;
694 using Teuchos::rcp;
695
696 // Lots and lots of typedefs
697 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
699 // typedef typename KCRS::device_type device_t;
700 typedef typename KCRS::StaticCrsGraphType graph_t;
701 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
702 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
703 typedef typename KCRS::values_type::non_const_type scalar_view_t;
704
705 // Jacobi-specific
706 typedef typename scalar_view_t::memory_space scalar_memory_space;
707
708 typedef Scalar SC;
709 typedef LocalOrdinal LO;
710 typedef GlobalOrdinal GO;
711 typedef Node NO;
712 typedef Map<LO,GO,NO> map_type;
713
714 // NOTE (mfh 15 Sep 2017) This is specifically only for
715 // execution_space = Kokkos::OpenMP, so we neither need nor want
716 // KOKKOS_LAMBDA (with its mandatory __device__ marking).
717 typedef NO::execution_space execution_space;
718 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
719
720 // All of the invalid guys
721 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
722 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
723 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
724
725 // Grab the Kokkos::SparseCrsMatrices & inner stuff
726 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
727 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
728 const KCRS & Cmat = C.getLocalMatrixDevice();
729
730 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
731 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
732 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
733 scalar_view_t Cvals = Cmat.values;
734
735 c_lno_view_t Irowptr;
736 c_lno_nnz_view_t Icolind;
737 scalar_view_t Ivals;
738 if(!Bview.importMatrix.is_null()) {
739 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
740 Irowptr = lclB.graph.row_map;
741 Icolind = lclB.graph.entries;
742 Ivals = lclB.values;
743 }
744
745 // Jacobi-specific inner stuff
746 auto Dvals =
747 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
748
749 // Sizes
750 RCP<const map_type> Ccolmap = C.getColMap();
751 size_t m = Aview.origMatrix->getLocalNumRows();
752 size_t n = Ccolmap->getLocalNumElements();
753
754 // Get my node / thread info (right from openmp or parameter list)
755 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
756 if(!params.is_null()) {
757 if(params->isParameter("openmp: ltg thread max"))
758 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
759 }
760
761 double thread_chunk = (double)(m) / thread_max;
762
763 // Run chunks of the matrix independently
764 Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
765 {
766 // Thread coordination stuff
767 size_t my_thread_start = tid * thread_chunk;
768 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
769
770 // Allocations
771 std::vector<size_t> c_status(n,INVALID);
772
773 // For each row of A/C
774 size_t CSR_ip = 0, OLD_ip = 0;
775 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
776 // First fill the c_status array w/ locations where we're allowed to
777 // generate nonzeros for this row
778 OLD_ip = Crowptr(i);
779 CSR_ip = Crowptr(i+1);
780 // NOTE: Vector::getLocalView returns a rank 2 view here
781 SC minusOmegaDval = -omega*Dvals(i,0);
782
783 for (size_t k = OLD_ip; k < CSR_ip; k++) {
784 c_status[Ccolind(k)] = k;
785 // Reset values in the row of C
786 Cvals(k) = SC_ZERO;
787 }
788
789 // Entries of B
790 for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
791 const SC Bval = Bvals(j);
792 if (Bval == SC_ZERO)
793 continue;
794 LO Bij = Bcolind(j);
795 LO Cij = Bcol2Ccol(Bij);
796
797 // Assume no repeated entries in B
798 Cvals(c_status[Cij]) += Bvals(j);
799 CSR_ip++;
800 }
801
802
803 for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
804 LO Aik = Acolind(k);
805 const SC Aval = Avals(k);
806 if (Aval == SC_ZERO)
807 continue;
808
809 if (targetMapToOrigRow(Aik) != LO_INVALID) {
810 // Local matrix
811 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
812
813 for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
814 LO Bkj = Bcolind(j);
815 LO Cij = Bcol2Ccol(Bkj);
816
817 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
818 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
819 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
820
821 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
822 }
823 } else {
824 // Remote matrix
825 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
826 for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
827 LO Ikj = Icolind(j);
828 LO Cij = Icol2Ccol(Ikj);
829
830 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
831 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
832 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
833
834 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
835 }
836 }
837 }
838 }
839 });
840
841 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
842}
843
844
845/*********************************************************************************************************/
846template<class InColindArrayType, class InValsArrayType,
847 class OutRowptrType, class OutColindType, class OutValsType>
848void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
849 const InColindArrayType& Incolind,
850 const InValsArrayType& Invalues,
851 const size_t m,
852 const double thread_chunk,
853 OutRowptrType& row_mapC,
854 OutColindType& entriesC,
855 OutValsType& valuesC ) {
856 typedef OutRowptrType lno_view_t;
857 typedef OutColindType lno_nnz_view_t;
858 typedef OutValsType scalar_view_t;
859 typedef typename lno_view_t::execution_space execution_space;
860 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
861
862 // Generate the starting nnz number per thread
863 size_t thread_max = Incolind.size();
864 size_t c_nnz_size=0;
865 // since this kernel is 'low thread count' is very likely, that we could
866 // sum over the thread_total_nnz and not go parallel, but since it is a view
867 // we don't know where the memory actually lives... so we kinda need to go parallel
868
869 lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
870 Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
871 size_t mynnz = thread_total_nnz(i);
872 if(final) thread_start_nnz(i) = update;
873 update+=mynnz;
874 if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
875 });
876 c_nnz_size = thread_start_nnz(thread_max);
877
878 // 2019 Apr 10 JJE: update the rowptr's final entry here
879 row_mapC(m) = thread_start_nnz(thread_max);
880
881 // Allocate output
882 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
883 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
884
885 // Copy out
886 Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
887 const size_t my_thread_start = tid * thread_chunk;
888 const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
889 const size_t nnz_thread_start = thread_start_nnz(tid);
890 // we need this info, since we did the rowptr in place
891 const size_t nnz_thread_stop = thread_start_nnz(tid+1);
892
893 // There are two fundamental operations:
894 // * Updateing the rowptr with the correct offset
895 // * copying entries and values
896
897 // First, update the rowptr, it since we did it inplace, it is a += operation
898 // in the paper, I experimented with a coloring scheme that had threads do
899 // do their copies in different orders. It wasn't obvious it was beneficial
900 // but it can be replicated, by choosing the array to copy first based on your
901 // thread id % 3
902 if (my_thread_start != 0 ) {
903 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
904 row_mapC(i) += nnz_thread_start;
905 }
906 }
907
908 // try to Kokkos::single() the alloc here. It should implicitly barrier
909 // thread 0 doesn't copy the rowptr, so it could hit the single first
910 // in the paper, I played a game that effectively got LTG down to a single
911 // OpenMP barrier. But doing that requires the ability to spawn a single
912 // parallel region. The Scan above was implemented using atomic adds
913 // and the barrier was only needed so you could allocate
914 //
915 // Since we can't spawn a single region, we could move the allocations
916 // here, and using 'single'. Most likely, thread 0 will hit it first allowing
917 // the other threads to update the rowptr while it allocates.
918
919
920 // next, bulk copy the vals/colind arrays
921 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
922 for (size_t i = 0; i < my_num_nnz; ++i) {
923 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
924 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
925 }
926
927 //Free the unamanged views, let each thread deallocate its memory
928 // May need to cast away const here..
929 if(Incolind(tid).data()) free(Incolind(tid).data());
930 if(Invalues(tid).data()) free(Invalues(tid).data());
931 });
932}//end copy_out
933
934#endif // OpenMP
935
936
937/*********************************************************************************************************/
938template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
939void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
940 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
941 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
942 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
943 const LocalOrdinalViewType & Acol2Brow,
944 const LocalOrdinalViewType & Acol2Irow,
945 const LocalOrdinalViewType & Bcol2Ccol,
946 const LocalOrdinalViewType & Icol2Ccol,
947 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
948 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
949 const std::string& label,
950 const Teuchos::RCP<Teuchos::ParameterList>& params) {
951#ifdef HAVE_TPETRA_MMM_TIMINGS
952 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
953 using Teuchos::TimeMonitor;
954 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
955 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
956 using Teuchos::rcp;
957#endif
958 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
959
960 // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
961
962 // 1) Multiply A*B
963 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
964 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
965
966#ifdef HAVE_TPETRA_MMM_TIMINGS
967 MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
968#endif
969
970 // 2) Scale A by Dinv
971 AB->leftScale(Dinv);
972
973#ifdef HAVE_TPETRA_MMM_TIMINGS
974 MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
975#endif
976
977 // 3) Add [-omega Dinv A] + B
978 Teuchos::ParameterList jparams;
979 if(!params.is_null()) {
980 jparams = *params;
981 jparams.set("label",label+std::string(" MSAK Add"));
982 }
983 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
984 Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
985#ifdef HAVE_TPETRA_MMM_TIMINGS
986 MM2=Teuchos::null;
987#endif
988 }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
989
990
991
992#if defined (HAVE_TPETRA_INST_OPENMP)
993/*********************************************************************************************************/
994template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
995static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
996 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
997 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
998 const LocalOrdinalViewType & Acol2PRow,
999 const LocalOrdinalViewType & Acol2PRowImport,
1000 const LocalOrdinalViewType & Pcol2Accol,
1001 const LocalOrdinalViewType & PIcol2Accol,
1002 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1003 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1004 const std::string& label,
1005 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1006
1007 using Teuchos::RCP;
1008 using Tpetra::MatrixMatrix::UnmanagedView;
1009 #ifdef HAVE_TPETRA_MMM_TIMINGS
1010 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1011 using Teuchos::rcp;
1012 using Teuchos::TimeMonitor;
1013 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1014 #endif
1015
1016 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1017 typedef Scalar SC;
1018 typedef LocalOrdinal LO;
1019 typedef GlobalOrdinal GO;
1020 typedef Node NO;
1021 typedef Map<LO,GO,NO> map_type;
1023 typedef typename KCRS::StaticCrsGraphType graph_t;
1024 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1025 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1026 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1027 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1028 typedef typename KCRS::device_type device_t;
1029 typedef typename device_t::execution_space execution_space;
1030 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1031
1032 // Unmanaged versions of the above
1033 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1034 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1035 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1036
1037 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1038 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1039
1040 // Sizes
1041 RCP<const map_type> Accolmap = Ac.getColMap();
1042 size_t m = Rview.origMatrix->getLocalNumRows();
1043 size_t n = Accolmap->getLocalNumElements();
1044
1045 // Get raw Kokkos matrices, and the raw CSR views
1046 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1047 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1048 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1049
1050 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1051 Arowptr = Amat.graph.row_map,
1052 Prowptr = Pmat.graph.row_map, Irowptr;
1053 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1054 Acolind = Amat.graph.entries,
1055 Pcolind = Pmat.graph.entries;
1056 lno_nnz_view_t Icolind;
1057 const scalar_view_t Rvals = Rmat.values,
1058 Avals = Amat.values,
1059 Pvals = Pmat.values;
1060 scalar_view_t Ivals;
1061
1062 if (!Pview.importMatrix.is_null())
1063 {
1064 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1065 Irowptr = Imat.graph.row_map;
1066 Icolind = Imat.graph.entries;
1067 Ivals = Imat.values;
1068 }
1069
1070 // Classic csr assembly (low memory edition)
1071 //
1072 // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1073 // The method loops over rows of R, and may resize after processing
1074 // each row. Chris Siefert says that this reflects experience in
1075 // ML; for the non-threaded case, ML found it faster to spend less
1076 // effort on estimation and risk an occasional reallocation.
1077
1078 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1079 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1080
1081 // Get my node / thread info (right from openmp or parameter list)
1082 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1083 if(!params.is_null()) {
1084 if(params->isParameter("openmp: ltg thread max"))
1085 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1086 }
1087
1088 double thread_chunk = (double)(m) / thread_max;
1089
1090 // we can construct the rowptr inplace, allocate here and fault in parallel
1091 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1092 // we do not touch these until copy out
1093 lno_nnz_view_t entriesAc;
1094 scalar_view_t valuesAc;
1095
1096 // add this, since we do the rowptr in place, we could figure this out
1097 // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1098 lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
1099
1100 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1101 // routine. The routine computes Ac := R * A * (P_local + P_remote).
1102 //
1103 // For column index Aik in row i of A, Acol2PRow[Aik] tells
1104 // you whether the corresponding row of P belongs to P_local
1105 // ("orig") or P_remote ("Import").
1106
1107 // Thread-local memory
1108 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
1109 Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
1110
1111 // For each row of R
1112 Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1113 {
1114 // Thread coordination stuff
1115 size_t my_thread_start = tid * thread_chunk;
1116 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1117 size_t my_thread_m = my_thread_stop - my_thread_start;
1118
1119 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1120
1121 std::vector<size_t> ac_status(n, INVALID);
1122
1123 //manually allocate the thread-local storage for Ac
1124 u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1125 u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1126 u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1127
1128 //count entries as they are added to Ac
1129 size_t nnz = 0, nnz_old = 0;
1130 // bmk: loop over the rows of R which are assigned to thread tid
1131 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1132 // directly index into the rowptr
1133 rowmapAc(i) = nnz;
1134 // mfh 27 Sep 2016: For each entry of R in the current row of R
1135 for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1136 LO k = Rcolind(kk); // local column index of current entry of R
1137 const SC Rik = Rvals(kk); // value of current entry of R
1138 if (Rik == SC_ZERO)
1139 continue; // skip explicitly stored zero values in R
1140 // For each entry of A in the current row of A
1141 for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1142 LO l = Acolind(ll); // local column index of current entry of A
1143 const SC Akl = Avals(ll); // value of current entry of A
1144 if (Akl == SC_ZERO)
1145 continue; // skip explicitly stored zero values in A
1146
1147 if (Acol2PRow[l] != LO_INVALID) {
1148 // mfh 27 Sep 2016: If the entry of Acol2PRow
1149 // corresponding to the current entry of A is populated, then
1150 // the corresponding row of P is in P_local (i.e., it lives on
1151 // the calling process).
1152
1153 // Local matrix
1154 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1155
1156 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1157 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1158 LO j = Pcolind(jj);
1159 LO Acj = Pcol2Accol(j);
1160 SC Plj = Pvals(jj);
1161
1162 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1163 #ifdef HAVE_TPETRA_DEBUG
1164 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1165 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1166 std::runtime_error,
1167 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1168 #endif
1169 // New entry
1170 ac_status[Acj] = nnz;
1171 Accolind(nnz) = Acj;
1172 Acvals(nnz) = Rik*Akl*Plj;
1173 nnz++;
1174 } else {
1175 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1176 }
1177 }
1178 } else {
1179 // mfh 27 Sep 2016: If the entry of Acol2PRow
1180 // corresponding to the current entry of A is NOT populated (has
1181 // a flag "invalid" value), then the corresponding row of P is
1182 // in P_remote (i.e., it does not live on the calling process).
1183
1184 // Remote matrix
1185 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1186 for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1187 LO j = Icolind(jj);
1188 LO Acj = PIcol2Accol(j);
1189 SC Plj = Ivals(jj);
1190
1191 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1192 #ifdef HAVE_TPETRA_DEBUG
1193 // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1194 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1195 std::runtime_error,
1196 label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1197 #endif
1198 // New entry
1199 ac_status[Acj] = nnz;
1200 Accolind(nnz) = Acj;
1201 Acvals(nnz) = Rik*Akl*Plj;
1202 nnz++;
1203 } else {
1204 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1205 }
1206 }
1207 }
1208 }
1209 }
1210 // Resize for next pass if needed
1211 if (nnz + n > nnzAllocated) {
1212 nnzAllocated *= 2;
1213 Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1214 Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc((void*) Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1215 }
1216 nnz_old = nnz;
1217 }
1218 thread_total_nnz(tid) = nnz;
1219 tl_colind(tid) = Accolind;
1220 tl_values(tid) = Acvals;
1221 });
1222 #ifdef HAVE_TPETRA_MMM_TIMINGS
1223 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1224 #endif
1225
1226 copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1227
1228 #ifdef HAVE_TPETRA_MMM_TIMINGS
1229 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1230 #endif
1231
1232 // Final sort & set of CRS arrays
1233 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1234 // mfh 27 Sep 2016: This just sets pointers.
1235 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1236
1237 #ifdef HAVE_TPETRA_MMM_TIMINGS
1238 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1239 #endif
1240
1241 // Final FillComplete
1242 //
1243 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1244 // Import (from domain Map to column Map) construction (which costs
1245 // lots of communication) by taking the previously constructed
1246 // Import object. We should be able to do this without interfering
1247 // with the implementation of the local part of sparse matrix-matrix
1248 // multiply above.
1249 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1250 labelList->set("Timer Label",label);
1251 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1252 RCP<const Export<LO,GO,NO> > dummyExport;
1253 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1254 Rview.origMatrix->getRangeMap(),
1255 Acimport,
1256 dummyExport,
1257 labelList);
1258}
1259
1260
1261
1262/*********************************************************************************************************/
1263template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1264static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1265 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1267 const LocalOrdinalViewType & Acol2PRow,
1268 const LocalOrdinalViewType & Acol2PRowImport,
1269 const LocalOrdinalViewType & Pcol2Accol,
1270 const LocalOrdinalViewType & PIcol2Accol,
1271 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1272 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1273 const std::string& label,
1274 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1275
1276 using Teuchos::RCP;
1277 using Tpetra::MatrixMatrix::UnmanagedView;
1278 #ifdef HAVE_TPETRA_MMM_TIMINGS
1279 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1280 using Teuchos::TimeMonitor;
1281 using Teuchos::rcp;
1282 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1283 #endif
1284
1285 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1286 typedef Scalar SC;
1287 typedef LocalOrdinal LO;
1288 typedef GlobalOrdinal GO;
1289 typedef Node NO;
1290 typedef Map<LO,GO,NO> map_type;
1292 typedef typename KCRS::StaticCrsGraphType graph_t;
1293 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1294 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1295 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1296 typedef typename KCRS::device_type device_t;
1297 typedef typename device_t::execution_space execution_space;
1298 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1299
1300 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1301 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1302 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1303
1304 // Sizes
1305 RCP<const map_type> Accolmap = Ac.getColMap();
1306 size_t m = Rview.origMatrix->getLocalNumRows();
1307 size_t n = Accolmap->getLocalNumElements();
1308
1309 // Get raw Kokkos matrices, and the raw CSR views
1310 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1311 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1312 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1313 const KCRS & Cmat = Ac.getLocalMatrixDevice();
1314
1315 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1316 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1317 lno_nnz_view_t Icolind;
1318 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1319 scalar_view_t Cvals = Cmat.values;
1320 scalar_view_t Ivals;
1321
1322 if (!Pview.importMatrix.is_null())
1323 {
1324 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1325 Irowptr = Imat.graph.row_map;
1326 Icolind = Imat.graph.entries;
1327 Ivals = Imat.values;
1328 }
1329
1330 // Get my node / thread info (right from openmp or parameter list)
1331 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1332 if(!params.is_null()) {
1333 if(params->isParameter("openmp: ltg thread max"))
1334 thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1335 }
1336
1337 double thread_chunk = (double)(m) / thread_max;
1338
1339 // For each row of R
1340 Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1341 {
1342 // Thread coordination stuff
1343 size_t my_thread_start = tid * thread_chunk;
1344 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1345
1346 std::vector<size_t> c_status(n, INVALID);
1347
1348 //count entries as they are added to Ac
1349 size_t OLD_ip = 0, CSR_ip = 0;
1350 // bmk: loop over the rows of R which are assigned to thread tid
1351 for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1352 // First fill the c_status array w/ locations where we're allowed to
1353 // generate nonzeros for this row
1354 OLD_ip = Crowptr(i);
1355 CSR_ip = Crowptr(i+1);
1356 for (size_t k = OLD_ip; k < CSR_ip; k++) {
1357 c_status[Ccolind(k)] = k;
1358 // Reset values in the row of C
1359 Cvals(k) = SC_ZERO;
1360 }
1361
1362 // mfh 27 Sep 2016: For each entry of R in the current row of R
1363 for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1364 LO k = Rcolind(kk); // local column index of current entry of R
1365 const SC Rik = Rvals(kk); // value of current entry of R
1366 if (Rik == SC_ZERO)
1367 continue; // skip explicitly stored zero values in R
1368 // For each entry of A in the current row of A
1369 for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1370 LO l = Acolind(ll); // local column index of current entry of A
1371 const SC Akl = Avals(ll); // value of current entry of A
1372 if (Akl == SC_ZERO)
1373 continue; // skip explicitly stored zero values in A
1374
1375 if (Acol2PRow[l] != LO_INVALID) {
1376 // mfh 27 Sep 2016: If the entry of Acol2PRow
1377 // corresponding to the current entry of A is populated, then
1378 // the corresponding row of P is in P_local (i.e., it lives on
1379 // the calling process).
1380
1381 // Local matrix
1382 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1383
1384 // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1385 for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1386 LO j = Pcolind(jj);
1387 LO Cij = Pcol2Accol(j);
1388 SC Plj = Pvals(jj);
1389 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1390 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1391 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1392
1393
1394 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1395 }
1396 } else {
1397 // mfh 27 Sep 2016: If the entry of Acol2PRow
1398 // corresponding to the current entry of A is NOT populated (has
1399 // a flag "invalid" value), then the corresponding row of P is
1400 // in P_remote (i.e., it does not live on the calling process).
1401
1402 // Remote matrix
1403 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1404 for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1405 LO j = Icolind(jj);
1406 LO Cij = PIcol2Accol(j);
1407 SC Plj = Ivals(jj);
1408 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1409 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1410 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1411
1412 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1413 }
1414 }
1415 }
1416 }
1417 }
1418 });
1419 // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1420}
1421
1422
1423
1424#endif
1425
1426
1427}//ExtraKernels
1428}//MatrixMatrix
1429}//Tpetra
1430
1431
1432#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Distributed sparse matrix-matrix multiply and add.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Namespace Tpetra contains the class and methods constituting the Tpetra library.