10#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
13#include "Tpetra_Details_IntRowPtrHelper.hpp"
15#ifdef HAVE_TPETRA_INST_CUDA
24 class LocalOrdinalViewType>
25struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
26 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28 const LocalOrdinalViewType & Acol2Brow,
29 const LocalOrdinalViewType & Acol2Irow,
30 const LocalOrdinalViewType & Bcol2Ccol,
31 const LocalOrdinalViewType & Icol2Ccol,
32 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34 const std::string& label = std::string(),
35 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
39 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
40 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
41 const LocalOrdinalViewType & Acol2Brow,
42 const LocalOrdinalViewType & Acol2Irow,
43 const LocalOrdinalViewType & Bcol2Ccol,
44 const LocalOrdinalViewType & Icol2Ccol,
45 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
46 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
47 const std::string& label = std::string(),
48 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
55 class GlobalOrdinal,
class LocalOrdinalViewType>
56struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
57 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
58 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
59 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
60 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
61 const LocalOrdinalViewType & Acol2Brow,
62 const LocalOrdinalViewType & Acol2Irow,
63 const LocalOrdinalViewType & Bcol2Ccol,
64 const LocalOrdinalViewType & Icol2Ccol,
65 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
66 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
67 const std::string& label = std::string(),
68 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
70 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
71 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
72 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
73 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
74 const LocalOrdinalViewType & Acol2Brow,
75 const LocalOrdinalViewType & Acol2Irow,
76 const LocalOrdinalViewType & Bcol2Ccol,
77 const LocalOrdinalViewType & Icol2Ccol,
78 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
79 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
80 const std::string& label = std::string(),
81 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
83 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
84 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
85 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
86 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
87 const LocalOrdinalViewType & Acol2Brow,
88 const LocalOrdinalViewType & Acol2Irow,
89 const LocalOrdinalViewType & Bcol2Ccol,
90 const LocalOrdinalViewType & Icol2Ccol,
91 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
92 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
93 const std::string& label = std::string(),
94 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
100template<
class Scalar,
103 class LocalOrdinalViewType>
104void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
105 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
106 const LocalOrdinalViewType & Acol2Brow,
107 const LocalOrdinalViewType & Acol2Irow,
108 const LocalOrdinalViewType & Bcol2Ccol,
109 const LocalOrdinalViewType & Icol2Ccol,
110 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
111 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
112 const std::string& label,
113 const Teuchos::RCP<Teuchos::ParameterList>& params) {
116#ifdef HAVE_TPETRA_MMM_TIMINGS
117 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
118 using Teuchos::TimeMonitor;
119 Teuchos::RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaWrapper")))));
122 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
123 std::string nodename(
"Cuda");
128 typedef typename KCRS::device_type device_t;
129 typedef typename KCRS::StaticCrsGraphType graph_t;
130 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
131 using int_view_t = Kokkos::View<int *, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
132 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
133 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
134 typedef typename KCRS::values_type::non_const_type scalar_view_t;
138 int team_work_size = 16;
139 std::string myalg(
"SPGEMM_KK_MEMORY");
140 if(!params.is_null()) {
141 if(params->isParameter(
"cuda: algorithm"))
142 myalg = params->get(
"cuda: algorithm",myalg);
143 if(params->isParameter(
"cuda: team work size"))
144 team_work_size = params->get(
"cuda: team work size",team_work_size);
148 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
149 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
150 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space > KernelHandle;
151 using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
152 typename int_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
153 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
156 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
157 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
159 c_lno_view_t Arowptr = Amat.graph.row_map,
160 Browptr = Bmat.graph.row_map;
161 const lno_nnz_view_t Acolind = Amat.graph.entries,
162 Bcolind = Bmat.graph.entries;
163 const scalar_view_t Avals = Amat.values,
167 std::string alg = nodename+std::string(
" algorithm");
169 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
170 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
173 KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
181#if defined(KOKKOS_ENABLE_CUDA) \
182 && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) \
183 && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
184 if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
185 if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
186 KokkosSparse::sort_crs_matrix(Bmerged);
191#ifdef HAVE_TPETRA_MMM_TIMINGS
192 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaCore"))));
196 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
197 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
198 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
201 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lno_row"), AnumRows + 1);
202 lno_nnz_view_t entriesC;
203 scalar_view_t valuesC;
205 Tpetra::Details::IntRowPtrHelper<
decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
206 const bool useIntRowptrs =
207 irph.shouldUseIntRowptrs() &&
208 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
212 kh.create_spgemm_handle(alg_enum);
213 kh.set_team_work_size(team_work_size);
215 int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_int_row"), AnumRows + 1);
217 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
218 auto Bint = irph.getIntRowptrMatrix(Bmerged);
219 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,
false,Bint.graph.row_map,Bint.graph.entries,
false, int_row_mapC);
221 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
223 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
224 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
226 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,Aint.values,
false,Bint.graph.row_map,Bint.graph.entries,Bint.values,
false,int_row_mapC,entriesC,valuesC);
228 Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(
int i){ row_mapC(i) = int_row_mapC(i);});
229 kh.destroy_spgemm_handle();
233 kh.create_spgemm_handle(alg_enum);
234 kh.set_team_work_size(team_work_size);
236 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,
false,Bmerged.graph.row_map,Bmerged.graph.entries,
false,row_mapC);
238 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
240 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
241 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
244 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,
false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,
false,row_mapC,entriesC,valuesC);
246 kh.destroy_spgemm_handle();
250#ifdef HAVE_TPETRA_MMM_TIMINGS
251 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaSort"))));
255 if (params.is_null() || params->get(
"sort entries",
true))
256 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
257 C.setAllValues(row_mapC,entriesC,valuesC);
259#ifdef HAVE_TPETRA_MMM_TIMINGS
260 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaESFC"))));
264 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
265 labelList->set(
"Timer Label",label);
266 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
267 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
268 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
273template<
class Scalar,
276 class LocalOrdinalViewType>
277void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
278 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
279 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
280 const LocalOrdinalViewType & targetMapToOrigRow_dev,
281 const LocalOrdinalViewType & targetMapToImportRow_dev,
282 const LocalOrdinalViewType & Bcol2Ccol_dev,
283 const LocalOrdinalViewType & Icol2Ccol_dev,
284 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
285 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
286 const std::string& label,
287 const Teuchos::RCP<Teuchos::ParameterList>& params) {
290 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
292#ifdef HAVE_TPETRA_MMM_TIMINGS
293 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
294 using Teuchos::TimeMonitor;
295 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
296 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
303 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
304 typedef typename KCRS::StaticCrsGraphType graph_t;
305 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
306 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
307 typedef typename KCRS::values_type::non_const_type scalar_view_t;
310 typedef LocalOrdinal LO;
311 typedef GlobalOrdinal GO;
313 typedef Map<LO,GO,NO> map_type;
314 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
315 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
316 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
324 auto targetMapToOrigRow =
325 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
326 targetMapToOrigRow_dev);
327 auto targetMapToImportRow =
328 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
329 targetMapToImportRow_dev);
331 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
334 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
338 RCP<const map_type> Ccolmap = C.getColMap();
339 size_t m = Aview.origMatrix->getLocalNumRows();
340 size_t n = Ccolmap->getLocalNumElements();
343 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
344 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
345 const KCRS & Cmat = C.getLocalMatrixHost();
347 c_lno_view_t Arowptr = Amat.graph.row_map,
348 Browptr = Bmat.graph.row_map,
349 Crowptr = Cmat.graph.row_map;
350 const lno_nnz_view_t Acolind = Amat.graph.entries,
351 Bcolind = Bmat.graph.entries,
352 Ccolind = Cmat.graph.entries;
353 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
354 scalar_view_t Cvals = Cmat.values;
356 c_lno_view_t Irowptr;
357 lno_nnz_view_t Icolind;
359 if(!Bview.importMatrix.is_null()) {
360 auto lclB = Bview.importMatrix->getLocalMatrixHost();
361 Irowptr = lclB.graph.row_map;
362 Icolind = lclB.graph.entries;
366#ifdef HAVE_TPETRA_MMM_TIMINGS
367 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
379 std::vector<size_t> c_status(n, ST_INVALID);
382 size_t CSR_ip = 0, OLD_ip = 0;
383 for (
size_t i = 0; i < m; i++) {
387 CSR_ip = Crowptr[i+1];
388 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
389 c_status[Ccolind[k]] = k;
395 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
397 const SC Aval = Avals[k];
401 if (targetMapToOrigRow[Aik] != LO_INVALID) {
403 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
405 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
407 LO Cij = Bcol2Ccol[Bkj];
409 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
410 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
411 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
413 Cvals[c_status[Cij]] += Aval * Bvals[j];
418 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
419 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
421 LO Cij = Icol2Ccol[Ikj];
423 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
425 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
427 Cvals[c_status[Cij]] += Aval * Ivals[j];
433 C.fillComplete(C.getDomainMap(), C.getRangeMap());
437template<
class Scalar,
440 class LocalOrdinalViewType>
441void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
442 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
443 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
444 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
445 const LocalOrdinalViewType & Acol2Brow,
446 const LocalOrdinalViewType & Acol2Irow,
447 const LocalOrdinalViewType & Bcol2Ccol,
448 const LocalOrdinalViewType & Icol2Ccol,
449 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
450 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
451 const std::string& label,
452 const Teuchos::RCP<Teuchos::ParameterList>& params) {
454#ifdef HAVE_TPETRA_MMM_TIMINGS
455 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
456 using Teuchos::TimeMonitor;
457 Teuchos::RCP<TimeMonitor> MM;
465 std::string myalg(
"KK");
466 if(!params.is_null()) {
467 if(params->isParameter(
"cuda: jacobi algorithm"))
468 myalg = params->get(
"cuda: jacobi algorithm",myalg);
471 if(myalg ==
"MSAK") {
472 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
474 else if(myalg ==
"KK") {
475 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
478 throw std::runtime_error(
"Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
481#ifdef HAVE_TPETRA_MMM_TIMINGS
482 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
486 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
487 labelList->set(
"Timer Label",label);
488 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
491 if(!C.isFillComplete()) {
492 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
493 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
501template<
class Scalar,
504 class LocalOrdinalViewType>
505void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
506 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
507 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
508 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
509 const LocalOrdinalViewType & targetMapToOrigRow_dev,
510 const LocalOrdinalViewType & targetMapToImportRow_dev,
511 const LocalOrdinalViewType & Bcol2Ccol_dev,
512 const LocalOrdinalViewType & Icol2Ccol_dev,
513 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
514 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
515 const std::string& label,
516 const Teuchos::RCP<Teuchos::ParameterList>& params) {
519 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
521#ifdef HAVE_TPETRA_MMM_TIMINGS
522 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
523 using Teuchos::TimeMonitor;
524 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore"))));
525 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
531 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
532 typedef typename KCRS::StaticCrsGraphType graph_t;
533 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
534 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
535 typedef typename KCRS::values_type::non_const_type scalar_view_t;
536 typedef typename scalar_view_t::memory_space scalar_memory_space;
539 typedef LocalOrdinal LO;
540 typedef GlobalOrdinal GO;
542 typedef Map<LO,GO,NO> map_type;
543 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
544 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
545 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
553 auto targetMapToOrigRow =
554 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
555 targetMapToOrigRow_dev);
556 auto targetMapToImportRow =
557 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
558 targetMapToImportRow_dev);
560 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
563 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
568 RCP<const map_type> Ccolmap = C.getColMap();
569 size_t m = Aview.origMatrix->getLocalNumRows();
570 size_t n = Ccolmap->getLocalNumElements();
573 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
574 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
575 const KCRS & Cmat = C.getLocalMatrixHost();
577 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
578 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
579 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
580 scalar_view_t Cvals = Cmat.values;
582 c_lno_view_t Irowptr;
583 lno_nnz_view_t Icolind;
585 if(!Bview.importMatrix.is_null()) {
586 auto lclB = Bview.importMatrix->getLocalMatrixHost();
587 Irowptr = lclB.graph.row_map;
588 Icolind = lclB.graph.entries;
594 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
596#ifdef HAVE_TPETRA_MMM_TIMINGS
597 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore - Compare"))));
605 std::vector<size_t> c_status(n, ST_INVALID);
608 size_t CSR_ip = 0, OLD_ip = 0;
609 for (
size_t i = 0; i < m; i++) {
614 CSR_ip = Crowptr[i+1];
615 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
616 c_status[Ccolind[k]] = k;
622 SC minusOmegaDval = -omega*Dvals(i,0);
625 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
626 Scalar Bval = Bvals[j];
630 LO Cij = Bcol2Ccol[Bij];
632 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
633 std::runtime_error,
"Trying to insert a new entry into a static graph");
635 Cvals[c_status[Cij]] = Bvals[j];
639 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
641 const SC Aval = Avals[k];
645 if (targetMapToOrigRow[Aik] != LO_INVALID) {
647 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
649 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
651 LO Cij = Bcol2Ccol[Bkj];
653 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
654 std::runtime_error,
"Trying to insert a new entry into a static graph");
656 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
661 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
662 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
664 LO Cij = Icol2Ccol[Ikj];
666 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
667 std::runtime_error,
"Trying to insert a new entry into a static graph");
669 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
675#ifdef HAVE_TPETRA_MMM_TIMINGS
677 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
680 C.fillComplete(C.getDomainMap(), C.getRangeMap());
685template<
class Scalar,
688 class LocalOrdinalViewType>
689void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
690 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
691 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
692 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
693 const LocalOrdinalViewType & Acol2Brow,
694 const LocalOrdinalViewType & Acol2Irow,
695 const LocalOrdinalViewType & Bcol2Ccol,
696 const LocalOrdinalViewType & Icol2Ccol,
697 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
698 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
699 const std::string& label,
700 const Teuchos::RCP<Teuchos::ParameterList>& params) {
702#ifdef HAVE_TPETRA_MMM_TIMINGS
703 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
704 using Teuchos::TimeMonitor;
705 Teuchos::RCP<TimeMonitor> MM;
712 auto rowMap = Aview.origMatrix->getRowMap();
713 Tpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> diags(rowMap);
714 Aview.origMatrix->getLocalDiagCopy(diags);
715 size_t diagLength = rowMap->getLocalNumElements();
716 Teuchos::Array<Scalar> diagonal(diagLength);
717 diags.get1dCopy(diagonal());
719 for(
size_t i = 0; i < diagLength; ++i) {
720 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
722 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
723 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
728 using device_t =
typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
730 using graph_t =
typename matrix_t::StaticCrsGraphType;
731 using lno_view_t =
typename graph_t::row_map_type::non_const_type;
732 using int_view_t = Kokkos::View<
int *,
733 typename lno_view_t::array_layout,
734 typename lno_view_t::memory_space,
735 typename lno_view_t::memory_traits>;
736 using c_lno_view_t =
typename graph_t::row_map_type::const_type;
737 using lno_nnz_view_t =
typename graph_t::entries_type::non_const_type;
738 using scalar_view_t =
typename matrix_t::values_type::non_const_type;
741 using handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
742 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
743 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
745 using int_handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
746 typename int_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
747 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
750 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
753 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
754 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
756 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
757 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
758 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
761 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"row_mapC"), AnumRows + 1);
762 lno_nnz_view_t entriesC;
763 scalar_view_t valuesC;
766 int team_work_size = 16;
767 std::string myalg(
"SPGEMM_KK_MEMORY");
768 if(!params.is_null()) {
769 if(params->isParameter(
"cuda: algorithm"))
770 myalg = params->get(
"cuda: algorithm",myalg);
771 if(params->isParameter(
"cuda: team work size"))
772 team_work_size = params->get(
"cuda: team work size",team_work_size);
776 std::string alg(
"Cuda algorithm");
777 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
778 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
781 Tpetra::Details::IntRowPtrHelper<
decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
782 const bool useIntRowptrs =
783 irph.shouldUseIntRowptrs() &&
784 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
788 kh.create_spgemm_handle(alg_enum);
789 kh.set_team_work_size(team_work_size);
791 int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"int_row_mapC"), AnumRows + 1);
793 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
794 auto Bint = irph.getIntRowptrMatrix(Bmerged);
796 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
797 Aint.graph.row_map, Aint.graph.entries,
false,
798 Bint.graph.row_map, Bint.graph.entries,
false,
801 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
803 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
804 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
809 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
810 Aint.graph.row_map, Aint.graph.entries, Amat.values,
false,
811 Bint.graph.row_map, Bint.graph.entries, Bint.values,
false,
812 int_row_mapC, entriesC, valuesC,
813 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
815 Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(
int i){ row_mapC(i) = int_row_mapC(i);});
816 kh.destroy_spgemm_handle();
819 kh.create_spgemm_handle(alg_enum);
820 kh.set_team_work_size(team_work_size);
822 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
823 Amat.graph.row_map, Amat.graph.entries,
false,
824 Bmerged.graph.row_map, Bmerged.graph.entries,
false,
827 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
829 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
830 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
833 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
834 Amat.graph.row_map, Amat.graph.entries, Amat.values,
false,
835 Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values,
false,
836 row_mapC, entriesC, valuesC,
837 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
838 kh.destroy_spgemm_handle();
843#ifdef HAVE_TPETRA_MMM_TIMINGS
844 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaSort"))));
848 if (params.is_null() || params->get(
"sort entries",
true))
849 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
850 C.setAllValues(row_mapC,entriesC,valuesC);
852#ifdef HAVE_TPETRA_MMM_TIMINGS
853 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
857 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
858 labelList->set(
"Timer Label",label);
859 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
860 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
861 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.