Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_Cuda.hpp
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
12
13#include "Tpetra_Details_IntRowPtrHelper.hpp"
14
15#ifdef HAVE_TPETRA_INST_CUDA
16namespace Tpetra {
17namespace MMdetails {
18
19/*********************************************************************************************************/
20// MMM KernelWrappers for Partial Specialization to CUDA
21template<class Scalar,
22 class LocalOrdinal,
23 class GlobalOrdinal,
24 class LocalOrdinalViewType>
25struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
26 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28 const LocalOrdinalViewType & Acol2Brow,
29 const LocalOrdinalViewType & Acol2Irow,
30 const LocalOrdinalViewType & Bcol2Ccol,
31 const LocalOrdinalViewType & Icol2Ccol,
32 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34 const std::string& label = std::string(),
35 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36
37
38
39 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
40 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
41 const LocalOrdinalViewType & Acol2Brow,
42 const LocalOrdinalViewType & Acol2Irow,
43 const LocalOrdinalViewType & Bcol2Ccol,
44 const LocalOrdinalViewType & Icol2Ccol,
45 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
46 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
47 const std::string& label = std::string(),
48 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
49
50};
51
52// Jacobi KernelWrappers for Partial Specialization to Cuda
53template<class Scalar,
54 class LocalOrdinal,
55 class GlobalOrdinal, class LocalOrdinalViewType>
56struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
57 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
58 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
59 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
60 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
61 const LocalOrdinalViewType & Acol2Brow,
62 const LocalOrdinalViewType & Acol2Irow,
63 const LocalOrdinalViewType & Bcol2Ccol,
64 const LocalOrdinalViewType & Icol2Ccol,
65 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
66 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
67 const std::string& label = std::string(),
68 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
69
70 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
71 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
72 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
73 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
74 const LocalOrdinalViewType & Acol2Brow,
75 const LocalOrdinalViewType & Acol2Irow,
76 const LocalOrdinalViewType & Bcol2Ccol,
77 const LocalOrdinalViewType & Icol2Ccol,
78 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
79 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
80 const std::string& label = std::string(),
81 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
82
83 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
84 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
85 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
86 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
87 const LocalOrdinalViewType & Acol2Brow,
88 const LocalOrdinalViewType & Acol2Irow,
89 const LocalOrdinalViewType & Bcol2Ccol,
90 const LocalOrdinalViewType & Icol2Ccol,
91 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
92 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
93 const std::string& label = std::string(),
94 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
95};
96
97
98/*********************************************************************************************************/
99// AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
100template<class Scalar,
101 class LocalOrdinal,
102 class GlobalOrdinal,
103 class LocalOrdinalViewType>
104void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
105 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
106 const LocalOrdinalViewType & Acol2Brow,
107 const LocalOrdinalViewType & Acol2Irow,
108 const LocalOrdinalViewType & Bcol2Ccol,
109 const LocalOrdinalViewType & Icol2Ccol,
110 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
111 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
112 const std::string& label,
113 const Teuchos::RCP<Teuchos::ParameterList>& params) {
114
115
116#ifdef HAVE_TPETRA_MMM_TIMINGS
117 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
118 using Teuchos::TimeMonitor;
119 Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
120#endif
121 // Node-specific code
122 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
123 std::string nodename("Cuda");
124
125 // Lots and lots of typedefs
126 using Teuchos::RCP;
128 typedef typename KCRS::device_type device_t;
129 typedef typename KCRS::StaticCrsGraphType graph_t;
130 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
131 using int_view_t = Kokkos::View<int *, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
132 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
133 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
134 typedef typename KCRS::values_type::non_const_type scalar_view_t;
135 //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
136
137 // Options
138 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
139 std::string myalg("SPGEMM_KK_MEMORY");
140 if(!params.is_null()) {
141 if(params->isParameter("cuda: algorithm"))
142 myalg = params->get("cuda: algorithm",myalg);
143 if(params->isParameter("cuda: team work size"))
144 team_work_size = params->get("cuda: team work size",team_work_size);
145 }
146
147 // KokkosKernelsHandle
148 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
149 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
150 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
151 using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
152 typename int_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
153 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
154
155 // Grab the Kokkos::SparseCrsMatrices
156 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
157 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
158
159 c_lno_view_t Arowptr = Amat.graph.row_map,
160 Browptr = Bmat.graph.row_map;
161 const lno_nnz_view_t Acolind = Amat.graph.entries,
162 Bcolind = Bmat.graph.entries;
163 const scalar_view_t Avals = Amat.values,
164 Bvals = Bmat.values;
165
166 // Get the algorithm mode
167 std::string alg = nodename+std::string(" algorithm");
168 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
169 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
170 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
171
172 // Merge the B and Bimport matrices
173 KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
174
175
176 // if Kokkos Kernels is going to use the cuSparse TPL for SpGEMM, this matrix
177 // needs to be sorted
178 // Try to mirror the Kokkos Kernels internal SpGEMM TPL use logic
179 // inspired by https://github.com/trilinos/Trilinos/pull/11709
180 // and https://github.com/kokkos/kokkos-kernels/pull/2008
181#if defined(KOKKOS_ENABLE_CUDA) \
182 && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) \
183 && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
184 if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
185 if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
186 KokkosSparse::sort_crs_matrix(Bmerged);
187 }
188 }
189#endif
190
191#ifdef HAVE_TPETRA_MMM_TIMINGS
192 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
193#endif
194
195 // Do the multiply on whatever we've got
196 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
197 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
198 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
199
200 // regardless of whether integer row ptrs are used, need to ultimately produce the row pointer, entries, and values of the expected types
201 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
202 lno_nnz_view_t entriesC;
203 scalar_view_t valuesC;
204
205 Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
206 const bool useIntRowptrs =
207 irph.shouldUseIntRowptrs() &&
208 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
209
210 if (useIntRowptrs) {
211 IntKernelHandle kh;
212 kh.create_spgemm_handle(alg_enum);
213 kh.set_team_work_size(team_work_size);
214
215 int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
216
217 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
218 auto Bint = irph.getIntRowptrMatrix(Bmerged);
219 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,false,Bint.graph.row_map,Bint.graph.entries,false, int_row_mapC);
220
221 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
222 if (c_nnz_size){
223 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
224 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
225 }
226 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,Aint.values,false,Bint.graph.row_map,Bint.graph.entries,Bint.values,false,int_row_mapC,entriesC,valuesC);
227 // transfer the integer rowptrs back to the correct rowptr type
228 Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(int i){ row_mapC(i) = int_row_mapC(i);});
229 kh.destroy_spgemm_handle();
230
231 } else {
232 KernelHandle kh;
233 kh.create_spgemm_handle(alg_enum);
234 kh.set_team_work_size(team_work_size);
235
236 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
237
238 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
239 if (c_nnz_size){
240 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
241 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
242 }
243
244 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
245
246 kh.destroy_spgemm_handle();
247 }
248
249
250#ifdef HAVE_TPETRA_MMM_TIMINGS
251 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
252#endif
253
254 // Sort & set values
255 if (params.is_null() || params->get("sort entries",true))
256 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
257 C.setAllValues(row_mapC,entriesC,valuesC);
258
259#ifdef HAVE_TPETRA_MMM_TIMINGS
260 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
261#endif
262
263 // Final Fillcomplete
264 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
265 labelList->set("Timer Label",label);
266 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
267 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
268 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
269}
270
271
272/*********************************************************************************************************/
273template<class Scalar,
274 class LocalOrdinal,
275 class GlobalOrdinal,
276 class LocalOrdinalViewType>
277void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
278 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
279 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
280 const LocalOrdinalViewType & targetMapToOrigRow_dev,
281 const LocalOrdinalViewType & targetMapToImportRow_dev,
282 const LocalOrdinalViewType & Bcol2Ccol_dev,
283 const LocalOrdinalViewType & Icol2Ccol_dev,
284 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
285 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
286 const std::string& label,
287 const Teuchos::RCP<Teuchos::ParameterList>& params) {
288
289 // FIXME: Right now, this is a cut-and-paste of the serial kernel
290 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
291
292#ifdef HAVE_TPETRA_MMM_TIMINGS
293 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
294 using Teuchos::TimeMonitor;
295 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
296 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
297#endif
298 using Teuchos::RCP;
299 using Teuchos::rcp;
300
301
302 // Lots and lots of typedefs
303 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
304 typedef typename KCRS::StaticCrsGraphType graph_t;
305 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
306 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
307 typedef typename KCRS::values_type::non_const_type scalar_view_t;
308
309 typedef Scalar SC;
310 typedef LocalOrdinal LO;
311 typedef GlobalOrdinal GO;
312 typedef Node NO;
313 typedef Map<LO,GO,NO> map_type;
314 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
315 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
316 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
317
318 // Since this is being run on Cuda, we need to fence because the below code will use UVM
319 // typename graph_t::execution_space().fence();
320
321 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
322 // KDDKDD UVM Ideally, this function would run on device and use
323 // KDDKDD UVM KokkosKernels instead of this host implementation.
324 auto targetMapToOrigRow =
325 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
326 targetMapToOrigRow_dev);
327 auto targetMapToImportRow =
328 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
329 targetMapToImportRow_dev);
330 auto Bcol2Ccol =
331 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
332 Bcol2Ccol_dev);
333 auto Icol2Ccol =
334 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
335 Icol2Ccol_dev);
336
337 // Sizes
338 RCP<const map_type> Ccolmap = C.getColMap();
339 size_t m = Aview.origMatrix->getLocalNumRows();
340 size_t n = Ccolmap->getLocalNumElements();
341
342 // Grab the Kokkos::SparseCrsMatrices & inner stuff
343 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
344 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
345 const KCRS & Cmat = C.getLocalMatrixHost();
346
347 c_lno_view_t Arowptr = Amat.graph.row_map,
348 Browptr = Bmat.graph.row_map,
349 Crowptr = Cmat.graph.row_map;
350 const lno_nnz_view_t Acolind = Amat.graph.entries,
351 Bcolind = Bmat.graph.entries,
352 Ccolind = Cmat.graph.entries;
353 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
354 scalar_view_t Cvals = Cmat.values;
355
356 c_lno_view_t Irowptr;
357 lno_nnz_view_t Icolind;
358 scalar_view_t Ivals;
359 if(!Bview.importMatrix.is_null()) {
360 auto lclB = Bview.importMatrix->getLocalMatrixHost();
361 Irowptr = lclB.graph.row_map;
362 Icolind = lclB.graph.entries;
363 Ivals = lclB.values;
364 }
365
366#ifdef HAVE_TPETRA_MMM_TIMINGS
367 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
368#endif
369
370 // Classic csr assembly (low memory edition)
371 // mfh 27 Sep 2016: The c_status array is an implementation detail
372 // of the local sparse matrix-matrix multiply routine.
373
374 // The status array will contain the index into colind where this entry was last deposited.
375 // c_status[i] < CSR_ip - not in the row yet
376 // c_status[i] >= CSR_ip - this is the entry where you can find the data
377 // We start with this filled with INVALID's indicating that there are no entries yet.
378 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
379 std::vector<size_t> c_status(n, ST_INVALID);
380
381 // For each row of A/C
382 size_t CSR_ip = 0, OLD_ip = 0;
383 for (size_t i = 0; i < m; i++) {
384 // First fill the c_status array w/ locations where we're allowed to
385 // generate nonzeros for this row
386 OLD_ip = Crowptr[i];
387 CSR_ip = Crowptr[i+1];
388 for (size_t k = OLD_ip; k < CSR_ip; k++) {
389 c_status[Ccolind[k]] = k;
390
391 // Reset values in the row of C
392 Cvals[k] = SC_ZERO;
393 }
394
395 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
396 LO Aik = Acolind[k];
397 const SC Aval = Avals[k];
398 if (Aval == SC_ZERO)
399 continue;
400
401 if (targetMapToOrigRow[Aik] != LO_INVALID) {
402 // Local matrix
403 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
404
405 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
406 LO Bkj = Bcolind[j];
407 LO Cij = Bcol2Ccol[Bkj];
408
409 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
410 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
411 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
412
413 Cvals[c_status[Cij]] += Aval * Bvals[j];
414 }
415
416 } else {
417 // Remote matrix
418 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
419 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
420 LO Ikj = Icolind[j];
421 LO Cij = Icol2Ccol[Ikj];
422
423 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
425 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
426
427 Cvals[c_status[Cij]] += Aval * Ivals[j];
428 }
429 }
430 }
431 }
432
433 C.fillComplete(C.getDomainMap(), C.getRangeMap());
434}
435
436/*********************************************************************************************************/
437template<class Scalar,
438 class LocalOrdinal,
439 class GlobalOrdinal,
440 class LocalOrdinalViewType>
441void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
442 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
443 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
444 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
445 const LocalOrdinalViewType & Acol2Brow,
446 const LocalOrdinalViewType & Acol2Irow,
447 const LocalOrdinalViewType & Bcol2Ccol,
448 const LocalOrdinalViewType & Icol2Ccol,
449 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
450 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
451 const std::string& label,
452 const Teuchos::RCP<Teuchos::ParameterList>& params) {
453
454#ifdef HAVE_TPETRA_MMM_TIMINGS
455 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
456 using Teuchos::TimeMonitor;
457 Teuchos::RCP<TimeMonitor> MM;
458#endif
459
460 // Node-specific code
461 using Teuchos::RCP;
462
463 // Options
464 //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
465 std::string myalg("KK");
466 if(!params.is_null()) {
467 if(params->isParameter("cuda: jacobi algorithm"))
468 myalg = params->get("cuda: jacobi algorithm",myalg);
469 }
470
471 if(myalg == "MSAK") {
472 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
473 }
474 else if(myalg == "KK") {
475 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
476 }
477 else {
478 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
479 }
480
481#ifdef HAVE_TPETRA_MMM_TIMINGS
482 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
483#endif
484
485 // Final Fillcomplete
486 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
487 labelList->set("Timer Label",label);
488 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
489
490 // NOTE: MSAK already fillCompletes, so we have to check here
491 if(!C.isFillComplete()) {
492 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
493 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
494 }
495
496}
497
498
499
500/*********************************************************************************************************/
501template<class Scalar,
502 class LocalOrdinal,
503 class GlobalOrdinal,
504 class LocalOrdinalViewType>
505void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
506 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
507 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
508 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
509 const LocalOrdinalViewType & targetMapToOrigRow_dev,
510 const LocalOrdinalViewType & targetMapToImportRow_dev,
511 const LocalOrdinalViewType & Bcol2Ccol_dev,
512 const LocalOrdinalViewType & Icol2Ccol_dev,
513 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
514 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
515 const std::string& label,
516 const Teuchos::RCP<Teuchos::ParameterList>& params) {
517
518 // FIXME: Right now, this is a cut-and-paste of the serial kernel
519 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
520
521#ifdef HAVE_TPETRA_MMM_TIMINGS
522 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
523 using Teuchos::TimeMonitor;
524 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
525 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
526#endif
527 using Teuchos::RCP;
528 using Teuchos::rcp;
529
530 // Lots and lots of typedefs
531 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
532 typedef typename KCRS::StaticCrsGraphType graph_t;
533 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
534 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
535 typedef typename KCRS::values_type::non_const_type scalar_view_t;
536 typedef typename scalar_view_t::memory_space scalar_memory_space;
537
538 typedef Scalar SC;
539 typedef LocalOrdinal LO;
540 typedef GlobalOrdinal GO;
541 typedef Node NO;
542 typedef Map<LO,GO,NO> map_type;
543 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
544 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
545 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
546
547 // Since this is being run on Cuda, we need to fence because the below host code will use UVM
548 // KDDKDD typename graph_t::execution_space().fence();
549
550 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
551 // KDDKDD UVM Ideally, this function would run on device and use
552 // KDDKDD UVM KokkosKernels instead of this host implementation.
553 auto targetMapToOrigRow =
554 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
555 targetMapToOrigRow_dev);
556 auto targetMapToImportRow =
557 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
558 targetMapToImportRow_dev);
559 auto Bcol2Ccol =
560 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
561 Bcol2Ccol_dev);
562 auto Icol2Ccol =
563 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
564 Icol2Ccol_dev);
565
566
567 // Sizes
568 RCP<const map_type> Ccolmap = C.getColMap();
569 size_t m = Aview.origMatrix->getLocalNumRows();
570 size_t n = Ccolmap->getLocalNumElements();
571
572 // Grab the Kokkos::SparseCrsMatrices & inner stuff
573 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
574 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
575 const KCRS & Cmat = C.getLocalMatrixHost();
576
577 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
578 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
579 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
580 scalar_view_t Cvals = Cmat.values;
581
582 c_lno_view_t Irowptr;
583 lno_nnz_view_t Icolind;
584 scalar_view_t Ivals;
585 if(!Bview.importMatrix.is_null()) {
586 auto lclB = Bview.importMatrix->getLocalMatrixHost();
587 Irowptr = lclB.graph.row_map;
588 Icolind = lclB.graph.entries;
589 Ivals = lclB.values;
590 }
591
592 // Jacobi-specific inner stuff
593 auto Dvals =
594 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
595
596#ifdef HAVE_TPETRA_MMM_TIMINGS
597 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
598#endif
599
600 // The status array will contain the index into colind where this entry was last deposited.
601 // c_status[i] < CSR_ip - not in the row yet
602 // c_status[i] >= CSR_ip - this is the entry where you can find the data
603 // We start with this filled with INVALID's indicating that there are no entries yet.
604 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
605 std::vector<size_t> c_status(n, ST_INVALID);
606
607 // For each row of A/C
608 size_t CSR_ip = 0, OLD_ip = 0;
609 for (size_t i = 0; i < m; i++) {
610
611 // First fill the c_status array w/ locations where we're allowed to
612 // generate nonzeros for this row
613 OLD_ip = Crowptr[i];
614 CSR_ip = Crowptr[i+1];
615 for (size_t k = OLD_ip; k < CSR_ip; k++) {
616 c_status[Ccolind[k]] = k;
617
618 // Reset values in the row of C
619 Cvals[k] = SC_ZERO;
620 }
621
622 SC minusOmegaDval = -omega*Dvals(i,0);
623
624 // Entries of B
625 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
626 Scalar Bval = Bvals[j];
627 if (Bval == SC_ZERO)
628 continue;
629 LO Bij = Bcolind[j];
630 LO Cij = Bcol2Ccol[Bij];
631
632 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
633 std::runtime_error, "Trying to insert a new entry into a static graph");
634
635 Cvals[c_status[Cij]] = Bvals[j];
636 }
637
638 // Entries of -omega * Dinv * A * B
639 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
640 LO Aik = Acolind[k];
641 const SC Aval = Avals[k];
642 if (Aval == SC_ZERO)
643 continue;
644
645 if (targetMapToOrigRow[Aik] != LO_INVALID) {
646 // Local matrix
647 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
648
649 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
650 LO Bkj = Bcolind[j];
651 LO Cij = Bcol2Ccol[Bkj];
652
653 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
654 std::runtime_error, "Trying to insert a new entry into a static graph");
655
656 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
657 }
658
659 } else {
660 // Remote matrix
661 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
662 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
663 LO Ikj = Icolind[j];
664 LO Cij = Icol2Ccol[Ikj];
665
666 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
667 std::runtime_error, "Trying to insert a new entry into a static graph");
668
669 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
670 }
671 }
672 }
673 }
674
675#ifdef HAVE_TPETRA_MMM_TIMINGS
676 MM2= Teuchos::null;
677 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
678#endif
679
680 C.fillComplete(C.getDomainMap(), C.getRangeMap());
681
682}
683
684/*********************************************************************************************************/
685template<class Scalar,
686 class LocalOrdinal,
687 class GlobalOrdinal,
688 class LocalOrdinalViewType>
689void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
690 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
691 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
692 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
693 const LocalOrdinalViewType & Acol2Brow,
694 const LocalOrdinalViewType & Acol2Irow,
695 const LocalOrdinalViewType & Bcol2Ccol,
696 const LocalOrdinalViewType & Icol2Ccol,
697 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
698 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
699 const std::string& label,
700 const Teuchos::RCP<Teuchos::ParameterList>& params) {
701
702#ifdef HAVE_TPETRA_MMM_TIMINGS
703 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
704 using Teuchos::TimeMonitor;
705 Teuchos::RCP<TimeMonitor> MM;
706#endif
707
708 // Check if the diagonal entries exist in debug mode
709 const bool debug = Tpetra::Details::Behavior::debug();
710 if(debug) {
711
712 auto rowMap = Aview.origMatrix->getRowMap();
713 Tpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> diags(rowMap);
714 Aview.origMatrix->getLocalDiagCopy(diags);
715 size_t diagLength = rowMap->getLocalNumElements();
716 Teuchos::Array<Scalar> diagonal(diagLength);
717 diags.get1dCopy(diagonal());
718
719 for(size_t i = 0; i < diagLength; ++i) {
720 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
721 std::runtime_error,
722 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
723 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
724 }
725 }
726
727 // Usings
728 using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
730 using graph_t = typename matrix_t::StaticCrsGraphType;
731 using lno_view_t = typename graph_t::row_map_type::non_const_type;
732 using int_view_t = Kokkos::View<int *,
733 typename lno_view_t::array_layout,
734 typename lno_view_t::memory_space,
735 typename lno_view_t::memory_traits>;
736 using c_lno_view_t = typename graph_t::row_map_type::const_type;
737 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
738 using scalar_view_t = typename matrix_t::values_type::non_const_type;
739
740 // KokkosKernels handle
741 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
742 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
743 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
744
745 using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
746 typename int_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
747 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
748
749 // Merge the B and Bimport matrices
750 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
751
752 // Get the properties and arrays of input matrices
753 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
754 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
755
756 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
757 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
758 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
759
760 // Arrays of the output matrix
761 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
762 lno_nnz_view_t entriesC;
763 scalar_view_t valuesC;
764
765 // Options
766 int team_work_size = 16;
767 std::string myalg("SPGEMM_KK_MEMORY");
768 if(!params.is_null()) {
769 if(params->isParameter("cuda: algorithm"))
770 myalg = params->get("cuda: algorithm",myalg);
771 if(params->isParameter("cuda: team work size"))
772 team_work_size = params->get("cuda: team work size",team_work_size);
773 }
774
775 // Get the algorithm mode
776 std::string alg("Cuda algorithm");
777 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
778 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
779
780 // decide whether to use integer-typed row pointers for this spgemm
781 Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
782 const bool useIntRowptrs =
783 irph.shouldUseIntRowptrs() &&
784 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
785
786 if (useIntRowptrs) {
787 int_handle_t kh;
788 kh.create_spgemm_handle(alg_enum);
789 kh.set_team_work_size(team_work_size);
790
791 int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
792
793 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
794 auto Bint = irph.getIntRowptrMatrix(Bmerged);
795
796 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
797 Aint.graph.row_map, Aint.graph.entries, false,
798 Bint.graph.row_map, Bint.graph.entries, false,
799 int_row_mapC);
800
801 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
802 if (c_nnz_size){
803 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
804 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
805 }
806
807 // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
808 // so need to have a special int-typed call for this as well.
809 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
810 Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
811 Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
812 int_row_mapC, entriesC, valuesC,
813 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
814 // transfer the integer rowptrs back to the correct rowptr type
815 Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(int i){ row_mapC(i) = int_row_mapC(i);});
816 kh.destroy_spgemm_handle();
817 } else {
818 handle_t kh;
819 kh.create_spgemm_handle(alg_enum);
820 kh.set_team_work_size(team_work_size);
821
822 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
823 Amat.graph.row_map, Amat.graph.entries, false,
824 Bmerged.graph.row_map, Bmerged.graph.entries, false,
825 row_mapC);
826
827 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
828 if (c_nnz_size){
829 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
830 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
831 }
832
833 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
834 Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
835 Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
836 row_mapC, entriesC, valuesC,
837 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
838 kh.destroy_spgemm_handle();
839 }
840
841
842
843#ifdef HAVE_TPETRA_MMM_TIMINGS
844 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaSort"))));
845#endif
846
847 // Sort & set values
848 if (params.is_null() || params->get("sort entries",true))
849 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
850 C.setAllValues(row_mapC,entriesC,valuesC);
851
852#ifdef HAVE_TPETRA_MMM_TIMINGS
853 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
854#endif
855
856 // Final Fillcomplete
857 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
858 labelList->set("Timer Label",label);
859 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
860 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
861 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
862}
863
864 }//MMdetails
865}//Tpetra
866
867#endif//CUDA
868
869#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.