Ifpack2 Templated Preconditioning Package Version 1.0
Ifpack2_BlockComputeResidualVector.hpp
// @HEADER
// *****************************************************************************
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
//
// Copyright 2009 NTESS and the Ifpack2 contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP

#include "Ifpack2_BlockHelper.hpp"

namespace Ifpack2 {

  namespace BlockHelperDetails {

    template <typename MatrixType>
    struct AmD {
      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using i64_3d_view = typename impl_type::i64_3d_view;
      using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
      // rowptr points to the start of each row of A_colindsub.
      size_type_1d_view rowptr, rowptr_remote;
      // Indices into A's rows giving the blocks to extract. rowptr(i) points to
      // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
      // where g is A's graph, are the columns AmD uses. If seq_method_, then
      // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If
      // !seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
      // contains the remote ones.
      local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
      // Precomputed direct offsets to A,x blocks, for owned entries (OverlapTag case) or all entries (AsyncTag case).
      i64_3d_view A_x_offsets;
      // Precomputed direct offsets to A,x blocks, for non-owned entries (OverlapTag case). For the AsyncTag case this is left empty.
      i64_3d_view A_x_offsets_remote;

      // Currently always true.
      bool is_tpetra_block_crs;

      // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
      impl_scalar_type_1d_view_tpetra tpetra_values;

      AmD() = default;
      AmD(const AmD &b) = default;
    };
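
    // Illustrative sketch (not part of the build): traversing the owned blocks
    // of one local row through AmD's CRS-like arrays. The names `amd`, `g`,
    // `lr`, and `blocksize` are assumed here: an AmD instance, A's local graph,
    // a row LID, and the block size.
    //
    //   for (auto k = amd.rowptr(lr); k < amd.rowptr(lr + 1); ++k) {
    //     const auto j = g.row_map(lr) + amd.A_colindsub(k); // entry in A's graph
    //     const auto block_col = g.entries(j);               // block column LID
    //     const auto* vals = &amd.tpetra_values(j * blocksize * blocksize); // block values
    //   }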

    template<typename MatrixType>
    struct PartInterface {
      using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
      using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
      using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;

      PartInterface() = default;
      PartInterface(const PartInterface &b) = default;

      // Some terms:
      // The matrix A is split as A = D + R, where D is the matrix of tridiag
      // blocks and R is the remainder.
      // A part is roughly a synonym for a tridiag. The distinction is that a part
      // is the set of rows belonging to one tridiag and, equivalently, the off-diag
      // rows in R associated with that tridiag. In contrast, the term tridiag is
      // used to refer specifically to tridiag data, such as the pointer into the
      // tridiag data array.
      // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
      // tridiag, and partptr points to the beginning of each tridiag. This is the
      // LID space.
      // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
      // by this ordinal. This is the 'index' space.
      // A flat index is the mathematical index into an array. A pack index
      // accounts for SIMD packing.

      // Local row LIDs. Permutation from the caller's index space to the tridiag
      // index space.
      local_ordinal_type_1d_view lclrow;
      // partptr is the pointer array into lclrow.
      local_ordinal_type_1d_view partptr; // np+1
      local_ordinal_type_2d_view partptr_sub;
      local_ordinal_type_1d_view partptr_schur;
      // packptr(i), for i the pack index, indexes partptr. partptr(packptr(i))
      // is the start of the i'th pack.
      local_ordinal_type_1d_view packptr; // npack+1
      local_ordinal_type_1d_view packptr_sub;
      local_ordinal_type_1d_view packindices_sub;
      local_ordinal_type_2d_view packindices_schur;
      // part2rowidx0(i) is the flat row index of the start of the i'th part. It's
      // an alias of partptr in the case of no overlap.
      local_ordinal_type_1d_view part2rowidx0; // np+1
      local_ordinal_type_1d_view part2rowidx0_sub;
      // part2packrowidx0(i) is the packed row index. If vector_length is 1, then
      // it's the same as part2rowidx0; if it's > 1, then the value is combined
      // with i % vector_length to get the location in the packed data.
      local_ordinal_type_1d_view part2packrowidx0; // np+1
      local_ordinal_type_2d_view part2packrowidx0_sub;
      local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
      // rowidx2part maps the row index to the part index.
      local_ordinal_type_1d_view rowidx2part; // nr
      local_ordinal_type_1d_view rowidx2part_sub;
      // True if lcl{row|col} is at most a constant away from row{idx|col}. In
      // practice, this knowledge is not particularly useful, as packing for batched
      // processing is done at the same time as the permutation from LID to index
      // space. But it's easy to detect, so it's recorded in case an optimization
      // can be made based on it.
      bool row_contiguous;

      local_ordinal_type max_partsz;
      local_ordinal_type max_subpartsz;
      local_ordinal_type n_subparts_per_part;
      local_ordinal_type nparts;
    };
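
    // Illustrative sketch (not part of the build): moving between the index
    // spaces described above, assuming `interf` is a populated PartInterface.
    //
    //   // LIDs of part p (LID space):
    //   for (auto i = interf.partptr(p); i < interf.partptr(p + 1); ++i) {
    //     const auto lid = interf.lclrow(i);
    //   }
    //   // Packed location of row index `rowidx` in part p (pack index space):
    //   const auto pri = interf.part2packrowidx0(p) + (rowidx - interf.partptr(p));
    //   const auto lane = p % vector_length; // SIMD lane within the pack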

    // Precompute offsets of each A and x entry to speed up residual.
    // (Applies for hasBlockCrsMatrix == true and OverlapTag/AsyncTag.)
    // Reading A and x takes up to 4 and 6 levels of indirection, respectively,
    // but precomputing the offsets reduces this to 2 for both.
    //
    // This function allocates and populates these members of AmD:
    // A_x_offsets, A_x_offsets_remote
    template<typename MatrixType>
    void precompute_A_x_offsets(
        AmD<MatrixType> &amd,
        const PartInterface<MatrixType> &interf,
        const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_crs_graph_type> &g,
        const typename ImplType<MatrixType>::local_ordinal_type_1d_view &dm2cm,
        int blocksize,
        bool ownedRemoteSeparate)
    {
      using impl_type = ImplType<MatrixType>;
      using i64_3d_view = typename impl_type::i64_3d_view;
      using size_type = typename impl_type::size_type;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using execution_space = typename impl_type::execution_space;
      auto local_graph = g->getLocalGraphDevice();
      const auto A_block_rowptr = local_graph.row_map;
      const auto A_colind = local_graph.entries;
      local_ordinal_type numLocalRows = interf.rowidx2part.extent(0);
      int blocksize_square = blocksize * blocksize;
      // Shallow-copy views to avoid capturing the amd and interf objects in lambdas.
      auto lclrow = interf.lclrow;
      auto A_colindsub = amd.A_colindsub;
      auto A_colindsub_remote = amd.A_colindsub_remote;
      auto rowptr = amd.rowptr;
      auto rowptr_remote = amd.rowptr_remote;
      bool is_dm2cm_active = dm2cm.extent(0) > 0;
      if(ownedRemoteSeparate) {
        // amd.rowptr points to owned entries only, and amd.rowptr_remote points to nonowned.
        local_ordinal_type maxOwnedEntriesPerRow = 0;
        local_ordinal_type maxNonownedEntriesPerRow = 0;
        Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
          KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type& lmaxOwned, local_ordinal_type& lmaxNonowned)
          {
            const local_ordinal_type lr = lclrow(i);
            local_ordinal_type rowNumOwned = rowptr(lr+1) - rowptr(lr);
            if(rowNumOwned > lmaxOwned)
              lmaxOwned = rowNumOwned;
            // rowptr_remote won't be allocated for single-rank problems.
            if(rowptr_remote.extent(0)) {
              local_ordinal_type rowNumNonowned = rowptr_remote(lr+1) - rowptr_remote(lr);
              if(rowNumNonowned > lmaxNonowned)
                lmaxNonowned = rowNumNonowned;
            }
            else {
              lmaxNonowned = 0;
            }
          }, Kokkos::Max<local_ordinal_type>(maxOwnedEntriesPerRow), Kokkos::Max<local_ordinal_type>(maxNonownedEntriesPerRow));
        // Allocate the two offsets views now that we know the dimensions.
        // For each one, the middle dimension is 0 for A offsets and 1 for x offsets.
        // Packing them together in one view improves cache line utilization.
        amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxOwnedEntriesPerRow);
        amd.A_x_offsets_remote = i64_3d_view("amd.A_x_offsets_remote", numLocalRows, 2, maxNonownedEntriesPerRow);
        auto A_x_offsets = amd.A_x_offsets;
        auto A_x_offsets_remote = amd.A_x_offsets_remote;
        // Now, populate all the offsets. Use ArithTraits<int64_t>::min to mark absent entries.
        Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
          KOKKOS_LAMBDA(local_ordinal_type i)
          {
            const local_ordinal_type lr = lclrow(i);
            const size_type A_k0 = A_block_rowptr(lr);
            // Owned entries
            size_type rowBegin = rowptr(lr);
            local_ordinal_type rowNumOwned = rowptr(lr+1) - rowBegin;
            for(local_ordinal_type entry = 0; entry < maxOwnedEntriesPerRow; entry++) {
              if(entry < rowNumOwned) {
                const size_type j = A_k0 + A_colindsub(rowBegin + entry);
                const local_ordinal_type A_colind_at_j = A_colind(j);
                const local_ordinal_type loc = is_dm2cm_active ? dm2cm(A_colind_at_j) : A_colind_at_j;
                A_x_offsets(i, 0, entry) = int64_t(j) * blocksize_square;
                A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
              }
              else {
                A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
                A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
              }
            }
            // Nonowned entries
            if(rowptr_remote.extent(0)) {
              rowBegin = rowptr_remote(lr);
              local_ordinal_type rowNumNonowned = rowptr_remote(lr+1) - rowBegin;
              for(local_ordinal_type entry = 0; entry < maxNonownedEntriesPerRow; entry++) {
                if(entry < rowNumNonowned) {
                  const size_type j = A_k0 + A_colindsub_remote(rowBegin + entry);
                  const local_ordinal_type A_colind_at_j = A_colind(j);
                  const local_ordinal_type loc = A_colind_at_j - numLocalRows;
                  A_x_offsets_remote(i, 0, entry) = int64_t(j) * blocksize_square;
                  A_x_offsets_remote(i, 1, entry) = int64_t(loc) * blocksize;
                }
                else {
                  A_x_offsets_remote(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
                  A_x_offsets_remote(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
                }
              }
            }
          });
      }
      else {
        // amd.rowptr points to both owned and nonowned entries, so it tells us how many columns (last dim) A_x_offsets should have.
        local_ordinal_type maxEntriesPerRow = 0;
        Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
          KOKKOS_LAMBDA(local_ordinal_type i, local_ordinal_type& lmax)
          {
            const local_ordinal_type lr = lclrow(i);
            local_ordinal_type rowNum = rowptr(lr+1) - rowptr(lr);
            if(rowNum > lmax)
              lmax = rowNum;
          }, Kokkos::Max<local_ordinal_type>(maxEntriesPerRow));
        amd.A_x_offsets = i64_3d_view("amd.A_x_offsets", numLocalRows, 2, maxEntriesPerRow);
        auto A_x_offsets = amd.A_x_offsets;
        // Populate A,x offsets. Use ArithTraits<int64_t>::min to mark absent entries.
        // For x offsets, add a shift of blocksize*numLocalRows to represent that the
        // entry indexes into x_remote instead of x.
        Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, numLocalRows),
          KOKKOS_LAMBDA(local_ordinal_type i)
          {
            const local_ordinal_type lr = lclrow(i);
            const size_type A_k0 = A_block_rowptr(lr);
            // Owned entries
            size_type rowBegin = rowptr(lr);
            local_ordinal_type rowOwned = rowptr(lr+1) - rowBegin;
            for(local_ordinal_type entry = 0; entry < maxEntriesPerRow; entry++) {
              if(entry < rowOwned) {
                const size_type j = A_k0 + A_colindsub(rowBegin + entry);
                A_x_offsets(i, 0, entry) = j * blocksize_square;
                const local_ordinal_type A_colind_at_j = A_colind(j);
                if(A_colind_at_j < numLocalRows) {
                  const local_ordinal_type loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                  A_x_offsets(i, 1, entry) = int64_t(loc) * blocksize;
                }
                else {
                  A_x_offsets(i, 1, entry) = int64_t(A_colind_at_j) * blocksize;
                }
              }
              else {
                A_x_offsets(i, 0, entry) = Kokkos::ArithTraits<int64_t>::min();
                A_x_offsets(i, 1, entry) = Kokkos::ArithTraits<int64_t>::min();
              }
            }
          });
      }
    }
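
    // Illustrative sketch (not part of the build): consuming the precomputed
    // offsets. With A_x_offsets in hand, fetching a block of A and its x block
    // is two reads each instead of walking the graph:
    //
    //   const int64_t A_off = A_x_offsets(rowidx, 0, k);
    //   const int64_t x_off = A_x_offsets(rowidx, 1, k);
    //   if (A_off != Kokkos::ArithTraits<int64_t>::min()) {
    //     const auto* Ablk = tpetra_values.data() + A_off; // blocksize x blocksize values
    //     const auto* xblk = &x(x_off, col);               // corresponding x block
    //   }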

    static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
                                                                     const int team_size) {
      int total_team_size(0);
      if (blksize <= 5) total_team_size = 32;
      else if (blksize <= 9) total_team_size = 32; // 64
      else if (blksize <= 12) total_team_size = 96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else total_team_size = 160;
      return total_team_size/team_size;
    }

    static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
                                                                    const int team_size) {
      int total_team_size(0);
      if (blksize <= 5) total_team_size = 32;
      else if (blksize <= 9) total_team_size = 32; // 64
      else if (blksize <= 12) total_team_size = 96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else total_team_size = 160;
      return total_team_size/team_size;
    }

    static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
                                                                     const int team_size) {
      int total_team_size(0);
      if (blksize <= 5) total_team_size = 32;
      else if (blksize <= 9) total_team_size = 32; // 64
      else if (blksize <= 12) total_team_size = 96;
      else if (blksize <= 16) total_team_size = 128;
      else if (blksize <= 20) total_team_size = 160;
      else total_team_size = 160;
      return total_team_size/team_size;
    }

    template<typename T>
    static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
                                                                 const int team_size) {
      if ( is_cuda<T>::value )
        return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
      if ( is_hip<T>::value )
        return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
      if ( is_sycl<T>::value )
        return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
      return -1;
    }
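
    // Worked example: blksize = 7 with team_size = 8 on CUDA gives
    // total_team_size = 32, hence a recommended vector size of 32/8 = 4.
    // For non-GPU backends, the dispatcher above returns -1.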

    template<typename MatrixType>
    struct ComputeResidualVector {
    public:
      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
      using node_device_type = typename impl_type::node_device_type;
      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;
      using magnitude_type = typename impl_type::magnitude_type;
      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
      using i64_3d_view = typename impl_type::i64_3d_view;
      static constexpr int vector_length = impl_type::vector_length;

      using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;

      // enum for max blocksize and vector length
      enum : int { max_blocksize = 32 };

    private:
      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
      Unmanaged<impl_scalar_type_2d_view_tpetra> y;
      Unmanaged<vector_type_3d_view> y_packed;
      Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;

      // AmD information
      const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
      const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
      const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;

      // block crs graph information
      // for cuda (kokkos crs graph uses a different size_type from size_t)
      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_block_rowptr;
      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_point_rowptr;
      const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;

      // blocksize
      const local_ordinal_type blocksize_requested;

      // part interface
      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
      const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
      const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;

      // block offsets
      const ConstUnmanaged<i64_3d_view> A_x_offsets;
      const ConstUnmanaged<i64_3d_view> A_x_offsets_remote;

      const bool is_dm2cm_active;
      const bool hasBlockCrsMatrix;

    public:
      template<typename LocalCrsGraphType>
      ComputeResidualVector(const AmD<MatrixType> &amd,
                            const LocalCrsGraphType &block_graph,
                            const LocalCrsGraphType &point_graph,
                            const local_ordinal_type &blocksize_requested_,
                            const PartInterface<MatrixType> &interf,
                            const local_ordinal_type_1d_view &dm2cm_,
                            bool hasBlockCrsMatrix_)
        : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
          colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
          tpetra_values(amd.tpetra_values),
          A_block_rowptr(block_graph.row_map),
          A_point_rowptr(point_graph.row_map),
          A_colind(block_graph.entries),
          blocksize_requested(blocksize_requested_),
          part2packrowidx0(interf.part2packrowidx0),
          part2rowidx0(interf.part2rowidx0),
          rowidx2part(interf.rowidx2part),
          partptr(interf.partptr),
          lclrow(interf.lclrow),
          dm2cm(dm2cm_),
          A_x_offsets(amd.A_x_offsets),
          A_x_offsets_remote(amd.A_x_offsets_remote),
          is_dm2cm_active(dm2cm_.span() > 0),
          hasBlockCrsMatrix(hasBlockCrsMatrix_)
      {}

      inline
      void
      SerialDot(const local_ordinal_type &blocksize,
                const local_ordinal_type &lclRowID,
                const local_ordinal_type &lclColID,
                const local_ordinal_type &ii,
                const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
                const impl_scalar_type * const KOKKOS_RESTRICT xx,
                /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
        const size_type Aj_c = colindsub_(lclColID);
        auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
        impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
        for (local_ordinal_type k1=0;k1<blocksize;++k1)
          val += tpetra_values(point_row_offset + k1) * xx[k1];
        yy[ii] -= val;
      }

      inline
      void
      SerialGemv(const local_ordinal_type &blocksize,
                 const impl_scalar_type * const KOKKOS_RESTRICT AA,
                 const impl_scalar_type * const KOKKOS_RESTRICT xx,
                 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
        using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
          impl_scalar_type val = 0;
#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
#   pragma ivdep
#endif
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
#   pragma unroll
#endif
          for (local_ordinal_type k1=0;k1<blocksize;++k1)
            val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
          yy[k0] -= val;
        }
      }

      template<typename bbViewType, typename yyViewType>
      KOKKOS_INLINE_FUNCTION
      void
      VectorCopy(const member_type &member,
                 const local_ordinal_type &blocksize,
                 const bbViewType &bb,
                 const yyViewType &yy) const {
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
          yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
        });
      }

      template<typename xxViewType, typename yyViewType>
      KOKKOS_INLINE_FUNCTION
      void
      VectorDot(const member_type &member,
                const local_ordinal_type &blocksize,
                const local_ordinal_type &lclRowID,
                const local_ordinal_type &lclColID,
                const local_ordinal_type &ii,
                const ConstUnmanaged<local_ordinal_type_1d_view> colindsub_,
                const xxViewType &xx,
                const yyViewType &yy) const {
        const size_type Aj_c = colindsub_(lclColID);
        auto point_row_offset = A_point_rowptr(lclRowID*blocksize + ii) + Aj_c*blocksize;
        impl_scalar_type val = 0;
        Kokkos::parallel_reduce
          (Kokkos::ThreadVectorRange(member, blocksize),
           [&](const local_ordinal_type &k1, impl_scalar_type& update) {
             update += tpetra_values(point_row_offset + k1) * xx(k1);
           }, val);
        Kokkos::single(Kokkos::PerThread(member),
          [=]()
          {
            Kokkos::atomic_add(&yy(ii), typename yyViewType::const_value_type(-val));
          });
      }

      // BMK: This version coalesces accesses to AA for LayoutRight blocks.
      template<typename AAViewType, typename xxViewType, typename yyViewType>
      KOKKOS_INLINE_FUNCTION
      void
      VectorGemv(const member_type &member,
                 const local_ordinal_type &blocksize,
                 const AAViewType &AA,
                 const xxViewType &xx,
                 const yyViewType &yy) const {
        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
          impl_scalar_type val = 0;
          Kokkos::parallel_reduce
            (Kokkos::ThreadVectorRange(member, blocksize),
             [&](const local_ordinal_type &k1, impl_scalar_type& update) {
               update += AA(k0,k1)*xx(k1);
             }, val);
          Kokkos::single(Kokkos::PerThread(member),
            [=]()
            {
              Kokkos::atomic_add(&yy(k0), -val);
            });
        }
      }

      struct SeqTag {};

      KOKKOS_INLINE_FUNCTION
      void
      operator() (const SeqTag &, const local_ordinal_type& i) const {
        const local_ordinal_type blocksize = blocksize_requested;
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        // constants
        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y.extent(1);
        const local_ordinal_type row = i*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          // y := b
          impl_scalar_type *yy = &y(row, col);
          const impl_scalar_type * const bb = &b(row, col);
          memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);

          // y -= Rx
          const size_type A_k0 = A_block_rowptr[i];
          for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
            const size_type j = A_k0 + colindsub[k];
            const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
            if (hasBlockCrsMatrix) {
              const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
              SerialGemv(blocksize, AA, xx, yy);
            }
            else {
              for (local_ordinal_type k0=0;k0<blocksize;++k0)
                SerialDot(blocksize, i, k, k0, colindsub, xx, yy);
            }
          }
        }
      }

      KOKKOS_INLINE_FUNCTION
      void
      operator() (const SeqTag &, const member_type &member) const {

        // constants
        const local_ordinal_type blocksize = blocksize_requested;
        const local_ordinal_type blocksize_square = blocksize*blocksize;

        const local_ordinal_type lr = member.league_rank();
        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y.extent(1);

        // subview pattern
        auto bb = Kokkos::subview(b, block_range, 0);
        auto xx = bb;
        auto yy = Kokkos::subview(y, block_range, 0);
        auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col=0;col<num_vectors;++col) {
          // y := b
          yy.assign_data(&y(row, col));
          bb.assign_data(&b(row, col));
          if (member.team_rank() == 0)
            VectorCopy(member, blocksize, bb, yy);
          member.team_barrier();

          // y -= Rx
          const size_type A_k0 = A_block_rowptr[lr];

          if(hasBlockCrsMatrix) {
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
               [&](const local_ordinal_type &k) {
                 const size_type j = A_k0 + colindsub[k];
                 xx.assign_data( &x(A_colind[j]*blocksize, col) );
                 A_block_cst.assign_data( &tpetra_values(j*blocksize_square) );
                 VectorGemv(member, blocksize, A_block_cst, xx, yy);
               });
          }
          else {
            Kokkos::parallel_for
              (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
               [&](const local_ordinal_type &k) {
                 const size_type j = A_k0 + colindsub[k];
                 xx.assign_data( &x(A_colind[j]*blocksize, col) );

                 for (local_ordinal_type k0=0;k0<blocksize;++k0)
                   VectorDot(member, blocksize, lr, k, k0, colindsub, xx, yy);
               });
          }
        }
      }
      // * B: block size for compile-time specialization, or 0 for the general
      //   case (up to max_blocksize)
      // * async: true if using the async importer; overlap is not used in this
      //   case. Whether a column is owned or nonowned is decided at runtime.
      // * overlap: true if processing the columns that are not locally owned,
      //   false if processing locally owned columns.
      // * haveBlockMatrix: true if A is a BlockCrsMatrix, false if it is a CrsMatrix.
      template<int B, bool async, bool overlap, bool haveBlockMatrix>
      struct GeneralTag
      {
        static_assert(!(async && overlap),
            "ComputeResidualVector: async && overlap is not a valid configuration for GeneralTag");
      };

      // Define AsyncTag and OverlapTag in terms of GeneralTag:
      // P == 0 means only compute on owned columns
      // P == 1 means only compute on nonowned columns
      template<int P, int B, bool haveBlockMatrix>
      using OverlapTag = GeneralTag<B, false, P != 0, haveBlockMatrix>;

      template<int B, bool haveBlockMatrix>
      using AsyncTag = GeneralTag<B, true, false, haveBlockMatrix>;
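
      // Illustrative sketch (not part of the build): the tags select an
      // operator() overload via the execution policy's work tag. For example,
      // AsyncTag<3, true> is GeneralTag<3, true, false, true>, so
      //
      //   Kokkos::RangePolicy<execution_space, AsyncTag<3, true>> policy(0, n);
      //   Kokkos::parallel_for(policy, *this);
      //
      // invokes the CPU residual specialized for blocksize 3 with a
      // BlockCrsMatrix A, deciding owned vs. nonowned columns per entry.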

      // CPU implementation for all cases
      template<int B, bool async, bool overlap, bool haveBlockMatrix>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const GeneralTag<B, async, overlap, haveBlockMatrix>&, const local_ordinal_type &rowidx) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

        // constants
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const local_ordinal_type num_vectors = y_packed.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // temporary buffer for y flat
        impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};

        const local_ordinal_type lr = lclrow(rowidx);

        auto colindsub_used = overlap ? colindsub_remote : colindsub;
        auto rowptr_used = overlap ? rowptr_remote : rowptr;

        for (local_ordinal_type col = 0; col < num_vectors; ++col) {
          if constexpr(overlap) {
            // y (temporary) := 0
            memset((void*) yy, 0, sizeof(impl_scalar_type)*blocksize);
          }
          else {
            // y := b
            const local_ordinal_type row = lr*blocksize;
            memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
          }

          // y -= Rx
          const size_type A_k0 = A_block_rowptr[lr];
          for (size_type k = rowptr_used[lr]; k < rowptr_used[lr+1]; ++k) {
            const size_type j = A_k0 + colindsub_used[k];
            const local_ordinal_type A_colind_at_j = A_colind[j];
            if constexpr (haveBlockMatrix) {
              const local_ordinal_type blocksize_square = blocksize*blocksize;
              const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
              if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                const impl_scalar_type * const xx = &x(loc*blocksize, col);
                SerialGemv(blocksize, AA, xx, yy);
              } else {
                const auto loc = A_colind_at_j - num_local_rows;
                const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
                SerialGemv(blocksize, AA, xx_remote, yy);
              }
            }
            else {
              if ((!async && !overlap) || (async && A_colind_at_j < num_local_rows)) {
                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                const impl_scalar_type * const xx = &x(loc*blocksize, col);
                for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                  SerialDot(blocksize, lr, k, k0, colindsub_used, xx, yy);
              } else {
                const auto loc = A_colind_at_j - num_local_rows;
                const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
                for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                  SerialDot(blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
              }
            }
          }
          // move yy to y_packed
          if constexpr(overlap) {
            for (local_ordinal_type k=0;k<blocksize;++k)
              y_packed(pri, k, col)[v] += yy[k];
          }
          else {
            for (local_ordinal_type k=0;k<blocksize;++k)
              y_packed(pri, k, col)[v] = yy[k];
          }
        }
      }

      // GPU implementation for hasBlockCrsMatrix == true
      template<int B, bool async, bool overlap>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const GeneralTag<B, async, overlap, true>&, const member_type &member) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

        // constants
        const local_ordinal_type rowidx = member.league_rank();
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // subview pattern
        using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
        using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
        subview_1D_right_t bb(nullptr, blocksize);
        subview_1D_right_t xx(nullptr, blocksize);
        subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
        auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);

        // Get shared allocation for a local copy of x, Ax, and A
        impl_scalar_type * local_Ax = reinterpret_cast<impl_scalar_type *>(member.team_scratch(0).get_shmem(blocksize*sizeof(impl_scalar_type)));
        impl_scalar_type * local_x = reinterpret_cast<impl_scalar_type *>(member.thread_scratch(0).get_shmem(blocksize*sizeof(impl_scalar_type)));

        const local_ordinal_type lr = lclrow(rowidx);
        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col = 0; col < num_vectors; ++col) {
          if(col)
            member.team_barrier();
          // y -= Rx
          // Initialize accumulation array
          Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type & i){
            local_Ax[i] = 0;
          });
          member.team_barrier();

          int numEntries;
          if constexpr(!overlap) {
            numEntries = A_x_offsets.extent(2);
          }
          else {
            numEntries = A_x_offsets_remote.extent(2);
          }

          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, 0, numEntries),
             [&](const int k) {
               int64_t A_offset = overlap ? A_x_offsets_remote(rowidx, 0, k) : A_x_offsets(rowidx, 0, k);
               int64_t x_offset = overlap ? A_x_offsets_remote(rowidx, 1, k) : A_x_offsets(rowidx, 1, k);
               if(A_offset != Kokkos::ArithTraits<int64_t>::min()) {
                 A_block_cst.assign_data(tpetra_values.data() + A_offset);
                 // Pull x into local memory
                 if constexpr(async) {
                   size_type remote_cutoff = blocksize * num_local_rows;
                   if(x_offset >= remote_cutoff)
                     xx.assign_data(&x_remote(x_offset - remote_cutoff, col));
                   else
                     xx.assign_data(&x(x_offset, col));
                 }
                 else {
                   if constexpr(!overlap) {
                     xx.assign_data(&x(x_offset, col));
                   }
                   else {
                     xx.assign_data(&x_remote(x_offset, col));
                   }
                 }

                 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type & i){
                   local_x[i] = xx(i);
                 });

                 // MatVec op Ax += A*x
                 Kokkos::parallel_for(
                   Kokkos::ThreadVectorRange(member, blocksize),
                   [&](const local_ordinal_type &k0) {
                     impl_scalar_type val = 0;
                     for(int k1=0; k1<blocksize; k1++)
                       val += A_block_cst(k0,k1)*local_x[k1];
                     Kokkos::atomic_add(local_Ax+k0, val);
                   });
               }
             });
          member.team_barrier();
          // Update y = b - local_Ax
          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
          bb.assign_data(&b(row, col));
          Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), [&](const local_ordinal_type & i){
            if (!overlap)
              yy(i) = bb(i) - local_Ax[i];
            else
              yy(i) -= local_Ax[i];
          });
        }
      }

      // GPU implementation for hasBlockCrsMatrix == false
      template<int B, bool async, bool overlap>
      KOKKOS_INLINE_FUNCTION
      void
      operator() (const GeneralTag<B, async, overlap, false>&, const member_type &member) const {
        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);

        // constants
        const local_ordinal_type rowidx = member.league_rank();
        const local_ordinal_type partidx = rowidx2part(rowidx);
        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
        const local_ordinal_type v = partidx % vector_length;

        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
        const local_ordinal_type num_local_rows = lclrow.extent(0);

        // subview pattern
        using subview_1D_right_t = decltype(Kokkos::subview(b, block_range, 0));
        using subview_1D_stride_t = decltype(Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0));
        subview_1D_right_t bb(nullptr, blocksize);
        subview_1D_right_t xx(nullptr, blocksize);
        subview_1D_right_t xx_remote(nullptr, blocksize);
        subview_1D_stride_t yy(nullptr, Kokkos::LayoutStride(blocksize, y_packed_scalar.stride_1()));
        auto A_block_cst = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
        auto colindsub_used = overlap ? colindsub_remote : colindsub;
        auto rowptr_used = overlap ? rowptr_remote : rowptr;

        const local_ordinal_type lr = lclrow(rowidx);
        const local_ordinal_type row = lr*blocksize;
        for (local_ordinal_type col = 0; col < num_vectors; ++col) {
          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
          if (!overlap) {
            // y := b
            bb.assign_data(&b(row, col));
            if (member.team_rank() == 0)
              VectorCopy(member, blocksize, bb, yy);
            member.team_barrier();
          }

          // y -= Rx
          const size_type A_k0 = A_block_rowptr[lr];
          Kokkos::parallel_for
            (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
             [&](const local_ordinal_type &k) {
               const size_type j = A_k0 + colindsub_used[k];
               const local_ordinal_type A_colind_at_j = A_colind[j];
               if ((async && A_colind_at_j < num_local_rows) || (!async && !overlap)) {
                 const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
                 xx.assign_data( &x(loc*blocksize, col) );
                 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                   VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx, yy);
               } else {
                 const auto loc = A_colind_at_j - num_local_rows;
                 xx_remote.assign_data( &x_remote(loc*blocksize, col) );
                 for (local_ordinal_type k0 = 0; k0 < blocksize; ++k0)
                   VectorDot(member, blocksize, lr, k, k0, colindsub_used, xx_remote, yy);
               }
             });
        }
      }

      // y = b - Rx; seq method
      template<typename MultiVectorLocalViewTypeY,
               typename MultiVectorLocalViewTypeB,
               typename MultiVectorLocalViewTypeX>
      void run(const MultiVectorLocalViewTypeY &y_,
               const MultiVectorLocalViewTypeB &b_,
               const MultiVectorLocalViewTypeX &x_) {
        IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<SeqTag>", ComputeResidual0, execution_space);

        y = y_; b = b_; x = x_;
        if constexpr (is_device<execution_space>::value) {
          const local_ordinal_type blocksize = blocksize_requested;
          const local_ordinal_type team_size = 8;
          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
          const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
          Kokkos::parallel_for
            ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
        } else {
          const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
          Kokkos::parallel_for
            ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
        }
        IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
      }

      // y = b - R (x , x_remote)
      template<typename MultiVectorLocalViewTypeB,
               typename MultiVectorLocalViewTypeX,
               typename MultiVectorLocalViewTypeX_Remote>
      void run(const vector_type_3d_view &y_packed_,
               const MultiVectorLocalViewTypeB &b_,
               const MultiVectorLocalViewTypeX &x_,
               const MultiVectorLocalViewTypeX_Remote &x_remote_) {
        IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<AsyncTag>", ComputeResidual0, execution_space);

        b = b_; x = x_; x_remote = x_remote_;
        if constexpr (is_device<execution_space>::value) {
          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                     y_packed_.extent(0),
                                                     y_packed_.extent(1),
                                                     y_packed_.extent(2),
                                                     vector_length);
        } else {
          y_packed = y_packed_;
        }

        if constexpr(is_device<execution_space>::value) {
          const local_ordinal_type blocksize = blocksize_requested;
          // local_ordinal_type vl_power_of_two = 1;
          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
          // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
          if(this->hasBlockCrsMatrix) { \
            const local_ordinal_type team_size = 8; \
            const local_ordinal_type vector_size = 8; \
            const size_t shmem_team_size = blocksize*sizeof(btdm_scalar_type); \
            const size_t shmem_thread_size = blocksize*sizeof(btdm_scalar_type); \
            Kokkos::TeamPolicy<execution_space,AsyncTag<B, true> > \
              policy(rowidx2part.extent(0), team_size, vector_size); \
            policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
            Kokkos::parallel_for \
              ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
               policy, *this); \
          } \
          else { \
            const local_ordinal_type team_size = 8; \
            const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
            const Kokkos::TeamPolicy<execution_space,AsyncTag<B, false> > \
              policy(rowidx2part.extent(0), team_size, vector_size); \
            Kokkos::parallel_for \
              ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
               policy, *this); \
          } \
        } break
          switch (blocksize_requested) {
            case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
            case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
            case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
            case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
            case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
            case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
            case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
            case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
            case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
            default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
        } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
          if(this->hasBlockCrsMatrix) { \
            const Kokkos::RangePolicy<execution_space,AsyncTag<B, true> > policy(0, rowidx2part.extent(0)); \
            Kokkos::parallel_for \
              ("ComputeResidual::RangePolicy::run<AsyncTag>", \
               policy, *this); \
          } \
          else { \
            const Kokkos::RangePolicy<execution_space,AsyncTag<B, false> > policy(0, rowidx2part.extent(0)); \
            Kokkos::parallel_for \
              ("ComputeResidual::RangePolicy::run<AsyncTag>", \
               policy, *this); \
          } \
        } break

          switch (blocksize_requested) {
            case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
            case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
            case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
            case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
            case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
            case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
            case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
            case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
            case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
            default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
        }
        IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
      }

      // y = b - R (x , x_remote); overlap method
      template<typename MultiVectorLocalViewTypeB,
               typename MultiVectorLocalViewTypeX,
               typename MultiVectorLocalViewTypeX_Remote>
      void run(const vector_type_3d_view &y_packed_,
               const MultiVectorLocalViewTypeB &b_,
               const MultiVectorLocalViewTypeX &x_,
               const MultiVectorLocalViewTypeX_Remote &x_remote_,
               const bool compute_owned) {
        IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
        IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE("BlockTriDi::ComputeResidual::<OverlapTag>", ComputeResidual0, execution_space);

        b = b_; x = x_; x_remote = x_remote_;
        if constexpr (is_device<execution_space>::value) {
          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
                                                     y_packed_.extent(0),
                                                     y_packed_.extent(1),
                                                     y_packed_.extent(2),
                                                     vector_length);
        } else {
          y_packed = y_packed_;
        }

        if constexpr (is_device<execution_space>::value) {
          const local_ordinal_type blocksize = blocksize_requested;
          // local_ordinal_type vl_power_of_two = 1;
          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
          // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
          if(this->hasBlockCrsMatrix) { \
            const local_ordinal_type team_size = 8; \
            const local_ordinal_type vector_size = 8; \
            const size_t shmem_team_size = blocksize*sizeof(btdm_scalar_type); \
            const size_t shmem_thread_size = blocksize*sizeof(btdm_scalar_type); \
            if (compute_owned) { \
              Kokkos::TeamPolicy<execution_space,OverlapTag<0, B, true> > \
                policy(rowidx2part.extent(0), team_size, vector_size); \
              policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
              Kokkos::parallel_for \
                ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
            } else { \
              Kokkos::TeamPolicy<execution_space,OverlapTag<1, B, true> > \
                policy(rowidx2part.extent(0), team_size, vector_size); \
              policy.set_scratch_size(0,Kokkos::PerTeam(shmem_team_size),Kokkos::PerThread(shmem_thread_size)); \
              Kokkos::parallel_for \
                ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
            } \
          } \
          else { \
            const local_ordinal_type team_size = 8; \
            const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size); \
            if (compute_owned) { \
              const Kokkos::TeamPolicy<execution_space,OverlapTag<0, B, false> > \
                policy(rowidx2part.extent(0), team_size, vector_size); \
              Kokkos::parallel_for \
                ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
            } else { \
              const Kokkos::TeamPolicy<execution_space,OverlapTag<1, B, false> > \
                policy(rowidx2part.extent(0), team_size, vector_size); \
              Kokkos::parallel_for \
                ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
            } \
          } \
          break
          switch (blocksize_requested) {
            case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
            case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
            case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
            case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
            case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
            case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
            case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
            case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
            case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
            default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
        } else {
#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
          if(this->hasBlockCrsMatrix) { \
            if (compute_owned) { \
              const Kokkos::RangePolicy<execution_space,OverlapTag<0, B, true>> \
                policy(0, rowidx2part.extent(0)); \
              Kokkos::parallel_for \
                ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
            } else { \
              const Kokkos::RangePolicy<execution_space,OverlapTag<1, B, true>> \
                policy(0, rowidx2part.extent(0)); \
              Kokkos::parallel_for \
                ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
            } \
          } \
          else { \
            if (compute_owned) { \
              const Kokkos::RangePolicy<execution_space,OverlapTag<0, B, false>> \
                policy(0, rowidx2part.extent(0)); \
              Kokkos::parallel_for \
                ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
            } else { \
              const Kokkos::RangePolicy<execution_space,OverlapTag<1, B, false>> \
                policy(0, rowidx2part.extent(0)); \
              Kokkos::parallel_for \
                ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
            } \
          } \
          break

          switch (blocksize_requested) {
            case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
            case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
            case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
            case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
            case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
            case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
            case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
            case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
            case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
            default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
          }
#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
        }
        IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
      }
    };
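
    // Illustrative sketch (not part of the build): how a driver might use
    // ComputeResidualVector, assuming `amd`, `interf`, `block_graph`,
    // `point_graph`, `dm2cm`, and the local vector views are already built by
    // the surrounding BlockTriDi machinery.
    //
    //   ComputeResidualVector<MatrixType> cr(amd, block_graph, point_graph,
    //                                        blocksize, interf, dm2cm,
    //                                        /*hasBlockCrsMatrix=*/true);
    //   cr.run(y, b, x);                         // seq: y = b - R x
    //   cr.run(y_packed, b, x, x_remote);        // async importer variant
    //   cr.run(y_packed, b, x, x_remote, true);  // overlap variant, owned columns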

  } // namespace BlockHelperDetails

} // namespace Ifpack2

#endif