Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_residual.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_RESIDUAL_HPP
11#define TPETRA_DETAILS_RESIDUAL_HPP
12
13#include "TpetraCore_config.h"
14#include "Tpetra_CrsMatrix.hpp"
15#include "Tpetra_LocalCrsMatrixOperator.hpp"
16#include "Tpetra_MultiVector.hpp"
17#include "Teuchos_RCP.hpp"
18#include "Teuchos_ScalarTraits.hpp"
21#include "KokkosSparse_spmv_impl.hpp"
22
28
29namespace Tpetra {
30 namespace Details {
31
32
39template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
40struct LocalResidualFunctor {
41
42 using execution_space = typename AMatrix::execution_space;
43 using LO = typename AMatrix::non_const_ordinal_type;
44 using value_type = typename AMatrix::non_const_value_type;
45 using team_policy = typename Kokkos::TeamPolicy<execution_space>;
46 using team_member = typename team_policy::member_type;
47 using ATV = Kokkos::ArithTraits<value_type>;
48
49 AMatrix A_lcl;
50 ConstMV X_colmap_lcl;
51 ConstMV B_lcl;
52 MV R_lcl;
53 int rows_per_team;
54 Offsets offsets;
55 ConstMV X_domainmap_lcl;
56
57
58 LocalResidualFunctor (const AMatrix& A_lcl_,
59 const ConstMV& X_colmap_lcl_,
60 const ConstMV& B_lcl_,
61 const MV& R_lcl_,
62 const int rows_per_team_,
63 const Offsets& offsets_,
64 const ConstMV& X_domainmap_lcl_) :
65 A_lcl(A_lcl_),
66 X_colmap_lcl(X_colmap_lcl_),
67 B_lcl(B_lcl_),
68 R_lcl(R_lcl_),
69 rows_per_team(rows_per_team_),
70 offsets(offsets_),
71 X_domainmap_lcl(X_domainmap_lcl_)
72 { }
73
74 KOKKOS_INLINE_FUNCTION
75 void operator() (const team_member& dev) const
76 {
77
78 Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
79 const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
80
81 if (lclRow >= A_lcl.numRows ()) {
82 return;
83 }
84
85 if (!is_MV) { // MultiVectors only have a single column
86
87 value_type A_x = ATV::zero ();
88
89 if (!restrictedMode) {
90 const auto A_row = A_lcl.rowConst(lclRow);
91 const LO row_length = static_cast<LO> (A_row.length);
92
93 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
94 const auto A_val = A_row.value(iEntry);
95 lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
96 }, A_x);
97
98 }
99 else {
100
101 const LO offRankOffset = offsets(lclRow);
102 const size_t start = A_lcl.graph.row_map(lclRow);
103 const size_t end = A_lcl.graph.row_map(lclRow+1);
104
105 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
106 const auto A_val = A_lcl.values(iEntry);
107 const auto lclCol = A_lcl.graph.entries(iEntry);
108 if (iEntry < offRankOffset)
109 lsum += A_val * X_domainmap_lcl(lclCol,0);
110 else if (!skipOffRank)
111 lsum += A_val * X_colmap_lcl(lclCol,0);
112 }, A_x);
113 }
114
115 Kokkos::single(Kokkos::PerThread(dev),[&] () {
116 R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
117 });
118 }
119 else { // MultiVectors have more than one column
120
121 const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
122
123 for(LO v=0; v<numVectors; v++) {
124
125 value_type A_x = ATV::zero ();
126
127 if (!restrictedMode) {
128
129 const auto A_row = A_lcl.rowConst(lclRow);
130 const LO row_length = static_cast<LO> (A_row.length);
131
132 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
133 const auto A_val = A_row.value(iEntry);
134 lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
135 }, A_x);
136 }
137 else {
138 const LO offRankOffset = offsets(lclRow);
139 const size_t start = A_lcl.graph.row_map(lclRow);
140 const size_t end = A_lcl.graph.row_map(lclRow+1);
141
142 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
143 const auto A_val = A_lcl.values(iEntry);
144 const auto lclCol = A_lcl.graph.entries(iEntry);
145 if (iEntry < offRankOffset)
146 lsum += A_val * X_domainmap_lcl(lclCol,v);
147 else if (!skipOffRank)
148 lsum += A_val * X_colmap_lcl(lclCol,v);
149 }, A_x);
150 }
151
152 Kokkos::single(Kokkos::PerThread(dev),[&] () {
153 R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
154 });
155
156 }//end for numVectors
157 }
158 });//end parallel_for TeamThreadRange
159 }
160};
161
162
164template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
165struct OffRankUpdateFunctor {
166
167 using execution_space = typename AMatrix::execution_space;
168 using LO = typename AMatrix::non_const_ordinal_type;
169 using value_type = typename AMatrix::non_const_value_type;
170 using team_policy = typename Kokkos::TeamPolicy<execution_space>;
171 using team_member = typename team_policy::member_type;
172 using ATV = Kokkos::ArithTraits<value_type>;
173
174 AMatrix A_lcl;
175 ConstMV X_colmap_lcl;
176 MV R_lcl;
177 int rows_per_team;
178 Offsets offsets;
179
180
181 OffRankUpdateFunctor (const AMatrix& A_lcl_,
182 const ConstMV& X_colmap_lcl_,
183 const MV& R_lcl_,
184 const int rows_per_team_,
185 const Offsets& offsets_) :
186 A_lcl(A_lcl_),
187 X_colmap_lcl(X_colmap_lcl_),
188 R_lcl(R_lcl_),
189 rows_per_team(rows_per_team_),
190 offsets(offsets_)
191 { }
192
193 KOKKOS_INLINE_FUNCTION
194 void operator() (const team_member& dev) const
195 {
196
197 Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
198 const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
199
200 if (lclRow >= A_lcl.numRows ()) {
201 return;
202 }
203
204 const LO offRankOffset = offsets(lclRow);
205 const size_t end = A_lcl.graph.row_map(lclRow+1);
206
207 if (!is_MV) { // MultiVectors only have a single column
208
209 value_type A_x = ATV::zero ();
210
211 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
212 const auto A_val = A_lcl.values(iEntry);
213 const auto lclCol = A_lcl.graph.entries(iEntry);
214 lsum += A_val * X_colmap_lcl(lclCol,0);
215 }, A_x);
216
217 Kokkos::single(Kokkos::PerThread(dev),[&] () {
218 R_lcl(lclRow,0) -= A_x;
219 });
220 }
221 else { // MultiVectors have more than one column
222
223 const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
224
225 for(LO v=0; v<numVectors; v++) {
226
227 value_type A_x = ATV::zero ();
228
229 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
230 const auto A_val = A_lcl.values(iEntry);
231 const auto lclCol = A_lcl.graph.entries(iEntry);
232 lsum += A_val * X_colmap_lcl(lclCol,v);
233 }, A_x);
234
235 Kokkos::single(Kokkos::PerThread(dev),[&] () {
236 R_lcl(lclRow,v) -= A_x;
237 });
238
239 }//end for numVectors
240 }
241 });
242 }
243};
244
245template<class SC, class LO, class GO, class NO>
246void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
247 const MultiVector<SC,LO,GO,NO> & X_colmap,
248 const MultiVector<SC,LO,GO,NO> & B,
250 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
251 const MultiVector<SC,LO,GO,NO> * X_domainmap=nullptr) {
253 using Teuchos::NO_TRANS;
254 ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");
255
257 using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
258 using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
259 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
260
262 const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
263 const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
264 local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
265 const_local_view_device_type X_domainmap_lcl;
266
267 const bool debug = ::Tpetra::Details::Behavior::debug ();
268 if (debug) {
269 TEUCHOS_TEST_FOR_EXCEPTION
270 (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
271 "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
272 "R.getNumVectors() = " << R.getNumVectors () << ".");
273 TEUCHOS_TEST_FOR_EXCEPTION
274 (X_colmap.getLocalLength () !=
275 A.getColMap ()->getLocalNumElements (), std::runtime_error,
276 "X has the wrong number of local rows. "
277 "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
278 "A.getColMap()->getLocalNumElements() = " <<
279 A.getColMap ()->getLocalNumElements () << ".");
280 TEUCHOS_TEST_FOR_EXCEPTION
281 (R.getLocalLength () !=
282 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
283 "R has the wrong number of local rows. "
284 "R.getLocalLength() = " << R.getLocalLength () << " != "
285 "A.getRowMap()->getLocalNumElements() = " <<
286 A.getRowMap ()->getLocalNumElements () << ".");
287 TEUCHOS_TEST_FOR_EXCEPTION
288 (B.getLocalLength () !=
289 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
290 "B has the wrong number of local rows. "
291 "B.getLocalLength() = " << B.getLocalLength () << " != "
292 "A.getRowMap()->getLocalNumElements() = " <<
293 A.getRowMap ()->getLocalNumElements () << ".");
294
295 TEUCHOS_TEST_FOR_EXCEPTION
296 (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
297 "fill complete. You must call fillComplete() (possibly with "
298 "domain and range Map arguments) without an intervening "
299 "resumeFill() call before you may call this method.");
300 TEUCHOS_TEST_FOR_EXCEPTION
301 (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
302 std::runtime_error, "X, Y and B must be constant stride.");
303 // If the two pointers are NULL, then they don't alias one
304 // another, even though they are equal.
305 TEUCHOS_TEST_FOR_EXCEPTION
306 ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
307 (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
308 std::runtime_error, "X, Y and R may not alias one another.");
309 }
310
311 const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual ();
312 if (!fusedResidual) {
313 SC one = Teuchos::ScalarTraits<SC>::one();
314 SC negone = -one;
315 SC zero = Teuchos::ScalarTraits<SC>::zero();
316 // This is currently a "reference implementation" waiting until Kokkos Kernels provides
317 // a residual kernel.
318 A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
319 R.update(one,B,negone);
320 return;
321 }
322
323 if (A_lcl.numRows() == 0) {
324 return;
325 }
326
327 int64_t numLocalRows = A_lcl.numRows ();
328 int64_t myNnz = A_lcl.nnz();
329
330 int team_size = -1;
331 int vector_length = -1;
332 int64_t rows_per_thread = -1;
333
334 using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
335 using policy_type = typename Kokkos::TeamPolicy<execution_space>;
336
337 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
338 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
339
340 policy_type policy (1, 1);
341 if (team_size < 0) {
342 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
343 }
344 else {
345 policy = policy_type (worksets, team_size, vector_length);
346 }
347
348 bool is_vector = (X_colmap_lcl.extent(1) == 1);
349
350 if(is_vector) {
351
352 if (X_domainmap == nullptr) {
353
355 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
356 Kokkos::parallel_for("residual-vector",policy,func);
357
358 }
359 else {
360
361 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
363 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
364 Kokkos::parallel_for("residual-vector",policy,func);
365
366 }
367 }
368 else {
369
370 if (X_domainmap == nullptr) {
371
373 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
374 Kokkos::parallel_for("residual-multivector",policy,func);
375
376 }
377 else {
378
379 X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
381 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
382 Kokkos::parallel_for("residual-multivector",policy,func);
383
384 }
385 }
386}
387
388
389template<class SC, class LO, class GO, class NO>
390void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
391 MultiVector<SC,LO,GO,NO> & X_colmap,
392 const MultiVector<SC,LO,GO,NO> & B,
393 MultiVector<SC,LO,GO,NO> & R,
394 const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
395 const MultiVector<SC,LO,GO,NO> & X_domainmap) {
396 using Tpetra::Details::ProfilingRegion;
397 using Teuchos::NO_TRANS;
398 using Teuchos::RCP;
399 using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
400
401 ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
402
403 using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
404 using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
405 using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
406 using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
407
408 local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
409 const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
410 const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
411 local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
412 const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);;
413
414 const bool debug = ::Tpetra::Details::Behavior::debug ();
415 if (debug) {
416 TEUCHOS_TEST_FOR_EXCEPTION
417 (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
418 "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
419 "R.getNumVectors() = " << R.getNumVectors () << ".");
420 TEUCHOS_TEST_FOR_EXCEPTION
421 (X_colmap.getLocalLength () !=
422 A.getColMap ()->getLocalNumElements (), std::runtime_error,
423 "X has the wrong number of local rows. "
424 "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
425 "A.getColMap()->getLocalNumElements() = " <<
426 A.getColMap ()->getLocalNumElements () << ".");
427 TEUCHOS_TEST_FOR_EXCEPTION
428 (R.getLocalLength () !=
429 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
430 "R has the wrong number of local rows. "
431 "R.getLocalLength() = " << R.getLocalLength () << " != "
432 "A.getRowMap()->getLocalNumElements() = " <<
433 A.getRowMap ()->getLocalNumElements () << ".");
434 TEUCHOS_TEST_FOR_EXCEPTION
435 (B.getLocalLength () !=
436 A.getRowMap ()->getLocalNumElements (), std::runtime_error,
437 "B has the wrong number of local rows. "
438 "B.getLocalLength() = " << B.getLocalLength () << " != "
439 "A.getRowMap()->getLocalNumElements() = " <<
440 A.getRowMap ()->getLocalNumElements () << ".");
441
442 TEUCHOS_TEST_FOR_EXCEPTION
443 (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
444 "fill complete. You must call fillComplete() (possibly with "
445 "domain and range Map arguments) without an intervening "
446 "resumeFill() call before you may call this method.");
447 TEUCHOS_TEST_FOR_EXCEPTION
448 (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
449 std::runtime_error, "X, Y and B must be constant stride.");
450 // If the two pointers are NULL, then they don't alias one
451 // another, even though they are equal.
452 TEUCHOS_TEST_FOR_EXCEPTION
453 ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
454 (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
455 std::runtime_error, "X, Y and R may not alias one another.");
456 }
457
458 if (A_lcl.numRows() == 0) {
459 return;
460 }
461
462 int64_t numLocalRows = A_lcl.numRows ();
463 int64_t myNnz = A_lcl.nnz();
464
465 int team_size = -1;
466 int vector_length = -1;
467 int64_t rows_per_thread = -1;
468
469 using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
470 using policy_type = typename Kokkos::TeamPolicy<execution_space>;
471
472 int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
473 int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
474
475 policy_type policy (1, 1);
476 if (team_size < 0) {
477 policy = policy_type (worksets, Kokkos::AUTO, vector_length);
478 }
479 else {
480 policy = policy_type (worksets, team_size, vector_length);
481 }
482
483 bool is_vector = (X_colmap_lcl.extent(1) == 1);
484
485 if(is_vector) {
486
488 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
489 Kokkos::parallel_for("residual-vector",policy,func);
490
491 RCP<const import_type> importer = A.getGraph ()->getImporter ();
492 X_colmap.endImport (X_domainmap, *importer, INSERT, true);
493
494 Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");
495
497 functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
498 Kokkos::parallel_for("residual-vector-offrank",policy,func2);
499
500 }
501 else {
502
504 functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
505 Kokkos::parallel_for("residual-multivector",policy,func);
506
507 RCP<const import_type> importer = A.getGraph ()->getImporter ();
508 X_colmap.endImport (X_domainmap, *importer, INSERT, true);
509
510 Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");
511
513 functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
514 Kokkos::parallel_for("residual-vector-offrank",policy,func2);
515
516 }
517}
518
519
521template<class SC, class LO, class GO, class NO>
523 const MultiVector<SC,LO,GO,NO> & X_in,
524 const MultiVector<SC,LO,GO,NO> & B_in,
527 using Teuchos::RCP;
528 using Teuchos::rcp;
529 using Teuchos::rcp_const_cast;
530 using Teuchos::rcpFromRef;
531
532 const bool debug = ::Tpetra::Details::Behavior::debug ();
533 const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
534 const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
535 if (overlapCommunicationAndComputation)
536 TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);
537
538 // Whether we are using restrictedMode in the import from domain to
539 // column map. Restricted mode skips the copy and permutation of the
540 // local part of X. We are using restrictedMode only when domain and
541 // column map are locally fitted, i.e. when the local indices of
542 // domain and column map match.
543 bool restrictedMode = false;
544
545 const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
546 if(!Apt) {
547 // If we're not a CrsMatrix, we can't do fusion, so just do apply+update
548 SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
549 Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
550 R_in.update(one,B_in,negone);
551 return;
552 }
553 const CrsMatrix<SC,LO,GO,NO> & A = *Apt;
554
557 using MV = MultiVector<SC,LO,GO,NO>;
558 using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
559 using offset_type = typename graph_type::offset_device_view_type;
560
561 // We treat the case of a replicated MV output specially.
562 const bool R_is_replicated =
563 (! R_in.isDistributed () && A.getComm ()->getSize () != 1);
564
565 // It's possible that R is a view of X or B.
566 // We don't try to to detect the more subtle cases (e.g., one is a
567 // subview of the other, but their initial pointers differ). We
568 // only need to do this if this matrix's Import is trivial;
569 // otherwise, we don't actually apply the operator from X into Y.
570
571 RCP<const import_type> importer = A.getGraph ()->getImporter ();
572 RCP<const export_type> exporter = A.getGraph ()->getExporter ();
573
574 // Temporary MV for Import operation. After the block of code
575 // below, this will be an (Imported if necessary) column Map MV
576 // ready to give to localApply(...).
577 RCP<MV> X_colMap;
578 if (importer.is_null ()) {
579 if (! X_in.isConstantStride ()) {
580 // Not all sparse mat-vec kernels can handle an input MV with
581 // nonconstant stride correctly, so we have to copy it in that
582 // case into a constant stride MV. To make a constant stride
583 // copy of X_in, we force creation of the column (== domain)
584 // Map MV (if it hasn't already been created, else fetch the
585 // cached copy). This avoids creating a new MV each time.
586 X_colMap = A.getColumnMapMultiVector (X_in, true);
587 Tpetra::deep_copy (*X_colMap, X_in);
588 // X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
589 }
590 else {
591 // The domain and column Maps are the same, so do the local
592 // multiply using the domain Map input MV X_in.
593 X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in) );
594 }
595 }
596 else { // need to Import source (multi)vector
597 ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
598 // We're doing an Import anyway, which will copy the relevant
599 // elements of the domain Map MV X_in into a separate column Map
600 // MV. Thus, we don't have to worry whether X_in is constant
601 // stride.
602 X_colMap = A.getColumnMapMultiVector (X_in);
603
604 // Do we want to use restrictedMode?
605 restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();
606
607 if (debug && restrictedMode) {
608 TEUCHOS_TEST_FOR_EXCEPTION
609 (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
610 "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
611 }
612
613 // Import from the domain Map MV to the column Map MV.
614 X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
615 }
616
617 offset_type offsets;
618 if (restrictedMode)
619 A.getCrsGraph()->getLocalOffRankOffsets(offsets);
620
621 // Get a vector for the R_rowMap output residual, handling the
622 // non-constant stride and exporter cases. Since R gets clobbered
623 // we don't need to worry about the data in it
624 RCP<MV> R_rowMap;
625 if(exporter.is_null()) {
626 if (! R_in.isConstantStride ()) {
627 R_rowMap = A.getRowMapMultiVector(R_in);
628 }
629 else {
630 R_rowMap = rcpFromRef (R_in);
631 }
632 }
633 else {
634 R_rowMap = A.getRowMapMultiVector (R_in);
635 }
636
637 // Get a vector for the B_rowMap output residual, handling the
638 // non-constant stride and exporter cases
639 RCP<const MV> B_rowMap;
640 if(exporter.is_null()) {
641 if (! B_in.isConstantStride ()) {
642 // Do an allocation here. If we need to optimize this later, we can have the matrix
643 // cache this.
644 RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
645 Tpetra::deep_copy (*B_rowMapNonConst, B_in);
646 B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
647 }
648 else {
649 B_rowMap = rcpFromRef (B_in);
650 }
651 }
652 else {
653 // Do an allocation here. If we need to optimize this later, we can have the matrix
654 // cache this.
655 ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
656 RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
657 B_rowMapNonConst->doImport(B_in, *exporter, ADD);
658 B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
659 }
660
661 // If we have a nontrivial Export object, we must perform an
662 // Export. In that case, the local multiply result will go into
663 // the row Map multivector. We don't have to make a
664 // constant-stride version of R_in in this case, because we had to
665 // make a constant stride R_rowMap MV and do an Export anyway.
666 if (! exporter.is_null ()) {
667 if ( ! importer.is_null ())
668 X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
669 if (restrictedMode && !importer.is_null ())
670 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
671 else
672 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
673
674 {
675 ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");
676
677 // Do the Export operation.
678 R_in.doExport (*R_rowMap, *exporter, ADD);
679 }
680 }
681 else { // Don't do an Export: row Map and range Map are the same.
682
683 if (restrictedMode)
684 if (overlapCommunicationAndComputation) {
685 localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
686 } else {
687 X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
688 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
689 }
690 else {
691 if ( ! importer.is_null ())
692 X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
693 localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
694 }
695
696 //
697 // If R_in does not have constant stride,
698 // then we can't let the kernel write directly to R_in.
699 // Instead, we have to use the cached row (== range)
700 // Map MV as temporary storage.
701 //
702 if (! R_in.isConstantStride () ) {
703 // We need to be sure to do a copy out in this case.
704 Tpetra::deep_copy (R_in, *R_rowMap);
705 }
706 }
707
708 // If the range Map is a locally replicated Map, sum up
709 // contributions from each process.
710 if (R_is_replicated) {
711 ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce Y");
712 R_in.reduce ();
713 }
714}
715
716
717
718
719
720 } // namespace Details
721} // namespace Tpetra
722
723#endif // TPETRA_DETAILS_RESIDUAL_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
typename device_type::execution_space execution_space
The Kokkos execution space.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
Export< LocalOrdinal, GlobalOrdinal, Node > export_type
The Export specialization suitable for this CrsMatrix specialization.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
static bool fusedResidual()
Fusing SpMV and update in residual instead of using 2 kernel launches. Fusing kernels implies that no...
static bool debug()
Whether Tpetra is in debug mode.
static bool overlapCommunicationAndComputation()
Overlap communication and computation.
static bool skipCopyAndPermuteIfPossible()
Skip copyAndPermute if possible.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
bool isDistributed() const
Whether this is a globally distributed object.
One or more distributed dense vectors.
void reduce()
Sum values of a locally replicated multivector across all processes.
size_t getLocalLength() const
Local number of rows on the calling process.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract interface for operators (e.g., matrices and preconditioners).
virtual void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const =0
Computes the operator-multivector application.
Nonmember function that computes a residual Computes R = B - A * X.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ ADD
Sum new values.
@ INSERT
Insert new values that don't currently exist.
Functor for computing the residual.
Functor for computing R -= A_offRank*X_colmap.