Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_CrsMatrixMultiplyOp.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_CRSMATRIXMULTIPLYOP_HPP
11#define TPETRA_CRSMATRIXMULTIPLYOP_HPP
12
17
19#include "Tpetra_CrsMatrix.hpp"
20#include "Tpetra_Util.hpp"
23#include "Tpetra_LocalCrsMatrixOperator.hpp"
24
25namespace Tpetra {
26
63 template <class Scalar,
64 class MatScalar,
65 class LocalOrdinal,
66 class GlobalOrdinal,
67 class Node>
69 public Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node>
70 {
71 public:
77
78 private:
79 using local_matrix_device_type =
81
82 public:
84
85
90 CrsMatrixMultiplyOp (const Teuchos::RCP<const crs_matrix_type>& A) :
91 matrix_ (A),
92 localMultiply_ (std::make_shared<local_matrix_device_type> (
93 A->getLocalMatrixDevice ()))
94 {}
95
97 ~CrsMatrixMultiplyOp () override = default;
98
100
102
105 void
108 Teuchos::ETransp mode = Teuchos::NO_TRANS,
109 Scalar alpha = Teuchos::ScalarTraits<Scalar>::one (),
110 Scalar beta = Teuchos::ScalarTraits<Scalar>::zero ()) const override
111 {
112 TEUCHOS_TEST_FOR_EXCEPTION
113 (! matrix_->isFillComplete (), std::runtime_error,
114 Teuchos::typeName (*this) << "::apply(): underlying matrix is not fill-complete.");
115 TEUCHOS_TEST_FOR_EXCEPTION
116 (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
117 Teuchos::typeName (*this) << "::apply(X,Y): X and Y must have the same number of vectors.");
118 TEUCHOS_TEST_FOR_EXCEPTION
119 (Teuchos::ScalarTraits<Scalar>::isComplex && mode == Teuchos::TRANS, std::logic_error,
120 Teuchos::typeName (*this) << "::apply() does not currently support transposed multiplications for complex scalar types.");
121 if (mode == Teuchos::NO_TRANS) {
122 applyNonTranspose (X, Y, alpha, beta);
123 }
124 else {
125 applyTranspose (X, Y, mode, alpha, beta);
126 }
127 }
128
134 bool hasTransposeApply() const override {
135 return true;
136 }
137
139 Teuchos::RCP<const map_type> getDomainMap () const override {
140 return matrix_->getDomainMap ();
141 }
142
144 Teuchos::RCP<const map_type> getRangeMap () const override {
145 return matrix_->getRangeMap ();
146 }
147
149
150 protected:
152
154 const Teuchos::RCP<const crs_matrix_type> matrix_;
155
157 LocalCrsMatrixOperator<Scalar, MatScalar, typename
159
172 mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > importMV_;
173
186 mutable Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > exportMV_;
187
190 void
193 Teuchos::ETransp mode,
194 Scalar alpha,
195 Scalar beta) const
196 {
197 using Teuchos::null;
198 using Teuchos::RCP;
199 using Teuchos::rcp;
202 using STS = Teuchos::ScalarTraits<Scalar>;
203 typedef typename MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>:: dual_view_type::t_dev nonconst_view_type;
204
205 const size_t numVectors = X_in.getNumVectors();
206 // because of Views, it is difficult to determine if X and Y point to the same data.
207 // however, if they reference the exact same object, we will do the user the favor of copying X into new storage (with a warning)
208 // we ony need to do this if we have trivial importers; otherwise, we don't actually apply the operator from X into Y
209 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
210 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
211
212 // some parameters for below
213 const bool Y_is_replicated = ! Y_in.isDistributed ();
214 const bool Y_is_overwritten = (beta == STS::zero ());
215 const int myRank = matrix_->getComm ()->getRank ();
216 if (Y_is_replicated && myRank != 0) {
217 beta = STS::zero ();
218 }
219
220 // access X indirectly, in case we need to create temporary storage
221 RCP<const MV> X;
222 // currently, cannot multiply from multivector of non-constant stride
223 if (! X_in.isConstantStride () && importer == null) {
224 // generate a strided copy of X_in
225 X = Teuchos::rcp (new MV (X_in, Teuchos::Copy));
226 }
227 else {
228 // just temporary, so this non-owning RCP is okay
229 X = Teuchos::rcpFromRef (X_in);
230 }
231
232 // set up import/export temporary multivectors
233 if (importer != null) {
234 if (importMV_ != null && importMV_->getNumVectors () != numVectors) {
235 importMV_ = null;
236 }
237 if (importMV_ == null) {
238 importMV_ = rcp (new MV (matrix_->getColMap (), numVectors));
239 }
240 }
241 if (exporter != null) {
242 if (exportMV_ != null && exportMV_->getNumVectors () != numVectors) {
243 exportMV_ = null;
244 }
245 if (exportMV_ == null) {
246 exportMV_ = rcp (new MV (matrix_->getRowMap (), numVectors));
247 }
248 }
249
250 // If we have a non-trivial exporter, we must import elements that are permuted or are on other processors
251 if (exporter != null) {
252 exportMV_->doImport(X_in,*exporter,INSERT);
253 X = exportMV_; // multiply out of exportMV_
254 }
255
256 auto X_lcl = X->getLocalViewDevice(Access::ReadOnly);
257
258 // If we have a non-trivial importer, we must export elements that are permuted or belong to other processors
259 // We will compute solution into the to-be-exported MV; get a view
260 if (importer != null) {
261 // Beta is zero here, so we clobber Y_lcl
262 auto Y_lcl = importMV_->getLocalViewDevice(Access::OverwriteAll);
263
264 localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, STS::zero ());
265 if (Y_is_overwritten) {
266 Y_in.putScalar (STS::zero ());
267 }
268 else {
269 Y_in.scale (beta);
270 }
271 Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
272 }
273 // otherwise, multiply into Y
274 else {
275 // can't multiply in-situ; can't multiply into non-strided multivector
276 if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
277 // generate a strided copy of Y
278 MV Y (Y_in, Teuchos::Copy);
279 nonconst_view_type Y_lcl;
280 if(Y_is_overwritten) Y_lcl = Y.getLocalViewDevice(Access::OverwriteAll);
281 else Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
282
283 localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, beta);
284 Tpetra::deep_copy (Y_in, Y);
285 }
286 else {
287 nonconst_view_type Y_lcl;
288 if(Y_is_overwritten) Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
289 else Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);
290
291 localMultiply_.apply (X_lcl, Y_lcl, mode, alpha, beta);
292 }
293 }
294 // Handle case of rangemap being a local replicated map: in this case, sum contributions from each processor
295 if (Y_is_replicated) {
296 Y_in.reduce();
297 }
298 }
299
301 void
304 Scalar alpha,
305 Scalar beta) const
306 {
308 using Teuchos::NO_TRANS;
309 using Teuchos::RCP;
310 using Teuchos::rcp;
311 using Teuchos::rcp_const_cast;
312 using Teuchos::rcpFromRef;
315 typedef Teuchos::ScalarTraits<Scalar> STS;
316 typedef typename MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>:: dual_view_type::t_dev nonconst_view_type;
317
318 if (alpha == STS::zero ()) {
319 if (beta == STS::zero ()) {
320 Y_in.putScalar (STS::zero ());
321 } else if (beta != STS::one ()) {
322 Y_in.scale (beta);
323 }
324 return;
325 }
326
327 // It's possible that X is a view of Y or vice versa. We don't
328 // allow this (apply() requires that X and Y not alias one
329 // another), but it's helpful to detect and work around this
330 // case. We don't try to to detect the more subtle cases (e.g.,
331 // one is a subview of the other, but their initial pointers
332 // differ). We only need to do this if this matrix's Import is
333 // trivial; otherwise, we don't actually apply the operator from
334 // X into Y.
335
336 RCP<const import_type> importer = matrix_->getGraph()->getImporter();
337 RCP<const export_type> exporter = matrix_->getGraph()->getExporter();
338
339 // If beta == 0, then the output MV will be overwritten; none of
340 // its entries should be read. (Sparse BLAS semantics say that we
341 // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
342 // This matters if we need to do an Export operation; see below.
343 const bool Y_is_overwritten = (beta == STS::zero());
344
345 // We treat the case of a replicated MV output specially.
346 const bool Y_is_replicated =
347 (! Y_in.isDistributed () && matrix_->getComm ()->getSize () != 1);
348
349 // This is part of the special case for replicated MV output.
350 // We'll let each process do its thing, but do an all-reduce at
351 // the end to sum up the results. Setting beta=0 on all
352 // processes but Proc 0 makes the math work out for the
353 // all-reduce. (This assumes that the replicated data is
354 // correctly replicated, so that the data are the same on all
355 // processes.)
356 if (Y_is_replicated && matrix_->getComm ()->getRank () > 0) {
357 beta = STS::zero();
358 }
359
360 // Temporary MV for Import operation. After the block of code
361 // below, this will be an (Imported if necessary) column Map MV
362 // ready to give to localMultiply_.apply(...).
363 RCP<const MV> X_colMap;
364 if (importer.is_null ()) {
365 if (! X_in.isConstantStride ()) {
366 // Not all sparse mat-vec kernels can handle an input MV with
367 // nonconstant stride correctly, so we have to copy it in that
368 // case into a constant stride MV. To make a constant stride
369 // copy of X_in, we force creation of the column (== domain)
370 // Map MV (if it hasn't already been created, else fetch the
371 // cached copy). This avoids creating a new MV each time.
372 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
373 Tpetra::deep_copy (*X_colMapNonConst, X_in);
374 X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
375 }
376 else {
377 // The domain and column Maps are the same, so do the local
378 // multiply using the domain Map input MV X_in.
379 X_colMap = rcpFromRef (X_in);
380 }
381 }
382 else { // need to Import source (multi)vector
383 ProfilingRegion regionImport ("Tpetra::CrsMatrixMultiplyOp::apply: Import");
384
385 // We're doing an Import anyway, which will copy the relevant
386 // elements of the domain Map MV X_in into a separate column Map
387 // MV. Thus, we don't have to worry whether X_in is constant
388 // stride.
389 RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
390
391 // Import from the domain Map MV to the column Map MV.
392 X_colMapNonConst->doImport (X_in, *importer, INSERT);
393 X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
394 }
395
396 // Temporary MV for doExport (if needed), or for copying a
397 // nonconstant stride output MV into a constant stride MV. This
398 // is null if we don't need the temporary MV, that is, if the
399 // Export is trivial (null).
400 RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
401
402 auto X_lcl = X_colMap->getLocalViewDevice(Access::ReadOnly);
403
404 // If we have a nontrivial Export object, we must perform an
405 // Export. In that case, the local multiply result will go into
406 // the row Map multivector. We don't have to make a
407 // constant-stride version of Y_in in this case, because we had to
408 // make a constant stride Y_rowMap MV and do an Export anyway.
409 if (! exporter.is_null ()) {
410 auto Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
411
412 localMultiply_.apply (X_lcl, Y_lcl, NO_TRANS,
413 alpha, STS::zero ());
414 {
415 ProfilingRegion regionExport ("Tpetra::CrsMatrixMultiplyOp::apply: Export");
416
417 // If we're overwriting the output MV Y_in completely (beta
418 // == 0), then make sure that it is filled with zeros before
419 // we do the Export. Otherwise, the ADD combine mode will
420 // use data in Y_in, which is supposed to be zero.
421 if (Y_is_overwritten) {
422 Y_in.putScalar (STS::zero());
423 }
424 else {
425 // Scale the output MV by beta, so that the Export sums in
426 // the mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
427 Y_in.scale (beta);
428 }
429 // Do the Export operation.
430 Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
431 }
432 }
433 else { // Don't do an Export: row Map and range Map are the same.
434 //
435 // If Y_in does not have constant stride, or if the column Map
436 // MV aliases Y_in, then we can't let the kernel write directly
437 // to Y_in. Instead, we have to use the cached row (== range)
438 // Map MV as temporary storage.
439 //
440 // FIXME (mfh 05 Jun 2014, mfh 07 Dec 2018) This test for
441 // aliasing only tests if the user passed in the same
442 // MultiVector for both X and Y. It won't detect whether one
443 // MultiVector views the other. We should also check the
444 // MultiVectors' raw data pointers.
445 if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
446 // Force creating the MV if it hasn't been created already.
447 // This will reuse a previously created cached MV.
448 Y_rowMap = getRowMapMultiVector (Y_in, true);
449
450 // If beta == 0, we don't need to copy Y_in into Y_rowMap,
451 // since we're overwriting it anyway.
452 if (beta != STS::zero ()) {
453 Tpetra::deep_copy (*Y_rowMap, Y_in);
454 }
455 nonconst_view_type Y_lcl;
456 if(Y_is_overwritten) Y_lcl = Y_rowMap->getLocalViewDevice(Access::OverwriteAll);
457 else Y_lcl = Y_rowMap->getLocalViewDevice(Access::ReadWrite);
458
459 localMultiply_.apply (X_lcl, Y_lcl, NO_TRANS, alpha, beta);
460 Tpetra::deep_copy (Y_in, *Y_rowMap);
461 }
462 else {
463 nonconst_view_type Y_lcl;
464 if(Y_is_overwritten) Y_lcl = Y_in.getLocalViewDevice(Access::OverwriteAll);
465 else Y_lcl = Y_in.getLocalViewDevice(Access::ReadWrite);
466
467 localMultiply_.apply (X_lcl, Y_lcl, NO_TRANS, alpha, beta);
468 }
469 }
470
471 // If the range Map is a locally replicated Map, sum up
472 // contributions from each process. We set beta = 0 on all
473 // processes but Proc 0 initially, so this will handle the scaling
474 // factor beta correctly.
475 if (Y_is_replicated) {
476 ProfilingRegion regionReduce ("Tpetra::CrsMatrixMultiplyOp::apply: Reduce Y");
477 Y_in.reduce ();
478 }
479 }
480
481 private:
501 Teuchos::RCP<MV>
502 getColumnMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& X_domainMap,
503 const bool force = false) const
504 {
505 using Teuchos::null;
506 using Teuchos::RCP;
507 using Teuchos::rcp;
509
510 const size_t numVecs = X_domainMap.getNumVectors ();
511 RCP<const import_type> importer = matrix_->getGraph ()->getImporter ();
512 RCP<const map_type> colMap = matrix_->getColMap ();
513
514 RCP<MV> X_colMap; // null by default
515
516 // If the Import object is trivial (null), then we don't need a
517 // separate column Map multivector. Just return null in that
518 // case. The caller is responsible for knowing not to use the
519 // returned null pointer.
520 //
521 // If the Import is nontrivial, then we do need a separate
522 // column Map multivector for the Import operation. Check in
523 // that case if we have to (re)create the column Map
524 // multivector.
525 if (! importer.is_null () || force) {
526 if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
527 X_colMap = rcp (new MV (colMap, numVecs));
528
529 // Cache the newly created multivector for later reuse.
530 importMV_ = X_colMap;
531 }
532 else { // Yay, we can reuse the cached multivector!
533 X_colMap = importMV_;
534 // mfh 09 Jan 2013: We don't have to fill with zeros first,
535 // because the Import uses INSERT combine mode, which overwrites
536 // existing entries.
537 //
538 //X_colMap->putScalar (STS::zero ());
539 }
540 }
541 return X_colMap;
542 }
543
565 Teuchos::RCP<MV>
566 getRowMapMultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>& Y_rangeMap,
567 const bool force = false) const
568 {
569 using Teuchos::null;
570 using Teuchos::RCP;
571 using Teuchos::rcp;
572 typedef Export<LocalOrdinal,GlobalOrdinal,Node> export_type;
573
574 const size_t numVecs = Y_rangeMap.getNumVectors ();
575 RCP<const export_type> exporter = matrix_->getGraph ()->getExporter ();
576 RCP<const map_type> rowMap = matrix_->getRowMap ();
577
578 RCP<MV> Y_rowMap; // null by default
579
580 // If the Export object is trivial (null), then we don't need a
581 // separate row Map multivector. Just return null in that case.
582 // The caller is responsible for knowing not to use the returned
583 // null pointer.
584 //
585 // If the Export is nontrivial, then we do need a separate row
586 // Map multivector for the Export operation. Check in that case
587 // if we have to (re)create the row Map multivector.
588 if (! exporter.is_null () || force) {
589 if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
590 Y_rowMap = rcp (new MV (rowMap, numVecs));
591 exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse
592 }
593 else { // Yay, we can reuse the cached multivector!
594 Y_rowMap = exportMV_;
595 }
596 }
597 return Y_rowMap;
598 }
599 };
600
608 template <class OpScalar,
609 class MatScalar,
610 class LocalOrdinal,
611 class GlobalOrdinal,
612 class Node>
613 Teuchos::RCP<
615 createCrsMatrixMultiplyOp (const Teuchos::RCP<
617 {
618 typedef CrsMatrixMultiplyOp<OpScalar, MatScalar, LocalOrdinal,
619 GlobalOrdinal, Node> op_type;
620 return Teuchos::rcp (new op_type (A));
621 }
622
623} // end of namespace Tpetra
624
625#endif // TPETRA_CRSMATRIXMULTIPLYOP_HPP
Forward declaration of Tpetra::CrsMatrixMultiplyOp.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Stand-alone utility functions and macros.
A class for wrapping a CrsMatrix multiply in a Operator.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMV_
Column Map MultiVector used in apply().
bool hasTransposeApply() const override
Whether this Operator's apply() method can apply the transpose or conjugate transpose.
void applyNonTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Scalar alpha, Scalar beta) const
Apply the matrix (not its transpose) to X_in, producing Y_in.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute Y = beta*Y + alpha*Op(A)*X, where Op(A) is either A, , or .
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this Operator.
Map< LocalOrdinal, GlobalOrdinal, Node > map_type
The specialization of Map which this class uses.
CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > crs_matrix_type
The specialization of CrsMatrix which this class wraps.
void applyTranspose(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X_in, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y_in, Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Apply the transpose or conjugate transpose of the matrix to X_in, producing Y_in.
CrsMatrixMultiplyOp(const Teuchos::RCP< const crs_matrix_type > &A)
Constructor.
~CrsMatrixMultiplyOp() override=default
Destructor (virtual for memory safety of derived classes).
Teuchos::RCP< CrsMatrixMultiplyOp< OpScalar, MatScalar, LocalOrdinal, GlobalOrdinal, Node > > createCrsMatrixMultiplyOp(const Teuchos::RCP< const CrsMatrix< MatScalar, LocalOrdinal, GlobalOrdinal, Node > > &A)
Non-member function to create a CrsMatrixMultiplyOp.
const Teuchos::RCP< const crs_matrix_type > matrix_
The underlying CrsMatrix object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > exportMV_
Row Map MultiVector used in apply().
LocalCrsMatrixOperator< Scalar, MatScalar, typename crs_matrix_type::device_type > localMultiply_
Implementation of local sparse matrix-vector multiply.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this Operator.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
bool isDistributed() const
Whether this is a globally distributed object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Abstract interface for local operators (e.g., matrices and preconditioners).
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reduce()
Sum values of a locally replicated multivector across all processes.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
Abstract interface for operators (e.g., matrices and preconditioners).
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes).
@ INSERT
Insert new values that don't currently exist.