Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10// clang-format off
11
12#ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
13#define TPETRA_BLOCKCRSMATRIX_DEF_HPP
14
17
21#include "Tpetra_BlockMultiVector.hpp"
22#include "Tpetra_BlockView.hpp"
23
24#include "Teuchos_TimeMonitor.hpp"
25#ifdef HAVE_TPETRA_DEBUG
26# include <set>
27#endif // HAVE_TPETRA_DEBUG
28
29#include "KokkosSparse_spmv.hpp"
30
31namespace Tpetra {
32
33namespace Impl {
34
35 template<typename T>
36 struct BlockCrsRowStruct {
37 T totalNumEntries, totalNumBytes, maxRowLength;
38 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
39 KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
40 KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
41 : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
42 };
43
44 template<typename T>
45 static
46 KOKKOS_INLINE_FUNCTION
47 void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
48 a.totalNumEntries += b.totalNumEntries;
49 a.totalNumBytes += b.totalNumBytes;
50 a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
51 }
52
53 template<typename T, typename ExecSpace>
54 struct BlockCrsReducer {
55 typedef BlockCrsReducer reducer;
56 typedef T value_type;
57 typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
58 value_type *value;
59
60 KOKKOS_INLINE_FUNCTION
61 BlockCrsReducer(value_type &val) : value(&val) {}
62
63 KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
64 KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
65 KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
66 KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
67 KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
68 };
69
70} // namespace Impl
71
72namespace { // (anonymous)
73
74// Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
75// version that takes two Kokkos::View arguments).
76template<class Scalar, class LO, class GO, class Node>
77class GetLocalDiagCopy {
78public:
79 typedef typename Node::device_type device_type;
80 typedef size_t diag_offset_type;
81 typedef Kokkos::View<const size_t*, device_type,
82 Kokkos::MemoryUnmanaged> diag_offsets_type;
83 typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
84 typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
85 typedef typename local_graph_device_type::row_map_type row_offsets_type;
86 typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
87 typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
88 typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
89
90 // Constructor
91 GetLocalDiagCopy (const diag_type& diag,
92 const values_type& val,
93 const diag_offsets_type& diagOffsets,
94 const row_offsets_type& ptr,
95 const LO blockSize) :
96 diag_ (diag),
97 diagOffsets_ (diagOffsets),
98 ptr_ (ptr),
99 blockSize_ (blockSize),
100 offsetPerBlock_ (blockSize_*blockSize_),
101 val_(val)
102 {}
103
104 KOKKOS_INLINE_FUNCTION void
105 operator() (const LO& lclRowInd) const
106 {
107 using Kokkos::ALL;
108
109 // Get row offset
110 const size_t absOffset = ptr_[lclRowInd];
111
112 // Get offset relative to start of row
113 const size_t relOffset = diagOffsets_[lclRowInd];
114
115 // Get the total offset
116 const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
117
118 // Get a view of the block. BCRS currently uses LayoutRight
119 // regardless of the device.
120 typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
121 device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
122 const_little_block_type;
123 const_little_block_type D_in (val_.data () + pointOffset,
124 blockSize_, blockSize_);
125 auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
126 COPY (D_in, D_out);
127 }
128
129 private:
130 diag_type diag_;
131 diag_offsets_type diagOffsets_;
132 row_offsets_type ptr_;
133 LO blockSize_;
134 LO offsetPerBlock_;
135 values_type val_;
136 };
137} // namespace (anonymous)
138
139 template<class Scalar, class LO, class GO, class Node>
140 std::ostream&
143 {
144 * (this->localError_) = true;
145 if ((*errs_).is_null ()) {
146 *errs_ = Teuchos::rcp (new std::ostringstream ());
147 }
148 return **errs_;
149 }
150
151 template<class Scalar, class LO, class GO, class Node>
154 dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
155 graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
156 blockSize_ (static_cast<LO> (0)),
157 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
158 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
159 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
160 offsetPerBlock_ (0),
161 localError_ (new bool (false)),
162 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
163 {
164 }
165
166 template<class Scalar, class LO, class GO, class Node>
168 BlockCrsMatrix (const crs_graph_type& graph,
169 const LO blockSize) :
170 dist_object_type (graph.getMap ()),
171 graph_ (graph),
172 rowMeshMap_ (* (graph.getRowMap ())),
173 blockSize_ (blockSize),
174 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
175 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
176 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
177 offsetPerBlock_ (blockSize * blockSize),
178 localError_ (new bool (false)),
179 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
180 {
181
183 TEUCHOS_TEST_FOR_EXCEPTION(
184 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
185 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
186 "rows (isSorted() is false). This class assumes sorted rows.");
187
188 graphRCP_ = Teuchos::rcpFromRef(graph_);
189
190 // Trick to test whether LO is nonpositive, without a compiler
191 // warning in case LO is unsigned (which is generally a bad idea
192 // anyway). I don't promise that the trick works, but it
193 // generally does with gcc at least, in my experience.
194 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
195 TEUCHOS_TEST_FOR_EXCEPTION(
196 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
197 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
198 " <= 0. The block size must be positive.");
199
200 domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
201 rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
202
203 {
204 // These are rcp
205 const auto domainPointMap = getDomainMap();
206 const auto colPointMap = Teuchos::rcp
207 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
208 *pointImporter_ = Teuchos::rcp
209 (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
210 }
211 {
212 auto local_graph_h = graph.getLocalGraphHost ();
213 auto ptr_h = local_graph_h.row_map;
214 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
215 Kokkos::deep_copy(ptrHost_, ptr_h);
216
217 auto ind_h = local_graph_h.entries;
218 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
219 // DEEP_COPY REVIEW - HOST-TO-HOST
220 Kokkos::deep_copy (indHost_, ind_h);
221
222 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
223 val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
224 }
225 }
226
227 template<class Scalar, class LO, class GO, class Node>
229 BlockCrsMatrix (const crs_graph_type& graph,
230 const typename local_matrix_device_type::values_type& values,
231 const LO blockSize) :
232 dist_object_type (graph.getMap ()),
233 graph_ (graph),
234 rowMeshMap_ (* (graph.getRowMap ())),
235 blockSize_ (blockSize),
236 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
237 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
238 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
239 offsetPerBlock_ (blockSize * blockSize),
240 localError_ (new bool (false)),
241 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
242 {
244 TEUCHOS_TEST_FOR_EXCEPTION(
245 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
246 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
247 "rows (isSorted() is false). This class assumes sorted rows.");
248
249 graphRCP_ = Teuchos::rcpFromRef(graph_);
250
251 // Trick to test whether LO is nonpositive, without a compiler
252 // warning in case LO is unsigned (which is generally a bad idea
253 // anyway). I don't promise that the trick works, but it
254 // generally does with gcc at least, in my experience.
255 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
256 TEUCHOS_TEST_FOR_EXCEPTION(
257 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
258 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
259 " <= 0. The block size must be positive.");
260
261 domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
262 rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
263
264 {
265 // These are rcp
266 const auto domainPointMap = getDomainMap();
267 const auto colPointMap = Teuchos::rcp
268 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
269 *pointImporter_ = Teuchos::rcp
270 (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
271 }
272 {
273 auto local_graph_h = graph.getLocalGraphHost ();
274 auto ptr_h = local_graph_h.row_map;
275 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
276 Kokkos::deep_copy(ptrHost_, ptr_h);
277
278 auto ind_h = local_graph_h.entries;
279 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
280 Kokkos::deep_copy (indHost_, ind_h);
281
282 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
283 TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
284 val_ = decltype (val_) (values);
285 }
286 }
287
288 template<class Scalar, class LO, class GO, class Node>
290 BlockCrsMatrix (const crs_graph_type& graph,
291 const map_type& domainPointMap,
292 const map_type& rangePointMap,
293 const LO blockSize) :
294 dist_object_type (graph.getMap ()),
295 graph_ (graph),
296 rowMeshMap_ (* (graph.getRowMap ())),
297 domainPointMap_ (domainPointMap),
298 rangePointMap_ (rangePointMap),
299 blockSize_ (blockSize),
300 X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
301 Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
302 pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
303 offsetPerBlock_ (blockSize * blockSize),
304 localError_ (new bool (false)),
305 errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
306 {
307 TEUCHOS_TEST_FOR_EXCEPTION(
308 ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
309 "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
310 "rows (isSorted() is false). This class assumes sorted rows.");
311
312 graphRCP_ = Teuchos::rcpFromRef(graph_);
313
314 // Trick to test whether LO is nonpositive, without a compiler
315 // warning in case LO is unsigned (which is generally a bad idea
316 // anyway). I don't promise that the trick works, but it
317 // generally does with gcc at least, in my experience.
318 const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
319 TEUCHOS_TEST_FOR_EXCEPTION(
320 blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
321 "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
322 " <= 0. The block size must be positive.");
323 {
324 // These are rcp
325 const auto rcpDomainPointMap = getDomainMap();
326 const auto colPointMap = Teuchos::rcp
327 (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
328 *pointImporter_ = Teuchos::rcp
329 (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
330 }
331 {
332 auto local_graph_h = graph.getLocalGraphHost ();
333 auto ptr_h = local_graph_h.row_map;
334 ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
335 // DEEP_COPY REVIEW - HOST-TO-HOST
336 Kokkos::deep_copy(ptrHost_, ptr_h);
337
338 auto ind_h = local_graph_h.entries;
339 indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
340 // DEEP_COPY REVIEW - HOST-TO-HOST
341 Kokkos::deep_copy (indHost_, ind_h);
342
343 const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
344 val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
345
346 }
347 }
348
349 template<class Scalar, class LO, class GO, class Node>
350 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
352 getDomainMap () const
353 { // Copy constructor of map_type does a shallow copy.
354 // We're only returning by RCP for backwards compatibility.
355 return Teuchos::rcp (new map_type (domainPointMap_));
356 }
357
358 template<class Scalar, class LO, class GO, class Node>
359 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
361 getRangeMap () const
362 { // Copy constructor of map_type does a shallow copy.
363 // We're only returning by RCP for backwards compatibility.
364 return Teuchos::rcp (new map_type (rangePointMap_));
365 }
366
367 template<class Scalar, class LO, class GO, class Node>
368 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
370 getRowMap () const
371 {
372 return graph_.getRowMap();
373 }
374
375 template<class Scalar, class LO, class GO, class Node>
376 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
378 getColMap () const
379 {
380 return graph_.getColMap();
381 }
382
383 template<class Scalar, class LO, class GO, class Node>
386 getGlobalNumRows() const
387 {
388 return graph_.getGlobalNumRows();
389 }
390
391 template<class Scalar, class LO, class GO, class Node>
392 size_t
394 getLocalNumRows() const
395 {
396 return graph_.getLocalNumRows();
397 }
398
399 template<class Scalar, class LO, class GO, class Node>
400 size_t
403 {
404 return graph_.getLocalMaxNumRowEntries();
405 }
406
407 template<class Scalar, class LO, class GO, class Node>
408 void
410 apply (const mv_type& X,
411 mv_type& Y,
412 Teuchos::ETransp mode,
413 Scalar alpha,
414 Scalar beta) const
415 {
416 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
417 TEUCHOS_TEST_FOR_EXCEPTION(
418 mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
419 std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
420 "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
421 "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
422
423 TEUCHOS_TEST_FOR_EXCEPTION(
425 std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
426 "X and Y must both be constant stride");
427
428 BMV X_view;
429 BMV Y_view;
430 const LO blockSize = getBlockSize ();
431 try {
432 X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
433 Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
434 }
435 catch (std::invalid_argument& e) {
436 TEUCHOS_TEST_FOR_EXCEPTION(
437 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
438 "apply: Either the input MultiVector X or the output MultiVector Y "
439 "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
440 "graph. BlockMultiVector's constructor threw the following exception: "
441 << e.what ());
442 }
443
444 try {
445 // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
446 // either that or mark fields of this class as 'mutable'. The
447 // problem is that applyBlock wants to do lazy initialization of
448 // temporary block multivectors.
449 const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
450 } catch (std::invalid_argument& e) {
451 TEUCHOS_TEST_FOR_EXCEPTION(
452 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
453 "apply: The implementation method applyBlock complained about having "
454 "an invalid argument. It reported the following: " << e.what ());
455 } catch (std::logic_error& e) {
456 TEUCHOS_TEST_FOR_EXCEPTION(
457 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
458 "apply: The implementation method applyBlock complained about a "
459 "possible bug in its implementation. It reported the following: "
460 << e.what ());
461 } catch (std::exception& e) {
462 TEUCHOS_TEST_FOR_EXCEPTION(
463 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
464 "apply: The implementation method applyBlock threw an exception which "
465 "is neither std::invalid_argument nor std::logic_error, but is a "
466 "subclass of std::exception. It reported the following: "
467 << e.what ());
468 } catch (...) {
469 TEUCHOS_TEST_FOR_EXCEPTION(
470 true, std::logic_error, "Tpetra::BlockCrsMatrix::"
471 "apply: The implementation method applyBlock threw an exception which "
472 "is not an instance of a subclass of std::exception. This probably "
473 "indicates a bug. Please report this to the Tpetra developers.");
474 }
475 }
476
477 template<class Scalar, class LO, class GO, class Node>
478 void
482 Teuchos::ETransp mode,
483 const Scalar alpha,
484 const Scalar beta)
485 {
486 TEUCHOS_TEST_FOR_EXCEPTION(
487 X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
488 "Tpetra::BlockCrsMatrix::applyBlock: "
489 "X and Y have different block sizes. X.getBlockSize() = "
490 << X.getBlockSize () << " != Y.getBlockSize() = "
491 << Y.getBlockSize () << ".");
492
493 if (mode == Teuchos::NO_TRANS) {
494 applyBlockNoTrans (X, Y, alpha, beta);
495 } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
496 applyBlockTrans (X, Y, mode, alpha, beta);
497 } else {
498 TEUCHOS_TEST_FOR_EXCEPTION(
499 true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
500 "applyBlock: Invalid 'mode' argument. Valid values are "
501 "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
502 }
503 }
504
505 template<class Scalar, class LO, class GO, class Node>
506 void
509 const Import<LO, GO, Node>& importer) const
510 {
511 using Teuchos::RCP;
512 using Teuchos::rcp;
513 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
514
515 // Right now, we make many assumptions...
516 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
517 "destMatrix is required to be null.");
518
519 // BlockCrsMatrix requires a complete graph at construction.
520 // So first step is to import and fill complete the destGraph.
521 RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
522 RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
523 srcGraph->getDomainMap(),
524 srcGraph->getRangeMap());
525
526
527 // Final step, create and import the destMatrix.
528 destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
529 destMatrix->doImport(*this, importer, Tpetra::INSERT);
530 }
531
532
533 template<class Scalar, class LO, class GO, class Node>
534 void
537 const Export<LO, GO, Node>& exporter) const
538 {
539 using Teuchos::RCP;
540 using Teuchos::rcp;
541 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
542
543 // Right now, we make many assumptions...
544 TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
545 "destMatrix is required to be null.");
546
547 // BlockCrsMatrix requires a complete graph at construction.
548 // So first step is to import and fill complete the destGraph.
549 RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
550 RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
551 srcGraph->getDomainMap(),
552 srcGraph->getRangeMap());
553
554
555 // Final step, create and import the destMatrix.
556 destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
557 destMatrix->doExport(*this, exporter, Tpetra::INSERT);
558 }
559
560
561 template<class Scalar, class LO, class GO, class Node>
562 void
564 setAllToScalar (const Scalar& alpha)
565 {
566 auto val_d = val_.getDeviceView(Access::OverwriteAll);
567 Kokkos::deep_copy(val_d, alpha);
568 }
569
570 template<class Scalar, class LO, class GO, class Node>
571 LO
573 replaceLocalValues (const LO localRowInd,
574 const LO colInds[],
575 const Scalar vals[],
576 const LO numColInds) const
577 {
578 std::vector<ptrdiff_t> offsets(numColInds);
579 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
580 const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
581 return validCount;
582 }
583
584 template <class Scalar, class LO, class GO, class Node>
585 void
587 getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
588 Kokkos::MemoryUnmanaged>& offsets) const
589 {
590 graph_.getLocalDiagOffsets (offsets);
591 }
592
593 template <class Scalar, class LO, class GO, class Node>
594 void
596 getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
597 Kokkos::MemoryUnmanaged>& diag,
598 const Kokkos::View<const size_t*, device_type,
599 Kokkos::MemoryUnmanaged>& offsets) const
600 {
601 using Kokkos::parallel_for;
602 const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
603
604 const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
605 const LO blockSize = this->getBlockSize ();
606 TEUCHOS_TEST_FOR_EXCEPTION
607 (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
608 static_cast<LO> (diag.extent (1)) < blockSize ||
609 static_cast<LO> (diag.extent (2)) < blockSize,
610 std::invalid_argument, prefix <<
611 "The input Kokkos::View is not big enough to hold all the data.");
612 TEUCHOS_TEST_FOR_EXCEPTION
613 (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
614 prefix << "offsets.size() = " << offsets.size () << " < local number of "
615 "diagonal blocks " << lclNumMeshRows << ".");
616
617 typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
618 typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
619
620 // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
621 // we reserve the right to do lazy allocation of device data. (We
622 // don't plan to do lazy allocation for host data; the host
623 // version of the data always exists.)
624 auto val_d = val_.getDeviceView(Access::ReadOnly);
625 parallel_for (policy_type (0, lclNumMeshRows),
626 functor_type (diag, val_d, offsets,
627 graph_.getLocalGraphDevice ().row_map, blockSize_));
628 }
629
630 template<class Scalar, class LO, class GO, class Node>
631 LO
633 absMaxLocalValues (const LO localRowInd,
634 const LO colInds[],
635 const Scalar vals[],
636 const LO numColInds) const
637 {
638 std::vector<ptrdiff_t> offsets(numColInds);
639 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
640 const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
641 return validCount;
642 }
643
644
645 template<class Scalar, class LO, class GO, class Node>
646 LO
648 sumIntoLocalValues (const LO localRowInd,
649 const LO colInds[],
650 const Scalar vals[],
651 const LO numColInds) const
652 {
653 std::vector<ptrdiff_t> offsets(numColInds);
654 const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
655 const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
656 return validCount;
657 }
658 template<class Scalar, class LO, class GO, class Node>
659 void
661 getLocalRowCopy (LO LocalRow,
662 nonconst_local_inds_host_view_type &Indices,
663 nonconst_values_host_view_type &Values,
664 size_t& NumEntries) const
665 {
666 auto vals = getValuesHost(LocalRow);
667 const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
668 if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
669 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
670 "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
671 << numInds << " row entries");
672 }
673 const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
674 for (LO i=0; i<numInds; ++i) {
675 Indices[i] = colInds[i];
676 }
677 for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
678 Values[i] = vals[i];
679 }
680 NumEntries = numInds;
681 }
682
683 template<class Scalar, class LO, class GO, class Node>
684 LO
686 getLocalRowOffsets (const LO localRowInd,
687 ptrdiff_t offsets[],
688 const LO colInds[],
689 const LO numColInds) const
690 {
691 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
692 // We got no offsets, because the input local row index is
693 // invalid on the calling process. That may not be an error, if
694 // numColInds is zero anyway; it doesn't matter. This is the
695 // advantage of returning the number of valid indices.
696 return static_cast<LO> (0);
697 }
698
699 const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
700 LO hint = 0; // Guess for the relative offset into the current row
701 LO validCount = 0; // number of valid column indices in colInds
702
703 for (LO k = 0; k < numColInds; ++k) {
704 const LO relBlockOffset =
705 this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
706 if (relBlockOffset != LINV) {
707 offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
708 hint = relBlockOffset + 1;
709 ++validCount;
710 }
711 }
712 return validCount;
713 }
714
715
716 template<class Scalar, class LO, class GO, class Node>
717 LO
719 replaceLocalValuesByOffsets (const LO localRowInd,
720 const ptrdiff_t offsets[],
721 const Scalar vals[],
722 const LO numOffsets) const
723 {
724 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
725 // We modified no values, because the input local row index is
726 // invalid on the calling process. That may not be an error, if
727 // numColInds is zero anyway; it doesn't matter. This is the
728 // advantage of returning the number of valid indices.
729 return static_cast<LO> (0);
730 }
731 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
732 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
733 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
734 impl_scalar_type* vOut = val_out.data();
735
736 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
737 const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
738 size_t pointOffset = 0; // Current offset into input values
739 LO validCount = 0; // number of valid offsets
740
741 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
742 const size_t blockOffset = offsets[k]*perBlockSize;
743 if (offsets[k] != STINV) {
744 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
745 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
746 COPY (A_new, A_old);
747 ++validCount;
748 }
749 }
750 return validCount;
751 }
752
753
754 template<class Scalar, class LO, class GO, class Node>
755 LO
757 absMaxLocalValuesByOffsets (const LO localRowInd,
758 const ptrdiff_t offsets[],
759 const Scalar vals[],
760 const LO numOffsets) const
761 {
762 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
763 // We modified no values, because the input local row index is
764 // invalid on the calling process. That may not be an error, if
765 // numColInds is zero anyway; it doesn't matter. This is the
766 // advantage of returning the number of valid indices.
767 return static_cast<LO> (0);
768 }
769 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
770 auto val_out = getValuesHost(localRowInd);
771 impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
772
773 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
774 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
775 size_t pointOffset = 0; // Current offset into input values
776 LO validCount = 0; // number of valid offsets
777
778 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
779 const size_t blockOffset = offsets[k]*perBlockSize;
780 if (blockOffset != STINV) {
781 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
782 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
783 ::Tpetra::Impl::absMax (A_old, A_new);
784 ++validCount;
785 }
786 }
787 return validCount;
788 }
789
790
791 template<class Scalar, class LO, class GO, class Node>
792 LO
794 sumIntoLocalValuesByOffsets (const LO localRowInd,
795 const ptrdiff_t offsets[],
796 const Scalar vals[],
797 const LO numOffsets) const
798 {
799 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
800 // We modified no values, because the input local row index is
801 // invalid on the calling process. That may not be an error, if
802 // numColInds is zero anyway; it doesn't matter. This is the
803 // advantage of returning the number of valid indices.
804 return static_cast<LO> (0);
805 }
806 const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
807 const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
808 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
809 auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
810 impl_scalar_type* vOut = val_out.data();
811
812 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
813 const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
814 size_t pointOffset = 0; // Current offset into input values
815 LO validCount = 0; // number of valid offsets
816
817 for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
818 const size_t blockOffset = offsets[k]*perBlockSize;
819 if (blockOffset != STINV) {
820 little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
821 const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
822 AXPY (ONE, A_new, A_old);
823 ++validCount;
824 }
825 }
826 return validCount;
827 }
828
829 template<class Scalar, class LO, class GO, class Node>
831 impl_scalar_type_dualview::t_host::const_type
833 getValuesHost () const
834 {
835 return val_.getHostView(Access::ReadOnly);
836 }
837
838 template<class Scalar, class LO, class GO, class Node>
839 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
840 impl_scalar_type_dualview::t_dev::const_type
841 BlockCrsMatrix<Scalar, LO, GO, Node>::
842 getValuesDevice () const
843 {
844 return val_.getDeviceView(Access::ReadOnly);
845 }
846
847 template<class Scalar, class LO, class GO, class Node>
849 impl_scalar_type_dualview::t_host
852 {
853 return val_.getHostView(Access::ReadWrite);
854 }
855
856 template<class Scalar, class LO, class GO, class Node>
858 impl_scalar_type_dualview::t_dev
861 {
862 return val_.getDeviceView(Access::ReadWrite);
863 }
864
865 template<class Scalar, class LO, class GO, class Node>
866 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
867 impl_scalar_type_dualview::t_host::const_type
869 getValuesHost (const LO& lclRow) const
870 {
871 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
872 auto val = val_.getHostView(Access::ReadOnly);
873 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
874 return r_val;
875 }
876
877 template<class Scalar, class LO, class GO, class Node>
879 impl_scalar_type_dualview::t_dev::const_type
881 getValuesDevice (const LO& lclRow) const
882 {
883 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
884 auto val = val_.getDeviceView(Access::ReadOnly);
885 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
886 return r_val;
887 }
888
889 template<class Scalar, class LO, class GO, class Node>
892 getValuesHostNonConst (const LO& lclRow)
893 {
894 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
895 auto val = val_.getHostView(Access::ReadWrite);
896 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
897 return r_val;
898 }
899
900 template<class Scalar, class LO, class GO, class Node>
903 getValuesDeviceNonConst (const LO& lclRow)
904 {
905 const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
906 auto val = val_.getDeviceView(Access::ReadWrite);
907 auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
908 return r_val;
909 }
910
911 template<class Scalar, class LO, class GO, class Node>
912 size_t
914 getNumEntriesInLocalRow (const LO localRowInd) const
915 {
916 const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
917 if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
918 return static_cast<LO> (0); // the calling process doesn't have that row
919 } else {
920 return static_cast<LO> (numEntInGraph);
921 }
922 }
923
924 template<class Scalar, class LO, class GO, class Node>
925 typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
928 {
929 auto numCols = this->graph_.getColMap()->getLocalNumElements();
930 auto val = val_.getDeviceView(Access::ReadWrite);
931 const LO blockSize = this->getBlockSize ();
932 const auto graph = this->graph_.getLocalGraphDevice ();
933
934 return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
935 numCols,
936 val,
937 graph,
938 blockSize);
939 }
940
941 template<class Scalar, class LO, class GO, class Node>
942 void
946 const Teuchos::ETransp mode,
947 const Scalar alpha,
948 const Scalar beta)
949 {
950 (void) X;
951 (void) Y;
952 (void) mode;
953 (void) alpha;
954 (void) beta;
955
956 TEUCHOS_TEST_FOR_EXCEPTION(
957 true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
958 "transpose and conjugate transpose modes are not yet implemented.");
959 }
960
961 template<class Scalar, class LO, class GO, class Node>
962 void
963 BlockCrsMatrix<Scalar, LO, GO, Node>::
964 applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
965 BlockMultiVector<Scalar, LO, GO, Node>& Y,
966 const Scalar alpha,
967 const Scalar beta)
968 {
969 using Teuchos::RCP;
970 using Teuchos::rcp;
971 typedef ::Tpetra::Import<LO, GO, Node> import_type;
972 typedef ::Tpetra::Export<LO, GO, Node> export_type;
973 const Scalar zero = STS::zero ();
974 const Scalar one = STS::one ();
975 RCP<const import_type> import = graph_.getImporter ();
976 // "export" is a reserved C++ keyword, so we can't use it.
977 RCP<const export_type> theExport = graph_.getExporter ();
978 const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
979
980 if (alpha == zero) {
981 if (beta == zero) {
982 Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
983 }
984 else if (beta != one) {
985 Y.scale (beta);
986 }
987 }
988 else { // alpha != 0
989 const BMV* X_colMap = NULL;
990 if (import.is_null ()) {
991 try {
992 X_colMap = &X;
993 }
994 catch (std::exception& e) {
995 TEUCHOS_TEST_FOR_EXCEPTION
996 (true, std::logic_error, prefix << "Tpetra::MultiVector::"
997 "operator= threw an exception: " << std::endl << e.what ());
998 }
999 }
1000 else {
1001 // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1002 // Y_rowMap_ below. This lets us do lazy initialization
1003 // correctly with view semantics of BlockCrsMatrix. All views
1004 // of this BlockCrsMatrix have the same outer pointer. That
1005 // way, we can set the inner pointer in one view, and all
1006 // other views will see it.
1007 if ((*X_colMap_).is_null () ||
1008 (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1009 (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1010 *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1011 static_cast<LO> (X.getNumVectors ())));
1012 }
1013 (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1014 **pointImporter_,
1016 try {
1017 X_colMap = &(**X_colMap_);
1018 }
1019 catch (std::exception& e) {
1020 TEUCHOS_TEST_FOR_EXCEPTION
1021 (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1022 "operator= threw an exception: " << std::endl << e.what ());
1023 }
1024 }
1025
1026 BMV* Y_rowMap = NULL;
1027 if (theExport.is_null ()) {
1028 Y_rowMap = &Y;
1029 }
1030 else if ((*Y_rowMap_).is_null () ||
1031 (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1032 (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1033 *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1034 static_cast<LO> (X.getNumVectors ())));
1035 try {
1036 Y_rowMap = &(**Y_rowMap_);
1037 }
1038 catch (std::exception& e) {
1039 TEUCHOS_TEST_FOR_EXCEPTION(
1040 true, std::logic_error, prefix << "Tpetra::MultiVector::"
1041 "operator= threw an exception: " << std::endl << e.what ());
1042 }
1043 }
1044
1045 try {
1046 localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1047 }
1048 catch (std::exception& e) {
1049 TEUCHOS_TEST_FOR_EXCEPTION
1050 (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1051 "an exception: " << e.what ());
1052 }
1053 catch (...) {
1054 TEUCHOS_TEST_FOR_EXCEPTION
1055 (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1056 "an exception not a subclass of std::exception.");
1057 }
1058
1059 if (! theExport.is_null ()) {
1060 Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1061 }
1062 }
1063 }
1064
1065// clang-format on
1066template <class Scalar, class LO, class GO, class Node>
1067void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1069 BlockMultiVector<Scalar, LO, GO, Node> &Y, const Scalar alpha,
1070 const Scalar beta) {
1071 ::Tpetra::Details::ProfilingRegion profile_region(
1072 "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1073 const impl_scalar_type alpha_impl = alpha;
1074 const auto graph = this->graph_.getLocalGraphDevice();
1075
1076 mv_type X_mv = X.getMultiVectorView();
1077 mv_type Y_mv = Y.getMultiVectorView();
1078 auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1079 auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1080
1081#if KOKKOSKERNELS_VERSION >= 40299
1082 auto A_lcl = getLocalMatrixDevice();
1083 if(!applyHelper.get()) {
1084 // The apply helper does not exist, so create it
1085 applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1086 }
1087 if(applyHelper->shouldUseIntRowptrs())
1088 {
1089 auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1090 KokkosSparse::spmv(
1091 &applyHelper->handle_int, KokkosSparse::NoTranspose,
1092 alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1093 }
1094 else
1095 {
1096 KokkosSparse::spmv(
1097 &applyHelper->handle, KokkosSparse::NoTranspose,
1098 alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1099 }
1100#else
1101 auto A_lcl = getLocalMatrixDevice();
1102 KokkosSparse::spmv(KokkosSparse::NoTranspose, alpha_impl, A_lcl, X_lcl, beta,
1103 Y_lcl);
1104#endif
1105}
1106// clang-format off
1107
1108 template<class Scalar, class LO, class GO, class Node>
1109 LO
1111 findRelOffsetOfColumnIndex (const LO localRowIndex,
1112 const LO colIndexToFind,
1113 const LO hint) const
1114 {
1115 const size_t absStartOffset = ptrHost_[localRowIndex];
1116 const size_t absEndOffset = ptrHost_[localRowIndex+1];
1117 const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1118 // Amortize pointer arithmetic over the search loop.
1119 const LO* const curInd = indHost_.data () + absStartOffset;
1120
1121 // If the hint was correct, then the hint is the offset to return.
1122 if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1123 // Always return the offset relative to the current row.
1124 return hint;
1125 }
1126
1127 // The hint was wrong, so we must search for the given column
1128 // index in the column indices for the given row.
1129 LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1130
1131 // We require that the graph have sorted rows. However, binary
1132 // search only pays if the current row is longer than a certain
1133 // amount. We set this to 32, but you might want to tune this.
1134 const LO maxNumEntriesForLinearSearch = 32;
1135 if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1136 // Use binary search. It would probably be better for us to
1137 // roll this loop by hand. If we wrote it right, a smart
1138 // compiler could perhaps use conditional loads and avoid
1139 // branches (according to Jed Brown on May 2014).
1140 const LO* beg = curInd;
1141 const LO* end = curInd + numEntriesInRow;
1142 std::pair<const LO*, const LO*> p =
1143 std::equal_range (beg, end, colIndexToFind);
1144 if (p.first != p.second) {
1145 // offset is relative to the current row
1146 relOffset = static_cast<LO> (p.first - beg);
1147 }
1148 }
1149 else { // use linear search
1150 for (LO k = 0; k < numEntriesInRow; ++k) {
1151 if (colIndexToFind == curInd[k]) {
1152 relOffset = k; // offset is relative to the current row
1153 break;
1154 }
1155 }
1156 }
1157
1158 return relOffset;
1159 }
1160
1161 template<class Scalar, class LO, class GO, class Node>
1162 LO
1164 offsetPerBlock () const
1165 {
1166 return offsetPerBlock_;
1167 }
1168
1169 template<class Scalar, class LO, class GO, class Node>
1173 const size_t pointOffset) const
1174 {
1175 // Row major blocks
1176 const LO rowStride = blockSize_;
1177 return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1178 }
1179
1180 template<class Scalar, class LO, class GO, class Node>
1184 const size_t pointOffset) const
1185 {
1186 // Row major blocks
1187 const LO rowStride = blockSize_;
1188 return little_block_type (val + pointOffset, blockSize_, rowStride);
1189 }
1190
1191 template<class Scalar, class LO, class GO, class Node>
1192 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1195 const size_t pointOffset) const
1196 {
1197 // Row major blocks
1198 const LO rowStride = blockSize_;
1199 const size_t bs2 = blockSize_ * blockSize_;
1200 return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1201 }
1202
1203 template<class Scalar, class LO, class GO, class Node>
1206 getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1207 {
1208 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1209
1210 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1211 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1212 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1213 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1214 auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1215 auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1216 return r_val;
1217 }
1218 else {
1219 return little_block_type ();
1220 }
1221 }
1222
1223 template<class Scalar, class LO, class GO, class Node>
1224 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1226 getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1227 {
1228 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1229
1230 const size_t absRowBlockOffset = ptrHost_[localRowInd];
1231 const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1232 if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1233 const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1234 auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1235 auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1236 return r_val;
1237 }
1238 else {
1239 return little_block_host_type ();
1240 }
1241 }
1242
1243
1244 template<class Scalar, class LO, class GO, class Node>
1245 bool
1247 checkSizes (const ::Tpetra::SrcDistObject& source)
1248 {
1249 using std::endl;
1250 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1251 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1252
1253 if (src == NULL) {
1254 std::ostream& err = markLocalErrorAndGetStream ();
1255 err << "checkSizes: The source object of the Import or Export "
1256 "must be a BlockCrsMatrix with the same template parameters as the "
1257 "target object." << endl;
1258 }
1259 else {
1260 // Use a string of ifs, not if-elseifs, because we want to know
1261 // all the errors.
1262 if (src->getBlockSize () != this->getBlockSize ()) {
1263 std::ostream& err = markLocalErrorAndGetStream ();
1264 err << "checkSizes: The source and target objects of the Import or "
1265 << "Export must have the same block sizes. The source's block "
1266 << "size = " << src->getBlockSize () << " != the target's block "
1267 << "size = " << this->getBlockSize () << "." << endl;
1268 }
1269 if (! src->graph_.isFillComplete ()) {
1270 std::ostream& err = markLocalErrorAndGetStream ();
1271 err << "checkSizes: The source object of the Import or Export is "
1272 "not fill complete. Both source and target objects must be fill "
1273 "complete." << endl;
1274 }
1275 if (! this->graph_.isFillComplete ()) {
1276 std::ostream& err = markLocalErrorAndGetStream ();
1277 err << "checkSizes: The target object of the Import or Export is "
1278 "not fill complete. Both source and target objects must be fill "
1279 "complete." << endl;
1280 }
1281 if (src->graph_.getColMap ().is_null ()) {
1282 std::ostream& err = markLocalErrorAndGetStream ();
1283 err << "checkSizes: The source object of the Import or Export does "
1284 "not have a column Map. Both source and target objects must have "
1285 "column Maps." << endl;
1286 }
1287 if (this->graph_.getColMap ().is_null ()) {
1288 std::ostream& err = markLocalErrorAndGetStream ();
1289 err << "checkSizes: The target object of the Import or Export does "
1290 "not have a column Map. Both source and target objects must have "
1291 "column Maps." << endl;
1292 }
1293 }
1294
1295 return ! (* (this->localError_));
1296 }
1297
1298 template<class Scalar, class LO, class GO, class Node>
1299 void
1302 (const ::Tpetra::SrcDistObject& source,
1303 const size_t numSameIDs,
1304 const Kokkos::DualView<const local_ordinal_type*,
1305 buffer_device_type>& permuteToLIDs,
1306 const Kokkos::DualView<const local_ordinal_type*,
1307 buffer_device_type>& permuteFromLIDs,
1308 const CombineMode /*CM*/)
1309 {
1313 using std::endl;
1314 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1315
1316 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1317 const bool debug = Behavior::debug();
1318 const bool verbose = Behavior::verbose();
1319
1320 // Define this function prefix
1321 std::string prefix;
1322 {
1323 std::ostringstream os;
1324 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1325 os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1326 prefix = os.str();
1327 }
1328
1329 // check if this already includes a local error
1330 if (* (this->localError_)) {
1331 std::ostream& err = this->markLocalErrorAndGetStream ();
1332 err << prefix
1333 << "The target object of the Import or Export is already in an error state."
1334 << endl;
1335 return;
1336 }
1337
1338 //
1339 // Verbose input dual view status
1340 //
1341 if (verbose) {
1342 std::ostringstream os;
1343 os << prefix << endl
1344 << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1345 << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1346 std::cerr << os.str ();
1347 }
1348
1352 if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1353 std::ostream& err = this->markLocalErrorAndGetStream ();
1354 err << prefix
1355 << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1356 << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1357 << "." << endl;
1358 return;
1359 }
1360 if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1361 std::ostream& err = this->markLocalErrorAndGetStream ();
1362 err << prefix
1363 << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1364 << endl;
1365 return;
1366 }
1367
1368 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1369 if (src == NULL) {
1370 std::ostream& err = this->markLocalErrorAndGetStream ();
1371 err << prefix
1372 << "The source (input) object of the Import or "
1373 "Export is either not a BlockCrsMatrix, or does not have the right "
1374 "template parameters. checkSizes() should have caught this. "
1375 "Please report this bug to the Tpetra developers." << endl;
1376 return;
1377 }
1378
1379 bool lclErr = false;
1380#ifdef HAVE_TPETRA_DEBUG
1381 std::set<LO> invalidSrcCopyRows;
1382 std::set<LO> invalidDstCopyRows;
1383 std::set<LO> invalidDstCopyCols;
1384 std::set<LO> invalidDstPermuteCols;
1385 std::set<LO> invalidPermuteFromRows;
1386#endif // HAVE_TPETRA_DEBUG
1387
1388 // Copy the initial sequence of rows that are the same.
1389 //
1390 // The two graphs might have different column Maps, so we need to
1391 // do this using global column indices. This is purely local, so
1392 // we only need to check for local sameness of the two column
1393 // Maps.
1394
1395#ifdef HAVE_TPETRA_DEBUG
1396 const map_type& srcRowMap = * (src->graph_.getRowMap ());
1397#endif // HAVE_TPETRA_DEBUG
1398 const map_type& dstRowMap = * (this->graph_.getRowMap ());
1399 const map_type& srcColMap = * (src->graph_.getColMap ());
1400 const map_type& dstColMap = * (this->graph_.getColMap ());
1401 const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1402
1403 const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1404 if (verbose) {
1405 std::ostringstream os;
1406 os << prefix
1407 << "canUseLocalColumnIndices: "
1408 << (canUseLocalColumnIndices ? "true" : "false")
1409 << ", numPermute: " << numPermute
1410 << endl;
1411 std::cerr << os.str ();
1412 }
1413
1414 const auto permuteToLIDsHost = permuteToLIDs.view_host();
1415 const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1416
1417 if (canUseLocalColumnIndices) {
1418 // Copy local rows that are the "same" in both source and target.
1419 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1420#ifdef HAVE_TPETRA_DEBUG
1421 if (! srcRowMap.isNodeLocalElement (localRow)) {
1422 lclErr = true;
1423 invalidSrcCopyRows.insert (localRow);
1424 continue; // skip invalid rows
1425 }
1426#endif // HAVE_TPETRA_DEBUG
1427
1428 local_inds_host_view_type lclSrcCols;
1429 values_host_view_type vals;
1430 LO numEntries;
1431 // If this call fails, that means the mesh row local index is
1432 // invalid. That means the Import or Export is invalid somehow.
1433 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1434 if (numEntries > 0) {
1435 LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1436 if (err != numEntries) {
1437 lclErr = true;
1438 if (! dstRowMap.isNodeLocalElement (localRow)) {
1439#ifdef HAVE_TPETRA_DEBUG
1440 invalidDstCopyRows.insert (localRow);
1441#endif // HAVE_TPETRA_DEBUG
1442 }
1443 else {
1444 // Once there's an error, there's no sense in saving
1445 // time, so we check whether the column indices were
1446 // invalid. However, only remember which ones were
1447 // invalid in a debug build, because that might take a
1448 // lot of space.
1449 for (LO k = 0; k < numEntries; ++k) {
1450 if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1451 lclErr = true;
1452#ifdef HAVE_TPETRA_DEBUG
1453 (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1454#endif // HAVE_TPETRA_DEBUG
1455 }
1456 }
1457 }
1458 }
1459 }
1460 } // for each "same" local row
1461
1462 // Copy the "permute" local rows.
1463 for (size_t k = 0; k < numPermute; ++k) {
1464 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1465 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1466
1467 local_inds_host_view_type lclSrcCols;
1468 values_host_view_type vals;
1469 LO numEntries;
1470 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1471 if (numEntries > 0) {
1472 LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1473 if (err != numEntries) {
1474 lclErr = true;
1475#ifdef HAVE_TPETRA_DEBUG
1476 for (LO c = 0; c < numEntries; ++c) {
1477 if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1478 invalidDstPermuteCols.insert (lclSrcCols[c]);
1479 }
1480 }
1481#endif // HAVE_TPETRA_DEBUG
1482 }
1483 }
1484 }
1485 }
1486 else { // must convert column indices to global
1487 // Reserve space to store the destination matrix's local column indices.
1488 const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1489 Teuchos::Array<LO> lclDstCols (maxNumEnt);
1490
1491 // Copy local rows that are the "same" in both source and target.
1492 for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1493 local_inds_host_view_type lclSrcCols;
1494 values_host_view_type vals;
1495 LO numEntries;
1496
1497 // If this call fails, that means the mesh row local index is
1498 // invalid. That means the Import or Export is invalid somehow.
1499 try {
1500 src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1501 } catch (std::exception& e) {
1502 if (debug) {
1503 std::ostringstream os;
1504 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1505 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1506 << localRow << ", src->getLocalRowView() threw an exception: "
1507 << e.what ();
1508 std::cerr << os.str ();
1509 }
1510 throw e;
1511 }
1512
1513 if (numEntries > 0) {
1514 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1515 lclErr = true;
1516 if (debug) {
1517 std::ostringstream os;
1518 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1519 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1520 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1521 << maxNumEnt << endl;
1522 std::cerr << os.str ();
1523 }
1524 }
1525 else {
1526 // Convert the source matrix's local column indices to the
1527 // destination matrix's local column indices.
1528 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1529 for (LO j = 0; j < numEntries; ++j) {
1530 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1531 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1532 lclErr = true;
1533#ifdef HAVE_TPETRA_DEBUG
1534 invalidDstCopyCols.insert (lclDstColsView[j]);
1535#endif // HAVE_TPETRA_DEBUG
1536 }
1537 }
1538 LO err(0);
1539 try {
1540 err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1541 } catch (std::exception& e) {
1542 if (debug) {
1543 std::ostringstream os;
1544 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1545 os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1546 << localRow << ", this->replaceLocalValues() threw an exception: "
1547 << e.what ();
1548 std::cerr << os.str ();
1549 }
1550 throw e;
1551 }
1552 if (err != numEntries) {
1553 lclErr = true;
1554 if (debug) {
1555 std::ostringstream os;
1556 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1557 os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1558 "localRow " << localRow << ", this->replaceLocalValues "
1559 "returned " << err << " instead of numEntries = "
1560 << numEntries << endl;
1561 std::cerr << os.str ();
1562 }
1563 }
1564 }
1565 }
1566 }
1567
1568 // Copy the "permute" local rows.
1569 for (size_t k = 0; k < numPermute; ++k) {
1570 const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1571 const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1572
1573 local_inds_host_view_type lclSrcCols;
1574 values_host_view_type vals;
1575 LO numEntries;
1576
1577 try {
1578 src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1579 } catch (std::exception& e) {
1580 if (debug) {
1581 std::ostringstream os;
1582 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1583 os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1584 "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1585 << ", src->getLocalRowView() threw an exception: " << e.what ();
1586 std::cerr << os.str ();
1587 }
1588 throw e;
1589 }
1590
1591 if (numEntries > 0) {
1592 if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1593 lclErr = true;
1594 }
1595 else {
1596 // Convert the source matrix's local column indices to the
1597 // destination matrix's local column indices.
1598 Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1599 for (LO j = 0; j < numEntries; ++j) {
1600 lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1601 if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1602 lclErr = true;
1603#ifdef HAVE_TPETRA_DEBUG
1604 invalidDstPermuteCols.insert (lclDstColsView[j]);
1605#endif // HAVE_TPETRA_DEBUG
1606 }
1607 }
1608 LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1609 if (err != numEntries) {
1610 lclErr = true;
1611 }
1612 }
1613 }
1614 }
1615 }
1616
1617 if (lclErr) {
1618 std::ostream& err = this->markLocalErrorAndGetStream ();
1619#ifdef HAVE_TPETRA_DEBUG
1620 err << "copyAndPermute: The graph structure of the source of the "
1621 "Import or Export must be a subset of the graph structure of the "
1622 "target. ";
1623 err << "invalidSrcCopyRows = [";
1624 for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1625 it != invalidSrcCopyRows.end (); ++it) {
1626 err << *it;
1627 typename std::set<LO>::const_iterator itp1 = it;
1628 itp1++;
1629 if (itp1 != invalidSrcCopyRows.end ()) {
1630 err << ",";
1631 }
1632 }
1633 err << "], invalidDstCopyRows = [";
1634 for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1635 it != invalidDstCopyRows.end (); ++it) {
1636 err << *it;
1637 typename std::set<LO>::const_iterator itp1 = it;
1638 itp1++;
1639 if (itp1 != invalidDstCopyRows.end ()) {
1640 err << ",";
1641 }
1642 }
1643 err << "], invalidDstCopyCols = [";
1644 for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1645 it != invalidDstCopyCols.end (); ++it) {
1646 err << *it;
1647 typename std::set<LO>::const_iterator itp1 = it;
1648 itp1++;
1649 if (itp1 != invalidDstCopyCols.end ()) {
1650 err << ",";
1651 }
1652 }
1653 err << "], invalidDstPermuteCols = [";
1654 for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1655 it != invalidDstPermuteCols.end (); ++it) {
1656 err << *it;
1657 typename std::set<LO>::const_iterator itp1 = it;
1658 itp1++;
1659 if (itp1 != invalidDstPermuteCols.end ()) {
1660 err << ",";
1661 }
1662 }
1663 err << "], invalidPermuteFromRows = [";
1664 for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1665 it != invalidPermuteFromRows.end (); ++it) {
1666 err << *it;
1667 typename std::set<LO>::const_iterator itp1 = it;
1668 itp1++;
1669 if (itp1 != invalidPermuteFromRows.end ()) {
1670 err << ",";
1671 }
1672 }
1673 err << "]" << endl;
1674#else
1675 err << "copyAndPermute: The graph structure of the source of the "
1676 "Import or Export must be a subset of the graph structure of the "
1677 "target." << endl;
1678#endif // HAVE_TPETRA_DEBUG
1679 }
1680
1681 if (debug) {
1682 std::ostringstream os;
1683 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1684 const bool lclSuccess = ! (* (this->localError_));
1685 os << "*** Proc " << myRank << ": copyAndPermute "
1686 << (lclSuccess ? "succeeded" : "FAILED");
1687 if (lclSuccess) {
1688 os << endl;
1689 } else {
1690 os << ": error messages: " << this->errorMessages (); // comes w/ endl
1691 }
1692 std::cerr << os.str ();
1693 }
1694 }
1695
1696 namespace { // (anonymous)
1697
1716 template<class LO, class GO>
1717 size_t
1718 packRowCount (const size_t numEnt,
1719 const size_t numBytesPerValue,
1720 const size_t blkSize)
1721 {
1723
1724 if (numEnt == 0) {
1725 // Empty rows always take zero bytes, to ensure sparsity.
1726 return 0;
1727 }
1728 else {
1729 // We store the number of entries as a local index (LO).
1730 LO numEntLO = 0; // packValueCount wants this.
1731 GO gid {};
1732 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1733 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1734 const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1735 return numEntLen + gidsLen + valsLen;
1736 }
1737 }
1738
1749 template<class ST, class LO, class GO>
1750 size_t
1751 unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1752 const size_t offset,
1753 const size_t numBytes,
1754 const size_t /* numBytesPerValue */)
1755 {
1756 using Kokkos::subview;
1757 using ::Tpetra::Details::PackTraits;
1758
1759 if (numBytes == 0) {
1760 // Empty rows always take zero bytes, to ensure sparsity.
1761 return static_cast<size_t> (0);
1762 }
1763 else {
1764 LO numEntLO = 0;
1765 const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
1766 TEUCHOS_TEST_FOR_EXCEPTION
1767 (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1768 "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1769 << ".");
1770 const char* const inBuf = imports.data () + offset;
1771 const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
1772 TEUCHOS_TEST_FOR_EXCEPTION
1773 (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1774 "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1775 << ".");
1776 return static_cast<size_t> (numEntLO);
1777 }
1778 }
1779
1783 template<class ST, class LO, class GO>
1784 size_t
1785 packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1786 const size_t offset,
1787 const size_t numEnt,
1788 const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1789 const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1790 const size_t numBytesPerValue,
1791 const size_t blockSize)
1792 {
1793 using Kokkos::subview;
1794 using ::Tpetra::Details::PackTraits;
1795
1796 if (numEnt == 0) {
1797 // Empty rows always take zero bytes, to ensure sparsity.
1798 return 0;
1799 }
1800 const size_t numScalarEnt = numEnt * blockSize * blockSize;
1801
1802 const GO gid = 0; // packValueCount wants this
1803 const LO numEntLO = static_cast<size_t> (numEnt);
1804
1805 const size_t numEntBeg = offset;
1806 const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1807 const size_t gidsBeg = numEntBeg + numEntLen;
1808 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1809 const size_t valsBeg = gidsBeg + gidsLen;
1810 const size_t valsLen = numScalarEnt * numBytesPerValue;
1811
1812 char* const numEntOut = exports.data () + numEntBeg;
1813 char* const gidsOut = exports.data () + gidsBeg;
1814 char* const valsOut = exports.data () + valsBeg;
1815
1816 size_t numBytesOut = 0;
1817 int errorCode = 0;
1818 numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
1819
1820 {
1821 Kokkos::pair<int, size_t> p;
1822 p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
1823 errorCode += p.first;
1824 numBytesOut += p.second;
1825
1826 p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
1827 errorCode += p.first;
1828 numBytesOut += p.second;
1829 }
1830
1831 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1832 TEUCHOS_TEST_FOR_EXCEPTION
1833 (numBytesOut != expectedNumBytes, std::logic_error,
1834 "packRowForBlockCrs: numBytesOut = " << numBytesOut
1835 << " != expectedNumBytes = " << expectedNumBytes << ".");
1836
1837 TEUCHOS_TEST_FOR_EXCEPTION
1838 (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
1839 "PackTraits::packArray returned a nonzero error code");
1840
1841 return numBytesOut;
1842 }
1843
1844 // Return the number of bytes actually read / used.
1845 template<class ST, class LO, class GO>
1846 size_t
1847 unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1848 const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1849 const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1850 const size_t offset,
1851 const size_t numBytes,
1852 const size_t numEnt,
1853 const size_t numBytesPerValue,
1854 const size_t blockSize)
1855 {
1856 using ::Tpetra::Details::PackTraits;
1857
1858 if (numBytes == 0) {
1859 // Rows with zero bytes always have zero entries.
1860 return 0;
1861 }
1862 const size_t numScalarEnt = numEnt * blockSize * blockSize;
1863 TEUCHOS_TEST_FOR_EXCEPTION
1864 (static_cast<size_t> (imports.extent (0)) <= offset,
1865 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1866 << imports.extent (0) << " <= offset = " << offset << ".");
1867 TEUCHOS_TEST_FOR_EXCEPTION
1868 (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
1869 std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1870 << imports.extent (0) << " < offset + numBytes = "
1871 << (offset + numBytes) << ".");
1872
1873 const GO gid = 0; // packValueCount wants this
1874 const LO lid = 0; // packValueCount wants this
1875
1876 const size_t numEntBeg = offset;
1877 const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
1878 const size_t gidsBeg = numEntBeg + numEntLen;
1879 const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1880 const size_t valsBeg = gidsBeg + gidsLen;
1881 const size_t valsLen = numScalarEnt * numBytesPerValue;
1882
1883 const char* const numEntIn = imports.data () + numEntBeg;
1884 const char* const gidsIn = imports.data () + gidsBeg;
1885 const char* const valsIn = imports.data () + valsBeg;
1886
1887 size_t numBytesOut = 0;
1888 int errorCode = 0;
1889 LO numEntOut;
1890 numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
1891 TEUCHOS_TEST_FOR_EXCEPTION
1892 (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
1893 "unpackRowForBlockCrs: Expected number of entries " << numEnt
1894 << " != actual number of entries " << numEntOut << ".");
1895
1896 {
1897 Kokkos::pair<int, size_t> p;
1898 p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
1899 errorCode += p.first;
1900 numBytesOut += p.second;
1901
1902 p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
1903 errorCode += p.first;
1904 numBytesOut += p.second;
1905 }
1906
1907 TEUCHOS_TEST_FOR_EXCEPTION
1908 (numBytesOut != numBytes, std::logic_error,
1909 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1910 << " != numBytes = " << numBytes << ".");
1911
1912 const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1913 TEUCHOS_TEST_FOR_EXCEPTION
1914 (numBytesOut != expectedNumBytes, std::logic_error,
1915 "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1916 << " != expectedNumBytes = " << expectedNumBytes << ".");
1917
1918 TEUCHOS_TEST_FOR_EXCEPTION
1919 (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
1920 "PackTraits::unpackArray returned a nonzero error code");
1921
1922 return numBytesOut;
1923 }
1924 } // namespace (anonymous)
1925
1926 template<class Scalar, class LO, class GO, class Node>
1927 void
1930 (const ::Tpetra::SrcDistObject& source,
1931 const Kokkos::DualView<const local_ordinal_type*,
1932 buffer_device_type>& exportLIDs,
1933 Kokkos::DualView<packet_type*,
1934 buffer_device_type>& exports, // output
1935 Kokkos::DualView<size_t*,
1936 buffer_device_type> numPacketsPerLID, // output
1937 size_t& constantNumPackets)
1938 {
1943
1944 typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
1945
1946 typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1947
1948 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1949
1950 const bool debug = Behavior::debug();
1951 const bool verbose = Behavior::verbose();
1952
1953 // Define this function prefix
1954 std::string prefix;
1955 {
1956 std::ostringstream os;
1957 const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1958 os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1959 prefix = os.str();
1960 }
1961
1962 // check if this already includes a local error
1963 if (* (this->localError_)) {
1964 std::ostream& err = this->markLocalErrorAndGetStream ();
1965 err << prefix
1966 << "The target object of the Import or Export is already in an error state."
1967 << std::endl;
1968 return;
1969 }
1970
1971 //
1972 // Verbose input dual view status
1973 //
1974 if (verbose) {
1975 std::ostringstream os;
1976 os << prefix << std::endl
1977 << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
1978 << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
1979 << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
1980 std::cerr << os.str ();
1981 }
1982
1986 if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
1987 std::ostream& err = this->markLocalErrorAndGetStream ();
1988 err << prefix
1989 << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
1990 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
1991 << "." << std::endl;
1992 return;
1993 }
1994 if (exportLIDs.need_sync_host ()) {
1995 std::ostream& err = this->markLocalErrorAndGetStream ();
1996 err << prefix << "exportLIDs be sync'd to host." << std::endl;
1997 return;
1998 }
1999
2000 const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2001 if (src == NULL) {
2002 std::ostream& err = this->markLocalErrorAndGetStream ();
2003 err << prefix
2004 << "The source (input) object of the Import or "
2005 "Export is either not a BlockCrsMatrix, or does not have the right "
2006 "template parameters. checkSizes() should have caught this. "
2007 "Please report this bug to the Tpetra developers." << std::endl;
2008 return;
2009 }
2010
2011 // Graphs and matrices are allowed to have a variable number of
2012 // entries per row. We could test whether all rows have the same
2013 // number of entries, but DistObject can only use this
2014 // optimization if all rows on _all_ processes have the same
2015 // number of entries. Rather than do the all-reduce necessary to
2016 // test for this unlikely case, we tell DistObject (by setting
2017 // constantNumPackets to zero) to assume that different rows may
2018 // have different numbers of entries.
2019 constantNumPackets = 0;
2020
2021 // const values
2022 const crs_graph_type& srcGraph = src->graph_;
2023 const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2024 const size_t numExportLIDs = exportLIDs.extent (0);
2025 size_t numBytesPerValue(0);
2026 {
2027 auto val_host = val_.getHostView(Access::ReadOnly);
2028 numBytesPerValue =
2029 PackTraits<impl_scalar_type>
2030 ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2031 }
2032
2033 // Compute the number of bytes ("packets") per row to pack. While
2034 // we're at it, compute the total # of block entries to send, and
2035 // the max # of block entries in any of the rows we're sending.
2036
2037 Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2038
2039 // Graph information is on host; let's do this on host parallel reduce
2040 auto exportLIDsHost = exportLIDs.view_host();
2041 auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2042 numPacketsPerLID.modify_host ();
2043 {
2044 rowReducerStruct = Impl::BlockCrsRowStruct<size_t>();
2045 for (size_t i = 0; i < numExportLIDs; ++i) {
2046 const LO lclRow = exportLIDsHost(i);
2047 size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2048 numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2049
2050 const size_t numBytes =
2051 packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2052 numPacketsPerLIDHost(i) = numBytes;
2053 rowReducerStruct += Impl::BlockCrsRowStruct<size_t>(numEnt, numBytes, numEnt);
2054 }
2055 }
2056
2057 // Compute the number of bytes ("packets") per row to pack. While
2058 // we're at it, compute the total # of block entries to send, and
2059 // the max # of block entries in any of the rows we're sending.
2060 const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2061 const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2062 const size_t maxRowLength = rowReducerStruct.maxRowLength;
2063
2064 if (verbose) {
2065 std::ostringstream os;
2066 os << prefix
2067 << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2068 << std::endl;
2069 std::cerr << os.str ();
2070 }
2071
2072 // We use a "struct of arrays" approach to packing each row's
2073 // entries. All the column indices (as global indices) go first,
2074 // then all their owning process ranks, and then the values.
2075 if(exports.extent(0) != totalNumBytes)
2076 {
2077 const std::string oldLabel = exports.view_device().label ();
2078 const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2079 exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2080 }
2081 if (totalNumEntries > 0) {
2082 // Current position (in bytes) in the 'exports' output array.
2083 Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2084 {
2085 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2086 Kokkos::parallel_scan
2087 (policy,
2088 [=](const size_t &i, size_t &update, const bool &final) {
2089 if (final) offset(i) = update;
2090 update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2091 });
2092 }
2093 if (offset(numExportLIDs) != totalNumBytes) {
2094 std::ostream& err = this->markLocalErrorAndGetStream ();
2095 err << prefix
2096 << "At end of method, the final offset (in bytes) "
2097 << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2098 << totalNumBytes << ". "
2099 << "Please report this bug to the Tpetra developers." << std::endl;
2100 return;
2101 }
2102
2103 // For each block row of the matrix owned by the calling
2104 // process, pack that block row's column indices and values into
2105 // the exports array.
2106
2107 // Source matrix's column Map. We verified in checkSizes() that
2108 // the column Map exists (is not null).
2109 const map_type& srcColMap = * (srcGraph.getColMap ());
2110
2111 // Pack the data for each row to send, into the 'exports' buffer.
2112 // exports will be modified on host.
2113 exports.modify_host();
2114 {
2115 typedef Kokkos::TeamPolicy<host_exec> policy_type;
2116 const auto policy =
2117 policy_type(numExportLIDs, 1, 1)
2118 .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2119 // The following parallel_for needs const access to the local values of src.
2120 // (the local graph is also accessed on host, but this does not use WDVs).
2121 getValuesHost();
2123 Kokkos::parallel_for
2124 (policy,
2125 [=](const typename policy_type::member_type &member) {
2126 const size_t i = member.league_rank();
2127 Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2128 gblColInds(member.team_scratch(0), maxRowLength);
2129
2130 const LO lclRowInd = exportLIDsHost(i);
2131 local_inds_host_view_type lclColInds;
2132 values_host_view_type vals;
2133
2134 // It's OK to ignore the return value, since if the calling
2135 // process doesn't own that local row, then the number of
2136 // entries in that row on the calling process is zero.
2137 src->getLocalRowView (lclRowInd, lclColInds, vals);
2138 const size_t numEnt = lclColInds.extent(0);
2139
2140 // Convert column indices from local to global.
2141 for (size_t j = 0; j < numEnt; ++j)
2142 gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2143
2144 // Kyungjoo: additional wrapping scratch view is necessary
2145 // the following function interface need the same execution space
2146 // host scratch space somehow is not considered same as the host_exec
2147 // Copy the row's data into the current spot in the exports array.
2148 const size_t numBytes =
2149 packRowForBlockCrs<impl_scalar_type, LO, GO>
2150 (exports.view_host(),
2151 offset(i),
2152 numEnt,
2153 Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2154 vals,
2155 numBytesPerValue,
2156 blockSize);
2157
2158 // numBytes should be same as the difference between offsets
2159 if (debug) {
2160 const size_t offsetDiff = offset(i+1) - offset(i);
2161 if (numBytes != offsetDiff) {
2162 std::ostringstream os;
2163 os << prefix
2164 << "numBytes computed from packRowForBlockCrs is different from "
2165 << "precomputed offset values, LID = " << i << std::endl;
2166 std::cerr << os.str ();
2167 }
2168 }
2169 }); // for each LID (of a row) to send
2171 }
2172 } // if totalNumEntries > 0
2173
2174 if (debug) {
2175 std::ostringstream os;
2176 const bool lclSuccess = ! (* (this->localError_));
2177 os << prefix
2178 << (lclSuccess ? "succeeded" : "FAILED")
2179 << std::endl;
2180 std::cerr << os.str ();
2181 }
2182 }
2183
2184 template<class Scalar, class LO, class GO, class Node>
2185 void
2188 (const Kokkos::DualView<const local_ordinal_type*,
2189 buffer_device_type>& importLIDs,
2190 Kokkos::DualView<packet_type*,
2191 buffer_device_type> imports,
2192 Kokkos::DualView<size_t*,
2193 buffer_device_type> numPacketsPerLID,
2194 const size_t /* constantNumPackets */,
2195 const CombineMode combineMode)
2196 {
2201 using std::endl;
2202 using host_exec =
2203 typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2204
2205 ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2206 const bool verbose = Behavior::verbose ();
2207
2208 // Define this function prefix
2209 std::string prefix;
2210 {
2211 std::ostringstream os;
2212 auto map = this->graph_.getRowMap ();
2213 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2214 const int myRank = comm.is_null () ? -1 : comm->getRank ();
2215 os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2216 prefix = os.str ();
2217 if (verbose) {
2218 os << "Start" << endl;
2219 std::cerr << os.str ();
2220 }
2221 }
2222
2223 // check if this already includes a local error
2224 if (* (this->localError_)) {
2225 std::ostream& err = this->markLocalErrorAndGetStream ();
2226 std::ostringstream os;
2227 os << prefix << "{Im/Ex}port target is already in error." << endl;
2228 if (verbose) {
2229 std::cerr << os.str ();
2230 }
2231 err << os.str ();
2232 return;
2233 }
2234
2238 if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2239 std::ostream& err = this->markLocalErrorAndGetStream ();
2240 std::ostringstream os;
2241 os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2242 << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2243 << "." << endl;
2244 if (verbose) {
2245 std::cerr << os.str ();
2246 }
2247 err << os.str ();
2248 return;
2249 }
2250
2251 if (combineMode != ADD && combineMode != INSERT &&
2252 combineMode != REPLACE && combineMode != ABSMAX &&
2253 combineMode != ZERO) {
2254 std::ostream& err = this->markLocalErrorAndGetStream ();
2255 std::ostringstream os;
2256 os << prefix
2257 << "Invalid CombineMode value " << combineMode << ". Valid "
2258 << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2259 << std::endl;
2260 if (verbose) {
2261 std::cerr << os.str ();
2262 }
2263 err << os.str ();
2264 return;
2265 }
2266
2267 if (this->graph_.getColMap ().is_null ()) {
2268 std::ostream& err = this->markLocalErrorAndGetStream ();
2269 std::ostringstream os;
2270 os << prefix << "Target matrix's column Map is null." << endl;
2271 if (verbose) {
2272 std::cerr << os.str ();
2273 }
2274 err << os.str ();
2275 return;
2276 }
2277
2278 // Target matrix's column Map. Use to convert the global column
2279 // indices in the receive buffer to local indices. We verified in
2280 // checkSizes() that the column Map exists (is not null).
2281 const map_type& tgtColMap = * (this->graph_.getColMap ());
2282
2283 // Const values
2284 const size_t blockSize = this->getBlockSize ();
2285 const size_t numImportLIDs = importLIDs.extent(0);
2286 // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2287 // default-constructed instance could have a different size than
2288 // other instances. (We assume all nominally constructed
2289 // instances have the same size; that's not the issue here.) This
2290 // could be bad if the calling process has no entries, but other
2291 // processes have entries that they want to send to us.
2292 size_t numBytesPerValue(0);
2293 {
2294 auto val_host = val_.getHostView(Access::ReadOnly);
2295 numBytesPerValue =
2296 PackTraits<impl_scalar_type>::packValueCount
2297 (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2298 }
2299 const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2300 const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2301
2302 if (verbose) {
2303 std::ostringstream os;
2304 os << prefix << "combineMode: "
2305 << ::Tpetra::combineModeToString (combineMode)
2306 << ", blockSize: " << blockSize
2307 << ", numImportLIDs: " << numImportLIDs
2308 << ", numBytesPerValue: " << numBytesPerValue
2309 << ", maxRowNumEnt: " << maxRowNumEnt
2310 << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2311 std::cerr << os.str ();
2312 }
2313
2314 if (combineMode == ZERO || numImportLIDs == 0) {
2315 if (verbose) {
2316 std::ostringstream os;
2317 os << prefix << "Nothing to unpack. Done!" << std::endl;
2318 std::cerr << os.str ();
2319 }
2320 return;
2321 }
2322
2323 // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2324 // we can remove this sync.
2325 if (imports.need_sync_host ()) {
2326 imports.sync_host ();
2327 }
2328
2329 // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2330 // sync'd numPacketsPerLID to host, since it needs to do that in
2331 // order to post MPI_Irecv messages with the correct lengths. A
2332 // hypothetical device-specific boundary exchange implementation
2333 // could possibly receive data without sync'ing lengths to host,
2334 // but we don't need to design for that nonexistent thing yet.
2335
2336 if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2337 importLIDs.need_sync_host ()) {
2338 std::ostream& err = this->markLocalErrorAndGetStream ();
2339 std::ostringstream os;
2340 os << prefix << "All input DualViews must be sync'd to host by now. "
2341 << "imports_nc.need_sync_host()="
2342 << (imports.need_sync_host () ? "true" : "false")
2343 << ", numPacketsPerLID.need_sync_host()="
2344 << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2345 << ", importLIDs.need_sync_host()="
2346 << (importLIDs.need_sync_host () ? "true" : "false")
2347 << "." << endl;
2348 if (verbose) {
2349 std::cerr << os.str ();
2350 }
2351 err << os.str ();
2352 return;
2353 }
2354
2355 const auto importLIDsHost = importLIDs.view_host ();
2356 const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2357
2358 // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2359 // loop, by only unpacking on final==true (when we know the
2360 // current offset's value).
2361
2362 Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2363 {
2364 const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2365 Kokkos::parallel_scan
2366 ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2367 [=] (const size_t &i, size_t &update, const bool &final) {
2368 if (final) offset(i) = update;
2369 update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2370 });
2371 }
2372
2373 // this variable does not matter with a race condition (just error flag)
2374 //
2375 // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2376 // or atomics on bool, so we use int instead. (I know we're not
2377 // launching a CUDA loop here, but we could in the future, and I'd
2378 // like to avoid potential trouble.)
2379 Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2380 errorDuringUnpack ("errorDuringUnpack");
2381 errorDuringUnpack () = 0;
2382 {
2383 using policy_type = Kokkos::TeamPolicy<host_exec>;
2384 size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2385 + 2 * sizeof(GO); // Yeah, this is a fudge factor
2386
2387 const auto policy = policy_type (numImportLIDs, 1, 1)
2388 .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2389 using host_scratch_space = typename host_exec::scratch_memory_space;
2390
2391 using pair_type = Kokkos::pair<size_t, size_t>;
2392
2393 //The following parallel_for modifies values on host while unpacking.
2396 Kokkos::parallel_for
2397 ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2398 [=] (const typename policy_type::member_type& member) {
2399 const size_t i = member.league_rank();
2400 Kokkos::View<GO*, host_scratch_space> gblColInds
2401 (member.team_scratch (0), maxRowNumEnt);
2402 Kokkos::View<LO*, host_scratch_space> lclColInds
2403 (member.team_scratch (0), maxRowNumEnt);
2404 Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2405 (member.team_scratch (0), maxRowNumScalarEnt);
2406
2407
2408 const size_t offval = offset(i);
2409 const LO lclRow = importLIDsHost(i);
2410 const size_t numBytes = numPacketsPerLIDHost(i);
2411 const size_t numEnt =
2412 unpackRowCount<impl_scalar_type, LO, GO>
2413 (imports.view_host (), offval, numBytes, numBytesPerValue);
2414
2415 if (numBytes > 0) {
2416 if (numEnt > maxRowNumEnt) {
2417 errorDuringUnpack() = 1;
2418 if (verbose) {
2419 std::ostringstream os;
2420 os << prefix
2421 << "At i = " << i << ", numEnt = " << numEnt
2422 << " > maxRowNumEnt = " << maxRowNumEnt
2423 << std::endl;
2424 std::cerr << os.str ();
2425 }
2426 return;
2427 }
2428 }
2429 const size_t numScalarEnt = numEnt*blockSize*blockSize;
2430 auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2431 auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2432 auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2433
2434 // Kyungjoo: additional wrapping scratch view is necessary
2435 // the following function interface need the same execution space
2436 // host scratch space somehow is not considered same as the host_exec
2437 size_t numBytesOut = 0;
2438 try {
2439 numBytesOut =
2440 unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2441 (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2442 Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2443 imports.view_host(),
2444 offval, numBytes, numEnt,
2445 numBytesPerValue, blockSize);
2446 }
2447 catch (std::exception& e) {
2448 errorDuringUnpack() = 1;
2449 if (verbose) {
2450 std::ostringstream os;
2451 os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2452 << e.what () << endl;
2453 std::cerr << os.str ();
2454 }
2455 return;
2456 }
2457
2458 if (numBytes != numBytesOut) {
2459 errorDuringUnpack() = 1;
2460 if (verbose) {
2461 std::ostringstream os;
2462 os << prefix
2463 << "At i = " << i << ", numBytes = " << numBytes
2464 << " != numBytesOut = " << numBytesOut << "."
2465 << std::endl;
2466 std::cerr << os.str();
2467 }
2468 return;
2469 }
2470
2471 // Convert incoming global indices to local indices.
2472 for (size_t k = 0; k < numEnt; ++k) {
2473 lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2474 if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2475 if (verbose) {
2476 std::ostringstream os;
2477 os << prefix
2478 << "At i = " << i << ", GID " << gidsOut(k)
2479 << " is not owned by the calling process."
2480 << std::endl;
2481 std::cerr << os.str();
2482 }
2483 return;
2484 }
2485 }
2486
2487 // Combine the incoming data with the matrix's current data.
2488 LO numCombd = 0;
2489 const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2490 const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2491 (const_cast<const impl_scalar_type*> (valsOut.data ()));
2492 if (combineMode == ADD) {
2493 numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2494 } else if (combineMode == INSERT || combineMode == REPLACE) {
2495 numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2496 } else if (combineMode == ABSMAX) {
2497 numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2498 }
2499
2500 if (static_cast<LO> (numEnt) != numCombd) {
2501 errorDuringUnpack() = 1;
2502 if (verbose) {
2503 std::ostringstream os;
2504 os << prefix << "At i = " << i << ", numEnt = " << numEnt
2505 << " != numCombd = " << numCombd << "."
2506 << endl;
2507 std::cerr << os.str ();
2508 }
2509 return;
2510 }
2511 }); // for each import LID i
2513 }
2514
2515 if (errorDuringUnpack () != 0) {
2516 std::ostream& err = this->markLocalErrorAndGetStream ();
2517 err << prefix << "Unpacking failed.";
2518 if (! verbose) {
2519 err << " Please run again with the environment variable setting "
2520 "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2521 }
2522 err << endl;
2523 }
2524
2525 if (verbose) {
2526 std::ostringstream os;
2527 const bool lclSuccess = ! (* (this->localError_));
2528 os << prefix
2529 << (lclSuccess ? "succeeded" : "FAILED")
2530 << std::endl;
2531 std::cerr << os.str ();
2532 }
2533 }
2534
2535 template<class Scalar, class LO, class GO, class Node>
2536 std::string
2538 {
2539 using Teuchos::TypeNameTraits;
2540 std::ostringstream os;
2541 os << "\"Tpetra::BlockCrsMatrix\": { "
2542 << "Template parameters: { "
2543 << "Scalar: " << TypeNameTraits<Scalar>::name ()
2544 << "LO: " << TypeNameTraits<LO>::name ()
2545 << "GO: " << TypeNameTraits<GO>::name ()
2546 << "Node: " << TypeNameTraits<Node>::name ()
2547 << " }"
2548 << ", Label: \"" << this->getObjectLabel () << "\""
2549 << ", Global dimensions: ["
2550 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2551 << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2552 << ", Block size: " << getBlockSize ()
2553 << " }";
2554 return os.str ();
2555 }
2556
2557
2558 template<class Scalar, class LO, class GO, class Node>
2559 void
2561 describe (Teuchos::FancyOStream& out,
2562 const Teuchos::EVerbosityLevel verbLevel) const
2563 {
2564 using Teuchos::ArrayRCP;
2565 using Teuchos::CommRequest;
2566 using Teuchos::FancyOStream;
2567 using Teuchos::getFancyOStream;
2568 using Teuchos::ireceive;
2569 using Teuchos::isend;
2570 using Teuchos::outArg;
2571 using Teuchos::TypeNameTraits;
2572 using Teuchos::VERB_DEFAULT;
2573 using Teuchos::VERB_NONE;
2574 using Teuchos::VERB_LOW;
2575 using Teuchos::VERB_MEDIUM;
2576 // using Teuchos::VERB_HIGH;
2577 using Teuchos::VERB_EXTREME;
2578 using Teuchos::RCP;
2579 using Teuchos::wait;
2580 using std::endl;
2581
2582 // Set default verbosity if applicable.
2583 const Teuchos::EVerbosityLevel vl =
2584 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2585
2586 if (vl == VERB_NONE) {
2587 return; // print nothing
2588 }
2589
2590 // describe() always starts with a tab before it prints anything.
2591 Teuchos::OSTab tab0 (out);
2592
2593 out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2594 Teuchos::OSTab tab1 (out);
2595
2596 out << "Template parameters:" << endl;
2597 {
2598 Teuchos::OSTab tab2 (out);
2599 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2600 << "LO: " << TypeNameTraits<LO>::name () << endl
2601 << "GO: " << TypeNameTraits<GO>::name () << endl
2602 << "Node: " << TypeNameTraits<Node>::name () << endl;
2603 }
2604 out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2605 << "Global dimensions: ["
2606 << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2607 << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2608
2609 const LO blockSize = getBlockSize ();
2610 out << "Block size: " << blockSize << endl;
2611
2612 // constituent objects
2613 if (vl >= VERB_MEDIUM) {
2614 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2615 const int myRank = comm.getRank ();
2616 if (myRank == 0) {
2617 out << "Row Map:" << endl;
2618 }
2619 getRowMap()->describe (out, vl);
2620
2621 if (! getColMap ().is_null ()) {
2622 if (getColMap() == getRowMap()) {
2623 if (myRank == 0) {
2624 out << "Column Map: same as row Map" << endl;
2625 }
2626 }
2627 else {
2628 if (myRank == 0) {
2629 out << "Column Map:" << endl;
2630 }
2631 getColMap ()->describe (out, vl);
2632 }
2633 }
2634 if (! getDomainMap ().is_null ()) {
2635 if (getDomainMap () == getRowMap ()) {
2636 if (myRank == 0) {
2637 out << "Domain Map: same as row Map" << endl;
2638 }
2639 }
2640 else if (getDomainMap () == getColMap ()) {
2641 if (myRank == 0) {
2642 out << "Domain Map: same as column Map" << endl;
2643 }
2644 }
2645 else {
2646 if (myRank == 0) {
2647 out << "Domain Map:" << endl;
2648 }
2649 getDomainMap ()->describe (out, vl);
2650 }
2651 }
2652 if (! getRangeMap ().is_null ()) {
2653 if (getRangeMap () == getDomainMap ()) {
2654 if (myRank == 0) {
2655 out << "Range Map: same as domain Map" << endl;
2656 }
2657 }
2658 else if (getRangeMap () == getRowMap ()) {
2659 if (myRank == 0) {
2660 out << "Range Map: same as row Map" << endl;
2661 }
2662 }
2663 else {
2664 if (myRank == 0) {
2665 out << "Range Map: " << endl;
2666 }
2667 getRangeMap ()->describe (out, vl);
2668 }
2669 }
2670 }
2671
2672 if (vl >= VERB_EXTREME) {
2673 const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2674 const int myRank = comm.getRank ();
2675 const int numProcs = comm.getSize ();
2676
2677 // Print the calling process' data to the given output stream.
2678 RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2679 RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2680 FancyOStream& os = *osPtr;
2681 os << "Process " << myRank << ":" << endl;
2682 Teuchos::OSTab tab2 (os);
2683
2684 const map_type& meshRowMap = * (graph_.getRowMap ());
2685 const map_type& meshColMap = * (graph_.getColMap ());
2686 for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2687 meshLclRow <= meshRowMap.getMaxLocalIndex ();
2688 ++meshLclRow) {
2689 const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2690 os << "Row " << meshGblRow << ": {";
2691
2692 local_inds_host_view_type lclColInds;
2693 values_host_view_type vals;
2694 LO numInds = 0;
2695 this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
2696
2697 for (LO k = 0; k < numInds; ++k) {
2698 const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2699
2700 os << "Col " << gblCol << ": [";
2701 for (LO i = 0; i < blockSize; ++i) {
2702 for (LO j = 0; j < blockSize; ++j) {
2703 os << vals[blockSize*blockSize*k + i*blockSize + j];
2704 if (j + 1 < blockSize) {
2705 os << ", ";
2706 }
2707 }
2708 if (i + 1 < blockSize) {
2709 os << "; ";
2710 }
2711 }
2712 os << "]";
2713 if (k + 1 < numInds) {
2714 os << ", ";
2715 }
2716 }
2717 os << "}" << endl;
2718 }
2719
2720 // Print data on Process 0. This will automatically respect the
2721 // current indentation level.
2722 if (myRank == 0) {
2723 out << lclOutStrPtr->str ();
2724 lclOutStrPtr = Teuchos::null; // clear it to save space
2725 }
2726
2727 const int sizeTag = 1337;
2728 const int dataTag = 1338;
2729
2730 ArrayRCP<char> recvDataBuf; // only used on Process 0
2731
2732 // Send string sizes and data from each process in turn to
2733 // Process 0, and print on that process.
2734 for (int p = 1; p < numProcs; ++p) {
2735 if (myRank == 0) {
2736 // Receive the incoming string's length.
2737 ArrayRCP<size_t> recvSize (1);
2738 recvSize[0] = 0;
2739 RCP<CommRequest<int> > recvSizeReq =
2740 ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2741 wait<int> (comm, outArg (recvSizeReq));
2742 const size_t numCharsToRecv = recvSize[0];
2743
2744 // Allocate space for the string to receive. Reuse receive
2745 // buffer space if possible. We can do this because in the
2746 // current implementation, we only have one receive in
2747 // flight at a time. Leave space for the '\0' at the end,
2748 // in case the sender doesn't send it.
2749 if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2750 recvDataBuf.resize (numCharsToRecv + 1);
2751 }
2752 ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2753 // Post the receive of the actual string data.
2754 RCP<CommRequest<int> > recvDataReq =
2755 ireceive<int, char> (recvData, p, dataTag, comm);
2756 wait<int> (comm, outArg (recvDataReq));
2757
2758 // Print the received data. This will respect the current
2759 // indentation level. Make sure that the string is
2760 // null-terminated.
2761 recvDataBuf[numCharsToRecv] = '\0';
2762 out << recvDataBuf.getRawPtr ();
2763 }
2764 else if (myRank == p) { // if I am not Process 0, and my rank is p
2765 // This deep-copies the string at most twice, depending on
2766 // whether std::string reference counts internally (it
2767 // generally does, so this won't deep-copy at all).
2768 const std::string stringToSend = lclOutStrPtr->str ();
2769 lclOutStrPtr = Teuchos::null; // clear original to save space
2770
2771 // Send the string's length to Process 0.
2772 const size_t numCharsToSend = stringToSend.size ();
2773 ArrayRCP<size_t> sendSize (1);
2774 sendSize[0] = numCharsToSend;
2775 RCP<CommRequest<int> > sendSizeReq =
2776 isend<int, size_t> (sendSize, 0, sizeTag, comm);
2777 wait<int> (comm, outArg (sendSizeReq));
2778
2779 // Send the actual string to Process 0. We know that the
2780 // string has length > 0, so it's save to take the address
2781 // of the first entry. Make a nonowning ArrayRCP to hold
2782 // the string. Process 0 will add a null termination
2783 // character at the end of the string, after it receives the
2784 // message.
2785 ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2786 RCP<CommRequest<int> > sendDataReq =
2787 isend<int, char> (sendData, 0, dataTag, comm);
2788 wait<int> (comm, outArg (sendDataReq));
2789 }
2790 } // for each process rank p other than 0
2791 } // extreme verbosity level (print the whole matrix)
2792 }
2793
2794 template<class Scalar, class LO, class GO, class Node>
2795 Teuchos::RCP<const Teuchos::Comm<int> >
2797 getComm() const
2798 {
2799 return graph_.getComm();
2800 }
2801
2802
2803 template<class Scalar, class LO, class GO, class Node>
2806 getGlobalNumCols() const
2807 {
2808 return graph_.getGlobalNumCols();
2809 }
2810
2811
2812 template<class Scalar, class LO, class GO, class Node>
2813 size_t
2815 getLocalNumCols() const
2816 {
2817 return graph_.getLocalNumCols();
2818 }
2819
2820 template<class Scalar, class LO, class GO, class Node>
2821 GO
2823 getIndexBase() const
2824 {
2825 return graph_.getIndexBase();
2826 }
2827
2828 template<class Scalar, class LO, class GO, class Node>
2832 {
2833 return graph_.getGlobalNumEntries();
2834 }
2835
2836 template<class Scalar, class LO, class GO, class Node>
2837 size_t
2839 getLocalNumEntries() const
2840 {
2841 return graph_.getLocalNumEntries();
2842 }
2843
2844 template<class Scalar, class LO, class GO, class Node>
2845 size_t
2847 getNumEntriesInGlobalRow (GO globalRow) const
2848 {
2849 return graph_.getNumEntriesInGlobalRow(globalRow);
2850 }
2851
2852
2853 template<class Scalar, class LO, class GO, class Node>
2854 size_t
2857 {
2858 return graph_.getGlobalMaxNumRowEntries();
2859 }
2860
2861 template<class Scalar, class LO, class GO, class Node>
2862 bool
2864 hasColMap() const
2865 {
2866 return graph_.hasColMap();
2867 }
2868
2869
2870 template<class Scalar, class LO, class GO, class Node>
2871 bool
2873 isLocallyIndexed() const
2874 {
2875 return graph_.isLocallyIndexed();
2876 }
2877
2878 template<class Scalar, class LO, class GO, class Node>
2879 bool
2881 isGloballyIndexed() const
2882 {
2883 return graph_.isGloballyIndexed();
2884 }
2885
2886 template<class Scalar, class LO, class GO, class Node>
2887 bool
2889 isFillComplete() const
2890 {
2891 return graph_.isFillComplete ();
2892 }
2893
2894 template<class Scalar, class LO, class GO, class Node>
2895 bool
2897 supportsRowViews() const
2898 {
2899 return false;
2900 }
2901
2902 template<class Scalar, class LO, class GO, class Node>
2903 void
2905 getGlobalRowCopy (GO /*GlobalRow*/,
2906 nonconst_global_inds_host_view_type &/*Indices*/,
2907 nonconst_values_host_view_type &/*Values*/,
2908 size_t& /*NumEntries*/) const
2909 {
2910 TEUCHOS_TEST_FOR_EXCEPTION(
2911 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2912 "This class doesn't support global matrix indexing.");
2913
2914 }
2915
2916 template<class Scalar, class LO, class GO, class Node>
2917 void
2919 getGlobalRowView (GO /* GlobalRow */,
2920 global_inds_host_view_type &/* indices */,
2921 values_host_view_type &/* values */) const
2922 {
2923 TEUCHOS_TEST_FOR_EXCEPTION(
2924 true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2925 "This class doesn't support global matrix indexing.");
2926
2927 }
2928
2929 template<class Scalar, class LO, class GO, class Node>
2930 void
2932 getLocalRowView (LO localRowInd,
2933 local_inds_host_view_type &colInds,
2934 values_host_view_type &vals) const
2935 {
2936 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2937 colInds = local_inds_host_view_type();
2938 vals = values_host_view_type();
2939 }
2940 else {
2941 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2942 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2943 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2944
2945 vals = getValuesHost (localRowInd);
2946 }
2947 }
2948
2949 template<class Scalar, class LO, class GO, class Node>
2950 void
2952 getLocalRowViewNonConst (LO localRowInd,
2953 local_inds_host_view_type &colInds,
2954 nonconst_values_host_view_type &vals) const
2955 {
2956 if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2957 colInds = nonconst_local_inds_host_view_type();
2958 vals = nonconst_values_host_view_type();
2959 }
2960 else {
2961 const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2962 const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2963 colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2964
2965 using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2966 vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2967 }
2968 }
2969
2970 template<class Scalar, class LO, class GO, class Node>
2971 void
2974 {
2975 const size_t lclNumMeshRows = graph_.getLocalNumRows ();
2976
2977 Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
2978 graph_.getLocalDiagOffsets (diagOffsets);
2979
2980 // The code below works on host, so use a host View.
2981 auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
2982 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
2983 Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
2984
2985 auto vals_host_out = val_.getHostView(Access::ReadOnly);
2986 const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
2987
2988 // TODO amk: This is a temporary measure to make the code run with Ifpack2
2989 size_t rowOffset = 0;
2990 size_t offset = 0;
2991 LO bs = getBlockSize();
2992 for(size_t r=0; r<getLocalNumRows(); r++)
2993 {
2994 // move pointer to start of diagonal block
2995 offset = rowOffset + diagOffsetsHost(r)*bs*bs;
2996 for(int b=0; b<bs; b++)
2997 {
2998 diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
2999 }
3000 // move pointer to start of next block row
3001 rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3002 }
3003
3004 //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3005 }
3006
3007 template<class Scalar, class LO, class GO, class Node>
3008 void
3010 leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3011 {
3012 TEUCHOS_TEST_FOR_EXCEPTION(
3013 true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3014 "not implemented.");
3015
3016 }
3017
3018 template<class Scalar, class LO, class GO, class Node>
3019 void
3021 rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3022 {
3023 TEUCHOS_TEST_FOR_EXCEPTION(
3024 true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3025 "not implemented.");
3026
3027 }
3028
3029 template<class Scalar, class LO, class GO, class Node>
3030 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3032 getGraph() const
3033 {
3034 return graphRCP_;
3035 }
3036
3037 template<class Scalar, class LO, class GO, class Node>
3038 typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3040 getFrobeniusNorm () const
3041 {
3042 TEUCHOS_TEST_FOR_EXCEPTION(
3043 true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3044 "not implemented.");
3045 }
3046
3047} // namespace Tpetra
3048
3049//
3050// Explicit instantiation macro
3051//
3052// Must be expanded from within the Tpetra namespace!
3053//
3054#define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3055 template class BlockCrsMatrix< S, LO, GO, NODE >;
3056
3057#endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual LO getBlockSize() const override
The number of degrees of freedom per mesh point.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
std::string errorMessages() const
The current stream of error messages.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
::Tpetra::MultiVector< Scalar, LO, GO, node_type > mv_type
The implementation of MultiVector that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
size_t getLocalNumRows() const override
get the local number of block rows
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
local_matrix_device_type getLocalMatrixDevice() const
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
Kokkos::View< const impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail; tells.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
::Tpetra::Map< LO, GO, node_type > map_type
The implementation of Map that this class uses.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
::Tpetra::CrsGraph< LO, GO, node_type > crs_graph_type
The implementation of CrsGraph that this class uses.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
MultiVector for multiple degrees of freedom per mesh point.
const mv_type & getMultiVectorView() const
Get a Tpetra::MultiVector that views this BlockMultiVector's data.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
Tpetra::Map< LO, GO, Node > map_type
The specialization of Tpetra::Map that this class uses.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
::Tpetra::Import< LO, GO, node_type > import_type
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Description of Tpetra's behavior.
virtual Teuchos::RCP< const map_type > getMap() const
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
A distributed dense vector.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Namespace for new Tpetra features that are not ready for public release, but are ready for evaluation...
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Teuchos::RCP< CrsGraphType > exportAndFillCompleteCrsGraph(const Teuchos::RCP< const CrsGraphType > &sourceGraph, const Export< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > &exporter, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember CrsGraph constructor that fuses Export and fillComplete().
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Teuchos::RCP< CrsGraphType > importAndFillCompleteCrsGraph(const Teuchos::RCP< const CrsGraphType > &sourceGraph, const Import< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > &importer, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< typename CrsGraphType::local_ordinal_type, typename CrsGraphType::global_ordinal_type, typename CrsGraphType::node_type > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Nonmember CrsGraph constructor that fuses Import and fillComplete().
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Traits class for packing / unpacking data of type T.