Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10// clang-format off
11#ifndef TPETRA_MULTIVECTOR_DEF_HPP
12#define TPETRA_MULTIVECTOR_DEF_HPP
13
21
22#include "Tpetra_Core.hpp"
23#include "Tpetra_Util.hpp"
24#include "Tpetra_Vector.hpp"
36#include "Tpetra_Details_Random.hpp"
37#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
38# include "Teuchos_SerialDenseMatrix.hpp"
39#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
40#include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
41#include "KokkosCompat_View.hpp"
42#include "KokkosBlas.hpp"
43#include "KokkosKernels_Utils.hpp"
44#include "Kokkos_Random.hpp"
45#include "Kokkos_ArithTraits.hpp"
46#include <memory>
47#include <sstream>
48
49#ifdef HAVE_TPETRA_INST_FLOAT128
50namespace Kokkos {
51 // FIXME (mfh 04 Sep 2015) Just a stub for now!
52 template<class Generator>
53 struct rand<Generator, __float128> {
54 static KOKKOS_INLINE_FUNCTION __float128 max ()
55 {
56 return static_cast<__float128> (1.0);
57 }
58 static KOKKOS_INLINE_FUNCTION __float128
59 draw (Generator& gen)
60 {
61 // Half the smallest normalized double, is the scaling factor of
62 // the lower-order term in the double-double representation.
63 const __float128 scalingFactor =
64 static_cast<__float128> (std::numeric_limits<double>::min ()) /
65 static_cast<__float128> (2.0);
66 const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
67 const __float128 lowerOrderTerm =
68 static_cast<__float128> (gen.drand ()) * scalingFactor;
69 return higherOrderTerm + lowerOrderTerm;
70 }
71 static KOKKOS_INLINE_FUNCTION __float128
72 draw (Generator& gen, const __float128& range)
73 {
74 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
75 const __float128 scalingFactor =
76 static_cast<__float128> (std::numeric_limits<double>::min ()) /
77 static_cast<__float128> (2.0);
78 const __float128 higherOrderTerm =
79 static_cast<__float128> (gen.drand (range));
80 const __float128 lowerOrderTerm =
81 static_cast<__float128> (gen.drand (range)) * scalingFactor;
82 return higherOrderTerm + lowerOrderTerm;
83 }
84 static KOKKOS_INLINE_FUNCTION __float128
85 draw (Generator& gen, const __float128& start, const __float128& end)
86 {
87 // FIXME (mfh 05 Sep 2015) Not sure if this is right.
88 const __float128 scalingFactor =
89 static_cast<__float128> (std::numeric_limits<double>::min ()) /
90 static_cast<__float128> (2.0);
91 const __float128 higherOrderTerm =
92 static_cast<__float128> (gen.drand (start, end));
93 const __float128 lowerOrderTerm =
94 static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
95 return higherOrderTerm + lowerOrderTerm;
96 }
97 };
98} // namespace Kokkos
99#endif // HAVE_TPETRA_INST_FLOAT128
100
101namespace { // (anonymous)
102
106 /// Tpetra::MultiVector.
107 ///
108 /// \param lclNumRows [in] Number of rows in the DualView.
109 /// "Local" means "local to the calling MPI process."
110 /// \param numCols [in] Number of columns in the DualView.
111 /// \param zeroOut [in] Whether to initialize all the entries of the
112 /// DualView to zero. Kokkos does first-touch initialization.
113 /// \param allowPadding [in] Whether to give Kokkos the option to
114 /// pad Views for alignment.
115 ///
116 /// \return The allocated Kokkos::DualView.
117 template<class ST, class LO, class GO, class NT>
118 typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type
119 allocDualView (const size_t lclNumRows,
120 const size_t numCols,
121 const bool zeroOut = true,
122 const bool allowPadding = false)
123 {
125 using Kokkos::AllowPadding;
126 using Kokkos::view_alloc;
127 using Kokkos::WithoutInitializing;
129 typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::wrapped_dual_view_type wrapped_dual_view_type;
130 typedef typename dual_view_type::t_dev dev_view_type;
131
132 // This needs to be a string and not a char*, if given as an
133 // argument to Kokkos::view_alloc. This is because view_alloc
134 // also allows a raw pointer as its first argument. See
135 // https://github.com/kokkos/kokkos/issues/434.
136 const std::string label ("MV::DualView");
137 const bool debug = Behavior::debug ();
138
139 // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
140 // creation of the DualView's Views works around
141 // Kokkos::DualView's current inability to accept an
142 // AllocationProperties initial argument (as Kokkos::View does).
143 // However, the work-around is harmless, since it does what the
144 // (currently nonexistent) equivalent DualView constructor would
145 // have done anyway.
146
147 dev_view_type d_view;
148 if (zeroOut) {
149 if (allowPadding) {
150 d_view = dev_view_type (view_alloc (label, AllowPadding),
151 lclNumRows, numCols);
152 }
153 else {
154 d_view = dev_view_type (view_alloc (label),
155 lclNumRows, numCols);
156 }
157 }
158 else {
159 if (allowPadding) {
160 d_view = dev_view_type (view_alloc (label,
161 WithoutInitializing,
162 AllowPadding),
163 lclNumRows, numCols);
164 }
165 else {
166 d_view = dev_view_type (view_alloc (label, WithoutInitializing),
167 lclNumRows, numCols);
168 }
169 if (debug) {
170 // Filling with NaN is a cheap and effective way to tell if
171 // downstream code is trying to use a MultiVector's data
172 // without them having been initialized. ArithTraits lets us
173 // call nan() even if the scalar type doesn't define it; it
174 // just returns some undefined value in the latter case. This
175 // won't hurt anything because by setting zeroOut=false, users
176 // already agreed that they don't care about the contents of
177 // the MultiVector.
178 const ST nan = Kokkos::ArithTraits<ST>::nan ();
179 KokkosBlas::fill (d_view, nan);
180 }
181 }
182 if (debug) {
183 TEUCHOS_TEST_FOR_EXCEPTION
184 (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
185 static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
186 "allocDualView: d_view's dimensions actual dimensions do not match "
187 "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
188 d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
189 << ". Please report this bug to the Tpetra developers.");
190 }
191
192 return wrapped_dual_view_type(d_view);
193 }
194
195 // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
196 //
197 // T: The type of the entries of the View.
198 // ExecSpace: The Kokkos execution space.
199 template<class T, class ExecSpace>
200 struct MakeUnmanagedView {
201 // The 'false' part of the branch carefully ensures that this
202 // won't attempt to use a host execution space that hasn't been
203 // initialized. For example, if Kokkos::OpenMP is disabled and
204 // Kokkos::Threads is enabled, the latter is always the default
205 // execution space of Kokkos::HostSpace, even when ExecSpace is
206 // Kokkos::Serial. That's why we go through the trouble of asking
207 // Kokkos::DualView what _its_ space is. That seems to work
208 // around this default execution space issue.
209 //
210 typedef typename std::conditional<
211 Kokkos::SpaceAccessibility<
212 typename ExecSpace::memory_space,
213 Kokkos::HostSpace>::accessible,
214 typename ExecSpace::device_type,
215 typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
216 typedef Kokkos::LayoutLeft array_layout;
217 typedef Kokkos::View<T*, array_layout, host_exec_space,
218 Kokkos::MemoryUnmanaged> view_type;
219
220 static view_type getView (const Teuchos::ArrayView<T>& x_in)
221 {
222 const size_t numEnt = static_cast<size_t> (x_in.size ());
223 if (numEnt == 0) {
224 return view_type ();
225 } else {
226 return view_type (x_in.getRawPtr (), numEnt);
227 }
228 }
229 };
230
231
232 template<class WrappedDualViewType>
233 WrappedDualViewType
234 takeSubview (const WrappedDualViewType& X,
235 const std::pair<size_t, size_t>& rowRng,
236 const Kokkos::ALL_t& colRng)
237
238 {
239 // The bug we saw below should be harder to trigger here.
240 return WrappedDualViewType(X,rowRng,colRng);
241 }
242
243 template<class WrappedDualViewType>
244 WrappedDualViewType
245 takeSubview (const WrappedDualViewType& X,
246 const Kokkos::ALL_t& rowRng,
247 const std::pair<size_t, size_t>& colRng)
248 {
249 using DualViewType = typename WrappedDualViewType::DVT;
250 // Look carefullly at the comment in the below version of this function.
251 // The comment applies here as well.
252 if (X.extent (0) == 0 && X.extent (1) != 0) {
253 return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
254 }
255 else {
256 return WrappedDualViewType(X,rowRng,colRng);
257 }
258 }
259
260 template<class WrappedDualViewType>
261 WrappedDualViewType
262 takeSubview (const WrappedDualViewType& X,
263 const std::pair<size_t, size_t>& rowRng,
264 const std::pair<size_t, size_t>& colRng)
265 {
266 using DualViewType = typename WrappedDualViewType::DVT;
267 // If you take a subview of a view with zero rows Kokkos::subview()
268 // always returns a DualView with the same data pointers. This will break
269 // pointer equality testing in between two subviews of the same 2D View if
270 // it has zero row extent. While the one (known) case where this was actually used
271 // has been fixed, that sort of check could very easily be reintroduced in the future,
272 // hence I've added this if check here.
273 //
274 // This is not a bug in Kokkos::subview(), just some very subtle behavior which
275 // future developers should be wary of.
276 if (X.extent (0) == 0 && X.extent (1) != 0) {
277 return WrappedDualViewType(DualViewType ("MV::DualView", 0, colRng.second - colRng.first));
278 }
279 else {
280 return WrappedDualViewType(X,rowRng,colRng);
281 }
282 }
283
284 template<class WrappedOrNotDualViewType>
285 size_t
286 getDualViewStride (const WrappedOrNotDualViewType& dv)
287 {
288 // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
289 // method yet, but its Views do.
290 // NOTE: dv.stride() returns a vector of length one
291 // more than its rank
292 size_t strides[WrappedOrNotDualViewType::t_dev::rank+1];
293 dv.stride(strides);
294 const size_t LDA = strides[1];
295 const size_t numRows = dv.extent (0);
296
297 if (LDA == 0) {
298 return (numRows == 0) ? size_t (1) : numRows;
299 }
300 else {
301 return LDA;
302 }
303 }
304
305 template<class ViewType>
306 size_t
307 getViewStride (const ViewType& view)
308 {
309 const size_t LDA = view.stride (1);
310 const size_t numRows = view.extent (0);
311
312 if (LDA == 0) {
313 return (numRows == 0) ? size_t (1) : numRows;
314 }
315 else {
316 return LDA;
317 }
318 }
319
320 template <class impl_scalar_type, class buffer_device_type>
321 bool
322 runKernelOnHost (
323 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
324 )
325 {
326 if (! imports.need_sync_device ()) {
327 return false; // most up-to-date on device
328 }
329 else { // most up-to-date on host,
330 // but if large enough, worth running on device anyway
331 size_t localLengthThreshold =
333 return imports.extent(0) <= localLengthThreshold;
334 }
335 }
336
337
338 template <class SC, class LO, class GO, class NT>
339 bool
340 runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
341 {
342 if (! X.need_sync_device ()) {
343 return false; // most up-to-date on device
344 }
345 else { // most up-to-date on host
346 // but if large enough, worth running on device anyway
347 size_t localLengthThreshold =
349 return X.getLocalLength () <= localLengthThreshold;
350 }
351 }
352
353 template <class SC, class LO, class GO, class NT>
354 void
355 multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
357 const ::Tpetra::Details::EWhichNorm whichNorm)
358 {
359 using namespace Tpetra;
362 using val_type = typename MV::impl_scalar_type;
363 using mag_type = typename MV::mag_type;
364 using dual_view_type = typename MV::dual_view_type;
365
366 auto map = X.getMap ();
367 auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
368 auto whichVecs = getMultiVectorWhichVectors (X);
369 const bool isConstantStride = X.isConstantStride ();
370 const bool isDistributed = X.isDistributed ();
371
372 const bool runOnHost = runKernelOnHost (X);
373 if (runOnHost) {
374 using view_type = typename dual_view_type::t_host;
375 using array_layout = typename view_type::array_layout;
376 using device_type = typename view_type::device_type;
377
378 auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
379 normImpl<val_type, array_layout, device_type,
380 mag_type> (norms, X_lcl, whichNorm, whichVecs,
381 isConstantStride, isDistributed, comm);
382 }
383 else {
384 using view_type = typename dual_view_type::t_dev;
385 using array_layout = typename view_type::array_layout;
386 using device_type = typename view_type::device_type;
387
388 auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
389 normImpl<val_type, array_layout, device_type,
390 mag_type> (norms, X_lcl, whichNorm, whichVecs,
391 isConstantStride, isDistributed, comm);
392 }
393 }
394} // namespace (anonymous)
395
396
397namespace Tpetra {
398
399 namespace Details {
400 template <typename DstView, typename SrcView>
401 struct AddAssignFunctor {
402 // This functor would be better as a lambda, but CUDA cannot compile
403 // lambdas in protected functions. It compiles fine with the functor.
404 AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
405
406 KOKKOS_INLINE_FUNCTION void
407 operator () (const size_t k) const { tgt(k) += src(k); }
408
409 DstView tgt;
410 SrcView src;
411 };
412 }
413
414 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
415 bool
417 vectorIndexOutOfRange (const size_t VectorIndex) const {
418 return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
419 }
420
421 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
423 MultiVector () :
424 base_type (Teuchos::rcp (new map_type ()))
425 {}
426
427 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
429 MultiVector (const Teuchos::RCP<const map_type>& map,
430 const size_t numVecs,
431 const bool zeroOut) : /* default is true */
432 base_type (map)
433 {
434 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
435
436 const size_t lclNumRows = this->getLocalLength ();
437 view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
438 }
439
440 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
443 const Teuchos::DataAccess copyOrView) :
444 base_type (source),
445 view_ (source.view_),
447 {
449 const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
450 "const Teuchos::DataAccess): ";
451
452 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
453
454 if (copyOrView == Teuchos::Copy) {
455 // Reuse the conveniently already existing function that creates
456 // a deep copy.
457 MV cpy = createCopy (source);
458 this->view_ = cpy.view_;
459 this->whichVectors_ = cpy.whichVectors_;
460 }
461 else if (copyOrView == Teuchos::View) {
462 }
463 else {
464 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
465 true, std::invalid_argument, "Second argument 'copyOrView' has an "
466 "invalid value " << copyOrView << ". Valid values include "
467 "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
468 << Teuchos::View << ".");
469 }
470 }
471
472 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
474 MultiVector (const Teuchos::RCP<const map_type>& map,
475 const dual_view_type& view) :
476 base_type (map),
477 view_ (wrapped_dual_view_type(view))
478 {
479 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
480 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
481 map->getLocalNumElements ();
482 const size_t lclNumRows_view = view.extent (0);
483 const size_t LDA = getDualViewStride (view_);
484
485 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
486 (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
487 std::invalid_argument, "Kokkos::DualView does not match Map. "
488 "map->getLocalNumElements() = " << lclNumRows_map
489 << ", view.extent(0) = " << lclNumRows_view
490 << ", and getStride() = " << LDA << ".");
491
493 const bool debug = Behavior::debug ();
494 if (debug) {
495 using ::Tpetra::Details::checkGlobalDualViewValidity;
496 std::ostringstream gblErrStrm;
497 const bool verbose = Behavior::verbose ();
498 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
499 const bool gblValid =
500 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
501 comm.getRawPtr ());
502 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
503 (! gblValid, std::runtime_error, gblErrStrm.str ());
504 }
505 }
506
507
508 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
510 MultiVector (const Teuchos::RCP<const map_type>& map,
511 const wrapped_dual_view_type& view) :
512 base_type (map),
513 view_ (view)
514 {
515 const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
516 const size_t lclNumRows_map = map.is_null () ? size_t (0) :
517 map->getLocalNumElements ();
518 const size_t lclNumRows_view = view.extent (0);
519 const size_t LDA = getDualViewStride (view);
520
521 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
522 (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
523 std::invalid_argument, "Kokkos::DualView does not match Map. "
524 "map->getLocalNumElements() = " << lclNumRows_map
525 << ", view.extent(0) = " << lclNumRows_view
526 << ", and getStride() = " << LDA << ".");
527
529 const bool debug = Behavior::debug ();
530 if (debug) {
531 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
532 std::ostringstream gblErrStrm;
533 const bool verbose = Behavior::verbose ();
534 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
535 const bool gblValid =
536 checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
537 comm.getRawPtr ());
538 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
539 (! gblValid, std::runtime_error, gblErrStrm.str ());
540 }
541 }
542
543
544
545 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
547 MultiVector (const Teuchos::RCP<const map_type>& map,
548 const typename dual_view_type::t_dev& d_view) :
549 base_type (map)
550 {
551 using Teuchos::ArrayRCP;
552 using Teuchos::RCP;
553 const char tfecfFuncName[] = "MultiVector(map,d_view): ";
554
555 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
556
557 const size_t LDA = getViewStride (d_view);
558 const size_t lclNumRows = map->getLocalNumElements ();
559 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
560 (LDA < lclNumRows, std::invalid_argument, "Map does not match "
561 "Kokkos::View. map->getLocalNumElements() = " << lclNumRows
562 << ", View's column stride = " << LDA
563 << ", and View's extent(0) = " << d_view.extent (0) << ".");
564
565 auto h_view = Kokkos::create_mirror_view (d_view);
566 auto dual_view = dual_view_type (d_view, h_view);
567 view_ = wrapped_dual_view_type(dual_view);
568
570 const bool debug = Behavior::debug ();
571 if (debug) {
572 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
573 std::ostringstream gblErrStrm;
574 const bool verbose = Behavior::verbose ();
575 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
576 const bool gblValid =
577 checkGlobalWrappedDualViewValidity (&gblErrStrm, view_, verbose,
578 comm.getRawPtr ());
579 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
580 (! gblValid, std::runtime_error, gblErrStrm.str ());
581 }
582 // The user gave us a device view. In order to respect its
583 // initial contents, we mark the DualView as "modified on device."
584 // That way, the next sync will synchronize it with the host view.
585 dual_view.modify_device ();
586 }
587
588 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
590 MultiVector (const Teuchos::RCP<const map_type>& map,
591 const dual_view_type& view,
592 const dual_view_type& origView) :
593 base_type (map),
594 view_ (wrapped_dual_view_type(view,origView))
595 {
596 const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
597
598 const size_t LDA = getDualViewStride (origView);
599 const size_t lclNumRows = this->getLocalLength (); // comes from the Map
600 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
601 LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
602 "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
603 << ". This may also mean that the input origView's first dimension (number "
604 "of rows = " << origView.extent (0) << ") does not not match the number "
605 "of entries in the Map on the calling process.");
606
608 const bool debug = Behavior::debug ();
609 if (debug) {
610 using ::Tpetra::Details::checkGlobalDualViewValidity;
611 std::ostringstream gblErrStrm;
612 const bool verbose = Behavior::verbose ();
613 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
614 const bool gblValid_0 =
615 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
616 comm.getRawPtr ());
617 const bool gblValid_1 =
618 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
619 comm.getRawPtr ());
620 const bool gblValid = gblValid_0 && gblValid_1;
621 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
622 (! gblValid, std::runtime_error, gblErrStrm.str ());
623 }
624 }
625
626
627 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
629 MultiVector (const Teuchos::RCP<const map_type>& map,
630 const dual_view_type& view,
631 const Teuchos::ArrayView<const size_t>& whichVectors) :
632 base_type (map),
633 view_ (view),
634 whichVectors_ (whichVectors.begin (), whichVectors.end ())
635 {
636 using Kokkos::ALL;
637 using Kokkos::subview;
638 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
639
641 const bool debug = Behavior::debug ();
642 if (debug) {
643 using ::Tpetra::Details::checkGlobalDualViewValidity;
644 std::ostringstream gblErrStrm;
645 const bool verbose = Behavior::verbose ();
646 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
647 const bool gblValid =
648 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
649 comm.getRawPtr ());
650 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
651 (! gblValid, std::runtime_error, gblErrStrm.str ());
652 }
653
654 const size_t lclNumRows = map.is_null () ? size_t (0) :
655 map->getLocalNumElements ();
656 // Check dimensions of the input DualView. We accept that Kokkos
657 // might not allow construction of a 0 x m (Dual)View with m > 0,
658 // so we only require the number of rows to match if the
659 // (Dual)View has more than zero columns. Likewise, we only
660 // require the number of columns to match if the (Dual)View has
661 // more than zero rows.
662 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
663 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
664 std::invalid_argument, "view.extent(0) = " << view.extent (0)
665 << " < map->getLocalNumElements() = " << lclNumRows << ".");
666 if (whichVectors.size () != 0) {
667 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
668 view.extent (1) != 0 && view.extent (1) == 0,
669 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
670 " = " << whichVectors.size () << " > 0.");
671 size_t maxColInd = 0;
672 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
673 for (size_type k = 0; k < whichVectors.size (); ++k) {
674 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
675 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
676 std::invalid_argument, "whichVectors[" << k << "] = "
677 "Teuchos::OrdinalTraits<size_t>::invalid().");
678 maxColInd = std::max (maxColInd, whichVectors[k]);
679 }
680 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
681 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
682 std::invalid_argument, "view.extent(1) = " << view.extent (1)
683 << " <= max(whichVectors) = " << maxColInd << ".");
684 }
685
686 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
687 // zero strides, so modify in that case.
688 const size_t LDA = getDualViewStride (view);
689 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
690 (LDA < lclNumRows, std::invalid_argument,
691 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
692
693 if (whichVectors.size () == 1) {
694 // If whichVectors has only one entry, we don't need to bother
695 // with nonconstant stride. Just shift the view over so it
696 // points to the desired column.
697 //
698 // NOTE (mfh 10 May 2014) This is a special case where we set
699 // origView_ just to view that one column, not all of the
700 // original columns. This ensures that the use of origView_ in
701 // offsetView works correctly.
702 //
703 const std::pair<size_t, size_t> colRng (whichVectors[0],
704 whichVectors[0] + 1);
705 view_ = takeSubview (view_, ALL (), colRng);
706 // whichVectors_.size() == 0 means "constant stride."
707 whichVectors_.clear ();
708 }
709 }
710
711 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
713 MultiVector (const Teuchos::RCP<const map_type>& map,
714 const wrapped_dual_view_type& view,
715 const Teuchos::ArrayView<const size_t>& whichVectors) :
716 base_type (map),
717 view_ (view),
718 whichVectors_ (whichVectors.begin (), whichVectors.end ())
719 {
720 using Kokkos::ALL;
721 using Kokkos::subview;
722 const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
723
725 const bool debug = Behavior::debug ();
726 if (debug) {
727 using ::Tpetra::Details::checkGlobalWrappedDualViewValidity;
728 std::ostringstream gblErrStrm;
729 const bool verbose = Behavior::verbose ();
730 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
731 const bool gblValid =
732 checkGlobalWrappedDualViewValidity (&gblErrStrm, view, verbose,
733 comm.getRawPtr ());
734 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
735 (! gblValid, std::runtime_error, gblErrStrm.str ());
736 }
737
738 const size_t lclNumRows = map.is_null () ? size_t (0) :
739 map->getLocalNumElements ();
740 // Check dimensions of the input DualView. We accept that Kokkos
741 // might not allow construction of a 0 x m (Dual)View with m > 0,
742 // so we only require the number of rows to match if the
743 // (Dual)View has more than zero columns. Likewise, we only
744 // require the number of columns to match if the (Dual)View has
745 // more than zero rows.
746 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
747 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
748 std::invalid_argument, "view.extent(0) = " << view.extent (0)
749 << " < map->getLocalNumElements() = " << lclNumRows << ".");
750 if (whichVectors.size () != 0) {
751 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
752 view.extent (1) != 0 && view.extent (1) == 0,
753 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
754 " = " << whichVectors.size () << " > 0.");
755 size_t maxColInd = 0;
756 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
757 for (size_type k = 0; k < whichVectors.size (); ++k) {
758 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
759 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
760 std::invalid_argument, "whichVectors[" << k << "] = "
761 "Teuchos::OrdinalTraits<size_t>::invalid().");
762 maxColInd = std::max (maxColInd, whichVectors[k]);
763 }
764 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
765 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
766 std::invalid_argument, "view.extent(1) = " << view.extent (1)
767 << " <= max(whichVectors) = " << maxColInd << ".");
768 }
769
770 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
771 // zero strides, so modify in that case.
772 const size_t LDA = getDualViewStride (view);
773 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
774 (LDA < lclNumRows, std::invalid_argument,
775 "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
776
777 if (whichVectors.size () == 1) {
778 // If whichVectors has only one entry, we don't need to bother
779 // with nonconstant stride. Just shift the view over so it
780 // points to the desired column.
781 //
782 // NOTE (mfh 10 May 2014) This is a special case where we set
783 // origView_ just to view that one column, not all of the
784 // original columns. This ensures that the use of origView_ in
785 // offsetView works correctly.
786 //
787 const std::pair<size_t, size_t> colRng (whichVectors[0],
788 whichVectors[0] + 1);
789 view_ = takeSubview (view_, ALL (), colRng);
790 // whichVectors_.size() == 0 means "constant stride."
791 whichVectors_.clear ();
792 }
793 }
794
795
796 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
798 MultiVector (const Teuchos::RCP<const map_type>& map,
799 const dual_view_type& view,
800 const dual_view_type& origView,
801 const Teuchos::ArrayView<const size_t>& whichVectors) :
802 base_type (map),
803 view_ (wrapped_dual_view_type(view,origView)),
804 whichVectors_ (whichVectors.begin (), whichVectors.end ())
805 {
806 using Kokkos::ALL;
807 using Kokkos::subview;
808 const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
809
811 const bool debug = Behavior::debug ();
812 if (debug) {
813 using ::Tpetra::Details::checkGlobalDualViewValidity;
814 std::ostringstream gblErrStrm;
815 const bool verbose = Behavior::verbose ();
816 const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
817 const bool gblValid_0 =
818 checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
819 comm.getRawPtr ());
820 const bool gblValid_1 =
821 checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
822 comm.getRawPtr ());
823 const bool gblValid = gblValid_0 && gblValid_1;
824 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
825 (! gblValid, std::runtime_error, gblErrStrm.str ());
826 }
827
828 const size_t lclNumRows = this->getLocalLength ();
829 // Check dimensions of the input DualView. We accept that Kokkos
830 // might not allow construction of a 0 x m (Dual)View with m > 0,
831 // so we only require the number of rows to match if the
832 // (Dual)View has more than zero columns. Likewise, we only
833 // require the number of columns to match if the (Dual)View has
834 // more than zero rows.
835 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
836 view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
837 std::invalid_argument, "view.extent(0) = " << view.extent (0)
838 << " < map->getLocalNumElements() = " << lclNumRows << ".");
839 if (whichVectors.size () != 0) {
840 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
841 view.extent (1) != 0 && view.extent (1) == 0,
842 std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
843 " = " << whichVectors.size () << " > 0.");
844 size_t maxColInd = 0;
845 typedef Teuchos::ArrayView<const size_t>::size_type size_type;
846 for (size_type k = 0; k < whichVectors.size (); ++k) {
847 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
848 whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
849 std::invalid_argument, "whichVectors[" << k << "] = "
850 "Teuchos::OrdinalTraits<size_t>::invalid().");
851 maxColInd = std::max (maxColInd, whichVectors[k]);
852 }
853 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
854 view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
855 std::invalid_argument, "view.extent(1) = " << view.extent (1)
856 << " <= max(whichVectors) = " << maxColInd << ".");
857 }
858
859 // If extent(1) is 0, the stride might be 0. BLAS doesn't like
860 // zero strides, so modify in that case.
861 const size_t LDA = getDualViewStride (origView);
862 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
863 (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
864 "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
865 lclNumRows << ". origView.extent(0) = " << origView.extent(0)
866 << ", origView.stride(1) = " << origView.view_device().stride(1) << ".");
867
868 if (whichVectors.size () == 1) {
869 // If whichVectors has only one entry, we don't need to bother
870 // with nonconstant stride. Just shift the view over so it
871 // points to the desired column.
872 //
873 // NOTE (mfh 10 May 2014) This is a special case where we set
874 // origView_ just to view that one column, not all of the
875 // original columns. This ensures that the use of origView_ in
876 // offsetView works correctly.
877 const std::pair<size_t, size_t> colRng (whichVectors[0],
878 whichVectors[0] + 1);
879 view_ = takeSubview (view_, ALL (), colRng);
880 // origView_ = takeSubview (origView_, ALL (), colRng);
881 // whichVectors_.size() == 0 means "constant stride."
882 whichVectors_.clear ();
883 }
884 }
885
886 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
888 MultiVector (const Teuchos::RCP<const map_type>& map,
889 const Teuchos::ArrayView<const Scalar>& data,
890 const size_t LDA,
891 const size_t numVecs) :
892 base_type (map)
893 {
894 typedef LocalOrdinal LO;
895 typedef GlobalOrdinal GO;
896 const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
897 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
898
899 // Deep copy constructor, constant stride (NO whichVectors_).
900 // There is no need for a deep copy constructor with nonconstant stride.
901
902 const size_t lclNumRows =
903 map.is_null () ? size_t (0) : map->getLocalNumElements ();
904 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
905 (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
906 "map->getLocalNumElements() = " << lclNumRows << ".");
907 if (numVecs != 0) {
908 const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
909 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
910 (static_cast<size_t> (data.size ()) < minNumEntries,
911 std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
912 "entries, given the input Map and number of vectors in the MultiVector."
913 " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
914 "map->getLocalNumElements () = " << minNumEntries << ".");
915 }
916
917 this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
918 //Note: X_out will be completely overwritten
919 auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
921 // Make an unmanaged host Kokkos::View of the input data. First
922 // create a View (X_in_orig) with the original stride. Then,
923 // create a subview (X_in) with the right number of columns.
924 const impl_scalar_type* const X_in_raw =
925 reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
926 Kokkos::View<const impl_scalar_type**,
927 Kokkos::LayoutLeft,
928 Kokkos::HostSpace,
929 Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
930 const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
931 auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
932
933 // If LDA != X_out's column stride, then we need to copy one
934 // column at a time; Kokkos::deep_copy refuses to work in that
935 // case.
936 const size_t outStride =
937 X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
938 if (LDA == outStride) { // strides are the same; deep_copy once
939 // This only works because MultiVector uses LayoutLeft.
940 // We would need a custom copy functor otherwise.
941 // DEEP_COPY REVIEW - HOST-TO-DEVICE
942 Kokkos::deep_copy (execution_space(), X_out, X_in);
943 }
944 else { // strides differ; copy one column at a time
945 typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
946 out_col_view_type;
947 typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
948 in_col_view_type;
949 for (size_t j = 0; j < numVecs; ++j) {
950 out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
951 in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
952 // DEEP_COPY REVIEW - HOST-TO-DEVICE
953 Kokkos::deep_copy (execution_space(), X_out_j, X_in_j);
954 }
955 }
956 }
957
958 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
960 MultiVector (const Teuchos::RCP<const map_type>& map,
961 const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
962 const size_t numVecs) :
963 base_type (map)
964 {
965 typedef impl_scalar_type IST;
966 typedef LocalOrdinal LO;
967 typedef GlobalOrdinal GO;
968 const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
969 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
970
971 const size_t lclNumRows =
972 map.is_null () ? size_t (0) : map->getLocalNumElements ();
973 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
974 (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
975 std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
976 "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
977 for (size_t j = 0; j < numVecs; ++j) {
978 Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
979 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
980 static_cast<size_t> (X_j_av.size ()) < lclNumRows,
981 std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
982 << X_j_av.size () << " < map->getLocalNumElements() = " << lclNumRows
983 << ".");
984 }
985
986 view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
987 auto X_out = this->getLocalViewDevice(Access::ReadWrite);
988
989 // Make sure that the type of a single input column has the same
990 // array layout as each output column does, so we can deep_copy.
991 using array_layout = typename decltype (X_out)::array_layout;
992 using input_col_view_type = typename Kokkos::View<const IST*,
993 array_layout,
994 Kokkos::HostSpace,
995 Kokkos::MemoryUnmanaged>;
996
997 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
998 for (size_t j = 0; j < numVecs; ++j) {
999 Teuchos::ArrayView<const IST> X_j_av =
1000 Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
1001 input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
1002 auto X_j_out = Kokkos::subview (X_out, rowRng, j);
1003 // DEEP_COPY REVIEW - HOST-TO-DEVICE
1004 Kokkos::deep_copy (execution_space(), X_j_out, X_j_in);
1005 }
1006 }
1007
1008 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1013
1014 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1015 size_t
1017 getLocalLength () const
1018 {
1019 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1020 return static_cast<size_t> (0);
1021 } else {
1022 return this->getMap ()->getLocalNumElements ();
1023 }
1024 }
1025
1026 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1029 getGlobalLength () const
1030 {
1031 if (this->getMap ().is_null ()) { // possible, due to replaceMap().
1032 return static_cast<size_t> (0);
1033 } else {
1034 return this->getMap ()->getGlobalNumElements ();
1035 }
1036 }
1037
1038 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1039 size_t
1041 getStride () const
1042 {
1043 return isConstantStride () ? getDualViewStride (view_) : size_t (0);
1044 }
1045
1046 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1047 bool
1050 {
1051 //Don't actually get a view, just get pointers.
1052 auto thisData = view_.getDualView().view_host().data();
1053 auto otherData = other.view_.getDualView().view_host().data();
1054 size_t thisSpan = view_.getDualView().view_host().span();
1055 size_t otherSpan = other.view_.getDualView().view_host().span();
1056 return (otherData <= thisData && thisData < otherData + otherSpan)
1057 || (thisData <= otherData && otherData < thisData + thisSpan);
1058 }
1059
1060 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1061 bool
1063 checkSizes (const SrcDistObject& sourceObj)
1065 // Check whether the source object is a MultiVector. If not, then
1066 // we can't even compare sizes, so it's definitely not OK to
1067 // Import or Export from it.
1069 const MV* src = dynamic_cast<const MV*> (&sourceObj);
1070 if (src == nullptr) {
1071 return false;
1072 }
1073 else {
1074 // The target of the Import or Export calls checkSizes() in
1075 // DistObject::doTransfer(). By that point, we've already
1076 // constructed an Import or Export object using the two
1077 // multivectors' Maps, which means that (hopefully) we've
1078 // already checked other attributes of the multivectors. Thus,
1079 // all we need to do here is check the number of columns.
1080 return src->getNumVectors () == this->getNumVectors ();
1081 }
1082 }
1083
1084 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1085 size_t
1090
1091 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1092 void
1095 (const SrcDistObject& sourceObj,
1096 const size_t numSameIDs,
1097 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1098 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1099 const CombineMode CM,
1100 const execution_space &space)
1101 {
1105 using std::endl;
1106 using KokkosRefactor::Details::permute_array_multi_column;
1107 using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1108 using Kokkos::Compat::create_const_view;
1110
1111 // We've already called checkSizes(), so this cast must succeed.
1112 MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1113 const bool copyOnHost = runKernelOnHost(sourceMV);
1114 const char longFuncNameHost[] = "Tpetra::MultiVector::copyAndPermute[Host]";
1115 const char longFuncNameDevice[] = "Tpetra::MultiVector::copyAndPermute[Device]";
1116 const char tfecfFuncName[] = "copyAndPermute: ";
1117 ProfilingRegion regionCAP (copyOnHost ? longFuncNameHost : longFuncNameDevice);
1118
1119 const bool verbose = Behavior::verbose ();
1120 std::unique_ptr<std::string> prefix;
1121 if (verbose) {
1122 auto map = this->getMap ();
1123 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1124 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1125 std::ostringstream os;
1126 os << "Proc " << myRank << ": MV::copyAndPermute: ";
1127 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1128 os << "Start" << endl;
1129 std::cerr << os.str ();
1130 }
1131
1132 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1133 (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1134 std::logic_error, "permuteToLIDs.extent(0) = "
1135 << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1136 << permuteFromLIDs.extent (0) << ".");
1137 const size_t numCols = this->getNumVectors ();
1138
1139 // sourceMV doesn't belong to us, so we can't sync it. Do the
1140 // copying where it's currently sync'd.
1141 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1142 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1143 std::logic_error, "Input MultiVector needs sync to both host "
1144 "and device.");
1145 if (verbose) {
1146 std::ostringstream os;
1147 os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1148 std::cerr << os.str ();
1149 }
1151 if (verbose) {
1152 std::ostringstream os;
1153 os << *prefix << "Copy" << endl;
1154 std::cerr << os.str ();
1155 }
1156
1157 // TODO (mfh 15 Sep 2013) When we replace
1158 // KokkosClassic::MultiVector with a Kokkos::View, there are two
1159 // ways to copy the data:
1160 //
1161 // 1. Get a (sub)view of each column and call deep_copy on that.
1162 // 2. Write a custom kernel to copy the data.
1163 //
1164 // The first is easier, but the second might be more performant in
1165 // case we decide to use layouts other than LayoutLeft. It might
1166 // even make sense to hide whichVectors_ in an entirely new layout
1167 // for Kokkos Views.
1168
1169 // Copy rows [0, numSameIDs-1] of the local multivectors.
1170 //
1171 // Note (ETP 2 Jul 2014) We need to always copy one column at a
1172 // time, even when both multivectors are constant-stride, since
1173 // deep_copy between strided subviews with more than one column
1174 // doesn't currently work.
1175
1176 // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1177 // both source and target are constant stride and have multiple
1178 // columns.
1179 if (numSameIDs > 0) {
1180 const std::pair<size_t, size_t> rows (0, numSameIDs);
1181 if (copyOnHost) {
1182 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1183 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1184
1185 for (size_t j = 0; j < numCols; ++j) {
1186 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1187 const size_t srcCol =
1188 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1189
1190 auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1191 auto src_j = Kokkos::subview (src_h, rows, srcCol);
1192 if (CM == ADD_ASSIGN) {
1193 // Sum src_j into tgt_j
1194 using range_t =
1195 Kokkos::RangePolicy<execution_space, size_t>;
1196 range_t rp(space, 0,numSameIDs);
1197 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1198 aaf(tgt_j, src_j);
1199 Kokkos::parallel_for(rp, aaf);
1200 }
1201 else {
1202 // Copy src_j into tgt_j
1203 // DEEP_COPY REVIEW - HOSTMIRROR-TO-HOSTMIRROR
1204 Kokkos::deep_copy (space, tgt_j, src_j);
1205 space.fence();
1206 }
1207 }
1208 }
1209 else { // copy on device
1210 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1211 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1212
1213 for (size_t j = 0; j < numCols; ++j) {
1214 const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1215 const size_t srcCol =
1216 sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1218 auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1219 auto src_j = Kokkos::subview (src_d, rows, srcCol);
1220 if (CM == ADD_ASSIGN) {
1221 // Sum src_j into tgt_j
1222 using range_t =
1223 Kokkos::RangePolicy<execution_space, size_t>;
1224 range_t rp(space, 0,numSameIDs);
1225 Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1226 aaf(tgt_j, src_j);
1227 Kokkos::parallel_for(rp, aaf);
1228 }
1229 else {
1230 // Copy src_j into tgt_j
1231 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
1232 Kokkos::deep_copy (space, tgt_j, src_j);
1233 space.fence();
1234 }
1235 }
1236 }
1237 }
1238
1239
1240 // For the remaining GIDs, execute the permutations. This may
1241 // involve noncontiguous access of both source and destination
1242 // vectors, depending on the LID lists.
1243 //
1244 // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1245 // the same process, this merges their values by replacement of
1246 // the last encountered GID, not by the specified merge rule
1247 // (such as ADD).
1248
1249 // If there are no permutations, we are done
1250 if (permuteFromLIDs.extent (0) == 0 ||
1251 permuteToLIDs.extent (0) == 0) {
1252 if (verbose) {
1253 std::ostringstream os;
1254 os << *prefix << "No permutations. Done!" << endl;
1255 std::cerr << os.str ();
1256 }
1257 return;
1259
1260 if (verbose) {
1261 std::ostringstream os;
1262 os << *prefix << "Permute" << endl;
1263 std::cerr << os.str ();
1264 }
1265
1266 // We could in theory optimize for the case where exactly one of
1267 // them is constant stride, but we don't currently do that.
1268 const bool nonConstStride =
1269 ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1271 if (verbose) {
1272 std::ostringstream os;
1273 os << *prefix << "nonConstStride="
1274 << (nonConstStride ? "true" : "false") << endl;
1275 std::cerr << os.str ();
1276 }
1277
1278 // We only need the "which vectors" arrays if either the source or
1279 // target MV is not constant stride. Since we only have one
1280 // kernel that must do double-duty, we have to create a "which
1281 // vectors" array for the MV that _is_ constant stride.
1282 Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1283 Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1284 if (nonConstStride) {
1285 if (this->whichVectors_.size () == 0) {
1286 Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1287 tmpTgt.modify_host ();
1288 for (size_t j = 0; j < numCols; ++j) {
1289 tmpTgt.view_host()(j) = j;
1290 }
1291 if (! copyOnHost) {
1292 tmpTgt.sync_device ();
1293 }
1294 tgtWhichVecs = tmpTgt;
1295 }
1296 else {
1297 Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1298 tgtWhichVecs =
1299 getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1300 "tgtWhichVecs",
1301 copyOnHost);
1302 }
1303 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1304 (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1305 this->getNumVectors (),
1306 std::logic_error, "tgtWhichVecs.extent(0) = " <<
1307 tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1308 this->getNumVectors () << ".");
1309
1310 if (sourceMV.whichVectors_.size () == 0) {
1311 Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1312 tmpSrc.modify_host ();
1313 for (size_t j = 0; j < numCols; ++j) {
1314 tmpSrc.view_host()(j) = j;
1315 }
1316 if (! copyOnHost) {
1317 tmpSrc.sync_device ();
1318 }
1319 srcWhichVecs = tmpSrc;
1320 }
1321 else {
1322 Teuchos::ArrayView<const size_t> srcWhichVecsT =
1323 sourceMV.whichVectors_ ();
1324 srcWhichVecs =
1325 getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1326 "srcWhichVecs",
1327 copyOnHost);
1328 }
1329 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1330 (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1331 sourceMV.getNumVectors (), std::logic_error,
1332 "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1333 << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1334 << ".");
1335 }
1336
1337 if (copyOnHost) { // permute on host too
1338 if (verbose) {
1339 std::ostringstream os;
1340 os << *prefix << "Get permute LIDs on host" << std::endl;
1341 std::cerr << os.str ();
1342 }
1343 auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1344 auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1345
1346 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1347 auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1348 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1349 auto permuteFromLIDs_h =
1350 create_const_view (permuteFromLIDs.view_host ());
1351
1352 if (verbose) {
1353 std::ostringstream os;
1354 os << *prefix << "Permute on host" << endl;
1355 std::cerr << os.str ();
1356 }
1357 if (nonConstStride) {
1358 // No need to sync first, because copyOnHost argument to
1359 // getDualViewCopyFromArrayView puts them in the right place.
1360 auto tgtWhichVecs_h =
1361 create_const_view (tgtWhichVecs.view_host ());
1362 auto srcWhichVecs_h =
1363 create_const_view (srcWhichVecs.view_host ());
1364 if (CM == ADD_ASSIGN) {
1365 using op_type = KokkosRefactor::Details::AddOp;
1366 permute_array_multi_column_variable_stride (tgt_h, src_h,
1367 permuteToLIDs_h,
1368 permuteFromLIDs_h,
1369 tgtWhichVecs_h,
1370 srcWhichVecs_h, numCols,
1371 op_type());
1372 }
1373 else {
1374 using op_type = KokkosRefactor::Details::InsertOp;
1375 permute_array_multi_column_variable_stride (tgt_h, src_h,
1376 permuteToLIDs_h,
1377 permuteFromLIDs_h,
1378 tgtWhichVecs_h,
1379 srcWhichVecs_h, numCols,
1380 op_type());
1381 }
1382 }
1383 else {
1384 if (CM == ADD_ASSIGN) {
1385 using op_type = KokkosRefactor::Details::AddOp;
1386 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1387 permuteFromLIDs_h, numCols, op_type());
1388 }
1389 else {
1390 using op_type = KokkosRefactor::Details::InsertOp;
1391 permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1392 permuteFromLIDs_h, numCols, op_type());
1393 }
1394 }
1395 }
1396 else { // permute on device
1397 if (verbose) {
1398 std::ostringstream os;
1399 os << *prefix << "Get permute LIDs on device" << endl;
1400 std::cerr << os.str ();
1401 }
1402 auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1403 auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1404
1405 TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1406 auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1407 TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1408 auto permuteFromLIDs_d =
1409 create_const_view (permuteFromLIDs.view_device ());
1410
1411 if (verbose) {
1412 std::ostringstream os;
1413 os << *prefix << "Permute on device" << endl;
1414 std::cerr << os.str ();
1415 }
1416 if (nonConstStride) {
1417 // No need to sync first, because copyOnHost argument to
1418 // getDualViewCopyFromArrayView puts them in the right place.
1419 auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1420 auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1421 if (CM == ADD_ASSIGN) {
1422 using op_type = KokkosRefactor::Details::AddOp;
1423 permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1424 permuteToLIDs_d,
1425 permuteFromLIDs_d,
1426 tgtWhichVecs_d,
1427 srcWhichVecs_d, numCols,
1428 op_type());
1429 }
1430 else {
1431 using op_type = KokkosRefactor::Details::InsertOp;
1432 permute_array_multi_column_variable_stride (space, tgt_d, src_d,
1433 permuteToLIDs_d,
1434 permuteFromLIDs_d,
1435 tgtWhichVecs_d,
1436 srcWhichVecs_d, numCols,
1437 op_type());
1438 }
1440 else {
1441 if (CM == ADD_ASSIGN) {
1442 using op_type = KokkosRefactor::Details::AddOp;
1443 permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1444 permuteFromLIDs_d, numCols, op_type());
1445 }
1446 else {
1447 using op_type = KokkosRefactor::Details::InsertOp;
1448 permute_array_multi_column (space, tgt_d, src_d, permuteToLIDs_d,
1449 permuteFromLIDs_d, numCols, op_type());
1450 }
1451 }
1452 }
1453
1454 if (verbose) {
1455 std::ostringstream os;
1456 os << *prefix << "Done!" << endl;
1457 std::cerr << os.str ();
1459 }
1460
1461// clang-format on
1462template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1464 const SrcDistObject &sourceObj, const size_t numSameIDs,
1465 const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1466 &permuteToLIDs,
1467 const Kokkos::DualView<const local_ordinal_type *, buffer_device_type>
1468 &permuteFromLIDs,
1469 const CombineMode CM) {
1470 copyAndPermute(sourceObj, numSameIDs, permuteToLIDs, permuteFromLIDs, CM,
1471 execution_space());
1472}
1473// clang-format off
1474
1475 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1476 void
1479 (const SrcDistObject& sourceObj,
1480 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1481 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1482 Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1483 size_t& constantNumPackets,
1484 const execution_space &space)
1485 {
1489 using Kokkos::Compat::create_const_view;
1490 using Kokkos::Compat::getKokkosViewDeepCopy;
1491 using std::endl;
1493
1494 // We've already called checkSizes(), so this cast must succeed.
1495 MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1496
1497 const bool packOnHost = runKernelOnHost(sourceMV);
1498 const char longFuncNameHost[] = "Tpetra::MultiVector::packAndPrepare[Host]";
1499 const char longFuncNameDevice[] = "Tpetra::MultiVector::packAndPrepare[Device]";
1500 const char tfecfFuncName[] = "packAndPrepare: ";
1501 ProfilingRegion regionPAP (packOnHost ? longFuncNameHost : longFuncNameDevice);
1502
1503 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1504 // have the option to check indices. We do so when Tpetra is in
1505 // debug mode. It is in debug mode by default in a debug build,
1506 // but you may control this at run time, before launching the
1507 // executable, by setting the TPETRA_DEBUG environment variable to
1508 // "1" (or "TRUE").
1509 const bool debugCheckIndices = Behavior::debug ();
1510 // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1511 // environment variable to "1" (or "TRUE") for copious debug
1512 // output to std::cerr on every MPI process. This is unwise for
1513 // runs with large numbers of MPI processes.
1514 const bool printDebugOutput = Behavior::verbose ();
1515
1516 std::unique_ptr<std::string> prefix;
1517 if (printDebugOutput) {
1518 auto map = this->getMap ();
1519 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1520 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1521 std::ostringstream os;
1522 os << "Proc " << myRank << ": MV::packAndPrepare: ";
1523 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1524 os << "Start" << endl;
1525 std::cerr << os.str ();
1526 }
1527
1528
1529 const size_t numCols = sourceMV.getNumVectors ();
1530
1531 // This spares us from needing to fill numExportPacketsPerLID.
1532 // Setting constantNumPackets to a nonzero value signals that
1533 // all packets have the same number of entries.
1534 constantNumPackets = numCols;
1535
1536 // If we have no exports, there is nothing to do. Make sure this
1537 // goes _after_ setting constantNumPackets correctly.
1538 if (exportLIDs.extent (0) == 0) {
1539 if (printDebugOutput) {
1540 std::ostringstream os;
1541 os << *prefix << "No exports on this proc, DONE" << std::endl;
1542 std::cerr << os.str ();
1543 }
1544 return;
1545 }
1546
1547 /* The layout in the export for MultiVectors is as follows:
1548 exports = { all of the data from row exportLIDs.front() ;
1549 ....
1550 all of the data from row exportLIDs.back() }
1551 This doesn't have the best locality, but is necessary because
1552 the data for a Packet (all data associated with an LID) is
1553 required to be contiguous. */
1554
1555 // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1556 // packing scheme in the above comment? The data going to a
1557 // particular process must be contiguous, of course, but those
1558 // data could include entries from multiple LIDs. DistObject just
1559 // needs to know how to index into that data. Kokkos is good at
1560 // decoupling storage intent from data layout choice.
1561
1562 const size_t numExportLIDs = exportLIDs.extent (0);
1563 const size_t newExportsSize = numCols * numExportLIDs;
1564 if (printDebugOutput) {
1565 std::ostringstream os;
1566 os << *prefix << "realloc: "
1567 << "numExportLIDs: " << numExportLIDs
1568 << ", exports.extent(0): " << exports.extent (0)
1569 << ", newExportsSize: " << newExportsSize << std::endl;
1570 std::cerr << os.str ();
1571 }
1572 reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1573
1574 // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1575 // sync it. Pack it where it's currently sync'd.
1576 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1577 (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1578 std::logic_error, "Input MultiVector needs sync to both host "
1579 "and device.");
1580 if (printDebugOutput) {
1581 std::ostringstream os;
1582 os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1583 std::cerr << os.str ();
1584 }
1585
1586 // Mark 'exports' here, since we might have resized it above.
1587 // Resizing currently requires calling the constructor, which
1588 // clears out the 'modified' flags.
1589 if (packOnHost) {
1590 // nde 06 Feb 2020: If 'exports' does not require resize
1591 // when reallocDualViewIfNeeded is called, the modified flags
1592 // are not cleared out. This can result in host and device views
1593 // being out-of-sync, resuling in an error in exports.modify_* calls.
1594 // Clearing the sync flags prevents this possible case.
1595 exports.clear_sync_state ();
1596 exports.modify_host ();
1597 }
1598 else {
1599 // nde 06 Feb 2020: If 'exports' does not require resize
1600 // when reallocDualViewIfNeeded is called, the modified flags
1601 // are not cleared out. This can result in host and device views
1602 // being out-of-sync, resuling in an error in exports.modify_* calls.
1603 // Clearing the sync flags prevents this possible case.
1604 exports.clear_sync_state ();
1605 exports.modify_device ();
1606 }
1607
1608 if (numCols == 1) { // special case for one column only
1609 // MultiVector always represents a single column with constant
1610 // stride, but it doesn't hurt to implement both cases anyway.
1611 //
1612 // ETP: I'm not sure I agree with the above statement. Can't a single-
1613 // column multivector be a subview of another multi-vector, in which case
1614 // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1615 // separately.
1616 //
1617 // mfh 18 Jan 2016: In answer to ETP's comment above:
1618 // MultiVector treats single-column MultiVectors created using a
1619 // "nonconstant stride constructor" as a special case, and makes
1620 // them constant stride (by making whichVectors_ have length 0).
1621 if (sourceMV.isConstantStride ()) {
1622 using KokkosRefactor::Details::pack_array_single_column;
1623 if (printDebugOutput) {
1624 std::ostringstream os;
1625 os << *prefix << "Pack numCols=1 const stride" << endl;
1626 std::cerr << os.str ();
1627 }
1628 if (packOnHost) {
1629 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1630 pack_array_single_column (exports.view_host (),
1631 src_host,
1632 exportLIDs.view_host (),
1633 0,
1634 debugCheckIndices);
1635 }
1636 else { // pack on device
1637 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1638 pack_array_single_column (exports.view_device (),
1639 src_dev,
1640 exportLIDs.view_device (),
1641 0,
1642 debugCheckIndices,
1643 space);
1644 }
1645 }
1646 else {
1647 using KokkosRefactor::Details::pack_array_single_column;
1648 if (printDebugOutput) {
1649 std::ostringstream os;
1650 os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1651 std::cerr << os.str ();
1653 if (packOnHost) {
1654 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1655 pack_array_single_column (exports.view_host (),
1656 src_host,
1657 exportLIDs.view_host (),
1658 sourceMV.whichVectors_[0],
1659 debugCheckIndices);
1660 }
1661 else { // pack on device
1662 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1663 pack_array_single_column (exports.view_device (),
1664 src_dev,
1665 exportLIDs.view_device (),
1666 sourceMV.whichVectors_[0],
1667 debugCheckIndices,
1668 space);
1669 }
1670 }
1671 }
1672 else { // the source MultiVector has multiple columns
1673 if (sourceMV.isConstantStride ()) {
1674 using KokkosRefactor::Details::pack_array_multi_column;
1675 if (printDebugOutput) {
1676 std::ostringstream os;
1677 os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1678 std::cerr << os.str ();
1679 }
1680 if (packOnHost) {
1681 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1682 pack_array_multi_column (exports.view_host (),
1683 src_host,
1684 exportLIDs.view_host (),
1685 numCols,
1686 debugCheckIndices);
1687 }
1688 else { // pack on device
1689 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1690 pack_array_multi_column (exports.view_device (),
1691 src_dev,
1692 exportLIDs.view_device (),
1693 numCols,
1694 debugCheckIndices,
1695 space);
1696 }
1697 }
1698 else {
1699 using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1700 if (printDebugOutput) {
1701 std::ostringstream os;
1702 os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1703 << endl;
1704 std::cerr << os.str ();
1705 }
1706 // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1707 // whichVectors_ can be expensive, but pack and unpack for
1708 // nonconstant-stride MultiVectors is slower anyway.
1709 using IST = impl_scalar_type;
1710 using DV = Kokkos::DualView<IST*, device_type>;
1711 using HES = typename DV::t_host::execution_space;
1712 using DES = typename DV::t_dev::execution_space;
1713 Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1714 if (packOnHost) {
1715 auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1716 pack_array_multi_column_variable_stride
1717 (exports.view_host (),
1718 src_host,
1719 exportLIDs.view_host (),
1720 getKokkosViewDeepCopy<HES> (whichVecs),
1721 numCols,
1722 debugCheckIndices);
1723 }
1724 else { // pack on device
1725 auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1726 pack_array_multi_column_variable_stride
1727 (exports.view_device (),
1728 src_dev,
1729 exportLIDs.view_device (),
1730 getKokkosViewDeepCopy<DES> (whichVecs),
1731 numCols,
1732 debugCheckIndices, space);
1733 }
1734 }
1735 }
1736
1737 if (printDebugOutput) {
1738 std::ostringstream os;
1739 os << *prefix << "Done!" << endl;
1740 std::cerr << os.str ();
1741 }
1742
1743 }
1744
1745// clang-format on
1746 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1747 void
1750 (const SrcDistObject& sourceObj,
1751 const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1752 Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1753 Kokkos::DualView<size_t*, buffer_device_type> numExportPacketsPerLID,
1754 size_t& constantNumPackets) {
1755 packAndPrepare(sourceObj, exportLIDs, exports, numExportPacketsPerLID, constantNumPackets, execution_space());
1756 }
1757// clang-format off
1758
1759 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1760 template <class NO>
1761 typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1762 typename NO::device_type::memory_space>::value,
1763 bool>::type
1765 reallocImportsIfNeededImpl (const size_t newSize,
1766 const bool verbose,
1767 const std::string* prefix,
1768 const bool areRemoteLIDsContiguous,
1769 const CombineMode CM)
1770 {
1771 // This implementation of reallocImportsIfNeeded is an
1772 // optimization that is specific to MultiVector. We check if the
1773 // imports_ view can be aliased to the end of the data view_. If
1774 // that is the case, we can skip the unpackAndCombine call.
1775
1776 if (verbose) {
1777 std::ostringstream os;
1778 os << *prefix << "Realloc (if needed) imports_ from "
1779 << this->imports_.extent (0) << " to " << newSize << std::endl;
1780 std::cerr << os.str ();
1781 }
1782
1783 bool reallocated = false;
1784
1785 using IST = impl_scalar_type;
1786 using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1787
1788 // Conditions for aliasing memory:
1789 // - When both sides of the dual view are in the same memory
1790 // space, we do not need to worry about syncing things.
1791 // - When both memory spaces are different, we only alias if this
1792 // does not incur additional sync'ing.
1793 // - The remote LIDs need to be contiguous, so that we do not need
1794 // to reorder received information.
1795 // - CombineMode needs to be INSERT.
1796 // - The number of vectors needs to be 1, otherwise we need to
1797 // reorder the received data.
1798 if ((std::is_same_v<typename dual_view_type::t_dev::device_type, typename dual_view_type::t_host::device_type> ||
1799 (Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_device()) ||
1800 (!Details::Behavior::assumeMpiIsGPUAware () && !this->need_sync_host())) &&
1801 areRemoteLIDsContiguous &&
1802 (CM == INSERT || CM == REPLACE) &&
1803 (getNumVectors() == 1) &&
1804 (newSize > 0)) {
1805
1806 size_t offset = getLocalLength () - newSize;
1807 reallocated = this->imports_.view_device().data() != view_.getDualView().view_device().data() + offset;
1808 if (reallocated) {
1809 typedef std::pair<size_t, size_t> range_type;
1810 this->imports_ = DV(view_.getDualView(),
1811 range_type (offset, getLocalLength () ),
1812 0);
1813
1814 if (verbose) {
1815 std::ostringstream os;
1816 os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1817 std::cerr << os.str ();
1818 }
1819 }
1820 return reallocated;
1821 }
1822 {
1824 reallocated =
1825 reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1826 if (verbose) {
1827 std::ostringstream os;
1828 os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1829 std::cerr << os.str ();
1830 }
1831 this->imports_ = this->unaliased_imports_;
1832 }
1833 return reallocated;
1834 }
1835
1836 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1837 template <class NO>
1838 typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1839 typename NO::device_type::memory_space>::value,
1840 bool>::type
1842 reallocImportsIfNeededImpl (const size_t newSize,
1843 const bool verbose,
1844 const std::string* prefix,
1845 const bool areRemoteLIDsContiguous,
1846 const CombineMode CM)
1847 {
1848 return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1849 }
1850
1851 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1852 bool
1854 reallocImportsIfNeeded(const size_t newSize,
1855 const bool verbose,
1856 const std::string* prefix,
1857 const bool areRemoteLIDsContiguous,
1858 const CombineMode CM) {
1860 return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1861 }
1862
1863 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1864 bool
1867 return (this->imports_.view_device().data() + this->imports_.view_device().extent(0) ==
1868 view_.getDualView().view_device().data() + view_.getDualView().view_device().extent(0));
1869 }
1870
1871
1872 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1873 void
1876 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1877 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1878 Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1879 const size_t constantNumPackets,
1880 const CombineMode CM,
1881 const execution_space &space)
1882 {
1885 using KokkosRefactor::Details::unpack_array_multi_column;
1886 using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1887 using Kokkos::Compat::getKokkosViewDeepCopy;
1888 using std::endl;
1889
1890 const bool unpackOnHost = runKernelOnHost(imports);
1891
1892 const char longFuncNameHost[] = "Tpetra::MultiVector::unpackAndCombine[Host]";
1893 const char longFuncNameDevice[] = "Tpetra::MultiVector::unpackAndCombine[Device]";
1894 const char * longFuncName = unpackOnHost ? longFuncNameHost : longFuncNameDevice;
1895 const char tfecfFuncName[] = "unpackAndCombine: ";
1896 ProfilingRegion regionUAC (longFuncName);
1897
1898 // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1899 // have the option to check indices. We do so when Tpetra is in
1900 // debug mode. It is in debug mode by default in a debug build,
1901 // but you may control this at run time, before launching the
1902 // executable, by setting the TPETRA_DEBUG environment variable to
1903 // "1" (or "TRUE").
1904 const bool debugCheckIndices = Behavior::debug ();
1905
1906 const bool printDebugOutput = Behavior::verbose ();
1907 std::unique_ptr<std::string> prefix;
1908 if (printDebugOutput) {
1909 auto map = this->getMap ();
1910 auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1911 const int myRank = comm.is_null () ? -1 : comm->getRank ();
1912 std::ostringstream os;
1913 os << "Proc " << myRank << ": " << longFuncName << ": ";
1914 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1915 os << "Start" << endl;
1916 std::cerr << os.str ();
1917 }
1918
1919 // If we have no imports, there is nothing to do
1920 if (importLIDs.extent (0) == 0) {
1921 if (printDebugOutput) {
1922 std::ostringstream os;
1923 os << *prefix << "No imports. Done!" << endl;
1924 std::cerr << os.str ();
1925 }
1926 return;
1927 }
1928
1929 // Check, whether imports_ is a subview of the MV view.
1930 if (importsAreAliased()) {
1931 if (printDebugOutput) {
1932 std::ostringstream os;
1933 os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1934 std::cerr << os.str ();
1935 }
1936 return;
1937 }
1938
1939
1940 const size_t numVecs = getNumVectors ();
1941 if (debugCheckIndices) {
1942 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1943 (static_cast<size_t> (imports.extent (0)) !=
1944 numVecs * importLIDs.extent (0),
1945 std::runtime_error,
1946 "imports.extent(0) = " << imports.extent (0)
1947 << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1948 << " * " << importLIDs.extent (0) << " = "
1949 << numVecs * importLIDs.extent (0) << ".");
1950
1951 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1952 (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1953 "constantNumPackets input argument must be nonzero.");
1954
1955 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1956 (static_cast<size_t> (numVecs) !=
1957 static_cast<size_t> (constantNumPackets),
1958 std::runtime_error, "constantNumPackets must equal numVecs.");
1959 }
1960
1961 // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1962 // the memory space in which the imports buffer was last modified and
1963 // the size of the imports buffer.
1964 // DistObject::doTransferNew decides where it was last modified (based on
1965 // whether communication buffers used were on host or device).
1966 if (unpackOnHost) {
1967 if (this->imports_.need_sync_host()) this->imports_.sync_host();
1968 }
1969 else {
1970 if (this->imports_.need_sync_device()) this->imports_.sync_device();
1971 }
1972
1973 if (printDebugOutput) {
1974 std::ostringstream os;
1975 os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1976 << endl;
1977 std::cerr << os.str ();
1978 }
1979
1980 // We have to sync before modifying, because this method may read
1981 // as well as write (depending on the CombineMode).
1982 auto imports_d = imports.view_device ();
1983 auto imports_h = imports.view_host ();
1984 auto importLIDs_d = importLIDs.view_device ();
1985 auto importLIDs_h = importLIDs.view_host ();
1986
1987 Kokkos::DualView<size_t*, device_type> whichVecs;
1988 if (! isConstantStride ()) {
1989 Kokkos::View<const size_t*, Kokkos::HostSpace,
1990 Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1991 numVecs);
1992 whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1993 if (unpackOnHost) {
1994 whichVecs.modify_host ();
1995 // DEEP_COPY REVIEW - NOT TESTED FOR CUDA BUILD
1996 Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1997 }
1998 else {
1999 whichVecs.modify_device ();
2000 // DEEP_COPY REVIEW - HOST-TO-DEVICE
2001 Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
2002 }
2003 }
2004 auto whichVecs_d = whichVecs.view_device ();
2005 auto whichVecs_h = whichVecs.view_host ();
2006
2007 /* The layout in the export for MultiVectors is as follows:
2008 imports = { all of the data from row exportLIDs.front() ;
2009 ....
2010 all of the data from row exportLIDs.back() }
2011 This doesn't have the best locality, but is necessary because
2012 the data for a Packet (all data associated with an LID) is
2013 required to be contiguous. */
2014
2015 if (numVecs > 0 && importLIDs.extent (0) > 0) {
2016 using dev_exec_space = typename dual_view_type::t_dev::execution_space;
2017 using host_exec_space = typename dual_view_type::t_host::execution_space;
2018
2019 // This fixes GitHub Issue #4418.
2020 const bool use_atomic_updates = unpackOnHost ?
2021 host_exec_space().concurrency () != 1 :
2022 dev_exec_space().concurrency () != 1;
2023
2024 if (printDebugOutput) {
2025 std::ostringstream os;
2026 os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
2027 std::cerr << os.str ();
2028 }
2029
2030 // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
2031 // custom combine modes, start editing here.
2032
2033 if (CM == INSERT || CM == REPLACE) {
2034 using op_type = KokkosRefactor::Details::InsertOp;
2035 if (isConstantStride ()) {
2036 if (unpackOnHost) {
2037 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2038 unpack_array_multi_column (host_exec_space (),
2039 X_h, imports_h, importLIDs_h,
2040 op_type (), numVecs,
2041 use_atomic_updates,
2042 debugCheckIndices);
2043
2044 }
2045 else { // unpack on device
2046 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2047 unpack_array_multi_column (space,
2048 X_d, imports_d, importLIDs_d,
2049 op_type (), numVecs,
2050 use_atomic_updates,
2051 debugCheckIndices);
2053 }
2054 else { // not constant stride
2055 if (unpackOnHost) {
2056 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2057 unpack_array_multi_column_variable_stride (host_exec_space (),
2058 X_h, imports_h,
2059 importLIDs_h,
2060 whichVecs_h,
2061 op_type (),
2062 numVecs,
2063 use_atomic_updates,
2064 debugCheckIndices);
2065 }
2066 else { // unpack on device
2067 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2068 unpack_array_multi_column_variable_stride (space,
2069 X_d, imports_d,
2070 importLIDs_d,
2071 whichVecs_d,
2072 op_type (),
2073 numVecs,
2074 use_atomic_updates,
2075 debugCheckIndices);
2076 }
2077 }
2078 }
2079 else if (CM == ADD || CM == ADD_ASSIGN) {
2080 using op_type = KokkosRefactor::Details::AddOp;
2081 if (isConstantStride ()) {
2082 if (unpackOnHost) {
2083 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2084 unpack_array_multi_column (host_exec_space (),
2085 X_h, imports_h, importLIDs_h,
2086 op_type (), numVecs,
2087 use_atomic_updates,
2088 debugCheckIndices);
2089 }
2090 else { // unpack on device
2091 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2092 unpack_array_multi_column (space,
2093 X_d, imports_d, importLIDs_d,
2094 op_type (), numVecs,
2095 use_atomic_updates,
2096 debugCheckIndices);
2097 }
2098 }
2099 else { // not constant stride
2100 if (unpackOnHost) {
2101 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2102 unpack_array_multi_column_variable_stride (host_exec_space (),
2103 X_h, imports_h,
2104 importLIDs_h,
2105 whichVecs_h,
2106 op_type (),
2107 numVecs,
2108 use_atomic_updates,
2109 debugCheckIndices);
2110 }
2111 else { // unpack on device
2112 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2113 unpack_array_multi_column_variable_stride (space,
2114 X_d, imports_d,
2115 importLIDs_d,
2116 whichVecs_d,
2117 op_type (),
2118 numVecs,
2119 use_atomic_updates,
2120 debugCheckIndices);
2121 }
2122 }
2123 }
2124 else if (CM == ABSMAX) {
2125 using op_type = KokkosRefactor::Details::AbsMaxOp;
2126 if (isConstantStride ()) {
2127 if (unpackOnHost) {
2128 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2129 unpack_array_multi_column (host_exec_space (),
2130 X_h, imports_h, importLIDs_h,
2131 op_type (), numVecs,
2132 use_atomic_updates,
2133 debugCheckIndices);
2134 }
2135 else { // unpack on device
2136 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2137 unpack_array_multi_column (space,
2138 X_d, imports_d, importLIDs_d,
2139 op_type (), numVecs,
2140 use_atomic_updates,
2141 debugCheckIndices);
2142 }
2143 }
2144 else {
2145 if (unpackOnHost) {
2146 auto X_h = this->getLocalViewHost(Access::ReadWrite);
2147 unpack_array_multi_column_variable_stride (host_exec_space (),
2148 X_h, imports_h,
2149 importLIDs_h,
2150 whichVecs_h,
2151 op_type (),
2152 numVecs,
2153 use_atomic_updates,
2154 debugCheckIndices);
2155 }
2156 else { // unpack on device
2157 auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2158 unpack_array_multi_column_variable_stride (space,
2159 X_d, imports_d,
2160 importLIDs_d,
2161 whichVecs_d,
2162 op_type (),
2163 numVecs,
2164 use_atomic_updates,
2165 debugCheckIndices);
2166 }
2167 }
2168 }
2169 else {
2170 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2171 (true, std::logic_error, "Invalid CombineMode");
2172 }
2173 }
2174 else {
2175 if (printDebugOutput) {
2176 std::ostringstream os;
2177 os << *prefix << "Nothing to unpack" << endl;
2178 std::cerr << os.str ();
2179 }
2180 }
2182 if (printDebugOutput) {
2183 std::ostringstream os;
2184 os << *prefix << "Done!" << endl;
2185 std::cerr << os.str ();
2186 }
2187 }
2188
2189 // clang-format on
2190 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2191 void
2194 (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
2195 Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
2196 Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
2197 const size_t constantNumPackets,
2198 const CombineMode CM) {
2199 unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets, CM, execution_space());
2200 }
2201 // clang-format off
2202
2203 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2204 size_t
2206 getNumVectors () const
2207 {
2208 if (isConstantStride ()) {
2209 return static_cast<size_t> (view_.extent (1));
2210 } else {
2211 return static_cast<size_t> (whichVectors_.size ());
2212 }
2213 }
2214
2215 namespace { // (anonymous)
2216
2217 template<class RV>
2218 void
2219 gblDotImpl (const RV& dotsOut,
2220 const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2221 const bool distributed)
2222 {
2223 using Teuchos::REDUCE_MAX;
2224 using Teuchos::REDUCE_SUM;
2225 using Teuchos::reduceAll;
2226 typedef typename RV::non_const_value_type dot_type;
2227
2228 const size_t numVecs = dotsOut.extent (0);
2229
2230 // If the MultiVector is distributed over multiple processes, do
2231 // the distributed (interprocess) part of the dot product. We
2232 // assume that the MPI implementation can read from and write to
2233 // device memory.
2234 //
2235 // replaceMap() may have removed some processes. Those
2236 // processes have a null Map. They must not participate in any
2237 // collective operations. We ask first whether the Map is null,
2238 // because isDistributed() defers that question to the Map. We
2239 // still compute and return local dots for processes not
2240 // participating in collective operations; those probably don't
2241 // make any sense, but it doesn't hurt to do them, since it's
2242 // illegal to call dot() on those processes anyway.
2243 if (distributed && ! comm.is_null ()) {
2244 // The calling process only participates in the collective if
2245 // both the Map and its Comm on that process are nonnull.
2246 const int nv = static_cast<int> (numVecs);
2247 const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2248
2249 if (commIsInterComm) {
2250 // If comm is an intercomm, then we may not alias input and
2251 // output buffers, so we have to make a copy of the local
2252 // sum.
2253 typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2254 // DEEP_COPY REVIEW - NOT TESTED
2255 Kokkos::deep_copy (lclDots, dotsOut);
2256 const dot_type* const lclSum = lclDots.data ();
2257 dot_type* const gblSum = dotsOut.data ();
2258 reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2259 }
2260 else {
2261 dot_type* const inout = dotsOut.data ();
2262 reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2263 }
2264 }
2265 }
2266 } // namespace (anonymous)
2268 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2269 void
2272 const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2273 {
2275 using Kokkos::subview;
2276 using Teuchos::Comm;
2277 using Teuchos::null;
2278 using Teuchos::RCP;
2279 // View of all the dot product results.
2280 typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2281 typedef typename dual_view_type::t_dev_const XMV;
2282 const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2283
2284 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2285
2286 const size_t numVecs = this->getNumVectors ();
2287 if (numVecs == 0) {
2288 return; // nothing to do
2289 }
2290 const size_t lclNumRows = this->getLocalLength ();
2291 const size_t numDots = static_cast<size_t> (dots.extent (0));
2292 const bool debug = Behavior::debug ();
2293
2294 if (debug) {
2295 const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2296 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2297 (! compat, std::invalid_argument, "'*this' MultiVector is not "
2298 "compatible with the input MultiVector A. We only test for this "
2299 "in debug mode.");
2300 }
2301
2302 // FIXME (mfh 11 Jul 2014) These exception tests may not
2303 // necessarily be thrown on all processes consistently. We should
2304 // instead pass along error state with the inner product. We
2305 // could do this by setting an extra slot to
2306 // Kokkos::ArithTraits<dot_type>::one() on error. The
2307 // final sum should be
2308 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2309 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2310 lclNumRows != A.getLocalLength (), std::runtime_error,
2311 "MultiVectors do not have the same local length. "
2312 "this->getLocalLength() = " << lclNumRows << " != "
2313 "A.getLocalLength() = " << A.getLocalLength () << ".");
2314 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2315 numVecs != A.getNumVectors (), std::runtime_error,
2316 "MultiVectors must have the same number of columns (vectors). "
2317 "this->getNumVectors() = " << numVecs << " != "
2318 "A.getNumVectors() = " << A.getNumVectors () << ".");
2319 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2320 numDots != numVecs, std::runtime_error,
2321 "The output array 'dots' must have the same number of entries as the "
2322 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2323 numDots << " != this->getNumVectors() = " << numVecs << ".");
2324
2325 const std::pair<size_t, size_t> colRng (0, numVecs);
2326 RV dotsOut = subview (dots, colRng);
2327 RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2328 this->getMap ()->getComm ();
2329
2330 auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2331 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2332
2333 ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2334 this->whichVectors_.getRawPtr (),
2335 A.whichVectors_.getRawPtr (),
2336 this->isConstantStride (), A.isConstantStride ());
2337
2338 // lbv 15 mar 2023: Kokkos Kernels provides non-blocking BLAS
2339 // functions unless they explicitely return a value to Host.
2340 // Here while the lclDot are on host, they are not a return
2341 // value, therefore they might be avaible to us immediately.
2342 // Adding a frnce here guarantees that we will have the lclDot
2343 // ahead of the MPI reduction.
2344 execution_space exec_space_instance = execution_space();
2345 exec_space_instance.fence();
2346
2347 gblDotImpl (dotsOut, comm, this->isDistributed ());
2348 }
2349
2350 namespace { // (anonymous)
2351 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2353 multiVectorSingleColumnDot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
2355 {
2358 using dot_type = typename MV::dot_type;
2359 ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2360
2361 auto map = x.getMap ();
2362 Teuchos::RCP<const Teuchos::Comm<int> > comm =
2363 map.is_null () ? Teuchos::null : map->getComm ();
2364 if (comm.is_null ()) {
2365 return Kokkos::ArithTraits<dot_type>::zero ();
2366 }
2367 else {
2368 using LO = LocalOrdinal;
2369 // The min just ensures that we don't overwrite memory that
2370 // doesn't belong to us, in the erroneous input case where x
2371 // and y have different numbers of rows.
2372 const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2373 y.getLocalLength ()));
2374 const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2375 dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2376 dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2377
2378 // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2379 //const_cast<MV&>(x).sync_device ();
2380 //const_cast<MV&>(y).sync_device ();
2381
2382 auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2383 auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2384 auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2385 auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2386 lclDot = KokkosBlas::dot (x_1d, y_1d);
2387
2388 if (x.isDistributed ()) {
2389 using Teuchos::outArg;
2390 using Teuchos::REDUCE_SUM;
2391 using Teuchos::reduceAll;
2392 reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2393 }
2394 else {
2395 gblDot = lclDot;
2396 }
2397 return gblDot;
2398 }
2399 }
2400 } // namespace (anonymous)
2401
2402
2403
2404 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2405 void
2408 const Teuchos::ArrayView<dot_type>& dots) const
2409 {
2410 const char tfecfFuncName[] = "dot: ";
2411 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2412
2413 const size_t numVecs = this->getNumVectors ();
2414 const size_t lclNumRows = this->getLocalLength ();
2415 const size_t numDots = static_cast<size_t> (dots.size ());
2416
2417 // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2418 // not necessarily be thrown on all processes consistently. We
2419 // keep them for now, because MultiVector's unit tests insist on
2420 // them. In the future, we should instead pass along error state
2421 // with the inner product. We could do this by setting an extra
2422 // slot to Kokkos::ArithTraits<dot_type>::one() on error.
2423 // The final sum should be
2424 // Kokkos::ArithTraits<dot_type>::zero() if not error.
2425 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2426 (lclNumRows != A.getLocalLength (), std::runtime_error,
2427 "MultiVectors do not have the same local length. "
2428 "this->getLocalLength() = " << lclNumRows << " != "
2429 "A.getLocalLength() = " << A.getLocalLength () << ".");
2430 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2431 (numVecs != A.getNumVectors (), std::runtime_error,
2432 "MultiVectors must have the same number of columns (vectors). "
2433 "this->getNumVectors() = " << numVecs << " != "
2434 "A.getNumVectors() = " << A.getNumVectors () << ".");
2435 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2436 (numDots != numVecs, std::runtime_error,
2437 "The output array 'dots' must have the same number of entries as the "
2438 "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2439 numDots << " != this->getNumVectors() = " << numVecs << ".");
2440
2441 if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2442 const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2443 dots[0] = gblDot;
2444 }
2445 else {
2446 this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2447 }
2448 }
2449
2450 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2451 void
2453 norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2454 {
2456 using ::Tpetra::Details::NORM_TWO;
2458 ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2459
2460 // The function needs to be able to sync X.
2461 MV& X = const_cast<MV&> (*this);
2462 multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2463 }
2464
2465 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2466 void
2468 norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2469 {
2470 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2471 this->norm2 (norms_av);
2472 }
2473
2474
2475 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2476 void
2478 norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2479 {
2481 using ::Tpetra::Details::NORM_ONE;
2483 ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2484
2485 // The function needs to be able to sync X.
2486 MV& X = const_cast<MV&> (*this);
2487 multiVectorNormImpl (norms.data (), X, NORM_ONE);
2488 }
2489
2490 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2491 void
2493 norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2494 {
2495 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2496 this->norm1 (norms_av);
2497 }
2498
2499 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2500 void
2502 normInf (const Teuchos::ArrayView<mag_type>& norms) const
2503 {
2505 using ::Tpetra::Details::NORM_INF;
2507 ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2508
2509 // The function needs to be able to sync X.
2510 MV& X = const_cast<MV&> (*this);
2511 multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2512 }
2513
2514 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2515 void
2517 normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2518 {
2519 Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2520 this->normInf (norms_av);
2521 }
2522
2523 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2524 void
2526 meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2527 {
2528 // KR FIXME Overload this method to take a View.
2529 using Kokkos::ALL;
2530 using Kokkos::subview;
2531 using Teuchos::Comm;
2532 using Teuchos::RCP;
2533 using Teuchos::reduceAll;
2534 using Teuchos::REDUCE_SUM;
2535 typedef Kokkos::ArithTraits<impl_scalar_type> ATS;
2536
2537 const size_t lclNumRows = this->getLocalLength ();
2538 const size_t numVecs = this->getNumVectors ();
2539 const size_t numMeans = static_cast<size_t> (means.size ());
2540
2541 TEUCHOS_TEST_FOR_EXCEPTION(
2542 numMeans != numVecs, std::runtime_error,
2543 "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2544 << " != this->getNumVectors() = " << numVecs << ".");
2545
2546 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2547 const std::pair<size_t, size_t> colRng (0, numVecs);
2548
2549 // Make sure that the final output view has the same layout as the
2550 // temporary view's HostMirror. Left or Right doesn't matter for
2551 // a 1-D array anyway; this is just to placate the compiler.
2552 typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2553 typedef Kokkos::View<impl_scalar_type*,
2554 typename local_view_type::HostMirror::array_layout,
2555 Kokkos::HostSpace,
2556 Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2557 host_local_view_type meansOut (means.getRawPtr (), numMeans);
2558
2559 RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2560 this->getMap ()->getComm ();
2561
2562 // If we need sync to device, then host has the most recent version.
2563 const bool useHostVersion = this->need_sync_device ();
2564 if (useHostVersion) {
2565 // DualView was last modified on host, so run the local kernel there.
2566 auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2567 rowRng, Kokkos::ALL ());
2568 // Compute the local sum of each column.
2569 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2570 if (isConstantStride ()) {
2571 KokkosBlas::sum (lclSums, X_lcl);
2572 }
2573 else {
2574 for (size_t j = 0; j < numVecs; ++j) {
2575 const size_t col = whichVectors_[j];
2576 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2577 }
2578 }
2579
2580 // If there are multiple MPI processes, the all-reduce reads
2581 // from lclSums, and writes to meansOut. Otherwise, we just
2582 // copy lclSums into meansOut.
2583 if (! comm.is_null () && this->isDistributed ()) {
2584 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2585 lclSums.data (), meansOut.data ());
2586 }
2587 else {
2588 // DEEP_COPY REVIEW - NOT TESTED
2589 Kokkos::deep_copy (meansOut, lclSums);
2590 }
2591 }
2592 else {
2593 // DualView was last modified on device, so run the local kernel there.
2594 auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2595 rowRng, Kokkos::ALL ());
2596
2597 // Compute the local sum of each column.
2598 Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2599 if (isConstantStride ()) {
2600 KokkosBlas::sum (lclSums, X_lcl);
2601 }
2602 else {
2603 for (size_t j = 0; j < numVecs; ++j) {
2604 const size_t col = whichVectors_[j];
2605 KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2606 }
2607 }
2608 // lbv 10 mar 2023: Kokkos Kernels provides non-blocking BLAS
2609 // functions unless they explicitly return a value to Host.
2610 // Here while the lclSums are on the host, they are not a return
2611 // value, therefore they might be available to us immediately.
2612 // Adding a fence here guarantees that we will have the lclSums
2613 // ahead of the MPI reduction.
2614 execution_space exec_space_instance = execution_space();
2615 exec_space_instance.fence();
2616
2617 // If there are multiple MPI processes, the all-reduce reads
2618 // from lclSums, and writes to meansOut. (We assume that MPI
2619 // can read device memory.) Otherwise, we just copy lclSums
2620 // into meansOut.
2621 if (! comm.is_null () && this->isDistributed ()) {
2622 reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2623 lclSums.data (), meansOut.data ());
2624 }
2625 else {
2626 // DEEP_COPY REVIEW - HOST-TO-HOST - NOT TESTED FOR MPI BUILD
2627 Kokkos::deep_copy (meansOut, lclSums);
2628 }
2629 }
2630
2631 // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2632 // to the magnitude type, since operator/ (std::complex<T>, int)
2633 // isn't necessarily defined.
2634 const impl_scalar_type OneOverN =
2635 ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2636 for (size_t k = 0; k < numMeans; ++k) {
2637 meansOut(k) = meansOut(k) * OneOverN;
2638 }
2639 }
2640
2641
2642 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2643 void
2645 randomize ()
2646 {
2647 typedef impl_scalar_type IST;
2648 typedef Kokkos::ArithTraits<IST> ATS;
2649 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2650 typedef typename pool_type::generator_type generator_type;
2651
2652 const IST max = Kokkos::rand<generator_type, IST>::max ();
2653 const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2654
2655 this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2656 }
2657
2658
2659 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2660 void
2662 randomize (const Scalar& minVal, const Scalar& maxVal)
2663 {
2664 typedef impl_scalar_type IST;
2665 typedef Tpetra::Details::Static_Random_XorShift64_Pool<typename device_type::execution_space> tpetra_pool_type;
2666 typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2667
2668 // Seed the pool based on the system RNG and the MPI rank, if needed
2669 if(!tpetra_pool_type::isSet())
2670 tpetra_pool_type::resetPool(this->getMap()->getComm()->getRank());
2671
2672 pool_type & rand_pool = tpetra_pool_type::getPool();
2673 const IST max = static_cast<IST> (maxVal);
2674 const IST min = static_cast<IST> (minVal);
2675
2676 auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2677
2678 if (isConstantStride ()) {
2679 Kokkos::fill_random (thisView, rand_pool, min, max);
2680 }
2681 else {
2682 const size_t numVecs = getNumVectors ();
2683 for (size_t k = 0; k < numVecs; ++k) {
2684 const size_t col = whichVectors_[k];
2685 auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2686 Kokkos::fill_random (X_k, rand_pool, min, max);
2687 }
2688 }
2689 }
2690
2691 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2692 void
2694 putScalar (const Scalar& alpha)
2695 {
2698 using DES = typename dual_view_type::t_dev::execution_space;
2699 using HES = typename dual_view_type::t_host::execution_space;
2700 using LO = LocalOrdinal;
2701 ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2702
2703 // We need this cast for cases like Scalar = std::complex<T> but
2704 // impl_scalar_type = Kokkos::complex<T>.
2705 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2706 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2707 const LO numVecs = static_cast<LO> (this->getNumVectors ());
2708
2709 // Modify the most recently updated version of the data. This
2710 // avoids sync'ing, which could violate users' expectations.
2711 //
2712 // If we need sync to device, then host has the most recent version.
2713 const bool runOnHost = runKernelOnHost(*this);
2714 if (! runOnHost) {
2715 auto X = this->getLocalViewDevice(Access::OverwriteAll);
2716 if (this->isConstantStride ()) {
2717 fill (DES (), X, theAlpha, lclNumRows, numVecs);
2718 }
2719 else {
2720 fill (DES (), X, theAlpha, lclNumRows, numVecs,
2721 this->whichVectors_.getRawPtr ());
2722 }
2723 }
2724 else { // last modified in host memory, so modify data there.
2725 auto X = this->getLocalViewHost(Access::OverwriteAll);
2726 if (this->isConstantStride ()) {
2727 fill (HES (), X, theAlpha, lclNumRows, numVecs);
2728 }
2729 else {
2730 fill (HES (), X, theAlpha, lclNumRows, numVecs,
2731 this->whichVectors_.getRawPtr ());
2732 }
2733 }
2734 }
2735
2736
2737 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2738 void
2740 replaceMap (const Teuchos::RCP<const map_type>& newMap)
2741 {
2742 using Teuchos::ArrayRCP;
2743 using Teuchos::Comm;
2744 using Teuchos::RCP;
2745 using ST = Scalar;
2746 using LO = LocalOrdinal;
2747 using GO = GlobalOrdinal;
2748
2749 // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2750 // it might work if the MV is a column view of another MV.
2751 // However, things might go wrong when restoring the original
2752 // Map, so we don't allow this case for now.
2753 TEUCHOS_TEST_FOR_EXCEPTION(
2754 ! this->isConstantStride (), std::logic_error,
2755 "Tpetra::MultiVector::replaceMap: This method does not currently work "
2756 "if the MultiVector is a column view of another MultiVector (that is, if "
2757 "isConstantStride() == false).");
2758
2759 // Case 1: current Map and new Map are both nonnull on this process.
2760 // Case 2: current Map is nonnull, new Map is null.
2761 // Case 3: current Map is null, new Map is nonnull.
2762 // Case 4: both Maps are null: forbidden.
2763 //
2764 // Case 1 means that we don't have to do anything on this process,
2765 // other than assign the new Map. (We always have to do that.)
2766 // It's an error for the user to supply a Map that requires
2767 // resizing in this case.
2768 //
2769 // Case 2 means that the calling process is in the current Map's
2770 // communicator, but will be excluded from the new Map's
2771 // communicator. We don't have to do anything on the calling
2772 // process; just leave whatever data it may have alone.
2773 //
2774 // Case 3 means that the calling process is excluded from the
2775 // current Map's communicator, but will be included in the new
2776 // Map's communicator. This means we need to (re)allocate the
2777 // local DualView if it does not have the right number of rows.
2778 // If the new number of rows is nonzero, we'll fill the newly
2779 // allocated local data with zeros, as befits a projection
2780 // operation.
2781 //
2782 // The typical use case for Case 3 is that the MultiVector was
2783 // first created with the Map with more processes, then that Map
2784 // was replaced with a Map with fewer processes, and finally the
2785 // original Map was restored on this call to replaceMap.
2786
2787 if (this->getMap ().is_null ()) { // current Map is null
2788 // If this->getMap() is null, that means that this MultiVector
2789 // has already had replaceMap happen to it. In that case, just
2790 // reallocate the DualView with the right size.
2791
2792 TEUCHOS_TEST_FOR_EXCEPTION(
2793 newMap.is_null (), std::invalid_argument,
2794 "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2795 "This probably means that the input Map is incorrect.");
2796
2797 // Case 3: current Map is null, new Map is nonnull.
2798 // Reallocate the DualView with the right dimensions.
2799 const size_t newNumRows = newMap->getLocalNumElements ();
2800 const size_t origNumRows = view_.extent (0);
2801 const size_t numCols = this->getNumVectors ();
2802
2803 if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2804 view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2805 }
2806 }
2807 else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2808 // I am an excluded process. Reinitialize my data so that I
2809 // have 0 rows. Keep the number of columns as before.
2810 const size_t newNumRows = static_cast<size_t> (0);
2811 const size_t numCols = this->getNumVectors ();
2812 view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2813 }
2814
2815 this->map_ = newMap;
2816 }
2817
2818 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2819 void
2821 scale (const Scalar& alpha)
2822 {
2823 using Kokkos::ALL;
2824 using IST = impl_scalar_type;
2825
2826 const IST theAlpha = static_cast<IST> (alpha);
2827 if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2828 return; // do nothing
2829 }
2830 const size_t lclNumRows = getLocalLength ();
2831 const size_t numVecs = getNumVectors ();
2832 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2833 const std::pair<size_t, size_t> colRng (0, numVecs);
2834
2835 // We can't substitute putScalar(0.0) for scale(0.0), because the
2836 // former will overwrite NaNs present in the MultiVector. The
2837 // semantics of this call require multiplying them by 0, which
2838 // IEEE 754 requires to be NaN.
2839
2840 // If we need sync to device, then host has the most recent version.
2841 const bool useHostVersion = need_sync_device ();
2842 if (useHostVersion) {
2843 auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2844 if (isConstantStride ()) {
2845 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2846 }
2847 else {
2848 for (size_t k = 0; k < numVecs; ++k) {
2849 const size_t Y_col = whichVectors_[k];
2850 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2851 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2852 }
2853 }
2854 }
2855 else { // work on device
2856 auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2857 if (isConstantStride ()) {
2858 KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2859 }
2860 else {
2861 for (size_t k = 0; k < numVecs; ++k) {
2862 const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2863 auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2864 KokkosBlas::scal (Y_k, theAlpha, Y_k);
2865 }
2866 }
2867 }
2868 }
2869
2870
2871 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2872 void
2874 scale (const Teuchos::ArrayView<const Scalar>& alphas)
2875 {
2876 const size_t numVecs = this->getNumVectors ();
2877 const size_t numAlphas = static_cast<size_t> (alphas.size ());
2878 TEUCHOS_TEST_FOR_EXCEPTION(
2879 numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2880 "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2881 << numVecs << ".");
2882
2883 // Use a DualView to copy the scaling constants onto the device.
2884 using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2885 k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2886 k_alphas.modify_host ();
2887 for (size_t i = 0; i < numAlphas; ++i) {
2888 k_alphas.view_host()(i) = static_cast<impl_scalar_type> (alphas[i]);
2889 }
2890 k_alphas.sync_device ();
2891 // Invoke the scale() overload that takes a device View of coefficients.
2892 this->scale (k_alphas.view_device ());
2893 }
2894
2895 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2896 void
2898 scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2899 {
2900 using Kokkos::ALL;
2901 using Kokkos::subview;
2902
2903 const size_t lclNumRows = this->getLocalLength ();
2904 const size_t numVecs = this->getNumVectors ();
2905 TEUCHOS_TEST_FOR_EXCEPTION(
2906 static_cast<size_t> (alphas.extent (0)) != numVecs,
2907 std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2908 "alphas.extent(0) = " << alphas.extent (0)
2909 << " != this->getNumVectors () = " << numVecs << ".");
2910 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2911 const std::pair<size_t, size_t> colRng (0, numVecs);
2912
2913 // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2914 // type of the return value of subview. This is because if we
2915 // switch the array layout from LayoutLeft to LayoutRight
2916 // (preferred for performance of block operations), the types
2917 // below won't be valid. (A view of a column of a LayoutRight
2918 // multivector has LayoutStride, not LayoutLeft.)
2919
2920 // If we need sync to device, then host has the most recent version.
2921 const bool useHostVersion = this->need_sync_device ();
2922 if (useHostVersion) {
2923 // Work in host memory. This means we need to create a host
2924 // mirror of the input View of coefficients.
2925 auto alphas_h = Kokkos::create_mirror_view (alphas);
2926 // DEEP_COPY REVIEW - NOT TESTED
2927 Kokkos::deep_copy (alphas_h, alphas);
2928
2929 auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2930 if (isConstantStride ()) {
2931 KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2932 }
2933 else {
2934 for (size_t k = 0; k < numVecs; ++k) {
2935 const size_t Y_col = this->isConstantStride () ? k :
2936 this->whichVectors_[k];
2937 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2938 // We don't have to use the entire 1-D View here; we can use
2939 // the version that takes a scalar coefficient.
2940 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2941 }
2942 }
2943 }
2944 else { // Work in device memory, using the input View 'alphas' directly.
2945 auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2946 if (isConstantStride ()) {
2947 KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2948 }
2949 else {
2950 // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2951 // as values on host, so copy them to host. Another approach
2952 // would be to fix scal() so that it takes a 0-D View as the
2953 // second argument.
2954 auto alphas_h = Kokkos::create_mirror_view (alphas);
2955 // DEEP_COPY REVIEW - NOT TESTED
2956 Kokkos::deep_copy (alphas_h, alphas);
2957
2958 for (size_t k = 0; k < numVecs; ++k) {
2959 const size_t Y_col = this->isConstantStride () ? k :
2960 this->whichVectors_[k];
2961 auto Y_k = subview (Y_lcl, ALL (), Y_col);
2962 KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2963 }
2964 }
2965 }
2966 }
2967
2968 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2969 void
2971 scale (const Scalar& alpha,
2973 {
2974 using Kokkos::ALL;
2975 using Kokkos::subview;
2976 const char tfecfFuncName[] = "scale: ";
2977
2978 const size_t lclNumRows = getLocalLength ();
2979 const size_t numVecs = getNumVectors ();
2980
2981 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2982 lclNumRows != A.getLocalLength (), std::invalid_argument,
2983 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2984 << A.getLocalLength () << ".");
2985 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2986 numVecs != A.getNumVectors (), std::invalid_argument,
2987 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2988 << A.getNumVectors () << ".");
2989
2990 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2991 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2992 const std::pair<size_t, size_t> colRng (0, numVecs);
2993
2994 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2995 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2996 auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2997 auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2998
2999 if (isConstantStride () && A.isConstantStride ()) {
3000 KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
3001 }
3002 else {
3003 // Make sure that Kokkos only uses the local length for add.
3004 for (size_t k = 0; k < numVecs; ++k) {
3005 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3006 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3007 auto Y_k = subview (Y_lcl, ALL (), Y_col);
3008 auto X_k = subview (X_lcl, ALL (), X_col);
3009
3010 KokkosBlas::scal (Y_k, theAlpha, X_k);
3011 }
3012 }
3013 }
3014
3015
3016
3017 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3018 void
3021 {
3022 const char tfecfFuncName[] = "reciprocal: ";
3023
3024 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3025 getLocalLength () != A.getLocalLength (), std::runtime_error,
3026 "MultiVectors do not have the same local length. "
3027 "this->getLocalLength() = " << getLocalLength ()
3028 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3029 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3030 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3031 ": MultiVectors do not have the same number of columns (vectors). "
3032 "this->getNumVectors() = " << getNumVectors ()
3033 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3034
3035 const size_t numVecs = getNumVectors ();
3036
3037 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3038 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3039
3040 if (isConstantStride () && A.isConstantStride ()) {
3041 KokkosBlas::reciprocal (this_view_dev, A_view_dev);
3042 }
3043 else {
3044 using Kokkos::ALL;
3045 using Kokkos::subview;
3046 for (size_t k = 0; k < numVecs; ++k) {
3047 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3048 auto vector_k = subview (this_view_dev, ALL (), this_col);
3049 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3050 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3051 KokkosBlas::reciprocal (vector_k, vector_Ak);
3052 }
3053 }
3054 }
3055
3056 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3057 void
3060 {
3061 const char tfecfFuncName[] = "abs";
3062
3063 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3064 getLocalLength () != A.getLocalLength (), std::runtime_error,
3065 ": MultiVectors do not have the same local length. "
3066 "this->getLocalLength() = " << getLocalLength ()
3067 << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3068 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3069 A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3070 ": MultiVectors do not have the same number of columns (vectors). "
3071 "this->getNumVectors() = " << getNumVectors ()
3072 << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3073 const size_t numVecs = getNumVectors ();
3074
3075 auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
3076 auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
3077
3078 if (isConstantStride () && A.isConstantStride ()) {
3079 KokkosBlas::abs (this_view_dev, A_view_dev);
3080 }
3081 else {
3082 using Kokkos::ALL;
3083 using Kokkos::subview;
3084
3085 for (size_t k=0; k < numVecs; ++k) {
3086 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3087 auto vector_k = subview (this_view_dev, ALL (), this_col);
3088 const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3089 auto vector_Ak = subview (A_view_dev, ALL (), A_col);
3090 KokkosBlas::abs (vector_k, vector_Ak);
3091 }
3092 }
3093 }
3094
3095 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3096 void
3098 update (const Scalar& alpha,
3100 const Scalar& beta)
3101 {
3102 const char tfecfFuncName[] = "update: ";
3103 using Kokkos::subview;
3104 using Kokkos::ALL;
3105
3106 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3107
3108 const size_t lclNumRows = getLocalLength ();
3109 const size_t numVecs = getNumVectors ();
3110
3112 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3113 lclNumRows != A.getLocalLength (), std::invalid_argument,
3114 "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3115 << A.getLocalLength () << ".");
3116 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3117 numVecs != A.getNumVectors (), std::invalid_argument,
3118 "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3119 << A.getNumVectors () << ".");
3120 }
3121
3122 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3123 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3124 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3125 const std::pair<size_t, size_t> colRng (0, numVecs);
3126
3127 auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
3128 auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3129 auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
3130 auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3131
3132 // The device memory of *this is about to be modified
3133 if (isConstantStride () && A.isConstantStride ()) {
3134 KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3135 }
3136 else {
3137 // Make sure that Kokkos only uses the local length for add.
3138 for (size_t k = 0; k < numVecs; ++k) {
3139 const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3140 const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3141 auto Y_k = subview (Y_lcl, ALL (), Y_col);
3142 auto X_k = subview (X_lcl, ALL (), X_col);
3143
3144 KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3145 }
3146 }
3147 }
3148
3149 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3150 void
3152 update (const Scalar& alpha,
3154 const Scalar& beta,
3156 const Scalar& gamma)
3157 {
3158 using Kokkos::ALL;
3159 using Kokkos::subview;
3160
3161 const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3162
3163 ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3164
3165 const size_t lclNumRows = this->getLocalLength ();
3166 const size_t numVecs = getNumVectors ();
3167
3169 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3170 lclNumRows != A.getLocalLength (), std::invalid_argument,
3171 "The input MultiVector A has " << A.getLocalLength () << " local "
3172 "row(s), but this MultiVector has " << lclNumRows << " local "
3173 "row(s).");
3174 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3175 lclNumRows != B.getLocalLength (), std::invalid_argument,
3176 "The input MultiVector B has " << B.getLocalLength () << " local "
3177 "row(s), but this MultiVector has " << lclNumRows << " local "
3178 "row(s).");
3179 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3180 A.getNumVectors () != numVecs, std::invalid_argument,
3181 "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3182 "but this MultiVector has " << numVecs << " column(s).");
3183 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3184 B.getNumVectors () != numVecs, std::invalid_argument,
3185 "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3186 "but this MultiVector has " << numVecs << " column(s).");
3187 }
3188
3189 const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3190 const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3191 const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3192
3193 const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3194 const std::pair<size_t, size_t> colRng (0, numVecs);
3195
3196 // Prefer 'auto' over specifying the type explicitly. This avoids
3197 // issues with a subview possibly having a different type than the
3198 // original view.
3199 auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3200 auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3201 auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3202
3203 if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3204 KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3205 }
3206 else {
3207 // Some input (or *this) is not constant stride,
3208 // so perform the update one column at a time.
3209 for (size_t k = 0; k < numVecs; ++k) {
3210 const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3211 const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3212 const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3213 KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3214 theBeta, subview (B_lcl, rowRng, B_col),
3215 theGamma, subview (C_lcl, rowRng, this_col));
3216 }
3217 }
3218 }
3219
3220 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3221 Teuchos::ArrayRCP<const Scalar>
3223 getData (size_t j) const
3224 {
3225 using Kokkos::ALL;
3226 using IST = impl_scalar_type;
3227 const char tfecfFuncName[] = "getData: ";
3228
3229 // Any MultiVector method that called the (classic) Kokkos Node's
3230 // viewBuffer or viewBufferNonConst methods always implied a
3231 // device->host synchronization. Thus, we synchronize here as
3232 // well.
3233
3234 auto hostView = getLocalViewHost(Access::ReadOnly);
3235 const size_t col = isConstantStride () ? j : whichVectors_[j];
3236 auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3237 Teuchos::ArrayRCP<const IST> dataAsArcp =
3238 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3239
3240 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3241 (static_cast<size_t> (hostView_j.extent (0)) <
3242 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3243 "hostView_j.extent(0) = " << hostView_j.extent (0)
3244 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3245 "Please report this bug to the Tpetra developers.");
3246
3247 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3248 }
3249
3250 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3251 Teuchos::ArrayRCP<Scalar>
3253 getDataNonConst (size_t j)
3254 {
3255 using Kokkos::ALL;
3256 using Kokkos::subview;
3257 using IST = impl_scalar_type;
3258 const char tfecfFuncName[] = "getDataNonConst: ";
3259
3260 auto hostView = getLocalViewHost(Access::ReadWrite);
3261 const size_t col = isConstantStride () ? j : whichVectors_[j];
3262 auto hostView_j = subview (hostView, ALL (), col);
3263 Teuchos::ArrayRCP<IST> dataAsArcp =
3264 Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3265
3266 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3267 (static_cast<size_t> (hostView_j.extent (0)) <
3268 static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3269 "hostView_j.extent(0) = " << hostView_j.extent (0)
3270 << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3271 "Please report this bug to the Tpetra developers.");
3272
3273 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3274 }
3275
3276 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3277 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3279 subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3280 {
3281 using Teuchos::RCP;
3283
3284 // Check whether the index set in cols is contiguous. If it is,
3285 // use the more efficient Range1D version of subCopy.
3286 bool contiguous = true;
3287 const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3288 for (size_t j = 1; j < numCopyVecs; ++j) {
3289 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3290 contiguous = false;
3291 break;
3292 }
3293 }
3294 if (contiguous && numCopyVecs > 0) {
3295 return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3296 }
3297 else {
3298 RCP<const MV> X_sub = this->subView (cols);
3299 RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3300 Y->assign (*X_sub);
3301 return Y;
3302 }
3303 }
3304
3305 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3306 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3308 subCopy (const Teuchos::Range1D &colRng) const
3309 {
3310 using Teuchos::RCP;
3312
3313 RCP<const MV> X_sub = this->subView (colRng);
3314 RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3315 Y->assign (*X_sub);
3316 return Y;
3317 }
3318
3319 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3320 size_t
3325
3326 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3327 size_t
3332
3333 template <class Scalar, class LO, class GO, class Node>
3336 const Teuchos::RCP<const map_type>& subMap,
3337 const local_ordinal_type rowOffset) :
3338 base_type (subMap)
3339 {
3340 using Kokkos::ALL;
3341 using Kokkos::subview;
3342 using Teuchos::outArg;
3343 using Teuchos::RCP;
3344 using Teuchos::rcp;
3345 using Teuchos::reduceAll;
3346 using Teuchos::REDUCE_MIN;
3347 using std::endl;
3349 const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3350 const char suffix[] = "Please report this bug to the Tpetra developers.";
3351 int lclGood = 1;
3352 int gblGood = 1;
3353 std::unique_ptr<std::ostringstream> errStrm;
3354 const bool debug = ::Tpetra::Details::Behavior::debug ();
3355 const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3356
3357 // Be careful to use the input Map's communicator, not X's. The
3358 // idea is that, on return, *this is a subview of X, using the
3359 // input Map.
3360 const auto comm = subMap->getComm ();
3361 TEUCHOS_ASSERT( ! comm.is_null () );
3362 const int myRank = comm->getRank ();
3363
3364 const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3365 const LO numCols = static_cast<LO> (X.getNumVectors ());
3366 const LO newNumRows = static_cast<LO> (subMap->getLocalNumElements ());
3367 if (verbose) {
3368 std::ostringstream os;
3369 os << "Proc " << myRank << ": " << prefix
3370 << "X: {lclNumRows: " << lclNumRowsBefore
3371 << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3372 << ", numCols: " << numCols << "}, "
3373 << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3374 std::cerr << os.str ();
3375 }
3376 // We ask for the _original_ number of rows in X, because X could
3377 // be a shorter (fewer rows) view of a longer MV. (For example, X
3378 // could be a domain Map view of a column Map MV.)
3379 const bool tooManyElts =
3380 newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3381 if (tooManyElts) {
3382 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3383 *errStrm << " Proc " << myRank << ": subMap->getLocalNumElements() (="
3384 << newNumRows << ") + rowOffset (=" << rowOffset
3385 << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3386 << ")." << endl;
3387 lclGood = 0;
3388 TEUCHOS_TEST_FOR_EXCEPTION
3389 (! debug && tooManyElts, std::invalid_argument,
3390 prefix << errStrm->str () << suffix);
3391 }
3392
3393 if (debug) {
3394 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3395 if (gblGood != 1) {
3396 std::ostringstream gblErrStrm;
3397 const std::string myErrStr =
3398 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3399 ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3400 TEUCHOS_TEST_FOR_EXCEPTION
3401 (true, std::invalid_argument, gblErrStrm.str ());
3402 }
3403 }
3404
3405 using range_type = std::pair<LO, LO>;
3406 const range_type origRowRng
3407 (rowOffset, static_cast<LO> (X.view_.origExtent (0)));
3408 const range_type rowRng
3409 (rowOffset, rowOffset + newNumRows);
3410
3411 wrapped_dual_view_type newView = takeSubview (X.view_, rowRng, ALL ());
3412
3413 // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3414 // handling subviews of degenerate Views quite so well. For some
3415 // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3416 // 0. We work around by creating a new empty DualView of the
3417 // desired (degenerate) dimensions.
3418 if (newView.extent (0) == 0 &&
3419 newView.extent (1) != X.view_.extent (1)) {
3420 newView =
3421 allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3422 }
3423
3424 MV subViewMV = X.isConstantStride () ?
3425 MV (subMap, newView) :
3426 MV (subMap, newView, X.whichVectors_ ());
3427
3428 if (debug) {
3429 const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3430 const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3431 if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3432 lclGood = 0;
3433 if (errStrm.get () == nullptr) {
3434 errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3435 }
3436 *errStrm << " Proc " << myRank <<
3437 ": subMap.getLocalNumElements(): " << newNumRows <<
3438 ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3439 ", X.getNumVectors(): " << numCols <<
3440 ", subViewMV.getNumVectors(): " << numColsRet << endl;
3441 }
3442 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3443 if (gblGood != 1) {
3444 std::ostringstream gblErrStrm;
3445 if (myRank == 0) {
3446 gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3447 "dimensions on one or more processes:" << endl;
3448 }
3449 const std::string myErrStr =
3450 errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3451 ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3452 gblErrStrm << suffix << endl;
3453 TEUCHOS_TEST_FOR_EXCEPTION
3454 (true, std::invalid_argument, gblErrStrm.str ());
3455 }
3456 }
3457
3458 if (verbose) {
3459 std::ostringstream os;
3460 os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3461 std::cerr << os.str ();
3462 }
3463
3464 *this = subViewMV; // shallow copy
3465
3466 if (verbose) {
3467 std::ostringstream os;
3468 os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3469 std::cerr << os.str ();
3470 }
3471 }
3472
3473 template <class Scalar, class LO, class GO, class Node>
3476 const map_type& subMap,
3477 const size_t rowOffset) :
3478 MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3479 static_cast<local_ordinal_type> (rowOffset))
3480 {}
3481
3482 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3483 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3485 offsetView (const Teuchos::RCP<const map_type>& subMap,
3486 const size_t offset) const
3487 {
3489 return Teuchos::rcp (new MV (*this, *subMap, offset));
3490 }
3491
3492 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3493 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3495 offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3496 const size_t offset)
3497 {
3499 return Teuchos::rcp (new MV (*this, *subMap, offset));
3500 }
3501
3502 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3503 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3505 subView (const Teuchos::ArrayView<const size_t>& cols) const
3506 {
3507 using Teuchos::Array;
3508 using Teuchos::rcp;
3510
3511 const size_t numViewCols = static_cast<size_t> (cols.size ());
3512 TEUCHOS_TEST_FOR_EXCEPTION(
3513 numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3514 "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3515 "contain at least one entry, but cols.size() = " << cols.size ()
3516 << " == 0.");
3517
3518 // Check whether the index set in cols is contiguous. If it is,
3519 // use the more efficient Range1D version of subView.
3520 bool contiguous = true;
3521 for (size_t j = 1; j < numViewCols; ++j) {
3522 if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3523 contiguous = false;
3524 break;
3525 }
3526 }
3527 if (contiguous) {
3528 if (numViewCols == 0) {
3529 // The output MV has no columns, so there is nothing to view.
3530 return rcp (new MV (this->getMap (), numViewCols));
3531 } else {
3532 // Use the more efficient contiguous-index-range version.
3533 return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3534 }
3535 }
3536
3537 if (isConstantStride ()) {
3538 return rcp (new MV (this->getMap (), view_, cols));
3539 }
3540 else {
3541 Array<size_t> newcols (cols.size ());
3542 for (size_t j = 0; j < numViewCols; ++j) {
3543 newcols[j] = whichVectors_[cols[j]];
3544 }
3545 return rcp (new MV (this->getMap (), view_, newcols ()));
3546 }
3547 }
3548
3549
3550 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3551 Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3553 subView (const Teuchos::Range1D& colRng) const
3554 {
3556 using Kokkos::ALL;
3557 using Kokkos::subview;
3558 using Teuchos::Array;
3559 using Teuchos::RCP;
3560 using Teuchos::rcp;
3562 const char tfecfFuncName[] = "subView(Range1D): ";
3563
3564 const size_t lclNumRows = this->getLocalLength ();
3565 const size_t numVecs = this->getNumVectors ();
3566 // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3567 // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3568 // "at least one vector.");
3569 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3570 static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3571 "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3572 << numVecs << ".");
3573 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3574 numVecs != 0 && colRng.size () != 0 &&
3575 (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3576 static_cast<size_t> (colRng.ubound ()) >= numVecs),
3577 std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3578 "," << colRng.ubound () << "] exceeds the valid range of column indices "
3579 "[0, " << numVecs << "].");
3580
3581 RCP<const MV> X_ret; // the MultiVector subview to return
3582
3583 // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3584 // broken for the case of views with zero rows. I will brutally
3585 // enforce that the subview has the correct dimensions. In
3586 // particular, in the case of zero rows, I will, if necessary,
3587 // create a new dual_view_type with zero rows and the correct
3588 // number of columns. In a debug build, I will use an all-reduce
3589 // to ensure that it has the correct dimensions on all processes.
3590
3591 const std::pair<size_t, size_t> rows (0, lclNumRows);
3592 if (colRng.size () == 0) {
3593 const std::pair<size_t, size_t> cols (0, 0); // empty range
3594 wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3595 X_ret = rcp (new MV (this->getMap (), X_sub));
3596 }
3597 else {
3598 // Returned MultiVector is constant stride only if *this is.
3599 if (isConstantStride ()) {
3600 const std::pair<size_t, size_t> cols (colRng.lbound (),
3601 colRng.ubound () + 1);
3602 wrapped_dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3603 X_ret = rcp (new MV (this->getMap (), X_sub));
3604 }
3605 else {
3606 if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3607 // We're only asking for one column, so the result does have
3608 // constant stride, even though this MultiVector does not.
3609 const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3610 whichVectors_[0] + colRng.ubound () + 1);
3611 wrapped_dual_view_type X_sub = takeSubview (view_, ALL (), col);
3612 X_ret = rcp (new MV (this->getMap (), X_sub));
3613 }
3614 else {
3615 Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3616 whichVectors_.begin () + colRng.ubound () + 1);
3617 X_ret = rcp (new MV (this->getMap (), view_, which));
3618 }
3619 }
3620 }
3621
3622 const bool debug = Behavior::debug ();
3623 if (debug) {
3624 using Teuchos::Comm;
3625 using Teuchos::outArg;
3626 using Teuchos::REDUCE_MIN;
3627 using Teuchos::reduceAll;
3628
3629 RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3630 Teuchos::null : this->getMap ()->getComm ();
3631 if (! comm.is_null ()) {
3632 int lclSuccess = 1;
3633 int gblSuccess = 1;
3634
3635 if (X_ret.is_null ()) {
3636 lclSuccess = 0;
3637 }
3638 reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3639 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3640 (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3641 "MultiVector; the return value of this method) is null on some MPI "
3642 "process in this MultiVector's communicator. This should never "
3643 "happen. Please report this bug to the Tpetra developers.");
3644 if (! X_ret.is_null () &&
3645 X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3646 lclSuccess = 0;
3647 }
3648 reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3649 outArg (gblSuccess));
3650 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3651 (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3652 "colRng.size(), on at least one MPI process in this MultiVector's "
3653 "communicator. This should never happen. "
3654 "Please report this bug to the Tpetra developers.");
3655 }
3656 }
3657 return X_ret;
3658 }
3659
3660
3661 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3662 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3664 subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3665 {
3667 return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3668 }
3669
3670
3671 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3672 Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3674 subViewNonConst (const Teuchos::Range1D &colRng)
3675 {
3677 return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3678 }
3679
3680
3681 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3684 const size_t j)
3685 : base_type (X.getMap ())
3686 {
3687 using Kokkos::subview;
3688 typedef std::pair<size_t, size_t> range_type;
3689 const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3690
3691 const size_t numCols = X.getNumVectors ();
3692 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3693 (j >= numCols, std::invalid_argument, "Input index j (== " << j
3694 << ") exceeds valid column index range [0, " << numCols << " - 1].");
3695 const size_t jj = X.isConstantStride () ?
3696 static_cast<size_t> (j) :
3697 static_cast<size_t> (X.whichVectors_[j]);
3698 this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3699
3700 // mfh 31 Jul 2017: It would be unwise to execute concurrent
3701 // Export or Import operations with different subviews of a
3702 // MultiVector. Thus, it is safe to reuse communication buffers.
3703 // See #1560 discussion.
3704 //
3705 // We only need one column's worth of buffer for imports_ and
3706 // exports_. Taking subviews now ensures that their lengths will
3707 // be exactly what we need, so we won't have to resize them later.
3708 {
3709 const size_t newSize = X.imports_.extent (0) / numCols;
3710 const size_t offset = jj*newSize;
3711 auto device_view = subview (X.imports_.view_device(),
3712 range_type (offset, offset+newSize));
3713 auto host_view = subview (X.imports_.view_host(),
3714 range_type (offset, offset+newSize));
3715 this->imports_ = decltype(X.imports_)(device_view, host_view);
3716 }
3717 {
3718 const size_t newSize = X.exports_.extent (0) / numCols;
3719 const size_t offset = jj*newSize;
3720 auto device_view = subview (X.exports_.view_device(),
3721 range_type (offset, offset+newSize));
3722 auto host_view = subview (X.exports_.view_host(),
3723 range_type (offset, offset+newSize));
3724 this->exports_ = decltype(X.exports_)(device_view, host_view);
3725 }
3726 // These two DualViews already either have the right number of
3727 // entries, or zero entries. This means that we don't need to
3728 // resize them.
3731 }
3732
3733
3734 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3735 Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3737 getVector (const size_t j) const
3738 {
3740 return Teuchos::rcp (new V (*this, j));
3741 }
3742
3743
3744 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3745 Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3747 getVectorNonConst (const size_t j)
3748 {
3750 return Teuchos::rcp (new V (*this, j));
3751 }
3752
3753
3754 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3755 void
3757 get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3758 {
3759 using IST = impl_scalar_type;
3760 using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3761 Kokkos::HostSpace,
3762 Kokkos::MemoryUnmanaged>;
3763 const char tfecfFuncName[] = "get1dCopy: ";
3764
3765 const size_t numRows = this->getLocalLength ();
3766 const size_t numCols = this->getNumVectors ();
3767
3768 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3769 (LDA < numRows, std::runtime_error,
3770 "LDA = " << LDA << " < numRows = " << numRows << ".");
3771 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3772 (numRows > size_t (0) && numCols > size_t (0) &&
3773 size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3774 std::runtime_error,
3775 "A.size() = " << A.size () << ", but its size must be at least "
3776 << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3777
3778 const std::pair<size_t, size_t> rowRange (0, numRows);
3779 const std::pair<size_t, size_t> colRange (0, numCols);
3780
3781 input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3782 LDA, numCols);
3783 auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3784
3796 if (this->need_sync_host() && this->need_sync_device()) {
3799 throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3800 }
3801 else {
3802 const bool useHostView = view_.host_view_use_count() >= view_.device_view_use_count();
3803 if (this->isConstantStride ()) {
3804 if (useHostView) {
3805 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3806 // DEEP_COPY REVIEW - NOT TESTED
3807 Kokkos::deep_copy (A_view, srcView_host);
3808 } else {
3809 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3810 // DEEP_COPY REVIEW - NOT TESTED
3811 Kokkos::deep_copy (A_view, srcView_device);
3812 }
3813 }
3814 else {
3815 for (size_t j = 0; j < numCols; ++j) {
3816 const size_t srcCol = this->whichVectors_[j];
3817 auto dstColView = Kokkos::subview (A_view, rowRange, j);
3818
3819 if (useHostView) {
3820 auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3821 auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3822 // DEEP_COPY REVIEW - NOT TESTED
3823 Kokkos::deep_copy (dstColView, srcColView_host);
3824 } else {
3825 auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3826 auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3827 // DEEP_COPY REVIEW - NOT TESTED
3828 Kokkos::deep_copy (dstColView, srcColView_device);
3829 }
3830 }
3831 }
3832 }
3833 }
3834
3835
3836 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3837 void
3839 get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3840 {
3842 const char tfecfFuncName[] = "get2dCopy: ";
3843 const size_t numRows = this->getLocalLength ();
3844 const size_t numCols = this->getNumVectors ();
3845
3846 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3847 static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3848 std::runtime_error, "Input array of pointers must contain as many "
3849 "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3850 << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3851
3852 if (numRows != 0 && numCols != 0) {
3853 // No side effects until we've validated the input.
3854 for (size_t j = 0; j < numCols; ++j) {
3855 const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3856 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3857 dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3858 "the input array of arrays is not long enough to fit all entries in "
3859 "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3860 << " < getLocalLength() = " << numRows << ".");
3861 }
3862
3863 // We've validated the input, so it's safe to start copying.
3864 for (size_t j = 0; j < numCols; ++j) {
3865 Teuchos::RCP<const V> X_j = this->getVector (j);
3866 const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3867 X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3868 }
3869 }
3870 }
3871
3872
3873 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3874 Teuchos::ArrayRCP<const Scalar>
3876 get1dView () const
3877 {
3878 if (getLocalLength () == 0 || getNumVectors () == 0) {
3879 return Teuchos::null;
3880 } else {
3881 TEUCHOS_TEST_FOR_EXCEPTION(
3882 ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3883 "get1dView: This MultiVector does not have constant stride, so it is "
3884 "not possible to view its data as a single array. You may check "
3885 "whether a MultiVector has constant stride by calling "
3886 "isConstantStride().");
3887 // Since get1dView() is and was always marked const, I have to
3888 // cast away const here in order not to break backwards
3889 // compatibility.
3890 auto X_lcl = getLocalViewHost(Access::ReadOnly);
3891 Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3892 Kokkos::Compat::persistingView (X_lcl);
3893 return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3894 }
3895 }
3896
3897 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3898 Teuchos::ArrayRCP<Scalar>
3901 {
3902 if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3903 return Teuchos::null;
3904 }
3905 else {
3906 TEUCHOS_TEST_FOR_EXCEPTION
3907 (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3908 "get1dViewNonConst: This MultiVector does not have constant stride, "
3909 "so it is not possible to view its data as a single array. You may "
3910 "check whether a MultiVector has constant stride by calling "
3911 "isConstantStride().");
3912 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3913 Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3914 Kokkos::Compat::persistingView (X_lcl);
3915 return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3916 }
3917 }
3918
3919 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3920 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3923 {
3924 auto X_lcl = getLocalViewHost(Access::ReadWrite);
3925
3926 // Don't use the row range here on the outside, in order to avoid
3927 // a strided return type (in case Kokkos::subview is conservative
3928 // about that). Instead, use the row range for the column views
3929 // in the loop.
3930 const size_t myNumRows = this->getLocalLength ();
3931 const size_t numCols = this->getNumVectors ();
3932 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3933
3934 Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3935 for (size_t j = 0; j < numCols; ++j) {
3936 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3937 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3938 Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3939 Kokkos::Compat::persistingView (X_lcl_j);
3940 views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3941 }
3942 return views;
3943 }
3944
3945 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3948 getLocalViewHost(Access::ReadOnlyStruct s) const
3949 {
3950 return view_.getHostView(s);
3951 }
3952
3953 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3956 getLocalViewHost(Access::ReadWriteStruct s)
3957 {
3958 return view_.getHostView(s);
3959 }
3960
3961 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3964 getLocalViewHost(Access::OverwriteAllStruct s)
3965 {
3966 return view_.getHostView(s);
3967 }
3968
3969 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3972 getLocalViewDevice(Access::ReadOnlyStruct s) const
3973 {
3974 return view_.getDeviceView(s);
3975 }
3976
3977 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3980 getLocalViewDevice(Access::ReadWriteStruct s)
3981 {
3982 return view_.getDeviceView(s);
3983 }
3984
3985 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3988 getLocalViewDevice(Access::OverwriteAllStruct s)
3989 {
3990 return view_.getDeviceView(s);
3991 }
3992
3993 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3994 typename MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>::wrapped_dual_view_type
3999
4000 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4001 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
4003 get2dView () const
4004 {
4005 // Since get2dView() is and was always marked const, I have to
4006 // cast away const here in order not to break backwards
4007 // compatibility.
4008 auto X_lcl = getLocalViewHost(Access::ReadOnly);
4009
4010 // Don't use the row range here on the outside, in order to avoid
4011 // a strided return type (in case Kokkos::subview is conservative
4012 // about that). Instead, use the row range for the column views
4013 // in the loop.
4014 const size_t myNumRows = this->getLocalLength ();
4015 const size_t numCols = this->getNumVectors ();
4016 const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
4017
4018 Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
4019 for (size_t j = 0; j < numCols; ++j) {
4020 const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
4021 auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
4022 Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
4023 Kokkos::Compat::persistingView (X_lcl_j);
4024 views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
4025 }
4026 return views;
4027 }
4028
4029 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4030 void
4032 multiply (Teuchos::ETransp transA,
4033 Teuchos::ETransp transB,
4034 const Scalar& alpha,
4037 const Scalar& beta)
4038 {
4040 using Teuchos::CONJ_TRANS;
4041 using Teuchos::NO_TRANS;
4042 using Teuchos::TRANS;
4043 using Teuchos::RCP;
4044 using Teuchos::rcp;
4045 using Teuchos::rcpFromRef;
4046 using std::endl;
4047 using ATS = Kokkos::ArithTraits<impl_scalar_type>;
4048 using LO = local_ordinal_type;
4049 using STS = Teuchos::ScalarTraits<Scalar>;
4051 const char tfecfFuncName[] = "multiply: ";
4052 ProfilingRegion region ("Tpetra::MV::multiply");
4053
4054 // This routine performs a variety of matrix-matrix multiply
4055 // operations, interpreting the MultiVector (this-aka C , A and B)
4056 // as 2D matrices. Variations are due to the fact that A, B and C
4057 // can be locally replicated or globally distributed MultiVectors
4058 // and that we may or may not operate with the transpose of A and
4059 // B. Possible cases are:
4060 //
4061 // Operations # Cases Notes
4062 // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
4063 // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
4064 // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
4065 //
4066 // The following operations are not meaningful for 1-D
4067 // distributions:
4068 //
4069 // u1) C(local) = A^T(distr) * B^T(distr) 1
4070 // u2) C(local) = A (distr) * B^X(distr) 2
4071 // u3) C(distr) = A^X(local) * B^X(local) 4
4072 // u4) C(distr) = A^X(local) * B^X(distr) 4
4073 // u5) C(distr) = A^T(distr) * B^X(local) 2
4074 // u6) C(local) = A^X(distr) * B^X(local) 4
4075 // u7) C(distr) = A^X(distr) * B^X(local) 4
4076 // u8) C(local) = A^X(local) * B^X(distr) 4
4077 //
4078 // Total number of cases: 32 (= 2^5).
4079
4080 impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4081
4082 const bool A_is_local = ! A.isDistributed ();
4083 const bool B_is_local = ! B.isDistributed ();
4084 const bool C_is_local = ! this->isDistributed ();
4085
4086 // In debug mode, check compatibility of local dimensions. We
4087 // only do this in debug mode, since it requires an all-reduce
4088 // to ensure correctness on all processses. It's entirely
4089 // possible that only some processes may have incompatible local
4090 // dimensions. Throwing an exception only on those processes
4091 // could cause this method to hang.
4092 const bool debug = ::Tpetra::Details::Behavior::debug ();
4093 if (debug) {
4094 auto myMap = this->getMap ();
4095 if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4096 using Teuchos::REDUCE_MIN;
4097 using Teuchos::reduceAll;
4098 using Teuchos::outArg;
4099
4100 auto comm = myMap->getComm ();
4101 const size_t A_nrows =
4102 (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4103 const size_t A_ncols =
4104 (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4105 const size_t B_nrows =
4106 (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4107 const size_t B_ncols =
4108 (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4109
4110 const bool lclBad = this->getLocalLength () != A_nrows ||
4111 this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4112
4113 const int myRank = comm->getRank ();
4114 std::ostringstream errStrm;
4115 if (this->getLocalLength () != A_nrows) {
4116 errStrm << "Proc " << myRank << ": this->getLocalLength()="
4117 << this->getLocalLength () << " != A_nrows=" << A_nrows
4118 << "." << std::endl;
4119 }
4120 if (this->getNumVectors () != B_ncols) {
4121 errStrm << "Proc " << myRank << ": this->getNumVectors()="
4122 << this->getNumVectors () << " != B_ncols=" << B_ncols
4123 << "." << std::endl;
4124 }
4125 if (A_ncols != B_nrows) {
4126 errStrm << "Proc " << myRank << ": A_ncols="
4127 << A_ncols << " != B_nrows=" << B_nrows
4128 << "." << std::endl;
4129 }
4130
4131 const int lclGood = lclBad ? 0 : 1;
4132 int gblGood = 0;
4133 reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4134 if (gblGood != 1) {
4135 std::ostringstream os;
4136 ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4137
4138 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4139 (true, std::runtime_error, "Inconsistent local dimensions on at "
4140 "least one process in this object's communicator." << std::endl
4141 << "Operation: "
4142 << "C(" << (C_is_local ? "local" : "distr") << ") = "
4143 << alpha << "*A"
4144 << (transA == Teuchos::TRANS ? "^T" :
4145 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4146 << "(" << (A_is_local ? "local" : "distr") << ") * "
4147 << beta << "*B"
4148 << (transA == Teuchos::TRANS ? "^T" :
4149 (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4150 << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4151 << "Global dimensions: C(" << this->getGlobalLength () << ", "
4152 << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4153 << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4154 << ", " << B.getNumVectors () << ")." << std::endl
4155 << os.str ());
4156 }
4157 }
4158 }
4159
4160 // Case 1: C(local) = A^X(local) * B^X(local)
4161 const bool Case1 = C_is_local && A_is_local && B_is_local;
4162 // Case 2: C(local) = A^T(distr) * B (distr)
4163 const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4164 transA != NO_TRANS &&
4165 transB == NO_TRANS;
4166 // Case 3: C(distr) = A (distr) * B^X(local)
4167 const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4168 transA == NO_TRANS;
4169
4170 // Test that we are considering a meaningful case
4171 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4172 (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4173 "Multiplication of op(A) and op(B) into *this is not a "
4174 "supported use case.");
4175
4176 if (beta != STS::zero () && Case2) {
4177 // If Case2, then C is local and contributions must be summed
4178 // across all processes. However, if beta != 0, then accumulate
4179 // beta*C into the sum. When summing across all processes, we
4180 // only want to accumulate this once, so set beta == 0 on all
4181 // processes except Process 0.
4182 const int myRank = this->getMap ()->getComm ()->getRank ();
4183 if (myRank != 0) {
4184 beta_local = ATS::zero ();
4185 }
4186 }
4187
4188 // We only know how to do matrix-matrix multiplies if all the
4189 // MultiVectors have constant stride. If not, we have to make
4190 // temporary copies of those MultiVectors (including possibly
4191 // *this) that don't have constant stride.
4192 RCP<MV> C_tmp;
4193 if (! isConstantStride ()) {
4194 C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4195 } else {
4196 C_tmp = rcp (this, false);
4197 }
4198
4199 RCP<const MV> A_tmp;
4200 if (! A.isConstantStride ()) {
4201 A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4202 } else {
4203 A_tmp = rcpFromRef (A);
4204 }
4205
4206 RCP<const MV> B_tmp;
4207 if (! B.isConstantStride ()) {
4208 B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4209 } else {
4210 B_tmp = rcpFromRef (B);
4211 }
4212
4213 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4214 (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4215 ! A_tmp->isConstantStride (), std::logic_error,
4216 "Failed to make temporary constant-stride copies of MultiVectors.");
4217
4218 {
4219 const LO A_lclNumRows = A_tmp->getLocalLength ();
4220 const LO A_numVecs = A_tmp->getNumVectors ();
4221 auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4222 auto A_sub = Kokkos::subview (A_lcl,
4223 std::make_pair (LO (0), A_lclNumRows),
4224 std::make_pair (LO (0), A_numVecs));
4225
4226
4227 const LO B_lclNumRows = B_tmp->getLocalLength ();
4228 const LO B_numVecs = B_tmp->getNumVectors ();
4229 auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4230 auto B_sub = Kokkos::subview (B_lcl,
4231 std::make_pair (LO (0), B_lclNumRows),
4232 std::make_pair (LO (0), B_numVecs));
4233
4234 const LO C_lclNumRows = C_tmp->getLocalLength ();
4235 const LO C_numVecs = C_tmp->getNumVectors ();
4236
4237 auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4238 auto C_sub = Kokkos::subview (C_lcl,
4239 std::make_pair (LO (0), C_lclNumRows),
4240 std::make_pair (LO (0), C_numVecs));
4241 const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4242 (transA == Teuchos::TRANS ? 'T' : 'C'));
4243 const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4244 (transB == Teuchos::TRANS ? 'T' : 'C'));
4245 const impl_scalar_type alpha_IST (alpha);
4246
4247 ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4248
4249 KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4250 beta_local, C_sub);
4251 }
4252
4253 if (! isConstantStride ()) {
4254 ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4255 }
4256
4257 // Dispose of (possibly) extra copies of A and B.
4258 A_tmp = Teuchos::null;
4259 B_tmp = Teuchos::null;
4260
4261 // If Case 2 then sum up *this and distribute it to all processes.
4262 if (Case2) {
4263 this->reduce ();
4264 }
4265 }
4266
4267 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4268 void
4270 elementWiseMultiply (Scalar scalarAB,
4273 Scalar scalarThis)
4274 {
4275 using Kokkos::ALL;
4276 using Kokkos::subview;
4277 const char tfecfFuncName[] = "elementWiseMultiply: ";
4278
4279 const size_t lclNumRows = this->getLocalLength ();
4280 const size_t numVecs = this->getNumVectors ();
4281
4282 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4283 (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4284 std::runtime_error, "MultiVectors do not have the same local length.");
4285 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4286 numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4287 "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4288 << ".");
4289
4290 auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4291 auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4292 auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4293
4294 if (isConstantStride () && B.isConstantStride ()) {
4295 // A is just a Vector; it only has one column, so it always has
4296 // constant stride.
4297 //
4298 // If both *this and B have constant stride, we can do an
4299 // element-wise multiply on all columns at once.
4300 KokkosBlas::mult (scalarThis,
4301 this_view,
4302 scalarAB,
4303 subview (A_view, ALL (), 0),
4304 B_view);
4305 }
4306 else {
4307 for (size_t j = 0; j < numVecs; ++j) {
4308 const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4309 const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4310 KokkosBlas::mult (scalarThis,
4311 subview (this_view, ALL (), C_col),
4312 scalarAB,
4313 subview (A_view, ALL (), 0),
4314 subview (B_view, ALL (), B_col));
4315 }
4316 }
4317 }
4318
4319 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4320 void
4322 reduce ()
4323 {
4326 ProfilingRegion region ("Tpetra::MV::reduce");
4327
4328 const auto map = this->getMap ();
4329 if (map.get () == nullptr) {
4330 return;
4331 }
4332 const auto comm = map->getComm ();
4333 if (comm.get () == nullptr) {
4334 return;
4335 }
4336
4337 // Avoid giving device buffers to MPI. Even if MPI handles them
4338 // correctly, doing so may not perform well.
4339 const bool changed_on_device = this->need_sync_host ();
4340 if (changed_on_device) {
4341 // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4342 // and host will be separate allocations. In that case, it may
4343 // pay to do the all-reduce from device to host.
4344 Kokkos::fence("MultiVector::reduce"); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4345 auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4346 allReduceView (X_lcl, X_lcl, *comm);
4347 }
4348 else {
4349 auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4350 allReduceView (X_lcl, X_lcl, *comm);
4351 }
4352 }
4353
4354 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4355 void
4357 replaceLocalValue (const LocalOrdinal lclRow,
4358 const size_t col,
4359 const impl_scalar_type& ScalarValue)
4360 {
4362 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4363 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4364 TEUCHOS_TEST_FOR_EXCEPTION(
4365 lclRow < minLocalIndex || lclRow > maxLocalIndex,
4366 std::runtime_error,
4367 "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4368 << " is invalid. The range of valid row indices on this process "
4369 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4370 << ", " << maxLocalIndex << "].");
4371 TEUCHOS_TEST_FOR_EXCEPTION(
4372 vectorIndexOutOfRange(col),
4373 std::runtime_error,
4374 "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4375 << " of the multivector is invalid.");
4376 }
4377
4378 auto view = this->getLocalViewHost(Access::ReadWrite);
4379 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4380 view (lclRow, colInd) = ScalarValue;
4381 }
4382
4383
4384 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4385 void
4387 sumIntoLocalValue (const LocalOrdinal lclRow,
4388 const size_t col,
4389 const impl_scalar_type& value,
4390 const bool atomic)
4391 {
4393 const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4394 const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4395 TEUCHOS_TEST_FOR_EXCEPTION(
4396 lclRow < minLocalIndex || lclRow > maxLocalIndex,
4397 std::runtime_error,
4398 "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4399 << " is invalid. The range of valid row indices on this process "
4400 << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4401 << ", " << maxLocalIndex << "].");
4402 TEUCHOS_TEST_FOR_EXCEPTION(
4403 vectorIndexOutOfRange(col),
4404 std::runtime_error,
4405 "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4406 << " of the multivector is invalid.");
4407 }
4408
4409 const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4410
4411 auto view = this->getLocalViewHost(Access::ReadWrite);
4412 if (atomic) {
4413 Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4414 }
4415 else {
4416 view(lclRow, colInd) += value;
4417 }
4418 }
4419
4420
4421 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4422 void
4424 replaceGlobalValue (const GlobalOrdinal gblRow,
4425 const size_t col,
4426 const impl_scalar_type& ScalarValue)
4427 {
4428 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4429 // touches the RCP's reference count, which isn't thread safe.
4430 const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4431
4433 const char tfecfFuncName[] = "replaceGlobalValue: ";
4434 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4435 (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4436 std::runtime_error,
4437 "Global row index " << gblRow << " is not present on this process "
4438 << this->getMap ()->getComm ()->getRank () << ".");
4439 TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4440 (this->vectorIndexOutOfRange (col), std::runtime_error,
4441 "Vector index " << col << " of the MultiVector is invalid.");
4442 }
4443
4444 this->replaceLocalValue (lclRow, col, ScalarValue);
4445 }
4446
4447 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4448 void
4450 sumIntoGlobalValue (const GlobalOrdinal globalRow,
4451 const size_t col,
4452 const impl_scalar_type& value,
4453 const bool atomic)
4454 {
4455 // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4456 // touches the RCP's reference count, which isn't thread safe.
4457 const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4458
4460 TEUCHOS_TEST_FOR_EXCEPTION(
4461 lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4462 std::runtime_error,
4463 "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4464 << " is not present on this process "
4465 << this->getMap ()->getComm ()->getRank () << ".");
4466 TEUCHOS_TEST_FOR_EXCEPTION(
4467 vectorIndexOutOfRange(col),
4468 std::runtime_error,
4469 "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4470 << " of the multivector is invalid.");
4471 }
4472
4473 this->sumIntoLocalValue (lclRow, col, value, atomic);
4474 }
4475
4476 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4477 template <class T>
4478 Teuchos::ArrayRCP<T>
4480 getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4481 size_t j) const
4482 {
4483 typedef Kokkos::DualView<impl_scalar_type*,
4484 typename dual_view_type::array_layout,
4485 execution_space> col_dual_view_type;
4486 const size_t col = isConstantStride () ? j : whichVectors_[j];
4487 col_dual_view_type X_col =
4488 Kokkos::subview (view_, Kokkos::ALL (), col);
4489 return Kokkos::Compat::persistingView (X_col.view_device());
4490 }
4491
4492 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4493 bool
4495 need_sync_host () const {
4496 return view_.need_sync_host ();
4497 }
4498
4499 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4500 bool
4502 need_sync_device () const {
4503 return view_.need_sync_device ();
4504 }
4505
4506 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4507 std::string
4509 descriptionImpl (const std::string& className) const
4510 {
4511 using Teuchos::TypeNameTraits;
4512
4513 std::ostringstream out;
4514 out << "\"" << className << "\": {";
4515 out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4516 << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4517 << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4518 << ", Node" << Node::name ()
4519 << "}, ";
4520 if (this->getObjectLabel () != "") {
4521 out << "Label: \"" << this->getObjectLabel () << "\", ";
4522 }
4523 out << ", numRows: " << this->getGlobalLength ();
4524 if (className != "Tpetra::Vector") {
4525 out << ", numCols: " << this->getNumVectors ()
4526 << ", isConstantStride: " << this->isConstantStride ();
4527 }
4528 if (this->isConstantStride ()) {
4529 out << ", columnStride: " << this->getStride ();
4530 }
4531 out << "}";
4532
4533 return out.str ();
4534 }
4535
4536 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4537 std::string
4539 description () const
4540 {
4541 return this->descriptionImpl ("Tpetra::MultiVector");
4542 }
4543
4544 namespace Details
4545 {
4546 template<typename ViewType>
4547 void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4548 {
4549 using std::endl;
4550 static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4551 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4552 static_assert(ViewType::rank == 2,
4553 "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4554 // The square braces [] and their contents are in Matlab
4555 // format, so users may copy and paste directly into Matlab.
4556 out << "Values("<<prefix<<"): " << std::endl
4557 << "[";
4558 const size_t numRows = v.extent(0);
4559 const size_t numCols = v.extent(1);
4560 if (numCols == 1) {
4561 for (size_t i = 0; i < numRows; ++i) {
4562 out << v(i,0);
4563 if (i + 1 < numRows) {
4564 out << "; ";
4565 }
4566 }
4567 }
4568 else {
4569 for (size_t i = 0; i < numRows; ++i) {
4570 for (size_t j = 0; j < numCols; ++j) {
4571 out << v(i,j);
4572 if (j + 1 < numCols) {
4573 out << ", ";
4574 }
4575 }
4576 if (i + 1 < numRows) {
4577 out << "; ";
4578 }
4579 }
4580 }
4581 out << "]" << endl;
4582 }
4583 }
4584
4585 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4586 std::string
4588 localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4589 {
4590 typedef LocalOrdinal LO;
4591 using std::endl;
4592
4593 if (vl <= Teuchos::VERB_LOW) {
4594 return std::string ();
4595 }
4596 auto map = this->getMap ();
4597 if (map.is_null ()) {
4598 return std::string ();
4599 }
4600 auto outStringP = Teuchos::rcp (new std::ostringstream ());
4601 auto outp = Teuchos::getFancyOStream (outStringP);
4602 Teuchos::FancyOStream& out = *outp;
4603 auto comm = map->getComm ();
4604 const int myRank = comm->getRank ();
4605 const int numProcs = comm->getSize ();
4606
4607 out << "Process " << myRank << " of " << numProcs << ":" << endl;
4608 Teuchos::OSTab tab1 (out);
4609
4610 // VERB_MEDIUM and higher prints getLocalLength()
4611 const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4612 out << "Local number of rows: " << lclNumRows << endl;
4613
4614 if (vl > Teuchos::VERB_MEDIUM) {
4615 // VERB_HIGH and higher prints isConstantStride() and getStride().
4616 // The first is only relevant if the Vector has multiple columns.
4617 if (this->getNumVectors () != static_cast<size_t> (1)) {
4618 out << "isConstantStride: " << this->isConstantStride () << endl;
4619 }
4620 if (this->isConstantStride ()) {
4621 out << "Column stride: " << this->getStride () << endl;
4622 }
4623
4624 if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4625 // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4627 // so we can't use our regular accessor functins
4628
4629 // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4630 // to trigger (since this function is conceptually const). Thus, we
4631 // get *copies* of the view's data instead.
4632 auto X_dev = view_.getDeviceCopy();
4633 auto X_host = view_.getHostCopy();
4634
4635 if(X_dev.data() == X_host.data()) {
4636 // One single allocation
4637 Details::print_vector(out,"unified",X_host);
4638 }
4639 else {
4640 Details::print_vector(out,"host",X_host);
4641 Details::print_vector(out,"dev",X_dev);
4642 }
4643 }
4644 }
4645 out.flush (); // make sure the ostringstream got everything
4646 return outStringP->str ();
4647 }
4648
4649 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4650 void
4652 describeImpl (Teuchos::FancyOStream& out,
4653 const std::string& className,
4654 const Teuchos::EVerbosityLevel verbLevel) const
4655 {
4656 using Teuchos::TypeNameTraits;
4657 using Teuchos::VERB_DEFAULT;
4658 using Teuchos::VERB_NONE;
4659 using Teuchos::VERB_LOW;
4660 using std::endl;
4661 const Teuchos::EVerbosityLevel vl =
4662 (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4663
4664 if (vl == VERB_NONE) {
4665 return; // don't print anything
4666 }
4667 // If this Vector's Comm is null, then the Vector does not
4668 // participate in collective operations with the other processes.
4669 // In that case, it is not even legal to call this method. The
4670 // reasonable thing to do in that case is nothing.
4671 auto map = this->getMap ();
4672 if (map.is_null ()) {
4673 return;
4674 }
4675 auto comm = map->getComm ();
4676 if (comm.is_null ()) {
4677 return;
4678 }
4679
4680 const int myRank = comm->getRank ();
4681
4682 // Only Process 0 should touch the output stream, but this method
4683 // in general may need to do communication. Thus, we may need to
4684 // preserve the current tab level across multiple "if (myRank ==
4685 // 0) { ... }" inner scopes. This is why we sometimes create
4686 // OSTab instances by pointer, instead of by value. We only need
4687 // to create them by pointer if the tab level must persist through
4688 // multiple inner scopes.
4689 Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4690
4691 // VERB_LOW and higher prints the equivalent of description().
4692 if (myRank == 0) {
4693 tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4694 out << "\"" << className << "\":" << endl;
4695 tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4696 {
4697 out << "Template parameters:" << endl;
4698 Teuchos::OSTab tab2 (out);
4699 out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4700 << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4701 << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4702 << "Node: " << Node::name () << endl;
4703 }
4704 if (this->getObjectLabel () != "") {
4705 out << "Label: \"" << this->getObjectLabel () << "\", ";
4706 }
4707 out << "Global number of rows: " << this->getGlobalLength () << endl;
4708 if (className != "Tpetra::Vector") {
4709 out << "Number of columns: " << this->getNumVectors () << endl;
4710 }
4711 // getStride() may differ on different processes, so it (and
4712 // isConstantStride()) properly belong to per-process data.
4713 }
4714
4715 // This is collective over the Map's communicator.
4716 if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4717 const std::string lclStr = this->localDescribeToString (vl);
4718 ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4719 }
4720 }
4721
4722 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4723 void
4725 describe (Teuchos::FancyOStream &out,
4726 const Teuchos::EVerbosityLevel verbLevel) const
4727 {
4728 this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4729 }
4730
4731 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4732 void
4734 removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4735 {
4736 replaceMap (newMap);
4737 }
4738
4739 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4740 void
4743 {
4745 const char prefix[] = "Tpetra::MultiVector::assign: ";
4746
4747 TEUCHOS_TEST_FOR_EXCEPTION
4748 (this->getGlobalLength () != src.getGlobalLength () ||
4749 this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4750 prefix << "Global dimensions of the two Tpetra::MultiVector "
4751 "objects do not match. src has dimensions [" << src.getGlobalLength ()
4752 << "," << src.getNumVectors () << "], and *this has dimensions ["
4753 << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4754 // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4755 TEUCHOS_TEST_FOR_EXCEPTION
4756 (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4757 prefix << "The local row counts of the two Tpetra::MultiVector "
4758 "objects do not match. src has " << src.getLocalLength () << " row(s) "
4759 << " and *this has " << this->getLocalLength () << " row(s).");
4760
4761
4762 // See #1510. We're writing to *this, so we don't care about its
4763 // contents in either memory space, and we don't want
4764 // DualView::modify to complain about "concurrent modification" of
4765 // host and device Views.
4766
4770
4771 // If need sync to device, then host has most recent version.
4772 const bool src_last_updated_on_host = src.need_sync_device ();
4773
4774 if (src_last_updated_on_host) {
4775 localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4776 src.getLocalViewHost(Access::ReadOnly),
4777 this->isConstantStride (),
4778 src.isConstantStride (),
4779 this->whichVectors_ (),
4780 src.whichVectors_ ());
4781 }
4782 else {
4783 localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4784 src.getLocalViewDevice(Access::ReadOnly),
4785 this->isConstantStride (),
4786 src.isConstantStride (),
4787 this->whichVectors_ (),
4788 src.whichVectors_ ());
4789 }
4790 }
4791
4792 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4793 template<class T>
4794 Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4796 convert () const
4797 {
4798 using Teuchos::RCP;
4799
4800 auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4801 this->getNumVectors(),
4802 /*zeroOut=*/false));
4803 Tpetra::deep_copy(*newMV, *this);
4804 return newMV;
4805 }
4806
4807 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4808 bool
4811 {
4813
4814 const size_t l1 = this->getLocalLength();
4815 const size_t l2 = vec.getLocalLength();
4816 if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4817 return false;
4818 }
4819
4820 return true;
4821 }
4822
4823 template <class ST, class LO, class GO, class NT>
4826 std::swap(mv.map_, this->map_);
4827 std::swap(mv.view_, this->view_);
4828 std::swap(mv.whichVectors_, this->whichVectors_);
4829 }
4830
4831#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4832 template <class ST, class LO, class GO, class NT>
4833 void
4835 const Teuchos::SerialDenseMatrix<int, ST>& src)
4836 {
4838 using MV = MultiVector<ST, LO, GO, NT>;
4839 using IST = typename MV::impl_scalar_type;
4840 using input_view_type =
4841 Kokkos::View<const IST**, Kokkos::LayoutLeft,
4842 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4843 using pair_type = std::pair<LO, LO>;
4844
4845 const LO numRows = static_cast<LO> (src.numRows ());
4846 const LO numCols = static_cast<LO> (src.numCols ());
4847
4848 TEUCHOS_TEST_FOR_EXCEPTION
4849 (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4850 numCols != static_cast<LO> (dst.getNumVectors ()),
4851 std::invalid_argument, "Tpetra::deep_copy: On Process "
4852 << dst.getMap ()->getComm ()->getRank () << ", dst is "
4853 << dst.getLocalLength () << " x " << dst.getNumVectors ()
4854 << ", but src is " << numRows << " x " << numCols << ".");
4855
4856 const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4857 input_view_type src_orig (src_raw, src.stride (), numCols);
4858 input_view_type src_view =
4859 Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4860
4861 constexpr bool src_isConstantStride = true;
4862 Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4863 localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4864 src_view,
4865 dst.isConstantStride (),
4866 src_isConstantStride,
4867 getMultiVectorWhichVectors (dst),
4868 srcWhichVectors);
4869 }
4870
4871 template <class ST, class LO, class GO, class NT>
4872 void
4873 deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4874 const MultiVector<ST, LO, GO, NT>& src)
4875 {
4877 using MV = MultiVector<ST, LO, GO, NT>;
4878 using IST = typename MV::impl_scalar_type;
4879 using output_view_type =
4880 Kokkos::View<IST**, Kokkos::LayoutLeft,
4881 Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4882 using pair_type = std::pair<LO, LO>;
4883
4884 const LO numRows = static_cast<LO> (dst.numRows ());
4885 const LO numCols = static_cast<LO> (dst.numCols ());
4886
4887 TEUCHOS_TEST_FOR_EXCEPTION
4888 (numRows != static_cast<LO> (src.getLocalLength ()) ||
4889 numCols != static_cast<LO> (src.getNumVectors ()),
4890 std::invalid_argument, "Tpetra::deep_copy: On Process "
4891 << src.getMap ()->getComm ()->getRank () << ", src is "
4892 << src.getLocalLength () << " x " << src.getNumVectors ()
4893 << ", but dst is " << numRows << " x " << numCols << ".");
4894
4895 IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4896 output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4897 auto dst_view =
4898 Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4899
4900 constexpr bool dst_isConstantStride = true;
4901 Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4902
4903 // Prefer the host version of src's data.
4904 localDeepCopy (dst_view,
4905 src.getLocalViewHost(Access::ReadOnly),
4906 dst_isConstantStride,
4907 src.isConstantStride (),
4908 dstWhichVectors,
4909 getMultiVectorWhichVectors (src));
4910 }
4911#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4912
4913 template <class Scalar, class LO, class GO, class NT>
4914 Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4915 createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4916 size_t numVectors)
4917 {
4919 return Teuchos::rcp (new MV (map, numVectors));
4920 }
4921
4922 template <class ST, class LO, class GO, class NT>
4925 {
4926 typedef MultiVector<ST, LO, GO, NT> MV;
4927 MV cpy (src.getMap (), src.getNumVectors (), false);
4928 cpy.assign (src);
4929 return cpy;
4930 }
4931
4932} // namespace Tpetra
4933
4934//
4935// Explicit instantiation macro
4936//
4937// Must be expanded from within the Tpetra namespace!
4938//
4939
4940#ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4941# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4942 template class MultiVector< SCALAR , LO , GO , NODE >; \
4943 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4944 template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4945 template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4946 template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4947
4948#else
4949# define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4950 template class MultiVector< SCALAR , LO , GO , NODE >; \
4951 template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4952
4953#endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4954
4955
4956#define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4957 \
4958 template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4959 MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4960
4961
4962#endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void fill(const ExecutionSpace &execSpace, const ViewType &X, const ValueType &alpha, const IndexType numRows, const IndexType numCols)
Fill the entries of the given 1-D or 2-D Kokkos::View with the given scalar value alpha.
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
Description of Tpetra's behavior.
static bool assumeMpiIsGPUAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT) override
Reallocate imports_ if needed.
void reduce()
Sum values of a locally replicated multivector across all processes.
virtual bool checkSizes(const SrcDistObject &sourceObj) override
Whether data redistribution between sourceObj and this object is legal.
void randomize()
Set all values in the multivector to pseudorandom numbers.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Map and their communicator.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
wrapped_dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM, const execution_space &space) override
Same as copyAndPermute, but do operations in space.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
Map< LocalOrdinal, GlobalOrdinal, Node > map_type
The type of the Map specialization used by this class.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
virtual size_t constantNumberOfPackets() const override
Number of packets to send per LID.
wrapped_dual_view_type getWrappedDualView() const
Return the wrapped dual view holding this MultiVector's local data.
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this).
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print the object with the given verbosity level to a FancyOStream.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
virtual std::string description() const override
A simple one-line description of this object.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
base_type::impl_scalar_type impl_scalar_type
base_type::wrapped_dual_view_type wrapped_dual_view_type
Implementation details of Tpetra.
Nonmember function that computes a residual Computes R = B - A * X.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
void localDeepCopy(const DstViewType &dst, const SrcViewType &src, const bool dstConstStride, const bool srcConstStride, const DstWhichVecsType &dstWhichVecs, const SrcWhichVecsType &srcWhichVecs)
Implementation of Tpetra::MultiVector deep copy of local data.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes).
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.