Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_normImpl.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_NORMIMPL_HPP
11#define TPETRA_DETAILS_NORMIMPL_HPP
12
21
22#include "TpetraCore_config.h"
23#include "Kokkos_Core.hpp"
24#include "Teuchos_ArrayView.hpp"
25#include "Teuchos_CommHelpers.hpp"
26#include "KokkosBlas.hpp"
27#include "Kokkos_ArithTraits.hpp"
28
29#ifndef DOXYGEN_SHOULD_SKIP_THIS
30namespace Teuchos {
31 template<class T>
32 class ArrayView; // forward declaration
33 template<class OrdinalType>
34 class Comm; // forward declaration
35}
36#endif // DOXYGEN_SHOULD_SKIP_THIS
37
39// Declarations start here
41
42namespace Tpetra {
43namespace Details {
44
47 NORM_ONE, //<! Use the one-norm
48 NORM_TWO, //<! Use the two-norm
49 NORM_INF //<! Use the infinity-norm
50};
51
53template <class ValueType,
54 class ArrayLayout,
55 class DeviceType,
56 class MagnitudeType>
57void
58normImpl (MagnitudeType norms[],
59 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
60 const EWhichNorm whichNorm,
61 const Teuchos::ArrayView<const size_t>& whichVecs,
62 const bool isConstantStride,
63 const bool isDistributed,
64 const Teuchos::Comm<int>* comm);
65
66} // namespace Details
67} // namespace Tpetra
68
70// Definitions start here
72
73namespace Tpetra {
74namespace Details {
75namespace Impl {
76
77template<class RV, class XMV>
78void
79lclNormImpl (const RV& normsOut,
80 const XMV& X,
81 const size_t numVecs,
82 const Teuchos::ArrayView<const size_t>& whichVecs,
83 const bool constantStride,
84 const EWhichNorm whichNorm)
85{
86 using Kokkos::ALL;
87 using Kokkos::subview;
88 using mag_type = typename RV::non_const_value_type;
89
90 static_assert (static_cast<int> (RV::rank) == 1,
91 "Tpetra::MultiVector::lclNormImpl: "
92 "The first argument normsOut must have rank 1.");
93 static_assert (Kokkos::is_view<XMV>::value,
94 "Tpetra::MultiVector::lclNormImpl: "
95 "The second argument X is not a Kokkos::View.");
96 static_assert (static_cast<int> (XMV::rank) == 2,
97 "Tpetra::MultiVector::lclNormImpl: "
98 "The second argument X must have rank 2.");
99
100 const size_t lclNumRows = static_cast<size_t> (X.extent (0));
101 TEUCHOS_TEST_FOR_EXCEPTION
102 (lclNumRows != 0 && constantStride &&
103 static_cast<size_t> (X.extent (1)) != numVecs,
104 std::logic_error, "Constant Stride X's dimensions are " << X.extent (0)
105 << " x " << X.extent (1) << ", which differ from the local dimensions "
106 << lclNumRows << " x " << numVecs << ". Please report this bug to "
107 "the Tpetra developers.");
108 TEUCHOS_TEST_FOR_EXCEPTION
109 (lclNumRows != 0 && ! constantStride &&
110 static_cast<size_t> (X.extent (1)) < numVecs,
111 std::logic_error, "Strided X's dimensions are " << X.extent (0) << " x "
112 << X.extent (1) << ", which are incompatible with the local dimensions "
113 << lclNumRows << " x " << numVecs << ". Please report this bug to "
114 "the Tpetra developers.");
115
116 if (lclNumRows == 0) {
117 const mag_type zeroMag = Kokkos::ArithTraits<mag_type>::zero ();
118 // DEEP_COPY REVIEW - VALUE-TO-DEVICE
119 using execution_space = typename RV::execution_space;
120 Kokkos::deep_copy (execution_space(), normsOut, zeroMag);
121 }
122 else { // lclNumRows != 0
123 if (constantStride) {
124 if (whichNorm == NORM_INF) {
125 KokkosBlas::nrminf (normsOut, X);
126 }
127 else if (whichNorm == NORM_ONE) {
128 KokkosBlas::nrm1 (normsOut, X);
129 }
130 else if (whichNorm == NORM_TWO) {
131 KokkosBlas::nrm2_squared (normsOut, X);
132 }
133 else {
134 TEUCHOS_TEST_FOR_EXCEPTION
135 (true, std::logic_error, "Should never get here!");
136 }
137 }
138 else { // not constant stride
139 // NOTE (mfh 15 Jul 2014, 11 Apr 2019) This does a kernel launch
140 // for every column. It might be better to have a kernel that
141 // does the work all at once. On the other hand, we don't
142 // prioritize performance of MultiVector views of noncontiguous
143 // columns.
144 for (size_t k = 0; k < numVecs; ++k) {
145 const size_t X_col = constantStride ? k : whichVecs[k];
146 if (whichNorm == NORM_INF) {
147 KokkosBlas::nrminf (subview (normsOut, k),
148 subview (X, ALL (), X_col));
149 }
150 else if (whichNorm == NORM_ONE) {
151 KokkosBlas::nrm1 (subview (normsOut, k),
152 subview (X, ALL (), X_col));
153 }
154 else if (whichNorm == NORM_TWO) {
155 KokkosBlas::nrm2_squared (subview (normsOut, k),
156 subview (X, ALL (), X_col));
157 }
158 else {
159 TEUCHOS_TEST_FOR_EXCEPTION
160 (true, std::logic_error, "Should never get here!");
161 }
162 } // for each column
163 } // constantStride
164 } // lclNumRows != 0
165}
166
167// Kokkos::parallel_for functor for applying square root to each
168// entry of a 1-D Kokkos::View.
169template<class ViewType>
170class SquareRootFunctor {
171public:
172 typedef typename ViewType::execution_space execution_space;
173 typedef typename ViewType::size_type size_type;
174
175 SquareRootFunctor (const ViewType& theView) :
176 theView_ (theView)
177 {}
178
179 KOKKOS_INLINE_FUNCTION void
180 operator() (const size_type& i) const
181 {
182 typedef typename ViewType::non_const_value_type value_type;
183 typedef Kokkos::ArithTraits<value_type> KAT;
184 theView_(i) = KAT::sqrt (theView_(i));
185 }
186
187private:
188 ViewType theView_;
189};
190
191template<class RV>
192void
193gblNormImpl (const RV& normsOut,
194 const Teuchos::Comm<int>* const comm,
195 const bool distributed,
196 const EWhichNorm whichNorm)
197{
198 using Teuchos::REDUCE_MAX;
199 using Teuchos::REDUCE_SUM;
200 using Teuchos::reduceAll;
201 typedef typename RV::non_const_value_type mag_type;
202
203 const size_t numVecs = normsOut.extent (0);
204
205 // If the MultiVector is distributed over multiple processes, do
206 // the distributed (interprocess) part of the norm. We assume
207 // that the MPI implementation can read from and write to device
208 // memory.
209 //
210 // replaceMap() may have removed some processes. Those processes
211 // have a null Map. They must not participate in any collective
212 // operations. We ask first whether the Map is null, because
213 // isDistributed() defers that question to the Map. We still
214 // compute and return local norms for processes not participating
215 // in collective operations; those probably don't make any sense,
216 // but it doesn't hurt to do them, since it's illegal to call
217 // norm*() on those processes anyway.
218 if (distributed && comm != nullptr) {
219 // The calling process only participates in the collective if
220 // both the Map and its Comm on that process are nonnull.
221
222 const int nv = static_cast<int> (numVecs);
223 const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
224
225 if (commIsInterComm) {
226 RV lclNorms (Kokkos::ViewAllocateWithoutInitializing ("MV::normImpl lcl"), numVecs);
227 // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
228 using execution_space = typename RV::execution_space;
229 Kokkos::deep_copy (execution_space(), lclNorms, normsOut);
230 const mag_type* const lclSum = lclNorms.data ();
231 mag_type* const gblSum = normsOut.data ();
232
233 if (whichNorm == NORM_INF) {
234 reduceAll<int, mag_type> (*comm, REDUCE_MAX, nv, lclSum, gblSum);
235 } else {
236 reduceAll<int, mag_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
237 }
238 } else {
239 mag_type* const gblSum = normsOut.data ();
240 if (whichNorm == NORM_INF) {
241 reduceAll<int, mag_type> (*comm, REDUCE_MAX, nv, gblSum, gblSum);
242 } else {
243 reduceAll<int, mag_type> (*comm, REDUCE_SUM, nv, gblSum, gblSum);
244 }
245 }
246 }
247
248 if (whichNorm == NORM_TWO) {
249 // Replace the norm-squared results with their square roots in
250 // place, to get the final output. If the device memory and
251 // the host memory are the same, it probably doesn't pay to
252 // launch a parallel kernel for that, since there isn't enough
253 // parallelism for the typical MultiVector case.
254 const bool inHostMemory =
255 std::is_same<typename RV::memory_space,
256 typename RV::host_mirror_space::memory_space>::value;
257 if (inHostMemory) {
258 for (size_t j = 0; j < numVecs; ++j) {
259 normsOut(j) = Kokkos::ArithTraits<mag_type>::sqrt (normsOut(j));
260 }
261 }
262 else {
263 // There's not as much parallelism now, but that's OK. The
264 // point of doing parallel dispatch here is to keep the norm
265 // results on the device, thus avoiding a copy to the host
266 // and back again.
267 SquareRootFunctor<RV> f (normsOut);
268 typedef typename RV::execution_space execution_space;
269 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
270 Kokkos::parallel_for (range_type (0, numVecs), f);
271 }
272 }
273}
274
275} // namespace Impl
276
277template <class ValueType,
278 class ArrayLayout,
279 class DeviceType,
280 class MagnitudeType>
281void
282normImpl (MagnitudeType norms[],
283 const Kokkos::View<const ValueType**, ArrayLayout, DeviceType>& X,
284 const EWhichNorm whichNorm,
285 const Teuchos::ArrayView<const size_t>& whichVecs,
286 const bool isConstantStride,
287 const bool isDistributed,
288 const Teuchos::Comm<int>* comm)
289{
290 using execution_space = typename DeviceType::execution_space;
291 using RV = Kokkos::View<MagnitudeType*, Kokkos::HostSpace>;
292 //using XMV = Kokkos::View<const ValueType**, ArrayLayout, DeviceType>;
293 //using pair_type = std::pair<size_t, size_t>;
294
295 const size_t numVecs = isConstantStride ?
296 static_cast<size_t> (X.extent (1)) :
297 static_cast<size_t> (whichVecs.size ());
298 if (numVecs == 0) {
299 return; // nothing to do
300 }
301 RV normsOut (norms, numVecs);
302
303 Impl::lclNormImpl (normsOut, X, numVecs, whichVecs,
304 isConstantStride, whichNorm);
305
306 // lbv 03/15/23: the data from the local norm calculation
307 // better really be available before communication happens
308 // so fencing to make sure the local computations have
309 // completed on device. We might want to make this an
310 // execution space fence down the road?
311 execution_space exec_space_instance = execution_space();
312 exec_space_instance.fence();
313
314 Impl::gblNormImpl (normsOut, comm, isDistributed, whichNorm);
315}
316
317} // namespace Details
318} // namespace Tpetra
319
320#endif // TPETRA_DETAILS_NORMIMPL_HPP
Nonmember function that computes a residual Computes R = B - A * X.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
EWhichNorm
Input argument for normImpl() (which see).
bool isInterComm(const Teuchos::Comm< int > &)
Return true if and only if the input communicator wraps an MPI intercommunicator.
Namespace Tpetra contains the class and methods constituting the Tpetra library.