Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_packCrsMatrix_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
11#define TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
12
13#include "TpetraCore_config.h"
14#include "Teuchos_Array.hpp"
15#include "Teuchos_ArrayView.hpp"
24#include <memory>
25#include <sstream>
26#include <stdexcept>
27#include <string>
28
51
52namespace Tpetra {
53
54//
55// Users must never rely on anything in the Details namespace.
56//
57namespace Details {
58
59namespace PackCrsMatrixImpl {
67template<class OutputOffsetsViewType,
68 class CountsViewType,
69 class InputOffsetsViewType,
70 class InputLocalRowIndicesViewType,
71 class InputLocalRowPidsViewType,
72 const bool debug =
73#ifdef HAVE_TPETRA_DEBUG
74 true
75#else
76 false
77#endif // HAVE_TPETRA_DEBUG
78 >
79class NumPacketsAndOffsetsFunctor {
80public:
81 typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
82 typedef typename CountsViewType::non_const_value_type count_type;
83 typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
84 typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
85 typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
86 // output Views drive where execution happens.
87 typedef typename OutputOffsetsViewType::device_type device_type;
88 static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
89 typename device_type::execution_space>::value,
90 "OutputOffsetsViewType and CountsViewType must have the same execution space.");
91 static_assert (Kokkos::is_view<OutputOffsetsViewType>::value,
92 "OutputOffsetsViewType must be a Kokkos::View.");
93 static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
94 "OutputOffsetsViewType must be a nonconst Kokkos::View.");
95 static_assert (std::is_integral<output_offset_type>::value,
96 "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
97 static_assert (Kokkos::is_view<CountsViewType>::value,
98 "CountsViewType must be a Kokkos::View.");
99 static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
100 "CountsViewType must be a nonconst Kokkos::View.");
101 static_assert (std::is_integral<count_type>::value,
102 "The type of each entry of CountsViewType must be a built-in integer type.");
103 static_assert (Kokkos::is_view<InputOffsetsViewType>::value,
104 "InputOffsetsViewType must be a Kokkos::View.");
105 static_assert (std::is_integral<input_offset_type>::value,
106 "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
107 static_assert (Kokkos::is_view<InputLocalRowIndicesViewType>::value,
108 "InputLocalRowIndicesViewType must be a Kokkos::View.");
109 static_assert (std::is_integral<local_row_index_type>::value,
110 "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
111
112 NumPacketsAndOffsetsFunctor (const OutputOffsetsViewType& outputOffsets,
113 const CountsViewType& counts,
114 const InputOffsetsViewType& rowOffsets,
115 const InputLocalRowIndicesViewType& lclRowInds,
116 const InputLocalRowPidsViewType& lclRowPids,
117 const count_type sizeOfLclCount,
118 const count_type sizeOfGblColInd,
119 const count_type sizeOfPid,
120 const count_type sizeOfValue) :
121 outputOffsets_ (outputOffsets),
122 counts_ (counts),
123 rowOffsets_ (rowOffsets),
124 lclRowInds_ (lclRowInds),
125 lclRowPids_ (lclRowPids),
126 sizeOfLclCount_ (sizeOfLclCount),
127 sizeOfGblColInd_ (sizeOfGblColInd),
128 sizeOfPid_ (sizeOfPid),
129 sizeOfValue_ (sizeOfValue),
130 error_ ("error") // don't forget this, or you'll get segfaults!
131 {
132 if (debug) {
133 const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.extent (0));
134
135 if (numRowsToPack != static_cast<size_t> (counts_.extent (0))) {
136 std::ostringstream os;
137 os << "lclRowInds.extent(0) = " << numRowsToPack
138 << " != counts.extent(0) = " << counts_.extent (0)
139 << ".";
140 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
141 }
142 if (static_cast<size_t> (numRowsToPack + 1) !=
143 static_cast<size_t> (outputOffsets_.extent (0))) {
144 std::ostringstream os;
145 os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
146 << " != outputOffsets.extent(0) = " << outputOffsets_.extent (0)
147 << ".";
148 TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
149 }
150 }
151 }
152
153 KOKKOS_INLINE_FUNCTION void
154 operator() (const local_row_index_type& curInd,
155 output_offset_type& update,
156 const bool final) const
157 {
158 if (debug) {
159 if (curInd < static_cast<local_row_index_type> (0)) {
160 error_ () = 1;
161 return;
162 }
163 }
164
165 if (final) {
166 if (debug) {
167 if (curInd >= static_cast<local_row_index_type> (outputOffsets_.extent (0))) {
168 error_ () = 2;
169 return;
170 }
171 }
172 outputOffsets_(curInd) = update;
173 }
174
175 if (curInd < static_cast<local_row_index_type> (counts_.extent (0))) {
176 const auto lclRow = lclRowInds_(curInd);
177 if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.extent (0)) ||
178 static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
179 error_ () = 3;
180 return;
181 }
182 // count_type could differ from the type of each row offset.
183 // For example, row offsets might each be 64 bits, but if their
184 // difference always fits in 32 bits, we may then safely use a
185 // 32-bit count_type.
186 const count_type count =
187 static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
188
189 // We pack first the number of entries in the row, then that
190 // many global column indices, then that many pids (if any),
191 // then that many values. However, if the number of entries in
192 // the row is zero, we pack nothing.
193 const count_type numBytes = (count == 0) ?
194 static_cast<count_type> (0) :
195 sizeOfLclCount_ + count * (sizeOfGblColInd_ +
196 (lclRowPids_.size() > 0 ? sizeOfPid_ : 0) +
197 sizeOfValue_);
198
199 if (final) {
200 counts_(curInd) = numBytes;
201 }
202 update += numBytes;
203 }
204 }
205
206 // mfh 31 May 2017: Don't need init or join. If you have join, MUST
207 // have join both with and without volatile! Otherwise intrawarp
208 // joins are really slow on GPUs.
209
211 int getError () const {
212 auto error_h = Kokkos::create_mirror_view (error_);
213 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
214 // Note: In the UVM case, this would otherwise be a no-op
215 // and thus not fence, so the value might not be correct on return
216 // In the non-UVM case, create_mirror_view will block for the allocation
217 Kokkos::deep_copy (error_h, error_);
218 return error_h ();
219 }
220
221private:
222 OutputOffsetsViewType outputOffsets_;
223 CountsViewType counts_;
224 typename InputOffsetsViewType::const_type rowOffsets_;
225 typename InputLocalRowIndicesViewType::const_type lclRowInds_;
226 typename InputLocalRowPidsViewType::const_type lclRowPids_;
227 count_type sizeOfLclCount_;
228 count_type sizeOfGblColInd_;
229 count_type sizeOfPid_;
230 count_type sizeOfValue_;
231 Kokkos::View<int, device_type> error_;
232};
233
243template<class OutputOffsetsViewType,
244 class CountsViewType,
245 class InputOffsetsViewType,
246 class InputLocalRowIndicesViewType,
247 class InputLocalRowPidsViewType>
248typename CountsViewType::non_const_value_type
249computeNumPacketsAndOffsets (const OutputOffsetsViewType& outputOffsets,
250 const CountsViewType& counts,
251 const InputOffsetsViewType& rowOffsets,
252 const InputLocalRowIndicesViewType& lclRowInds,
253 const InputLocalRowPidsViewType& lclRowPids,
254 const typename CountsViewType::non_const_value_type sizeOfLclCount,
255 const typename CountsViewType::non_const_value_type sizeOfGblColInd,
256 const typename CountsViewType::non_const_value_type sizeOfPid,
257 const typename CountsViewType::non_const_value_type sizeOfValue)
258{
259 typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
260 CountsViewType, typename InputOffsetsViewType::const_type,
261 typename InputLocalRowIndicesViewType::const_type,
262 typename InputLocalRowPidsViewType::const_type> functor_type;
263 typedef typename CountsViewType::non_const_value_type count_type;
264 typedef typename OutputOffsetsViewType::size_type size_type;
265 typedef typename OutputOffsetsViewType::execution_space execution_space;
266 typedef typename functor_type::local_row_index_type LO;
267 typedef Kokkos::RangePolicy<execution_space, LO> range_type;
268 const char prefix[] = "computeNumPacketsAndOffsets: ";
269
270 count_type count = 0;
271 const count_type numRowsToPack = lclRowInds.extent (0);
272
273 if (numRowsToPack == 0) {
274 return count;
275 }
276 else {
277 TEUCHOS_TEST_FOR_EXCEPTION
278 (rowOffsets.extent (0) <= static_cast<size_type> (1),
279 std::invalid_argument, prefix << "There is at least one row to pack, "
280 "but the matrix has no rows. lclRowInds.extent(0) = " <<
281 numRowsToPack << ", but rowOffsets.extent(0) = " <<
282 rowOffsets.extent (0) << " <= 1.");
283 TEUCHOS_TEST_FOR_EXCEPTION
284 (outputOffsets.extent (0) !=
285 static_cast<size_type> (numRowsToPack + 1), std::invalid_argument,
286 prefix << "Output dimension does not match number of rows to pack. "
287 << "outputOffsets.extent(0) = " << outputOffsets.extent (0)
288 << " != lclRowInds.extent(0) + 1 = "
289 << static_cast<size_type> (numRowsToPack + 1) << ".");
290 TEUCHOS_TEST_FOR_EXCEPTION
291 (counts.extent (0) != numRowsToPack, std::invalid_argument,
292 prefix << "counts.extent(0) = " << counts.extent (0)
293 << " != numRowsToPack = " << numRowsToPack << ".");
294
295 functor_type f (outputOffsets, counts, rowOffsets,
296 lclRowInds, lclRowPids, sizeOfLclCount,
297 sizeOfGblColInd, sizeOfPid, sizeOfValue);
298 Kokkos::parallel_scan (range_type (0, numRowsToPack + 1), f);
299
300 // At least in debug mode, this functor checks for errors.
301 const int errCode = f.getError ();
302 TEUCHOS_TEST_FOR_EXCEPTION
303 (errCode != 0, std::runtime_error, prefix << "parallel_scan error code "
304 << errCode << " != 0.");
305
306#if 0
307 size_t total = 0;
308 for (LO k = 0; k < numRowsToPack; ++k) {
309 total += counts[k];
310 }
311 if (outputOffsets(numRowsToPack) != total) {
312 if (errStr.get () == NULL) {
313 errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
314 }
315 std::ostringstream& os = *errStr;
316 os << prefix
317 << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
318 << outputOffsets(numRowsToPack) << " != sum of counts = "
319 << total << "." << std::endl;
320 if (numRowsToPack != 0) {
321 // Only print the array if it's not too long.
322 if (numRowsToPack < static_cast<LO> (10)) {
323 os << "outputOffsets: [";
324 for (LO i = 0; i <= numRowsToPack; ++i) {
325 os << outputOffsets(i);
326 if (static_cast<LO> (i + 1) <= numRowsToPack) {
327 os << ",";
328 }
329 }
330 os << "]" << std::endl;
331 os << "counts: [";
332 for (LO i = 0; i < numRowsToPack; ++i) {
333 os << counts(i);
334 if (static_cast<LO> (i + 1) < numRowsToPack) {
335 os << ",";
336 }
337 }
338 os << "]" << std::endl;
339 }
340 else {
341 os << "outputOffsets(" << (numRowsToPack-1) << ") = "
342 << outputOffsets(numRowsToPack-1) << "." << std::endl;
343 }
344 }
345 count = outputOffsets(numRowsToPack);
346 return {false, errStr};
347 }
348#endif // HAVE_TPETRA_DEBUG
349
350 // Get last entry of outputOffsets, which is the sum of the entries
351 // of counts. Don't assume UVM.
352 using Tpetra::Details::getEntryOnHost;
353 return static_cast<count_type> (getEntryOnHost (outputOffsets,
354 numRowsToPack));
355 }
356}
357
373template<class ST, class ColumnMap, class BufferDeviceType>
374KOKKOS_FUNCTION
375Kokkos::pair<int, size_t>
376packCrsMatrixRow (const ColumnMap& col_map,
377 const Kokkos::View<char*, BufferDeviceType>& exports,
379 const typename PackTraits<int>::input_array_type& pids_in,
380 const typename PackTraits<ST>::input_array_type& vals_in,
381 const size_t offset,
382 const size_t num_ent,
383 const size_t num_bytes_per_value,
384 const bool pack_pids)
385{
386 using Kokkos::subview;
387 using LO = typename ColumnMap::local_ordinal_type;
388 using GO = typename ColumnMap::global_ordinal_type;
389 using return_type = Kokkos::pair<int, size_t>;
390
391 if (num_ent == 0) {
392 // Empty rows always take zero bytes, to ensure sparsity.
393 return return_type (0, 0);
394 }
395
396 const LO num_ent_LO = static_cast<LO> (num_ent); // packValueCount wants this
397 const size_t num_ent_beg = offset;
398 const size_t num_ent_len = PackTraits<LO>::packValueCount (num_ent_LO);
399
400 const size_t gids_beg = num_ent_beg + num_ent_len;
401 const size_t gids_len = num_ent * PackTraits<GO>::packValueCount (GO (0));
402
403 const size_t pids_beg = gids_beg + gids_len;
404 const size_t pids_len = pack_pids ?
405 num_ent * PackTraits<int>::packValueCount (int (0)) :
406 static_cast<size_t> (0);
407
408 const size_t vals_beg = gids_beg + gids_len + pids_len;
409 const size_t vals_len = num_ent * num_bytes_per_value;
410
411 char* const num_ent_out = exports.data () + num_ent_beg;
412 char* const gids_out = exports.data () + gids_beg;
413 char* const pids_out = pack_pids ? exports.data () + pids_beg : NULL;
414 char* const vals_out = exports.data () + vals_beg;
415
416 size_t num_bytes_out = 0;
417 int error_code = 0;
418 num_bytes_out += PackTraits<LO>::packValue (num_ent_out, num_ent_LO);
419
420 {
421 // Copy column indices one at a time, so that we don't need
422 // temporary storage.
423 for (size_t k = 0; k < num_ent; ++k) {
424 const LO lid = lids_in[k];
425 const GO gid = col_map.getGlobalElement (lid);
426 num_bytes_out += PackTraits<GO>::packValue (gids_out, k, gid);
427 }
428 // Copy PIDs one at a time, so that we don't need temporary storage.
429 if (pack_pids) {
430 for (size_t k = 0; k < num_ent; ++k) {
431 const LO lid = lids_in[k];
432 const int pid = pids_in[lid];
433 num_bytes_out += PackTraits<int>::packValue (pids_out, k, pid);
434 }
435 }
436 const auto p =
437 PackTraits<ST>::packArray (vals_out, vals_in.data (), num_ent);
438 error_code += p.first;
439 num_bytes_out += p.second;
440 }
441
442 if (error_code != 0) {
443 return return_type (10, num_bytes_out);
444 }
445
446 const size_t expected_num_bytes =
447 num_ent_len + gids_len + pids_len + vals_len;
448 if (num_bytes_out != expected_num_bytes) {
449 return return_type (11, num_bytes_out);
450 }
451 return return_type (0, num_bytes_out);
452}
453
454template<class LocalMatrix, class LocalMap, class BufferDeviceType>
455struct PackCrsMatrixFunctor {
456 typedef LocalMatrix local_matrix_device_type;
457 typedef LocalMap local_map_type;
458 typedef typename local_matrix_device_type::value_type ST;
459 typedef typename local_map_type::local_ordinal_type LO;
460 typedef typename local_map_type::global_ordinal_type GO;
461 typedef typename local_matrix_device_type::device_type DT;
462
463 typedef Kokkos::View<const size_t*, BufferDeviceType>
464 num_packets_per_lid_view_type;
465 typedef Kokkos::View<const size_t*, BufferDeviceType> offsets_view_type;
466 typedef Kokkos::View<char*, BufferDeviceType> exports_view_type;
467 using export_lids_view_type = typename PackTraits<LO>::input_array_type;
468 using source_pids_view_type = typename PackTraits<int>::input_array_type;
469
470 typedef typename num_packets_per_lid_view_type::non_const_value_type
471 count_type;
472 typedef typename offsets_view_type::non_const_value_type
473 offset_type;
474 typedef Kokkos::pair<int, LO> value_type;
475
476 static_assert (std::is_same<LO, typename local_matrix_device_type::ordinal_type>::value,
477 "local_map_type::local_ordinal_type and "
478 "local_matrix_device_type::ordinal_type must be the same.");
479
480 local_matrix_device_type local_matrix;
481 local_map_type local_col_map;
482 exports_view_type exports;
483 num_packets_per_lid_view_type num_packets_per_lid;
484 export_lids_view_type export_lids;
485 source_pids_view_type source_pids;
486 offsets_view_type offsets;
487 size_t num_bytes_per_value;
488 bool pack_pids;
489
490 PackCrsMatrixFunctor (const local_matrix_device_type& local_matrix_in,
491 const local_map_type& local_col_map_in,
492 const exports_view_type& exports_in,
493 const num_packets_per_lid_view_type& num_packets_per_lid_in,
494 const export_lids_view_type& export_lids_in,
495 const source_pids_view_type& source_pids_in,
496 const offsets_view_type& offsets_in,
497 const size_t num_bytes_per_value_in,
498 const bool pack_pids_in) :
499 local_matrix (local_matrix_in),
500 local_col_map (local_col_map_in),
501 exports (exports_in),
502 num_packets_per_lid (num_packets_per_lid_in),
503 export_lids (export_lids_in),
504 source_pids (source_pids_in),
505 offsets (offsets_in),
506 num_bytes_per_value (num_bytes_per_value_in),
507 pack_pids (pack_pids_in)
508 {
509 const LO numRows = local_matrix_in.numRows ();
510 const LO rowMapDim =
511 static_cast<LO> (local_matrix.graph.row_map.extent (0));
512 TEUCHOS_TEST_FOR_EXCEPTION
513 (numRows != 0 && rowMapDim != numRows + static_cast<LO> (1),
514 std::logic_error, "local_matrix.graph.row_map.extent(0) = "
515 << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
516 }
517
518 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const
519 {
520 using ::Tpetra::Details::OrdinalTraits;
521 dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
522 }
523
524 KOKKOS_INLINE_FUNCTION void
525 join (value_type& dst, const value_type& src) const
526 {
527 // `dst` should reflect the first (least) bad index and all other
528 // associated error codes and data, so prefer keeping it.
529 if (src.first != 0 && dst.first == 0) {
530 dst = src;
531 }
532 }
533
534 KOKKOS_INLINE_FUNCTION
535 void operator() (const LO i, value_type& dst) const
536 {
537 const size_t offset = offsets[i];
538 const LO export_lid = export_lids[i];
539 const size_t buf_size = exports.size();
540 const size_t num_bytes = num_packets_per_lid(i);
541 const size_t num_ent =
542 static_cast<size_t> (local_matrix.graph.row_map[export_lid+1]
543 - local_matrix.graph.row_map[export_lid]);
544
545 // Only pack this row's data if it has a nonzero number of
546 // entries. We can do this because receiving processes get the
547 // number of packets, and will know that zero packets means zero
548 // entries.
549 if (num_ent == 0) {
550 return;
551 }
552
553 if (export_lid >= local_matrix.numRows ()) {
554 if (dst.first != 0) { // keep only the first error
555 dst = Kokkos::make_pair (1, i); // invalid row
556 }
557 return;
558 }
559 else if ((offset > buf_size || offset + num_bytes > buf_size)) {
560 if (dst.first != 0) { // keep only the first error
561 dst = Kokkos::make_pair (2, i); // out of bounds
562 }
563 return;
564 }
565
566 // We can now pack this row
567
568 // Since the matrix is locally indexed on the calling process, we
569 // have to use its column Map (which it _must_ have in this case)
570 // to convert to global indices.
571 const auto row_beg = local_matrix.graph.row_map[export_lid];
572 const auto row_end = local_matrix.graph.row_map[export_lid + 1];
573 auto vals_in = subview (local_matrix.values,
574 Kokkos::make_pair (row_beg, row_end));
575 auto lids_in = subview (local_matrix.graph.entries,
576 Kokkos::make_pair (row_beg, row_end));
577 typedef local_map_type LMT;
578 typedef BufferDeviceType BDT;
579 auto p = packCrsMatrixRow<ST, LMT, BDT> (local_col_map, exports, lids_in,
580 source_pids, vals_in, offset,
581 num_ent, num_bytes_per_value,
582 pack_pids);
583 int error_code_this_row = p.first;
584 size_t num_bytes_packed_this_row = p.second;
585 if (error_code_this_row != 0) {
586 if (dst.first != 0) { // keep only the first error
587 dst = Kokkos::make_pair (error_code_this_row, i); // bad pack
588 }
589 }
590 else if (num_bytes_packed_this_row != num_bytes) {
591 if (dst.first != 0) { // keep only the first error
592 dst = Kokkos::make_pair (3, i);
593 }
594 }
595 }
596};
597
605template<class LocalMatrix, class LocalMap, class BufferDeviceType>
606void
607do_pack (const LocalMatrix& local_matrix,
608 const LocalMap& local_map,
609 const Kokkos::View<char*, BufferDeviceType>& exports,
610 const typename PackTraits<size_t>::input_array_type& num_packets_per_lid,
612 const typename PackTraits<int>::input_array_type& source_pids,
613 const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
614 const size_t num_bytes_per_value,
615 const bool pack_pids)
616{
617 using LO = typename LocalMap::local_ordinal_type;
618 using DT = typename LocalMatrix::device_type;
619 using range_type = Kokkos::RangePolicy<typename DT::execution_space, LO>;
620 const char prefix[] = "Tpetra::Details::do_pack: ";
621
622 if (export_lids.extent (0) != 0) {
623 TEUCHOS_TEST_FOR_EXCEPTION
624 (static_cast<size_t> (offsets.extent (0)) !=
625 static_cast<size_t> (export_lids.extent (0) + 1),
626 std::invalid_argument, prefix << "offsets.extent(0) = "
627 << offsets.extent (0) << " != export_lids.extent(0) (= "
628 << export_lids.extent (0) << ") + 1.");
629 TEUCHOS_TEST_FOR_EXCEPTION
630 (export_lids.extent (0) != num_packets_per_lid.extent (0),
631 std::invalid_argument, prefix << "export_lids.extent(0) = " <<
632 export_lids.extent (0) << " != num_packets_per_lid.extent(0) = "
633 << num_packets_per_lid.extent (0) << ".");
634 // If exports has nonzero length at this point, then the matrix
635 // has at least one entry to pack. Thus, if packing process
636 // ranks, we had better have at least one process rank to pack.
637 TEUCHOS_TEST_FOR_EXCEPTION
638 (pack_pids && exports.extent (0) != 0 &&
639 source_pids.extent (0) == 0, std::invalid_argument, prefix <<
640 "pack_pids is true, and exports.extent(0) = " <<
641 exports.extent (0) << " != 0, meaning that we need to pack at "
642 "least one matrix entry, but source_pids.extent(0) = 0.");
643 }
644
645 using pack_functor_type =
646 PackCrsMatrixFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
647 pack_functor_type f (local_matrix, local_map, exports,
648 num_packets_per_lid, export_lids,
649 source_pids, offsets, num_bytes_per_value,
650 pack_pids);
651
652 typename pack_functor_type::value_type result;
653 range_type range (0, num_packets_per_lid.extent (0));
654 Kokkos::parallel_reduce (range, f, result);
655
656 if (result.first != 0) {
657 // We can't deep_copy from AnonymousSpace Views, so we can't print
658 // out any information from them in case of error.
659 TEUCHOS_TEST_FOR_EXCEPTION
660 (true, std::runtime_error, prefix << "PackCrsMatrixFunctor "
661 "reported error code " << result.first << " for the first "
662 "bad row " << result.second << ".");
663 }
664}
665
695template<typename ST, typename LO, typename GO, typename NT, typename BufferDeviceType>
696void
698 Kokkos::DualView<char*, BufferDeviceType>& exports,
699 const Kokkos::View<size_t*, BufferDeviceType>& num_packets_per_lid,
700 const Kokkos::View<const LO*, BufferDeviceType>& export_lids,
701 const Kokkos::View<const int*, typename NT::device_type>& export_pids,
702 size_t& constant_num_packets,
703 const bool pack_pids)
704{
705 ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix(
706 "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix",
707 "Import/Export"
708 );
709 using Kokkos::View;
710 typedef BufferDeviceType DT;
711 typedef Kokkos::DualView<char*, BufferDeviceType> exports_view_type;
712 const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: ";
713 constexpr bool debug = false;
714
715 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
716 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
717
718 // Setting this to zero tells the caller to expect a possibly
719 // different ("nonconstant") number of packets per local index
720 // (i.e., a possibly different number of entries per row).
721 constant_num_packets = 0;
722
723 const size_t num_export_lids =
724 static_cast<size_t> (export_lids.extent (0));
725 TEUCHOS_TEST_FOR_EXCEPTION
726 (num_export_lids !=
727 static_cast<size_t> (num_packets_per_lid.extent (0)),
728 std::invalid_argument, prefix << "num_export_lids.extent(0) = "
729 << num_export_lids << " != num_packets_per_lid.extent(0) = "
730 << num_packets_per_lid.extent (0) << ".");
731 if (num_export_lids != 0) {
732 TEUCHOS_TEST_FOR_EXCEPTION
733 (num_packets_per_lid.data () == NULL, std::invalid_argument,
734 prefix << "num_export_lids = "<< num_export_lids << " != 0, but "
735 "num_packets_per_lid.data() = "
736 << num_packets_per_lid.data () << " == NULL.");
737 }
738
739 const size_t num_bytes_per_lid = PackTraits<LO>::packValueCount (LO (0));
740 const size_t num_bytes_per_gid = PackTraits<GO>::packValueCount (GO (0));
741 const size_t num_bytes_per_pid = PackTraits<int>::packValueCount (int (0));
742
743 size_t num_bytes_per_value = 0;
745 // Assume ST is default constructible; packValueCount wants an instance.
746 num_bytes_per_value = PackTraits<ST>::packValueCount (ST ());
747 }
748 else {
749 // Since the packed data come from the source matrix, we can use
750 // the source matrix to get the number of bytes per Scalar value
751 // stored in the matrix. This assumes that all Scalar values in
752 // the source matrix require the same number of bytes. If the
753 // source matrix has no entries on the calling process, then we
754 // hope that some process does have some idea how big a Scalar
755 // value is. Of course, if no processes have any entries, then no
756 // values should be packed (though this does assume that in our
757 // packing scheme, rows with zero entries take zero bytes).
758 size_t num_bytes_per_value_l = 0;
759 if (local_matrix.values.extent(0) > 0) {
760 const ST& val = local_matrix.values(0);
761 num_bytes_per_value_l = PackTraits<ST>::packValueCount (val);
762 }
763 using Teuchos::reduceAll;
764 reduceAll<int, size_t> (* (sourceMatrix.getComm ()),
765 Teuchos::REDUCE_MAX,
766 num_bytes_per_value_l,
767 Teuchos::outArg (num_bytes_per_value));
768 }
769
770 if (num_export_lids == 0) {
771 exports = exports_view_type ("exports", 0);
772 return;
773 }
774
775 // Array of offsets into the pack buffer.
776 Kokkos::View<size_t*, DT> offsets ("offsets", num_export_lids + 1);
777
778 // Compute number of packets per LID (row to send), as well as
779 // corresponding offsets (the prefix sum of the packet counts).
780 const size_t count =
781 computeNumPacketsAndOffsets (offsets, num_packets_per_lid,
782 local_matrix.graph.row_map, export_lids,
783 export_pids,
784 num_bytes_per_lid, num_bytes_per_gid,
785 num_bytes_per_pid, num_bytes_per_value);
786
787 // Resize the output pack buffer if needed.
788 if (count > static_cast<size_t> (exports.extent (0))) {
789 exports = exports_view_type ("exports", count);
790 if (debug) {
791 std::ostringstream os;
792 os << "*** exports resized to " << count << std::endl;
793 std::cerr << os.str ();
794 }
795 }
796 if (debug) {
797 std::ostringstream os;
798 os << "*** count: " << count << ", exports.extent(0): "
799 << exports.extent (0) << std::endl;
800 std::cerr << os.str ();
801 }
802
803 // If exports has nonzero length at this point, then the matrix has
804 // at least one entry to pack. Thus, if packing process ranks, we
805 // had better have at least one process rank to pack.
806 TEUCHOS_TEST_FOR_EXCEPTION
807 (pack_pids && exports.extent (0) != 0 &&
808 export_pids.extent (0) == 0, std::invalid_argument, prefix <<
809 "pack_pids is true, and exports.extent(0) = " <<
810 exports.extent (0) << " != 0, meaning that we need to pack at least "
811 "one matrix entry, but export_pids.extent(0) = 0.");
812
813 typedef typename std::decay<decltype (local_matrix)>::type
815 typedef typename std::decay<decltype (local_col_map)>::type
816 local_map_type;
817
818 exports.modify_device ();
819 auto exports_d = exports.view_device ();
821 (local_matrix, local_col_map, exports_d, num_packets_per_lid,
822 export_lids, export_pids, offsets, num_bytes_per_value,
823 pack_pids);
824 // If we got this far, we succeeded.
825}
826
827} // namespace PackCrsMatrixImpl
828
829template<typename ST, typename LO, typename GO, typename NT>
830void
832 Teuchos::Array<char>& exports,
833 const Teuchos::ArrayView<size_t>& numPacketsPerLID,
834 const Teuchos::ArrayView<const LO>& exportLIDs,
835 size_t& constantNumPackets)
836{
837
840 using host_exec_space = typename Kokkos::View<size_t*, device_type>::HostMirror::execution_space;
841 using device_exec_space = typename device_type::execution_space;
842 using host_dev_type = Kokkos::Device<host_exec_space, Kokkos::HostSpace>;
843
844 // Convert all Teuchos::Array to Kokkos::View
845
846 // This is an output array, so we don't have to copy to device here.
847 // However, we'll have to remember to copy back to host when done.
848 Kokkos::View<size_t*, buffer_device_type> num_packets_per_lid_d =
850 numPacketsPerLID.getRawPtr (),
851 numPacketsPerLID.size (), false,
852 "num_packets_per_lid");
853 // FIXME (mfh 05 Feb 2019) We should just pass the exportLIDs
854 // DualView through here, instead of recreating a device View from a
855 // host ArrayView that itself came from a DualView.
856 //
857 // This is an input array, so we have to copy to device here.
858 // However, we never need to copy it back to host.
859 Kokkos::View<const LO*, buffer_device_type> export_lids_d =
861 exportLIDs.getRawPtr (),
862 exportLIDs.size (), true,
863 "export_lids");
864
865 Kokkos::View<int*, device_type> export_pids_d; // output arg
866 Kokkos::DualView<char*, buffer_device_type> exports_dv; // output arg
867 constexpr bool pack_pids = false;
869 sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
870 export_pids_d, constantNumPackets, pack_pids);
871
872 // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to
873 // copy them back to host.
874 Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
875 (numPacketsPerLID.getRawPtr (),
876 numPacketsPerLID.size ());
877 // DEEP_COPY REVIEW - DEVICE-TO-HOST
878 Kokkos::deep_copy (device_exec_space(), num_packets_per_lid_h, num_packets_per_lid_d);
879
880 // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
881 // exports_dv above, then we have two host copies for exports_h.
882
883 // The exports are an output of PackCrsMatrixImpl::packCrsMatrix, so we have
884 // to copy them back to host.
885 if (static_cast<size_t> (exports.size ()) !=
886 static_cast<size_t> (exports_dv.extent (0))) {
887 exports.resize (exports_dv.extent (0));
888 }
889 Kokkos::View<char*, host_dev_type> exports_h (exports.getRawPtr (),
890 exports.size ());
891 // DEEP_COPY REVIEW - DEVICE-TO-HOST
892 Kokkos::deep_copy (device_exec_space(), exports_h, exports_dv.view_device());
893}
894
895template<typename ST, typename LO, typename GO, typename NT>
896void
898 const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
899 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports,
900 const Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
901 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exportLIDs,
902 size_t& constantNumPackets)
903{
906
907 // Create an empty array of PIDs, since the interface needs it.
908 Kokkos::View<int*, device_type> exportPIDs_d ("exportPIDs", 0);
909 constexpr bool pack_pids = false;
910
911 // Write-only device access
912 auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV
913 numPacketsPerLID_nc.clear_sync_state ();
914 numPacketsPerLID_nc.modify_device ();
915 auto numPacketsPerLID_d = numPacketsPerLID.view_device ();
916
917 // Read-only device access
918 TEUCHOS_ASSERT( ! exportLIDs.need_sync_device () );
919 auto exportLIDs_d = exportLIDs.view_device ();
920
921 ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix_new(
922 "Tpetra::Details::packCrsMatrixNew",
923 "Import/Export"
924 );
926 sourceMatrix, exports, numPacketsPerLID_d, exportLIDs_d,
927 exportPIDs_d, constantNumPackets, pack_pids);
928}
929
930template<typename ST, typename LO, typename GO, typename NT>
931void
933 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports_dv,
934 const Teuchos::ArrayView<size_t>& numPacketsPerLID,
935 const Teuchos::ArrayView<const LO>& exportLIDs,
936 const Teuchos::ArrayView<const int>& sourcePIDs,
937 size_t& constantNumPackets)
938{
941 typedef typename Kokkos::DualView<char*, buffer_device_type>::t_host::execution_space host_exec_space;
942 typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
943
944 typename local_matrix_device_type::device_type outputDevice;
945 typedef typename NT::execution_space execution_space;
946
947
948 const bool verbose = ::Tpetra::Details::Behavior::verbose ();
949 std::unique_ptr<std::string> prefix;
950 if (verbose) {
951 const int myRank = [&] () {
952 auto map = sourceMatrix.getMap ();
953 if (map.get () == nullptr) {
954 return -1;
955 }
956 auto comm = map->getComm ();
957 if (comm.get () == nullptr) {
958 return -2;
959 }
960 return comm->getRank ();
961 } ();
962 std::ostringstream os;
963 os << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs: ";
964 prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
965
966 std::ostringstream os2;
967 os2 << *prefix << "start" << std::endl;
968 std::cerr << os2.str ();
969 }
970
971 // Convert all Teuchos::Array to Kokkos::View
972
973 // This is an output array, so we don't have to copy to device here.
974 // However, we'll have to remember to copy back to host when done.
975 auto num_packets_per_lid_d =
977 numPacketsPerLID.getRawPtr (),
978 numPacketsPerLID.size (), false,
979 "num_packets_per_lid");
980
981 // This is an input array, so we have to copy to device here.
982 // However, we never need to copy it back to host.
983 auto export_lids_d =
985 exportLIDs.getRawPtr (),
986 exportLIDs.size (), true,
987 "export_lids");
988 // This is an input array, so we have to copy to device here.
989 // However, we never need to copy it back to host.
990 auto export_pids_d =
992 sourcePIDs.getRawPtr (),
993 sourcePIDs.size (), true,
994 "export_pids");
995 constexpr bool pack_pids = true;
996 try {
998 (sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
999 export_pids_d, constantNumPackets, pack_pids);
1000 }
1001 catch (std::exception& e) {
1002 if (verbose) {
1003 std::ostringstream os;
1004 os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw: "
1005 << e.what () << std::endl;
1006 std::cerr << os.str ();
1007 }
1008 throw;
1009 }
1010 catch (...) {
1011 if (verbose) {
1012 std::ostringstream os;
1013 os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw an exception "
1014 "not a subclass of std::exception" << std::endl;
1015 std::cerr << os.str ();
1016 }
1017 throw;
1018 }
1019
1020 if (numPacketsPerLID.size () != 0) {
1021 try {
1022 // The counts are an output of PackCrsMatrixImpl::packCrsMatrix,
1023 // so we have to copy them back to host.
1024 Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
1025 (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ());
1026 // DEEP_COPY REVIEW - DEVICE-TO-HOST
1027 Kokkos::deep_copy (execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
1028 }
1029 catch (std::exception& e) {
1030 if (verbose) {
1031 std::ostringstream os;
1032 os << *prefix << "Kokkos::deep_copy threw: " << e.what () << std::endl;
1033 std::cerr << os.str ();
1034 }
1035 throw;
1036 }
1037 catch (...) {
1038 if (verbose) {
1039 std::ostringstream os;
1040 os << *prefix << "Kokkos::deep_copy threw an exception not a subclass "
1041 "of std::exception" << std::endl;
1042 std::cerr << os.str ();
1043 }
1044 throw;
1045 }
1046 }
1047
1048 if (verbose) {
1049 std::ostringstream os;
1050 os << *prefix << "done" << std::endl;
1051 std::cerr << os.str ();
1052 }
1053}
1054
1055} // namespace Details
1056} // namespace Tpetra
1057
1058#define TPETRA_DETAILS_PACKCRSMATRIX_INSTANT( ST, LO, GO, NT ) \
1059 template void \
1060 Details::packCrsMatrix<ST, LO, GO, NT> (const CrsMatrix<ST, LO, GO, NT>&, \
1061 Teuchos::Array<char>&, \
1062 const Teuchos::ArrayView<size_t>&, \
1063 const Teuchos::ArrayView<const LO>&, \
1064 size_t&); \
1065 template void \
1066 Details::packCrsMatrixNew<ST, LO, GO, NT> (const CrsMatrix<ST, LO, GO, NT>&, \
1067 Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1068 const Kokkos::DualView<size_t*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1069 const Kokkos::DualView<const LO*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1070 size_t&); \
1071 template void \
1072 Details::packCrsMatrixWithOwningPIDs<ST, LO, GO, NT> (const CrsMatrix<ST, LO, GO, NT>&, \
1073 Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1074 const Teuchos::ArrayView<size_t>&, \
1075 const Teuchos::ArrayView<const LO>&, \
1076 const Teuchos::ArrayView<const int>&, \
1077 size_t&);
1078
1079#endif // TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
CountsViewType::non_const_value_type computeNumPacketsAndOffsets(const OutputOffsetsViewType &outputOffsets, const CountsViewType &counts, const InputOffsetsViewType &rowOffsets, const InputLocalRowIndicesViewType &lclRowInds, const InputLocalRowPidsViewType &lclRowPids, const typename CountsViewType::non_const_value_type sizeOfLclCount, const typename CountsViewType::non_const_value_type sizeOfGblColInd, const typename CountsViewType::non_const_value_type sizeOfPid, const typename CountsViewType::non_const_value_type sizeOfValue)
Compute the number of packets and offsets for the pack procedure.
void packCrsMatrix(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, BufferDeviceType > &exports, const Kokkos::View< size_t *, BufferDeviceType > &num_packets_per_lid, const Kokkos::View< const LO *, BufferDeviceType > &export_lids, const Kokkos::View< const int *, typename NT::device_type > &export_pids, size_t &constant_num_packets, const bool pack_pids)
Pack specified entries of the given local sparse matrix for communication.
void do_pack(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< char *, BufferDeviceType > &exports, const typename PackTraits< size_t >::input_array_type &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type &export_lids, const typename PackTraits< int >::input_array_type &source_pids, const Kokkos::View< const size_t *, BufferDeviceType > &offsets, const size_t num_bytes_per_value, const bool pack_pids)
Perform the pack operation for the matrix.
KOKKOS_FUNCTION Kokkos::pair< int, size_t > packCrsMatrixRow(const ColumnMap &col_map, const Kokkos::View< char *, BufferDeviceType > &exports, const typename PackTraits< typename ColumnMap::local_ordinal_type >::input_array_type &lids_in, const typename PackTraits< int >::input_array_type &pids_in, const typename PackTraits< ST >::input_array_type &vals_in, const size_t offset, const size_t num_ent, const size_t num_bytes_per_value, const bool pack_pids)
Packs a single row of the CrsMatrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
typename Node::device_type device_type
The Kokkos device type.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool verbose()
Whether Tpetra is in verbose mode.
"Local" part of Map suitable for Kokkos kernels.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
Compute the number of packets and offsets for the pack procedure.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Nonmember function that computes a residual Computes R = B - A * X.
void packCrsMatrix(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > packArray(char outBuf[], const value_type inBuf[], const size_t numEnt)
Pack the first numEnt entries of the given input buffer of value_type, into the output buffer of byte...
static const bool compileTimeSize
Whether the number of bytes required to pack one instance of value_type is fixed at compile time.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const T &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.