Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12
13#include <memory>
14#include <string>
15#include "TpetraCore_config.h"
16#include "Kokkos_Core.hpp"
17#include "Teuchos_Array.hpp"
18#include "Teuchos_ArrayView.hpp"
19#include "Teuchos_OrdinalTraits.hpp"
20#include "Teuchos_TimeMonitor.hpp"
28#include "Tpetra_Details_DefaultTypes.hpp"
30
51
52namespace Tpetra {
53
54//
55// Users must never rely on anything in the Details namespace.
56//
57namespace Details {
58
59namespace UnpackAndCombineCrsMatrixImpl {
60
70template<class ST, class LO, class GO>
71KOKKOS_FUNCTION int
73 const typename PackTraits<int>::output_array_type& pids_out,
74 const typename PackTraits<ST>::output_array_type& vals_out,
75 const char imports[],
76 const size_t offset,
77 const size_t /* num_bytes */,
78 const size_t num_ent,
79 const size_t bytes_per_value)
80{
81 if (num_ent == 0) {
82 // Empty rows always take zero bytes, to ensure sparsity.
83 return 0;
84 }
85 bool unpack_pids = pids_out.size() > 0;
86
87 const size_t num_ent_beg = offset;
88 const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
89
90 const size_t gids_beg = num_ent_beg + num_ent_len;
91 const size_t gids_len =
92 num_ent * PackTraits<GO>::packValueCount (GO (0));
93
94 const size_t pids_beg = gids_beg + gids_len;
95 const size_t pids_len = unpack_pids ?
96 size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
97 size_t (0);
98
99 const size_t vals_beg = gids_beg + gids_len + pids_len;
100 const size_t vals_len = num_ent * bytes_per_value;
101
102 const char* const num_ent_in = imports + num_ent_beg;
103 const char* const gids_in = imports + gids_beg;
104 const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
105 const char* const vals_in = imports + vals_beg;
106
107 size_t num_bytes_out = 0;
108 LO num_ent_out;
109 num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
110 if (static_cast<size_t> (num_ent_out) != num_ent) {
111 return 20; // error code
112 }
113
114 {
115 Kokkos::pair<int, size_t> p;
116 p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
117 if (p.first != 0) {
118 return 21; // error code
119 }
120 num_bytes_out += p.second;
121
122 if (unpack_pids) {
123 p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
124 if (p.first != 0) {
125 return 22; // error code
126 }
127 num_bytes_out += p.second;
128 }
129
130 p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
131 if (p.first != 0) {
132 return 23; // error code
133 }
134 num_bytes_out += p.second;
135 }
136
137 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138 if (num_bytes_out != expected_num_bytes) {
139 return 24; // error code
140 }
141 return 0; // no errors
142} //unpackRow
143
154template<class LocalMatrix, class LocalMap, class BufferDeviceType>
155struct UnpackCrsMatrixAndCombineFunctor {
156 typedef LocalMatrix local_matrix_type;
157 typedef LocalMap local_map_type;
158
159 typedef typename local_matrix_type::value_type ST;
160 typedef typename local_map_type::local_ordinal_type LO;
161 typedef typename local_map_type::global_ordinal_type GO;
162 typedef typename local_map_type::device_type DT;
163 typedef typename DT::execution_space XS;
164
165 typedef Kokkos::View<const size_t*, BufferDeviceType>
166 num_packets_per_lid_type;
167 typedef Kokkos::View<const size_t*, DT> offsets_type;
168 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
170
171 typedef Kokkos::View<int, DT> error_type;
172 using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
173
174 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175 "LocalMap::local_ordinal_type and "
176 "LocalMatrix::ordinal_type must be the same.");
177
178 local_matrix_type local_matrix;
179 local_map_type local_col_map;
180 input_buffer_type imports;
181 num_packets_per_lid_type num_packets_per_lid;
182 import_lids_type import_lids;
183 Kokkos::View<const LO*[2], DT> batch_info;
184 offsets_type offsets;
185 Tpetra::CombineMode combine_mode;
186 size_t batch_size;
187 size_t bytes_per_value;
188 bool atomic;
189 error_type error_code;
190
191 UnpackCrsMatrixAndCombineFunctor(
192 const local_matrix_type& local_matrix_in,
193 const local_map_type& local_col_map_in,
194 const input_buffer_type& imports_in,
195 const num_packets_per_lid_type& num_packets_per_lid_in,
196 const import_lids_type& import_lids_in,
197 const Kokkos::View<const LO*[2], DT>& batch_info_in,
198 const offsets_type& offsets_in,
199 const Tpetra::CombineMode combine_mode_in,
200 const size_t batch_size_in,
201 const size_t bytes_per_value_in,
202 const bool atomic_in) :
203 local_matrix (local_matrix_in),
204 local_col_map (local_col_map_in),
205 imports (imports_in),
206 num_packets_per_lid (num_packets_per_lid_in),
207 import_lids (import_lids_in),
208 batch_info (batch_info_in),
209 offsets (offsets_in),
210 combine_mode (combine_mode_in),
211 batch_size (batch_size_in),
212 bytes_per_value (bytes_per_value_in),
213 atomic (atomic_in),
214 error_code("error")
215 {}
216
217 KOKKOS_INLINE_FUNCTION
218 void operator()(member_type team_member) const
219 {
220 using Kokkos::View;
221 using Kokkos::subview;
222 using Kokkos::MemoryUnmanaged;
223
224 const LO batch = team_member.league_rank();
225 const LO lid_no = batch_info(batch, 0);
226 const LO batch_no = batch_info(batch, 1);
227
228 const size_t num_bytes = num_packets_per_lid(lid_no);
229
230 // Only unpack data if there is a nonzero number of bytes.
231 if (num_bytes == 0)
232 return;
233
234 // there is actually something in the row
235 const LO import_lid = import_lids(lid_no);
236 const size_t buf_size = imports.size();
237 const size_t offset = offsets(lid_no);
238
239 // Get the number of entries to expect in the received data for this row.
240 LO num_ent_LO = 0;
241 const char* const in_buf = imports.data() + offset;
242 (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
243 const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
244
245 // Count the number of bytes expected to unpack
246 size_t expected_num_bytes = 0;
247 {
248 expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
249 expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
250 expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
251 }
252
253 if (expected_num_bytes > num_bytes)
254 {
255// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
256#ifndef KOKKOS_ENABLE_SYCL
257 printf(
258 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
260 (int) lid_no, (int) expected_num_bytes, (int) num_bytes
261 );
262#endif
263 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
264 return;
265 }
266
267 if (offset > buf_size || offset + num_bytes > buf_size)
268 {
269// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
270#ifndef KOKKOS_ENABLE_SYCL
271 printf(
272 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273 "At row %d, the offset (%d) > buffer size (%d)\n",
274 (int) lid_no, (int) offset, (int) buf_size
275 );
276#endif
277 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
278 return;
279 }
280
281 // Determine the number of entries to unpack in this batch
282 size_t num_entries_in_batch = 0;
283 if (num_entries_in_row <= batch_size)
284 num_entries_in_batch = num_entries_in_row;
285 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
286 num_entries_in_batch = batch_size;
287 else
288 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
289
290 const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
291 const size_t num_ent_start = offset;
292 const size_t num_ent_end = num_ent_start + bytes_per_lid;
293
294 const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
295 const size_t gids_start = num_ent_end;
296 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
297
298 const size_t vals_start = gids_end;
299
300 const size_t shift = batch_no * batch_size;
301 const char* const num_ent_in = imports.data() + num_ent_start;
302 const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
303 const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
304
305 LO num_ent_out;
306 (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
307 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
308 {
309// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
310#ifndef KOKKOS_ENABLE_SYCL
311 printf(
312 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
314 (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
315 );
316#endif
317 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
318 }
319
320 constexpr bool matrix_has_sorted_rows = true; // see #6282
321 //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
322 //By ref triggers compiler bug in CUDA 10.
323 Kokkos::parallel_for(
324 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
325 [=](const LO& j)
326 {
327 size_t distance = 0;
328
329 GO gid_out;
330 distance = j * bytes_per_gid;
331 (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
332 auto lid_out = local_col_map.getLocalElement(gid_out);
333
334 // Column indices come in as global indices, in case the
335 // source object's column Map differs from the target object's
336 // (this's) column Map, and must be converted local index values
337
338 // assume that ST is default constructible
339 ST val_out;
340 distance = j * bytes_per_value;
341 (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
342
343 if (combine_mode == ADD) {
344 // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
345 // different threads don't touch the same row (i.e., no
346 // duplicates in incoming LIDs list).
347 const bool use_atomic_updates = atomic;
348 (void)local_matrix.sumIntoValues(
349 import_lid,
350 &lid_out,
351 1,
352 &val_out,
353 matrix_has_sorted_rows,
354 use_atomic_updates
355 );
356 } else if (combine_mode == REPLACE) {
357 // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
358 // combine mode with multiple incoming rows that touch the same
359 // target matrix entries, so we never need atomic updates.
360 const bool use_atomic_updates = false;
361 (void)local_matrix.replaceValues(
362 import_lid,
363 &lid_out,
364 1,
365 &val_out,
366 matrix_has_sorted_rows,
367 use_atomic_updates
368 );
369 } else {
370 // should never get here
371// FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
372#ifndef KOKKOS_ENABLE_SYCL
373 printf(
374 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375 "At row %d, an unknown error occurred during unpack\n", (int) lid_no
376 );
377#endif
378 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
379 }
380 }
381 );
382
383 team_member.team_barrier();
384
385 }
386
388 int error() const {
389 auto error_code_h = Kokkos::create_mirror_view_and_copy(
390 Kokkos::HostSpace(), error_code
391 );
392 return error_code_h();
393 }
394
395}; //UnpackCrsMatrixAndCombineFunctor
396
397struct MaxNumEntTag {};
398struct TotNumEntTag {};
399
408template<class LO, class DT, class BDT>
409class NumEntriesFunctor {
410public:
411 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412 typedef Kokkos::View<const size_t*, DT> offsets_type;
413 typedef Kokkos::View<const char*, BDT> input_buffer_type;
414 // This needs to be public, since it appears in the argument list of
415 // public methods (see below). Otherwise, build errors may happen.
416 typedef size_t value_type;
417
418private:
419 num_packets_per_lid_type num_packets_per_lid;
420 offsets_type offsets;
421 input_buffer_type imports;
422
423public:
424 NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
425 const offsets_type& offsets_in,
426 const input_buffer_type& imports_in) :
427 num_packets_per_lid (num_packets_per_lid_in),
428 offsets (offsets_in),
429 imports (imports_in)
430 {}
431
432 KOKKOS_INLINE_FUNCTION void
433 operator() (const MaxNumEntTag, const LO i, value_type& update) const {
434 // Get how many entries to expect in the received data for this row.
435 const size_t num_bytes = num_packets_per_lid(i);
436 if (num_bytes > 0) {
437 LO num_ent_LO = 0; // output argument of unpackValue
438 const char* const in_buf = imports.data () + offsets(i);
439 (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
440 const size_t num_ent = static_cast<size_t> (num_ent_LO);
441
442 update = (update < num_ent) ? num_ent : update;
443 }
444 }
445
446 KOKKOS_INLINE_FUNCTION void
447 join (const MaxNumEntTag,
448 value_type& dst,
449 const value_type& src) const
450 {
451 if (dst < src) dst = src;
452 }
453
454 KOKKOS_INLINE_FUNCTION void
455 operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
456 // Get how many entries to expect in the received data for this row.
457 const size_t num_bytes = num_packets_per_lid(i);
458 if (num_bytes > 0) {
459 LO num_ent_LO = 0; // output argument of unpackValue
460 const char* const in_buf = imports.data () + offsets(i);
461 (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
462 tot_num_ent += static_cast<size_t> (num_ent_LO);
463 }
464 }
465}; //NumEntriesFunctor
466
474template<class LO, class DT, class BDT>
475size_t
477 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478 const Kokkos::View<const size_t*, DT>& offsets,
479 const Kokkos::View<const char*, BDT>& imports)
480{
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
483 MaxNumEntTag> range_policy;
484
485 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
486 imports);
487 const LO numRowsToUnpack =
488 static_cast<LO> (num_packets_per_lid.extent (0));
489 size_t max_num_ent = 0;
490 Kokkos::parallel_reduce ("Max num entries in CRS",
491 range_policy (0, numRowsToUnpack),
492 functor, max_num_ent);
493 return max_num_ent;
494}
495
503template<class LO, class DT, class BDT>
504size_t
506 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507 const Kokkos::View<const size_t*, DT>& offsets,
508 const Kokkos::View<const char*, BDT>& imports)
509{
510 typedef typename DT::execution_space XS;
511 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
512 size_t tot_num_ent = 0;
513 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
514 imports);
515 const LO numRowsToUnpack =
516 static_cast<LO> (num_packets_per_lid.extent (0));
517 Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
518 range_policy (0, numRowsToUnpack),
519 functor, tot_num_ent);
520 return tot_num_ent;
521}
522
523template<class LO>
524KOKKOS_INLINE_FUNCTION
525size_t
526unpackRowCount(const char imports[],
527 const size_t offset,
528 const size_t num_bytes)
529{
530 using PT = PackTraits<LO>;
531
532 LO num_ent_LO = 0;
533 if (num_bytes > 0) {
534 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
535 if (p_num_bytes > num_bytes) {
536 return OrdinalTraits<size_t>::invalid();
537 }
538 const char* const in_buf = imports + offset;
539 (void) PT::unpackValue(num_ent_LO, in_buf);
540 }
541 return static_cast<size_t>(num_ent_LO);
542}
543
548template<class View1, class View2>
549inline
550bool
552 const View1& batches_per_lid,
553 View2& batch_info
554)
555{
556 using LO = typename View2::value_type;
557 size_t batch = 0;
558 for (size_t i=0; i<batches_per_lid.extent(0); i++)
559 {
560 for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
561 {
562 batch_info(batch, 0) = static_cast<LO>(i);
563 batch_info(batch, 1) = batch_no;
564 batch++;
565 }
566 }
567 return batch == batch_info.extent(0);
568}
569
577template<class LocalMatrix, class LocalMap, class BufferDeviceType>
578void
580 const LocalMatrix& local_matrix,
581 const LocalMap& local_map,
582 const Kokkos::View<const char*, BufferDeviceType>& imports,
583 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
585 const Tpetra::CombineMode combine_mode)
586{
587 using ST = typename LocalMatrix::value_type;
588 using LO = typename LocalMap::local_ordinal_type;
589 using DT = typename LocalMap::device_type;
590 using XS = typename DT::execution_space;
591 const char prefix[] =
592 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593 "unpackAndCombineIntoCrsMatrix: ";
594
595 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
596 if (num_import_lids == 0) {
597 // Nothing to unpack
598 return;
599 }
600
601 {
602 // Check for correct input
603 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
604 std::invalid_argument,
605 prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
606 "static graph (i.e., was constructed with the CrsMatrix constructor "
607 "that takes a const CrsGraph pointer).");
608
609 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
610 std::invalid_argument,
611 prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
612 "(i.e., was constructed with the CrsMatrix constructor that takes a "
613 "const CrsGraph pointer).");
614
615 // Unknown combine mode!
616 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
617 std::invalid_argument,
618 prefix << "Invalid combine mode; should never get "
619 "here! Please report this bug to the Tpetra developers.");
620
621 // Check that sizes of input objects are consistent.
622 bool bad_num_import_lids =
623 num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
624 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
625 std::invalid_argument,
626 prefix << "importLIDs.size() (" << num_import_lids << ") != "
627 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
628 } // end QA error checking
629
630 // Get the offsets
631 Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
632 computeOffsetsFromCounts(offsets, num_packets_per_lid);
633
634 // Determine the sizes of the unpack batches
635 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
636 const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
637 const size_t batch_size = std::min(default_batch_size, max_num_ent);
638
639 // To achieve some balance amongst threads, unpack each row in equal size batches
640 size_t num_batches = 0;
641 Kokkos::View<LO*[2], DT> batch_info("", num_batches);
642 Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
643 // Compute meta data that allows batch unpacking
644 Kokkos::parallel_reduce(
645 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
646 KOKKOS_LAMBDA(const size_t i, size_t& batches)
647 {
648 const size_t num_entries_in_row = unpackRowCount<LO>(
649 imports.data(), offsets(i), num_packets_per_lid(i)
650 );
651 batches_per_lid(i) =
652 (num_entries_in_row <= batch_size) ?
653 1 :
654 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
655 batches += batches_per_lid(i);
656 },
657 num_batches
658 );
659 Kokkos::resize(batch_info, num_batches);
660
661 Kokkos::HostSpace host_space;
662 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
663 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
664 Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
665
666 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
667
668 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
669 // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
670 Kokkos::deep_copy(XS(), batch_info, batch_info_h);
671
672 // FIXME (TJF SEP 2017)
673 // The scalar type is not necessarily default constructible
674 size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
675
676 // Now do the actual unpack!
677 const bool atomic = XS().concurrency() != 1;
679 functor f(
680 local_matrix,
681 local_map,
682 imports,
683 num_packets_per_lid,
684 import_lids,
685 batch_info,
686 offsets,
687 combine_mode,
688 batch_size,
689 bytes_per_value,
690 atomic
691 );
692
693 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
695 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
696 {
697 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
698 }
699 else
700 {
701 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
702 }
703
704 auto error_code = f.error();
705 TEUCHOS_TEST_FOR_EXCEPTION(
706 error_code != 0,
707 std::runtime_error,
708 prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
709 );
710} //unpackAndCombineIntoCrsMatrix (Kokkos version)
711
712template<class LocalMatrix, class BufferDeviceType>
713size_t
715 const LocalMatrix& local_matrix,
717 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
718 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
719 const size_t num_same_ids)
720{
721 using Kokkos::parallel_reduce;
722 typedef typename LocalMatrix::ordinal_type LO;
723 typedef typename LocalMatrix::device_type device_type;
724 typedef typename device_type::execution_space XS;
725 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
726 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
727 typedef BufferDeviceType BDT;
728
729 size_t count = 0;
730 LO num_items;
731
732 // Number of matrix entries to unpack (returned by this function).
733 num_items = static_cast<LO>(num_same_ids);
734 if (num_items) {
735 size_t kcnt = 0;
736 parallel_reduce(range_policy(0, num_items),
737 KOKKOS_LAMBDA(const LO lid, size_t& update) {
738 update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
739 -local_matrix.graph.row_map[lid]);
740 }, kcnt);
741 count += kcnt;
742 }
743
744 // Count entries copied directly from the source matrix with permuting.
745 num_items = static_cast<LO>(permute_from_lids.extent(0));
746 if (num_items) {
747 size_t kcnt = 0;
748 parallel_reduce(range_policy(0, num_items),
749 KOKKOS_LAMBDA(const LO i, size_t& update) {
750 const LO lid = permute_from_lids(i);
751 update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
752 - local_matrix.graph.row_map[lid]);
753 }, kcnt);
754 count += kcnt;
755 }
756
757 {
758 // Count entries received from other MPI processes.
759 const size_type np = num_packets_per_lid.extent(0);
760 Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
761 computeOffsetsFromCounts(offsets, num_packets_per_lid);
762 count +=
764 offsets, imports);
765 }
766
767 return count;
768} //unpackAndCombineWithOwningPIDsCount (Kokkos version)
769
771template<class LO, class DT, class BDT>
772int
774 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
775 const typename PackTraits<LO>::input_array_type& import_lids,
776 const Kokkos::View<const char*, BDT>& imports,
777 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
778 const typename PackTraits<size_t>::input_array_type& offsets)
779{
780 using Kokkos::parallel_reduce;
781 typedef typename DT::execution_space XS;
782 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
783 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
784
785 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
786 const size_type N = num_packets_per_lid.extent(0);
787
788 int errors = 0;
789 parallel_reduce ("Setup row pointers for remotes",
790 range_policy (0, N),
791 KOKKOS_LAMBDA (const size_t i, int& k_error) {
792 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
793 const size_t num_bytes = num_packets_per_lid(i);
794 const size_t offset = offsets(i);
795 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
796 if (num_ent == InvalidNum) {
797 k_error += 1;
798 }
799 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
800 }, errors);
801 return errors;
802}
803
804// Convert array of row lengths to a CRS pointer array
805template<class DT>
806void
807makeCrsRowPtrFromLengths(
808 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
809 const Kokkos::View<size_t*,DT>& new_start_row)
810{
811 using Kokkos::parallel_scan;
812 typedef typename DT::execution_space XS;
813 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
814 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
815 const size_type N = new_start_row.extent(0);
816 parallel_scan(range_policy(0, N),
817 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
818 auto cur_val = tgt_rowptr(i);
819 if (final) {
820 tgt_rowptr(i) = update;
821 new_start_row(i) = tgt_rowptr(i);
822 }
823 update += cur_val;
824 }
825 );
826}
827
828template<class LocalMatrix, class LocalMap>
829void
830copyDataFromSameIDs(
832 const typename PackTraits<int>::output_array_type& tgt_pids,
834 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
835 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
836 const typename PackTraits<int>::input_array_type& src_pids,
837 const LocalMatrix& local_matrix,
838 const LocalMap& local_col_map,
839 const size_t num_same_ids,
840 const int my_pid)
841{
842 using Kokkos::parallel_for;
843 typedef typename LocalMap::device_type DT;
844 typedef typename LocalMap::local_ordinal_type LO;
845 typedef typename DT::execution_space XS;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
847
848 parallel_for(range_policy(0, num_same_ids),
849 KOKKOS_LAMBDA(const size_t i) {
850 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
851
852 const LO src_lid = static_cast<LO>(i);
853 size_t src_row = local_matrix.graph.row_map(src_lid);
854
855 const LO tgt_lid = static_cast<LO>(i);
856 const size_t tgt_row = tgt_rowptr(tgt_lid);
857
858 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
859 - local_matrix.graph.row_map(src_lid);
860 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
861
862 for (size_t j=local_matrix.graph.row_map(src_lid);
863 j<local_matrix.graph.row_map(src_lid+1); ++j) {
864 LO src_col = local_matrix.graph.entries(j);
865 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
866 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
867 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
868 }
869 }
870 );
871}
872
873template<class LocalMatrix, class LocalMap>
874void
875copyDataFromPermuteIDs(
877 const typename PackTraits<int>::output_array_type& tgt_pids,
879 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
880 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
881 const typename PackTraits<int>::input_array_type& src_pids,
884 const LocalMatrix& local_matrix,
885 const LocalMap& local_col_map,
886 const int my_pid)
887{
888 using Kokkos::parallel_for;
889 typedef typename LocalMap::device_type DT;
890 typedef typename LocalMap::local_ordinal_type LO;
891 typedef typename DT::execution_space XS;
892 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
893 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
894
895 const size_type num_permute_to_lids = permute_to_lids.extent(0);
896
897 parallel_for(range_policy(0, num_permute_to_lids),
898 KOKKOS_LAMBDA(const size_t i) {
899 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
900
901 const LO src_lid = permute_from_lids(i);
902 const size_t src_row = local_matrix.graph.row_map(src_lid);
903
904 const LO tgt_lid = permute_to_lids(i);
905 const size_t tgt_row = tgt_rowptr(tgt_lid);
906
907 size_t nsr = local_matrix.graph.row_map(src_lid+1)
908 - local_matrix.graph.row_map(src_lid);
909 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
910
911 for (size_t j=local_matrix.graph.row_map(src_lid);
912 j<local_matrix.graph.row_map(src_lid+1); ++j) {
913 LO src_col = local_matrix.graph.entries(j);
914 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
915 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
916 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
917 }
918 }
919 );
920}
921
922template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
923int
924unpackAndCombineIntoCrsArrays2(
926 const typename PackTraits<int>::output_array_type& tgt_pids,
928 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
929 const typename PackTraits<size_t>::input_array_type& offsets,
931 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
932 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
933 const LocalMatrix& /* local_matrix */,
934 const LocalMap /*& local_col_map*/,
935 const int my_pid,
936 const size_t bytes_per_value)
937{
938 using Kokkos::View;
939 using Kokkos::subview;
940 using Kokkos::MemoryUnmanaged;
941 using Kokkos::parallel_reduce;
942 using Kokkos::atomic_fetch_add;
943 using Tpetra::Details::PackTraits;
944 typedef typename LocalMap::device_type DT;
945 typedef typename LocalMap::local_ordinal_type LO;
946 typedef typename LocalMap::global_ordinal_type GO;
947 typedef typename LocalMatrix::value_type ST;
948 typedef typename DT::execution_space XS;
949 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
950 typedef typename Kokkos::pair<size_type, size_type> slice;
951 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
952
953 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
954 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
955 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
956
957 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
958
959 int errors = 0;
960 const size_type num_import_lids = import_lids.size();
961
962 // RemoteIDs: Loop structure following UnpackAndCombine
963 parallel_reduce ("Unpack and combine into CRS",
964 range_policy (0, num_import_lids),
965 KOKKOS_LAMBDA (const size_t i, int& k_error) {
966 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
967 const size_t num_bytes = num_packets_per_lid(i);
968 const size_t offset = offsets(i);
969 if (num_bytes == 0) {
970 // Empty buffer means that the row is empty.
971 return;
972 }
973 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
974 if (num_ent == InvalidNum) {
975 k_error += 1;
976 return;
977 }
978 const LO lcl_row = import_lids(i);
979 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
980 const size_t end_row = start_row + num_ent;
981
982 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
983 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
984 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
985
986 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
987 imports.data(), offset, num_bytes,
988 num_ent, bytes_per_value);
989
990 // Correct target PIDs.
991 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
992 const int pid = pids_out(j);
993 pids_out(j) = (pid != my_pid) ? pid : -1;
994 }
995 }, errors);
996
997 return errors;
998}
999
1000template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1001void
1002unpackAndCombineIntoCrsArrays(
1003 const LocalMatrix & local_matrix,
1004 const LocalMap & local_col_map,
1006 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1007 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1010 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1013 const typename PackTraits<int>::input_array_type& src_pids,
1014 const typename PackTraits<int>::output_array_type& tgt_pids,
1015 const size_t num_same_ids,
1016 const size_t tgt_num_rows,
1017 const size_t tgt_num_nonzeros,
1018 const int my_tgt_pid,
1019 const size_t bytes_per_value)
1020{
1021 using Kokkos::View;
1022 using Kokkos::subview;
1023 using Kokkos::parallel_for;
1024 using Kokkos::MemoryUnmanaged;
1025 using Tpetra::Details::PackTraits;
1026 typedef typename LocalMap::device_type DT;
1027 typedef typename LocalMap::local_ordinal_type LO;
1028 typedef typename DT::execution_space XS;
1029 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1030 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1031 typedef BufferDeviceType BDT;
1032
1033 const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1034
1035 const size_t N = tgt_num_rows;
1036
1037 // In the case of reduced communicators, the sourceMatrix won't have
1038 // the right "my_pid", so thus we have to supply it.
1039 const int my_pid = my_tgt_pid;
1040
1041 // Zero the rowptr
1042 parallel_for(range_policy(0, N+1),
1043 KOKKOS_LAMBDA(const size_t i) {
1044 tgt_rowptr(i) = 0;
1045 }
1046 );
1047
1048 // same IDs: Always first, always in the same place
1049 parallel_for(range_policy(0, num_same_ids),
1050 KOKKOS_LAMBDA(const size_t i) {
1051 const LO tgt_lid = static_cast<LO>(i);
1052 const LO src_lid = static_cast<LO>(i);
1053 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1054 - local_matrix.graph.row_map(src_lid);
1055 }
1056 );
1057
1058 // Permute IDs: Still local, but reordered
1059 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1060 parallel_for(range_policy(0, num_permute_to_lids),
1061 KOKKOS_LAMBDA(const size_t i) {
1062 const LO tgt_lid = permute_to_lids(i);
1063 const LO src_lid = permute_from_lids(i);
1064 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1065 - local_matrix.graph.row_map(src_lid);
1066 }
1067 );
1068
1069 // Get the offsets from the number of packets per LID
1070 const size_type num_import_lids = import_lids.extent(0);
1071 View<size_t*, DT> offsets("offsets", num_import_lids+1);
1072 computeOffsetsFromCounts(offsets, num_packets_per_lid);
1073
1074#ifdef HAVE_TPETRA_DEBUG
1075 {
1076 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1077 const bool condition =
1078 nth_offset_h != static_cast<size_t>(imports.extent (0));
1079 TEUCHOS_TEST_FOR_EXCEPTION
1080 (condition, std::logic_error, prefix
1081 << "The final offset in bytes " << nth_offset_h
1082 << " != imports.size() = " << imports.extent(0)
1083 << ". Please report this bug to the Tpetra developers.");
1084 }
1085#endif // HAVE_TPETRA_DEBUG
1086
1087 // Setup row pointers for remotes
1088 int k_error =
1090 import_lids, imports, num_packets_per_lid, offsets);
1091 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1092 << " Error transferring data to target row pointers. "
1093 "Please report this bug to the Tpetra developers.");
1094
1095 // If multiple processes contribute to the same row, we may need to
1096 // update row offsets. This tracks that.
1097 View<size_t*, DT> new_start_row ("new_start_row", N+1);
1098
1099 // Turn row length into a real CRS row pointer
1100 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1101
1102 // SameIDs: Copy the data over
1103 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1104 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1105
1106 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1107 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1108 local_matrix, local_col_map, my_pid);
1109
1110 if (imports.extent(0) <= 0) {
1111 return;
1112 }
1113
1114 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1115 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1116 local_matrix, local_col_map, my_pid, bytes_per_value);
1117 TEUCHOS_TEST_FOR_EXCEPTION(
1118 unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1119 "should never happen. Please report this bug to the Tpetra developers.");
1120
1121 return;
1122}
1123
1124} // namespace UnpackAndCombineCrsMatrixImpl
1125
1165template<typename ST, typename LO, typename GO, typename Node>
1166void
1168 const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1169 const Teuchos::ArrayView<const char>& imports,
1170 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1171 const Teuchos::ArrayView<const LO>& importLIDs,
1172 size_t /* constantNumPackets */,
1173 CombineMode combineMode)
1174{
1175 using Kokkos::View;
1176 typedef typename Node::device_type device_type;
1178 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1179 "Node::device_type and LocalMatrix::device_type must be the same.");
1180
1181 // Convert all Teuchos::Array to Kokkos::View.
1182 device_type outputDevice;
1183
1184 // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1185 // them to device. Since unpacking is done directly in to the local matrix
1186 // (lclMatrix), no copying needs to be performed after unpacking.
1187 auto num_packets_per_lid_d =
1188 create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1189 numPacketsPerLID.size(), true, "num_packets_per_lid");
1190
1191 auto import_lids_d =
1192 create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1193 importLIDs.size(), true, "import_lids");
1194
1195 auto imports_d =
1196 create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1197 imports.size(), true, "imports");
1198
1199 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1200 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1201
1202//KDDKDD This loop doesn't appear to do anything; what is it?
1203//KDDKDD for (int i=0; i<importLIDs.size(); i++)
1204//KDDKDD {
1205//KDDKDD auto lclRow = importLIDs[i];
1206//KDDKDD Teuchos::ArrayView<const LO> A_indices;
1207//KDDKDD Teuchos::ArrayView<const ST> A_values;
1208//KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1209//KDDKDD }
1210 // Now do the actual unpack!
1212 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1213 import_lids_d, combineMode);
1214
1215}
1216
1217template<typename ST, typename LO, typename GO, typename NT>
1218void
1219unpackCrsMatrixAndCombineNew(
1220 const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1221 Kokkos::DualView<char*,
1223 Kokkos::DualView<size_t*,
1224 typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1225 const Kokkos::DualView<const LO*,
1227 const size_t /* constantNumPackets */,
1228 const CombineMode combineMode)
1229{
1230 using Kokkos::View;
1231 using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1232 using dist_object_type = DistObject<char, LO, GO, NT>;
1233 using device_type = typename crs_matrix_type::device_type;
1234 using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1235 using buffer_device_type = typename dist_object_type::buffer_device_type;
1236
1237 static_assert
1238 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1239 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1240 "must be the same.");
1241
1242 if (numPacketsPerLID.need_sync_device()) {
1243 numPacketsPerLID.sync_device ();
1244 }
1245 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1246
1247 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1248 auto import_lids_d = importLIDs.view_device ();
1249
1250 if (imports.need_sync_device()) {
1251 imports.sync_device ();
1252 }
1253 auto imports_d = imports.view_device ();
1254
1255 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1256 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1257 typedef decltype (local_col_map) local_map_type;
1258
1260 local_matrix_device_type,
1261 local_map_type,
1262 buffer_device_type
1263 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1264 import_lids_d, combineMode);
1265}
1266
1313//
1322template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1323size_t
1326 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1327 const Teuchos::ArrayView<const char> &imports,
1328 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1329 size_t /* constantNumPackets */,
1330 CombineMode /* combineMode */,
1331 size_t numSameIDs,
1332 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1333 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1334{
1335 using Kokkos::MemoryUnmanaged;
1336 using Kokkos::View;
1337 typedef typename Node::device_type DT;
1338 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1339
1340 TEUCHOS_TEST_FOR_EXCEPTION
1341 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1342 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1343 "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1344 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1345 // process, then the matrix is neither locally nor globally indexed.
1346 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1347 TEUCHOS_TEST_FOR_EXCEPTION
1348 (! locallyIndexed, std::invalid_argument, prefix << "The input "
1349 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1350 TEUCHOS_TEST_FOR_EXCEPTION
1351 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1352 prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1353 "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1354
1355 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1356
1357 using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1358 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1359
1360 Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1362 permuteFromLIDs.getRawPtr (),
1363 permuteFromLIDs.size (), true,
1364 "permute_from_lids");
1365
1366 Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1368 imports.getRawPtr (),
1369 imports.size (), true,
1370 "imports");
1371
1372 Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1374 numPacketsPerLID.getRawPtr (),
1375 numPacketsPerLID.size (), true,
1376 "num_packets_per_lid");
1377
1378 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1379 local_matrix, permute_from_lids_d, imports_d,
1380 num_packets_per_lid_d, numSameIDs);
1381} //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1382
1397
1398template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1399void
1402 const Kokkos::View<LocalOrdinal const *,
1403 Kokkos::Device<typename Node::device_type::execution_space,
1404 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1405 void, void > import_lids_d,
1406 const Kokkos::View<const char*,
1407 Kokkos::Device<typename Node::device_type::execution_space,
1408 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1409 void, void > imports_d,
1410 const Kokkos::View<const size_t*,
1411 Kokkos::Device<typename Node::device_type::execution_space,
1412 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1413 void, void > num_packets_per_lid_d,
1414 const size_t numSameIDs,
1415 const Kokkos::View<LocalOrdinal const *,
1416 Kokkos::Device<typename Node::device_type::execution_space,
1417 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1418 void, void > permute_to_lids_d,
1419 const Kokkos::View<LocalOrdinal const *,
1420 Kokkos::Device<typename Node::device_type::execution_space,
1421 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1422 void, void > permute_from_lids_d,
1423 size_t TargetNumRows,
1424 const int MyTargetPID,
1425 Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1426 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1427 Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,typename Node::device_type>& crs_vals_d,
1428 const Teuchos::ArrayView<const int>& SourcePids,
1429 Kokkos::View<int*,typename Node::device_type> &TargetPids)
1430{
1431 using execution_space = typename Node::execution_space;
1433
1434 using Kokkos::View;
1435 using Kokkos::deep_copy;
1436
1437 using Teuchos::ArrayView;
1438 using Teuchos::outArg;
1439 using Teuchos::REDUCE_MAX;
1440 using Teuchos::reduceAll;
1441
1442 typedef typename Node::device_type DT;
1443
1445 typedef typename matrix_type::impl_scalar_type ST;
1446
1447 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1448# ifdef HAVE_TPETRA_MMM_TIMINGS
1449 using Teuchos::TimeMonitor;
1450 Teuchos::RCP<TimeMonitor> tm;
1451# endif
1452
1453 using Kokkos::MemoryUnmanaged;
1454
1455 TEUCHOS_TEST_FOR_EXCEPTION
1456 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1457 prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1458 "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1459 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1460 // process, then the matrix is neither locally nor globally indexed.
1461 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1462 TEUCHOS_TEST_FOR_EXCEPTION
1463 (! locallyIndexed, std::invalid_argument, prefix << "The input "
1464 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1465 TEUCHOS_TEST_FOR_EXCEPTION
1466 (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1467 prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1468 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1469
1470 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1471
1472 // TargetNumNonzeros is number of nonzeros in local matrix.
1473# ifdef HAVE_TPETRA_MMM_TIMINGS
1474 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1475# endif
1476 size_t TargetNumNonzeros =
1477 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1478 local_matrix, permute_from_lids_d, imports_d,
1479 num_packets_per_lid_d, numSameIDs);
1480# ifdef HAVE_TPETRA_MMM_TIMINGS
1481 tm = Teuchos::null;
1482# endif
1483
1484# ifdef HAVE_TPETRA_MMM_TIMINGS
1485 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1486# endif
1487 Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1488 Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1489 Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1490# ifdef HAVE_TPETRA_MMM_TIMINGS
1491 tm = Teuchos::null;
1492# endif
1493
1494 TEUCHOS_TEST_FOR_EXCEPTION(
1495 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1496 prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1497 << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1498
1499 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1500 Kokkos::resize(TargetPids,TargetNumNonzeros);
1501 }
1502 Kokkos::deep_copy(execution_space(), TargetPids, -1);
1503
1504 // Grab pointers for sourceMatrix
1505 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1506
1507# ifdef HAVE_TPETRA_MMM_TIMINGS
1508 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1509# endif
1510 // Convert input arrays to Kokkos::Views
1511 DT outputDevice;
1512
1513 auto src_pids_d =
1514 create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1515 SourcePids.size(), true, "src_pids");
1516
1517# ifdef HAVE_TPETRA_MMM_TIMINGS
1518 tm = Teuchos::null;
1519# endif
1520
1521 size_t bytes_per_value = 0;
1523 // assume that ST is default constructible
1524 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1525 }
1526 else {
1527 // Since the packed data come from the source matrix, we can use the source
1528 // matrix to get the number of bytes per Scalar value stored in the matrix.
1529 // This assumes that all Scalar values in the source matrix require the same
1530 // number of bytes. If the source matrix has no entries on the calling
1531 // process, then we hope that some process does have some idea how big
1532 // a Scalar value is. Of course, if no processes have any entries, then no
1533 // values should be packed (though this does assume that in our packing
1534 // scheme, rows with zero entries take zero bytes).
1535 size_t bytes_per_value_l = 0;
1536 if (local_matrix.values.extent(0) > 0) {
1537 const ST& val = local_matrix.values(0);
1538 bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1539 } else {
1540 const ST& val = crs_vals_d(0);
1541 bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1542 }
1543 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1544 Teuchos::REDUCE_MAX,
1545 bytes_per_value_l,
1546 outArg(bytes_per_value));
1547 }
1548
1549# ifdef HAVE_TPETRA_MMM_TIMINGS
1550 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1551# endif
1552 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1553 local_matrix, local_col_map, import_lids_d, imports_d,
1554 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1555 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1556 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1557 bytes_per_value);
1558# ifdef HAVE_TPETRA_MMM_TIMINGS
1559 tm = Teuchos::null;
1560# endif
1561
1562 // Copy outputs back to host
1563# ifdef HAVE_TPETRA_MMM_TIMINGS
1564 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1565# endif
1566
1567 Kokkos::parallel_for("setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (const size_t i) {
1568 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1569 });
1570
1571} //unpackAndCombineIntoCrsArrays
1572
1573template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1574void
1577 const Kokkos::View<LocalOrdinal const *,
1578 Kokkos::Device<typename Node::device_type::execution_space,
1579 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1580 void, void > import_lids_d,
1581 const Kokkos::View<const char*,
1582 Kokkos::Device<typename Node::device_type::execution_space,
1583 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1584 void, void > imports_d,
1585 const Kokkos::View<const size_t*,
1586 Kokkos::Device<typename Node::device_type::execution_space,
1587 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1588 void, void > num_packets_per_lid_d,
1589 const size_t numSameIDs,
1590 const Kokkos::View<LocalOrdinal const *,
1591 Kokkos::Device<typename Node::device_type::execution_space,
1592 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1593 void, void > permute_to_lids_d,
1594 const Kokkos::View<LocalOrdinal const *,
1595 Kokkos::Device<typename Node::device_type::execution_space,
1596 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1597 void, void > permute_from_lids_d,
1598 size_t TargetNumRows,
1599 const int MyTargetPID,
1600 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1601 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1602 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1603 const Teuchos::ArrayView<const int>& SourcePids,
1604 Teuchos::Array<int>& TargetPids)
1605{
1606 using execution_space = typename Node::execution_space;
1608
1609 using Kokkos::View;
1610 using Kokkos::deep_copy;
1611
1612 using Teuchos::ArrayView;
1613 using Teuchos::outArg;
1614 using Teuchos::REDUCE_MAX;
1615 using Teuchos::reduceAll;
1616
1617 typedef typename Node::device_type DT;
1618
1620 typedef typename matrix_type::impl_scalar_type ST;
1621
1622 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1623# ifdef HAVE_TPETRA_MMM_TIMINGS
1624 using Teuchos::TimeMonitor;
1625 Teuchos::RCP<TimeMonitor> tm;
1626# endif
1627
1628 using Kokkos::MemoryUnmanaged;
1629
1630 TEUCHOS_TEST_FOR_EXCEPTION
1631 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1632 prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1633 "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1634 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1635 // process, then the matrix is neither locally nor globally indexed.
1636 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1637 TEUCHOS_TEST_FOR_EXCEPTION
1638 (! locallyIndexed, std::invalid_argument, prefix << "The input "
1639 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1640 TEUCHOS_TEST_FOR_EXCEPTION
1641 (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1642 prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1643 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1644
1645 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1646
1647 // TargetNumNonzeros is number of nonzeros in local matrix.
1648# ifdef HAVE_TPETRA_MMM_TIMINGS
1649 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1650# endif
1651 size_t TargetNumNonzeros =
1652 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1653 local_matrix, permute_from_lids_d, imports_d,
1654 num_packets_per_lid_d, numSameIDs);
1655# ifdef HAVE_TPETRA_MMM_TIMINGS
1656 tm = Teuchos::null;
1657# endif
1658
1659# ifdef HAVE_TPETRA_MMM_TIMINGS
1660 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1661# endif
1662 CRS_rowptr.resize (TargetNumRows+1);
1663 CRS_colind.resize(TargetNumNonzeros);
1664 CRS_vals.resize(TargetNumNonzeros);
1665 Teuchos::ArrayRCP<ST> const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1666# ifdef HAVE_TPETRA_MMM_TIMINGS
1667 tm = Teuchos::null;
1668# endif
1669
1670 TEUCHOS_TEST_FOR_EXCEPTION(
1671 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1672 prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1673 << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1674
1675 // Preseed TargetPids with -1 for local
1676 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1677 TargetPids.resize (TargetNumNonzeros);
1678 }
1679 TargetPids.assign (TargetNumNonzeros, -1);
1680
1681 // Grab pointers for sourceMatrix
1682 auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1683
1684# ifdef HAVE_TPETRA_MMM_TIMINGS
1685 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1686# endif
1687 // Convert input arrays to Kokkos::Views
1688 DT outputDevice;
1689
1690 auto crs_rowptr_d =
1691 create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1692 CRS_rowptr.size(), true, "crs_rowptr");
1693
1694 auto crs_colind_d =
1695 create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1696 CRS_colind.size(), true, "crs_colidx");
1697#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1698 static_assert (! std::is_same<
1699 typename std::remove_const<
1700 typename std::decay<
1701 decltype (CRS_vals_impl_scalar_type)
1702 >::type::value_type
1703 >::type,
1704 std::complex<double> >::value,
1705 "CRS_vals::value_type is std::complex<double>; this should never happen"
1706 ", since std::complex does not work in Kokkos::View objects.");
1707#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1708
1709 auto crs_vals_d =
1710 create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(),
1711 CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1712
1713#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714 static_assert (! std::is_same<
1715 typename decltype (crs_vals_d)::non_const_value_type,
1716 std::complex<double> >::value,
1717 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718 "never happen, since std::complex does not work in Kokkos::View objects.");
1719#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1720
1721 auto src_pids_d =
1722 create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1723 SourcePids.size(), true, "src_pids");
1724
1725 auto tgt_pids_d =
1726 create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1727 TargetPids.size(), true, "tgt_pids");
1728
1729# ifdef HAVE_TPETRA_MMM_TIMINGS
1730 tm = Teuchos::null;
1731# endif
1732
1733 size_t bytes_per_value = 0;
1735 // assume that ST is default constructible
1736 bytes_per_value = PackTraits<ST>::packValueCount(ST());
1737 }
1738 else {
1739 // Since the packed data come from the source matrix, we can use the source
1740 // matrix to get the number of bytes per Scalar value stored in the matrix.
1741 // This assumes that all Scalar values in the source matrix require the same
1742 // number of bytes. If the source matrix has no entries on the calling
1743 // process, then we hope that some process does have some idea how big
1744 // a Scalar value is. Of course, if no processes have any entries, then no
1745 // values should be packed (though this does assume that in our packing
1746 // scheme, rows with zero entries take zero bytes).
1747 size_t bytes_per_value_l = 0;
1748 if (local_matrix.values.extent(0) > 0) {
1749 const ST& val = local_matrix.values(0);
1750 bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1751 } else {
1752 const ST& val = crs_vals_d(0);
1753 bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1754 }
1755 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1756 Teuchos::REDUCE_MAX,
1757 bytes_per_value_l,
1758 outArg(bytes_per_value));
1759 }
1760
1761#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762 static_assert (! std::is_same<
1763 typename decltype (crs_vals_d)::non_const_value_type,
1764 std::complex<double> >::value,
1765 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766 "never happen, since std::complex does not work in Kokkos::View objects.");
1767#endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1768
1769# ifdef HAVE_TPETRA_MMM_TIMINGS
1770 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1771# endif
1772 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1773 local_matrix, local_col_map, import_lids_d, imports_d,
1774 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1775 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1776 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1777 bytes_per_value);
1778# ifdef HAVE_TPETRA_MMM_TIMINGS
1779 tm = Teuchos::null;
1780# endif
1781
1782 // Copy outputs back to host
1783# ifdef HAVE_TPETRA_MMM_TIMINGS
1784 tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1785# endif
1786 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1787 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1788 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1789 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1790
1791 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1792 CRS_colind.getRawPtr(), CRS_colind.size());
1793 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1794 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1795
1796 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1797 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1798 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1799 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1800
1801 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1802 TargetPids.getRawPtr(), TargetPids.size());
1803 // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1804 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1805
1806} //unpackAndCombineIntoCrsArrays
1807
1808
1809} // namespace Details
1810} // namespace Tpetra
1811
1812#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1813 template void \
1814 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1815 const CrsMatrix<ST, LO, GO, NT>&, \
1816 const Teuchos::ArrayView<const char>&, \
1817 const Teuchos::ArrayView<const size_t>&, \
1818 const Teuchos::ArrayView<const LO>&, \
1819 size_t, \
1820 CombineMode); \
1821 template size_t \
1822 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1823 const CrsMatrix<ST, LO, GO, NT> &, \
1824 const Teuchos::ArrayView<const LO> &, \
1825 const Teuchos::ArrayView<const char> &, \
1826 const Teuchos::ArrayView<const size_t>&, \
1827 size_t, \
1828 CombineMode, \
1829 size_t, \
1830 const Teuchos::ArrayView<const LO>&, \
1831 const Teuchos::ArrayView<const LO>&); \
1832 template void \
1833 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1834 const CrsMatrix<ST, LO, GO, NT>&, \
1835 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1836 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1837 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1838 const size_t, \
1839 const CombineMode); \
1840 template void \
1841 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1842 const CrsMatrix<ST, LO, GO, NT> &, \
1843 const Kokkos::View<LO const *, \
1844 Kokkos::Device<typename NT::device_type::execution_space, \
1845 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1846 void, void >, \
1847 const Kokkos::View<const char*, \
1848 Kokkos::Device<typename NT::device_type::execution_space, \
1849 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1850 void, void >, \
1851 const Kokkos::View<const size_t*, \
1852 Kokkos::Device<typename NT::device_type::execution_space, \
1853 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1854 void, void >, \
1855 const size_t, \
1856 const Kokkos::View<LO const *, \
1857 Kokkos::Device<typename NT::device_type::execution_space, \
1858 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1859 void, void >, \
1860 const Kokkos::View<LO const *, \
1861 Kokkos::Device<typename NT::device_type::execution_space, \
1862 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1863 void, void >, \
1864 size_t, \
1865 const int, \
1866 Kokkos::View<size_t*,typename NT::device_type>&, \
1867 Kokkos::View<GO*,typename NT::device_type>&, \
1868 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1869 const Teuchos::ArrayView<const int>&, \
1870 Kokkos::View<int*,typename NT::device_type>&); \
1871 template void \
1872 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1873 const CrsMatrix<ST, LO, GO, NT> &, \
1874 const Kokkos::View<LO const *, \
1875 Kokkos::Device<typename NT::device_type::execution_space, \
1876 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1877 void, void >, \
1878 const Kokkos::View<const char*, \
1879 Kokkos::Device<typename NT::device_type::execution_space, \
1880 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1881 void, void >, \
1882 const Kokkos::View<const size_t*, \
1883 Kokkos::Device<typename NT::device_type::execution_space, \
1884 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1885 void, void >, \
1886 const size_t, \
1887 const Kokkos::View<LO const *, \
1888 Kokkos::Device<typename NT::device_type::execution_space, \
1889 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1890 void, void >, \
1891 const Kokkos::View<LO const *, \
1892 Kokkos::Device<typename NT::device_type::execution_space, \
1893 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1894 void, void >, \
1895 size_t, \
1896 const int, \
1897 Teuchos::ArrayRCP<size_t>&, \
1898 Teuchos::ArrayRCP<GO>&, \
1899 Teuchos::ArrayRCP<ST>&, \
1900 const Teuchos::ArrayView<const int>&, \
1901 Teuchos::Array<int>&);
1902
1903#endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
int setupRowPointersForRemotes(const typename PackTraits< size_t >::output_array_type &tgt_rowptr, const typename PackTraits< LO >::input_array_type &import_lids, const Kokkos::View< const char *, BDT > &imports, const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const typename PackTraits< size_t >::input_array_type &offsets)
Setup row pointers for remotes.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
KOKKOS_FUNCTION int unpackRow(const typename PackTraits< GO >::output_array_type &gids_out, const typename PackTraits< int >::output_array_type &pids_out, const typename PackTraits< ST >::output_array_type &vals_out, const char imports[], const size_t offset, const size_t, const size_t num_ent, const size_t bytes_per_value)
Unpack a single row of a CrsMatrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Base class for distributed Tpetra objects that support data redistribution.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Nonmember function that computes a residual Computes R = B - A * X.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static const bool compileTimeSize
Whether the number of bytes required to pack one instance of value_type is fixed at compile time.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.