10#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
15#include "TpetraCore_config.h"
16#include "Kokkos_Core.hpp"
17#include "Teuchos_Array.hpp"
18#include "Teuchos_ArrayView.hpp"
19#include "Teuchos_OrdinalTraits.hpp"
20#include "Teuchos_TimeMonitor.hpp"
28#include "Tpetra_Details_DefaultTypes.hpp"
59namespace UnpackAndCombineCrsMatrixImpl {
70template<
class ST,
class LO,
class GO>
79 const size_t bytes_per_value)
85 bool unpack_pids = pids_out.size() > 0;
87 const size_t num_ent_beg = offset;
90 const size_t gids_beg = num_ent_beg + num_ent_len;
91 const size_t gids_len =
94 const size_t pids_beg = gids_beg + gids_len;
95 const size_t pids_len = unpack_pids ?
99 const size_t vals_beg = gids_beg + gids_len + pids_len;
100 const size_t vals_len = num_ent * bytes_per_value;
102 const char*
const num_ent_in = imports + num_ent_beg;
103 const char*
const gids_in = imports + gids_beg;
104 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
105 const char*
const vals_in = imports + vals_beg;
107 size_t num_bytes_out = 0;
110 if (
static_cast<size_t> (num_ent_out) != num_ent) {
115 Kokkos::pair<int, size_t> p;
120 num_bytes_out += p.second;
127 num_bytes_out += p.second;
134 num_bytes_out += p.second;
137 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138 if (num_bytes_out != expected_num_bytes) {
154template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
155struct UnpackCrsMatrixAndCombineFunctor {
156 typedef LocalMatrix local_matrix_type;
159 typedef typename local_matrix_type::value_type ST;
163 typedef typename DT::execution_space XS;
165 typedef Kokkos::View<const size_t*, BufferDeviceType>
166 num_packets_per_lid_type;
167 typedef Kokkos::View<const size_t*, DT> offsets_type;
168 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
171 typedef Kokkos::View<int, DT> error_type;
172 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
174 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175 "LocalMap::local_ordinal_type and "
176 "LocalMatrix::ordinal_type must be the same.");
178 local_matrix_type local_matrix;
179 local_map_type local_col_map;
180 input_buffer_type imports;
181 num_packets_per_lid_type num_packets_per_lid;
182 import_lids_type import_lids;
183 Kokkos::View<const LO*[2], DT> batch_info;
184 offsets_type offsets;
187 size_t bytes_per_value;
189 error_type error_code;
191 UnpackCrsMatrixAndCombineFunctor(
192 const local_matrix_type& local_matrix_in,
193 const local_map_type& local_col_map_in,
194 const input_buffer_type& imports_in,
195 const num_packets_per_lid_type& num_packets_per_lid_in,
196 const import_lids_type& import_lids_in,
197 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
198 const offsets_type& offsets_in,
200 const size_t batch_size_in,
201 const size_t bytes_per_value_in,
202 const bool atomic_in) :
203 local_matrix (local_matrix_in),
204 local_col_map (local_col_map_in),
205 imports (imports_in),
206 num_packets_per_lid (num_packets_per_lid_in),
207 import_lids (import_lids_in),
208 batch_info (batch_info_in),
209 offsets (offsets_in),
210 combine_mode (combine_mode_in),
211 batch_size (batch_size_in),
212 bytes_per_value (bytes_per_value_in),
217 KOKKOS_INLINE_FUNCTION
218 void operator()(member_type team_member)
const
221 using Kokkos::subview;
222 using Kokkos::MemoryUnmanaged;
224 const LO batch = team_member.league_rank();
225 const LO lid_no = batch_info(batch, 0);
226 const LO batch_no = batch_info(batch, 1);
228 const size_t num_bytes = num_packets_per_lid(lid_no);
235 const LO import_lid = import_lids(lid_no);
236 const size_t buf_size = imports.size();
237 const size_t offset = offsets(lid_no);
241 const char*
const in_buf = imports.data() + offset;
243 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
246 size_t expected_num_bytes = 0;
253 if (expected_num_bytes > num_bytes)
256#ifndef KOKKOS_ENABLE_SYCL
258 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
260 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
263 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
267 if (offset > buf_size || offset + num_bytes > buf_size)
270#ifndef KOKKOS_ENABLE_SYCL
272 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273 "At row %d, the offset (%d) > buffer size (%d)\n",
274 (
int) lid_no, (
int) offset, (
int) buf_size
277 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
282 size_t num_entries_in_batch = 0;
283 if (num_entries_in_row <= batch_size)
284 num_entries_in_batch = num_entries_in_row;
285 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
286 num_entries_in_batch = batch_size;
288 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
291 const size_t num_ent_start = offset;
292 const size_t num_ent_end = num_ent_start + bytes_per_lid;
295 const size_t gids_start = num_ent_end;
296 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
298 const size_t vals_start = gids_end;
300 const size_t shift = batch_no * batch_size;
301 const char*
const num_ent_in = imports.data() + num_ent_start;
302 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
303 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
307 if (
static_cast<size_t>(num_ent_out) != num_entries_in_row)
310#ifndef KOKKOS_ENABLE_SYCL
312 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
314 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
317 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
320 constexpr bool matrix_has_sorted_rows =
true;
323 Kokkos::parallel_for(
324 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
330 distance = j * bytes_per_gid;
332 auto lid_out = local_col_map.getLocalElement(gid_out);
340 distance = j * bytes_per_value;
343 if (combine_mode ==
ADD) {
347 const bool use_atomic_updates = atomic;
348 (void)local_matrix.sumIntoValues(
353 matrix_has_sorted_rows,
356 }
else if (combine_mode ==
REPLACE) {
360 const bool use_atomic_updates =
false;
361 (void)local_matrix.replaceValues(
366 matrix_has_sorted_rows,
372#ifndef KOKKOS_ENABLE_SYCL
374 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
378 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
383 team_member.team_barrier();
389 auto error_code_h = Kokkos::create_mirror_view_and_copy(
390 Kokkos::HostSpace(), error_code
392 return error_code_h();
397struct MaxNumEntTag {};
398struct TotNumEntTag {};
408template<
class LO,
class DT,
class BDT>
409class NumEntriesFunctor {
411 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412 typedef Kokkos::View<const size_t*, DT> offsets_type;
413 typedef Kokkos::View<const char*, BDT> input_buffer_type;
416 typedef size_t value_type;
419 num_packets_per_lid_type num_packets_per_lid;
420 offsets_type offsets;
421 input_buffer_type imports;
424 NumEntriesFunctor (
const num_packets_per_lid_type num_packets_per_lid_in,
425 const offsets_type& offsets_in,
426 const input_buffer_type& imports_in) :
427 num_packets_per_lid (num_packets_per_lid_in),
428 offsets (offsets_in),
432 KOKKOS_INLINE_FUNCTION
void
433 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
435 const size_t num_bytes = num_packets_per_lid(i);
438 const char*
const in_buf = imports.data () + offsets(i);
440 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
442 update = (update < num_ent) ? num_ent : update;
446 KOKKOS_INLINE_FUNCTION
void
447 join (
const MaxNumEntTag,
449 const value_type& src)
const
451 if (dst < src) dst = src;
454 KOKKOS_INLINE_FUNCTION
void
455 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
457 const size_t num_bytes = num_packets_per_lid(i);
460 const char*
const in_buf = imports.data () + offsets(i);
462 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
474template<
class LO,
class DT,
class BDT>
477 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478 const Kokkos::View<const size_t*, DT>& offsets,
479 const Kokkos::View<const char*, BDT>& imports)
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
483 MaxNumEntTag> range_policy;
487 const LO numRowsToUnpack =
488 static_cast<LO
> (num_packets_per_lid.extent (0));
489 size_t max_num_ent = 0;
490 Kokkos::parallel_reduce (
"Max num entries in CRS",
491 range_policy (0, numRowsToUnpack),
492 functor, max_num_ent);
503template<
class LO,
class DT,
class BDT>
506 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507 const Kokkos::View<const size_t*, DT>& offsets,
508 const Kokkos::View<const char*, BDT>& imports)
510 typedef typename DT::execution_space XS;
511 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
512 size_t tot_num_ent = 0;
515 const LO numRowsToUnpack =
516 static_cast<LO
> (num_packets_per_lid.extent (0));
517 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
518 range_policy (0, numRowsToUnpack),
519 functor, tot_num_ent);
524KOKKOS_INLINE_FUNCTION
526unpackRowCount(
const char imports[],
528 const size_t num_bytes)
534 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
535 if (p_num_bytes > num_bytes) {
536 return OrdinalTraits<size_t>::invalid();
538 const char*
const in_buf = imports + offset;
539 (void) PT::unpackValue(num_ent_LO, in_buf);
541 return static_cast<size_t>(num_ent_LO);
548template<
class View1,
class View2>
552 const View1& batches_per_lid,
556 using LO =
typename View2::value_type;
558 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
560 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
562 batch_info(batch, 0) =
static_cast<LO
>(i);
563 batch_info(batch, 1) = batch_no;
567 return batch == batch_info.extent(0);
577template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
580 const LocalMatrix& local_matrix,
582 const Kokkos::View<const char*, BufferDeviceType>& imports,
583 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
587 using ST =
typename LocalMatrix::value_type;
590 using XS =
typename DT::execution_space;
591 const char prefix[] =
592 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593 "unpackAndCombineIntoCrsMatrix: ";
595 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
596 if (num_import_lids == 0) {
603 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
604 std::invalid_argument,
605 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
606 "static graph (i.e., was constructed with the CrsMatrix constructor "
607 "that takes a const CrsGraph pointer).");
609 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
610 std::invalid_argument,
611 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
612 "(i.e., was constructed with the CrsMatrix constructor that takes a "
613 "const CrsGraph pointer).");
616 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
617 std::invalid_argument,
618 prefix <<
"Invalid combine mode; should never get "
619 "here! Please report this bug to the Tpetra developers.");
622 bool bad_num_import_lids =
623 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
624 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
625 std::invalid_argument,
626 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
627 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
631 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
637 const size_t batch_size = std::min(default_batch_size, max_num_ent);
640 size_t num_batches = 0;
641 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
642 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
644 Kokkos::parallel_reduce(
645 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
646 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
648 const size_t num_entries_in_row = unpackRowCount<LO>(
649 imports.data(), offsets(i), num_packets_per_lid(i)
652 (num_entries_in_row <= batch_size) ?
654 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
655 batches += batches_per_lid(i);
659 Kokkos::resize(batch_info, num_batches);
661 Kokkos::HostSpace host_space;
662 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
664 Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
666 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
670 Kokkos::deep_copy(XS(), batch_info, batch_info_h);
677 const bool atomic = XS().concurrency() != 1;
693 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
695 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
697 Kokkos::parallel_for(policy(
static_cast<LO
>(num_batches), Kokkos::AUTO), f);
701 Kokkos::parallel_for(policy(
static_cast<LO
>(num_batches),
static_cast<int>(team_size)), f);
704 auto error_code = f.error();
705 TEUCHOS_TEST_FOR_EXCEPTION(
708 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
712template<
class LocalMatrix,
class BufferDeviceType>
715 const LocalMatrix& local_matrix,
717 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
718 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
719 const size_t num_same_ids)
721 using Kokkos::parallel_reduce;
722 typedef typename LocalMatrix::ordinal_type LO;
723 typedef typename LocalMatrix::device_type
device_type;
724 typedef typename device_type::execution_space XS;
725 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
726 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
727 typedef BufferDeviceType BDT;
733 num_items =
static_cast<LO
>(num_same_ids);
736 parallel_reduce(range_policy(0, num_items),
737 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
738 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
739 -local_matrix.graph.row_map[lid]);
745 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
748 parallel_reduce(range_policy(0, num_items),
749 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
750 const LO lid = permute_from_lids(i);
751 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
752 - local_matrix.graph.row_map[lid]);
759 const size_type np = num_packets_per_lid.extent(0);
760 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
771template<
class LO,
class DT,
class BDT>
776 const Kokkos::View<const char*, BDT>& imports,
777 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
780 using Kokkos::parallel_reduce;
781 typedef typename DT::execution_space XS;
783 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
785 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
786 const size_type N = num_packets_per_lid.extent(0);
789 parallel_reduce (
"Setup row pointers for remotes",
791 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
792 typedef typename std::remove_reference<
decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
793 const size_t num_bytes = num_packets_per_lid(i);
794 const size_t offset = offsets(i);
795 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
796 if (num_ent == InvalidNum) {
799 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
807makeCrsRowPtrFromLengths(
809 const Kokkos::View<size_t*,DT>& new_start_row)
811 using Kokkos::parallel_scan;
812 typedef typename DT::execution_space XS;
813 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
814 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
815 const size_type N = new_start_row.extent(0);
816 parallel_scan(range_policy(0, N),
817 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
818 auto cur_val = tgt_rowptr(i);
820 tgt_rowptr(i) = update;
821 new_start_row(i) = tgt_rowptr(i);
828template<
class LocalMatrix,
class LocalMap>
834 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
837 const LocalMatrix& local_matrix,
838 const LocalMap& local_col_map,
839 const size_t num_same_ids,
842 using Kokkos::parallel_for;
845 typedef typename DT::execution_space XS;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
848 parallel_for(range_policy(0, num_same_ids),
849 KOKKOS_LAMBDA(
const size_t i) {
850 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
852 const LO src_lid =
static_cast<LO
>(i);
853 size_t src_row = local_matrix.graph.row_map(src_lid);
855 const LO tgt_lid =
static_cast<LO
>(i);
856 const size_t tgt_row = tgt_rowptr(tgt_lid);
858 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
859 - local_matrix.graph.row_map(src_lid);
860 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
862 for (
size_t j=local_matrix.graph.row_map(src_lid);
863 j<local_matrix.graph.row_map(src_lid+1); ++j) {
864 LO src_col = local_matrix.graph.entries(j);
865 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
866 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
867 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
873template<
class LocalMatrix,
class LocalMap>
875copyDataFromPermuteIDs(
879 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
884 const LocalMatrix& local_matrix,
885 const LocalMap& local_col_map,
888 using Kokkos::parallel_for;
891 typedef typename DT::execution_space XS;
892 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
893 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
895 const size_type num_permute_to_lids = permute_to_lids.extent(0);
897 parallel_for(range_policy(0, num_permute_to_lids),
898 KOKKOS_LAMBDA(
const size_t i) {
899 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
901 const LO src_lid = permute_from_lids(i);
902 const size_t src_row = local_matrix.graph.row_map(src_lid);
904 const LO tgt_lid = permute_to_lids(i);
905 const size_t tgt_row = tgt_rowptr(tgt_lid);
907 size_t nsr = local_matrix.graph.row_map(src_lid+1)
908 - local_matrix.graph.row_map(src_lid);
909 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
911 for (
size_t j=local_matrix.graph.row_map(src_lid);
912 j<local_matrix.graph.row_map(src_lid+1); ++j) {
913 LO src_col = local_matrix.graph.entries(j);
914 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
915 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
916 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
922template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
924unpackAndCombineIntoCrsArrays2(
928 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
931 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
932 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
936 const size_t bytes_per_value)
939 using Kokkos::subview;
940 using Kokkos::MemoryUnmanaged;
941 using Kokkos::parallel_reduce;
942 using Kokkos::atomic_fetch_add;
943 using Tpetra::Details::PackTraits;
947 typedef typename LocalMatrix::value_type ST;
948 typedef typename DT::execution_space XS;
949 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
950 typedef typename Kokkos::pair<size_type, size_type> slice;
951 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
953 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
954 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
955 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
957 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
960 const size_type num_import_lids = import_lids.size();
963 parallel_reduce (
"Unpack and combine into CRS",
964 range_policy (0, num_import_lids),
965 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
966 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
967 const size_t num_bytes = num_packets_per_lid(i);
968 const size_t offset = offsets(i);
969 if (num_bytes == 0) {
973 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
974 if (num_ent == InvalidNum) {
978 const LO lcl_row = import_lids(i);
979 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
980 const size_t end_row = start_row + num_ent;
982 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
983 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
984 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
987 imports.data(), offset, num_bytes,
988 num_ent, bytes_per_value);
991 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
992 const int pid = pids_out(j);
993 pids_out(j) = (pid != my_pid) ? pid : -1;
1000template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1002unpackAndCombineIntoCrsArrays(
1003 const LocalMatrix & local_matrix,
1004 const LocalMap & local_col_map,
1006 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1007 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1015 const size_t num_same_ids,
1016 const size_t tgt_num_rows,
1017 const size_t tgt_num_nonzeros,
1018 const int my_tgt_pid,
1019 const size_t bytes_per_value)
1022 using Kokkos::subview;
1023 using Kokkos::parallel_for;
1024 using Kokkos::MemoryUnmanaged;
1025 using Tpetra::Details::PackTraits;
1028 typedef typename DT::execution_space XS;
1029 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1030 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1031 typedef BufferDeviceType BDT;
1033 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1035 const size_t N = tgt_num_rows;
1039 const int my_pid = my_tgt_pid;
1042 parallel_for(range_policy(0, N+1),
1043 KOKKOS_LAMBDA(
const size_t i) {
1049 parallel_for(range_policy(0, num_same_ids),
1050 KOKKOS_LAMBDA(
const size_t i) {
1051 const LO tgt_lid =
static_cast<LO
>(i);
1052 const LO src_lid =
static_cast<LO
>(i);
1053 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1054 - local_matrix.graph.row_map(src_lid);
1059 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1060 parallel_for(range_policy(0, num_permute_to_lids),
1061 KOKKOS_LAMBDA(
const size_t i) {
1062 const LO tgt_lid = permute_to_lids(i);
1063 const LO src_lid = permute_from_lids(i);
1064 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1065 - local_matrix.graph.row_map(src_lid);
1070 const size_type num_import_lids = import_lids.extent(0);
1071 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1074#ifdef HAVE_TPETRA_DEBUG
1076 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1077 const bool condition =
1078 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1079 TEUCHOS_TEST_FOR_EXCEPTION
1080 (condition, std::logic_error, prefix
1081 <<
"The final offset in bytes " << nth_offset_h
1082 <<
" != imports.size() = " << imports.extent(0)
1083 <<
". Please report this bug to the Tpetra developers.");
1090 import_lids, imports, num_packets_per_lid, offsets);
1091 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1092 <<
" Error transferring data to target row pointers. "
1093 "Please report this bug to the Tpetra developers.");
1097 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1100 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1103 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1104 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1106 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1107 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1108 local_matrix, local_col_map, my_pid);
1110 if (imports.extent(0) <= 0) {
1114 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1115 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1116 local_matrix, local_col_map, my_pid, bytes_per_value);
1117 TEUCHOS_TEST_FOR_EXCEPTION(
1118 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1119 "should never happen. Please report this bug to the Tpetra developers.");
1165template<
typename ST,
typename LO,
typename GO,
typename Node>
1169 const Teuchos::ArrayView<const char>& imports,
1170 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1171 const Teuchos::ArrayView<const LO>& importLIDs,
1178 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1179 "Node::device_type and LocalMatrix::device_type must be the same.");
1187 auto num_packets_per_lid_d =
1189 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1191 auto import_lids_d =
1193 importLIDs.size(),
true,
"import_lids");
1197 imports.size(),
true,
"imports");
1200 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1212 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1213 import_lids_d, combineMode);
1217template<
typename ST,
typename LO,
typename GO,
typename NT>
1219unpackCrsMatrixAndCombineNew(
1221 Kokkos::DualView<
char*,
1223 Kokkos::DualView<
size_t*,
1225 const Kokkos::DualView<
const LO*,
1233 using device_type =
typename crs_matrix_type::device_type;
1238 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1239 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1240 "must be the same.");
1242 if (numPacketsPerLID.need_sync_device()) {
1243 numPacketsPerLID.sync_device ();
1245 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1247 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1248 auto import_lids_d = importLIDs.view_device ();
1250 if (imports.need_sync_device()) {
1251 imports.sync_device ();
1253 auto imports_d = imports.view_device ();
1256 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1257 typedef decltype (local_col_map) local_map_type;
1260 local_matrix_device_type,
1263 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1264 import_lids_d, combineMode);
1322template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1326 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1327 const Teuchos::ArrayView<const char> &imports,
1328 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1332 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1333 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1335 using Kokkos::MemoryUnmanaged;
1337 typedef typename Node::device_type DT;
1338 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1340 TEUCHOS_TEST_FOR_EXCEPTION
1341 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1342 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1343 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1347 TEUCHOS_TEST_FOR_EXCEPTION
1348 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1349 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1350 TEUCHOS_TEST_FOR_EXCEPTION
1351 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1352 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1353 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1357 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1358 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1360 Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1362 permuteFromLIDs.getRawPtr (),
1363 permuteFromLIDs.size (),
true,
1364 "permute_from_lids");
1366 Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1368 imports.getRawPtr (),
1369 imports.size (),
true,
1372 Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1374 numPacketsPerLID.getRawPtr (),
1375 numPacketsPerLID.size (),
true,
1376 "num_packets_per_lid");
1378 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1379 local_matrix, permute_from_lids_d, imports_d,
1380 num_packets_per_lid_d, numSameIDs);
1398template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1402 const Kokkos::View<LocalOrdinal
const *,
1403 Kokkos::Device<
typename Node::device_type::execution_space,
1404 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1405 void,
void > import_lids_d,
1406 const Kokkos::View<
const char*,
1407 Kokkos::Device<
typename Node::device_type::execution_space,
1408 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1409 void,
void > imports_d,
1410 const Kokkos::View<
const size_t*,
1411 Kokkos::Device<
typename Node::device_type::execution_space,
1412 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1413 void,
void > num_packets_per_lid_d,
1414 const size_t numSameIDs,
1415 const Kokkos::View<LocalOrdinal
const *,
1416 Kokkos::Device<
typename Node::device_type::execution_space,
1417 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1418 void,
void > permute_to_lids_d,
1419 const Kokkos::View<LocalOrdinal
const *,
1420 Kokkos::Device<
typename Node::device_type::execution_space,
1421 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1422 void,
void > permute_from_lids_d,
1423 size_t TargetNumRows,
1424 const int MyTargetPID,
1425 Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1426 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1428 const Teuchos::ArrayView<const int>& SourcePids,
1429 Kokkos::View<int*,typename Node::device_type> &TargetPids)
1435 using Kokkos::deep_copy;
1437 using Teuchos::ArrayView;
1438 using Teuchos::outArg;
1439 using Teuchos::REDUCE_MAX;
1440 using Teuchos::reduceAll;
1442 typedef typename Node::device_type DT;
1445 typedef typename matrix_type::impl_scalar_type ST;
1447 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1448# ifdef HAVE_TPETRA_MMM_TIMINGS
1449 using Teuchos::TimeMonitor;
1450 Teuchos::RCP<TimeMonitor> tm;
1453 using Kokkos::MemoryUnmanaged;
1455 TEUCHOS_TEST_FOR_EXCEPTION
1456 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1457 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1458 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
1462 TEUCHOS_TEST_FOR_EXCEPTION
1463 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1464 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1465 TEUCHOS_TEST_FOR_EXCEPTION
1466 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1467 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1468 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1473# ifdef HAVE_TPETRA_MMM_TIMINGS
1474 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1476 size_t TargetNumNonzeros =
1477 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1478 local_matrix, permute_from_lids_d, imports_d,
1479 num_packets_per_lid_d, numSameIDs);
1480# ifdef HAVE_TPETRA_MMM_TIMINGS
1484# ifdef HAVE_TPETRA_MMM_TIMINGS
1485 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
1487 Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1488 Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1489 Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1490# ifdef HAVE_TPETRA_MMM_TIMINGS
1494 TEUCHOS_TEST_FOR_EXCEPTION(
1495 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1496 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1497 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
1499 if (
static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1500 Kokkos::resize(TargetPids,TargetNumNonzeros);
1505 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1507# ifdef HAVE_TPETRA_MMM_TIMINGS
1508 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1515 SourcePids.size(),
true,
"src_pids");
1517# ifdef HAVE_TPETRA_MMM_TIMINGS
1521 size_t bytes_per_value = 0;
1535 size_t bytes_per_value_l = 0;
1536 if (local_matrix.values.extent(0) > 0) {
1537 const ST& val = local_matrix.values(0);
1540 const ST& val = crs_vals_d(0);
1543 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1544 Teuchos::REDUCE_MAX,
1546 outArg(bytes_per_value));
1549# ifdef HAVE_TPETRA_MMM_TIMINGS
1550 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1552 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1553 local_matrix, local_col_map, import_lids_d, imports_d,
1554 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1555 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1556 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1558# ifdef HAVE_TPETRA_MMM_TIMINGS
1563# ifdef HAVE_TPETRA_MMM_TIMINGS
1564 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
1567 Kokkos::parallel_for(
"setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (
const size_t i) {
1568 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1573template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1577 const Kokkos::View<LocalOrdinal
const *,
1578 Kokkos::Device<
typename Node::device_type::execution_space,
1579 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1580 void,
void > import_lids_d,
1581 const Kokkos::View<
const char*,
1582 Kokkos::Device<
typename Node::device_type::execution_space,
1583 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1584 void,
void > imports_d,
1585 const Kokkos::View<
const size_t*,
1586 Kokkos::Device<
typename Node::device_type::execution_space,
1587 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1588 void,
void > num_packets_per_lid_d,
1589 const size_t numSameIDs,
1590 const Kokkos::View<LocalOrdinal
const *,
1591 Kokkos::Device<
typename Node::device_type::execution_space,
1592 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1593 void,
void > permute_to_lids_d,
1594 const Kokkos::View<LocalOrdinal
const *,
1595 Kokkos::Device<
typename Node::device_type::execution_space,
1596 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1597 void,
void > permute_from_lids_d,
1598 size_t TargetNumRows,
1599 const int MyTargetPID,
1600 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1601 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1602 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1603 const Teuchos::ArrayView<const int>& SourcePids,
1604 Teuchos::Array<int>& TargetPids)
1610 using Kokkos::deep_copy;
1612 using Teuchos::ArrayView;
1613 using Teuchos::outArg;
1614 using Teuchos::REDUCE_MAX;
1615 using Teuchos::reduceAll;
1617 typedef typename Node::device_type DT;
1620 typedef typename matrix_type::impl_scalar_type ST;
1622 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1623# ifdef HAVE_TPETRA_MMM_TIMINGS
1624 using Teuchos::TimeMonitor;
1625 Teuchos::RCP<TimeMonitor> tm;
1628 using Kokkos::MemoryUnmanaged;
1630 TEUCHOS_TEST_FOR_EXCEPTION
1631 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1632 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1633 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
1637 TEUCHOS_TEST_FOR_EXCEPTION
1638 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1639 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1640 TEUCHOS_TEST_FOR_EXCEPTION
1641 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1642 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1643 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1648# ifdef HAVE_TPETRA_MMM_TIMINGS
1649 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1651 size_t TargetNumNonzeros =
1652 UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1653 local_matrix, permute_from_lids_d, imports_d,
1654 num_packets_per_lid_d, numSameIDs);
1655# ifdef HAVE_TPETRA_MMM_TIMINGS
1659# ifdef HAVE_TPETRA_MMM_TIMINGS
1660 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
1662 CRS_rowptr.resize (TargetNumRows+1);
1663 CRS_colind.resize(TargetNumNonzeros);
1664 CRS_vals.resize(TargetNumNonzeros);
1665 Teuchos::ArrayRCP<ST>
const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1666# ifdef HAVE_TPETRA_MMM_TIMINGS
1670 TEUCHOS_TEST_FOR_EXCEPTION(
1671 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1672 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1673 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
1676 if (
static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1677 TargetPids.resize (TargetNumNonzeros);
1679 TargetPids.assign (TargetNumNonzeros, -1);
1682 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1684# ifdef HAVE_TPETRA_MMM_TIMINGS
1685 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1692 CRS_rowptr.size(),
true,
"crs_rowptr");
1696 CRS_colind.size(),
true,
"crs_colidx");
1697#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1698 static_assert (! std::is_same<
1699 typename std::remove_const<
1700 typename std::decay<
1701 decltype (CRS_vals_impl_scalar_type)
1704 std::complex<double> >::value,
1705 "CRS_vals::value_type is std::complex<double>; this should never happen"
1706 ", since std::complex does not work in Kokkos::View objects.");
1711 CRS_vals_impl_scalar_type.size(),
true,
"crs_vals");
1713#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714 static_assert (! std::is_same<
1715 typename decltype (crs_vals_d)::non_const_value_type,
1716 std::complex<double> >::value,
1717 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718 "never happen, since std::complex does not work in Kokkos::View objects.");
1723 SourcePids.size(),
true,
"src_pids");
1727 TargetPids.size(),
true,
"tgt_pids");
1729# ifdef HAVE_TPETRA_MMM_TIMINGS
1733 size_t bytes_per_value = 0;
1747 size_t bytes_per_value_l = 0;
1748 if (local_matrix.values.extent(0) > 0) {
1749 const ST& val = local_matrix.values(0);
1752 const ST& val = crs_vals_d(0);
1755 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1756 Teuchos::REDUCE_MAX,
1758 outArg(bytes_per_value));
1761#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762 static_assert (! std::is_same<
1763 typename decltype (crs_vals_d)::non_const_value_type,
1764 std::complex<double> >::value,
1765 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766 "never happen, since std::complex does not work in Kokkos::View objects.");
1769# ifdef HAVE_TPETRA_MMM_TIMINGS
1770 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1772 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1773 local_matrix, local_col_map, import_lids_d, imports_d,
1774 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1775 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1776 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1778# ifdef HAVE_TPETRA_MMM_TIMINGS
1783# ifdef HAVE_TPETRA_MMM_TIMINGS
1784 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
1786 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1787 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1791 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1792 CRS_colind.getRawPtr(), CRS_colind.size());
1796 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1797 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1801 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1802 TargetPids.getRawPtr(), TargetPids.size());
1812#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1814 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1815 const CrsMatrix<ST, LO, GO, NT>&, \
1816 const Teuchos::ArrayView<const char>&, \
1817 const Teuchos::ArrayView<const size_t>&, \
1818 const Teuchos::ArrayView<const LO>&, \
1822 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1823 const CrsMatrix<ST, LO, GO, NT> &, \
1824 const Teuchos::ArrayView<const LO> &, \
1825 const Teuchos::ArrayView<const char> &, \
1826 const Teuchos::ArrayView<const size_t>&, \
1830 const Teuchos::ArrayView<const LO>&, \
1831 const Teuchos::ArrayView<const LO>&); \
1833 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1834 const CrsMatrix<ST, LO, GO, NT>&, \
1835 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1836 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1837 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1839 const CombineMode); \
1841 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1842 const CrsMatrix<ST, LO, GO, NT> &, \
1843 const Kokkos::View<LO const *, \
1844 Kokkos::Device<typename NT::device_type::execution_space, \
1845 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1847 const Kokkos::View<const char*, \
1848 Kokkos::Device<typename NT::device_type::execution_space, \
1849 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1851 const Kokkos::View<const size_t*, \
1852 Kokkos::Device<typename NT::device_type::execution_space, \
1853 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1856 const Kokkos::View<LO const *, \
1857 Kokkos::Device<typename NT::device_type::execution_space, \
1858 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1860 const Kokkos::View<LO const *, \
1861 Kokkos::Device<typename NT::device_type::execution_space, \
1862 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1866 Kokkos::View<size_t*,typename NT::device_type>&, \
1867 Kokkos::View<GO*,typename NT::device_type>&, \
1868 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1869 const Teuchos::ArrayView<const int>&, \
1870 Kokkos::View<int*,typename NT::device_type>&); \
1872 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1873 const CrsMatrix<ST, LO, GO, NT> &, \
1874 const Kokkos::View<LO const *, \
1875 Kokkos::Device<typename NT::device_type::execution_space, \
1876 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1878 const Kokkos::View<const char*, \
1879 Kokkos::Device<typename NT::device_type::execution_space, \
1880 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1882 const Kokkos::View<const size_t*, \
1883 Kokkos::Device<typename NT::device_type::execution_space, \
1884 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1887 const Kokkos::View<LO const *, \
1888 Kokkos::Device<typename NT::device_type::execution_space, \
1889 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1891 const Kokkos::View<LO const *, \
1892 Kokkos::Device<typename NT::device_type::execution_space, \
1893 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1897 Teuchos::ArrayRCP<size_t>&, \
1898 Teuchos::ArrayRCP<GO>&, \
1899 Teuchos::ArrayRCP<ST>&, \
1900 const Teuchos::ArrayView<const int>&, \
1901 Teuchos::Array<int>&);
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
int setupRowPointersForRemotes(const typename PackTraits< size_t >::output_array_type &tgt_rowptr, const typename PackTraits< LO >::input_array_type &import_lids, const Kokkos::View< const char *, BDT > &imports, const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const typename PackTraits< size_t >::input_array_type &offsets)
Setup row pointers for remotes.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
KOKKOS_FUNCTION int unpackRow(const typename PackTraits< GO >::output_array_type &gids_out, const typename PackTraits< int >::output_array_type &pids_out, const typename PackTraits< ST >::output_array_type &vals_out, const char imports[], const size_t offset, const size_t, const size_t num_ent, const size_t bytes_per_value)
Unpack a single row of a CrsMatrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Base class for distributed Tpetra objects that support data redistribution.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Nonmember function that computes a residual Computes R = B - A * X.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static const bool compileTimeSize
Whether the number of bytes required to pack one instance of value_type is fixed at compile time.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Unpacks and combines a single row of the CrsMatrix.
int error() const
Host function for getting the error.