Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1// @HEADER
2// *****************************************************************************
3// Tpetra: Templated Linear Algebra Services Package
4//
5// Copyright 2008 NTESS and the Tpetra contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
11#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
12
13#include "TpetraCore_config.h"
14#include "Teuchos_Array.hpp"
15#include "Teuchos_ArrayView.hpp"
24#include "Kokkos_Core.hpp"
25#include <memory>
26#include <string>
27
46
47namespace Tpetra {
48
49//
50// Users must never rely on anything in the Details namespace.
51//
52namespace Details {
53
54namespace UnpackAndCombineCrsGraphImpl {
55
65template<class Packet, class GO, class Device, class BufferDevice>
66KOKKOS_FUNCTION int
67unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
68 const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
69 const Kokkos::View<const Packet*,BufferDevice>& imports,
70 const size_t offset,
71 const size_t num_ent)
72{
73 using size_type = typename Kokkos::View<GO*,Device>::size_type;
74
75 if (num_ent == 0) {
76 // Empty rows always take zero bytes, to ensure sparsity.
77 return 0;
78 }
79
80 // Unpack GIDs
81 for (size_type k=0; k<num_ent; k++)
82 gids_out(k) = imports(offset+k);
83
84 // Unpack PIDs
85 if (pids_out.size() > 0) {
86 for (size_type k=0; k<num_ent; k++) {
87 pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
88 }
89 }
90
91 return 0;
92}
93
104template<class LocalOrdinal,
105 class Packet,
106 class RowView,
107 class IndicesView,
108 class BufferDevice>
109class UnpackAndCombineFunctor {
110
111 using LO = LocalOrdinal;
112 using GO = typename IndicesView::value_type;
113 using packet_type = Packet;
114 using row_ptrs_type = RowView;
115 using indices_type = IndicesView;
116 using buffer_device_type = BufferDevice;
117
118 using device_type = typename IndicesView::device_type;
119 using execution_space = typename device_type::execution_space;
120
121 using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
122 using offsets_type = Kokkos::View<const size_t*, device_type>;
123 using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
124 using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
125
126 using gids_scratch_type = Kokkos::View<GO*, device_type>;
127 using pids_scratch_type = Kokkos::View<int*,device_type>;
128
129 row_ptrs_type row_ptrs_beg;
130 row_ptrs_type row_ptrs_end;
131 indices_type indices;
132 input_buffer_type imports;
133 num_packets_per_lid_type num_packets_per_lid;
134 import_lids_type import_lids;
135 offsets_type offsets;
136 size_t max_num_ent;
137 bool unpack_pids;
138 Kokkos::Experimental::UniqueToken<execution_space,
139 Kokkos::Experimental::UniqueTokenScope::Global> tokens;
140 gids_scratch_type gids_scratch;
141 pids_scratch_type pids_scratch;
142
143 public:
144 using value_type = Kokkos::pair<int, LO>;
145
146 UnpackAndCombineFunctor(
147 const row_ptrs_type& row_ptrs_beg_in,
148 const row_ptrs_type& row_ptrs_end_in,
149 const indices_type& indices_in,
150 const input_buffer_type& imports_in,
151 const num_packets_per_lid_type& num_packets_per_lid_in,
152 const import_lids_type& import_lids_in,
153 const offsets_type& offsets_in,
154 const size_t max_num_ent_in,
155 const bool unpack_pids_in) :
156 row_ptrs_beg(row_ptrs_beg_in),
157 row_ptrs_end(row_ptrs_end_in),
158 indices(indices_in),
159 imports(imports_in),
160 num_packets_per_lid(num_packets_per_lid_in),
161 import_lids(import_lids_in),
162 offsets(offsets_in),
163 max_num_ent(max_num_ent_in),
164 unpack_pids(unpack_pids_in),
165 tokens(execution_space()),
166 gids_scratch("gids_scratch", tokens.size() * max_num_ent),
167 pids_scratch("pids_scratch", tokens.size() * max_num_ent)
168 {}
169
170 KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
171 {
172 using Tpetra::Details::OrdinalTraits;
173 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
174 }
175
176 KOKKOS_INLINE_FUNCTION void
177 join(value_type& dst, const value_type& src) const
178 {
179 // `dst` should reflect the first (least) bad index and
180 // all other associated error codes and data. Thus, we need only
181 // check if the `src` object shows an error and if its associated
182 // bad index is less than `dst`'s bad index.
183 using Tpetra::Details::OrdinalTraits;
184 if (src.second != OrdinalTraits<LO>::invalid()) {
185 // An error in the src; check if
186 // 1. `dst` shows errors
187 // 2. If `dst` does show errors, if src's bad index is less than
188 // *this' bad index
189 if (dst.second == OrdinalTraits<LO>::invalid() ||
190 src.second < dst.second) {
191 dst = src;
192 }
193 }
194 }
195
196 KOKKOS_INLINE_FUNCTION
197 void operator()(const LO i, value_type& dst) const
198 {
199 using Kokkos::View;
200 using Kokkos::subview;
201 using Kokkos::MemoryUnmanaged;
202 using size_type = typename execution_space::size_type;
203 using slice = typename Kokkos::pair<size_type, size_type>;
204
205 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
206 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
207
208 const size_t num_packets_this_lid = num_packets_per_lid(i);
209 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
210 : num_packets_this_lid;
211 if (unpack_pids && num_packets_this_lid%2 != 0) {
212 // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
213 // should never
214 dst = Kokkos::make_pair(1, i);
215 return;
216 }
217
218 // Only unpack data if there is a nonzero number to unpack
219 if (num_ent == 0) {
220 return;
221 }
222
223 // there is actually something in the row
224 const size_t buf_size = imports.size();
225 const size_t offset = offsets(i);
226
227 if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
228 dst = Kokkos::make_pair(2, i); // out of bounds
229 return;
230 }
231
232 // Get subviews in to the scratch arrays. The token returned from acquire
233 // is an integer in [0, tokens.size()). It is used to grab a unique (to
234 // this thread) subview of the scratch arrays.
235 const size_type token = tokens.acquire();
236 const size_t a = static_cast<size_t>(token) * max_num_ent;
237 const size_t b = a + num_ent;
238 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
239 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
240
241 const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
242
243 if (err != 0) {
244 dst = Kokkos::make_pair(3, i);
245 tokens.release(token);
246 return;
247 }
248
249 auto import_lid = import_lids(i);
250 for (size_t k = 0; k < num_ent; ++k) {
251 indices(row_ptrs_end(import_lid)) = gids_out(k);
252 // this is OK; don't need atomic, since LIDs to pack don't have repeats.
253 row_ptrs_end(import_lid) += 1;
254 }
255
256 tokens.release(token);
257 }
258
259};
260
267template<class LocalOrdinal, class GlobalOrdinal, class Node,
268 class RowView, class IndicesView, class BufferDevice>
269void
271(const RowView& row_ptrs_beg,
272 const RowView& row_ptrs_end,
273 IndicesView& indices,
274 const Kokkos::View<const GlobalOrdinal*, BufferDevice,
275 Kokkos::MemoryUnmanaged>& imports,
276 const Kokkos::View<const size_t*, BufferDevice,
277 Kokkos::MemoryUnmanaged>& num_packets_per_lid,
278 const Kokkos::View<const LocalOrdinal*, BufferDevice,
279 Kokkos::MemoryUnmanaged>& import_lids,
280 const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
281 Node>::padding_type& padding,
282 const bool unpack_pids,
283 const int myRank,
284 const bool verbose)
285{
286 using LO = LocalOrdinal;
287 using GO = GlobalOrdinal;
288 using device_type = typename Node::device_type;
289 using execution_space = typename BufferDevice::execution_space;
290 using range_policy =
291 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
292 using unpack_functor_type =
294
295 const char prefix[] =
296 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
297
298 const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
299 if (num_import_lids == 0) {
300 // Nothing to unpack
301 return;
302 }
303
304 // Resize row pointers and indices to accommodate incoming data
305 padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
306 myRank, verbose);
307
308 // Get the offsets
309 Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
310 computeOffsetsFromCounts(offsets, num_packets_per_lid);
311
312 // Determine the maximum number of entries in any row in the graph. The
313 // maximum number of entries is needed to allocate unpack buffers on the
314 // device.
315 size_t max_num_ent;
316 Kokkos::parallel_reduce
317 ("MaxReduce",
318 range_policy (0, LO (num_packets_per_lid.size ())),
319 KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
320 const size_t num_packets_this_lid = num_packets_per_lid(i);
321 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
322 num_packets_this_lid;
323 if (num_ent > running_max_num_ent) {
324 running_max_num_ent = num_ent;
325 }
326 }, Kokkos::Max<size_t> (max_num_ent));
327
328 // Now do the actual unpack!
329 unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
330 num_packets_per_lid, import_lids, offsets,
331 max_num_ent, unpack_pids);
332
333 typename unpack_functor_type::value_type x;
334 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
335 auto x_h = x.to_std_pair();
336 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
337 prefix << "UnpackAndCombineFunctor reported error code "
338 << x_h.first << " for the first bad row " << x_h.second);
339}
340
341template<class Packet, class LocalGraph, class BufferDevice>
342size_t
344 const LocalGraph& local_graph,
345 const Kokkos::View<const typename LocalGraph::data_type*,
346 typename LocalGraph::device_type,
347 Kokkos::MemoryUnmanaged> permute_from_lids,
348 const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
349 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
350 const size_t num_same_ids)
351{
352 using Kokkos::parallel_reduce;
353 using local_graph_type = LocalGraph;
354 using LO = typename local_graph_type::data_type;
355 using device_type = typename local_graph_type::device_type;
356 using execution_space = typename device_type::execution_space;
357 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
358
359 size_t count = 0;
360 LO num_items;
361
362 // Number of graph entries to unpack (returned by this function).
363 num_items = static_cast<LO>(num_same_ids);
364 if (num_items) {
365 size_t kcnt = 0;
366 parallel_reduce(
367 range_policy(0, num_items),
368 KOKKOS_LAMBDA(const LO lid, size_t& update) {
369 update += static_cast<size_t>(local_graph.row_map[lid+1]
370 -local_graph.row_map[lid]);
371 }, kcnt);
372 count += kcnt;
373 }
374
375 // Count entries copied directly from the source graph with permuting.
376 num_items = static_cast<LO>(permute_from_lids.extent(0));
377 if (num_items) {
378 size_t kcnt = 0;
379 parallel_reduce(
380 range_policy(0, num_items),
381 KOKKOS_LAMBDA(const LO i, size_t& update) {
382 const LO lid = permute_from_lids(i);
383 update += static_cast<size_t>(local_graph.row_map[lid+1]
384 - local_graph.row_map[lid]);
385 }, kcnt);
386 count += kcnt;
387 }
388
389 {
390 // Count entries received from other MPI processes.
391 size_t tot_num_ent = 0;
392 parallel_reduce("SumReduce",
393 range_policy(0,num_packets_per_lid.size()),
394 KOKKOS_LAMBDA(const int& i, size_t& lsum) {
395 lsum += num_packets_per_lid(i) / 2;
396 }, Kokkos::Sum<size_t>(tot_num_ent));
397 count += tot_num_ent;
398 }
399
400 return count;
401}
402
404template<class Packet, class LO, class Device, class BufferDevice>
405void
407 const Kokkos::View<size_t*, Device>& tgt_rowptr,
408 const Kokkos::View<const LO*, BufferDevice>& import_lids,
409 const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
410 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
411{
412 using Kokkos::parallel_reduce;
413 using device_type = Device;
414 using execution_space = typename device_type::execution_space;
415 using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
416 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
417
418 const size_type N = num_packets_per_lid.extent(0);
419 parallel_for("Setup row pointers for remotes",
420 range_policy(0, N),
421 KOKKOS_LAMBDA(const size_t i){
422 using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
423 const size_t num_packets_this_lid = num_packets_per_lid(i);
424 const size_t num_ent = num_packets_this_lid / 2;
425 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
426 });
427}
428
429// Convert array of row lengths to a CRS pointer array
430template<class Device>
431void
432makeCrsRowPtrFromLengths(
433 const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
434 const Kokkos::View<size_t*,Device>& new_start_row)
435{
436 using Kokkos::parallel_scan;
437 using device_type = Device;
438 using execution_space = typename device_type::execution_space;
439 using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
440 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
441 const size_type N = new_start_row.extent(0);
442 parallel_scan(
443 range_policy(0, N),
444 KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
445 auto cur_val = tgt_rowptr(i);
446 if (final) {
447 tgt_rowptr(i) = update;
448 new_start_row(i) = tgt_rowptr(i);
449 }
450 update += cur_val;
451 }
452 );
453}
454
455template<class LocalGraph, class LocalMap>
456void
457copyDataFromSameIDs(
458 const Kokkos::View<typename LocalMap::global_ordinal_type*,
459 typename LocalMap::device_type>& tgt_colind,
460 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
461 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
462 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
463 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
464 const LocalGraph& local_graph,
465 const LocalMap& local_col_map,
466 const size_t num_same_ids,
467 const int my_pid)
468{
469 using Kokkos::parallel_for;
470 using device_type = typename LocalMap::device_type;
471 using LO = typename LocalMap::local_ordinal_type;
472 using execution_space = typename device_type::execution_space;
473 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
474
475 parallel_for(
476 range_policy(0, num_same_ids),
477 KOKKOS_LAMBDA(const size_t i) {
478 using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
479
480 const LO src_lid = static_cast<LO>(i);
481 size_t src_row = local_graph.row_map(src_lid);
482
483 const LO tgt_lid = static_cast<LO>(i);
484 const size_t tgt_row = tgt_rowptr(tgt_lid);
485
486 const size_t nsr = local_graph.row_map(src_lid+1)
487 - local_graph.row_map(src_lid);
488 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
489
490 for (size_t j=local_graph.row_map(src_lid);
491 j<local_graph.row_map(src_lid+1); ++j) {
492 LO src_col = local_graph.entries(j);
493 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
494 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
495 }
496 }
497 );
498}
499
500template<class LocalGraph, class LocalMap, class BufferDevice>
501void
502copyDataFromPermuteIDs(
503 const Kokkos::View<typename LocalMap::global_ordinal_type*,
504 typename LocalMap::device_type>& tgt_colind,
505 const Kokkos::View<int*,
506 typename LocalMap::device_type>& tgt_pids,
507 const Kokkos::View<size_t*,
508 typename LocalMap::device_type>& new_start_row,
509 const Kokkos::View<size_t*,
510 typename LocalMap::device_type>& tgt_rowptr,
511 const Kokkos::View<const int*,
512 typename LocalMap::device_type>& src_pids,
513 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
514 BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
515 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
516 BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
517 const LocalGraph& local_graph,
518 const LocalMap& local_col_map,
519 const int my_pid)
520{
521 using Kokkos::parallel_for;
522 using device_type = typename LocalMap::device_type;
523 using LO = typename LocalMap::local_ordinal_type;
524 using execution_space = typename device_type::execution_space;
525 using size_type = typename Kokkos::View<LO*,device_type>::size_type;
526 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
527
528 const size_type num_permute_to_lids = permute_to_lids.extent(0);
529
530 parallel_for(
531 range_policy(0, num_permute_to_lids),
532 KOKKOS_LAMBDA(const size_t i) {
533 using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
534
535 const LO src_lid = permute_from_lids(i);
536 const size_t src_row = local_graph.row_map(src_lid);
537
538 const LO tgt_lid = permute_to_lids(i);
539 const size_t tgt_row = tgt_rowptr(tgt_lid);
540
541 size_t nsr = local_graph.row_map(src_lid+1)
542 - local_graph.row_map(src_lid);
543 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
544
545 for (size_t j=local_graph.row_map(src_lid);
546 j<local_graph.row_map(src_lid+1); ++j) {
547 LO src_col = local_graph.entries(j);
548 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
549 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
550 }
551 }
552 );
553}
554
555template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
556void
557unpackAndCombineIntoCrsArrays2(
558 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
559 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
560 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
561 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
562 const Kokkos::View<
563 const typename LocalMap::local_ordinal_type*,
564 BufferDevice,
565 Kokkos::MemoryUnmanaged>& import_lids,
566 const Kokkos::View<const Packet*, BufferDevice>& imports,
567 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
568 const LocalGraph& /* local_graph */,
569 const LocalMap /*& local_col_map*/,
570 const int my_pid)
571{
572 using Kokkos::View;
573 using Kokkos::subview;
574 using Kokkos::MemoryUnmanaged;
575 using Kokkos::parallel_reduce;
576 using Kokkos::atomic_fetch_add;
577
578 using device_type = typename LocalMap::device_type;
579 using LO = typename LocalMap::local_ordinal_type;
580 using GO = typename LocalMap::global_ordinal_type;
581 using execution_space = typename device_type::execution_space;
582 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
583 using slice = typename Kokkos::pair<size_type, size_type>;
584 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
585
586 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
587 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
588
589 const size_type num_import_lids = import_lids.size();
590 const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
591
592 // RemoteIDs: Loop structure following UnpackAndCombine
593 int gbl_err_count;
594 parallel_reduce("Unpack and combine into CRS",
595 range_policy(0, num_import_lids),
596 KOKKOS_LAMBDA(const size_t i, int& err) {
597 using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
598 const size_t num_packets_this_lid = num_packets_per_lid(i);
599 const size_t num_ent = num_packets_this_lid / 2;
600 const size_t offset = offsets(i);
601 const LO lcl_row = import_lids(i);
602 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
603 const size_t end_row = start_row + num_ent;
604
605 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
606 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
607
608 err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
609
610 // Correct target PIDs.
611 for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
612 const int pid = pids_out(j);
613 pids_out(j) = (pid != my_pid) ? pid : -1;
614 }
615 }, gbl_err_count);
616
617 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
618 std::invalid_argument, prefix <<
619 "Attempting to unpack PIDs, but num_ent is not even; this should never "
620 "happen! Please report this bug to the Tpetra developers.");
621
622 return;
623}
624
625template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
626void
627unpackAndCombineIntoCrsArrays(
628 const LocalGraph & local_graph,
629 const LocalMap & local_col_map,
630 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
631 BufferDevice,
632 Kokkos::MemoryUnmanaged>& import_lids,
633 const Kokkos::View<const Packet*, BufferDevice>& imports,
634 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
635 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
636 BufferDevice,
637 Kokkos::MemoryUnmanaged>& permute_to_lids,
638 const Kokkos::View<const typename LocalMap::local_ordinal_type*,
639 BufferDevice,
640 Kokkos::MemoryUnmanaged>& permute_from_lids,
641 const Kokkos::View<size_t*,
642 typename LocalMap::device_type,
643 Kokkos::MemoryUnmanaged>& tgt_rowptr,
644 const Kokkos::View<typename LocalMap::global_ordinal_type*,
645 typename LocalMap::device_type,
646 Kokkos::MemoryUnmanaged>& tgt_colind,
647 const Kokkos::View<const int*,
648 typename LocalMap::device_type,
649 Kokkos::MemoryUnmanaged>& src_pids,
650 const Kokkos::View<int*,
651 typename LocalMap::device_type,
652 Kokkos::MemoryUnmanaged>& tgt_pids,
653 const size_t num_same_ids,
654 const size_t tgt_num_rows,
655 const size_t tgt_num_nonzeros,
656 const int my_tgt_pid)
657{
658 using Kokkos::View;
659 using Kokkos::subview;
660 using Kokkos::parallel_for;
661 using Kokkos::MemoryUnmanaged;
662 using packet_type = Packet;
663 using local_map_type = LocalMap;
664 using local_graph_type = LocalGraph;
665 using buffer_device_type = BufferDevice;
666 using device_type = typename LocalMap::device_type;
667 using LO = typename LocalMap::local_ordinal_type;
668 using execution_space = typename device_type::execution_space;
669 using size_type = typename Kokkos::View<LO*, device_type>::size_type;
670 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
671
672 const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
673
674 const size_t N = tgt_num_rows;
675 const size_t mynnz = tgt_num_nonzeros;
676
677 // In the case of reduced communicators, the sourceGraph won't have
678 // the right "my_pid", so thus we have to supply it.
679 const int my_pid = my_tgt_pid;
680
681 // FIXME (mfh 24 Jun 2019)
682 //
683 // 1. Only zero the entries of tgt_rowptr that actually need it.
684 // 2. Consider merging these three kernels into one.
685
686 // Zero the rowptr
687 parallel_for(
688 range_policy(0, N+1),
689 KOKKOS_LAMBDA(const size_t i) {
690 tgt_rowptr(i) = 0;
691 }
692 );
693
694 // same IDs: Always first, always in the same place
695 parallel_for(
696 range_policy(0, num_same_ids),
697 KOKKOS_LAMBDA(const size_t i) {
698 const LO tgt_lid = static_cast<LO>(i);
699 const LO src_lid = static_cast<LO>(i);
700 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
701 - local_graph.row_map(src_lid);
702 }
703 );
704
705 // Permute IDs: Still local, but reordered
706 const size_type num_permute_to_lids = permute_to_lids.extent(0);
707 parallel_for(
708 range_policy(0, num_permute_to_lids),
709 KOKKOS_LAMBDA(const size_t i) {
710 const LO tgt_lid = permute_to_lids(i);
711 const LO src_lid = permute_from_lids(i);
712 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
713 - local_graph.row_map(src_lid);
714 }
715 );
716
717 // Get the offsets from the number of packets per LID
718 const size_type num_import_lids = import_lids.extent(0);
719 View<size_t*, device_type> offsets("offsets", num_import_lids+1);
720 computeOffsetsFromCounts(offsets, num_packets_per_lid);
721
722#ifdef HAVE_TPETRA_DEBUG
723 {
724 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
725 const bool condition =
726 nth_offset_h != static_cast<size_t>(imports.extent(0));
727 TEUCHOS_TEST_FOR_EXCEPTION
728 (condition, std::logic_error, prefix
729 << "The final offset in bytes " << nth_offset_h
730 << " != imports.size() = " << imports.extent(0)
731 << ". Please report this bug to the Tpetra developers.");
732 }
733#endif // HAVE_TPETRA_DEBUG
734
735 // Setup row pointers for remotes
737 tgt_rowptr, import_lids, imports, num_packets_per_lid);
738
739 // If multiple processes contribute to the same row, we may need to
740 // update row offsets. This tracks that.
741 View<size_t*, device_type> new_start_row("new_start_row", N+1);
742
743 // Turn row length into a real CRS row pointer
744 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
745 {
746 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
747 bool condition = nth_tgt_rowptr_h != mynnz;
748 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
749 prefix << "CRS_rowptr[last] = " <<
750 nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
751 }
752
753 // SameIDs: Copy the data over
754 copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
755 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
756
757 copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
758 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
759 local_graph, local_col_map, my_pid);
760
761 if (imports.extent(0) <= 0) {
762 return;
763 }
764
765 unpackAndCombineIntoCrsArrays2<
766 packet_type,local_graph_type,local_map_type,buffer_device_type>(
767 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
768 num_packets_per_lid, local_graph, local_col_map, my_pid);
769
770 return;
771}
772
773} // namespace UnpackAndCombineCrsGraphImpl
774
822template<class LocalOrdinal, class GlobalOrdinal, class Node>
823size_t
826 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
827 const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
828 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
829 size_t /* constantNumPackets */,
830 CombineMode /* combineMode */,
831 size_t numSameIDs,
832 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
833 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
834{
835 using Kokkos::MemoryUnmanaged;
836 using Kokkos::View;
837 using device_type = typename Node::device_type;
841 const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
842
843 TEUCHOS_TEST_FOR_EXCEPTION
844 (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
845 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
846 "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
847 // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
848 // process, then the graph is neither locally nor globally indexed.
849 const bool locallyIndexed = sourceGraph.isLocallyIndexed();
850 TEUCHOS_TEST_FOR_EXCEPTION
851 (! locallyIndexed, std::invalid_argument, prefix << "The input "
852 "CrsGraph 'sourceGraph' must be locally indexed.");
853 TEUCHOS_TEST_FOR_EXCEPTION
854 (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
855 prefix << "importLIDs.size() = " << importLIDs.size() << " != "
856 "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
857
858 auto local_graph = sourceGraph.getLocalGraphDevice();
859 auto permute_from_lids_d =
861 permuteFromLIDs.getRawPtr(),
862 permuteFromLIDs.size(), true,
863 "permute_from_lids");
864 auto imports_d =
866 imports.getRawPtr(),
867 imports.size(), true,
868 "imports");
869 auto num_packets_per_lid_d =
871 numPacketsPerLID.getRawPtr(),
872 numPacketsPerLID.size(), true,
873 "num_packets_per_lid");
874
875 return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
877 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
878}
879
893template<class LocalOrdinal, class GlobalOrdinal, class Node>
894void
897 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
898 const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
899 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
900 const size_t /* constantNumPackets */,
901 const CombineMode /* combineMode */,
902 const size_t numSameIDs,
903 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
904 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
905 size_t TargetNumRows,
906 size_t TargetNumNonzeros,
907 const int MyTargetPID,
908 const Teuchos::ArrayView<size_t>& CRS_rowptr,
909 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
910 const Teuchos::ArrayView<const int>& SourcePids,
911 Teuchos::Array<int>& TargetPids)
912{
913 using Kokkos::View;
914 using Kokkos::deep_copy;
915 using Teuchos::outArg;
916 using Teuchos::REDUCE_MAX;
917 using Teuchos::reduceAll;
918 using LO = LocalOrdinal;
919 using GO = GlobalOrdinal;
921 using packet_type = typename crs_graph_type::packet_type;
922 using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
923 using buffer_device_type = typename crs_graph_type::buffer_device_type;
924 using device_type = typename Node::device_type;
925 using size_type = typename Teuchos::ArrayView<const LO>::size_type;
926
927 const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
928
929 TEUCHOS_TEST_FOR_EXCEPTION(
930 TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
931 std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
932 CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
933
934 TEUCHOS_TEST_FOR_EXCEPTION(
935 permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
936 prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
937 << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
938 const size_type numImportLIDs = importLIDs.size();
939
940 TEUCHOS_TEST_FOR_EXCEPTION(
941 numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
942 prefix << "importLIDs.size() = " << numImportLIDs << " != "
943 "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
944
945 // Preseed TargetPids with -1 for local
946 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
947 TargetPids.resize(TargetNumNonzeros);
948 }
949 TargetPids.assign(TargetNumNonzeros, -1);
950
951 // Grab pointers for sourceGraph
952 auto local_graph = sourceGraph.getLocalGraphDevice();
953 auto local_col_map = sourceGraph.getColMap()->getLocalMap();
954
955 // Convert input arrays to Kokkos::View
956 device_type outputDevice;
957 buffer_device_type bufferOutputDevice;
958
959 Kokkos::View<const LO*, buffer_device_type> import_lids_d =
961 (bufferOutputDevice, importLIDs.getRawPtr(),
962 importLIDs.size(), true, "import_lids");
963
964 Kokkos::View<const packet_type*, buffer_device_type> imports_d =
966 (bufferOutputDevice, imports.getRawPtr(),
967 imports.size(), true, "imports");
968
969 Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
971 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
972 true, "num_packets_per_lid");
973
974 Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
976 permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
977 true, "permute_to_lids");
978
979 Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
981 permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
982 true, "permute_from_lids");
983
984 Kokkos::View<size_t*, device_type> crs_rowptr_d =
986 CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
987 true, "crs_rowptr");
988
989 Kokkos::View<GO*, device_type> crs_colind_d =
991 CRS_colind.getRawPtr(), CRS_colind.size(),
992 true, "crs_colidx");
993
994 Kokkos::View<const int*, device_type> src_pids_d =
996 SourcePids.getRawPtr(), SourcePids.size(),
997 true, "src_pids");
998
999 Kokkos::View<int*, device_type> tgt_pids_d =
1001 TargetPids.getRawPtr(), TargetPids.size(),
1002 true, "tgt_pids");
1003
1004 using local_map_type = decltype(local_col_map);
1005 UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
1006 packet_type,local_graph_device_type,local_map_type,buffer_device_type>(
1007 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1008 permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1009 tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1010
1011 // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1012
1013 // Copy outputs back to host
1014 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1015 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1016 deep_copy(crs_rowptr_h, crs_rowptr_d);
1017
1018 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1019 CRS_colind.getRawPtr(), CRS_colind.size());
1020 deep_copy(crs_colind_h, crs_colind_d);
1021
1022 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1023 TargetPids.getRawPtr(), TargetPids.size());
1024 deep_copy(tgt_pids_h, tgt_pids_d);
1025
1026}
1027
1028} // namespace Details
1029} // namespace Tpetra
1030
// Explicit template instantiation macro.  For one (LO, GO, NT) combination
// it expands to explicit instantiations of
//   - Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>, and
//   - Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>.
// NOTE(review): the names are written relative to `Details::` and use
// unqualified `CrsGraph` / `CombineMode`, so this presumably has to be
// expanded inside namespace Tpetra -- confirm against the instantiation site.
1031#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1032 template void \
1033 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1034 const CrsGraph<LO, GO, NT> &, \
1035 const Teuchos::ArrayView<const LO>&, \
1036 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1037 const Teuchos::ArrayView<const size_t>&, \
1038 const size_t, \
1039 const CombineMode, \
1040 const size_t, \
1041 const Teuchos::ArrayView<const LO>&, \
1042 const Teuchos::ArrayView<const LO>&, \
1043 size_t, \
1044 size_t, \
1045 const int, \
1046 const Teuchos::ArrayView<size_t>&, \
1047 const Teuchos::ArrayView<GO>&, \
1048 const Teuchos::ArrayView<const int>&, \
1049 Teuchos::Array<int>&); \
1050 template size_t \
1051 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1052 const CrsGraph<LO, GO, NT> &, \
1053 const Teuchos::ArrayView<const LO> &, \
1054 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1055 const Teuchos::ArrayView<const size_t>&, \
1056 size_t, \
1057 CombineMode, \
1058 size_t, \
1059 const Teuchos::ArrayView<const LO>&, \
1060 const Teuchos::ArrayView<const LO>&);
1061
1062#endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsGraph class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void unpackAndCombine(const RowView &row_ptrs_beg, const RowView &row_ptrs_end, IndicesView &indices, const Kokkos::View< const GlobalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &imports, const Kokkos::View< const size_t *, BufferDevice, Kokkos::MemoryUnmanaged > &num_packets_per_lid, const Kokkos::View< const LocalOrdinal *, BufferDevice, Kokkos::MemoryUnmanaged > &import_lids, const typename CrsGraph< LocalOrdinal, GlobalOrdinal, Node >::padding_type &padding, const bool unpack_pids, const int myRank, const bool verbose)
Perform the unpack operation for the graph.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
void setupRowPointersForRemotes(const Kokkos::View< size_t *, Device > &tgt_rowptr, const Kokkos::View< const LO *, BufferDevice > &import_lids, const Kokkos::View< const Packet *, BufferDevice > &, const Kokkos::View< const size_t *, BufferDevice > &num_packets_per_lid)
Setup row pointers for remotes.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
local_graph_device_type getLocalGraphDevice() const
Get the local graph.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Nonmember function that computes a residual. Computes R = B - A * X.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries....
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.