Ifpack2 Templated Preconditioning Package Version 1.0
Loading...
Searching...
No Matches
Ifpack2_BlockTriDiContainer_impl.hpp
1// @HEADER
2// *****************************************************************************
3// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4//
5// Copyright 2009 NTESS and the Ifpack2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
11#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
12
13//#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
14//#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
15
16#include <Teuchos_Details_MpiTypeTraits.hpp>
17
18#include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
19#include <Tpetra_Distributor.hpp>
20#include <Tpetra_BlockMultiVector.hpp>
21
22#include <Kokkos_ArithTraits.hpp>
23#include <KokkosBatched_Util.hpp>
24#include <KokkosBatched_Vector.hpp>
25#include <KokkosBatched_Copy_Decl.hpp>
26#include <KokkosBatched_Copy_Impl.hpp>
27#include <KokkosBatched_AddRadial_Decl.hpp>
28#include <KokkosBatched_AddRadial_Impl.hpp>
29#include <KokkosBatched_SetIdentity_Decl.hpp>
30#include <KokkosBatched_SetIdentity_Impl.hpp>
31#include <KokkosBatched_Gemm_Decl.hpp>
32#include <KokkosBatched_Gemm_Serial_Impl.hpp>
33#include <KokkosBatched_Gemm_Team_Impl.hpp>
34#include <KokkosBatched_Gemv_Decl.hpp>
35#include <KokkosBatched_Gemv_Team_Impl.hpp>
36#include <KokkosBatched_Trsm_Decl.hpp>
37#include <KokkosBatched_Trsm_Serial_Impl.hpp>
38#include <KokkosBatched_Trsm_Team_Impl.hpp>
39#include <KokkosBatched_Trsv_Decl.hpp>
40#include <KokkosBatched_Trsv_Serial_Impl.hpp>
41#include <KokkosBatched_Trsv_Team_Impl.hpp>
42#include <KokkosBatched_LU_Decl.hpp>
43#include <KokkosBatched_LU_Serial_Impl.hpp>
44#include <KokkosBatched_LU_Team_Impl.hpp>
45
46#include <KokkosBlas1_nrm1.hpp>
47#include <KokkosBlas1_nrm2.hpp>
48
49#include <memory>
50
51#include "Ifpack2_BlockHelper.hpp"
52#include "Ifpack2_BlockComputeResidualVector.hpp"
53
54//#include <KokkosBlas2_gemv.hpp>
55
56// need to interface this into cmake variable (or only use this flag when it is necessary)
57//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
58//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
59#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
60#include "cuda_profiler_api.h"
61#endif
62
63// I am not 100% sure about the mpi 3 on cuda
64#if MPI_VERSION >= 3
65#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
66#endif
67
68// ::: Experiments :::
69// define either pinned memory or cudamemory for mpi
70// if both macros are disabled, it will use tpetra memory space which is uvm space for cuda
71// if defined, this use pinned memory instead of device pointer
72// by default, we enable pinned memory
73#define IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI
74//#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI
75
76// if defined, all views are allocated on cuda space intead of cuda uvm space
77#define IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE
78
79// if defined, btdm_scalar_type is used (if impl_scala_type is double, btdm_scalar_type is float)
80#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_SMALL_SCALAR)
81#define IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG
82#endif
83
84// if defined, it uses multiple execution spaces
85#define IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES
86
87namespace Ifpack2 {
88
90
91 namespace KB = KokkosBatched;
92
96 using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
97
98 template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
99 using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
100 MemoryTraitsType::is_random_access |
101 flag>;
102
103 template <typename ViewType>
104 using Unmanaged = Kokkos::View<typename ViewType::data_type,
105 typename ViewType::array_layout,
106 typename ViewType::device_type,
107 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
108 template <typename ViewType>
109 using Atomic = Kokkos::View<typename ViewType::data_type,
110 typename ViewType::array_layout,
111 typename ViewType::device_type,
112 MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
113 template <typename ViewType>
114 using Const = Kokkos::View<typename ViewType::const_data_type,
115 typename ViewType::array_layout,
116 typename ViewType::device_type,
117 typename ViewType::memory_traits>;
118 template <typename ViewType>
119 using ConstUnmanaged = Const<Unmanaged<ViewType> >;
120
121 template <typename ViewType>
122 using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
123
124 template <typename ViewType>
125 using Unmanaged = Kokkos::View<typename ViewType::data_type,
126 typename ViewType::array_layout,
127 typename ViewType::device_type,
128 MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
129
130
131 template <typename ViewType>
132 using Scratch = Kokkos::View<typename ViewType::data_type,
133 typename ViewType::array_layout,
134 typename ViewType::execution_space::scratch_memory_space,
135 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
136
140 template<typename T> struct BlockTridiagScalarType { typedef T type; };
141#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
142 template<> struct BlockTridiagScalarType<double> { typedef float type; };
143 //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
144#endif
145
146#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
147#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
148 KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
149
150#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
151 { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
152#else
154#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN
155#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
156#endif
157
161 template<typename MatrixType>
162 typename Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
163 createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
164 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter", CreateBlockCrsTpetraImporter);
166 using tpetra_map_type = typename impl_type::tpetra_map_type;
167 using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
168 using tpetra_import_type = typename impl_type::tpetra_import_type;
169 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
170 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
171
172 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
173 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
174
175 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
176
177 // This is OK here to use the graph of the A_crs matrix and a block size of 1
178 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
179
180 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
181 const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
182 const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));
183 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
184 return Teuchos::rcp(new tpetra_import_type(src, tgt));
185 }
186
187 // Partial replacement for forward-mode MultiVector::doImport.
188 // Permits overlapped communication and computation, but also supports sync'ed.
189 // I'm finding that overlapped comm/comp can give quite poor performance on some
190 // platforms, so we can't just use it straightforwardly always.
191
192 template<typename MatrixType>
193 struct AsyncableImport {
194 public:
196
197 private:
201#if !defined(HAVE_IFPACK2_MPI)
202 typedef int MPI_Request;
203 typedef int MPI_Comm;
204#endif
207 using scalar_type = typename impl_type::scalar_type;
208
209 static int isend(const MPI_Comm comm, const char* buf, int count, int dest, int tag, MPI_Request* ireq) {
210#ifdef HAVE_IFPACK2_MPI
211 MPI_Request ureq;
212 int ret = MPI_Isend(const_cast<char*>(buf), count, MPI_CHAR, dest, tag, comm, ireq == NULL ? &ureq : ireq);
213 if (ireq == NULL) MPI_Request_free(&ureq);
214 return ret;
215#else
216 return 0;
217#endif
218 }
219
220 static int irecv(const MPI_Comm comm, char* buf, int count, int src, int tag, MPI_Request* ireq) {
221#ifdef HAVE_IFPACK2_MPI
222 MPI_Request ureq;
223 int ret = MPI_Irecv(buf, count, MPI_CHAR, src, tag, comm, ireq == NULL ? &ureq : ireq);
224 if (ireq == NULL) MPI_Request_free(&ureq);
225 return ret;
226#else
227 return 0;
228#endif
229 }
230
231 static int waitany(int count, MPI_Request* reqs, int* index) {
232#ifdef HAVE_IFPACK2_MPI
233 return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
234#else
235 return 0;
236#endif
237 }
238
239 static int waitall(int count, MPI_Request* reqs) {
240#ifdef HAVE_IFPACK2_MPI
241 return MPI_Waitall(count, reqs, MPI_STATUS_IGNORE);
242#else
243 return 0;
244#endif
245 }
246
247 public:
248 using tpetra_map_type = typename impl_type::tpetra_map_type;
249 using tpetra_import_type = typename impl_type::tpetra_import_type;
250
251 using local_ordinal_type = typename impl_type::local_ordinal_type;
252 using global_ordinal_type = typename impl_type::global_ordinal_type;
253 using size_type = typename impl_type::size_type;
254 using impl_scalar_type = typename impl_type::impl_scalar_type;
255
256 using int_1d_view_host = Kokkos::View<int*,Kokkos::HostSpace>;
257 using local_ordinal_type_1d_view_host = Kokkos::View<local_ordinal_type*,Kokkos::HostSpace>;
258
259 using execution_space = typename impl_type::execution_space;
260 using memory_space = typename impl_type::memory_space;
261 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
262 using size_type_1d_view = typename impl_type::size_type_1d_view;
263 using size_type_1d_view_host = Kokkos::View<size_type*,Kokkos::HostSpace>;
264
265#if defined(KOKKOS_ENABLE_CUDA)
266 using impl_scalar_type_1d_view =
267 typename std::conditional<std::is_same<execution_space,Kokkos::Cuda>::value,
268# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_PINNED_MEMORY_FOR_MPI)
269 Kokkos::View<impl_scalar_type*,Kokkos::CudaHostPinnedSpace>,
270# elif defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_MEMORY_FOR_MPI)
271 Kokkos::View<impl_scalar_type*,Kokkos::CudaSpace>,
272# else // no experimental macros are defined
273 typename impl_type::impl_scalar_type_1d_view,
274# endif
275 typename impl_type::impl_scalar_type_1d_view>::type;
276#else
277 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
278#endif
279 using impl_scalar_type_1d_view_host = Kokkos::View<impl_scalar_type*,Kokkos::HostSpace>;
280 using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
281 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
282
283#ifdef HAVE_IFPACK2_MPI
284 MPI_Comm comm;
285#endif
286
287 impl_scalar_type_2d_view_tpetra remote_multivector;
288 local_ordinal_type blocksize;
289
290 template<typename T>
291 struct SendRecvPair {
292 T send, recv;
293 };
294
295 // (s)end and (r)eceive data:
296 SendRecvPair<int_1d_view_host> pids; // mpi ranks
297 SendRecvPair<std::vector<MPI_Request> > reqs; // MPI_Request is pointer, cannot use kokkos view
298 SendRecvPair<size_type_1d_view> offset; // offsets to local id list and data buffer
299 SendRecvPair<size_type_1d_view_host> offset_host; // offsets to local id list and data buffer
300 SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
301 SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer
302 SendRecvPair<impl_scalar_type_1d_view_host> buffer_host; // data buffer
303
304 local_ordinal_type_1d_view dm2cm; // permutation
305
306#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
307 using exec_instance_1d_std_vector = std::vector<execution_space>;
308 exec_instance_1d_std_vector exec_instances;
309#endif
310
311 // for cuda
312 public:
313 void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
314 const size_type_1d_view &offs) {
315 // wrap lens to kokkos view and deep copy to device
316 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
317 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
318
319 // exclusive scan
320 const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
321 const local_ordinal_type lens_size = lens_device.extent(0);
322 Kokkos::parallel_scan
323 ("AsyncableImport::RangePolicy::setOffsetValues",
324 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
325 if (final)
326 offs(i) = update;
327 update += (i < lens_size ? lens_device[i] : 0);
328 });
329 }
330
331 void setOffsetValuesHost(const Teuchos::ArrayView<const size_t> &lens,
332 const size_type_1d_view_host &offs) {
333 // wrap lens to kokkos view and deep copy to device
334 Kokkos::View<size_t*,Kokkos::HostSpace> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
335 const auto lens_device = Kokkos::create_mirror_view_and_copy(memory_space(), lens_host);
336
337 // exclusive scan
338 offs(0) = 0;
339 for (local_ordinal_type i=1,iend=offs.extent(0);i<iend;++i) {
340 offs(i) = offs(i-1) + lens[i-1];
341 }
342 }
343
344 private:
345 void createMpiRequests(const tpetra_import_type &import) {
346 Tpetra::Distributor &distributor = import.getDistributor();
347
348 // copy pids from distributor
349 const auto pids_from = distributor.getProcsFrom();
350 pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
351 memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));
352
353 const auto pids_to = distributor.getProcsTo();
354 pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
355 memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));
356
357 // mpi requests
358 reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
359 reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));
360
361 // construct offsets
362#if 0
363 const auto lengths_to = distributor.getLengthsTo();
364 offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
365
366 const auto lengths_from = distributor.getLengthsFrom();
367 offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
368
369 setOffsetValues(lengths_to, offset.send);
370 offset_host.send = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.send);
371
372 setOffsetValues(lengths_from, offset.recv);
373 offset_host.recv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offset.recv);
374#else
375 const auto lengths_to = distributor.getLengthsTo();
376 offset_host.send = size_type_1d_view_host(do_not_initialize_tag("offset send"), lengths_to.size() + 1);
377
378 const auto lengths_from = distributor.getLengthsFrom();
379 offset_host.recv = size_type_1d_view_host(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);
380
381 setOffsetValuesHost(lengths_to, offset_host.send);
382 //offset.send = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.send);
383
384 setOffsetValuesHost(lengths_from, offset_host.recv);
385 //offset.recv = Kokkos::create_mirror_view_and_copy(memory_space(), offset_host.recv);
386#endif
387 }
388
389 void createSendRecvIDs(const tpetra_import_type &import) {
390 // For each remote PID, the list of LIDs to receive.
391 const auto remote_lids = import.getRemoteLIDs();
392 const local_ordinal_type_1d_view_host
393 remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
394 lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
395 Kokkos::deep_copy(lids.recv, remote_lids_view_host);
396
397 // For each export PID, the list of LIDs to send.
398 auto epids = import.getExportPIDs();
399 auto elids = import.getExportLIDs();
400 TEUCHOS_ASSERT(epids.size() == elids.size());
401 lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
402 auto lids_send_host = Kokkos::create_mirror_view(lids.send);
403
404 // naive search (not sure if pids or epids are sorted)
405 for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
406 const auto pid_send_value = pids.send[i];
407 for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
408 if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
409 TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset_host.send[i+1]);
410 }
411 Kokkos::deep_copy(lids.send, lids_send_host);
412 }
413
414 void createExecutionSpaceInstances() {
415#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
416 //The following line creates 8 streams:
417 exec_instances =
418 Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1, 1, 1, 1, 1);
419#endif
420 }
421
422 public:
423 // for cuda, all tag types are public
424 struct ToBuffer {};
425 struct ToMultiVector {};
426
427 AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
428 const Teuchos::RCP<const tpetra_map_type>& tgt_map,
429 const local_ordinal_type blocksize_,
430 const local_ordinal_type_1d_view dm2cm_) {
431 blocksize = blocksize_;
432 dm2cm = dm2cm_;
433
434#ifdef HAVE_IFPACK2_MPI
435 comm = Tpetra::Details::extractMpiCommFromTeuchos(*tgt_map->getComm());
436#endif
437 const tpetra_import_type import(src_map, tgt_map);
438
439 createMpiRequests(import);
440 createSendRecvIDs(import);
441 createExecutionSpaceInstances();
442 }
443
444 void createDataBuffer(const local_ordinal_type &num_vectors) {
445 const size_type extent_0 = lids.recv.extent(0)*blocksize;
446 const size_type extent_1 = num_vectors;
447 if (remote_multivector.extent(0) == extent_0 &&
448 remote_multivector.extent(1) == extent_1) {
449 // skip
450 } else {
451 remote_multivector =
452 impl_scalar_type_2d_view_tpetra(do_not_initialize_tag("remote multivector"), extent_0, extent_1);
453
454 const auto send_buffer_size = offset_host.send[offset_host.send.extent(0)-1]*blocksize*num_vectors;
455 const auto recv_buffer_size = offset_host.recv[offset_host.recv.extent(0)-1]*blocksize*num_vectors;
456
457 buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);
458 buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
459
460 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
461 buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size);
462 buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size);
463 }
464 }
465 }
466
467 void cancel () {
468#ifdef HAVE_IFPACK2_MPI
469 waitall(reqs.recv.size(), reqs.recv.data());
470 waitall(reqs.send.size(), reqs.send.data());
471#endif
472 }
473
474 // ======================================================================
475 // Async version using execution space instances
476 // ======================================================================
477
478#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
479 template<typename PackTag>
480 static
481 void copy(const local_ordinal_type_1d_view &lids_,
482 const impl_scalar_type_1d_view &buffer_,
483 const local_ordinal_type ibeg_,
484 const local_ordinal_type iend_,
485 const impl_scalar_type_2d_view_tpetra &multivector_,
486 const local_ordinal_type blocksize_,
487 const execution_space &exec_instance_) {
488 const local_ordinal_type num_vectors = multivector_.extent(1);
489 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
490 const local_ordinal_type idiff = iend_ - ibeg_;
491 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
492
493 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
494 local_ordinal_type vector_size(0);
495 if (blocksize_ <= 4) vector_size = 4;
496 else if (blocksize_ <= 8) vector_size = 8;
497 else if (blocksize_ <= 16) vector_size = 16;
498 else vector_size = 32;
499
500 const auto work_item_property = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
501 const team_policy_type policy(exec_instance_, idiff, 1, vector_size);
502 Kokkos::parallel_for
503 (//"AsyncableImport::TeamPolicy::copyViaCudaStream",
504 Kokkos::Experimental::require(policy, work_item_property),
505 KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
506 const local_ordinal_type i = member.league_rank();
507 Kokkos::parallel_for
508 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
509 auto aptr = abase + blocksize_*(i + idiff*j);
510 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
511 if (std::is_same<PackTag,ToBuffer>::value)
512 Kokkos::parallel_for
513 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
514 aptr[k] = bptr[k];
515 });
516 else
517 Kokkos::parallel_for
518 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
519 bptr[k] = aptr[k];
520 });
521 });
522 });
523 }
524
525 void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
526 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
527
528#ifdef HAVE_IFPACK2_MPI
529 // constants and reallocate data buffers if necessary
530 const local_ordinal_type num_vectors = mv.extent(1);
531 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
532
533 // 0. post receive async
534 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
535 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
536 irecv(comm,
537 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
538 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
539 pids.recv[i],
540 42,
541 &reqs.recv[i]);
542 }
543 else {
544 irecv(comm,
545 reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
546 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
547 pids.recv[i],
548 42,
549 &reqs.recv[i]);
550 }
551 }
552
554 execution_space().fence();
555
556 // 1. async memcpy
557 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
558 // 1.0. enqueue pack buffer
559 if (i<8) exec_instances[i%8].fence();
560 copy<ToBuffer>(lids.send, buffer.send,
561 offset_host.send(i), offset_host.send(i+1),
562 mv, blocksize,
563 //execution_space());
564 exec_instances[i%8]);
565 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
566 //if (i<8) exec_instances[i%8].fence();
567 const local_ordinal_type num_vectors = mv.extent(1);
568 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
569
570 Kokkos::deep_copy(exec_instances[i%8],
571 Kokkos::subview(buffer_host.send,
572 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
573 offset_host.send(i)*mv_blocksize,
574 offset_host.send(i+1)*mv_blocksize)),
575 Kokkos::subview(buffer.send,
576 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
577 offset_host.send(i)*mv_blocksize,
578 offset_host.send(i+1)*mv_blocksize)));
579 }
580 }
582 //execution_space().fence();
583 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.send.extent(0));++i) {
584 // 1.1. sync the stream and isend
585 if (i<8) exec_instances[i%8].fence();
586 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
587 isend(comm,
588 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
589 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
590 pids.send[i],
591 42,
592 &reqs.send[i]);
593 }
594 else {
595 isend(comm,
596 reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
597 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
598 pids.send[i],
599 42,
600 &reqs.send[i]);
601 }
602 }
603
604 // 2. poke communication
605 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
606 int flag;
607 MPI_Status stat;
608 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
609 }
610#endif // HAVE_IFPACK2_MPI
611 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
612 }
613
614 void syncRecvVar1() {
615 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
616#ifdef HAVE_IFPACK2_MPI
617 // 0. wait for receive async.
618 for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
619 local_ordinal_type idx = i;
620
621 // 0.0. wait any
622 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
623
624 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
625 const local_ordinal_type num_vectors = remote_multivector.extent(1);
626 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
627
628 Kokkos::deep_copy(
629 Kokkos::subview(buffer.recv,
630 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
631 offset_host.recv(idx)*mv_blocksize,
632 offset_host.recv(idx+1)*mv_blocksize)),
633 Kokkos::subview(buffer_host.recv,
634 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
635 offset_host.recv(idx)*mv_blocksize,
636 offset_host.recv(idx+1)*mv_blocksize)));
637 }
638
639 // 0.1. unpack data after data is moved into a device
640 copy<ToMultiVector>(lids.recv, buffer.recv,
641 offset_host.recv(idx), offset_host.recv(idx+1),
642 remote_multivector, blocksize,
643 exec_instances[idx%8]);
644 }
645
646 // 1. fire up all cuda events
647 Kokkos::fence();
648
649 // 2. cleanup all open comm
650 waitall(reqs.send.size(), reqs.send.data());
651#endif // HAVE_IFPACK2_MPI
652 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
653 }
654#endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
655
656 // ======================================================================
657 // Generic version without using execution space instances
658 // - only difference between device and host architecture is on using team
659 // or range policies.
660 // ======================================================================
661 template<typename PackTag>
662 static
663 void copy(const local_ordinal_type_1d_view &lids_,
664 const impl_scalar_type_1d_view &buffer_,
665 const local_ordinal_type &ibeg_,
666 const local_ordinal_type &iend_,
667 const impl_scalar_type_2d_view_tpetra &multivector_,
668 const local_ordinal_type blocksize_) {
669 const local_ordinal_type num_vectors = multivector_.extent(1);
670 const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
671 const local_ordinal_type idiff = iend_ - ibeg_;
672 const auto abase = buffer_.data() + mv_blocksize*ibeg_;
673 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
674 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
675 local_ordinal_type vector_size(0);
676 if (blocksize_ <= 4) vector_size = 4;
677 else if (blocksize_ <= 8) vector_size = 8;
678 else if (blocksize_ <= 16) vector_size = 16;
679 else vector_size = 32;
680 const team_policy_type policy(idiff, 1, vector_size);
681 Kokkos::parallel_for
682 ("AsyncableImport::TeamPolicy::copy",
683 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
684 const local_ordinal_type i = member.league_rank();
685 Kokkos::parallel_for
686 (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
687 auto aptr = abase + blocksize_*(i + idiff*j);
688 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
689 if (std::is_same<PackTag,ToBuffer>::value)
690 Kokkos::parallel_for
691 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
692 aptr[k] = bptr[k];
693 });
694 else
695 Kokkos::parallel_for
696 (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
697 bptr[k] = aptr[k];
698 });
699 });
700 });
701 } else {
702 const Kokkos::RangePolicy<execution_space> policy(0, idiff*num_vectors);
703 Kokkos::parallel_for
704 ("AsyncableImport::RangePolicy::copy",
705 policy, KOKKOS_LAMBDA(const local_ordinal_type &ij) {
706 const local_ordinal_type i = ij%idiff;
707 const local_ordinal_type j = ij/idiff;
708 auto aptr = abase + blocksize_*(i + idiff*j);
709 auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
710 auto from = std::is_same<PackTag,ToBuffer>::value ? bptr : aptr;
711 auto to = std::is_same<PackTag,ToBuffer>::value ? aptr : bptr;
712 memcpy(to, from, sizeof(impl_scalar_type)*blocksize_);
713 });
714 }
715 }
716
717
721 void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
722 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv", AsyncSendRecv);
723
724#ifdef HAVE_IFPACK2_MPI
725 // constants and reallocate data buffers if necessary
726 const local_ordinal_type num_vectors = mv.extent(1);
727 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
728
729 // receive async
730 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
731 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
732 irecv(comm,
733 reinterpret_cast<char*>(buffer.recv.data() + offset_host.recv[i]*mv_blocksize),
734 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
735 pids.recv[i],
736 42,
737 &reqs.recv[i]);
738 }
739 else {
740 irecv(comm,
741 reinterpret_cast<char*>(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize),
742 (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type),
743 pids.recv[i],
744 42,
745 &reqs.recv[i]);
746 }
747 }
748
749 // send async
750 for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
751 copy<ToBuffer>(lids.send, buffer.send, offset_host.send(i), offset_host.send(i+1),
752 mv, blocksize);
753 Kokkos::fence();
754 if(Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
755 isend(comm,
756 reinterpret_cast<const char*>(buffer.send.data() + offset_host.send[i]*mv_blocksize),
757 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
758 pids.send[i],
759 42,
760 &reqs.send[i]);
761 }
762 else {
763 Kokkos::deep_copy(
764 Kokkos::subview(buffer_host.send,
765 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
766 offset_host.send(i)*mv_blocksize,
767 offset_host.send(i+1)*mv_blocksize)),
768 Kokkos::subview(buffer.send,
769 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
770 offset_host.send(i)*mv_blocksize,
771 offset_host.send(i+1)*mv_blocksize)));
772 isend(comm,
773 reinterpret_cast<const char*>(buffer_host.send.data() + offset_host.send[i]*mv_blocksize),
774 (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type),
775 pids.send[i],
776 42,
777 &reqs.send[i]);
778 }
779 }
780
781 // I find that issuing an Iprobe seems to nudge some MPIs into action,
782 // which helps with overlapped comm/comp performance.
783 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
784 int flag;
785 MPI_Status stat;
786 MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
787 }
788#endif
789 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
790 }
791
792 void syncRecvVar0() {
793 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv", SyncRecv);
794#ifdef HAVE_IFPACK2_MPI
795 // receive async.
796 for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
797 local_ordinal_type idx = i;
798 waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
799 if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) {
800 const local_ordinal_type num_vectors = remote_multivector.extent(1);
801 const local_ordinal_type mv_blocksize = blocksize*num_vectors;
802 Kokkos::deep_copy(
803 Kokkos::subview(buffer.recv,
804 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
805 offset_host.recv(idx)*mv_blocksize,
806 offset_host.recv(idx+1)*mv_blocksize)),
807 Kokkos::subview(buffer_host.recv,
808 Kokkos::pair<local_ordinal_type, local_ordinal_type>(
809 offset_host.recv(idx)*mv_blocksize,
810 offset_host.recv(idx+1)*mv_blocksize)));
811 }
812 copy<ToMultiVector>(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1),
813 remote_multivector, blocksize);
814 }
815 // wait on the sends to match all Isends with a cleanup operation.
816 waitall(reqs.send.size(), reqs.send.data());
817#endif
818 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
819 }
820
824 void asyncSendRecv(const impl_scalar_type_2d_view_tpetra &mv) {
825#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
826#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
827 asyncSendRecvVar1(mv);
828#else
829 asyncSendRecvVar0(mv);
830#endif
831#else
832 asyncSendRecvVar0(mv);
833#endif
834 }
835 void syncRecv() {
836#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL)
837#if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_EXEC_SPACE_INSTANCES)
838 syncRecvVar1();
839#else
840 syncRecvVar0();
841#endif
842#else
843 syncRecvVar0();
844#endif
845 }
846
847 void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
848 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange", SyncExchange);
849 asyncSendRecv(mv);
850 syncRecv();
851 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
852 }
853
854 impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
855 };
856
857 template <typename ViewType1, typename ViewType2>
858 struct are_same_struct {
859 ViewType1 keys1;
860 ViewType2 keys2;
861
862 are_same_struct(ViewType1 keys1_, ViewType2 keys2_) : keys1(keys1_), keys2(keys2_) {}
863 KOKKOS_INLINE_FUNCTION
864 void operator()(int i, unsigned int& count) const {
865 if (keys1(i) != keys2(i)) count++;
866 }
867 };
868
869 template <typename ViewType1, typename ViewType2>
870 bool are_same (ViewType1 keys1, ViewType2 keys2) {
871 unsigned int are_same_ = 0;
872
873 Kokkos::parallel_reduce(Kokkos::RangePolicy<typename ViewType1::execution_space>(0, keys1.extent(0)),
874 are_same_struct(keys1, keys2),
875 are_same_);
876 return are_same_==0;
877 }
878
882 template<typename MatrixType>
883 Teuchos::RCP<AsyncableImport<MatrixType> >
884 createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A) {
885 IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter", createBlockCrsAsyncImporter);
887 using tpetra_map_type = typename impl_type::tpetra_map_type;
888 using local_ordinal_type = typename impl_type::local_ordinal_type;
889 using global_ordinal_type = typename impl_type::global_ordinal_type;
890 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
891 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
892 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
893 using global_indices_array_device_type = Kokkos::View<const global_ordinal_type*, typename tpetra_map_type::device_type>;
894
895 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
896 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
897
898 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
899
900 // This is OK here to use the graph of the A_crs matrix and a block size of 1
901 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
902
903 const auto blocksize = hasBlockCrsMatrix ? A_bcrs->getBlockSize() : 1;
904 const auto domain_map = g.getDomainMap();
905 const auto column_map = g.getColMap();
906
907 std::vector<global_ordinal_type> gids;
908
909 Kokkos::Subview<global_indices_array_device_type, std::pair<int,int>> column_map_global_iD_last;
910
911 bool separate_remotes = true, found_first = false, need_owned_permutation = false;
912 {
913 IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::loop_over_local_elements", loop_over_local_elements);
914
915 global_indices_array_device_type column_map_global_iD = column_map->getMyGlobalIndicesDevice();
916 global_indices_array_device_type domain_map_global_iD = domain_map->getMyGlobalIndicesDevice();
917
918 if(are_same(domain_map_global_iD, column_map_global_iD)) {
919 // this should be the most likely path
920 separate_remotes = true;
921 need_owned_permutation = false;
922
923 column_map_global_iD_last = Kokkos::subview(column_map_global_iD,
924 std::pair<int,int>(domain_map_global_iD.extent(0), column_map_global_iD.extent(0)));
925 }
926 else {
927 // This loop is relatively expensive
928 for (size_t i=0;i<column_map->getLocalNumElements();++i) {
929 const global_ordinal_type gid = column_map->getGlobalElement(i);
930 if (!domain_map->isNodeGlobalElement(gid)) {
931 found_first = true;
932 gids.push_back(gid);
933 } else if (found_first) {
934 separate_remotes = false;
935 break;
936 }
937 if (!found_first && !need_owned_permutation &&
938 domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
939 // The owned part of the domain and column maps are different
940 // orderings. We *could* do a super efficient impl of this case in the
941 // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
942 // really, if a caller cares about speed, they wouldn't make different
943 // local permutations like this. So we punt on the best impl and go for
944 // a pretty good one: the permutation is done in place in
945 // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
946 // is the presumably worse memory access pattern of the input vector.
947 need_owned_permutation = true;
948 }
949 }
950 }
951 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
952 }
953
954 if (separate_remotes) {
955 IFPACK2_BLOCKHELPER_TIMER("createBlockCrsAsyncImporter::separate_remotes", separate_remotes);
956 const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
957 const auto parsimonious_col_map
958 = need_owned_permutation ?
959 Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm())):
960 Teuchos::rcp(new tpetra_map_type(invalid, column_map_global_iD_last, 0, domain_map->getComm()));
961 if (parsimonious_col_map->getGlobalNumElements() > 0) {
962 // make the importer only if needed.
963 local_ordinal_type_1d_view dm2cm;
964 if (need_owned_permutation) {
965 dm2cm = local_ordinal_type_1d_view(do_not_initialize_tag("dm2cm"), domain_map->getLocalNumElements());
966 const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
967 for (size_t i=0;i<domain_map->getLocalNumElements();++i)
968 dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
969 Kokkos::deep_copy(dm2cm, dm2cm_host);
970 }
971 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
972 return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
973 }
974 }
975 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
976 return Teuchos::null;
977 }
978
979 template<typename local_ordinal_type>
980 local_ordinal_type costTRSM(const local_ordinal_type block_size) {
981 return block_size*block_size;
982 }
983
984 template<typename local_ordinal_type>
985 local_ordinal_type costGEMV(const local_ordinal_type block_size) {
986 return 2*block_size*block_size;
987 }
988
989 template<typename local_ordinal_type>
990 local_ordinal_type costTriDiagSolve(const local_ordinal_type subline_length, const local_ordinal_type block_size) {
991 return 2 * subline_length * costTRSM(block_size) + 2 * (subline_length-1) * costGEMV(block_size);
992 }
993
994 template<typename local_ordinal_type>
995 local_ordinal_type costSolveSchur(const local_ordinal_type num_parts,
996 const local_ordinal_type num_teams,
997 const local_ordinal_type line_length,
998 const local_ordinal_type block_size,
999 const local_ordinal_type n_subparts_per_part) {
1000 const local_ordinal_type subline_length = ceil(double(line_length - (n_subparts_per_part-1) * 2) / n_subparts_per_part);
1001 if (subline_length < 1) {
1002 return INT_MAX;
1003 }
1004
1005 const local_ordinal_type p_n_lines = ceil(double(num_parts)/num_teams);
1006 const local_ordinal_type p_n_sublines = ceil(double(n_subparts_per_part)*num_parts/num_teams);
1007 const local_ordinal_type p_n_sublines_2 = ceil(double(n_subparts_per_part-1)*num_parts/num_teams);
1008
1009 const local_ordinal_type p_costApplyE = p_n_sublines_2 * subline_length * 2 * costGEMV(block_size);
1010 const local_ordinal_type p_costApplyS = p_n_lines * costTriDiagSolve((n_subparts_per_part-1)*2,block_size);
1011 const local_ordinal_type p_costApplyAinv = p_n_sublines * costTriDiagSolve(subline_length,block_size);
1012 const local_ordinal_type p_costApplyC = p_n_sublines_2 * 2 * costGEMV(block_size);
1013
1014 if (n_subparts_per_part == 1) {
1015 return p_costApplyAinv;
1016 }
1017 return p_costApplyE + p_costApplyS + p_costApplyAinv + p_costApplyC;
1018 }
1019
1020 template<typename local_ordinal_type>
1021 local_ordinal_type getAutomaticNSubparts(const local_ordinal_type num_parts,
1022 const local_ordinal_type num_teams,
1023 const local_ordinal_type line_length,
1024 const local_ordinal_type block_size) {
1025 local_ordinal_type n_subparts_per_part_0 = 1;
1026 local_ordinal_type flop_0 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0);
1027 local_ordinal_type flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, n_subparts_per_part_0+1);
1028 while (flop_0 > flop_1) {
1029 flop_0 = flop_1;
1030 flop_1 = costSolveSchur(num_parts, num_teams, line_length, block_size, (++n_subparts_per_part_0)+1);
1031 }
1032 return n_subparts_per_part_0;
1033 }
1034
1035 template<typename ArgActiveExecutionMemorySpace>
1036 struct SolveTridiagsDefaultModeAndAlgo;
1037
1041 template<typename MatrixType>
1042 BlockHelperDetails::PartInterface<MatrixType>
1043 createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1044 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
1045 const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
1046 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part_in) {
1047 IFPACK2_BLOCKHELPER_TIMER("createPartInterface", createPartInterface);
1049 using local_ordinal_type = typename impl_type::local_ordinal_type;
1050 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1051 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
1052 using size_type = typename impl_type::size_type;
1053
1054 auto bA = Teuchos::rcp_dynamic_cast<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type>(A);
1055
1056 TEUCHOS_ASSERT(!bA.is_null() || G->getLocalNumRows() != 0);
1057 const local_ordinal_type blocksize = bA.is_null() ? A->getLocalNumRows() / G->getLocalNumRows() : A->getBlockSize();
1058 constexpr int vector_length = impl_type::vector_length;
1059 constexpr int internal_vector_length = impl_type::internal_vector_length;
1060
1061 const auto comm = A->getRowMap()->getComm();
1062
1063 BlockHelperDetails::PartInterface<MatrixType> interf;
1064
1065 const bool jacobi = partitions.size() == 0;
1066 const local_ordinal_type A_n_lclrows = G->getLocalNumRows();
1067 const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
1068
1069 typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
1070 std::vector<size_idx_pair_type> partsz(nparts);
1071
1072 if (!jacobi) {
1073 for (local_ordinal_type i=0;i<nparts;++i)
1074 partsz[i] = size_idx_pair_type(partitions[i].size(), i);
1075 std::sort(partsz.begin(), partsz.end(),
1076 [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
1077 return x.first > y.first;
1078 });
1079 }
1080
1081 local_ordinal_type n_subparts_per_part;
1082 if (n_subparts_per_part_in == -1) {
1083 // If the number of subparts is set to -1, the user let the algorithm
1084 // decides the value automatically
1085 using execution_space = typename impl_type::execution_space;
1086
1087 const int line_length = partsz[0].first;
1088
1089 const local_ordinal_type team_size =
1090 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1091 recommended_team_size(blocksize, vector_length, internal_vector_length);
1092
1093 const local_ordinal_type num_teams = std::max(1, execution_space().concurrency() / (team_size * vector_length));
1094
1095 n_subparts_per_part = getAutomaticNSubparts(nparts, num_teams, line_length, blocksize);
1096
1097#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1098 printf("Automatically chosen n_subparts_per_part = %d for nparts = %d, num_teams = %d, team_size = %d, line_length = %d, and blocksize = %d;\n", n_subparts_per_part, nparts, num_teams, team_size, line_length, blocksize);
1099#endif
1100 }
1101 else {
1102 n_subparts_per_part = n_subparts_per_part_in;
1103 }
1104
1105 // Total number of sub lines:
1106 const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
1107 // Total number of sub lines + the Schur complement blocks.
1108 // For a given live 2 sub lines implies one Schur complement, 3 sub lines implies two Schur complements etc.
1109 const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
1110
1111#if defined(BLOCKTRIDICONTAINER_DEBUG)
1112 local_ordinal_type nrows = 0;
1113 if (jacobi)
1114 nrows = nparts;
1115 else
1116 for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
1117
1118 TEUCHOS_TEST_FOR_EXCEPT_MSG
1119 (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
1120 << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
1121#endif
1122
1123 // permutation vector
1124 std::vector<local_ordinal_type> p;
1125 if (jacobi) {
1126 interf.max_partsz = 1;
1127 interf.max_subpartsz = 0;
1128 interf.n_subparts_per_part = 1;
1129 interf.nparts = nparts;
1130 } else {
1131 // reorder parts to maximize simd packing efficiency
1132 p.resize(nparts);
1133
1134 for (local_ordinal_type i=0;i<nparts;++i)
1135 p[i] = partsz[i].second;
1136
1137 interf.max_partsz = partsz[0].first;
1138
1139 constexpr local_ordinal_type connection_length = 2;
1140 const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1141 const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1142
1143 interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
1144 interf.n_subparts_per_part = n_subparts_per_part;
1145 interf.nparts = nparts;
1146 }
1147
1148 // allocate parts
1149 interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
1150 interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
1151 interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
1152 interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
1153 interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1154
1155 interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
1156 interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
1157 interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
1158
1159 interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
1160
1161 // mirror to host and compute on host execution space
1162 const auto partptr = Kokkos::create_mirror_view(interf.partptr);
1163 const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
1164
1165 const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
1166 const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
1167 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1168 const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
1169
1170 const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
1171 const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
1172 const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
1173
1174 // Determine parts.
1175 interf.row_contiguous = true;
1176 partptr(0) = 0;
1177 part2rowidx0(0) = 0;
1178 part2packrowidx0(0) = 0;
1179 local_ordinal_type pack_nrows = 0;
1180 local_ordinal_type pack_nrows_sub = 0;
1181 if (jacobi) {
1182 IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi);
1183 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1184 constexpr local_ordinal_type ipnrows = 1;
1185 //assume No overlap.
1186 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1187 // Since parts are ordered in decreasing size, the size of the first
1188 // part in a pack is the size for all parts in the pack.
1189 if (ip % vector_length == 0) pack_nrows = ipnrows;
1190 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1191 const local_ordinal_type offset = partptr(ip);
1192 for (local_ordinal_type i=0;i<ipnrows;++i) {
1193 const auto lcl_row = ip;
1194 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1195 BlockHelperDetails::get_msg_prefix(comm)
1196 << "partitions[" << p[ip] << "]["
1197 << i << "] = " << lcl_row
1198 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1199 << "].");
1200 lclrow(offset+i) = lcl_row;
1201 rowidx2part(offset+i) = ip;
1202 if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1203 interf.row_contiguous = false;
1204 }
1205 partptr(ip+1) = offset + ipnrows;
1206 }
1207 part2rowidx0_sub(0) = 0;
1208 partptr_sub(0, 0) = 0;
1209
1210 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1211 constexpr local_ordinal_type ipnrows = 1;
1212 const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1213
1214 TEUCHOS_TEST_FOR_EXCEPTION
1215 (full_line_length != ipnrows, std::logic_error,
1216 "In the part " << ip );
1217
1218 constexpr local_ordinal_type connection_length = 2;
1219
1220 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1221 TEUCHOS_TEST_FOR_EXCEPTION
1222 (true, std::logic_error,
1223 "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1224
1225 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1226 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1227
1228 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1229
1230 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1231 const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1232 const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1233 if (local_sub_ip != n_subparts_per_part-1) {
1234 if (local_sub_ip != 0) {
1235 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1236 }
1237 else if (ip != 0) {
1238 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1239 }
1240 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1241 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1242 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1243
1244 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1245 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1246
1247#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1248 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
1249 printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1250#endif
1251 }
1252 else {
1253 if (local_sub_ip != 0) {
1254 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1255 }
1256 else if (ip != 0) {
1257 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1258 }
1259 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1260
1261 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1262
1263#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1264 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
1265#endif
1266 }
1267 }
1268 }
1269
1270#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1271 std::cout << "partptr_sub = " << std::endl;
1272 for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
1273 for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
1274 std::cout << partptr_sub(i,j) << " ";
1275 }
1276 std::cout << std::endl;
1277 }
1278 std::cout << "partptr_sub end" << std::endl;
1279#endif
1280
1281 {
1282 local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1283
1284 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1285 for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1286 part2packrowidx0_sub(ip, 0) = 0;
1287 }
1288 for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1289 if (ipack != 0) {
1290 local_ordinal_type ip_min = ipack*vector_length;
1291 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1292 for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1293 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1294 }
1295 }
1296
1297 for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1298 local_ordinal_type ip_min = ipack*vector_length;
1299 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1300
1301 const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1302
1303 constexpr local_ordinal_type connection_length = 2;
1304
1305 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1306 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1307
1308 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1309 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1310 if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1311
1312 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1313
1314 for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1315 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1316 }
1317 }
1318 }
1319
1320 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1321 }
1322 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1323 } else {
1324 IFPACK2_BLOCKHELPER_TIMER("compute part indices", indices);
1325 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1326 const auto* part = &partitions[p[ip]];
1327 const local_ordinal_type ipnrows = part->size();
1328 TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
1329 TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
1330 BlockHelperDetails::get_msg_prefix(comm)
1331 << "partition " << p[ip]
1332 << " is empty, which is not allowed.");
1333 //assume No overlap.
1334 part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
1335 // Since parts are ordered in decreasing size, the size of the first
1336 // part in a pack is the size for all parts in the pack.
1337 if (ip % vector_length == 0) pack_nrows = ipnrows;
1338 part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
1339 const local_ordinal_type offset = partptr(ip);
1340 for (local_ordinal_type i=0;i<ipnrows;++i) {
1341 const auto lcl_row = (*part)[i];
1342 TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
1343 BlockHelperDetails::get_msg_prefix(comm)
1344 << "partitions[" << p[ip] << "]["
1345 << i << "] = " << lcl_row
1346 << " but input matrix implies limits of [0, " << A_n_lclrows-1
1347 << "].");
1348 lclrow(offset+i) = lcl_row;
1349 rowidx2part(offset+i) = ip;
1350 if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
1351 interf.row_contiguous = false;
1352 }
1353 partptr(ip+1) = offset + ipnrows;
1354
1355#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1356 printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = offset = %d, part->size() = ipnrows = %d;\n", ip, offset, ipnrows);
1357 printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
1358#endif
1359 }
1360
1361 part2rowidx0_sub(0) = 0;
1362 partptr_sub(0, 0) = 0;
1363 //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
1364
1365 for (local_ordinal_type ip=0;ip<nparts;++ip) {
1366 const auto* part = &partitions[p[ip]];
1367 const local_ordinal_type ipnrows = part->size();
1368 const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
1369
1370 TEUCHOS_TEST_FOR_EXCEPTION
1371 (full_line_length != ipnrows, std::logic_error,
1372 "In the part " << ip );
1373
1374 constexpr local_ordinal_type connection_length = 2;
1375
1376 if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
1377 TEUCHOS_TEST_FOR_EXCEPTION
1378 (true, std::logic_error,
1379 "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");
1380
1381 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1382 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1383
1384 if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
1385
1386 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
1387 const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
1388 const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
1389 if (local_sub_ip != n_subparts_per_part-1) {
1390 if (local_sub_ip != 0) {
1391 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1392 }
1393 else if (ip != 0) {
1394 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1395 }
1396 partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
1397 partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
1398 partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
1399
1400 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
1401 part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
1402
1403#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1404 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), sub_line_length);
1405 printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
1406#endif
1407 }
1408 else {
1409 if (local_sub_ip != 0) {
1410 partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
1411 }
1412 else if (ip != 0) {
1413 partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
1414 }
1415 partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
1416
1417 part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
1418
1419#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
1420 printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(sub_ip, 0), last_sub_line_length);
1421#endif
1422 }
1423 }
1424 }
1425
1426 {
1427 local_ordinal_type npacks = ceil(float(nparts)/vector_length);
1428
1429 local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
1430 for (local_ordinal_type ip=0;ip<ip_max;++ip) {
1431 part2packrowidx0_sub(ip, 0) = 0;
1432 }
1433 for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
1434 if (ipack != 0) {
1435 local_ordinal_type ip_min = ipack*vector_length;
1436 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1437 for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
1438 part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
1439 }
1440 }
1441
1442 for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
1443 local_ordinal_type ip_min = ipack*vector_length;
1444 ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
1445
1446 const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
1447
1448 constexpr local_ordinal_type connection_length = 2;
1449
1450 const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
1451 const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
1452
1453 if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
1454 if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
1455 if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
1456
1457 part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
1458
1459 for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
1460 part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
1461 }
1462 }
1463 }
1464
1465 Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
1466 }
1467 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1468 }
1469#if defined(BLOCKTRIDICONTAINER_DEBUG)
1470 TEUCHOS_ASSERT(partptr(nparts) == nrows);
1471#endif
1472 if (lclrow(0) != 0) interf.row_contiguous = false;
1473
1474 Kokkos::deep_copy(interf.partptr, partptr);
1475 Kokkos::deep_copy(interf.lclrow, lclrow);
1476
1477 Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
1478
1479 //assume No overlap. Thus:
1480 interf.part2rowidx0 = interf.partptr;
1481 Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
1482
1483 interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
1484 Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
1485
1486 { // Fill packptr.
1487 IFPACK2_BLOCKHELPER_TIMER("Fill packptr", packptr0);
1488 local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1489 npacks = 0;
1490 for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur
1491 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1492 ++npacks;
1493
1494 interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1495 const auto packptr = Kokkos::create_mirror_view(interf.packptr);
1496 packptr(0) = 0;
1497 for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
1498 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1499 packptr(k++) = ip;
1500
1501 Kokkos::deep_copy(interf.packptr, packptr);
1502
1503 local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
1504 npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
1505
1506 interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
1507 interf.packindices_schur = local_ordinal_type_2d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart,n_subparts_per_part-1);
1508
1509 const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
1510 const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
1511
1512
1513 // Fill packindices_sub and packindices_schur
1514 for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
1515 for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1516 packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
1517 packindices_schur(local_pack_ip,local_sub_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
1518 }
1519 }
1520
1521 for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
1522 packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
1523 }
1524
1525#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1526 std::cout << "packindices_sub = " << std::endl;
1527 for (size_type i = 0; i < packindices_sub.extent(0); ++i) {
1528 std::cout << packindices_sub(i) << " ";
1529 }
1530 std::cout << std::endl;
1531 std::cout << "packindices_sub end" << std::endl;
1532
1533 std::cout << "packindices_schur = " << std::endl;
1534 for (size_type i = 0; i < packindices_schur.extent(0); ++i) {
1535 for (size_type j = 0; j < packindices_schur.extent(1); ++j) {
1536 std::cout << packindices_schur(i,j) << " ";
1537 }
1538 std::cout << std::endl;
1539 }
1540
1541 std::cout << "packindices_schur end" << std::endl;
1542#endif
1543
1544 Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
1545 Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
1546
1547 interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
1548 const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
1549 packptr_sub(0) = 0;
1550 for (local_ordinal_type k=0;k<npacks + 1;++k)
1551 packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
1552
1553 Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
1554 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1555 }
1556 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1557
1558 return interf;
1559 }
1560
1564 template <typename MatrixType>
1565 struct BlockTridiags {
1567 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1568 using size_type_1d_view = typename impl_type::size_type_1d_view;
1569 using size_type_2d_view = typename impl_type::size_type_2d_view;
1570 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1571 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1572
1573 // flat_td_ptr(i) is the index into flat-array values of the start of the
1574 // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
1575 // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
1576 // vector_length is the position in the pack.
1577 size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
1578 // List of local column indices into A from which to grab
1579 // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
1580 local_ordinal_type_1d_view A_colindsub;
1581 // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
1582 // tridiag's pack, and i % vector_length gives the position in the pack.
1583 vector_type_3d_view values;
1584 // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
1585 // Schur's pack, and i % vector_length gives the position in the pack.
1586 vector_type_3d_view values_schur;
1587 // inv(A_00)*A_01 block values.
1588 vector_type_4d_view e_values;
1589
1590 bool is_diagonal_only;
1591
1592 BlockTridiags() = default;
1593 BlockTridiags(const BlockTridiags &b) = default;
1594
1595 // Index into row-major block of a tridiag.
1596 template <typename idx_type>
1597 static KOKKOS_FORCEINLINE_FUNCTION
1598 idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
1599 // Given a row of a row-major tridiag, return the index of the first block
1600 // in that row.
1601 template <typename idx_type>
1602 static KOKKOS_FORCEINLINE_FUNCTION
1603 idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
1604 // Number of blocks in a tridiag having a given number of rows.
1605 template <typename idx_type>
1606 static KOKKOS_FORCEINLINE_FUNCTION
1607 idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
1608 // Number of blocks associated to a Schur complement having a given number of rows.
1609 template <typename idx_type>
1610 static KOKKOS_FORCEINLINE_FUNCTION
1611 idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
1612 };
1613
1614
1618 template<typename MatrixType>
1620 createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
1621 IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags", createBlockTridiags0);
1623 using execution_space = typename impl_type::execution_space;
1624 using local_ordinal_type = typename impl_type::local_ordinal_type;
1625 using size_type = typename impl_type::size_type;
1626 using size_type_2d_view = typename impl_type::size_type_2d_view;
1627
1628 constexpr int vector_length = impl_type::vector_length;
1629
1631
1632 const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
1633
1634 { // construct the flat index pointers into the tridiag values array.
1635 btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1636 const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
1637 Kokkos::parallel_scan
1638 ("createBlockTridiags::RangePolicy::flat_td_ptr",
1639 policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
1640 const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
1641 const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
1642
1643 if (final) {
1644 btdm.flat_td_ptr(partidx, local_subpartidx) = update;
1645 }
1646 if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
1647 const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
1648 if (local_subpartidx % 2 == 0)
1649 update += btdm.NumBlocks(nrows);
1650 else
1651 update += btdm.NumBlocksSchur(nrows);
1652 }
1653 });
1654
1655 const auto nblocks = Kokkos::create_mirror_view_and_copy
1656 (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
1657 btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
1658 }
1659
1660 // And the packed index pointers.
1661 if (vector_length == 1) {
1662 btdm.pack_td_ptr = btdm.flat_td_ptr;
1663 } else {
1664 //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
1665
1666 local_ordinal_type npacks_per_subpart = 0;
1667 const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
1668 Kokkos::deep_copy(part2packrowidx0, interf.part2packrowidx0);
1669 for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur
1670 if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
1671 ++npacks_per_subpart;
1672
1673 btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
1674 const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
1675
1676 Kokkos::parallel_for
1677 ("createBlockTridiags::RangePolicy::pack_td_ptr",
1678 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1679 for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
1680 const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
1681 const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
1682
1683 const local_ordinal_type parti = interf.packptr_sub(pack_id);
1684 const local_ordinal_type partidx = parti%interf.nparts;
1685
1686 for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
1687 btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
1688 }
1689 }
1690 });
1691 }
1692
1693 btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
1694
1695 const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
1696 constexpr local_ordinal_type connection_length = 2;
1697
1698 host_pack_td_ptr_schur(0,0) = 0;
1699 for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
1700 if (i % vector_length == 0) {
1701 if (i != 0)
1702 host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
1703 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
1704 host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
1705 }
1706 }
1707 else {
1708 for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
1709 host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
1710 }
1711 }
1712 }
1713
1714 Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
1715
1716#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
1717 const auto host_flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1718 std::cout << "flat_td_ptr = " << std::endl;
1719 for (size_type i = 0; i < host_flat_td_ptr.extent(0); ++i) {
1720 for (size_type j = 0; j < host_flat_td_ptr.extent(1); ++j) {
1721 std::cout << host_flat_td_ptr(i,j) << " ";
1722 }
1723 std::cout << std::endl;
1724 }
1725 std::cout << "flat_td_ptr end" << std::endl;
1726
1727 const auto host_pack_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.pack_td_ptr);
1728
1729 std::cout << "pack_td_ptr = " << std::endl;
1730 for (size_type i = 0; i < host_pack_td_ptr.extent(0); ++i) {
1731 for (size_type j = 0; j < host_pack_td_ptr.extent(1); ++j) {
1732 std::cout << host_pack_td_ptr(i,j) << " ";
1733 }
1734 std::cout << std::endl;
1735 }
1736 std::cout << "pack_td_ptr end" << std::endl;
1737
1738
1739 std::cout << "pack_td_ptr_schur = " << std::endl;
1740 for (size_type i = 0; i < host_pack_td_ptr_schur.extent(0); ++i) {
1741 for (size_type j = 0; j < host_pack_td_ptr_schur.extent(1); ++j) {
1742 std::cout << host_pack_td_ptr_schur(i,j) << " ";
1743 }
1744 std::cout << std::endl;
1745 }
1746 std::cout << "pack_td_ptr_schur end" << std::endl;
1747#endif
1748
1749 // values and A_colindsub are created in the symbolic phase
1750 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
1751
1752 return btdm;
1753 }
1754
1755 // Set the tridiags to be I to the full pack block size. That way, if a
1756 // tridiag within a pack is shorter than the longest one, the extra blocks are
1757 // processed in a safe way. Similarly, in the solve phase, if the extra blocks
1758 // in the packed multvector are 0, and the tridiag LU reflects the extra I
1759 // blocks, then the solve proceeds as though the extra blocks aren't
1760 // present. Since this extra work is part of the SIMD calls, it's not actually
1761 // extra work. Instead, it means we don't have to put checks or masks in, or
1762 // quiet NaNs. This functor has to be called just once, in the symbolic phase,
1763 // since the numeric phase fills in only the used entries, leaving these I
1764 // blocks intact.
1765 template<typename MatrixType>
1766 void
1767 setTridiagsToIdentity
1768 (const BlockTridiags<MatrixType>& btdm,
1769 const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
1770 {
1772 using execution_space = typename impl_type::execution_space;
1773 using local_ordinal_type = typename impl_type::local_ordinal_type;
1774 using size_type_2d_view = typename impl_type::size_type_2d_view;
1775
1776 const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
1777 const local_ordinal_type blocksize = btdm.values.extent(1);
1778
1779 {
1780 const int vector_length = impl_type::vector_length;
1781 const int internal_vector_length = impl_type::internal_vector_length;
1782
1783 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
1784 using internal_vector_type = typename impl_type::internal_vector_type;
1785 using internal_vector_type_4d_view =
1786 typename impl_type::internal_vector_type_4d_view;
1787
1788 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1789 const internal_vector_type_4d_view values
1790 (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
1791 btdm.values.extent(0),
1792 btdm.values.extent(1),
1793 btdm.values.extent(2),
1794 vector_length/internal_vector_length);
1795 const local_ordinal_type vector_loop_size = values.extent(3);
1796#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
1797 local_ordinal_type total_team_size(0);
1798 if (blocksize <= 5) total_team_size = 32;
1799 else if (blocksize <= 9) total_team_size = 64;
1800 else if (blocksize <= 12) total_team_size = 96;
1801 else if (blocksize <= 16) total_team_size = 128;
1802 else if (blocksize <= 20) total_team_size = 160;
1803 else total_team_size = 160;
1804 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1805 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1806#elif defined(KOKKOS_ENABLE_HIP)
1807 // FIXME: HIP
1808 // These settings might be completely wrong
1809 // will have to do some experiments to decide
1810 // what makes sense on AMD GPUs
1811 local_ordinal_type total_team_size(0);
1812 if (blocksize <= 5) total_team_size = 32;
1813 else if (blocksize <= 9) total_team_size = 64;
1814 else if (blocksize <= 12) total_team_size = 96;
1815 else if (blocksize <= 16) total_team_size = 128;
1816 else if (blocksize <= 20) total_team_size = 160;
1817 else total_team_size = 160;
1818 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1819 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1820#elif defined(KOKKOS_ENABLE_SYCL)
1821 // SYCL: FIXME
1822 local_ordinal_type total_team_size(0);
1823 if (blocksize <= 5) total_team_size = 32;
1824 else if (blocksize <= 9) total_team_size = 64;
1825 else if (blocksize <= 12) total_team_size = 96;
1826 else if (blocksize <= 16) total_team_size = 128;
1827 else if (blocksize <= 20) total_team_size = 160;
1828 else total_team_size = 160;
1829 const local_ordinal_type team_size = total_team_size/vector_loop_size;
1830 const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
1831#else
1832 // Host architecture: team size is always one
1833 const team_policy_type policy(packptr.extent(0)-1, 1, 1);
1834#endif
1835 Kokkos::parallel_for
1836 ("setTridiagsToIdentity::TeamPolicy",
1837 policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
1838 const local_ordinal_type k = member.league_rank();
1839 const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
1840 const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
1841
1842 const local_ordinal_type diff = iend - ibeg;
1843 const local_ordinal_type icount = diff/3 + (diff%3 > 0);
1844 const btdm_scalar_type one(1);
1845 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
1846 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
1847 const local_ordinal_type i = ibeg + ii*3;
1848 for (local_ordinal_type j=0;j<blocksize;++j) {
1849 values(i,j,j,v) = one;
1850 }
1851 });
1852 });
1853 });
1854 }
1855 }
1856
1860 template<typename MatrixType>
1861 void
1862 performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
1863 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &g,
1864 const BlockHelperDetails::PartInterface<MatrixType> &interf,
1867 const bool overlap_communication_and_computation,
1868 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
1869 bool useSeqMethod) {
1870 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase);
1871
1873
1874 using execution_space = typename impl_type::execution_space;
1875 using host_execution_space = typename impl_type::host_execution_space;
1876
1877 using local_ordinal_type = typename impl_type::local_ordinal_type;
1878 using global_ordinal_type = typename impl_type::global_ordinal_type;
1879 using size_type = typename impl_type::size_type;
1880 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1881 using size_type_1d_view = typename impl_type::size_type_1d_view;
1882 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1883 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
1884 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
1885 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1886
1887 constexpr int vector_length = impl_type::vector_length;
1888
1889 const auto comm = A->getRowMap()->getComm();
1890
1891 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A);
1892 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A);
1893
1894 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
1895 TEUCHOS_ASSERT(hasBlockCrsMatrix || g->getLocalNumRows() != 0);
1896 const local_ordinal_type blocksize = hasBlockCrsMatrix ? A->getBlockSize() : A->getLocalNumRows()/g->getLocalNumRows();
1897
1898 // mirroring to host
1899 const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
1900 const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
1901 const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
1902 const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
1903 const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);
1904
1905 const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
1906
1907 Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
1908
1909 // find column to row map on host
1910
1911 Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1912 {
1913 const auto rowmap = g->getRowMap();
1914 const auto colmap = g->getColMap();
1915 const auto dommap = g->getDomainMap();
1916 TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));
1917
1918#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
1919 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1920 Kokkos::parallel_for
1921 ("performSymbolicPhase::RangePolicy::col2row",
1922 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1923 const global_ordinal_type gid = rowmap->getGlobalElement(lr);
1924 TEUCHOS_ASSERT(gid != Teuchos::OrdinalTraits<global_ordinal_type>::invalid());
1925 if (dommap->isNodeGlobalElement(gid)) {
1926 const local_ordinal_type lc = colmap->getLocalElement(gid);
1927# if defined(BLOCKTRIDICONTAINER_DEBUG)
1928 TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
1929 BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
1930 << " gives an invalid local column.");
1931# endif
1932 col2row(lc) = lr;
1933 }
1934 });
1935#endif
1936 }
1937
1938 // construct the D and R graphs in A = D + R.
1939 {
1940 const auto local_graph = g->getLocalGraphHost();
1941 const auto local_graph_rowptr = local_graph.row_map;
1942 TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
1943 const auto local_graph_colidx = local_graph.entries;
1944
1945 //assume no overlap.
1946
1947 Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
1948 {
1949 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1950 Kokkos::parallel_for
1951 ("performSymbolicPhase::RangePolicy::lclrow2idx",
1952 policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
1953 lclrow2idx[lclrow(i)] = i;
1954 });
1955 }
1956
1957 // count (block) nnzs in D and R.
1959 typename sum_reducer_type::value_type sum_reducer_value;
1960 {
1961 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1962 Kokkos::parallel_reduce
1963 // profiling interface does not work
1964 (//"performSymbolicPhase::RangePolicy::count_nnz",
1965 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
1966 // LID -> index.
1967 const local_ordinal_type ri0 = lclrow2idx[lr];
1968 const local_ordinal_type pi0 = rowidx2part(ri0);
1969 for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
1970 const local_ordinal_type lc = local_graph_colidx(j);
1971 const local_ordinal_type lc2r = col2row[lc];
1972 bool incr_R = false;
1973 do { // breakable
1974 if (lc2r == (local_ordinal_type) -1) {
1975 incr_R = true;
1976 break;
1977 }
1978 const local_ordinal_type ri = lclrow2idx[lc2r];
1979 const local_ordinal_type pi = rowidx2part(ri);
1980 if (pi != pi0) {
1981 incr_R = true;
1982 break;
1983 }
1984 // Test for being in the tridiag. This is done in index space. In
1985 // LID space, tridiag LIDs in a row are not necessarily related by
1986 // {-1, 0, 1}.
1987 if (ri0 + 1 >= ri && ri0 <= ri + 1)
1988 ++update.v[0]; // D_nnz
1989 else
1990 incr_R = true;
1991 } while (0);
1992 if (incr_R) {
1993 if (lc < nrows) ++update.v[1]; // R_nnz_owned
1994 else ++update.v[2]; // R_nnz_remote
1995 }
1996 }
1997 }, sum_reducer_type(sum_reducer_value));
1998 }
1999 size_type D_nnz = sum_reducer_value.v[0];
2000 size_type R_nnz_owned = sum_reducer_value.v[1];
2001 size_type R_nnz_remote = sum_reducer_value.v[2];
2002
2003 if (!overlap_communication_and_computation) {
2004 R_nnz_owned += R_nnz_remote;
2005 R_nnz_remote = 0;
2006 }
2007
2008 // construct the D_00 graph.
2009 {
2010 const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
2011
2012 btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
2013 const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
2014
2015#if defined(BLOCKTRIDICONTAINER_DEBUG)
2016 Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2017#endif
2018
2019 const local_ordinal_type nparts = partptr.extent(0) - 1;
2020
2021 {
2022 const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
2023 Kokkos::parallel_for
2024 ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
2025 policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
2026 const local_ordinal_type part_ri0 = part2rowidx0(pi0);
2027 local_ordinal_type offset = 0;
2028 for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
2029 const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
2030 offset = 1;
2031 const local_ordinal_type lr0 = lclrow(ri0);
2032 const size_type j0 = local_graph_rowptr(lr0);
2033 for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
2034 const local_ordinal_type lc = local_graph_colidx(j);
2035 const local_ordinal_type lc2r = col2row[lc];
2036 if (lc2r == (local_ordinal_type) -1) continue;
2037 const local_ordinal_type ri = lclrow2idx[lc2r];
2038 const local_ordinal_type pi = rowidx2part(ri);
2039 if (pi != pi0) continue;
2040 if (ri + 1 < ri0 || ri > ri0 + 1) continue;
2041 const local_ordinal_type row_entry = j - j0;
2042 D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
2043 }
2044 }
2045 });
2046 }
2047#if defined(BLOCKTRIDICONTAINER_DEBUG)
2048 for (size_t i=0;i<D_A_colindsub.extent(0);++i)
2049 TEUCHOS_ASSERT(D_A_colindsub(i) != Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
2050#endif
2051 Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
2052
2053 // Allocate values.
2054 {
2055 const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
2056 const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
2057 btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
2058
2059 if (interf.n_subparts_per_part > 1) {
2060 const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
2061 const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
2062 btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
2063 }
2064
2065 if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
2066 }
2067 }
2068
2069 // Construct the R graph.
2070 {
2071 amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
2072 amd.A_colindsub = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub"), R_nnz_owned);
2073
2074 const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
2075 const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
2076
2077 amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
2078 amd.A_colindsub_remote = local_ordinal_type_1d_view(do_not_initialize_tag("amd.A_colindsub_remote"), R_nnz_remote);
2079
2080 const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
2081 const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
2082
2083 {
2084 const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
2085 Kokkos::parallel_for
2086 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
2087 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
2088 const local_ordinal_type ri0 = lclrow2idx[lr];
2089 const local_ordinal_type pi0 = rowidx2part(ri0);
2090 const size_type j0 = local_graph_rowptr(lr);
2091 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2092 const local_ordinal_type lc = local_graph_colidx(j);
2093 const local_ordinal_type lc2r = col2row[lc];
2094 if (lc2r != (local_ordinal_type) -1) {
2095 const local_ordinal_type ri = lclrow2idx[lc2r];
2096 const local_ordinal_type pi = rowidx2part(ri);
2097 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
2098 continue;
2099 }
2100 }
2101 // exclusive scan will be performed later
2102 if (!overlap_communication_and_computation || lc < nrows) {
2103 ++R_rowptr(lr);
2104 } else {
2105 ++R_rowptr_remote(lr);
2106 }
2107 }
2108 });
2109 }
2110
2111 // exclusive scan
2113 {
2114 Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
2115 Kokkos::parallel_scan
2116 ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
2117 policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
2118 update_type &update,
2119 const bool &final) {
2120 update_type val;
2121 val.v[0] = R_rowptr(lr);
2122 if (overlap_communication_and_computation)
2123 val.v[1] = R_rowptr_remote(lr);
2124
2125 if (final) {
2126 R_rowptr(lr) = update.v[0];
2127 if (overlap_communication_and_computation)
2128 R_rowptr_remote(lr) = update.v[1];
2129
2130 if (lr < nrows) {
2131 const local_ordinal_type ri0 = lclrow2idx[lr];
2132 const local_ordinal_type pi0 = rowidx2part(ri0);
2133
2134 size_type cnt_rowptr = R_rowptr(lr);
2135 size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // when not overlap_communication_and_computation, this value is garbage
2136
2137 const size_type j0 = local_graph_rowptr(lr);
2138 for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
2139 const local_ordinal_type lc = local_graph_colidx(j);
2140 const local_ordinal_type lc2r = col2row[lc];
2141 if (lc2r != (local_ordinal_type) -1) {
2142 const local_ordinal_type ri = lclrow2idx[lc2r];
2143 const local_ordinal_type pi = rowidx2part(ri);
2144 if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
2145 continue;
2146 }
2147 const local_ordinal_type row_entry = j - j0;
2148 if (!overlap_communication_and_computation || lc < nrows)
2149 R_A_colindsub(cnt_rowptr++) = row_entry;
2150 else
2151 R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
2152 }
2153 }
2154 }
2155 update += val;
2156 });
2157 }
2158 TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
2159 Kokkos::deep_copy(amd.rowptr, R_rowptr);
2160 Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
2161 if (overlap_communication_and_computation) {
2162 TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
2163 Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
2164 Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
2165 }
2166
2167 // Allocate or view values.
2168 if (hasBlockCrsMatrix)
2169 amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst());
2170 else {
2171 amd.tpetra_values = (const_cast<crs_matrix_type*>(A_crs.get()))->getLocalValuesDevice (Tpetra::Access::ReadWrite);
2172 }
2173 }
2174
2175 // Allocate view for E and initialize the values with B:
2176
2177 if (interf.n_subparts_per_part > 1)
2178 btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
2179 }
2180 // Precompute offsets of each A and x entry to speed up residual.
2181 // Applies if all of these are true:
2182 // - hasBlockCrsMatrix
2183 // - execution_space is a GPU
2184 // - !useSeqMethod (since this uses a different scheme for indexing A,x)
2185 //
2186 // Reading A, x take up to 4 and 6 levels of indirection respectively,
2187 // but precomputing the offsets reduces it to 2 for both (get index, then value)
2188 if(BlockHelperDetails::is_device<execution_space>::value && !useSeqMethod && hasBlockCrsMatrix)
2189 {
2190 bool is_async_importer_active = !async_importer.is_null();
2191 local_ordinal_type_1d_view dm2cm = is_async_importer_active ? async_importer->dm2cm : local_ordinal_type_1d_view();
2192 bool ownedRemoteSeparate = overlap_communication_and_computation || !is_async_importer_active;
2193 BlockHelperDetails::precompute_A_x_offsets<MatrixType>(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate);
2194 }
2195
2196 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
2197 }
2198
2199
2203 template<typename ArgActiveExecutionMemorySpace>
2205
2206 template<>
2207 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
2208 typedef KB::Mode::Serial mode_type;
2209#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2210 typedef KB::Algo::Level3::CompactMKL algo_type;
2211#else
2212 typedef KB::Algo::Level3::Blocked algo_type;
2213#endif
2214 static int recommended_team_size(const int /* blksize */,
2215 const int /* vector_length */,
2216 const int /* internal_vector_length */) {
2217 return 1;
2218 }
2219
2220 };
2221
2222#if defined(KOKKOS_ENABLE_CUDA)
2223 static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
2224 const int vector_length,
2225 const int internal_vector_length) {
2226 const int vector_size = vector_length/internal_vector_length;
2227 int total_team_size(0);
2228 if (blksize <= 5) total_team_size = 32;
2229 else if (blksize <= 9) total_team_size = 32; // 64
2230 else if (blksize <= 12) total_team_size = 96;
2231 else if (blksize <= 16) total_team_size = 128;
2232 else if (blksize <= 20) total_team_size = 160;
2233 else total_team_size = 160;
2234 return 2*total_team_size/vector_size;
2235 }
2236 template<>
2237 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2238 typedef KB::Mode::Team mode_type;
2239 typedef KB::Algo::Level3::Unblocked algo_type;
2240 static int recommended_team_size(const int blksize,
2241 const int vector_length,
2242 const int internal_vector_length) {
2243 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2244 }
2245 };
2246 template<>
2247 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2248 typedef KB::Mode::Team mode_type;
2249 typedef KB::Algo::Level3::Unblocked algo_type;
2250 static int recommended_team_size(const int blksize,
2251 const int vector_length,
2252 const int internal_vector_length) {
2253 return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2254 }
2255 };
2256#endif
2257
2258#if defined(KOKKOS_ENABLE_HIP)
2259 static inline int ExtractAndFactorizeRecommendedHIPTeamSize(const int blksize,
2260 const int vector_length,
2261 const int internal_vector_length) {
2262 const int vector_size = vector_length/internal_vector_length;
2263 int total_team_size(0);
2264 if (blksize <= 5) total_team_size = 32;
2265 else if (blksize <= 9) total_team_size = 32; // 64
2266 else if (blksize <= 12) total_team_size = 96;
2267 else if (blksize <= 16) total_team_size = 128;
2268 else if (blksize <= 20) total_team_size = 160;
2269 else total_team_size = 160;
2270 return 2*total_team_size/vector_size;
2271 }
2272 template<>
2273 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
2274 typedef KB::Mode::Team mode_type;
2275 typedef KB::Algo::Level3::Unblocked algo_type;
2276 static int recommended_team_size(const int blksize,
2277 const int vector_length,
2278 const int internal_vector_length) {
2279 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2280 }
2281 };
2282 template<>
2283 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
2284 typedef KB::Mode::Team mode_type;
2285 typedef KB::Algo::Level3::Unblocked algo_type;
2286 static int recommended_team_size(const int blksize,
2287 const int vector_length,
2288 const int internal_vector_length) {
2289 return ExtractAndFactorizeRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
2290 }
2291 };
2292#endif
2293
2294#if defined(KOKKOS_ENABLE_SYCL)
2295 static inline int ExtractAndFactorizeRecommendedSYCLTeamSize(const int blksize,
2296 const int vector_length,
2297 const int internal_vector_length) {
2298 const int vector_size = vector_length/internal_vector_length;
2299 int total_team_size(0);
2300 if (blksize <= 5) total_team_size = 32;
2301 else if (blksize <= 9) total_team_size = 32; // 64
2302 else if (blksize <= 12) total_team_size = 96;
2303 else if (blksize <= 16) total_team_size = 128;
2304 else if (blksize <= 20) total_team_size = 160;
2305 else total_team_size = 160;
2306 return 2*total_team_size/vector_size;
2307 }
2308 template<>
2309 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
2310 typedef KB::Mode::Team mode_type;
2311 typedef KB::Algo::Level3::Unblocked algo_type;
2312 static int recommended_team_size(const int blksize,
2313 const int vector_length,
2314 const int internal_vector_length) {
2315 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2316 }
2317 };
2318 template<>
2319 struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
2320 typedef KB::Mode::Team mode_type;
2321 typedef KB::Algo::Level3::Unblocked algo_type;
2322 static int recommended_team_size(const int blksize,
2323 const int vector_length,
2324 const int internal_vector_length) {
2325 return ExtractAndFactorizeRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
2326 }
2327 };
2328#endif
2329
2330 template<typename impl_type, typename WWViewType>
2331 KOKKOS_INLINE_FUNCTION
2332 void
2333 solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2334 const typename impl_type::local_ordinal_type &/* blocksize */,
2335 const typename impl_type::local_ordinal_type &i0,
2336 const typename impl_type::local_ordinal_type &r0,
2337 const typename impl_type::local_ordinal_type &nrows,
2338 const typename impl_type::local_ordinal_type &v,
2339 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2340 const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
2341 const WWViewType &WW,
2342 const bool skip_first_pass=false) {
2343 using execution_space = typename impl_type::execution_space;
2344 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2345 using member_type = typename team_policy_type::member_type;
2346 using local_ordinal_type = typename impl_type::local_ordinal_type;
2347
2348 typedef SolveTridiagsDefaultModeAndAlgo
2349 <typename execution_space::memory_space> default_mode_and_algo_type;
2350
2351 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2352 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2353
2354 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2355
2356 // constant
2357 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2358 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2359
2360 // subview pattern
2361 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2362 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2363 auto X2 = X1;
2364
2365 local_ordinal_type i = i0, r = r0;
2366
2367
2368 if (nrows > 1) {
2369 // solve Lx = x
2370 if (skip_first_pass) {
2371 i += (nrows-2) * 3;
2372 r += (nrows-2);
2373 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2374 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2375 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2376 KB::Trsm<member_type,
2377 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2378 default_mode_type,default_algo_type>
2379 ::invoke(member, one, A, X2);
2380 X1.assign_data( X2.data() );
2381 i+=3;
2382 }
2383 else {
2384 KB::Trsm<member_type,
2385 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2386 default_mode_type,default_algo_type>
2387 ::invoke(member, one, A, X1);
2388 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2389 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2390 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2391 member.team_barrier();
2392 KB::Gemm<member_type,
2393 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2394 default_mode_type,default_algo_type>
2395 ::invoke(member, -one, A, X1, one, X2);
2396 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2397 KB::Trsm<member_type,
2398 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2399 default_mode_type,default_algo_type>
2400 ::invoke(member, one, A, X2);
2401 X1.assign_data( X2.data() );
2402 }
2403 }
2404
2405 // solve Ux = x
2406 KB::Trsm<member_type,
2407 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2408 default_mode_type,default_algo_type>
2409 ::invoke(member, one, A, X1);
2410 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2411 i -= 3;
2412 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2413 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2414 member.team_barrier();
2415 KB::Gemm<member_type,
2416 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2417 default_mode_type,default_algo_type>
2418 ::invoke(member, -one, A, X1, one, X2);
2419
2420 A.assign_data( &D_internal_vector_values(i,0,0,v) );
2421 KB::Trsm<member_type,
2422 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2423 default_mode_type,default_algo_type>
2424 ::invoke(member, one, A, X2);
2425 X1.assign_data( X2.data() );
2426 }
2427 } else {
2428 // matrix is already inverted
2429 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2430 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2431 ::invoke(member, X1, W);
2432 member.team_barrier();
2433 KB::Gemm<member_type,
2434 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2435 default_mode_type,default_algo_type>
2436 ::invoke(member, one, A, W, zero, X1);
2437 }
2438
2439 }
2440
2441 template<typename impl_type, typename WWViewType, typename XViewType>
2442 KOKKOS_INLINE_FUNCTION
2443 void
2444 solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
2445 const typename impl_type::local_ordinal_type &blocksize,
2446 const typename impl_type::local_ordinal_type &i0,
2447 const typename impl_type::local_ordinal_type &r0,
2448 const typename impl_type::local_ordinal_type &nrows,
2449 const typename impl_type::local_ordinal_type &v,
2450 const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
2451 const XViewType &X_internal_vector_values, //Unmanaged<typename impl_type::internal_vector_type_4d_view>
2452 const WWViewType &WW) {
2453 using execution_space = typename impl_type::execution_space;
2454 //using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2455 //using member_type = typename team_policy_type::member_type;
2456 using local_ordinal_type = typename impl_type::local_ordinal_type;
2457
2458 typedef SolveTridiagsDefaultModeAndAlgo
2459 <typename execution_space::memory_space> default_mode_and_algo_type;
2460
2461 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
2462 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2463
2464 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2465
2466 // base pointers
2467 auto A = D_internal_vector_values.data();
2468 auto X = X_internal_vector_values.data();
2469
2470 // constant
2471 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
2472 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
2473 //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2474
2475 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2476 const local_ordinal_type astep = D_internal_vector_values.stride_0();
2477 const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2478 const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2479 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2480 const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2481
2482 // move to starting point
2483 A += i0*astep + v;
2484 X += r0*xstep + v;
2485
2486 //for (local_ordinal_type col=0;col<num_vectors;++col)
2487 if (nrows > 1) {
2488 // solve Lx = x
2489 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2490 (default_mode_type,default_algo_type,
2491 member,
2492 KB::Diag::Unit,
2493 blocksize,blocksize,
2494 one,
2495 A, as0, as1,
2496 X, xs0);
2497
2498 for (local_ordinal_type tr=1;tr<nrows;++tr) {
2499 member.team_barrier();
2500 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2501 (default_mode_type,default_algo_type,
2502 member,
2503 blocksize, blocksize,
2504 -one,
2505 A+2*astep, as0, as1,
2506 X, xs0,
2507 one,
2508 X+1*xstep, xs0);
2509 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2510 (default_mode_type,default_algo_type,
2511 member,
2512 KB::Diag::Unit,
2513 blocksize,blocksize,
2514 one,
2515 A+3*astep, as0, as1,
2516 X+1*xstep, xs0);
2517
2518 A += 3*astep;
2519 X += 1*xstep;
2520 }
2521
2522 // solve Ux = x
2523 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2524 (default_mode_type,default_algo_type,
2525 member,
2526 KB::Diag::NonUnit,
2527 blocksize, blocksize,
2528 one,
2529 A, as0, as1,
2530 X, xs0);
2531
2532 for (local_ordinal_type tr=nrows;tr>1;--tr) {
2533 A -= 3*astep;
2534 member.team_barrier();
2535 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2536 (default_mode_type,default_algo_type,
2537 member,
2538 blocksize, blocksize,
2539 -one,
2540 A+1*astep, as0, as1,
2541 X, xs0,
2542 one,
2543 X-1*xstep, xs0);
2544 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2545 (default_mode_type,default_algo_type,
2546 member,
2547 KB::Diag::NonUnit,
2548 blocksize, blocksize,
2549 one,
2550 A, as0, as1,
2551 X-1*xstep,xs0);
2552 X -= 1*xstep;
2553 }
2554 // for multiple rhs
2555 //X += xs1;
2556 } else {
2557 const local_ordinal_type ws0 = WW.stride_0();
2558 auto W = WW.data() + v;
2559 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2560 (default_mode_type,
2561 member, blocksize, X, xs0, W, ws0);
2562 member.team_barrier();
2563 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2564 (default_mode_type,default_algo_type,
2565 member,
2566 blocksize, blocksize,
2567 one,
2568 A, as0, as1,
2569 W, xs0,
2570 zero,
2571 X, xs0);
2572 }
2573 }
2574
2575 template<typename local_ordinal_type, typename ViewType>
2576 void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2577#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2578 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2579 std::ofstream myfile;
2580 myfile.open (fileName);
2581
2582 const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2583 local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
2584 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2585 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2586
2587 const local_ordinal_type block_size = scalar_values.extent(1);
2588
2589 const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
2590 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2591
2592 const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2593
2594 myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
2595 myfile << "%%nnz = " << nnz;
2596 myfile << " block size = " << block_size;
2597 myfile << " number of blocks = " << n_blocks;
2598 myfile << " number of parts = " << n_parts;
2599 myfile << " number of blocks per part = " << n_blocks_per_part;
2600 myfile << " number of rows = " << n_rows ;
2601 myfile << " number of cols = " << n_rows;
2602 myfile << " number of packs = " << n_packs << std::endl;
2603
2604 myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
2605
2606 local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
2607 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2608 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2609 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2610 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2611 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2612 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2613 continue;
2614 if (i_block_in_part % 3 == 0) {
2615 current_row_offset = i_block_in_part/3 * block_size;
2616 current_col_offset = i_block_in_part/3 * block_size;
2617 }
2618 else if (i_block_in_part % 3 == 1) {
2619 current_row_offset = (i_block_in_part-1)/3 * block_size;
2620 current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
2621 }
2622 else if (i_block_in_part % 3 == 2) {
2623 current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
2624 current_col_offset = (i_block_in_part-2)/3 * block_size;
2625 }
2626 current_row_offset += current_part_idx * n_rows_per_part;
2627 current_col_offset += current_part_idx * n_rows_per_part;
2628 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2629 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2630 current_row = current_row_offset + i_in_block + 1;
2631 current_col = current_col_offset + j_in_block + 1;
2632 myfile << current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2633 }
2634 }
2635 }
2636 }
2637 }
2638
2639 myfile.close();
2640#endif
2641 }
2642
2643 template<typename local_ordinal_type, typename ViewType>
2644 void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2645#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2646 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2647 std::ofstream myfile;
2648 myfile.open (fileName);
2649
2650 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
2651 const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
2652 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2653
2654 const local_ordinal_type block_size = scalar_values.extent(1);
2655 const local_ordinal_type n_cols = scalar_values.extent(2);
2656
2657 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2658 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2659
2660 const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2661
2662
2663 myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2664 myfile << "%%block size = " << block_size;
2665 myfile << " number of blocks = " << n_blocks;
2666 myfile << " number of parts = " << n_parts;
2667 myfile << " number of blocks per part = " << n_blocks_per_part;
2668 myfile << " number of rows = " << n_rows ;
2669 myfile << " number of cols = " << n_cols;
2670 myfile << " number of packs = " << n_packs << std::endl;
2671
2672 myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2673
2674 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2675 (void) current_row_offset;
2676 (void) current_part_idx;
2677 for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {
2678 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2679 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2680 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2681 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2682 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2683
2684 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(0))
2685 continue;
2686 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2687 myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2688 }
2689 }
2690 }
2691 }
2692 }
2693 myfile.close();
2694#endif
2695 }
2696
2697 template<typename local_ordinal_type, typename ViewType>
2698 void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
2699#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
2700 auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
2701 std::ofstream myfile;
2702 myfile.open (fileName);
2703
2704 const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(4) ? n_parts : scalar_values.extent(4);
2705 const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
2706 const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
2707
2708 const local_ordinal_type block_size = scalar_values.extent(2);
2709 const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
2710 const local_ordinal_type n_cols = n_blocks_cols * block_size;
2711
2712 const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
2713 const local_ordinal_type n_rows = n_rows_per_part*n_parts;
2714
2715 const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
2716
2717 myfile << "%%MatrixMarket matrix array real general"<< std::endl;
2718 myfile << "%%block size = " << block_size;
2719 myfile << " number of blocks = " << n_blocks;
2720 myfile << " number of parts = " << n_parts;
2721 myfile << " number of blocks per part = " << n_blocks_per_part;
2722 myfile << " number of rows = " << n_rows ;
2723 myfile << " number of cols = " << n_cols;
2724 myfile << " number of packs = " << n_packs << std::endl;
2725
2726 myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;
2727
2728 local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
2729 (void) current_row_offset;
2730 (void) current_part_idx;
2731 for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
2732 for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
2733 for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
2734 for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
2735 current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
2736 for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
2737 current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
2738
2739 if (current_block_idx >= (local_ordinal_type) scalar_values.extent(1))
2740 continue;
2741 for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
2742 myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
2743 }
2744 }
2745 }
2746 }
2747 }
2748 }
2749 myfile.close();
2750#endif
2751 }
2752
2753 template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
2754 KOKKOS_INLINE_FUNCTION
2755 void
2756 copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
2757/*
2758 // Kokkos::Experimental::local_deep_copy
2759 auto teamVectorRange =
2760 Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
2761 member, view1.extent(0), view1.extent(1), view1.extent(2));
2762
2763 Kokkos::parallel_for
2764 (teamVectorRange,
2765 [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
2766 view1(i,j,k) = view2(i,j,k);
2767 });
2768*/
2769 Kokkos::Experimental::local_deep_copy(member, view1, view2);
2770 }
2771 template<typename MatrixType, int ScratchLevel>
2772 struct ExtractAndFactorizeTridiags {
2773 public:
2774 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
2775 // a functor cannot have both device_type and execution_space; specialization error in kokkos
2776 using execution_space = typename impl_type::execution_space;
2777 using memory_space = typename impl_type::memory_space;
2779 using local_ordinal_type = typename impl_type::local_ordinal_type;
2780 using size_type = typename impl_type::size_type;
2781 using impl_scalar_type = typename impl_type::impl_scalar_type;
2782 using magnitude_type = typename impl_type::magnitude_type;
2784 using row_matrix_type = typename impl_type::tpetra_row_matrix_type;
2785 using crs_graph_type = typename impl_type::tpetra_crs_graph_type;
2787 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2788 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
2789 using size_type_2d_view = typename impl_type::size_type_2d_view;
2790 using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
2792 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
2793 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
2794 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2795 using vector_type_4d_view = typename impl_type::vector_type_4d_view;
2796 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2797 using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
2798 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
2799 using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
2800 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2801 using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
2802
2803 using internal_vector_type = typename impl_type::internal_vector_type;
2804 static constexpr int vector_length = impl_type::vector_length;
2805 static constexpr int internal_vector_length = impl_type::internal_vector_length;
2806 static_assert(vector_length >= internal_vector_length, "Ifpack2 BlockTriDi Numeric: vector_length must be at least as large as internal_vector_length");
2807 static_assert(vector_length % internal_vector_length == 0, "Ifpack2 BlockTriDi Numeric: vector_length must be divisible by internal_vector_length");
2808
2810 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2811 using member_type = typename team_policy_type::member_type;
2812
2813 private:
2814 // part interface
2815 const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packptr_sub;
2816 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub, packindices_schur;
2817 const local_ordinal_type max_partsz;
2818 // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
2819 using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
2820 ConstUnmanaged<size_type_1d_view_tpetra> A_block_rowptr;
2821 ConstUnmanaged<size_type_1d_view_tpetra> A_point_rowptr;
2822 ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
2823 // block tridiags
2824 const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
2825 const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
2826 const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
2827 const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
2828 const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
2829 const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
2830 // shared information
2831 const local_ordinal_type blocksize, blocksize_square;
2832 // diagonal safety
2833 const magnitude_type tiny;
2834 const local_ordinal_type vector_loop_size;
2835
2836 bool hasBlockCrsMatrix;
2837
2838 public:
2839 ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
2840 const BlockHelperDetails::PartInterface<MatrixType> &interf_,
2841 const Teuchos::RCP<const row_matrix_type> &A_,
2842 const Teuchos::RCP<const crs_graph_type> &G_,
2843 const magnitude_type& tiny_) :
2844 // interface
2845 partptr(interf_.partptr),
2846 lclrow(interf_.lclrow),
2847 packptr(interf_.packptr),
2848 packindices_sub(interf_.packindices_sub),
2849 packptr_sub(interf_.packptr_sub),
2850 partptr_sub(interf_.partptr_sub),
2851 part2packrowidx0_sub(interf_.part2packrowidx0_sub),
2852 packindices_schur(interf_.packindices_schur),
2853 max_partsz(interf_.max_partsz),
2854 // block tridiags
2855 pack_td_ptr(btdm_.pack_td_ptr),
2856 flat_td_ptr(btdm_.flat_td_ptr),
2857 pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
2858 A_colindsub(btdm_.A_colindsub),
2859 internal_vector_values((internal_vector_type*)btdm_.values.data(),
2860 btdm_.values.extent(0),
2861 btdm_.values.extent(1),
2862 btdm_.values.extent(2),
2863 vector_length/internal_vector_length),
2864 internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
2865 btdm_.values_schur.extent(0),
2866 btdm_.values_schur.extent(1),
2867 btdm_.values_schur.extent(2),
2868 vector_length/internal_vector_length),
2869 e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
2870 btdm_.e_values.extent(0),
2871 btdm_.e_values.extent(1),
2872 btdm_.e_values.extent(2),
2873 btdm_.e_values.extent(3),
2874 vector_length/internal_vector_length),
2875 scalar_values((btdm_scalar_type*)btdm_.values.data(),
2876 btdm_.values.extent(0),
2877 btdm_.values.extent(1),
2878 btdm_.values.extent(2),
2879 vector_length),
2880 scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
2881 btdm_.values_schur.extent(0),
2882 btdm_.values_schur.extent(1),
2883 btdm_.values_schur.extent(2),
2884 vector_length),
2885 e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
2886 btdm_.e_values.extent(0),
2887 btdm_.e_values.extent(1),
2888 btdm_.e_values.extent(2),
2889 btdm_.e_values.extent(3),
2890 vector_length),
2891 blocksize(btdm_.values.extent(1)),
2892 blocksize_square(blocksize*blocksize),
2893 // diagonal weight to avoid zero pivots
2894 tiny(tiny_),
2895 vector_loop_size(vector_length/internal_vector_length) {
2896 using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type;
2897 using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
2898
2899 auto A_crs = Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_);
2900 auto A_bcrs = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(A_);
2901
2902 hasBlockCrsMatrix = ! A_bcrs.is_null ();
2903
2904 A_block_rowptr = G_->getLocalGraphDevice().row_map;
2905 if (hasBlockCrsMatrix) {
2906 A_values = const_cast<block_crs_matrix_type*>(A_bcrs.get())->getValuesDeviceNonConst();
2907 }
2908 else {
2909 A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map;
2910 A_values = A_crs->getLocalValuesDevice (Tpetra::Access::ReadOnly);
2911 }
2912 }
2913
2914 private:
2915
2916 KOKKOS_INLINE_FUNCTION
2917 void
2918 extract(local_ordinal_type partidx,
2919 local_ordinal_type local_subpartidx,
2920 local_ordinal_type npacks) const {
2921#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2922 printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
2923#endif
2924 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
2925 const size_type kps = pack_td_ptr(partidx, local_subpartidx);
2926 local_ordinal_type kfs[vector_length] = {};
2927 local_ordinal_type ri0[vector_length] = {};
2928 local_ordinal_type nrows[vector_length] = {};
2929
2930 for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
2931 kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
2932 ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
2933 nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
2934#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2935 printf("kfs[%d] = %d;\n", vi, kfs[vi]);
2936 printf("ri0[%d] = %d;\n", vi, ri0[vi]);
2937 printf("nrows[%d] = %d;\n", vi, nrows[vi]);
2938#endif
2939 }
2940 local_ordinal_type tr_min = 0;
2941 local_ordinal_type tr_max = nrows[0];
2942 if (local_subpartidx % 2 == 1) {
2943 tr_min -= 1;
2944 tr_max += 1;
2945 }
2946#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2947 printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
2948#endif
2949 for (local_ordinal_type tr=tr_min,j=0;tr<tr_max;++tr) {
2950 for (local_ordinal_type e=0;e<3;++e) {
2951 if (hasBlockCrsMatrix) {
2952 const impl_scalar_type* block[vector_length] = {};
2953 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2954 const size_type Aj = A_block_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
2955
2956 block[vi] = &A_values(Aj*blocksize_square);
2957 }
2958 const size_type pi = kps + j;
2959#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
2960 printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d\n", pi, ri0[0] + tr, kfs[0] + j);
2961#endif
2962 ++j;
2963 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2964 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2965 const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
2966 auto& v = internal_vector_values(pi, ii, jj, 0);
2967 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2968 v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
2969 }
2970 }
2971 }
2972 }
2973 else {
2974 const size_type pi = kps + j;
2975
2976 for (local_ordinal_type vi=0;vi<npacks;++vi) {
2977 const size_type Aj_c = A_colindsub(kfs[vi] + j);
2978
2979 for (local_ordinal_type ii=0;ii<blocksize;++ii) {
2980 auto point_row_offset = A_point_rowptr(lclrow(ri0[vi] + tr)*blocksize + ii);
2981
2982 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
2983 scalar_values(pi, ii, jj, vi) = A_values(point_row_offset + Aj_c*blocksize + jj);
2984 }
2985 }
2986 }
2987 ++j;
2988 }
2989 if (nrows[0] == 1) break;
2990 if (local_subpartidx % 2 == 0) {
2991 if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
2992 for (local_ordinal_type vi=1;vi<npacks;++vi) {
2993 if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
2994 npacks = vi;
2995 break;
2996 }
2997 }
2998 }
2999 else {
3000 if (e == 0 && (tr == -1 || tr == nrows[0])) break;
3001 for (local_ordinal_type vi=1;vi<npacks;++vi) {
3002 if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
3003 npacks = vi;
3004 break;
3005 }
3006 }
3007 }
3008 }
3009 }
3010 }
3011
3012 KOKKOS_INLINE_FUNCTION
3013 void
3014 extract(const member_type &member,
3015 const local_ordinal_type &partidxbeg,
3016 local_ordinal_type local_subpartidx,
3017 const local_ordinal_type &npacks,
3018 const local_ordinal_type &vbeg) const {
3019#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3020 printf("extract partidxbeg = %d, local_subpartidx = %d, npacks = %d, vbeg = %d;\n", partidxbeg, local_subpartidx, npacks, vbeg);
3021#endif
3022 using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
3023 local_ordinal_type kfs_vals[internal_vector_length] = {};
3024 local_ordinal_type ri0_vals[internal_vector_length] = {};
3025 local_ordinal_type nrows_vals[internal_vector_length] = {};
3026
3027 const size_type kps = pack_td_ptr(partidxbeg,local_subpartidx);
3028 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3029 kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,local_subpartidx);
3030 ri0_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,0);
3031 nrows_vals[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidxbeg+vi,1) - ri0_vals[vi];
3032#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3033 printf("kfs_vals[%d] = %d;\n", vi, kfs_vals[vi]);
3034 printf("ri0_vals[%d] = %d;\n", vi, ri0_vals[vi]);
3035 printf("nrows_vals[%d] = %d;\n", vi, nrows_vals[vi]);
3036#endif
3037 }
3038
3039 local_ordinal_type j_vals[internal_vector_length] = {};
3040
3041 local_ordinal_type tr_min = 0;
3042 local_ordinal_type tr_max = nrows_vals[0];
3043 if (local_subpartidx % 2 == 1) {
3044 tr_min -= 1;
3045 tr_max += 1;
3046 }
3047#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3048 printf("tr_min = %d and tr_max = %d;\n", tr_min, tr_max);
3049#endif
3050 for (local_ordinal_type tr=tr_min;tr<tr_max;++tr) {
3051 for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
3052 const local_ordinal_type nrows = (local_subpartidx % 2 == 0 ? nrows_vals[vi] : nrows_vals[vi]);
3053 if ((local_subpartidx % 2 == 0 && tr < nrows) || (local_subpartidx % 2 == 1 && tr < nrows+1)) {
3054 auto &j = j_vals[vi];
3055 const local_ordinal_type kfs = kfs_vals[vi];
3056 const local_ordinal_type ri0 = ri0_vals[vi];
3057 local_ordinal_type lbeg, lend;
3058 if (local_subpartidx % 2 == 0) {
3059 lbeg = (tr == tr_min ? 1 : 0);
3060 lend = (tr == nrows - 1 ? 2 : 3);
3061 }
3062 else {
3063 lbeg = 0;
3064 lend = 3;
3065 if (tr == tr_min) {
3066 lbeg = 1;
3067 lend = 2;
3068 }
3069 else if (tr == nrows) {
3070 lbeg = 0;
3071 lend = 1;
3072 }
3073 }
3074 if (hasBlockCrsMatrix) {
3075 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3076 const size_type Aj = A_block_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
3077 const impl_scalar_type* block = &A_values(Aj*blocksize_square);
3078 const size_type pi = kps + j;
3079#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3080 printf("Extract pi = %ld, ri0 + tr = %d, kfs + j = %d, tr = %d, lbeg = %d, lend = %d, l = %d\n", pi, ri0 + tr, kfs + j, tr, lbeg, lend, l);
3081#endif
3082 Kokkos::parallel_for
3083 (Kokkos::TeamThreadRange(member,blocksize),
3084 [&](const local_ordinal_type &ii) {
3085 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3086 scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
3087 }
3088 });
3089 }
3090 }
3091 else {
3092 for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
3093 const size_type Aj_c = A_colindsub(kfs + j);
3094 const size_type pi = kps + j;
3095 Kokkos::parallel_for
3096 (Kokkos::TeamThreadRange(member,blocksize),
3097 [&](const local_ordinal_type &ii) {
3098 auto point_row_offset = A_point_rowptr(lclrow(ri0 + tr)*blocksize + ii);
3099 for (local_ordinal_type jj=0;jj<blocksize;++jj) {
3100 scalar_values(pi, ii, jj, v) = A_values(point_row_offset + Aj_c*blocksize + jj);
3101 }
3102 });
3103 }
3104 }
3105 }
3106 }
3107 }
3108 }
3109
3110 template<typename AAViewType,
3111 typename WWViewType>
3112 KOKKOS_INLINE_FUNCTION
3113 void
3114 factorize_subline(const member_type &member,
3115 const local_ordinal_type &i0,
3116 const local_ordinal_type &nrows,
3117 const local_ordinal_type &v,
3118 const AAViewType &AA,
3119 const WWViewType &WW) const {
3120
3121 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3122 <typename execution_space::memory_space> default_mode_and_algo_type;
3123
3124 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3125 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3126
3127 // constant
3128 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3129
3130#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3131 printf("i0 = %d, nrows = %d, v = %d, AA.extent(0) = %ld;\n", i0, nrows, v, AA.extent(0));
3132#endif
3133
3134 // subview pattern
3135 auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
3136 KB::LU<member_type,
3137 default_mode_type,KB::Algo::LU::Unblocked>
3138 ::invoke(member, A , tiny);
3139
3140 if (nrows > 1) {
3141 auto B = A;
3142 auto C = A;
3143 local_ordinal_type i = i0;
3144 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
3145#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3146 printf("tr = %d, i = %d;\n", tr, i);
3147#endif
3148 B.assign_data( &AA(i+1,0,0,v) );
3149 KB::Trsm<member_type,
3150 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3151 default_mode_type,default_algo_type>
3152 ::invoke(member, one, A, B);
3153 C.assign_data( &AA(i+2,0,0,v) );
3154 KB::Trsm<member_type,
3155 KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3156 default_mode_type,default_algo_type>
3157 ::invoke(member, one, A, C);
3158 A.assign_data( &AA(i+3,0,0,v) );
3159
3160 member.team_barrier();
3161 KB::Gemm<member_type,
3162 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3163 default_mode_type,default_algo_type>
3164 ::invoke(member, -one, C, B, one, A);
3165 KB::LU<member_type,
3166 default_mode_type,KB::Algo::LU::Unblocked>
3167 ::invoke(member, A, tiny);
3168 }
3169 } else {
3170 // for block jacobi invert a matrix here
3171 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
3172 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
3173 ::invoke(member, A, W);
3174 KB::SetIdentity<member_type,default_mode_type>
3175 ::invoke(member, A);
3176 member.team_barrier();
3177 KB::Trsm<member_type,
3178 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
3179 default_mode_type,default_algo_type>
3180 ::invoke(member, one, W, A);
3181 KB::Trsm<member_type,
3182 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
3183 default_mode_type,default_algo_type>
3184 ::invoke(member, one, W, A);
3185 }
3186 }
3187
3188 public:
3189
3190 struct ExtractAndFactorizeSubLineTag {};
3191 struct ExtractBCDTag {};
3192 struct ComputeETag {};
3193 struct ComputeSchurTag {};
3194 struct FactorizeSchurTag {};
3195
3196 KOKKOS_INLINE_FUNCTION
3197 void
3198 operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
3199 // btdm is packed and sorted from largest one
3200 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3201
3202 const local_ordinal_type subpartidx = packptr_sub(packidx);
3203 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3204 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3205 const local_ordinal_type partidx = subpartidx%n_parts;
3206
3207 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3208 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3209 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3210
3211 internal_vector_scratch_type_3d_view
3212 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3213
3214#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3215 printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
3216 printf("vector_loop_size = %d\n", vector_loop_size);
3217#endif
3218
3219 if (vector_loop_size == 1) {
3220 extract(partidx, local_subpartidx, npacks);
3221 factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
3222 } else {
3223 Kokkos::parallel_for
3224 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3225 [&](const local_ordinal_type &v) {
3226 const local_ordinal_type vbeg = v*internal_vector_length;
3227#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3228 printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3229#endif
3230 if (vbeg < npacks)
3231 extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3232 // this is not safe if vector loop size is different from vector size of
3233 // the team policy. we always make sure this when constructing the team policy
3234 member.team_barrier();
3235 factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
3236 });
3237 }
3238 }
3239
3240 KOKKOS_INLINE_FUNCTION
3241 void
3242 operator() (const ExtractBCDTag &, const member_type &member) const {
3243 // btdm is packed and sorted from largest one
3244 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3245 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3246 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3247
3248 const local_ordinal_type subpartidx = packptr_sub(packidx);
3249 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3250 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3251 const local_ordinal_type partidx = subpartidx%n_parts;
3252
3253 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3254 //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3255 //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3256
3257 if (vector_loop_size == 1) {
3258 extract(partidx, local_subpartidx, npacks);
3259 }
3260 else {
3261 Kokkos::parallel_for
3262 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3263 [&](const local_ordinal_type &v) {
3264 const local_ordinal_type vbeg = v*internal_vector_length;
3265#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3266 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3267 printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
3268#endif
3269 if (vbeg < npacks)
3270 extract(member, partidx+vbeg, local_subpartidx, npacks, vbeg);
3271 });
3272 }
3273
3274 member.team_barrier();
3275
3276 const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
3277 const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
3278
3279 const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3280 const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3281
3282#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3283 printf("Copy for Schur complement part id = %d from kps1 = %ld to r1 = %d and from kps2 = %ld to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
3284#endif
3285
3286 // Need to copy D to e_internal_vector_values.
3287 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3288 Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3289
3290 copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3291 Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3292
3293 }
3294
3295 KOKKOS_INLINE_FUNCTION
3296 void
3297 operator() (const ComputeETag &, const member_type &member) const {
3298 // btdm is packed and sorted from largest one
3299 const local_ordinal_type packidx = packindices_sub(member.league_rank());
3300
3301 const local_ordinal_type subpartidx = packptr_sub(packidx);
3302 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3303 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3304 const local_ordinal_type partidx = subpartidx%n_parts;
3305
3306 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3307 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3308 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3309 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3310 const local_ordinal_type num_vectors = blocksize;
3311
3312 (void) npacks;
3313
3314 internal_vector_scratch_type_3d_view
3315 WW(member.team_scratch(ScratchLevel), blocksize, num_vectors, vector_loop_size);
3316 if (local_subpartidx == 0) {
3317 Kokkos::parallel_for
3318 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3319 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3320 });
3321 }
3322 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
3323 Kokkos::parallel_for
3324 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3325 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3326 });
3327 }
3328 else {
3329 Kokkos::parallel_for
3330 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3331 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
3332 solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
3333 });
3334 }
3335 }
3336
3337 KOKKOS_INLINE_FUNCTION
3338 void
3339 operator() (const ComputeSchurTag &, const member_type &member) const {
3340 // btdm is packed and sorted from largest one
3341 const local_ordinal_type packindices_schur_i = member.league_rank() % packindices_schur.extent(0);
3342 const local_ordinal_type packindices_schur_j = member.league_rank() / packindices_schur.extent(0);
3343 const local_ordinal_type packidx = packindices_schur(packindices_schur_i, packindices_schur_j);
3344
3345 const local_ordinal_type subpartidx = packptr_sub(packidx);
3346 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3347 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
3348 const local_ordinal_type partidx = subpartidx%n_parts;
3349
3350 //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
3351 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
3352 //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
3353 //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
3354
3355 // Compute S = D - C E
3356
3357 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
3358 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
3359 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
3360
3361 for (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
3362 copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
3363 Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
3364 }
3365
3366 member.team_barrier();
3367
3368 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
3369
3370 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
3371 const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
3372
3373 const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
3374 const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
3375
3376 typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
3377 <typename execution_space::memory_space> default_mode_and_algo_type;
3378
3379 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
3380 typedef typename default_mode_and_algo_type::algo_type default_algo_type;
3381
3382 Kokkos::parallel_for
3383 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
3384 for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
3385 local_ordinal_type e_r, e_c, c_kps;
3386
3387 if ( local_subpartidx_schur == 0 ) {
3388 if ( i == 0 ) {
3389 e_r = e_r1;
3390 e_c = 0;
3391 c_kps = c_kps1;
3392 }
3393 else if ( i == 3 ) {
3394 e_r = e_r2;
3395 e_c = 1;
3396 c_kps = c_kps2;
3397 }
3398 else if ( i == 4 ) {
3399 e_r = e_r2;
3400 e_c = 0;
3401 c_kps = c_kps2;
3402 }
3403 else {
3404 continue;
3405 }
3406 }
3407 else {
3408 if ( i == 0 ) {
3409 e_r = e_r1;
3410 e_c = 1;
3411 c_kps = c_kps1;
3412 }
3413 else if ( i == 1 ) {
3414 e_r = e_r1;
3415 e_c = 0;
3416 c_kps = c_kps1;
3417 }
3418 else if ( i == 4 ) {
3419 e_r = e_r2;
3420 e_c = 1;
3421 c_kps = c_kps2;
3422 }
3423 else if ( i == 5 ) {
3424 e_r = e_r2;
3425 e_c = 0;
3426 c_kps = c_kps2;
3427 }
3428 else {
3429 continue;
3430 }
3431 }
3432
3433 auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
3434 auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
3435 auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
3436 KB::Gemm<member_type,
3437 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
3438 default_mode_type,default_algo_type>
3439 ::invoke(member, -one, C, E, one, S);
3440 }
3441 });
3442 }
3443
3444 KOKKOS_INLINE_FUNCTION
3445 void
3446 operator() (const FactorizeSchurTag &, const member_type &member) const {
3447 const local_ordinal_type packidx = packindices_schur(member.league_rank(), 0);
3448
3449 const local_ordinal_type subpartidx = packptr_sub(packidx);
3450
3451 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3452 const local_ordinal_type partidx = subpartidx%n_parts;
3453
3454 const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
3455 const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
3456
3457 internal_vector_scratch_type_3d_view
3458 WW(member.team_scratch(ScratchLevel), blocksize, blocksize, vector_loop_size);
3459
3460#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3461 printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d, vector_loop_size = %d;\n", member.league_rank(), i0, nrows, vector_loop_size);
3462#endif
3463
3464 if (vector_loop_size == 1) {
3465 factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
3466 } else {
3467 Kokkos::parallel_for
3468 (Kokkos::ThreadVectorRange(member, vector_loop_size),
3469 [&](const local_ordinal_type &v) {
3470 factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
3471 });
3472 }
3473 }
3474
3475 void run() {
3476 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3477 const local_ordinal_type team_size =
3478 ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
3479 recommended_team_size(blocksize, vector_length, internal_vector_length);
3480 const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
3481 shmem_size(blocksize, blocksize, vector_loop_size);
3482
3483 {
3484#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3485 printf("Start ExtractAndFactorizeSubLineTag\n");
3486#endif
3487 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag", ExtractAndFactorizeSubLineTag0);
3488 Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
3489 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3490
3491
3492 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
3493 writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
3494
3495 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3496 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
3497 policy, *this);
3498 execution_space().fence();
3499
3500 writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
3501#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3502 printf("End ExtractAndFactorizeSubLineTag\n");
3503#endif
3504 }
3505
3506 if (packindices_schur.extent(1) > 0)
3507 {
3508 {
3509#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3510 printf("Start ExtractBCDTag\n");
3511#endif
3512 Kokkos::deep_copy(e_scalar_values, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3513 Kokkos::deep_copy(scalar_values_schur, Kokkos::ArithTraits<btdm_magnitude_type>::zero());
3514
3515 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
3516
3517 {
3518 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag", ExtractBCDTag0);
3519 Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
3520 policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3521
3522 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3523 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
3524 policy, *this);
3525 execution_space().fence();
3526 }
3527
3528#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3529 printf("End ExtractBCDTag\n");
3530#endif
3531 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
3532#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3533 printf("Start ComputeETag\n");
3534#endif
3535 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
3536 {
3537 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag", ComputeETag0);
3538 Kokkos::TeamPolicy<execution_space,ComputeETag>
3539 policy(packindices_sub.extent(0), team_size, vector_loop_size);
3540
3541 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3542 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
3543 policy, *this);
3544 execution_space().fence();
3545 }
3546 write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
3547
3548#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3549 printf("End ComputeETag\n");
3550#endif
3551 }
3552
3553 {
3554#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3555 printf("Start ComputeSchurTag\n");
3556#endif
3557 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag", ComputeSchurTag0);
3558 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
3559 Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
3560 policy(packindices_schur.extent(0)*packindices_schur.extent(1), team_size, vector_loop_size);
3561
3562 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
3563 policy, *this);
3564 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
3565 execution_space().fence();
3566#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3567 printf("End ComputeSchurTag\n");
3568#endif
3569 }
3570
3571 {
3572#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3573 printf("Start FactorizeSchurTag\n");
3574#endif
3575 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag", FactorizeSchurTag0);
3576 Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
3577 policy(packindices_schur.extent(0), team_size, vector_loop_size);
3578 policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch));
3579 Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
3580 policy, *this);
3581 execution_space().fence();
3582 writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
3583#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
3584 printf("End FactorizeSchurTag\n");
3585#endif
3586 }
3587 }
3588
3589 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3590 }
3591
3592 };
3593
3597 template<typename MatrixType>
3598 void
3599 performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
3600 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
3601 const BlockHelperDetails::PartInterface<MatrixType> &interf,
3603 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
3605 using execution_space = typename impl_type::execution_space;
3606 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3607 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3608
3609 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase", NumericPhase);
3610
3611 int blocksize = btdm.values.extent(1);
3612 // Both Kokkos policy vector length and SIMD type vector length are hardcoded in KokkosBatched.
3613 // For large block sizes, have to fall back to level 1 scratch.
3614 int scratch_required = internal_vector_scratch_type_3d_view::shmem_size(blocksize, blocksize, impl_type::vector_length / impl_type::internal_vector_length);
3615 int max_scratch = team_policy_type::scratch_size_max(0);
3616
3617 if(scratch_required < max_scratch) {
3618 // Can use level 0 scratch
3619 ExtractAndFactorizeTridiags<MatrixType, 0> function(btdm, interf, A, G, tiny);
3620 function.run();
3621 }
3622 else {
3623 // Not enough level 0 scratch, so fall back to level 1
3624 ExtractAndFactorizeTridiags<MatrixType, 1> function(btdm, interf, A, G, tiny);
3625 function.run();
3626 }
3627 IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
3628 }
3629
3633 template<typename MatrixType>
3634 struct MultiVectorConverter {
3635 public:
3637 using execution_space = typename impl_type::execution_space;
3638 using memory_space = typename impl_type::memory_space;
3639
3640 using local_ordinal_type = typename impl_type::local_ordinal_type;
3641 using impl_scalar_type = typename impl_type::impl_scalar_type;
3642 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3643 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3644 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3645 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3646 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3647 using const_impl_scalar_type_2d_view_tpetra = typename impl_scalar_type_2d_view_tpetra::const_type;
3648 static constexpr int vector_length = impl_type::vector_length;
3649
3650 using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
3651
3652 private:
3653 // part interface
3654 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3655 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3656 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3657 const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
3658 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3659 const local_ordinal_type blocksize;
3660 const local_ordinal_type num_vectors;
3661
3662 // packed multivector output (or input)
3663 vector_type_3d_view packed_multivector;
3664 const_impl_scalar_type_2d_view_tpetra scalar_multivector;
3665
3666 template<typename TagType>
3667 KOKKOS_INLINE_FUNCTION
3668 void copy_multivectors(const local_ordinal_type &j,
3669 const local_ordinal_type &vi,
3670 const local_ordinal_type &pri,
3671 const local_ordinal_type &ri0) const {
3672 for (local_ordinal_type col=0;col<num_vectors;++col)
3673 for (local_ordinal_type i=0;i<blocksize;++i)
3674 packed_multivector(pri, i, col)[vi] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3675 }
3676
3677 public:
3678
3679 MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3680 const vector_type_3d_view &pmv)
3681 : partptr(interf.partptr),
3682 packptr(interf.packptr),
3683 part2packrowidx0(interf.part2packrowidx0),
3684 part2rowidx0(interf.part2rowidx0),
3685 lclrow(interf.lclrow),
3686 blocksize(pmv.extent(1)),
3687 num_vectors(pmv.extent(2)),
3688 packed_multivector(pmv) {}
3689
3690 // TODO:: modify this routine similar to the team level functions
3691 KOKKOS_INLINE_FUNCTION
3692 void
3693 operator() (const local_ordinal_type &packidx) const {
3694 local_ordinal_type partidx = packptr(packidx);
3695 local_ordinal_type npacks = packptr(packidx+1) - partidx;
3696 const local_ordinal_type pri0 = part2packrowidx0(partidx);
3697
3698 local_ordinal_type ri0[vector_length] = {};
3699 local_ordinal_type nrows[vector_length] = {};
3700 for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
3701 ri0[v] = part2rowidx0(partidx);
3702 nrows[v] = part2rowidx0(partidx+1) - ri0[v];
3703 }
3704 for (local_ordinal_type j=0;j<nrows[0];++j) {
3705 local_ordinal_type cnt = 1;
3706 for (;cnt<npacks && j!= nrows[cnt];++cnt);
3707 npacks = cnt;
3708 const local_ordinal_type pri = pri0 + j;
3709 for (local_ordinal_type col=0;col<num_vectors;++col)
3710 for (local_ordinal_type i=0;i<blocksize;++i)
3711 for (local_ordinal_type v=0;v<npacks;++v)
3712 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col));
3713 }
3714 }
3715
3716 KOKKOS_INLINE_FUNCTION
3717 void
3718 operator() (const member_type &member) const {
3719 const local_ordinal_type packidx = member.league_rank();
3720 const local_ordinal_type partidx_begin = packptr(packidx);
3721 const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
3722 const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
3723 Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
3724 const local_ordinal_type partidx = partidx_begin + v;
3725 const local_ordinal_type ri0 = part2rowidx0(partidx);
3726 const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
3727
3728 if (nrows == 1) {
3729 const local_ordinal_type pri = pri0;
3730 for (local_ordinal_type col=0;col<num_vectors;++col) {
3731 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
3732 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0)+i,col));
3733 });
3734 }
3735 } else {
3736 Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
3737 const local_ordinal_type pri = pri0 + j;
3738 for (local_ordinal_type col=0;col<num_vectors;++col)
3739 for (local_ordinal_type i=0;i<blocksize;++i)
3740 packed_multivector(pri, i, col)[v] = static_cast<btdm_scalar_type>(scalar_multivector(blocksize*lclrow(ri0+j)+i,col));
3741 });
3742 }
3743 });
3744 }
3745
3746 void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
3747 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
3748 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter", MultiVectorConverter0);
3749
3750 scalar_multivector = scalar_multivector_;
3751 if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
3752 const local_ordinal_type vl = vector_length;
3753 const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
3754 Kokkos::parallel_for
3755 ("MultiVectorConverter::TeamPolicy", policy, *this);
3756 } else {
3757 const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
3758 Kokkos::parallel_for
3759 ("MultiVectorConverter::RangePolicy", policy, *this);
3760 }
3761 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3762 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
3763 }
3764 };
3765
3769
3770 template<>
3771 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
3772 typedef KB::Mode::Serial mode_type;
3773 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3774#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
3775 typedef KB::Algo::Level3::CompactMKL multi_vector_algo_type;
3776#else
3777 typedef KB::Algo::Level3::Blocked multi_vector_algo_type;
3778#endif
3779 static int recommended_team_size(const int /* blksize */,
3780 const int /* vector_length */,
3781 const int /* internal_vector_length */) {
3782 return 1;
3783 }
3784 };
3785
3786#if defined(KOKKOS_ENABLE_CUDA)
3787 static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
3788 const int vector_length,
3789 const int internal_vector_length) {
3790 const int vector_size = vector_length/internal_vector_length;
3791 int total_team_size(0);
3792 if (blksize <= 5) total_team_size = 32;
3793 else if (blksize <= 9) total_team_size = 32; // 64
3794 else if (blksize <= 12) total_team_size = 96;
3795 else if (blksize <= 16) total_team_size = 128;
3796 else if (blksize <= 20) total_team_size = 160;
3797 else total_team_size = 160;
3798 return total_team_size/vector_size;
3799 }
3800
3801 template<>
3802 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
3803 typedef KB::Mode::Team mode_type;
3804 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3805 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3806 static int recommended_team_size(const int blksize,
3807 const int vector_length,
3808 const int internal_vector_length) {
3809 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3810 }
3811 };
3812 template<>
3813 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
3814 typedef KB::Mode::Team mode_type;
3815 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3816 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3817 static int recommended_team_size(const int blksize,
3818 const int vector_length,
3819 const int internal_vector_length) {
3820 return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
3821 }
3822 };
3823#endif
3824
3825#if defined(KOKKOS_ENABLE_HIP)
3826 static inline int SolveTridiagsRecommendedHIPTeamSize(const int blksize,
3827 const int vector_length,
3828 const int internal_vector_length) {
3829 const int vector_size = vector_length/internal_vector_length;
3830 int total_team_size(0);
3831 if (blksize <= 5) total_team_size = 32;
3832 else if (blksize <= 9) total_team_size = 32; // 64
3833 else if (blksize <= 12) total_team_size = 96;
3834 else if (blksize <= 16) total_team_size = 128;
3835 else if (blksize <= 20) total_team_size = 160;
3836 else total_team_size = 160;
3837 return total_team_size/vector_size;
3838 }
3839
3840 template<>
3841 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPSpace> {
3842 typedef KB::Mode::Team mode_type;
3843 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3844 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3845 static int recommended_team_size(const int blksize,
3846 const int vector_length,
3847 const int internal_vector_length) {
3848 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3849 }
3850 };
3851 template<>
3852 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HIPHostPinnedSpace> {
3853 typedef KB::Mode::Team mode_type;
3854 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3855 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3856 static int recommended_team_size(const int blksize,
3857 const int vector_length,
3858 const int internal_vector_length) {
3859 return SolveTridiagsRecommendedHIPTeamSize(blksize, vector_length, internal_vector_length);
3860 }
3861 };
3862#endif
3863
3864#if defined(KOKKOS_ENABLE_SYCL)
3865 static inline int SolveTridiagsRecommendedSYCLTeamSize(const int blksize,
3866 const int vector_length,
3867 const int internal_vector_length) {
3868 const int vector_size = vector_length/internal_vector_length;
3869 int total_team_size(0);
3870 if (blksize <= 5) total_team_size = 32;
3871 else if (blksize <= 9) total_team_size = 32; // 64
3872 else if (blksize <= 12) total_team_size = 96;
3873 else if (blksize <= 16) total_team_size = 128;
3874 else if (blksize <= 20) total_team_size = 160;
3875 else total_team_size = 160;
3876 return total_team_size/vector_size;
3877 }
3878
3879 template<>
3880 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLSharedUSMSpace> {
3881 typedef KB::Mode::Team mode_type;
3882 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3883 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3884 static int recommended_team_size(const int blksize,
3885 const int vector_length,
3886 const int internal_vector_length) {
3887 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3888 }
3889 };
3890 template<>
3891 struct SolveTridiagsDefaultModeAndAlgo<Kokkos::Experimental::SYCLDeviceUSMSpace> {
3892 typedef KB::Mode::Team mode_type;
3893 typedef KB::Algo::Level2::Unblocked single_vector_algo_type;
3894 typedef KB::Algo::Level3::Unblocked multi_vector_algo_type;
3895 static int recommended_team_size(const int blksize,
3896 const int vector_length,
3897 const int internal_vector_length) {
3898 return SolveTridiagsRecommendedSYCLTeamSize(blksize, vector_length, internal_vector_length);
3899 }
3900 };
3901#endif
3902
3903
3904
3905
3906 template<typename MatrixType>
3907 struct SolveTridiags {
3908 public:
3909 using impl_type = BlockHelperDetails::ImplType<MatrixType>;
3910 using execution_space = typename impl_type::execution_space;
3911
3912 using local_ordinal_type = typename impl_type::local_ordinal_type;
3913 using size_type = typename impl_type::size_type;
3914 using impl_scalar_type = typename impl_type::impl_scalar_type;
3915 using magnitude_type = typename impl_type::magnitude_type;
3916 using btdm_scalar_type = typename impl_type::btdm_scalar_type;
3917 using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
3919 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3920 using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
3921 using size_type_2d_view = typename impl_type::size_type_2d_view;
3923 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3924 using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
3925 using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
3926 using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
3927
3928 using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
3929
3930 using internal_vector_type =typename impl_type::internal_vector_type;
3931 static constexpr int vector_length = impl_type::vector_length;
3932 static constexpr int internal_vector_length = impl_type::internal_vector_length;
3933
3935 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3936 using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra;
3937
3939 using team_policy_type = Kokkos::TeamPolicy<execution_space>;
3940 using member_type = typename team_policy_type::member_type;
3941
3942 private:
3943 // part interface
3944 local_ordinal_type n_subparts_per_part;
3945 const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
3946 const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
3947 const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
3948 const ConstUnmanaged<local_ordinal_type_2d_view> packindices_schur;
3949 const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
3950 const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
3951 const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
3952 const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
3953
3954 const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
3955 const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
3956
3957 // block tridiags
3958 const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
3959
3960 // block tridiags values
3961 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
3962 const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
3963 const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
3964
3965 internal_vector_type_4d_view X_internal_vector_values_schur;
3966
3967 const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
3968 const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
3969
3970
3971 const local_ordinal_type vector_loop_size;
3972
3973 // copy to multivectors : damping factor and Y_scalar_multivector
3974 Unmanaged<impl_scalar_type_2d_view_tpetra> Y_scalar_multivector;
3975#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) || defined(__SYCL_DEVICE_ONLY__)
3976 AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3977#else
3978 /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
3979#endif
3980 const impl_scalar_type df;
3981 const bool compute_diff;
3982
3983 public:
3984 SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
3985 const BlockTridiags<MatrixType> &btdm,
3986 const vector_type_3d_view &pmv,
3987 const impl_scalar_type damping_factor,
3988 const bool is_norm_manager_active)
3989 :
3990 // interface
3991 n_subparts_per_part(interf.n_subparts_per_part),
3992 partptr(interf.partptr),
3993 packptr(interf.packptr),
3994 packindices_sub(interf.packindices_sub),
3995 packindices_schur(interf.packindices_schur),
3996 part2packrowidx0(interf.part2packrowidx0),
3997 part2packrowidx0_sub(interf.part2packrowidx0_sub),
3998 lclrow(interf.lclrow),
3999 packptr_sub(interf.packptr_sub),
4000 partptr_sub(interf.partptr_sub),
4001 pack_td_ptr_schur(btdm.pack_td_ptr_schur),
4002 // block tridiags and multivector
4003 pack_td_ptr(btdm.pack_td_ptr),
4004 D_internal_vector_values((internal_vector_type*)btdm.values.data(),
4005 btdm.values.extent(0),
4006 btdm.values.extent(1),
4007 btdm.values.extent(2),
4008 vector_length/internal_vector_length),
4009 X_internal_vector_values((internal_vector_type*)pmv.data(),
4010 pmv.extent(0),
4011 pmv.extent(1),
4012 pmv.extent(2),
4013 vector_length/internal_vector_length),
4014 X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
4015 pmv.extent(0),
4016 pmv.extent(1),
4017 pmv.extent(2),
4018 vector_length),
4019 X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
4020 2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
4021 pmv.extent(1),
4022 pmv.extent(2),
4023 vector_length/internal_vector_length),
4024 D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
4025 btdm.values_schur.extent(0),
4026 btdm.values_schur.extent(1),
4027 btdm.values_schur.extent(2),
4028 vector_length/internal_vector_length),
4029 e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
4030 btdm.e_values.extent(0),
4031 btdm.e_values.extent(1),
4032 btdm.e_values.extent(2),
4033 btdm.e_values.extent(3),
4034 vector_length/internal_vector_length),
4035 vector_loop_size(vector_length/internal_vector_length),
4036 Y_scalar_multivector(),
4037 Z_scalar_vector(),
4038 df(damping_factor),
4039 compute_diff(is_norm_manager_active)
4040 {}
4041
4042 public:
4043
4045 KOKKOS_INLINE_FUNCTION
4046 void
4047 copyToFlatMultiVector(const member_type &member,
4048 const local_ordinal_type partidxbeg, // partidx for v = 0
4049 const local_ordinal_type npacks,
4050 const local_ordinal_type pri0,
4051 const local_ordinal_type v, // index with a loop of vector_loop_size
4052 const local_ordinal_type blocksize,
4053 const local_ordinal_type num_vectors) const {
4054 const local_ordinal_type vbeg = v*internal_vector_length;
4055 if (vbeg < npacks) {
4056 local_ordinal_type ri0_vals[internal_vector_length] = {};
4057 local_ordinal_type nrows_vals[internal_vector_length] = {};
4058 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4059 const local_ordinal_type partidx = partidxbeg+vv;
4060 ri0_vals[vi] = partptr(partidx);
4061 nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
4062 }
4063
4064 impl_scalar_type z_partial_sum(0);
4065 if (nrows_vals[0] == 1) {
4066 const local_ordinal_type j=0, pri=pri0;
4067 {
4068 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4069 const local_ordinal_type ri0 = ri0_vals[vi];
4070 const local_ordinal_type nrows = nrows_vals[vi];
4071 if (j < nrows) {
4072 Kokkos::parallel_for
4073 (Kokkos::TeamThreadRange(member, blocksize),
4074 [&](const local_ordinal_type &i) {
4075 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4076 for (local_ordinal_type col=0;col<num_vectors;++col) {
4077 impl_scalar_type &y = Y_scalar_multivector(row,col);
4078 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4079 y += df*yd;
4080
4081 {//if (compute_diff) {
4082 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4083 z_partial_sum += yd_abs*yd_abs;
4084 }
4085 }
4086 });
4087 }
4088 }
4089 }
4090 } else {
4091 Kokkos::parallel_for
4092 (Kokkos::TeamThreadRange(member, nrows_vals[0]),
4093 [&](const local_ordinal_type &j) {
4094 const local_ordinal_type pri = pri0 + j;
4095 for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
4096 const local_ordinal_type ri0 = ri0_vals[vi];
4097 const local_ordinal_type nrows = nrows_vals[vi];
4098 if (j < nrows) {
4099 for (local_ordinal_type col=0;col<num_vectors;++col) {
4100 for (local_ordinal_type i=0;i<blocksize;++i) {
4101 const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
4102 impl_scalar_type &y = Y_scalar_multivector(row,col);
4103 const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
4104 y += df*yd;
4105
4106 {//if (compute_diff) {
4107 const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
4108 z_partial_sum += yd_abs*yd_abs;
4109 }
4110 }
4111 }
4112 }
4113 }
4114 });
4115 }
4116 //if (compute_diff)
4117 Z_scalar_vector(member.league_rank()) += z_partial_sum;
4118 }
4119 }
4120
4124 template<typename WWViewType>
4125 KOKKOS_INLINE_FUNCTION
4126 void
4127 solveSingleVector(const member_type &member,
4128 const local_ordinal_type &blocksize,
4129 const local_ordinal_type &i0,
4130 const local_ordinal_type &r0,
4131 const local_ordinal_type &nrows,
4132 const local_ordinal_type &v,
4133 const WWViewType &WW) const {
4134
4135 typedef SolveTridiagsDefaultModeAndAlgo
4136 <typename execution_space::memory_space> default_mode_and_algo_type;
4137
4138 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4139 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4140
4141 // base pointers
4142 auto A = D_internal_vector_values.data();
4143 auto X = X_internal_vector_values.data();
4144
4145 // constant
4146 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4147 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4148 //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
4149
4150 // const local_ordinal_type blocksize = D_scalar_values.extent(1);
4151 const local_ordinal_type astep = D_internal_vector_values.stride_0();
4152 const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
4153 const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
4154 const local_ordinal_type xstep = X_internal_vector_values.stride_0();
4155 const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
4156
4157 // move to starting point
4158 A += i0*astep + v;
4159 X += r0*xstep + v;
4160
4161 //for (local_ordinal_type col=0;col<num_vectors;++col)
4162 if (nrows > 1) {
4163 // solve Lx = x
4164 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4165 (default_mode_type,default_algo_type,
4166 member,
4167 KB::Diag::Unit,
4168 blocksize,blocksize,
4169 one,
4170 A, as0, as1,
4171 X, xs0);
4172
4173 for (local_ordinal_type tr=1;tr<nrows;++tr) {
4174 member.team_barrier();
4175 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4176 (default_mode_type,default_algo_type,
4177 member,
4178 blocksize, blocksize,
4179 -one,
4180 A+2*astep, as0, as1,
4181 X, xs0,
4182 one,
4183 X+1*xstep, xs0);
4184 KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
4185 (default_mode_type,default_algo_type,
4186 member,
4187 KB::Diag::Unit,
4188 blocksize,blocksize,
4189 one,
4190 A+3*astep, as0, as1,
4191 X+1*xstep, xs0);
4192
4193 A += 3*astep;
4194 X += 1*xstep;
4195 }
4196
4197 // solve Ux = x
4198 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4199 (default_mode_type,default_algo_type,
4200 member,
4201 KB::Diag::NonUnit,
4202 blocksize, blocksize,
4203 one,
4204 A, as0, as1,
4205 X, xs0);
4206
4207 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4208 A -= 3*astep;
4209 member.team_barrier();
4210 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4211 (default_mode_type,default_algo_type,
4212 member,
4213 blocksize, blocksize,
4214 -one,
4215 A+1*astep, as0, as1,
4216 X, xs0,
4217 one,
4218 X-1*xstep, xs0);
4219 KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
4220 (default_mode_type,default_algo_type,
4221 member,
4222 KB::Diag::NonUnit,
4223 blocksize, blocksize,
4224 one,
4225 A, as0, as1,
4226 X-1*xstep,xs0);
4227 X -= 1*xstep;
4228 }
4229 // for multiple rhs
4230 //X += xs1;
4231 } else {
4232 const local_ordinal_type ws0 = WW.stride_0();
4233 auto W = WW.data() + v;
4234 KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
4235 (default_mode_type,
4236 member, blocksize, X, xs0, W, ws0);
4237 member.team_barrier();
4238 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4239 (default_mode_type,default_algo_type,
4240 member,
4241 blocksize, blocksize,
4242 one,
4243 A, as0, as1,
4244 W, xs0,
4245 zero,
4246 X, xs0);
4247 }
4248 }
4249
4250 template<typename WWViewType>
4251 KOKKOS_INLINE_FUNCTION
4252 void
4253 solveMultiVector(const member_type &member,
4254 const local_ordinal_type &/* blocksize */,
4255 const local_ordinal_type &i0,
4256 const local_ordinal_type &r0,
4257 const local_ordinal_type &nrows,
4258 const local_ordinal_type &v,
4259 const WWViewType &WW) const {
4260
4261 typedef SolveTridiagsDefaultModeAndAlgo
4262 <typename execution_space::memory_space> default_mode_and_algo_type;
4263
4264 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4265 typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
4266
4267 // constant
4268 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4269 const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
4270
4271 // subview pattern
4272 auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
4273 auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
4274 auto X2 = X1;
4275
4276 local_ordinal_type i = i0, r = r0;
4277
4278
4279 if (nrows > 1) {
4280 // solve Lx = x
4281 KB::Trsm<member_type,
4282 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4283 default_mode_type,default_algo_type>
4284 ::invoke(member, one, A, X1);
4285 for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
4286 A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
4287 X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
4288 member.team_barrier();
4289 KB::Gemm<member_type,
4290 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4291 default_mode_type,default_algo_type>
4292 ::invoke(member, -one, A, X1, one, X2);
4293 A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
4294 KB::Trsm<member_type,
4295 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
4296 default_mode_type,default_algo_type>
4297 ::invoke(member, one, A, X2);
4298 X1.assign_data( X2.data() );
4299 }
4300
4301 // solve Ux = x
4302 KB::Trsm<member_type,
4303 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4304 default_mode_type,default_algo_type>
4305 ::invoke(member, one, A, X1);
4306 for (local_ordinal_type tr=nrows;tr>1;--tr) {
4307 i -= 3;
4308 A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
4309 X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
4310 member.team_barrier();
4311 KB::Gemm<member_type,
4312 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4313 default_mode_type,default_algo_type>
4314 ::invoke(member, -one, A, X1, one, X2);
4315
4316 A.assign_data( &D_internal_vector_values(i,0,0,v) );
4317 KB::Trsm<member_type,
4318 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
4319 default_mode_type,default_algo_type>
4320 ::invoke(member, one, A, X2);
4321 X1.assign_data( X2.data() );
4322 }
4323 } else {
4324 // matrix is already inverted
4325 auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
4326 KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
4327 ::invoke(member, X1, W);
4328 member.team_barrier();
4329 KB::Gemm<member_type,
4330 KB::Trans::NoTranspose,KB::Trans::NoTranspose,
4331 default_mode_type,default_algo_type>
4332 ::invoke(member, one, A, W, zero, X1);
4333 }
4334 }
4335
4336 template<int B> struct SingleVectorTag {};
4337 template<int B> struct MultiVectorTag {};
4338
4339 template<int B> struct SingleVectorSubLineTag {};
4340 template<int B> struct MultiVectorSubLineTag {};
4341 template<int B> struct SingleVectorApplyCTag {};
4342 template<int B> struct MultiVectorApplyCTag {};
4343 template<int B> struct SingleVectorSchurTag {};
4344 template<int B> struct MultiVectorSchurTag {};
4345 template<int B> struct SingleVectorApplyETag {};
4346 template<int B> struct MultiVectorApplyETag {};
4347 template<int B> struct SingleVectorCopyToFlatTag {};
4348 template<int B> struct SingleZeroingTag {};
4349
4350 template<int B>
4351 KOKKOS_INLINE_FUNCTION
4352 void
4353 operator() (const SingleVectorTag<B> &, const member_type &member) const {
4354 const local_ordinal_type packidx = member.league_rank();
4355 const local_ordinal_type partidx = packptr(packidx);
4356 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4357 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4358 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4359 const local_ordinal_type r0 = part2packrowidx0(partidx);
4360 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4361 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4362 const local_ordinal_type num_vectors = 1;
4363 internal_vector_scratch_type_3d_view
4364 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4365 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4366 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4367 });
4368 Kokkos::parallel_for
4369 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4370 solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
4371 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4372 });
4373 }
4374
4375 template<int B>
4376 KOKKOS_INLINE_FUNCTION
4377 void
4378 operator() (const MultiVectorTag<B> &, const member_type &member) const {
4379 const local_ordinal_type packidx = member.league_rank();
4380 const local_ordinal_type partidx = packptr(packidx);
4381 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4382 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4383 const local_ordinal_type i0 = pack_td_ptr(partidx,0);
4384 const local_ordinal_type r0 = part2packrowidx0(partidx);
4385 const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
4386 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4387 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4388
4389 internal_vector_scratch_type_3d_view
4390 WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
4391 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4392 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4393 });
4394 Kokkos::parallel_for
4395 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4396 solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
4397 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4398 });
4399 }
4400
4401 template<int B>
4402 KOKKOS_INLINE_FUNCTION
4403 void
4404 operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
4405 // btdm is packed and sorted from largest one
4406 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4407
4408 const local_ordinal_type subpartidx = packptr_sub(packidx);
4409 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4410 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4411 const local_ordinal_type partidx = subpartidx%n_parts;
4412
4413 const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4414 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4415 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4416 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4417 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4418
4419 //(void) i0;
4420 //(void) nrows;
4421 (void) npacks;
4422
4423 internal_vector_scratch_type_3d_view
4424 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
4425
4426 Kokkos::parallel_for
4427 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4428 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
4429 });
4430 }
4431
4432 template<int B>
4433 KOKKOS_INLINE_FUNCTION
4434 void
4435 operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
4436 // btdm is packed and sorted from largest one
4437 //const local_ordinal_type packidx = packindices_schur(member.league_rank());
4438 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4439
4440 const local_ordinal_type subpartidx = packptr_sub(packidx);
4441 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4442 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4443 const local_ordinal_type partidx = subpartidx%n_parts;
4444 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4445
4446 //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
4447 const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
4448 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4449 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4450
4451 internal_vector_scratch_type_3d_view
4452 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4453
4454 // Compute v_2 = v_2 - C v_1
4455
4456 const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
4457 const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
4458 const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
4459
4460 (void) i0_schur;
4461 (void) i0_offset;
4462
4463 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4464
4465 const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
4466 const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
4467
4468 typedef SolveTridiagsDefaultModeAndAlgo
4469 <typename execution_space::memory_space> default_mode_and_algo_type;
4470
4471 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4472 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4473
4474 if (local_subpartidx == 0) {
4475 Kokkos::parallel_for
4476 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4477 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4478 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4479 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4480
4481 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4482 (default_mode_type,default_algo_type,
4483 member,
4484 blocksize, blocksize,
4485 -one,
4486 C.data(), C.stride_0(), C.stride_1(),
4487 v_1.data(), v_1.stride_0(),
4488 one,
4489 v_2.data(), v_2.stride_0());
4490 });
4491 }
4492 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4493 Kokkos::parallel_for
4494 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4495 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4496 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4497 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4498
4499 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4500 (default_mode_type,default_algo_type,
4501 member,
4502 blocksize, blocksize,
4503 -one,
4504 C.data(), C.stride_0(), C.stride_1(),
4505 v_1.data(), v_1.stride_0(),
4506 one,
4507 v_2.data(), v_2.stride_0());
4508 });
4509 }
4510 else {
4511 Kokkos::parallel_for
4512 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4513 {
4514 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
4515 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4516 auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
4517
4518 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4519 (default_mode_type,default_algo_type,
4520 member,
4521 blocksize, blocksize,
4522 -one,
4523 C.data(), C.stride_0(), C.stride_1(),
4524 v_1.data(), v_1.stride_0(),
4525 one,
4526 v_2.data(), v_2.stride_0());
4527 }
4528 {
4529 auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
4530 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4531 auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
4532
4533 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4534 (default_mode_type,default_algo_type,
4535 member,
4536 blocksize, blocksize,
4537 -one,
4538 C.data(), C.stride_0(), C.stride_1(),
4539 v_1.data(), v_1.stride_0(),
4540 one,
4541 v_2.data(), v_2.stride_0());
4542 }
4543 });
4544 }
4545 }
4546
4547 template<int B>
4548 KOKKOS_INLINE_FUNCTION
4549 void
4550 operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
4551 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4552
4553 const local_ordinal_type partidx = packptr_sub(packidx);
4554
4555 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4556
4557 const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
4558 const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
4559
4560 const local_ordinal_type r0_schur = nrows * member.league_rank();
4561
4562 internal_vector_scratch_type_3d_view
4563 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4564
4565 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4566 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4567 for (local_ordinal_type i = 0; i < 2; ++i) {
4568 copy3DView<local_ordinal_type>(member,
4569 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4570 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4571 }
4572 }
4573
4574 Kokkos::parallel_for
4575 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4576 solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
4577 });
4578
4579 for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
4580 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
4581 for (local_ordinal_type i = 0; i < 2; ++i) {
4582 copy3DView<local_ordinal_type>(member,
4583 Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()),
4584 Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
4585 }
4586 }
4587 }
4588
4589 template<int B>
4590 KOKKOS_INLINE_FUNCTION
4591 void
4592 operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
4593 const local_ordinal_type packidx = packindices_sub(member.league_rank());
4594
4595 const local_ordinal_type subpartidx = packptr_sub(packidx);
4596 const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
4597 const local_ordinal_type local_subpartidx = subpartidx/n_parts;
4598 const local_ordinal_type partidx = subpartidx%n_parts;
4599 const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
4600
4601 const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
4602 const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
4603
4604 internal_vector_scratch_type_3d_view
4605 WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
4606
4607 // Compute v_2 = v_2 - C v_1
4608
4609 const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
4610
4611 typedef SolveTridiagsDefaultModeAndAlgo
4612 <typename execution_space::memory_space> default_mode_and_algo_type;
4613
4614 typedef typename default_mode_and_algo_type::mode_type default_mode_type;
4615 typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
4616
4617 if (local_subpartidx == 0) {
4618 Kokkos::parallel_for
4619 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4620
4621 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4622
4623 for (local_ordinal_type row = 0; row < nrows; ++row) {
4624 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4625 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4626
4627 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4628 (default_mode_type,default_algo_type,
4629 member,
4630 blocksize, blocksize,
4631 -one,
4632 E.data(), E.stride_0(), E.stride_1(),
4633 v_2.data(), v_2.stride_0(),
4634 one,
4635 v_1.data(), v_1.stride_0());
4636 }
4637 });
4638 }
4639 else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
4640 Kokkos::parallel_for
4641 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4642 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4643
4644 for (local_ordinal_type row = 0; row < nrows; ++row) {
4645 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4646 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4647
4648 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4649 (default_mode_type,default_algo_type,
4650 member,
4651 blocksize, blocksize,
4652 -one,
4653 E.data(), E.stride_0(), E.stride_1(),
4654 v_2.data(), v_2.stride_0(),
4655 one,
4656 v_1.data(), v_1.stride_0());
4657 }
4658 });
4659 }
4660 else {
4661 Kokkos::parallel_for
4662 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4663 {
4664 auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
4665
4666 for (local_ordinal_type row = 0; row < nrows; ++row) {
4667 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4668 auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4669
4670 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4671 (default_mode_type,default_algo_type,
4672 member,
4673 blocksize, blocksize,
4674 -one,
4675 E.data(), E.stride_0(), E.stride_1(),
4676 v_2.data(), v_2.stride_0(),
4677 one,
4678 v_1.data(), v_1.stride_0());
4679 }
4680 }
4681 {
4682 auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
4683
4684 for (local_ordinal_type row = 0; row < nrows; ++row) {
4685 auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
4686 auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
4687
4688 KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
4689 (default_mode_type,default_algo_type,
4690 member,
4691 blocksize, blocksize,
4692 -one,
4693 E.data(), E.stride_0(), E.stride_1(),
4694 v_2.data(), v_2.stride_0(),
4695 one,
4696 v_1.data(), v_1.stride_0());
4697 }
4698 }
4699 });
4700 }
4701 }
4702
4703 template<int B>
4704 KOKKOS_INLINE_FUNCTION
4705 void
4706 operator() (const SingleVectorCopyToFlatTag<B> &, const member_type &member) const {
4707 const local_ordinal_type packidx = member.league_rank();
4708 const local_ordinal_type partidx = packptr(packidx);
4709 const local_ordinal_type npacks = packptr(packidx+1) - partidx;
4710 const local_ordinal_type pri0 = part2packrowidx0(partidx);
4711 const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
4712 const local_ordinal_type num_vectors = 1;
4713
4714 Kokkos::parallel_for
4715 (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
4716 copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
4717 });
4718 }
4719
4720 template<int B>
4721 KOKKOS_INLINE_FUNCTION
4722 void
4723 operator() (const SingleZeroingTag<B> &, const member_type &member) const {
4724 Kokkos::single(Kokkos::PerTeam(member), [&]() {
4725 Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
4726 });
4727 }
4728
4729 void run(const impl_scalar_type_2d_view_tpetra &Y,
4730 const impl_scalar_type_1d_view &Z) {
4731 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
4732 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags", SolveTridiags);
4733
4735 this->Y_scalar_multivector = Y;
4736 this->Z_scalar_vector = Z;
4737
4738 const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
4739 const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
4740
4741 const local_ordinal_type team_size =
4742 SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
4743 recommended_team_size(blocksize, vector_length, internal_vector_length);
4744 const int per_team_scratch = internal_vector_scratch_type_3d_view
4745 ::shmem_size(blocksize, num_vectors, vector_loop_size);
4746
4747#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
4748#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4749 if (num_vectors == 1) { \
4750 const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4751 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4752 Kokkos::parallel_for \
4753 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4754 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4755 } else { \
4756 const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4757 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4758 Kokkos::parallel_for \
4759 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4760 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
4761 } break
4762#else
4763#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
4764 if (num_vectors == 1) { \
4765 if (packindices_schur.extent(1) <= 0) { \
4766 Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
4767 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4768 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4769 Kokkos::parallel_for \
4770 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4771 policy, *this); \
4772 } \
4773 else { \
4774 { \
4775 \
4776 Kokkos::TeamPolicy<execution_space,SingleZeroingTag<B> > \
4777 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4778 Kokkos::parallel_for \
4779 ("SolveTridiags::TeamPolicy::run<SingleZeroingTag>", \
4780 policy, *this); \
4781 } \
4782 { \
4783 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag", SingleVectorSubLineTag0); \
4784 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
4785 Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> > \
4786 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4787 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4788 Kokkos::parallel_for \
4789 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4790 policy, *this); \
4791 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
4792 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4793 } \
4794 { \
4795 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag", SingleVectorApplyCTag0); \
4796 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
4797 Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> > \
4798 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4799 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4800 Kokkos::parallel_for \
4801 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4802 policy, *this); \
4803 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
4804 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4805 } \
4806 { \
4807 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag", SingleVectorSchurTag0); \
4808 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
4809 Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> > \
4810 policy(packindices_schur.extent(0), team_size, vector_loop_size); \
4811 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4812 Kokkos::parallel_for \
4813 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4814 policy, *this); \
4815 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
4816 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4817 } \
4818 { \
4819 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag", SingleVectorApplyETag0); \
4820 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
4821 Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> > \
4822 policy(packindices_sub.extent(0), team_size, vector_loop_size); \
4823 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4824 Kokkos::parallel_for \
4825 ("SolveTridiags::TeamPolicy::run<SingleVector>", \
4826 policy, *this); \
4827 write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
4828 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
4829 } \
4830 { \
4831 \
4832 Kokkos::TeamPolicy<execution_space,SingleVectorCopyToFlatTag<B> > \
4833 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4834 Kokkos::parallel_for \
4835 ("SolveTridiags::TeamPolicy::run<SingleVectorCopyToFlatTag>", \
4836 policy, *this); \
4837 } \
4838 } \
4839 } else { \
4840 Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
4841 policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
4842 policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
4843 Kokkos::parallel_for \
4844 ("SolveTridiags::TeamPolicy::run<MultiVector>", \
4845 policy, *this); \
4846 } break
4847#endif
4848 switch (blocksize) {
4849 case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
4850 case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
4851 case 6: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 6);
4852 case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
4853 case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
4854 case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
4855 case 12: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(12);
4856 case 13: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(13);
4857 case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
4858 case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
4859 case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
4860 case 19: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(19);
4861 default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
4862 }
4863#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
4864
4865 IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
4866 IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
4867 }
4868 };
4869
4873 template<typename MatrixType>
4874 int
4876 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_row_matrix_type> &A,
4877 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_crs_graph_type> &G,
4878 const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
4879 const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
4880 const bool overlap_communication_and_computation,
4881 // tpetra interface
4882 const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
4883 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
4884 /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
4885 /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
4886 // local object interface
4887 const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
4888 const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
4889 const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
4890 /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
4892 // preconditioner parameters
4894 /* */ bool is_y_zero,
4895 const int max_num_sweeps,
4896 const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
4897 const int check_tol_every) {
4898 IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi", ApplyInverseJacobi);
4899
4901 using node_memory_space = typename impl_type::node_memory_space;
4902 using local_ordinal_type = typename impl_type::local_ordinal_type;
4903 using size_type = typename impl_type::size_type;
4904 using impl_scalar_type = typename impl_type::impl_scalar_type;
4905 using magnitude_type = typename impl_type::magnitude_type;
4906 using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
4907 using vector_type_1d_view = typename impl_type::vector_type_1d_view;
4908 using vector_type_3d_view = typename impl_type::vector_type_3d_view;
4909 using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
4910
4911 using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
4912
4913 // either tpetra importer or async importer must be active
4914 TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
4915 "Neither Tpetra importer nor Async importer is null.");
4916 // max number of sweeps should be positive number
4917 TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
4918 "Maximum number of sweeps must be >= 1.");
4919
4920 // const parameters
4921 const bool is_seq_method_requested = !tpetra_importer.is_null();
4922 const bool is_async_importer_active = !async_importer.is_null();
4923 const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
4924 const magnitude_type tolerance = tol*tol;
4925 const local_ordinal_type blocksize = btdm.values.extent(1);
4926 const local_ordinal_type num_vectors = Y.getNumVectors();
4927 const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
4928
4929 const impl_scalar_type zero(0.0);
4930
4931 TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
4932 "The seq method for applyInverseJacobi, " <<
4933 "which in any case is for developer use only, " <<
4934 "does not support norm-based termination.");
4935 const bool device_accessible_from_host = Kokkos::SpaceAccessibility<
4936 Kokkos::DefaultHostExecutionSpace, node_memory_space>::accessible;
4937 TEUCHOS_TEST_FOR_EXCEPTION(is_seq_method_requested && !device_accessible_from_host,
4938 std::invalid_argument,
4939 "The seq method for applyInverseJacobi, " <<
4940 "which in any case is for developer use only, " <<
4941 "only supports memory spaces accessible from host.");
4942
4943 // if workspace is needed more, resize it
4944 const size_type work_span_required = num_blockrows*num_vectors*blocksize;
4945 if (work.span() < work_span_required)
4946 work = vector_type_1d_view("vector workspace 1d view", work_span_required);
4947
4948 // construct W
4949 const local_ordinal_type W_size = interf.packptr.extent(0)-1;
4950 if (local_ordinal_type(W.extent(0)) < W_size)
4951 W = impl_scalar_type_1d_view("W", W_size);
4952
4953 typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector;
4954 {
4955 if (is_seq_method_requested) {
4956 if (Z.getNumVectors() != Y.getNumVectors())
4957 Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
4958 } else {
4959 if (is_async_importer_active) {
4960 // create comm data buffer and keep it here
4961 async_importer->createDataBuffer(num_vectors);
4962 remote_multivector = async_importer->getRemoteMultiVectorLocalView();
4963 }
4964 }
4965 }
4966
4967 // wrap the workspace with 3d view
4968 vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
4969 const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
4970 const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
4971 const auto ZZ = Z.getLocalViewDevice(Tpetra::Access::ReadWrite);
4972 if (is_y_zero) Kokkos::deep_copy(YY, zero);
4973
4974 MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
4975 SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
4976 damping_factor, is_norm_manager_active);
4977
4978 const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
4979
4980
4981 auto A_crs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_crs_matrix_type>(A);
4982 auto A_bcrs = Teuchos::rcp_dynamic_cast<const typename impl_type::tpetra_block_crs_matrix_type>(A);
4983
4984 bool hasBlockCrsMatrix = ! A_bcrs.is_null ();
4985
4986 // This is OK here to use the graph of the A_crs matrix and a block size of 1
4987 const auto g = hasBlockCrsMatrix ? A_bcrs->getCrsGraph() : *(A_crs->getCrsGraph()); // tpetra crs graph object
4988
4989 BlockHelperDetails::ComputeResidualVector<MatrixType>
4990 compute_residual_vector(amd, G->getLocalGraphDevice(), g.getLocalGraphDevice(), blocksize, interf,
4991 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view,
4992 hasBlockCrsMatrix);
4993
4994 // norm manager workspace resize
4995 if (is_norm_manager_active)
4996 norm_manager.setCheckFrequency(check_tol_every);
4997
4998 // iterate
4999 int sweep = 0;
5000 for (;sweep<max_num_sweeps;++sweep) {
5001 {
5002 if (is_y_zero) {
5003 // pmv := x(lclrow)
5004 multivector_converter.run(XX);
5005 } else {
5006 if (is_seq_method_requested) {
5007 // SEQ METHOD IS TESTING ONLY
5008
5009 // y := x - R y
5010 Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
5011 compute_residual_vector.run(YY, XX, ZZ);
5012
5013 // pmv := y(lclrow).
5014 multivector_converter.run(YY);
5015 } else {
5016 // fused y := x - R y and pmv := y(lclrow);
5017 // real use case does not use overlap comp and comm
5018 if (overlap_communication_and_computation || !is_async_importer_active) {
5019 if (is_async_importer_active) async_importer->asyncSendRecv(YY);
5020 // OverlapTag, compute_owned = true
5021 compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
5022 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
5023 if (is_async_importer_active) async_importer->cancel();
5024 break;
5025 }
5026 if (is_async_importer_active) {
5027 async_importer->syncRecv();
5028 // OverlapTag, compute_owned = false
5029 compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
5030 }
5031 } else {
5032 if (is_async_importer_active)
5033 async_importer->syncExchange(YY);
5034 if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
5035 // AsyncTag
5036 compute_residual_vector.run(pmv, XX, YY, remote_multivector);
5037 }
5038 }
5039 }
5040 }
5041
5042 // pmv := inv(D) pmv.
5043 {
5044 solve_tridiags.run(YY, W);
5045 }
5046 {
5047 if (is_norm_manager_active) {
5048 // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
5049 BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
5050 if (sweep + 1 == max_num_sweeps) {
5051 norm_manager.ireduce(sweep, true);
5052 norm_manager.checkDone(sweep + 1, tolerance, true);
5053 } else {
5054 norm_manager.ireduce(sweep);
5055 }
5056 }
5057 }
5058 is_y_zero = false;
5059 }
5060
5061 //sqrt the norms for the caller's use.
5062 if (is_norm_manager_active) norm_manager.finalize();
5063
5064 return sweep;
5065 }
5066
5067
5068 template<typename MatrixType>
5069 struct ImplObject {
5071 using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
5072 using block_tridiags_type = BlockTridiags<MatrixType>;
5073 using amd_type = BlockHelperDetails::AmD<MatrixType>;
5074 using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
5075 using async_import_type = AsyncableImport<MatrixType>;
5076
5077 // distructed objects
5078 Teuchos::RCP<const typename impl_type::tpetra_row_matrix_type> A;
5079 Teuchos::RCP<const typename impl_type::tpetra_crs_graph_type> blockGraph;
5080 Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
5081 Teuchos::RCP<async_import_type> async_importer;
5082 bool overlap_communication_and_computation;
5083
5084 // copy of Y (mutable to penentrate const)
5085 mutable typename impl_type::tpetra_multivector_type Z;
5086 mutable typename impl_type::impl_scalar_type_1d_view W;
5087
5088 // local objects
5089 part_interface_type part_interface;
5090 block_tridiags_type block_tridiags; // D
5091 amd_type a_minus_d; // R = A - D
5092 mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
5093 mutable norm_manager_type norm_manager;
5094 };
5095
5096 } // namespace BlockTriDiContainerDetails
5097
5098} // namespace Ifpack2
5099
5100#endif
Definition Ifpack2_BlockTriDiContainer_decl.hpp:78
BlockHelperDetails::PartInterface< MatrixType > createPartInterface(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::Array< Teuchos::Array< typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type > > &partitions, const typename BlockHelperDetails::ImplType< MatrixType >::local_ordinal_type n_subparts_per_part_in)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1043
int applyInverseJacobi(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > &tpetra_importer, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, const bool overlap_communication_and_computation, const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &X, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Y, typename BlockHelperDetails::ImplType< MatrixType >::tpetra_multivector_type &Z, typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type_1d_view &W, const BlockHelperDetails::PartInterface< MatrixType > &interf, const BlockTridiags< MatrixType > &btdm, const BlockHelperDetails::AmD< MatrixType > &amd, typename BlockHelperDetails::ImplType< MatrixType >::vector_type_1d_view &work, BlockHelperDetails::NormManager< MatrixType > &norm_manager, const typename BlockHelperDetails::ImplType< MatrixType >::impl_scalar_type &damping_factor, bool is_y_zero, const int max_num_sweeps, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tol, const int check_tol_every)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:4875
void performSymbolicPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &g, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, BlockHelperDetails::AmD< MatrixType > &amd, const bool overlap_communication_and_computation, const Teuchos::RCP< AsyncableImport< MatrixType > > &async_importer, bool useSeqMethod)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1862
Kokkos::ViewAllocateWithoutInitializing do_not_initialize_tag
Definition Ifpack2_BlockTriDiContainer_impl.hpp:96
Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_import_type > createBlockCrsTpetraImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:163
Teuchos::RCP< AsyncableImport< MatrixType > > createBlockCrsAsyncImporter(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:884
void performNumericPhase(const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_row_matrix_type > &A, const Teuchos::RCP< const typename BlockHelperDetails::ImplType< MatrixType >::tpetra_crs_graph_type > &G, const BlockHelperDetails::PartInterface< MatrixType > &interf, BlockTridiags< MatrixType > &btdm, const typename BlockHelperDetails::ImplType< MatrixType >::magnitude_type tiny)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:3599
BlockTridiags< MatrixType > createBlockTridiags(const BlockHelperDetails::PartInterface< MatrixType > &interf)
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1620
Preconditioners and smoothers for Tpetra sparse matrices.
Definition Ifpack2_AdditiveSchwarz_decl.hpp:41
Definition Ifpack2_BlockComputeResidualVector.hpp:23
Definition Ifpack2_BlockHelper.hpp:188
Definition Ifpack2_BlockHelper.hpp:249
Kokkos::View< size_type *, device_type > size_type_1d_view
Definition Ifpack2_BlockHelper.hpp:320
size_t size_type
Definition Ifpack2_BlockHelper.hpp:253
Kokkos::Details::ArithTraits< scalar_type >::val_type impl_scalar_type
Definition Ifpack2_BlockHelper.hpp:262
Definition Ifpack2_BlockHelper.hpp:351
Definition Ifpack2_BlockHelper.hpp:215
Definition Ifpack2_BlockTriDiContainer_impl.hpp:140
Definition Ifpack2_BlockTriDiContainer_impl.hpp:1565
forward declaration
Definition Ifpack2_BlockTriDiContainer_impl.hpp:5069
Definition Ifpack2_BlockTriDiContainer_impl.hpp:3634