55 typedef Teuchos::OrdinalTraits<GO> TOT;
62 RCP<const map_type>
const rowMap = A.
getRowMap();
63 RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
64 const int myRank = comm->getRank();
65 const size_t numProcs = comm->getSize();
68 bool alwaysUseParallelAlgorithm =
false;
69 if (params.isParameter(
"always use parallel algorithm"))
70 alwaysUseParallelAlgorithm = params.get<
bool>(
"always use parallel algorithm");
71 bool printMatrixMarketHeader =
true;
72 if (params.isParameter(
"print MatrixMarket header"))
73 printMatrixMarketHeader = params.get<
bool>(
"print MatrixMarket header");
75 if (printMatrixMarketHeader && myRank==0) {
76 std::time_t now = std::time(NULL);
78 const std::string dataTypeStr =
79 Teuchos::ScalarTraits<Scalar>::isComplex ?
"complex" :
"real";
89 os <<
"%%MatrixMarket matrix coordinate " << dataTypeStr <<
" general" << std::endl;
90 os <<
"% time stamp: " << ctime(&now);
91 os <<
"% written from " << numProcs <<
" processes" << std::endl;
92 os <<
"% point representation of Tpetra::BlockCrsMatrix" << std::endl;
95 os <<
"% " << numRows <<
" block rows, " << numCols <<
" block columns" << std::endl;
97 os <<
"% block size " << blockSize << std::endl;
98 os << numRows*blockSize <<
" " << numCols*blockSize <<
" " << A.
getGlobalNumEntries()*blockSize*blockSize << std::endl;
101 if (numProcs==1 && !alwaysUseParallelAlgorithm) {
104 size_t numRows = rowMap->getLocalNumElements();
107 RCP<const map_type> allMeshGidsMap = rcp(
new map_type(TOT::invalid(), numRows, A.
getIndexBase(), comm));
110 mv_type allMeshGids(allMeshGidsMap,1);
111 Teuchos::ArrayRCP<GO> allMeshGidsData = allMeshGids.getDataNonConst(0);
113 for (
size_t i=0; i<numRows; i++)
114 allMeshGidsData[i] = rowMap->getGlobalElement(i);
115 allMeshGidsData = Teuchos::null;
118 size_t stripSize = allMeshGids.getGlobalLength() / numProcs;
119 size_t remainder = allMeshGids.getGlobalLength() % numProcs;
121 size_t curStripSize = 0;
122 Teuchos::Array<GO> importMeshGidList;
123 for (
size_t i=0; i<numProcs; i++) {
125 curStripSize = stripSize;
126 if (i<remainder) curStripSize++;
127 importMeshGidList.resize(curStripSize);
128 for (
size_t j=0; j<curStripSize; j++) importMeshGidList[j] = j + curStart + A.
getIndexBase();
129 curStart += curStripSize;
132 TEUCHOS_TEST_FOR_EXCEPTION(myRank>0 && curStripSize!=0,
133 std::runtime_error,
"Tpetra::blockCrsMatrixWriter: (pid "
134 << myRank <<
") map size should be zero, but is " << curStripSize);
135 RCP<map_type> importMeshGidMap = rcp(
new map_type(TOT::invalid(), importMeshGidList(), A.
getIndexBase(), comm));
136 import_type gidImporter(allMeshGidsMap, importMeshGidMap);
137 mv_type importMeshGids(importMeshGidMap, 1);
138 importMeshGids.doImport(allMeshGids, gidImporter,
INSERT);
144 Teuchos::ArrayRCP<const GO> importMeshGidsData = importMeshGids.getData(0);
145 Teuchos::Array<GO> importMeshGidsGO;
146 importMeshGidsGO.reserve(importMeshGidsData.size());
147 for (
typename Teuchos::ArrayRCP<const GO>::size_type j=0; j<importMeshGidsData.size(); ++j)
148 importMeshGidsGO.push_back(importMeshGidsData[j]);
149 RCP<const map_type> importMap = rcp(
new map_type(TOT::invalid(), importMeshGidsGO(), rowMap->getIndexBase(), comm) );
153 RCP<crs_graph_type> graph =
createCrsGraph(importMap,numEntriesPerRow);
154 RCP<const map_type> domainMap = A.getCrsGraph().
getDomainMap();
155 graph->doImport(A.getCrsGraph(), importer,
INSERT);
156 graph->fillComplete(domainMap, importMap);
158 block_crs_matrix_type importA(*graph, A.
getBlockSize());
159 importA.doImport(A, importer,
INSERT);
172 using bcrs_local_inds_host_view_type =
typename bcrs_type::local_inds_host_view_type;
173 using bcrs_values_host_view_type =
typename bcrs_type::values_host_view_type;
177 RCP<const map_type> rowMap = A.
getRowMap();
178 RCP<const map_type> colMap = A.
getColMap();
179 RCP<const Teuchos::Comm<int> > comm = rowMap->getComm();
180 const int myRank = comm->getRank();
182 const size_t meshRowOffset = rowMap->getIndexBase();
183 const size_t meshColOffset = colMap->getIndexBase();
184 TEUCHOS_TEST_FOR_EXCEPTION(meshRowOffset != meshColOffset,
185 std::runtime_error,
"Tpetra::writeMatrixStrip: "
186 "mesh row index base != mesh column index base");
191 std::runtime_error,
"Tpetra::writeMatrixStrip: pid "
194 std::runtime_error,
"Tpetra::writeMatrixStrip: pid "
195 << myRank <<
" should have 0 columns but has " << A.
getLocalNumCols());
200 std::runtime_error,
"Tpetra::writeMatrixStrip: "
201 "number of rows on pid 0 does not match global number of rows");
207 bool precisionChanged=
false;
208 int oldPrecision = 0;
209 if (params.isParameter(
"precision")) {
210 oldPrecision = os.precision(params.get<
int>(
"precision"));
211 precisionChanged=
true;
214 if (params.isParameter(
"zero-based indexing")) {
215 if (params.get<
bool>(
"zero-based indexing") ==
true)
220 for (localRowInd = 0; localRowInd < numLocalRows; ++localRowInd) {
223 bcrs_local_inds_host_view_type localColInds;
224 bcrs_values_host_view_type vals;
226 A.
getLocalRowView (localRowInd, localColInds, vals); numEntries = localColInds.extent(0);
227 GO globalMeshRowID = rowMap->getGlobalElement(localRowInd) - meshRowOffset;
229 for (LO k = 0; k < numEntries; ++k) {
230 GO globalMeshColID = colMap->getGlobalElement(localColInds[k]) - meshColOffset;
233 for (LO j = 0; j < blockSize; ++j) {
234 GO globalPointRowID = globalMeshRowID * blockSize + j + pointOffset;
235 for (LO i = 0; i < blockSize; ++i) {
236 GO globalPointColID = globalMeshColID * blockSize + i + pointOffset;
239 os << globalPointRowID <<
" " << globalPointColID <<
" ";
240 if (Teuchos::ScalarTraits<impl_scalar_type>::isComplex) {
243 os << Teuchos::ScalarTraits<impl_scalar_type>::real (curVal) <<
" "
244 << Teuchos::ScalarTraits<impl_scalar_type>::imag (curVal);
254 if (precisionChanged)
255 os.precision(oldPrecision);
256 TEUCHOS_TEST_FOR_EXCEPTION(err != 0,
257 std::runtime_error,
"Tpetra::writeMatrixStrip: "
258 "error getting view of local row " << localRowInd);
268 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::getBlockCrsGraph", getBlockCrsGraph0);
277 using Teuchos::Array;
278 using Teuchos::ArrayView;
286 using row_map_type =
typename local_graph_device_type::row_map_type::non_const_type;
287 using entries_type =
typename local_graph_device_type::entries_type::non_const_type;
289 using offset_type =
typename row_map_type::non_const_value_type;
292 using range_type = Kokkos::RangePolicy<execution_space, LO>;
295 RCP<const map_type> meshRowMap, meshColMap, meshDomainMap, meshRangeMap;
302 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::getBlockCrsGraph::createMeshMaps", getBlockCrsGraph1);
307 Kokkos::DefaultExecutionSpace().fence();
310 if(meshColMap.is_null())
throw std::runtime_error(
"ERROR: Cannot create mesh colmap");
312 auto localMeshColMap = meshColMap->getLocalMap();
313 auto localPointColMap = pointColMap.getLocalMap();
314 auto localPointRowMap = pointRowMap.getLocalMap();
316 RCP<crs_graph_type> meshCrsGraph;
318 const offset_type bs2 = blockSize * blockSize;
321 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::getBlockCrsGraph::LID", getBlockCrsGraph2);
322 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
323 auto pointRowptr = pointLocalGraph.row_map;
324 auto pointColind = pointLocalGraph.entries;
326 TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
327 std::runtime_error,
"Tpetra::getBlockCrsGraph: "
328 "local number of non zero entries is not a multiple of blockSize^2 ");
330 LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
331 row_map_type blockRowptr(
"blockRowptr", block_rows+1);
332 entries_type blockColind(
"blockColind", pointColind.extent(0)/(bs2));
334 Kokkos::parallel_for(
"fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(
const LO i) {
336 const LO offset_b = pointRowptr(i*blockSize)/bs2;
337 const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
340 blockRowptr(i+1) = offset_b_max;
341 blockRowptr(i) = offset_b;
343 const LO offset_p = pointRowptr(i*blockSize);
345 for (LO k=0; k<offset_b_max-offset_b; ++k) {
346 blockColind(offset_b + k) = pointColind(offset_p + k * blockSize)/blockSize;
350 meshCrsGraph = rcp(
new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
351 meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
352 Kokkos::DefaultExecutionSpace().fence();
355 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::getBlockCrsGraph::GID", getBlockCrsGraph3);
356 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
357 auto pointRowptr = pointLocalGraph.row_map;
358 auto pointColind = pointLocalGraph.entries;
360 TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0,
361 std::runtime_error,
"Tpetra::getBlockCrsGraph: "
362 "local number of non zero entries is not a multiple of blockSize^2 ");
364 LO block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
365 row_map_type blockRowptr(
"blockRowptr", block_rows+1);
366 entries_type blockColind(
"blockColind", pointColind.extent(0)/(bs2));
368 Kokkos::parallel_for(
"fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(
const LO i) {
370 const LO offset_b = pointRowptr(i*blockSize)/bs2;
371 const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2;
374 blockRowptr(i+1) = offset_b_max;
375 blockRowptr(i) = offset_b;
377 const LO offset_p = pointRowptr(i*blockSize);
378 const LO offset_p_max = pointRowptr((i+1)*blockSize);
381 for (LO p_i=0; p_i<offset_p_max-offset_p; ++p_i) {
382 auto bcol_GID = localPointColMap.getGlobalElement(pointColind(offset_p + p_i))/blockSize;
383 auto bcol_LID = localMeshColMap.getLocalElement(bcol_GID);
385 bool visited =
false;
386 for (LO k=0; k<filled_block; ++k) {
387 if (blockColind(offset_b + k) == bcol_LID)
391 blockColind(offset_b + filled_block) = bcol_LID;
397 meshCrsGraph = rcp(
new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind));
398 meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap);
399 Kokkos::DefaultExecutionSpace().fence();
409 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::convertToBlockCrsMatrix", convertToBlockCrsMatrix0);
418 using Teuchos::Array;
419 using Teuchos::ArrayView;
427 using row_map_type =
typename local_graph_device_type::row_map_type::non_const_type;
428 using values_type =
typename local_matrix_device_type::values_type::non_const_type;
430 using offset_type =
typename row_map_type::non_const_value_type;
433 using range_type = Kokkos::RangePolicy<execution_space, LO>;
435 RCP<block_crs_matrix_type> blockMatrix;
437 const offset_type bs2 = blockSize * blockSize;
442 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::convertToBlockCrsMatrix::LID", convertToBlockCrsMatrix1);
443 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
444 auto pointRowptr = pointLocalGraph.row_map;
445 auto pointColind = pointLocalGraph.entries;
447 offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
448 values_type blockValues(
"values", meshCrsGraph->getLocalNumEntries()*bs2);
450 auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
452 Kokkos::parallel_for(
"copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
453 const offset_type blkBeg = blockRowptr[i];
454 const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
457 for (offset_type block=0; block < numBlocks; block++) {
460 for(LO little_row=0; little_row<blockSize; little_row++) {
461 offset_type point_row_offset = pointRowptr[i*blockSize + little_row];
462 for(LO little_col=0; little_col<blockSize; little_col++) {
463 blockValues((blkBeg+block) * bs2 + little_row * blockSize + little_col) =
464 pointValues[point_row_offset + block*blockSize + little_col];
470 blockMatrix = rcp(
new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
471 Kokkos::DefaultExecutionSpace().fence();
474 TEUCHOS_FUNC_TIME_MONITOR_DIFF(
"Tpetra::convertToBlockCrsMatrix::GID", convertToBlockCrsMatrix2);
475 auto localMeshColMap = meshCrsGraph->getColMap()->getLocalMap();
476 auto localPointColMap = pointMatrix.
getColMap()->getLocalMap();
478 values_type blockValues(
"values", meshCrsGraph->getLocalNumEntries()*bs2);
480 auto pointLocalGraph = pointMatrix.
getCrsGraph()->getLocalGraphDevice();
481 auto pointRowptr = pointLocalGraph.row_map;
482 auto pointColind = pointLocalGraph.entries;
484 offset_type block_rows = pointRowptr.extent(0) == 0 ? 0 : (pointRowptr.extent(0)-1)/blockSize;
486 auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map;
487 auto blockColind = meshCrsGraph->getLocalGraphDevice().entries;
489 row_map_type pointGColind(
"pointGColind", pointColind.extent(0));
491 Kokkos::parallel_for(
"computePointGColind",range_type(0,pointColind.extent(0)),KOKKOS_LAMBDA(
const LO i) {
492 pointGColind(i) = localPointColMap.getGlobalElement(pointColind(i));
495 row_map_type blockGColind(
"blockGColind", blockColind.extent(0));
497 Kokkos::parallel_for(
"computeBlockGColind",range_type(0,blockGColind.extent(0)),KOKKOS_LAMBDA(
const LO i) {
498 blockGColind(i) = localMeshColMap.getGlobalElement(blockColind(i));
501 Kokkos::parallel_for(
"copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
502 const offset_type blkBeg = blockRowptr[i];
503 const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
505 for (offset_type point_i=0; point_i < pointRowptr[i*blockSize + 1] - pointRowptr[i*blockSize]; point_i++) {
507 offset_type block_inv=
static_cast<offset_type
>(-1);
508 offset_type little_col_inv=
static_cast<offset_type
>(-1);
509 for (offset_type block_2=0; block_2 < numBlocks; block_2++) {
510 for (LO little_col_2=0; little_col_2 < blockSize; little_col_2++) {
511 if (blockGColind(blkBeg+block_2)*blockSize + little_col_2 == pointGColind(pointRowptr[i*blockSize] + point_i)) {
513 little_col_inv = little_col_2;
517 if (block_inv!=
static_cast<offset_type
>(-1))
521 for(LO little_row=0; little_row<blockSize; little_row++) {
522 blockValues((blkBeg+block_inv) * bs2 + little_row * blockSize + little_col_inv) = pointValues[pointRowptr[i*blockSize+little_row] + point_i];
527 blockMatrix = rcp(
new block_crs_matrix_type(*meshCrsGraph, blockValues, blockSize));
528 Kokkos::DefaultExecutionSpace().fence();
541 using dev_row_view_t =
typename crs_t::local_graph_device_type::row_map_type::non_const_type;
542 using dev_col_view_t =
typename crs_t::local_graph_device_type::entries_type::non_const_type;
543 using dev_val_view_t =
typename crs_t::local_matrix_device_type::values_type::non_const_type;
544 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
545 using team_policy = Kokkos::TeamPolicy<execution_space>;
546 using member_type =
typename team_policy::member_type;
547 using scratch_view = Kokkos::View<bool*, typename execution_space::scratch_memory_space>;
548 using Ordinal =
typename dev_row_view_t::non_const_value_type;
552 auto row_ptrs = local.graph.row_map;
553 auto col_inds = local.graph.entries;
554 auto values = local.values;
562 dev_row_view_t new_rowmap(
"new_rowmap", nrows+1);
563 const auto blocks_per_row = nrows / blockSize;
564 dev_row_view_t active_block_row_map(
"active_block_row_map", blocks_per_row + 1);
565 const int max_threads = execution_space::concurrency();
566 assert(blockSize > 1);
567 assert(nrows % blockSize == 0);
568 const int mem_level = 1;
569 const int bytes = scratch_view::shmem_size(blocks_per_row);
571 if (max_threads >= blockSize) {
573 team_policy tp(blocks_per_row, blockSize);
576 Kokkos::parallel_for(
"countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
577 Ordinal block_row = team.league_rank();
579 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
581 Kokkos::PerTeam(team), [&] () {
582 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
583 row_block_active(row_block_idx) =
false;
590 Kokkos::parallel_for(
591 Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
593 Ordinal row = block_row*blockSize + block_offset;
594 Ordinal row_itr = row_ptrs(row);
595 Ordinal row_end = row_ptrs(row+1);
597 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
598 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
599 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
600 Ordinal curr_nnz_col = col_inds(row_itr);
601 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
603 row_block_active(row_block_idx) =
true;
605 if (row_itr == row_end)
break;
606 curr_nnz_col = col_inds(row_itr);
614 Kokkos::PerTeam(team), [&] () {
616 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
617 if (row_block_active(row_block_idx)) {
621 active_block_row_map(block_row) = count;
628 team_policy tp(blocks_per_row, 1);
631 Kokkos::parallel_for(
"countActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
632 Ordinal block_row = team.league_rank();
634 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
635 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
636 row_block_active(row_block_idx) =
false;
640 for (
int block_offset=0; block_offset < blockSize; ++block_offset) {
641 Ordinal row = block_row*blockSize + block_offset;
642 Ordinal row_itr = row_ptrs(row);
643 Ordinal row_end = row_ptrs(row+1);
645 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
646 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
647 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
648 Ordinal curr_nnz_col = col_inds(row_itr);
649 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
651 row_block_active(row_block_idx) =
true;
653 if (row_itr == row_end)
break;
654 curr_nnz_col = col_inds(row_itr);
660 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
661 if (row_block_active(row_block_idx)) {
665 active_block_row_map(block_row) = count;
669 Ordinal nnz_block_count = 0;
670#if KOKKOSKERNELS_VERSION >= 40199
671 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
672 execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
674 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
675 dev_row_view_t,
execution_space>(active_block_row_map.extent(0), active_block_row_map, nnz_block_count);
677 dev_col_view_t block_col_ids(
"block_col_ids", nnz_block_count);
680 if (max_threads >= blockSize) {
682 team_policy tp(blocks_per_row, blockSize);
684 Kokkos::parallel_for(
"findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
685 Ordinal block_row = team.league_rank();
687 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
689 Kokkos::PerTeam(team), [&] () {
690 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
691 row_block_active(row_block_idx) =
false;
698 Kokkos::parallel_for(
699 Kokkos::TeamThreadRange(team, blockSize), [&] (Ordinal block_offset) {
701 Ordinal row = block_row*blockSize + block_offset;
702 Ordinal row_itr = row_ptrs(row);
703 Ordinal row_end = row_ptrs(row+1);
705 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
706 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
707 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
708 Ordinal curr_nnz_col = col_inds(row_itr);
709 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
711 row_block_active(row_block_idx) =
true;
713 if (row_itr == row_end)
break;
714 curr_nnz_col = col_inds(row_itr);
722 Kokkos::PerTeam(team), [&] () {
723 Ordinal offset = active_block_row_map[block_row];
724 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
725 if (row_block_active(row_block_idx)) {
726 block_col_ids(offset) = row_block_idx;
734 team_policy tp(blocks_per_row, 1);
736 Kokkos::parallel_for(
"findActiveBlocks", tp.set_scratch_size(mem_level, Kokkos::PerTeam(bytes)), KOKKOS_LAMBDA(
const member_type& team) {
737 Ordinal block_row = team.league_rank();
739 scratch_view row_block_active(team.team_scratch(mem_level), blocks_per_row);
740 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
741 row_block_active(row_block_idx) =
false;
745 for (
int block_offset=0; block_offset < blockSize; ++block_offset) {
746 Ordinal row = block_row*blockSize + block_offset;
747 Ordinal row_itr = row_ptrs(row);
748 Ordinal row_end = row_ptrs(row+1);
750 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
751 const Ordinal first_possible_col_in_block = row_block_idx * blockSize;
752 const Ordinal first_possible_col_in_next_block = (row_block_idx+1) * blockSize;
753 Ordinal curr_nnz_col = col_inds(row_itr);
754 while (curr_nnz_col >= first_possible_col_in_block && curr_nnz_col < first_possible_col_in_next_block && row_itr < row_end) {
756 row_block_active(row_block_idx) =
true;
758 if (row_itr == row_end)
break;
759 curr_nnz_col = col_inds(row_itr);
764 Ordinal offset = active_block_row_map[block_row];
765 for (
size_t row_block_idx = 0; row_block_idx < blocks_per_row; ++row_block_idx) {
766 if (row_block_active(row_block_idx)) {
767 block_col_ids(offset) = row_block_idx;
775 Kokkos::parallel_for(
"sizing", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
776 const auto block_row = row / blockSize;
777 const Ordinal block_row_begin = active_block_row_map(block_row);
778 const Ordinal block_row_end = active_block_row_map(block_row+1);
779 const Ordinal row_nnz = (block_row_end - block_row_begin) * blockSize;
780 new_rowmap(row) = row_nnz;
783 Ordinal new_nnz_count = 0;
784#if KOKKOSKERNELS_VERSION >= 40199
785 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
788 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
789 dev_row_view_t,
execution_space>(new_rowmap.extent(0), new_rowmap, new_nnz_count);
792 dev_col_view_t new_col_ids(
"new_col_ids", new_nnz_count);
793 dev_val_view_t new_vals(
"new_vals", new_nnz_count);
794 Kokkos::parallel_for(
"entries", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
795 Ordinal row_itr = row_ptrs(row);
796 Ordinal row_end = row_ptrs(row+1);
797 Ordinal row_itr_new = new_rowmap(row);
799 Ordinal block_row = row / blockSize;
800 Ordinal block_row_begin = active_block_row_map(block_row);
801 Ordinal block_row_end = active_block_row_map(block_row+1);
803 for (Ordinal row_block_idx = block_row_begin; row_block_idx < block_row_end; ++row_block_idx) {
804 const Ordinal block_col = block_col_ids(row_block_idx);
805 const Ordinal first_possible_col_in_block = block_col * blockSize;
806 const Ordinal first_possible_col_in_next_block = (block_col+1) * blockSize;
807 for (Ordinal possible_col = first_possible_col_in_block; possible_col < first_possible_col_in_next_block; ++possible_col, ++row_itr_new) {
808 new_col_ids(row_itr_new) = possible_col;
809 Ordinal curr_nnz_col = col_inds(row_itr);
810 if (possible_col == curr_nnz_col && row_itr < row_end) {
812 new_vals(row_itr_new) = values(row_itr);
820 auto crs_row_map = pointMatrix.
getRowMap();
821 auto crs_col_map = pointMatrix.
getColMap();
822 Teuchos::RCP<crs_t> result = Teuchos::rcp(
new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
823 result->fillComplete();
832 using dev_row_view_t =
typename crs_t::local_graph_device_type::row_map_type::non_const_type;
833 using dev_col_view_t =
typename crs_t::local_graph_device_type::entries_type::non_const_type;
834 using dev_val_view_t =
typename crs_t::local_matrix_device_type::values_type::non_const_type;
835 using impl_scalar_t =
typename Kokkos::ArithTraits<Scalar>::val_type;
836 using STS = Kokkos::ArithTraits<impl_scalar_t>;
837 using Ordinal =
typename dev_row_view_t::non_const_value_type;
839 using range_type = Kokkos::RangePolicy<execution_space, size_t>;
842 auto local = pointMatrix.getLocalMatrixHost();
843 auto row_ptrs = local.graph.row_map;
844 auto col_inds = local.graph.entries;
845 auto values = local.values;
849 dev_row_view_t new_rowmap(
"new_rowmap", nrows+1);
850 const impl_scalar_t zero = STS::zero();
851 Kokkos::parallel_for(
"sizing", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
852 const Ordinal row_nnz_begin = row_ptrs(row);
853 Ordinal row_nnzs = 0;
854 for (Ordinal row_nnz = row_nnz_begin; row_nnz < row_ptrs(row+1); ++row_nnz) {
855 const impl_scalar_t value = values(row_nnz);
860 new_rowmap(row) = row_nnzs;
863 Ordinal real_nnzs = 0;
864#if KOKKOSKERNELS_VERSION >= 40199
865 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
868 KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum<
869 dev_row_view_t,
execution_space>(new_rowmap.extent(0), new_rowmap, real_nnzs);
872 dev_col_view_t new_col_ids(
"new_col_ids", real_nnzs);
873 dev_val_view_t new_vals(
"new_vals", real_nnzs);
874 Kokkos::parallel_for(
"entries", range_type(0, nrows), KOKKOS_LAMBDA(
const size_t row) {
875 Ordinal row_nnzs = 0;
876 const Ordinal old_row_nnz_begin = row_ptrs(row);
877 const Ordinal new_row_nnz_begin = new_rowmap(row);
878 for (Ordinal old_row_nnz = old_row_nnz_begin; old_row_nnz < row_ptrs(row+1); ++old_row_nnz) {
879 const impl_scalar_t value = values(old_row_nnz);
881 new_col_ids(new_row_nnz_begin + row_nnzs) = col_inds(old_row_nnz);
882 new_vals(new_row_nnz_begin + row_nnzs) = value;
889 auto crs_row_map = pointMatrix.
getRowMap();
890 auto crs_col_map = pointMatrix.
getColMap();
891 Teuchos::RCP<crs_t> result = Teuchos::rcp(
new crs_t(crs_row_map, crs_col_map, new_rowmap, new_col_ids, new_vals));
892 result->fillComplete();
972 using Teuchos::Array;
973 using Teuchos::ArrayView;
980 using crs_graph_type =
typename block_crs_matrix_type::crs_graph_type;
983 using row_map_type =
typename local_graph_device_type::row_map_type::non_const_type;
984 using entries_type =
typename local_graph_device_type::entries_type::non_const_type;
985 using values_type =
typename local_matrix_device_type::values_type::non_const_type;
987 using row_map_type_const =
typename local_graph_device_type::row_map_type;
988 using entries_type_const =
typename local_graph_device_type::entries_type;
990 using little_block_type =
typename block_crs_matrix_type::const_little_block_type;
991 using offset_type =
typename row_map_type::non_const_value_type;
992 using column_type =
typename entries_type::non_const_value_type;
995 using range_type = Kokkos::RangePolicy<execution_space, LO>;
999 const offset_type bs2 = blocksize * blocksize;
1001 size_t point_nnz = block_nnz * bs2;
1004 RCP<const map_type> pointDomainMap = blockMatrix.
getDomainMap();
1005 RCP<const map_type> pointRangeMap = blockMatrix.
getRangeMap();
1008 RCP<const map_type> blockRowMap = blockMatrix.
getRowMap();
1011 RCP<const map_type> blockColMap = blockMatrix.
getColMap();
1017 LO point_rows = (LO) pointRowMap->getLocalNumElements();
1018 LO block_rows = (LO) blockRowMap->getLocalNumElements();
1019 auto blockValues = blockMatrix.getValuesDevice();
1020 auto blockLocalGraph = blockGraph.getLocalGraphDevice();
1021 row_map_type_const blockRowptr = blockLocalGraph.row_map;
1022 entries_type_const blockColind = blockLocalGraph.entries;
1025 row_map_type rowptr(
"row_map", point_rows+1);
1026 entries_type colind(
"entries", point_nnz);
1027 values_type values(
"values", point_nnz);
1030 Kokkos::parallel_for(
"fillRowPtr",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
1031 offset_type base = blockRowptr[i];
1032 offset_type diff = blockRowptr[i+1] - base;
1033 for(LO j=0; j<blocksize; j++) {
1034 rowptr[i*blocksize +j] = base*bs2 + j*diff*blocksize;
1038 if(i==block_rows-1) {
1039 rowptr[block_rows*blocksize] = blockRowptr[i+1] * bs2;
1044 Kokkos::parallel_for(
"copyEntriesAndValues",range_type(0,block_rows),KOKKOS_LAMBDA(
const LO i) {
1045 const offset_type blkBeg = blockRowptr[i];
1046 const offset_type numBlocks = blockRowptr[i+1] - blkBeg;
1049 for (offset_type block=0; block < numBlocks; block++) {
1050 column_type point_col_base = blockColind[blkBeg + block] * blocksize;
1051 little_block_type my_block(blockValues.data () + (blkBeg+block) * bs2, blocksize, blocksize);
1054 for(LO little_row=0; little_row<blocksize; little_row++) {
1055 offset_type point_row_offset = rowptr[i*blocksize + little_row];
1056 for(LO little_col=0; little_col<blocksize; little_col++) {
1057 colind[point_row_offset + block*blocksize + little_col] = point_col_base + little_col;
1058 values[point_row_offset + block*blocksize + little_col] = my_block(little_row,little_col);
1066 RCP<crs_matrix_type> pointCrsMatrix = rcp(
new crs_matrix_type(pointRowMap, pointColMap, 0));
1067 pointCrsMatrix->setAllValues(rowptr,colind,values);
1070 pointCrsMatrix->expertStaticFillComplete(pointDomainMap,pointRangeMap);
1072 return pointCrsMatrix;