399 typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
400 typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
402 std::string nspName =
"Nullspace";
403 if (pL.isParameter(
"Nullspace name")) nspName = pL.get<std::string>(
"Nullspace name");
410 RCP<RealValuedMultiVector> fineCoords;
415 RCP<Matrix> Ptentative;
418 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
419 Ptentative = Teuchos::null;
420 Set(coarseLevel,
"P", Ptentative);
423 RCP<MultiVector> coarseNullspace;
424 RCP<RealValuedMultiVector> coarseCoords;
427 RCP<const Map> coarseCoordMap;
428 using array_type =
typename Map::global_indices_array_device_type;
431 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
432 blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
437 coarseCoordMap = coarseMap;
443 using range_policy = Kokkos::RangePolicy<typename Node::execution_space>;
444 array_type elementAList = coarseMap->getMyGlobalIndicesDevice();
445 GO indexBase = coarseMap->getIndexBase();
446 auto numElements = elementAList.size() / blkSize;
447 typename array_type::non_const_type elementList_nc(
"elementList", numElements);
450 Kokkos::parallel_for(
451 "Amalgamate Element List", range_policy(0, numElements), KOKKOS_LAMBDA(LO i) {
452 elementList_nc[i] = (elementAList[i * blkSize] - indexBase) / blkSize + indexBase;
454 array_type elementList = elementList_nc;
455 coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
456 elementList, indexBase, coarseMap->getComm());
459 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
462 auto uniqueMap = fineCoords->getMap();
463 RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
464 if (aggregates->AggregatesCrossProcessors()) {
465 auto nonUniqueMap = aggregates->GetMap();
466 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
468 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
469 ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
474 auto aggGraph = aggregates->GetGraph();
475 auto numAggs = aggGraph.numRows();
477 auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
478 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
484 const auto dim = fineCoords->getNumVectors();
486 typename AppendTrait<
decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
487 for (
size_t j = 0; j < dim; j++) {
488 Kokkos::parallel_for(
489 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
490 KOKKOS_LAMBDA(
const LO i) {
494 auto aggregate = aggGraph.rowConst(i);
496 coordinate_type sum = 0.0;
497 for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
498 sum += fineCoordsRandomView(aggregate(colID), j);
500 coarseCoordsView(i, j) = sum / aggregate.length;
506 if (!aggregates->AggregatesCrossProcessors()) {
507 if (Xpetra::Helpers<SC, LO, GO, NO>::isTpetraBlockCrs(A)) {
508 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
511 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.
GetLevelID());
514 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
524 if (A->IsView(
"stridedMaps") ==
true)
525 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
527 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
530 Set(coarseLevel,
"Coordinates", coarseCoords);
539 Set(coarseLevel,
"Nullspace", coarseNullspace);
540 Set(coarseLevel,
"P", Ptentative);
543 RCP<ParameterList> params = rcp(
new ParameterList());
544 params->set(
"printLoadBalancingInfo",
true);
552 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
553 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
554 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
555 auto rowMap = A->getRowMap();
556 auto colMap = A->getColMap();
558 const size_t numRows = rowMap->getLocalNumElements();
559 const size_t NSDim = fineNullspace->getNumVectors();
561 typedef Kokkos::ArithTraits<SC> ATS;
562 using impl_SC =
typename ATS::val_type;
563 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
564 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
566 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
571 aggGraph = aggregates->GetGraph();
573 auto aggRows = aggGraph.row_map;
574 auto aggCols = aggGraph.entries;
586 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
587 "(i.e. \"matching\" row and column maps)");
596 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
598 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
599 GO globalOffset = amalgInfo->GlobalOffset();
602 auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
603 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
604 const size_t numAggregates = aggregates->GetNumAggregates();
606 int myPID = aggregates->GetMap()->getComm()->getRank();
611 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
612 AggSizeType aggDofSizes;
614 if (stridedBlockSize == 1) {
618 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
620 auto sizesConst = aggregates->ComputeAggregateSizes();
621 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(1), numAggregates + 1)), sizesConst);
627 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
629 auto nodeMap = aggregates->GetMap()->getLocalMap();
630 auto dofMap = colMap->getLocalMap();
632 Kokkos::parallel_for(
633 "MueLu:TentativePF:Build:compute_agg_sizes",
range_type(0, numAggregates),
634 KOKKOS_LAMBDA(
const LO agg) {
635 auto aggRowView = aggGraph.rowConst(agg);
638 for (LO colID = 0; colID < aggRowView.length; colID++) {
639 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
641 for (LO k = 0; k < stridedBlockSize; k++) {
642 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
644 if (dofMap.getLocalElement(dofGID) != INVALID)
648 aggDofSizes(agg + 1) = size;
655 ReduceMaxFunctor<LO,
decltype(aggDofSizes)> reduceMax(aggDofSizes);
656 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
660 Kokkos::parallel_scan(
661 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0, numAggregates + 1),
662 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
663 update += aggDofSizes(i);
665 aggDofSizes(i) = update;
670 Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"agg2row_map_LO"), numRows);
674 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
675 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(0), numAggregates)));
677 Kokkos::parallel_for(
678 "MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
679 KOKKOS_LAMBDA(
const LO lnode) {
680 if (procWinner(lnode, 0) == myPID) {
682 auto aggID = vertex2AggId(lnode, 0);
684 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
688 for (LO k = 0; k < stridedBlockSize; k++)
689 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
696 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
699 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
700 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
704 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
705 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
706 typedef typename local_matrix_type::index_type::non_const_type cols_type;
707 typedef typename local_matrix_type::values_type::non_const_type vals_type;
710 typedef Kokkos::View<int[10], DeviceType> status_type;
711 status_type status(
"status");
713 typename AppendTrait<
decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
717 const bool& doQRStep = pL.get<
bool>(
"tentative: calculate qr");
721 GetOStream(
Warnings0) <<
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
724 size_t nnzEstimate = numRows * NSDim;
725 rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_rows"), numRows + 1);
726 cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_cols"), nnzEstimate);
727 vals_type valsAux(
"Ptent_aux_vals", nnzEstimate);
728 rows_type rows(
"Ptent_rows", numRows + 1);
735 Kokkos::parallel_for(
736 "MueLu:TentativePF:BuildPuncoupled:for1",
range_type(0, numRows + 1),
737 KOKKOS_LAMBDA(
const LO row) {
738 rowsAux(row) = row * NSDim;
740 Kokkos::parallel_for(
741 "MueLu:TentativePF:BuildUncoupled:for2",
range_type(0, nnzEstimate),
742 KOKKOS_LAMBDA(
const LO j) {
743 colsAux(j) = INVALID;
759 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
762 Kokkos::parallel_for(
763 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
764 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
765 auto agg = thread.league_rank();
768 LO aggSize = aggRows(agg + 1) - aggRows(agg);
773 auto norm = impl_ATS::magnitude(zero);
778 for (
decltype(aggSize) k = 0; k < aggSize; k++) {
779 auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
780 norm += dnorm * dnorm;
786 statusAtomic(1) =
true;
791 coarseNS(agg, 0) = norm;
794 for (
decltype(aggSize) k = 0; k < aggSize; k++) {
795 LO localRow = agg2RowMapLO(aggRows(agg) + k);
796 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
798 rows(localRow + 1) = 1;
799 colsAux(localRow) = agg;
800 valsAux(localRow) = localVal;
804 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
805 Kokkos::deep_copy(statusHost, status);
806 for (
decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
808 std::ostringstream oss;
809 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
811 case 0: oss <<
"!goodMap is not implemented";
break;
812 case 1: oss <<
"fine level NS part has a zero column";
break;
818 Kokkos::parallel_for(
819 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
820 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
821 auto agg = thread.league_rank();
824 LO aggSize = aggRows(agg + 1) - aggRows(agg);
827 coarseNS(agg, 0) = one;
830 for (
decltype(aggSize) k = 0; k < aggSize; k++) {
831 LO localRow = agg2RowMapLO(aggRows(agg) + k);
832 impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
834 rows(localRow + 1) = 1;
835 colsAux(localRow) = agg;
836 valsAux(localRow) = localVal;
841 Kokkos::parallel_reduce(
842 "MueLu:TentativeP:CountNNZ",
range_type(0, numRows + 1),
843 KOKKOS_LAMBDA(
const LO i,
size_t& nnz_count) {
844 nnz_count += rows(i);
862 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
864 decltype(aggDofSizes ),
decltype(maxAggSize),
decltype(agg2RowMapLO),
865 decltype(statusAtomic),
decltype(rows),
decltype(rowsAux),
decltype(colsAux),
867 localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
868 rows, rowsAux, colsAux, valsAux, doQRStep);
869 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
872 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
873 Kokkos::deep_copy(statusHost, status);
874 for (
decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
876 std::ostringstream oss;
877 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
879 case 0: oss <<
"!goodMap is not implemented";
break;
880 case 1: oss <<
"fine level NS part has a zero column";
break;
893 if (nnz != nnzEstimate) {
898 Kokkos::parallel_scan(
899 "MueLu:TentativePF:Build:compress_rows",
range_type(0, numRows + 1),
900 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
910 cols = cols_type(
"Ptent_cols", nnz);
911 vals = vals_type(
"Ptent_vals", nnz);
916 Kokkos::parallel_for(
917 "MueLu:TentativePF:Build:compress_cols_vals",
range_type(0, numRows),
918 KOKKOS_LAMBDA(
const LO i) {
919 LO rowStart = rows(i);
922 for (
auto j = rowsAux(i); j < rowsAux(i + 1); j++)
923 if (colsAux(j) != INVALID) {
924 cols(rowStart + lnnz) = colsAux(j);
925 vals(rowStart + lnnz) = valsAux(j);
937 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
943 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
946 RCP<ParameterList> FCparams;
947 if (pL.isSublist(
"matrixmatrix: kernel params"))
948 FCparams = rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
950 FCparams = rcp(
new ParameterList);
953 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
954 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") +
toString(levelID));
956 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
957 Ptentative = rcp(
new CrsMatrixWrap(PtentCrs));
964 RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
965 RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
966 RCP<MultiVector>& coarseNullspace,
const int levelID)
const {
976 RCP<const Map> rowMap = A->getRowMap();
977 RCP<const Map> rangeMap = A->getRangeMap();
978 RCP<const Map> colMap = A->getColMap();
980 const size_t numFineBlockRows = rowMap->getLocalNumElements();
984 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
986 typedef Kokkos::ArithTraits<SC> ATS;
987 using impl_SC =
typename ATS::val_type;
988 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
989 const impl_SC one = impl_ATS::one();
992 const size_t NSDim = fineNullspace->getNumVectors();
993 auto aggSizes = aggregates->ComputeAggregateSizes();
998 aggGraph = aggregates->GetGraph();
1000 auto aggRows = aggGraph.row_map;
1001 auto aggCols = aggGraph.entries;
1007 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1008 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1009 Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
1011 coarsePointMap->getIndexBase(),
1012 coarsePointMap->getComm());
1024 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1025 "(i.e. \"matching\" row and column maps)");
1034 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1036 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1040 auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1041 auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1042 const size_t numAggregates = aggregates->GetNumAggregates();
1044 int myPID = aggregates->GetMap()->getComm()->getRank();
1049 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1050 AggSizeType aggDofSizes;
1056 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
1058 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1064 ReduceMaxFunctor<LO,
decltype(aggDofSizes)> reduceMax(aggDofSizes);
1065 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1069 Kokkos::parallel_scan(
1070 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0, numAggregates + 1),
1071 KOKKOS_LAMBDA(
const LO i, LO& update,
const bool& final_pass) {
1072 update += aggDofSizes(i);
1074 aggDofSizes(i) = update;
1079 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"aggtorow_map_LO"), numFineBlockRows);
1083 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
1084 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(
static_cast<size_t>(0), numAggregates)));
1086 Kokkos::parallel_for(
1087 "MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
1088 KOKKOS_LAMBDA(
const LO lnode) {
1089 if (procWinner(lnode, 0) == myPID) {
1091 auto aggID = vertex2AggId(lnode, 0);
1093 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1097 for (LO k = 0; k < stridedBlockSize; k++)
1098 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1105 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1108 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1109 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1111 typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1112 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1113 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1117 typedef Kokkos::View<int[10], DeviceType> status_type;
1118 status_type status(
"status");
1120 typename AppendTrait<
decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
1130 rows_type ia(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows + 1);
1131 cols_type ja(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), numFineBlockRows);
1133 Kokkos::parallel_for(
1134 "MueLu:TentativePF:BlockCrs:graph_init",
range_type(0, numFineBlockRows),
1135 KOKKOS_LAMBDA(
const LO j) {
1139 if (j == (LO)numFineBlockRows - 1)
1140 ia[numFineBlockRows] = numFineBlockRows;
1144 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1145 Kokkos::parallel_for(
1146 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1147 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1148 auto agg = thread.league_rank();
1149 Xpetra::global_size_t offset = agg;
1152 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1154 for (LO j = 0; j < aggSize; j++) {
1156 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1157 const size_t rowStart = ia[localRow];
1158 ja[rowStart] = offset;
1168 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows + 1);
1170 Kokkos::parallel_scan(
1171 "MueLu:TentativePF:BlockCrs:compress_rows",
range_type(0, numFineBlockRows),
1172 KOKKOS_LAMBDA(
const LO i, LO& upd,
const bool&
final) {
1175 for (
auto j = ia[i]; j < ia[i + 1]; j++)
1176 if (ja[j] != INVALID)
1178 if (
final && i == (LO)numFineBlockRows - 1)
1179 i_temp[numFineBlockRows] = upd;
1183 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), nnz);
1185 Kokkos::parallel_for(
1186 "MueLu:TentativePF:BlockCrs:compress_cols",
range_type(0, numFineBlockRows),
1187 KOKKOS_LAMBDA(
const LO i) {
1188 size_t rowStart = i_temp[i];
1190 for (
auto j = ia[i]; j < ia[i + 1]; j++)
1191 if (ja[j] != INVALID) {
1192 j_temp[rowStart + lnnz] = ja[j];
1201 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1205 RCP<ParameterList> FCparams;
1206 if (pL.isSublist(
"matrixmatrix: kernel params"))
1207 FCparams = rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
1209 FCparams = rcp(
new ParameterList);
1211 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
1212 std::string levelIDs =
toString(levelID);
1213 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") + levelIDs);
1214 RCP<const Export> dummy_e;
1215 RCP<const Import> dummy_i;
1216 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1226 RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1227 RCP<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>> P_tpetra = rcp_dynamic_cast<Xpetra::TpetraBlockCrsMatrix<SC, LO, GO, NO>>(P_xpetra);
1228 if (P_tpetra.is_null())
throw std::runtime_error(
"BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1229 RCP<CrsMatrixWrap> P_wrap = rcp(
new CrsMatrixWrap(P_xpetra));
1231 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1232 const LO stride = NSDim * NSDim;
1234 Kokkos::parallel_for(
1235 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1236 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1237 auto agg = thread.league_rank();
1240 LO aggSize = aggRows(agg + 1) - aggRows(agg);
1241 Xpetra::global_size_t offset = agg * NSDim;
1244 for (LO j = 0; j < aggSize; j++) {
1245 LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1246 LO rowStart = localBlockRow * stride;
1247 for (LO r = 0; r < (LO)NSDim; r++) {
1248 LO localPointRow = localBlockRow * NSDim + r;
1249 for (LO c = 0; c < (LO)NSDim; c++) {
1250 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1256 for (LO j = 0; j < (LO)NSDim; j++)
1257 coarseNS(offset + j, j) = one;
1260 Ptentative = P_wrap;