Intrepid2_IntegrationToolsDef.hpp
1// @HEADER
2// *****************************************************************************
3// Intrepid2 Package
4//
5// Copyright 2007 NTESS and the Intrepid2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
14
15#ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
16#define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__
17
21
22#include "Teuchos_TimeMonitor.hpp"
23
24namespace Intrepid2 {
25
26 namespace Impl
27 {
31 template<class Scalar, class DeviceType, int integralViewRank>
32 class F_Integrate
33 {
34 using ExecutionSpace = typename DeviceType::execution_space;
35 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
36 using TeamMember = typename TeamPolicy::member_type;
37
38 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
39 IntegralViewType integralView_;
40 TensorData<Scalar,DeviceType> leftComponent_;
41 Data<Scalar,DeviceType> composedTransform_;
42 TensorData<Scalar,DeviceType> rightComponent_;
43 TensorData<Scalar,DeviceType> cellMeasures_;
44 int a_offset_;
45 int b_offset_;
46 int leftComponentSpan_; // leftComponentSpan tracks the dimensions spanned by the left component
47 int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
48 int numTensorComponents_;
49 int leftFieldOrdinalOffset_;
50 int rightFieldOrdinalOffset_;
51 bool forceNonSpecialized_; // if true, don't use the specialized (more manual) implementation(s) for particular component counts. Primary use case is for testing.
52
53 size_t fad_size_output_ = 0; // 0 if not a fad type
54
55 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
56
57 // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
58 // (this also makes it easier to reorder loops, etc., for further optimizations)
59 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
60 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
61 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
62
63 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_; // total number of enumeration indices once the arguments prior to startingComponent are held fixed
64 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
65
66 int maxFieldsLeft_;
67 int maxFieldsRight_;
68 int maxPointCount_;
69 public:
70 F_Integrate(Data<Scalar,DeviceType> integralData,
71 TensorData<Scalar,DeviceType> leftComponent,
72 Data<Scalar,DeviceType> composedTransform,
73 TensorData<Scalar,DeviceType> rightComponent,
74 TensorData<Scalar,DeviceType> cellMeasures,
75 int a_offset,
76 int b_offset,
77 int leftFieldOrdinalOffset,
78 int rightFieldOrdinalOffset,
79 bool forceNonSpecialized)
80 :
81 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
82 leftComponent_(leftComponent),
83 composedTransform_(composedTransform),
84 rightComponent_(rightComponent),
85 cellMeasures_(cellMeasures),
86 a_offset_(a_offset),
87 b_offset_(b_offset),
88 leftComponentSpan_(leftComponent.extent_int(2)),
89 rightComponentSpan_(rightComponent.extent_int(2)),
90 numTensorComponents_(leftComponent.numTensorComponents()),
91 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
92 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
93 forceNonSpecialized_(forceNonSpecialized)
94 {
95 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");
96
97 // set up bounds containers
98 const int FIELD_DIM = 0;
99 const int POINT_DIM = 1;
100 maxFieldsLeft_ = 0;
101 maxFieldsRight_ = 0;
102 maxPointCount_ = 0;
103 for (int r=0; r<numTensorComponents_; r++)
104 {
105 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
106 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
107 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
108 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
109 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
110 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
111 }
112
113 // set up relative enumeration spans: the total number of enumeration indices once the arguments prior to startingComponent are held fixed. These spans are for *truncated* iterators (the final component is excluded); hence the -2 rather than -1 for the first startingComponent value.
114 int leftRelativeEnumerationSpan = 1;
115 int rightRelativeEnumerationSpan = 1;
116 for (int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
117 {
118 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
119 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
120 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
121 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
122 }
123
124 // prepare for allocation of temporary storage
125 // note: tempStorage goes "backward", starting from the final component, which needs just one entry
126
127 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
128 if (allocateFadStorage)
129 {
130 fad_size_output_ = dimension_scalar(integralView_);
131 }
132
133 const int R = numTensorComponents_ - 1;
134
135 int num_ij = 1; // this counts how many entries there are corresponding to components from r to R-1.
136 int allocationSoFar = 0;
137 offsetsForComponentOrdinal_[R] = allocationSoFar;
138 allocationSoFar++; // we store one entry corresponding to R, the final component
139
140 for (int r=R-1; r>0; r--) // last component is innermost in the for loops (requires least storage)
141 {
142 const int leftFields = leftComponent.getTensorComponent(r).extent_int(0);
143 const int rightFields = rightComponent.getTensorComponent(r).extent_int(0);
144
145 num_ij *= leftFields * rightFields;
146
147 offsetsForComponentOrdinal_[r] = allocationSoFar;
148 allocationSoFar += num_ij;
149 }
150 offsetsForComponentOrdinal_[0] = allocationSoFar; // first component stores directly to final integralView.
151 }
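To make the "backward" scratch layout concrete, here is a host-only sketch (sizes are hypothetical, not taken from this file) that reproduces the offset computation above for three tensor components, each with two left and two right fields: one entry is reserved for the final component (Gz), one entry per (left,right) field pair of the middle component (Gy), and offsetsForComponentOrdinal_[0] ends up doubling as the per-thread scratch size.

#include <array>
#include <cassert>

int main()
{
  constexpr int R = 2;                               // final component ordinal (numTensorComponents - 1)
  const std::array<int,3> leftFields  = {2,2,2};     // hypothetical per-component field counts
  const std::array<int,3> rightFields = {2,2,2};

  std::array<int,3> offsets{};
  int num_ij = 1, allocationSoFar = 0;
  offsets[R] = allocationSoFar;                      // a single entry for the final component
  allocationSoFar++;
  for (int r = R-1; r > 0; r--)
  {
    num_ij *= leftFields[r] * rightFields[r];        // one entry per (left,right) field pair
    offsets[r] = allocationSoFar;
    allocationSoFar += num_ij;
  }
  offsets[0] = allocationSoFar;                      // component 0 writes directly to the integral view

  assert(offsets[2] == 0 && offsets[1] == 1 && offsets[0] == 5);
  return 0;
}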
152
153 template<size_t maxComponents, size_t numComponents = maxComponents>
154 KOKKOS_INLINE_FUNCTION
155 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
156 const Kokkos::Array<int,maxComponents> &bounds) const
157 {
158 if (numComponents == 0)
159 {
160 return -1;
161 }
162 else
163 {
164 int r = static_cast<int>(numComponents - 1);
165 while (arguments[r] + 1 >= bounds[r])
166 {
167 arguments[r] = 0; // reset
168 r--;
169 if (r < 0) break;
170 }
171 if (r >= 0) ++arguments[r];
172 return r;
173 }
174 }
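The increment helpers above behave like an odometer over the component arguments, with the last component varying fastest; the return value is the lowest slot that was bumped, or -1 once every argument combination has been visited. A standalone illustration of the same carry/reset logic (host-only, using std::array; not part of this header):

#include <array>
#include <cstdio>

// Restatement of the incrementArgument logic above for a host-only demo.
template<size_t N>
int increment(std::array<int,N> &args, const std::array<int,N> &bounds)
{
  int r = static_cast<int>(N) - 1;
  while (r >= 0 && args[r] + 1 >= bounds[r])
  {
    args[r] = 0; // reset this slot and carry into the next-slower one
    r--;
  }
  if (r >= 0) ++args[r];
  return r;
}

int main()
{
  std::array<int,2> args{};            // start at (0,0)
  const std::array<int,2> bounds{2,3};
  do
  {
    printf("(%d,%d)\n", args[0], args[1]);
  } while (increment(args, bounds) >= 0);
  // visits (0,0) (0,1) (0,2) (1,0) (1,1) (1,2); the final call returns -1
  return 0;
}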
175
177 KOKKOS_INLINE_FUNCTION
178 int incrementArgument( Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
179 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
180 const int &numComponents) const
181 {
182 if (numComponents == 0) return -1;
183 int r = static_cast<int>(numComponents - 1);
184 while (arguments[r] + 1 >= bounds[r])
185 {
186 arguments[r] = 0; // reset
187 r--;
188 if (r < 0) break;
189 }
190 if (r >= 0) ++arguments[r];
191 return r;
192 }
193
194 template<size_t maxComponents, size_t numComponents = maxComponents>
195 KOKKOS_INLINE_FUNCTION
196 int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
197 const Kokkos::Array<int,maxComponents> &bounds) const
198 {
199 if (numComponents == 0)
200 {
201 return -1;
202 }
203 else
204 {
205 int r = static_cast<int>(numComponents - 1);
206 while (arguments[r] + 1 >= bounds[r])
207 {
208 r--;
209 if (r < 0) break;
210 }
211 return r;
212 }
213 }
214
216 KOKKOS_INLINE_FUNCTION
217 int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
218 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
219 const int &numComponents) const
220 {
221 if (numComponents == 0) return -1;
222 int r = numComponents - 1;
223 while (arguments[r] + 1 >= bounds[r])
224 {
225 r--;
226 if (r < 0) break;
227 }
228 return r;
229 }
230
231 template<size_t maxComponents, size_t numComponents = maxComponents>
232 KOKKOS_INLINE_FUNCTION
233 int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
234 const Kokkos::Array<int,maxComponents> &bounds,
235 const int startIndex) const
236 {
237 // the following mirrors what is done in TensorData
238 if (numComponents == 0)
239 {
240 return 0;
241 }
242 else
243 {
244 int enumerationIndex = 0;
245 for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
246 {
247 enumerationIndex += arguments[r];
248 enumerationIndex *= bounds[r-1];
249 }
250 enumerationIndex += arguments[startIndex];
251 return enumerationIndex;
252 }
253 }
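In other words, with startIndex = 0 the first component is the fastest-varying index, matching TensorData's enumeration: for three components the loop computes ((a2 * b1) + a1) * b0 + a0. As a worked example with hypothetical bounds (2,3,4) and arguments (1,2,3), the enumeration index is ((3 * 3) + 2) * 2 + 1 = 23, the last of the 2 * 3 * 4 = 24 possible argument combinations.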
254
256
257 // nvcc refuses to compile the below, giving the error "explicit specialization is not allowed in the current scope"; Clang accepts it. We therefore provide a non-templated version, runSpecialized3(), below.
258// template<size_t numTensorComponents>
259// KOKKOS_INLINE_FUNCTION
260// void runSpecialized( const TeamMember & teamMember ) const;
261
262// template<>
263// KOKKOS_INLINE_FUNCTION
264// void runSpecialized<3>( const TeamMember & teamMember ) const
265 KOKKOS_INLINE_FUNCTION
266 void runSpecialized3( const TeamMember & teamMember ) const
267 {
268 constexpr int numTensorComponents = 3;
269
270 Kokkos::Array<int,numTensorComponents> pointBounds;
271 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
272 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
273 for (unsigned r=0; r<numTensorComponents; r++)
274 {
275 pointBounds[r] = pointBounds_[r];
276 leftFieldBounds[r] = leftFieldBounds_[r];
277 rightFieldBounds[r] = rightFieldBounds_[r];
278 }
279
280 const int cellDataOrdinal = teamMember.league_rank();
281 const int numThreads = teamMember.team_size(); // num threads
282 const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
283
284 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView; // for caching partial integration values
285 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
286 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z; // cache the field values for faster access
287 if (fad_size_output_ > 0) {
288 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
289 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
290 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
291 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
292 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
293 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
294 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
295 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
296 }
297 else {
298 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
299 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
300 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
301 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
302 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
303 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
304 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
305 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
306 }
307
308// int approximateFlopCount = 0;
309// int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
310
311 constexpr int R = numTensorComponents - 1;
312
313 const int composedTransformRank = composedTransform_.rank();
314
315 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
316 {
317 const int a = a_offset_ + a_component;
318 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
319 {
320 const int b = b_offset_ + b_component;
321
322 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
323 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
324
325 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0); // shape (F,P[,D])
326 const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])
327
328 const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
329 const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
330 const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
331 const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
332 const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
333 const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
334
335 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
336 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (const int& fieldOrdinalPointOrdinal) {
337 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
338 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
339 if ((fieldOrdinal < leftTensorComponent_x.extent_int(0)) && (pointOrdinal < leftTensorComponent_x.extent_int(1)))
340 {
341 const int leftRank = leftTensorComponent_x.rank();
342 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
343 }
344 if ((fieldOrdinal < leftTensorComponent_y.extent_int(0)) && (pointOrdinal < leftTensorComponent_y.extent_int(1)))
345 {
346 const int leftRank = leftTensorComponent_y.rank();
347 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
348 }
349 if ((fieldOrdinal < leftTensorComponent_z.extent_int(0)) && (pointOrdinal < leftTensorComponent_z.extent_int(1)))
350 {
351 const int leftRank = leftTensorComponent_z.rank();
352 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
353 }
354 if ((fieldOrdinal < rightTensorComponent_x.extent_int(0)) && (pointOrdinal < rightTensorComponent_x.extent_int(1)))
355 {
356 const int rightRank = rightTensorComponent_x.rank();
357 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
358 }
359 if ((fieldOrdinal < rightTensorComponent_y.extent_int(0)) && (pointOrdinal < rightTensorComponent_y.extent_int(1)))
360 {
361 const int rightRank = rightTensorComponent_y.rank();
362 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
363 }
364 if ((fieldOrdinal < rightTensorComponent_z.extent_int(0)) && (pointOrdinal < rightTensorComponent_z.extent_int(1)))
365 {
366 const int rightRank = rightTensorComponent_z.rank();
367 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
368 }
369 });
370
371 if (composedTransform_.underlyingMatchesLogical())
372 {
373 if (composedTransformRank == 4) // (C,P,D,D)
374 {
375 const auto & composedTransformView = composedTransform_.getUnderlyingView4();
376 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
377 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
378 });
379 }
380 else // rank 2, then: (C,P)
381 {
382 const auto & composedTransformView = composedTransform_.getUnderlyingView2();
383 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (const int& pointOrdinal) {
384 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
385 });
386 }
387 }
388 else
389 {
390 if (composedTransformRank == 4) // (C,P,D,D)
391 {
392 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
393 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
394 });
395 }
396 else // rank 2, then: (C,P)
397 {
398 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
399 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
400 });
401 }
402 }
403
404 // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
405
406 // synchronize threads
407 teamMember.team_barrier();
408
409 // Setting scratchView to 0 here is not necessary from an algorithmic point of view, but *might* help with performance (due to a first-touch policy)
410 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
411 for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
412 {
413 scratchView(i) = 0.0;
414 }
415
416 // TODO: consider adding an innerLoopRange that is sized to be the maximum of the size of the inner loops we'd like to parallelize over. (note that this means we do the work in the outer loop redundantly that many times…)
417 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
418 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
419 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
420
421 // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
422 // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
423 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
424 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
425 rightFieldArguments[R] = jz;
426 leftFieldArguments[R] = iz;
427
428 Kokkos::Array<int,numTensorComponents> pointArguments;
429 for (int i=0; i<numTensorComponents; i++)
430 {
431 pointArguments[i] = 0;
432 }
433
434 for (int lx=0; lx<pointBounds[0]; lx++)
435 {
436 pointArguments[0] = lx;
437
438 // clear Gy scratch:
439 // in scratch, Gz (1 entry) comes first, then Gy entries.
440 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
441 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
442
443 for (int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
444 {
445 scratchView(Gy_index) = 0;
446 }
447
448 for (int ly=0; ly<pointBounds[1]; ly++)
449 {
450 pointArguments[1] = ly;
451
452 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
453 *Gz = 0;
454
455 for (int lz=0; lz < pointBounds[2]; lz++)
456 {
457 const Scalar & leftValue = leftFields_z(iz,lz);
458 const Scalar & rightValue = rightFields_z(jz,lz);
459
460 pointArguments[2] = lz;
461 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
462
463 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
464
465 // approximateFlopCount += 3; // 2 multiplies, 1 sum
466 } // lz
467
468 for (int iy=0; iy<leftFieldBounds_[1]; iy++)
469 {
470 leftFieldArguments[1] = iy;
471 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
472
473 const Scalar & leftValue = leftFields_y(iy,ly);
474
475 for (int jy=0; jy<rightFieldBounds_[1]; jy++)
476 {
477 rightFieldArguments[1] = jy;
478
479 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
480 const Scalar & rightValue = rightFields_y(jy,ly);
481
482 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
483 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
484
485 const int Gz_index = 0;
486 const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
487
488 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
489
490 Gy += leftValue * Gz * rightValue;
491 // approximateFlopCount += 3; // 2 multiplies, 1 sum
492 }
493 }
494 } // ly
495 for (int ix=0; ix<leftFieldBounds_[0]; ix++)
496 {
497 leftFieldArguments[0] = ix;
498 const Scalar & leftValue = leftFields_x(ix,lx);
499
500 for (int iy=0; iy<leftFieldBounds_[1]; iy++)
501 {
502 leftFieldArguments[1] = iy;
503
504 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
505
506 for (int jx=0; jx<rightFieldBounds_[0]; jx++)
507 {
508 rightFieldArguments[0] = jx;
509 const Scalar & rightValue = rightFields_x(jx,lx);
510
511 for (int jy=0; jy<rightFieldBounds_[1]; jy++)
512 {
513 rightFieldArguments[1] = jy;
514 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
515
516 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
517
518 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
519 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
520
521 // compute enumeration indices to get field indices into output view
522 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
523 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
524 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
525 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
526
527 if (integralViewRank == 3)
528 {
529// if ((leftFieldIndex==0) && (rightFieldIndex==2))
530// {
531// using std::cout;
532// using std::endl;
533// cout << "***** Contribution to (0,0,2) *****\n";
534// cout << "lx = " << lx << endl;
535// cout << "ix = " << ix << endl;
536// cout << "iy = " << iy << endl;
537// cout << "jx = " << jx << endl;
538// cout << "jy = " << jy << endl;
539// cout << "iz = " << iz << endl;
540// cout << "jz = " << jz << endl;
541// cout << " leftValue = " << leftValue << endl;
542// cout << "rightValue = " << rightValue << endl;
543// cout << "Gy = " << Gy << endl;
544//
545// cout << endl;
546// }
547
548 // shape (C,F1,F2)
549 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
550 }
551 else
552 {
553 // shape (F1,F2)
554 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
555 }
556 // approximateFlopCount += 3; // 2 multiplies, 1 sum
557 } // jy
558 } // jx
559 } // iy
560 } // ix
561 } // lx
562 }); // TeamThreadRange parallel_for - (iz,jz) loop
563 }
564 }
565// std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
566 }
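The loop nest above is a sum factorization: because the 3D basis values are products of 1D values, the quadrature sum over (lx,ly,lz) is built up one dimension at a time (Gz, then Gy, then the x contribution), and the Gz/Gy partial sums are reused across the field pairs of the slower dimensions. The following host-only sketch (hypothetical flat arrays, a single output entry, and no reuse across field pairs) shows just the factorization of the point sums:

#include <vector>

// Hypothetical layout: Lx[i*Px + l], Rx[j*Px + l], etc. hold 1D basis values, and
// w[(lz*Py + ly)*Px + lx] holds transform * measure at the tensor-product point
// (x varies fastest, matching the enumeration the functor uses for pointWeights).
double factoredEntry(int ix, int iy, int iz, int jx, int jy, int jz,
                     int Px, int Py, int Pz,
                     const std::vector<double> &Lx, const std::vector<double> &Ly,
                     const std::vector<double> &Lz, const std::vector<double> &Rx,
                     const std::vector<double> &Ry, const std::vector<double> &Rz,
                     const std::vector<double> &w)
{
  double entry = 0.0;
  for (int lx = 0; lx < Px; lx++)
  {
    double Gy = 0.0;                   // accumulates the (ly,lz) contributions for this lx
    for (int ly = 0; ly < Py; ly++)
    {
      double Gz = 0.0;                 // integrate out the z point dimension first
      for (int lz = 0; lz < Pz; lz++)
        Gz += Lz[iz*Pz + lz] * w[(lz*Py + ly)*Px + lx] * Rz[jz*Pz + lz];
      Gy += Ly[iy*Py + ly] * Gz * Ry[jy*Py + ly];
    }
    entry += Lx[ix*Px + lx] * Gy * Rx[jx*Px + lx];
  }
  return entry;
}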
567
568 template<size_t numTensorComponents>
569 KOKKOS_INLINE_FUNCTION
570 void run( const TeamMember & teamMember ) const
571 {
572 Kokkos::Array<int,numTensorComponents> pointBounds;
573 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
574 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
575 for (unsigned r=0; r<numTensorComponents; r++)
576 {
577 pointBounds[r] = pointBounds_[r];
578 leftFieldBounds[r] = leftFieldBounds_[r];
579 rightFieldBounds[r] = rightFieldBounds_[r];
580 }
581
582 const int cellDataOrdinal = teamMember.league_rank();
583 const int numThreads = teamMember.team_size(); // num threads
584 const int scratchViewSize = offsetsForComponentOrdinal_[0]; // per thread
585 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView; // for caching partial integration values
586 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure
587 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values for faster access
588 if (fad_size_output_ > 0) {
589 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
590 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
591 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
592 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
593 }
594 else {
595 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
596 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
597 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
598 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
599 }
600
601// int approximateFlopCount = 0;
602// int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
603
604 constexpr int R = numTensorComponents - 1;
605
606 const int composedTransformRank = composedTransform_.rank();
607
608 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
609 {
610 const int a = a_offset_ + a_component;
611 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
612 {
613 const int b = b_offset_ + b_component;
614
615 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
616 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
617
618 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0); // shape (F,P[,D])
619 const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])
620
621 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (const int& r) {
622 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.getTensorComponent(r);
623 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
624 const int leftFieldCount = leftTensorComponent.extent_int(0);
625 const int pointCount = leftTensorComponent.extent_int(1);
626 const int leftRank = leftTensorComponent.rank();
627 const int rightFieldCount = rightTensorComponent.extent_int(0);
628 const int rightRank = rightTensorComponent.rank();
629 for (int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
630 {
631 // slightly weird logic here in an effort to avoid branch divergence
632 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
633 for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
634 {
635 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
636 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
637 }
638 }
639 for (int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
640 {
641 // slightly weird logic here in an effort to avoid branch divergence
642 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
643 for (int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
644 {
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
647 }
648 }
649 });
650
651 if (composedTransformRank == 4)
652 {
653 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
654 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
655 });
656 }
657 else
658 {
659 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
660 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
661 });
662 }
663 // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
664
665 // synchronize threads
666 teamMember.team_barrier();
667
668 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (const int& leftRightFieldEnumeration) {
669 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
670 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
671 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
672
673 // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
674 // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
675 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
676 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
677 rightFieldArguments[R] = j_R;
678 leftFieldArguments[R] = i_R;
679
680 //TODO: I believe that this can be moved outside the thread parallel_for
681 for (int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
682 {
683 scratchView(i) = 0.0;
684 }
685 Kokkos::Array<int,numTensorComponents> pointArguments;
686 for (unsigned i=0; i<numTensorComponents; i++)
687 {
688 pointArguments[i] = 0;
689 }
690
691 int r = R;
692 while (r >= 0)
693 {
694 // integrate in final component dimension; this is where we need the M weight, as well as the weighted measure
695 const int pointBounds_R = pointBounds[R];
696 int & pointArgument_R = pointArguments[R];
697 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
698 {
699 Scalar * G_R;
700 if (R != 0)
701 {
702 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
703 }
704 else
705 {
706 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
707 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
708
709 if (integralViewRank == 3)
710 {
711 // shape (C,F1,F2)
712 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
713 }
714 else
715 {
716 // shape (F1,F2)
717 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
718 }
719 }
720
721 const int & pointIndex_R = pointArguments[R];
722
723 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
724 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
725
726 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
727
728 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
729
730// approximateFlopCount += 3; // 2 multiplies, 1 sum
731 } // pointArgument_R
732
733 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents_-1 means that we won't touch the [R] argument, which is treated in the for loop above
734 const int r_min = (r_next >= 0) ? r_next : 0;
735
736 for (int s=R-1; s>=r_min; s--)
737 {
738 const int & pointIndex_s = pointArguments[s];
739
740 // want to cover all the multi-indices from s to R-1
741 for (int s1=s; s1<R; s1++)
742 {
743 leftFieldArguments[s1] = 0;
744 }
745
746 // i_s, j_s are the indices into the "current" tensor component; these are references, so they actually vary as the arguments are incremented.
747 const int & i_s = leftFieldArguments[s];
748 const int & j_s = rightFieldArguments[s];
749
750 int sLeft = s; // hereafter, sLeft is the return from the left field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
751 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
752 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
753
754 while (sLeft >= s)
755 {
756 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
757
758 // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
759 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
760
761 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
762
763 for (int s1=s; s1<R; s1++)
764 {
765 rightFieldArguments[s1] = 0;
766 }
767 int sRight = s; // hereafter, sRight is the return from the right field increment: the lowest rank that was incremented. If this is less than s, we've gotten through all the multi-indexes from rank s through R-1 (inclusive).
768 while (sRight >= s)
769 {
770 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
771
772 // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage (hence the "0" index possibility here)
773 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
774
775 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
776
777 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
778
779 Scalar* G_s;
780
781 // for final tensor component, the indices i_R, j_R are fixed, so we only have one slot for temporary storage
782 // (above, we have set the leftEnumerationIndex_sp, rightEnumerationIndex_sp to be 0 in this case, so G_sp_index will then also be 0)
783 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
784
785 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
786
787
788 if (s != 0)
789 {
790 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
791 }
792 else
793 {
794 // compute enumeration indices
795 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
796 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
797 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
798 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
799
800 if (integralViewRank == 3)
801 {
802 // shape (C,F1,F2)
803 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
804 }
805 else
806 {
807 // shape (F1,F2)
808 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
809 }
810 }
811
812 *G_s += leftValue * G_sp * rightValue;
813
814// approximateFlopCount += 3; // 2 multiplies, 1 sum
815
816 // increment rightField
817 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
818 }
819
820 // increment leftField
821 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
822 }
823 }
824
825 // clear tempStorage for r_next+1 to R
826 if (r_min+1 <= R)
827 {
828 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
829 for (int i=scratchOffsetForThread; i<endIndex; i++)
830 {
831 scratchView(i) = 0.0;
832 }
833// auto tempStorageSubview = Kokkos::subview(scratchView, Kokkos::pair<int,int>{0,offsetsForComponentOrdinal_[r_min]});
834// Kokkos::deep_copy(tempStorageSubview, 0.0);
835 }
836
837 // proceed to next point
838 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds); // numTensorComponents_-1 means that we won't touch the [R] argument, which is treated in the G_R integration for loop above
839 }
840 }); // TeamThreadRange parallel_for
841 }
842 }
843// std::cout << "flop count per cell (within operator()) : " << approximateFlopCount << std::endl;
844 }
845
846 KOKKOS_INLINE_FUNCTION
847 void operator()( const TeamMember & teamMember ) const
848 {
849 switch (numTensorComponents_)
850 {
851 case 1: run<1>(teamMember); break;
852 case 2: run<2>(teamMember); break;
853 case 3:
854 if (forceNonSpecialized_)
855 run<3>(teamMember);
856 else
857 runSpecialized3(teamMember);
858 break;
859 case 4: run<4>(teamMember); break;
860 case 5: run<5>(teamMember); break;
861 case 6: run<6>(teamMember); break;
862 case 7: run<7>(teamMember); break;
863#ifdef INTREPID2_HAVE_DEBUG
864 default:
865 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
866#endif
867 }
868 }
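The switch above maps the runtime tensor component count onto a compile-time template parameter, so that inside run<N>() the loop bounds and Kokkos::Array extents are constants the compiler can unroll against. A minimal sketch of the same dispatch pattern (names hypothetical, not part of this header):

#include <cstdio>

template<int N>
void runImpl() { printf("compiled for N = %d\n", N); } // stands in for run<N>(teamMember)

void dispatch(int numComponents)
{
  switch (numComponents)
  {
    case 1: runImpl<1>(); break;
    case 2: runImpl<2>(); break;
    case 3: runImpl<3>(); break;
    default: break; // unsupported component count
  }
}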
869
870 //! returns an approximate count of the floating point operations performed for a single cell.
871 int approximateFlopCountPerCell() const
872 {
873 // compute the flop count for a single cell; the caller can multiply by the number of cells
874 int flopCount = 0;
875
876 const int R = numTensorComponents_ - 1;
877
878 // we cache the value of M_ab * cellMeasure at each point.
879 // access to cellMeasures involves cellMeasures.numTensorComponents() - 1 multiplies, so total is the below:
880 const int flopsPerPoint_ab = cellMeasures_.numTensorComponents(); // the access involves multiplying all the components together
881
882 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
883 {
884 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
885 {
886 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
887 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
888
889 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0); // shape (F,P[,D])
890 const int numRightFieldsFinal = rightFinalComponent.extent_int(0); // shape (F,P[,D])
891
892 flopCount += flopsPerPoint_ab * cellMeasures_.extent_int(1);
893
894 int flopsPer_i_R_j_R = 0;
895
896 // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
897 // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
898 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
899 leftFieldArguments[R] = 0;
900
901 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
902 for (int i=0; i<numTensorComponents_; i++)
903 {
904 pointArguments[i] = 0;
905 }
906
907 // as an optimization, we are using the same argument array for both the truncated field that spans s to R-1 and the full one that goes to R
908 // the "R" argument here is ignored by the methods that treat the truncated field (it's beyond their bounds)
909 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
910 rightFieldArguments[R] = 0;
911
912 int r = R;
913 while (r >= 0)
914 {
915 // integrate in final component dimension
916 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
917 {
918 flopsPer_i_R_j_R += 4;
919 }
920 // TODO: figure out why the below is not the same thing as the above -- the below overcounts, somehow
921// if (0 < pointBounds_[R])
922// {
923// flopsPer_i_R_j_R += pointBounds_[R] * 4;
924// }
925
926 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
927 const int r_min = (r_next >= 0) ? r_next : 0;
928
929 for (int s=R-1; s>=r_min; s--)
930 {
931 // want to cover all the multi-indices from s to R-1: for each we have 2 multiplies and one add (3 flops)
932 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
933 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
934
935 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
936 }
937
938 // proceed to next point
939 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
940 }
941
942 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
943 }
944 }
945// std::cout << "flop count per cell: " << flopCount << std::endl;
946 return flopCount;
947 }
948
950 int teamSize(const int &maxTeamSizeFromKokkos) const
951 {
952 const int R = numTensorComponents_ - 1;
953 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
954 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
955 }
956
958 size_t team_shmem_size (int team_size) const
959 {
960 // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
961 size_t shmem_size = 0;
962
963 if (fad_size_output_ > 0)
964 {
965 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
967 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
969 }
970 else
971 {
972 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
973 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
974 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
976 }
977
978 return shmem_size;
979 }
980 };
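For context, a hedged sketch of how a team functor like this is typically launched (this is not the actual call site in IntegrationTools::integrate; functor, numCells, and DeviceType below are assumed locals): Kokkos queries the functor's team_shmem_size() to size the per-team scratch, and teamSize() clamps the Kokkos-reported maximum to the thread parallelism the functor actually exposes.

// Hypothetical launch sketch; 'functor' is an F_Integrate instance, 'numCells' the league size.
using ExecutionSpace = typename DeviceType::execution_space;
Kokkos::TeamPolicy<ExecutionSpace> policy(numCells, Kokkos::AUTO);
const int maxTeamSize = policy.team_size_max(functor, Kokkos::ParallelForTag());
policy = Kokkos::TeamPolicy<ExecutionSpace>(numCells, functor.teamSize(maxTeamSize));
// Kokkos calls functor.team_shmem_size(team_size) to determine the per-team shared allocation used above.
Kokkos::parallel_for("F_Integrate", policy, functor);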
981
991 template<class Scalar, class DeviceType, int integralViewRank>
992 class F_IntegratePointValueCache
993 {
994 using ExecutionSpace = typename DeviceType::execution_space;
995 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
996 using TeamMember = typename TeamPolicy::member_type;
997
998 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
999 IntegralViewType integralView_;
1000 TensorData<Scalar,DeviceType> leftComponent_;
1001 Data<Scalar,DeviceType> composedTransform_;
1002 TensorData<Scalar,DeviceType> rightComponent_;
1003 TensorData<Scalar,DeviceType> cellMeasures_;
1004 int a_offset_;
1005 int b_offset_;
1006 int leftComponentSpan_; // leftComponentSpan tracks the dimensions spanned by the left component
1007 int rightComponentSpan_; // rightComponentSpan tracks the dimensions spanned by the right component
1008 int numTensorComponents_;
1009 int leftFieldOrdinalOffset_;
1010 int rightFieldOrdinalOffset_;
1011
1012 size_t fad_size_output_ = 0; // 0 if not a fad type
1013
1014 // as an optimization, we do all the bounds and argument iteration within the functor rather than relying on TensorArgumentIterator
1015 // (this also makes it easier to reorder loops, etc., for further optimizations)
1016 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1017 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1018 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1019
1020 int maxFieldsLeft_;
1021 int maxFieldsRight_;
1022 int maxPointCount_;
1023 public:
1024 F_IntegratePointValueCache(Data<Scalar,DeviceType> integralData,
1025 TensorData<Scalar,DeviceType> leftComponent,
1026 Data<Scalar,DeviceType> composedTransform,
1027 TensorData<Scalar,DeviceType> rightComponent,
1028 TensorData<Scalar,DeviceType> cellMeasures,
1029 int a_offset,
1030 int b_offset,
1031 int leftFieldOrdinalOffset,
1032 int rightFieldOrdinalOffset)
1033 :
1034 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1035 leftComponent_(leftComponent),
1036 composedTransform_(composedTransform),
1037 rightComponent_(rightComponent),
1038 cellMeasures_(cellMeasures),
1039 a_offset_(a_offset),
1040 b_offset_(b_offset),
1041 leftComponentSpan_(leftComponent.extent_int(2)),
1042 rightComponentSpan_(rightComponent.extent_int(2)),
1043 numTensorComponents_(leftComponent.numTensorComponents()),
1044 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1045 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1046 {
1047 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument, "Left and right components must have matching number of tensorial components");
1048
1049 const int FIELD_DIM = 0;
1050 const int POINT_DIM = 1;
1051 maxFieldsLeft_ = 0;
1052 maxFieldsRight_ = 0;
1053 maxPointCount_ = 0;
1054 for (int r=0; r<numTensorComponents_; r++)
1055 {
1056 leftFieldBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1057 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1058 rightFieldBounds_[r] = rightComponent_.getTensorComponent(r).extent_int(FIELD_DIM);
1059 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1060 pointBounds_[r] = leftComponent_.getTensorComponent(r).extent_int(POINT_DIM);
1061 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1062 }
1063
1064 // prepare for allocation of temporary storage
1065 // note: tempStorage goes "backward", starting from the final component, which needs just one entry
1066
1067 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
1068 if (allocateFadStorage)
1069 {
1070 fad_size_output_ = dimension_scalar(integralView_);
1071 }
1072 }
1073
1074 template<size_t maxComponents, size_t numComponents = maxComponents>
1075 KOKKOS_INLINE_FUNCTION
1076 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1077 const Kokkos::Array<int,maxComponents> &bounds) const
1078 {
1079 if (numComponents == 0) return -1;
1080 int r = static_cast<int>(numComponents - 1);
1081 while (arguments[r] + 1 >= bounds[r])
1082 {
1083 arguments[r] = 0; // reset
1084 r--;
1085 if (r < 0) break;
1086 }
1087 if (r >= 0) ++arguments[r];
1088 return r;
1089 }
1090
1092 KOKKOS_INLINE_FUNCTION
1093 int incrementArgument( Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
1094 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1095 const int &numComponents) const
1096 {
1097 if (numComponents == 0) return -1;
1098 int r = static_cast<int>(numComponents - 1);
1099 while (arguments[r] + 1 >= bounds[r])
1100 {
1101 arguments[r] = 0; // reset
1102 r--;
1103 if (r < 0) break;
1104 }
1105 if (r >= 0) ++arguments[r];
1106 return r;
1107 }
1108
1109 template<size_t maxComponents, size_t numComponents = maxComponents>
1110 KOKKOS_INLINE_FUNCTION
1111 int nextIncrementResult(const Kokkos::Array<int,maxComponents> &arguments,
1112 const Kokkos::Array<int,maxComponents> &bounds) const
1113 {
1114 if (numComponents == 0) return -1;
1115 int r = static_cast<int>(numComponents - 1);
1116 while (arguments[r] + 1 >= bounds[r])
1117 {
1118 r--;
1119 if (r < 0) break;
1120 }
1121 return r;
1122 }
1123
1125 KOKKOS_INLINE_FUNCTION
1126 int nextIncrementResult(const Kokkos::Array<int,Parameters::MaxTensorComponents> &arguments,
1127 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1128 const int &numComponents) const
1129 {
1130 if (numComponents == 0) return -1;
1131 int r = numComponents - 1;
1132 while (arguments[r] + 1 >= bounds[r])
1133 {
1134 r--;
1135 if (r < 0) break;
1136 }
1137 return r;
1138 }
1139
1140 template<size_t maxComponents, size_t numComponents = maxComponents>
1141 KOKKOS_INLINE_FUNCTION
1142 int relativeEnumerationIndex(const Kokkos::Array<int,maxComponents> &arguments,
1143 const Kokkos::Array<int,maxComponents> &bounds,
1144 const int startIndex) const
1145 {
1146 // the following mirrors what is done in TensorData
1147 if (numComponents == 0)
1148 {
1149 return 0;
1150 }
1151 int enumerationIndex = 0;
1152 for (size_t r=numComponents-1; r>static_cast<size_t>(startIndex); r--)
1153 {
1154 enumerationIndex += arguments[r];
1155 enumerationIndex *= bounds[r-1];
1156 }
1157 enumerationIndex += arguments[startIndex];
1158 return enumerationIndex;
1159 }
1160
1161 template<int rank>
1162 KOKKOS_INLINE_FUNCTION
1163 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1164 integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
1165 {
1166 return integralView(cellDataOrdinal,i,j);
1167 }
1168
1169 template<int rank>
1170 KOKKOS_INLINE_FUNCTION
1171 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1172 integralViewEntry(const IntegralViewType& integralView, const int &cellDataOrdinal, const int &i, const int &j) const
1173 {
1174 return integralView(i,j);
1175 }
1176
1178 KOKKOS_INLINE_FUNCTION
1179 void runSpecialized3( const TeamMember & teamMember ) const
1180 {
1181 constexpr int numTensorComponents = 3;
1182
1183 const int pointBounds_x = pointBounds_[0];
1184 const int pointBounds_y = pointBounds_[1];
1185 const int pointBounds_z = pointBounds_[2];
1186 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1187
1188 const int leftFieldBounds_x = leftFieldBounds_[0];
1189 const int rightFieldBounds_x = rightFieldBounds_[0];
1190 const int leftFieldBounds_y = leftFieldBounds_[1];
1191 const int rightFieldBounds_y = rightFieldBounds_[1];
1192 const int leftFieldBounds_z = leftFieldBounds_[2];
1193 const int rightFieldBounds_z = rightFieldBounds_[2];
1194
1195 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1196 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1197 for (unsigned r=0; r<numTensorComponents; r++)
1198 {
1199 leftFieldBounds[r] = leftFieldBounds_[r];
1200 rightFieldBounds[r] = rightFieldBounds_[r];
1201 }
1202
1203 const auto integralView = integralView_;
1204 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1205 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1206
1207 const int cellDataOrdinal = teamMember.league_rank();
1208 const int threadNumber = teamMember.team_rank();
1209
1210 const int numThreads = teamMember.team_size(); // num threads
1211 const int GyEntryCount = pointBounds_z; // for each thread: store one Gy value per z coordinate
1212 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals; // for caching Gx values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1213 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals; // for caching Gy values (each thread gets a stack, of the same height as tensorComponents - 1)
1214 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1215
1216 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1217 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1219 if (fad_size_output_ > 0) {
1220 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1221 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1222 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1223
1224 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1225 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1226 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1227 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1228 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1229 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1230 }
1231 else {
1232 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1233 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1234 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1235
1236 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1237 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1238 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1239 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1240 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1241 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1242 }
1243
1244// int approximateFlopCount = 0;
1245// int flopsPerCellMeasuresAccess = cellMeasures_.numTensorComponents() - 1;
1246
1247 // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1248
1249 const int composedTransformRank = composedTransform_.rank();
1250
1251 // synchronize threads
1252 teamMember.team_barrier();
1253
1254 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1255 {
1256 const int a = a_offset_ + a_component;
1257 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1258 {
1259 const int b = b_offset_ + b_component;
1260
1261 const Data<Scalar,DeviceType> & leftTensorComponent_x = leftComponent_.getTensorComponent(0);
1262 const Data<Scalar,DeviceType> & rightTensorComponent_x = rightComponent_.getTensorComponent(0);
1263 const Data<Scalar,DeviceType> & leftTensorComponent_y = leftComponent_.getTensorComponent(1);
1264 const Data<Scalar,DeviceType> & rightTensorComponent_y = rightComponent_.getTensorComponent(1);
1265 const Data<Scalar,DeviceType> & leftTensorComponent_z = leftComponent_.getTensorComponent(2);
1266 const Data<Scalar,DeviceType> & rightTensorComponent_z = rightComponent_.getTensorComponent(2);
1267
1268 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1269 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (const int& fieldOrdinal) {
1270 if (fieldOrdinal < leftTensorComponent_x.extent_int(0))
1271 {
1272 const int pointCount = leftTensorComponent_x.extent_int(1);
1273 const int leftRank = leftTensorComponent_x.rank();
1274 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1275 {
1276 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1277 }
1278 }
1279 if (fieldOrdinal < leftTensorComponent_y.extent_int(0))
1280 {
1281 const int pointCount = leftTensorComponent_y.extent_int(1);
1282 const int leftRank = leftTensorComponent_y.rank();
1283 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1284 {
1285 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1286 }
1287 }
1288 if (fieldOrdinal < leftTensorComponent_z.extent_int(0))
1289 {
1290 const int pointCount = leftTensorComponent_z.extent_int(1);
1291 const int leftRank = leftTensorComponent_z.rank();
1292 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1293 {
1294 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1295 }
1296 }
1297 if (fieldOrdinal < rightTensorComponent_x.extent_int(0))
1298 {
1299 const int pointCount = rightTensorComponent_x.extent_int(1);
1300 const int rightRank = rightTensorComponent_x.rank();
1301 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1302 {
1303 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1304 }
1305 }
1306 if (fieldOrdinal < rightTensorComponent_y.extent_int(0))
1307 {
1308 const int pointCount = rightTensorComponent_y.extent_int(1);
1309 const int rightRank = rightTensorComponent_y.rank();
1310 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1311 {
1312 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1313 }
1314 }
1315 if (fieldOrdinal < rightTensorComponent_z.extent_int(0))
1316 {
1317 const int pointCount = rightTensorComponent_z.extent_int(1);
1318 const int rightRank = rightTensorComponent_z.rank();
1319 for (int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1320 {
1321 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1322 }
1323 }
1324 });
1325
1326 if (composedTransformRank == 4) // (C,P,D,D)
1327 {
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1330 });
1331 }
1332 else // (C,P)
1333 {
1334 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1335 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1336 });
1337 }
1338
1339 // synchronize threads
1340 teamMember.team_barrier();
1341
1342 for (int i0=0; i0<leftFieldBounds_x; i0++)
1343 {
1344 for (int j0=0; j0<rightFieldBounds_x; j0++)
1345 {
1346 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1347 // first component is fastest-moving; we can get a tensorPointEnumerationOffset just by multiplying by the pointBounds in x
1348 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions; // compute offset for pointWeights container, for which x is the fastest-moving
1349
1350 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1351
1352 Gx = 0;
1353 if (fad_size_output_ == 0)
1354 {
1355 // not a Fad type; we're allowed to have a vector range
1356 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1357 {
1358 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1359 }, Gx);
1360 }
1361 else
1362 {
1363 for (int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1364 {
1365 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1366 }
1367 }
1368 });
1369
1370 // synchronize threads
1371 teamMember.team_barrier();
1372
1373 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (const int& i1j1) {
1374 const int i1 = i1j1 % leftFieldBounds_y;
1375 const int j1 = i1j1 / leftFieldBounds_y;
1376
1377 int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1378
1379 for (int lz=0; lz<pointBounds_z; lz++)
1380 {
1381 int pointEnumerationIndex = lz * pointBounds_y;
1382 if (fad_size_output_ == 0)
1383 {
1384 Scalar Gy_local = 0;
1385
1386 // not a Fad type; we're allowed to have a vector range
1387 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_y), [&] (const int &ly, Scalar &integralThusFar)
1388 {
1389 const Scalar & leftValue = leftFields_y(i1,ly);
1390 const Scalar & rightValue = rightFields_y(j1,ly);
1391
1392 integralThusFar += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1393 }, Gy_local);
1394
1395 GyIntegrals(Gy_index_offset + lz) = Gy_local;
1396 }
1397 else
1398 {
1399 Scalar & Gy = GyIntegrals(Gy_index_offset + lz);
1400 for (int ly=0; ly<pointBounds_y; ly++)
1401 {
1402 const Scalar & leftValue = leftFields_y(i1,ly);
1403 const Scalar & rightValue = rightFields_y(j1,ly);
1404
1405 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex + ly);
1406 }
1407 }
1408 }
1409
1410 for (int i2=0; i2<leftFieldBounds_z; i2++)
1411 {
1412 for (int j2=0; j2<rightFieldBounds_z; j2++)
1413 {
1414 Scalar Gz = 0.0;
1415
1416 int Gy_index_offset = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1417
1418 if (fad_size_output_ == 0)
1419 {
1420 // not a Fad type; we're allowed to have a vector range
1421 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_z), [&] (const int &lz, Scalar &integralThusFar)
1422 {
1423 const Scalar & leftValue = leftFields_z(i2,lz);
1424 const Scalar & rightValue = rightFields_z(j2,lz);
1425
1426 integralThusFar += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1427 }, Gz);
1428 }
1429 else
1430 {
1431 for (int lz=0; lz<pointBounds_z; lz++)
1432 {
1433 const Scalar & leftValue = leftFields_z(i2,lz);
1434 const Scalar & rightValue = rightFields_z(j2,lz);
1435
1436 Gz += leftValue * rightValue * GyIntegrals(Gy_index_offset+lz);
1437 }
1438 }
1439
1440 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1441 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1442 // the lines above are an optimization of the commented-out lines below, introduced to work around an Intel compiler segfault (possibly a compiler bug).
1443// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset;
1444// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset;
1445
1446 Kokkos::single (Kokkos::PerThread(teamMember), [&] () {
1447 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1448 });
1449 }
1450 }
1451 });
1452 // synchronize threads
1453 teamMember.team_barrier();
1454 }
1455 }
1456 }
1457 }
1458 }
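// A sketch of the sum factorization that runSpecialized3() above implements (the symbols here are
// illustrative shorthand, not names taken from the code): with tensor-product bases in 3D, the
// contribution for left fields i = (i0,i1,i2) and right fields j = (j0,j1,j2) factors as
//   Gx(ly,lz) = sum_{lx} Lx(i0,lx) * Rx(j0,lx) * w(lx,ly,lz)    (cached in GxIntegrals)
//   Gy(lz)    = sum_{ly} Ly(i1,ly) * Ry(j1,ly) * Gx(ly,lz)      (cached in GyIntegrals, one value per lz, per thread)
//   Gz        = sum_{lz} Lz(i2,lz) * Rz(j2,lz) * Gy(lz)         (accumulated into integralView)
// where w(lx,ly,lz) is the composed transform times the cell measure at the tensor-product point (lx,ly,lz).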
1459
1460 template<size_t numTensorComponents>
1461 KOKKOS_INLINE_FUNCTION
1462 void run( const TeamMember & teamMember ) const
1463 {
1464 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true, std::logic_error, "implementation incomplete");
1465// Kokkos::Array<int,numTensorComponents> pointBounds;
1466// Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1467// Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1468//
1469// int pointsInNonzeroComponentDimensions = 1;
1470// for (unsigned r=0; r<numTensorComponents; r++)
1471// {
1472// pointBounds[r] = pointBounds_[r];
1473// if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1474// leftFieldBounds[r] = leftFieldBounds_[r];
1475// rightFieldBounds[r] = rightFieldBounds_[r];
1476// }
1477//
1478// const int cellDataOrdinal = teamMember.league_rank();
1479// const int numThreads = teamMember.team_size(); // num threads
1480// const int G_k_StackHeight = numTensorComponents - 1; // per thread
1481// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_0_IntegralsView; // for caching G0 values: we integrate out the first component dimension for each coordinate in the remaining dimensions
1482// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> G_k_StackView; // for caching G_k values (each thread gets a stack, of the same height as tensorComponents - 1)
1483// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights; // indexed by (expanded) point; stores M_ab * cell measure; shared by team
1484// Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields; // cache the field values at each level for faster access
1485// if (fad_size_output_ > 0) {
1486// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads, fad_size_output_);
1487// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1488// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
1489// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1490// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_, fad_size_output_);
1491// }
1492// else {
1493// G_k_StackView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), G_k_StackHeight * numThreads);
1494// G_0_IntegralsView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1495// pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
1496// leftFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1497// rightFields = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), maxPointCount_);
1498// }
1499//
1502//
1503// constexpr int R = numTensorComponents - 1;
1504//
1505// // approximateFlopCount += composedTransform_.extent_int(1) * cellMeasures.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1506//
1507// // synchronize threads
1508// teamMember.team_barrier();
1509//
1510// for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1511// {
1512// const int a = a_offset_ + a_component;
1513// for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1514// {
1515// const int b = b_offset_ + b_component;
1516//
1517// const Data<Scalar,DeviceType> & leftFirstComponent = leftComponent_.getTensorComponent(0);
1518// const Data<Scalar,DeviceType> & rightFirstComponent = rightComponent_.getTensorComponent(0);
1519//
1520// const int numLeftFieldsFirst = leftFirstComponent.extent_int(0); // shape (F,P[,D])
1521// const int numRightFieldsFirst = rightFirstComponent.extent_int(0); // shape (F,P[,D])
1522//
1523// const int numPointsFirst = leftFirstComponent.extent_int(1);
1524//
1525// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1526// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1527// });
1528//
1529// // synchronize threads
1530// teamMember.team_barrier();
1531//
1532// for (int i0=0; i0<numLeftFieldsFirst; i0++)
1533// {
1534// for (int j0=0; j0<numRightFieldsFirst; j0++)
1535// {
1536// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1537// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1538// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1539// leftFields(pointOrdinal) = leftFirstComponent(fieldOrdinal,pointOrdinal);
1540// });
1541//
1542// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numRightFieldsFirst*numPointsFirst), [&] (const int& fieldOrdinalByPointOrdinal) {
1543// const int fieldOrdinal = fieldOrdinalByPointOrdinal % numPointsFirst;
1544// const int pointOrdinal = fieldOrdinalByPointOrdinal / numPointsFirst;
1545// rightFields(pointOrdinal) = rightFirstComponent(fieldOrdinal,pointOrdinal);
1546// });
1547//
1548// // synchronize threads
1549// teamMember.team_barrier();
1550//
1551// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1552// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1553// int remainingIndex = pointEnumerationIndexLaterDimensions;
1554//
1555// for (int d=R-1; d>0; d--) // last component (z in 3D hypercube) is fastest-moving // TODO: consider doing first component as fastest-moving. That would make indexing into pointWeights simpler
1556// {
1557// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1];
1558// remainingIndex /= pointBounds[d+1];
1559// }
1560// pointArgumentsInLaterDimensions[0] = remainingIndex;
1561//
1562// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1563// for (int d=R; d>0; d--)
1564// {
1565// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1566// tensorPointEnumerationOffset *= pointBounds[d-1];
1567// }
1568//
1569// Scalar integralValue = 0;
1570// if (fad_size_output_ == 0)
1571// {
1572// // not a Fad type; we're allowed to have a vector range
1573// Kokkos::parallel_reduce("first component integral", Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1574// {
1575// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1576// }, integralValue);
1577// }
1578// else
1579// {
1580// for (int pointOrdinal=0; pointOrdinal<numPointsFirst; pointOrdinal++)
1581// {
1582// integralValue += leftFields(pointOrdinal) * rightFields(pointOrdinal) * pointWeights(tensorPointEnumerationOffset);
1583// }
1584// }
1585//
1586// G_0_IntegralsView(pointEnumerationIndexLaterDimensions) = integralValue;
1587// });
1588//
1589// // synchronize threads
1590// teamMember.team_barrier();
1591//
1592// // TODO: finish this, probably after having written up the algorithm for arbitrary component count. (I have it written down for 3D.)
1593// }
1594// }
1595// }
1596// }
1598 }
1599
1600 KOKKOS_INLINE_FUNCTION
1601 void operator()( const TeamMember & teamMember ) const
1602 {
1603 switch (numTensorComponents_)
1604 {
1605 case 1: run<1>(teamMember); break;
1606 case 2: run<2>(teamMember); break;
1607 case 3: runSpecialized3(teamMember); break;
1608 case 4: run<4>(teamMember); break;
1609 case 5: run<5>(teamMember); break;
1610 case 6: run<6>(teamMember); break;
1611 case 7: run<7>(teamMember); break;
1612#ifdef INTREPID2_HAVE_DEBUG
1613 default:
1614 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(true,std::invalid_argument,"Unsupported component count");
1615#endif
1616 }
1617 }
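// operator() dispatches on the runtime tensor-component count to a compile-time-sized implementation:
// the three-component (3D) case uses the hand-written runSpecialized3() above, while the other counts
// fall back to the generic run<numTensorComponents>() template (not yet implemented here).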
1618
1620 int approximateFlopCountPerCell() const
1621 {
1622 // compute flop count on a single cell
1623 int flopCount = 0;
1624
1625 constexpr int numTensorComponents = 3;
1626 Kokkos::Array<int,numTensorComponents> pointBounds;
1627 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1628 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1629
1630 int pointsInNonzeroComponentDimensions = 1;
1631 for (unsigned r=0; r<numTensorComponents; r++)
1632 {
1633 pointBounds[r] = pointBounds_[r];
1634 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1635 leftFieldBounds[r] = leftFieldBounds_[r];
1636 rightFieldBounds[r] = rightFieldBounds_[r];
1637 }
1638
1639 for (int a_component=0; a_component < leftComponentSpan_; a_component++)
1640 {
1641 for (int b_component=0; b_component < rightComponentSpan_; b_component++)
1642 {
1643// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (const int& pointOrdinal) {
1644// pointWeights(pointOrdinal) = composedTransform_.access(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1645// });
1646 flopCount += composedTransform_.extent_int(1) * cellMeasures_.numTensorComponents(); // cellMeasures does numTensorComponents - 1 multiplies on each access
1647
1648 for (int i0=0; i0<leftFieldBounds[0]; i0++)
1649 {
1650 for (int j0=0; j0<rightFieldBounds[0]; j0++)
1651 {
1652 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3; // 3 flops per integration point in the loop commented out below
1653// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (const int& pointEnumerationIndexLaterDimensions) {
1654// Kokkos::Array<int,numTensorComponents-1> pointArgumentsInLaterDimensions;
1655// int remainingIndex = pointEnumerationIndexLaterDimensions;
1656//
1657// for (int d=0; d<R-1; d++) // first component is fastest-moving; this is determined by order of access in the lz/ly loop to compute Gy (integrals in y dimension)
1658// {
1659// pointArgumentsInLaterDimensions[d] = pointEnumerationIndexLaterDimensions % pointBounds[d+1]; // d+1 because x dimension is being integrated away
1660// remainingIndex /= pointBounds[d+1];
1661// }
1662// pointArgumentsInLaterDimensions[R-1] = remainingIndex;
1663//
1664// int tensorPointEnumerationOffset = 0; // compute offset for pointWeights container, for which x is the fastest-moving
1665// for (int d=R; d>0; d--)
1666// {
1667// tensorPointEnumerationOffset += pointArgumentsInLaterDimensions[d-1]; // pointArgumentsInLaterDimensions does not have an x component, hence d-1 here
1668// tensorPointEnumerationOffset *= pointBounds[d-1];
1669// }
1670//
1671// Scalar integralValue = 0;
1672// if (fad_size_output_ == 0)
1673// {
1674// // not a Fad type; we're allowed to have a vector range
1675// Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, numPointsFirst), [&] (const int &x_pointOrdinal, Scalar &integralThusFar)
1676// {
1677// integralThusFar += leftFields(x_pointOrdinal) * rightFields(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1678// }, integralValue);
1679// }
1680// else
1681// {
1682// for (int x_pointOrdinal=0; x_pointOrdinal<numPointsFirst; x_pointOrdinal++)
1683// {
1684// integralValue += leftFields_x(x_pointOrdinal) * rightFields_x(x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1685// }
1686// }
1687//
1688// GxIntegrals(pointEnumerationIndexLaterDimensions) = integralValue;
1689// });
1690
1691
1692 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3; // 3 flops for each Gy += line in the loop below
1693 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3; // 3 flops for each Gz += line in the loop below
1694 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1; // 1 flop for the integralView += line below
1695
1696// Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds[1] * rightFieldBounds[1]), [&] (const int& i1j1) {
1697// const int i1 = i1j1 % leftFieldBounds[1];
1698// const int j1 = i1j1 / leftFieldBounds[1];
1699//
1701//
1702// int pointEnumerationIndex = 0; // incremented at bottom of lz loop below.
1703// for (int lz=0; lz<pointBounds[2]; lz++)
1704// {
1705// Scalar & Gy = GyIntegrals(Gy_index);
1706// Gy = 0.0;
1707//
1708// const bool leftRankIs3 = ( leftFields_y.rank() == 3);
1709// const bool rightRankIs3 = (rightFields_y.rank() == 3);
1710// for (int ly=0; ly<pointBounds[1]; ly++)
1711// {
1712// const Scalar & leftValue = leftRankIs3 ? leftFields_y(i1,ly,a_component) : leftFields_y(i1,ly);
1713// const Scalar & rightValue = rightRankIs3 ? rightFields_y(j1,ly,b_component) : rightFields_y(j1,ly);
1714//
1715// Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1716//
1717// pointEnumerationIndex++;
1718// }
1719// Gy_index++;
1720// }
1721//
1722// for (int i2=0; i2<leftFieldBounds[2]; i2++)
1723// {
1724// for (int j2=0; j2<rightFieldBounds[2]; j2++)
1725// {
1726// Scalar Gz = 0.0;
1727//
1728// int Gy_index = GyEntryCount * threadNumber; // thread-relative index into GyIntegrals container; store one value per z coordinate
1729//
1730// const bool leftRankIs3 = ( leftFields_z.rank() == 3);
1731// const bool rightRankIs3 = (rightFields_z.rank() == 3);
1732// for (int lz=0; lz<pointBounds[2]; lz++)
1733// {
1734// const Scalar & leftValue = leftRankIs3 ? leftFields_z(i2,lz,a_component) : leftFields_z(i2,lz);
1735// const Scalar & rightValue = rightRankIs3 ? rightFields_z(j2,lz,b_component) : rightFields_z(j2,lz);
1736//
1737// Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1738//
1739// Gy_index++;
1740// }
1741//
1742// Kokkos::Array<int,3> leftArguments {i0,i1,i2};
1743// Kokkos::Array<int,3> rightArguments {j0,j1,j2};
1744//
1745// const int i = relativeEnumerationIndex( leftArguments, leftFieldBounds, 0) + leftFieldOrdinalOffset_;
1746// const int j = relativeEnumerationIndex(rightArguments, rightFieldBounds, 0) + rightFieldOrdinalOffset_;
1747//
1748// if (integralViewRank == 2)
1749// {
1750// integralView_.access(i,j,0) += Gz;
1751// }
1752// else
1753// {
1754// integralView_.access(cellDataOrdinal,i,j) += Gz;
1755// }
1756// }
1757// }
1758// });
1759 }
1760 }
1761 }
1762 }
1763 return flopCount;
1764 }
1765
1767 int teamSize(const int &maxTeamSizeFromKokkos) const
1768 {
1769 // TODO: fix this to match the actual parallelism expressed
1770 const int R = numTensorComponents_ - 1;
1771 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1772 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1773 }
1774
1776 size_t team_shmem_size (int numThreads) const
1777 {
1778 // we use shared memory to create a fast buffer for intermediate values, as well as fast access to the current-cell's field values
1779 size_t shmem_size = 0;
1780
1781 const int GyEntryCount = pointBounds_[2]; // for each thread: store one Gy value per z coordinate
1782
1783 int pointsInNonzeroComponentDimensions = 1;
1784 for (int d=1; d<numTensorComponents_; d++)
1785 {
1786 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1787 }
1788
1789 if (fad_size_output_ > 0)
1790 {
1791 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_); // GxIntegrals: entries with x integrated away
1792 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_); // GyIntegrals: entries with x,y integrated away
1793 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1), fad_size_output_); // pointWeights
1794
1795 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_); // leftFields_x
1796 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_); // rightFields_x
1797 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_); // leftFields_y
1798 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_); // rightFields_y
1799 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_); // leftFields_z
1800 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_); // rightFields_z
1801 }
1802 else
1803 {
1804 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions); // GxIntegrals: entries with x integrated away
1805 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads); // GyIntegrals: entries with x,y integrated away
1806 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.extent_int(1)); // pointWeights
1807
1808 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]); // leftFields_x
1809 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]); // rightFields_x
1810 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]); // leftFields_y
1811 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]); // rightFields_y
1812 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]); // leftFields_z
1813 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]); // rightFields_z
1814 }
1815
1816 return shmem_size;
1817 }
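// team_shmem_size() is the standard Kokkos hook for functor-supplied scratch memory: when this functor
// is launched with a Kokkos::TeamPolicy, Kokkos queries team_shmem_size(teamSize) to size the per-team
// scratch that backs the unmanaged Views created in runSpecialized3(). A minimal launch sketch (the
// policy and functor variable names are illustrative, not from this file):
//
//   Kokkos::TeamPolicy<ExecutionSpace> policy(cellDataExtent, Kokkos::AUTO(), vectorSize);
//   Kokkos::parallel_for(policy, integrateFunctor); // scratch sized via integrateFunctor.team_shmem_size(...)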
1818 };
1819
1820 template<class Scalar, class DeviceType>
1821 class F_RefSpaceIntegral
1822 {
1823 ScalarView<Scalar,DeviceType> integral_;
1824 Data<Scalar,DeviceType> left_;
1825 Data<Scalar,DeviceType> right_;
1826 Data<Scalar,DeviceType> weights_;
1827 ordinal_type dimSpan_;
1828 ordinal_type leftRank_;
1829 ordinal_type rightRank_;
1830 ordinal_type numPoints_;
1831 public:
1832 F_RefSpaceIntegral(ScalarView<Scalar,DeviceType> integralView,
1833 Data<Scalar,DeviceType> left, Data<Scalar,DeviceType> right, Data<Scalar,DeviceType> weights,
1834 ordinal_type dimSpan)
1835 :
1836 integral_(integralView),
1837 left_(left),
1838 right_(right),
1839 weights_(weights),
1840 dimSpan_(dimSpan)
1841 {
1842 leftRank_ = left.rank();
1843 rightRank_ = right.rank();
1844 numPoints_ = weights.extent_int(0);
1845 }
1846
1847 KOKKOS_INLINE_FUNCTION
1848 void operator()( const ordinal_type & i, const ordinal_type & j ) const
1849 {
1850 Scalar refSpaceIntegral = 0.0;
1851 for (int ptOrdinal=0; ptOrdinal<numPoints_; ptOrdinal++)
1852 {
1853 const Scalar & weight = weights_(ptOrdinal);
1854 for (int a=0; a<dimSpan_; a++)
1855 {
1856 const Scalar & leftValue = ( leftRank_ == 2) ? left_(i,ptOrdinal) : left_(i,ptOrdinal,a);
1857 const Scalar & rightValue = (rightRank_ == 2) ? right_(j,ptOrdinal) : right_(j,ptOrdinal,a);
1858 refSpaceIntegral += leftValue * rightValue * weight;
1859 }
1860 }
1861 integral_(i,j) = refSpaceIntegral;
1862 }
1863 };
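// F_RefSpaceIntegral computes one reference-space integral per (left field, right field) pair; the
// axis-aligned path in integrate() below launches it over an MDRangePolicy<Rank<2>> to precompute the
// componentIntegrals used in the sum-factorized accumulation.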
1864 }
1865
1866template<typename DeviceType>
1867template<class Scalar>
1868 Data<Scalar,DeviceType> IntegrationTools<DeviceType>::allocateIntegralData(const TransformedBasisValues<Scalar,DeviceType> vectorDataLeft,
1869 const TensorData<Scalar,DeviceType> cellMeasures,
1870 const TransformedBasisValues<Scalar,DeviceType> vectorDataRight)
1871{
1872 // allocates a (C,F,F) container for storing integral data
1873
1874 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1875 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1876 const bool leftHasOrdinalFilter = vectorDataLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1877 const bool rightHasOrdinalFilter = vectorDataRight.basisValues().ordinalFilter().extent_int(0) > 0;
1878 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1879
1880 // determine cellDataExtent and variation type. We currently support CONSTANT, MODULAR, and GENERAL as possible output variation types, depending on the inputs.
1881 // If cellMeasures has non-trivial tensor structure, the rank-1 cell Data object is the first component.
1882 // If cellMeasures has trivial tensor structure, then the first and only component has the cell index in its first dimension.
1883 // I.e., either way the relevant Data object is cellMeasures.getTensorComponent(0)
1884 const int CELL_DIM = 0;
1885 const auto cellMeasureData = cellMeasures.getTensorComponent(0);
1886 const auto leftTransform = vectorDataLeft.transform();
1887
1888 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1889 // transforms may be invalid, indicating an identity transform. If so, it will not constrain the output at all.
1890 if (vectorDataLeft.transform().isValid())
1891 {
1892 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataLeft.transform().getDimensionInfo(CELL_DIM));
1893 }
1894 if (vectorDataRight.transform().isValid())
1895 {
1896 combinedCellDimInfo = combinedDimensionInfo(combinedCellDimInfo, vectorDataRight.transform().getDimensionInfo(CELL_DIM));
1897 }
1898
1899 DataVariationType cellVariationType = combinedCellDimInfo.variationType;
1900 int cellDataExtent = combinedCellDimInfo.dataExtent;
1901
1902 const int numCells = vectorDataLeft.numCells();
1903 const int numFieldsLeft = vectorDataLeft.numFields();
1904 const int numFieldsRight = vectorDataRight.numFields();
1905
1906 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1907 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,GENERAL,GENERAL};
1908
1909 if (cellVariationType != CONSTANT)
1910 {
1911 Kokkos::View<Scalar***,DeviceType> data("Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1912 return Data<Scalar,DeviceType>(data, extents, variationTypes);
1913 }
1914 else
1915 {
1916 Kokkos::View<Scalar**,DeviceType> data("Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1917 return Data<Scalar,DeviceType>(data, extents, variationTypes);
1918 }
1919}
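// Typical usage pairs allocateIntegralData() with integrate() below: allocate a Data container sized and
// shaped to match the inputs, then accumulate into it. A hedged sketch (the transformed basis values and
// cell measures are assumed to have been computed elsewhere; the variable names are placeholders):
//
//   auto integrals = IntegrationTools<DeviceType>::allocateIntegralData(transformedLeft, cellMeasures, transformedRight);
//   IntegrationTools<DeviceType>::integrate(integrals, transformedLeft, cellMeasures, transformedRight,
//                                           false /*sumInto*/, nullptr /*approximateFlops*/);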
1920
1924template<typename DeviceType>
1925template<class Scalar>
1926 void IntegrationTools<DeviceType>::integrate(Data<Scalar,DeviceType> integrals, const TransformedBasisValues<Scalar,DeviceType> & basisValuesLeft,
1927 const TensorData<Scalar,DeviceType> & cellMeasures,
1928 const TransformedBasisValues<Scalar,DeviceType> & basisValuesRight, const bool sumInto,
1929 double* approximateFlops)
1930{
1931 using ExecutionSpace = typename DeviceType::execution_space;
1932
1933 // ordinal filter is used for Serendipity basis; we don't yet support Serendipity for sum factorization.
1934 // (when we do, the strategy will likely be to do sum factorized assembly and then filter the result).
1935 const bool leftHasOrdinalFilter = basisValuesLeft.basisValues().ordinalFilter().extent_int(0) > 0;
1936 const bool rightHasOrdinalFilter = basisValuesRight.basisValues().ordinalFilter().extent_int(0) > 0;
1937 TEUCHOS_TEST_FOR_EXCEPTION(leftHasOrdinalFilter || rightHasOrdinalFilter, std::invalid_argument, "Ordinal filters for BasisValues are not yet supported by IntegrationTools");
1938
1939 if (approximateFlops != NULL)
1940 {
1941 *approximateFlops = 0;
1942 }
1943
1944 // integral data may have shape (C,F1,F2) or (if the variation type is CONSTANT in the cell dimension) shape (F1,F2)
1945 const int integralViewRank = integrals.getUnderlyingViewRank();
1946
1947 if (!sumInto)
1948 {
1949 integrals.clear();
1950 }
1951
1952 const int cellDataExtent = integrals.getDataExtent(0);
1953 const int numFieldsLeft = basisValuesLeft.numFields();
1954 const int numFieldsRight = basisValuesRight.numFields();
1955 const int spaceDim = basisValuesLeft.spaceDim();
1956
1957 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.spaceDim() != basisValuesRight.spaceDim(), std::invalid_argument, "vectorDataLeft and vectorDataRight must agree on the space dimension");
1958
1959 const int leftFamilyCount = basisValuesLeft.basisValues().numFamilies();
1960 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
1961
1962 // we require that the number of tensor components in the vectors are the same for each vector entry
1963 // this is not strictly necessary, but it makes implementation easier, and we don't at present anticipate other use cases
1964 int numTensorComponentsLeft = -1;
1965 const bool leftIsVectorValued = basisValuesLeft.vectorData().isValid();
1966
1967 if (leftIsVectorValued)
1968 {
1969 const auto &refVectorLeft = basisValuesLeft.vectorData();
1970 int numFamiliesLeft = refVectorLeft.numFamilies();
1971 int numVectorComponentsLeft = refVectorLeft.numComponents();
1972 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1973 for (int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1974 {
1975 for (int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1976 {
1977 const TensorData<Scalar,DeviceType> &tensorData = refVectorLeft.getComponent(familyOrdinal,vectorComponent);
1978 if (tensorData.numTensorComponents() > 0)
1979 {
1980 if (numTensorComponentsLeft == -1)
1981 {
1982 numTensorComponentsLeft = tensorData.numTensorComponents();
1983 }
1984 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1985 for (int r=0; r<numTensorComponentsLeft; r++)
1986 {
1987 maxFieldsForComponentLeft[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentLeft[r]);
1988 }
1989 }
1990 }
1991 }
1992 }
1993 else
1994 {
1995 numTensorComponentsLeft = basisValuesLeft.basisValues().tensorData(0).numTensorComponents(); // family ordinal 0
1996 for (int familyOrdinal = 0; familyOrdinal < leftFamilyCount; familyOrdinal++)
1997 {
1998 INTREPID2_TEST_FOR_EXCEPTION(basisValuesLeft.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "All families must match in the number of tensor components");
1999 }
2000 }
2001 int numTensorComponentsRight = -1;
2002 const bool rightIsVectorValued = basisValuesRight.vectorData().isValid();
2003
2004 if (rightIsVectorValued)
2005 {
2006 const auto &refVectorRight = basisValuesRight.vectorData();
2007 int numFamiliesRight = refVectorRight.numFamilies();
2008 int numVectorComponentsRight = refVectorRight.numComponents();
2009 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
2010 for (int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
2011 {
2012 for (int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
2013 {
2014 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
2015 if (tensorData.numTensorComponents() > 0)
2016 {
2017 if (numTensorComponentsRight == -1)
2018 {
2019 numTensorComponentsRight = tensorData.numTensorComponents();
2020 }
2021 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument, "Each valid entry in vectorDataRight must have the same number of tensor components as every other");
2022 for (int r=0; r<numTensorComponentsRight; r++)
2023 {
2024 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
2025 }
2026 }
2027 }
2028 }
2029 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponentsRight != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2030 }
2031 else
2032 {
2033 // check that right tensor component count agrees with left
2034 for (int familyOrdinal=0; familyOrdinal< rightFamilyCount; familyOrdinal++)
2035 {
2036 INTREPID2_TEST_FOR_EXCEPTION(basisValuesRight.basisValues().tensorData(familyOrdinal).numTensorComponents() != numTensorComponentsLeft, std::invalid_argument, "Right families must match left in the number of tensor components");
2037 }
2038 }
2039 const int numPointTensorComponents = cellMeasures.numTensorComponents() - 1;
2040
2041 if ((numPointTensorComponents == numTensorComponentsLeft) && basisValuesLeft.axisAligned() && basisValuesRight.axisAligned())
2042 {
2043 // cellMeasures is a non-trivial tensor product, and the pullbacks are all diagonals.
2044
2045 // in this case, the integrals in each tensorial direction are entirely separable
2046 // allocate some temporary storage for the integrals in each tensorial direction
2047
2048 // if cellMeasures is a nontrivial tensor product, that means that all cells have the same shape, up to scaling.
2049
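// In this fully separable case each entry reduces to a product of 1D integrals. As a sketch (symbols are
// illustrative shorthand): with diagonal transform weights T^L_d, T^R_d and 1D reference-space integrals
//   I_{r,d}(i_r, j_r) = sum_{p_r} leftComponent_r(i_r, p_r) * rightComponent_r(j_r, p_r) * w_r(p_r),
// the loops below accumulate
//   integral(c, i, j) += cellMeasure_0(c) * sum_d T^L_d(c) * T^R_d(c) * prod_r I_{r,d}(i_r, j_r),
// where the I_{r,d} are precomputed into componentIntegrals[r][d] by F_RefSpaceIntegral.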
2050 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
2051 for (int r=0; r<numPointTensorComponents; r++)
2052 {
2053 // first tensorial component of cell measures is the cell dimension; after that we have (P1,P2,…)
2054 pointDimensions[r] = cellMeasures.getTensorComponent(r+1).extent_int(0);
2055 }
2056
2057 // only one of these will be a valid container:
2058 Kokkos::View<Scalar**, DeviceType> integralView2;
2059 Kokkos::View<Scalar***, DeviceType> integralView3;
2060 if (integralViewRank == 3)
2061 {
2062 integralView3 = integrals.getUnderlyingView3();
2063 }
2064 else
2065 {
2066 integralView2 = integrals.getUnderlyingView2();
2067 }
2068 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2069 {
2070 int a_offset = 0; // left vector component offset
2071 int leftFieldOffset = basisValuesLeft.basisValues().familyFieldOrdinalOffset(leftFamilyOrdinal);
2072
2073 const int leftVectorComponentCount = leftIsVectorValued ? basisValuesLeft.vectorData().numComponents() : 1;
2074 for (int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2075 {
2076 TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal)
2077 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2078 if (!leftComponent.isValid())
2079 {
2080 a_offset++; // empty components are understood to take up one dimension
2081 continue;
2082 }
2083 const int leftDimSpan = leftComponent.extent_int(2);
2084
2085 const int leftComponentFieldCount = leftComponent.extent_int(0);
2086
2087 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2088 {
2089 int b_offset = 0; // right vector component offset
2090 int rightFieldOffset = basisValuesRight.vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2091
2092 const int rightVectorComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2093 for (int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2094 {
2095 TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal)
2096 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2097 if (!rightComponent.isValid())
2098 {
2099 b_offset++; // empty components are understood to take up one dimension
2100 continue;
2101 }
2102 const int rightDimSpan = rightComponent.extent_int(2);
2103
2104 const int rightComponentFieldCount = rightComponent.extent_int(0);
2105
2106 // we only accumulate for a == b (since this is the axis-aligned case). Do the a, b spans intersect?
2107 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset)) // no intersection
2108 {
2109 b_offset += rightDimSpan;
2110
2111 continue;
2112 }
2113
2114 // if the a, b spans intersect, by assumption they should align exactly
2115 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error, "left and right dimension offsets should match.");
2116 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2117
2118 const int d_start = a_offset;
2119 const int d_end = d_start + leftDimSpan;
2120
2121 using ComponentIntegralsArray = Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents>;
2122 ComponentIntegralsArray componentIntegrals;
2123 for (int r=0; r<numPointTensorComponents; r++)
2124 {
2125 /*
2126 Four vector data cases to consider:
2127 1. Both vector data containers are filled with axial components - first component in 3D has form (f,0,0), second (0,f,0), third (0,0,f).
2128 2. Both vector data containers have arbitrary components - in 3D: (f1,f2,f3) where f1 is given by the first component, f2 by the second, f3 by the third.
2129 3. First container is axial, second arbitrary.
2130 4. First is arbitrary, second axial.
2131
2132 But note that in all four cases, the structure of the integral is the same: you have three vector component integrals that get summed. The actual difference between
2133 the cases does not show up in the reference-space integrals here, but in the accumulation in physical space below, where the tensor field numbering comes into play.
2134
2135 The choice between axial and arbitrary affects the way the fields are numbered; the arbitrary components' indices refer to the same vector function, so they correspond,
2136 while the axial components refer to distinct scalar functions, so their numbering in the data container is cumulative.
2137 */
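// As an illustrative example of the distinction: H(div) values on a hexahedron are stored axially, as
// three families with components of the form (f,0,0), (0,g,0), (0,0,h), so f, g, h are distinct scalar
// functions and their field numbering is cumulative; the gradient of an H(grad) field, by contrast, is
// an arbitrary-component vector (df/dx, df/dy, df/dz) whose three components all refer to the same field.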
2138
2139 Data<Scalar,DeviceType> quadratureWeights = cellMeasures.getTensorComponent(r+1);
2140 const int numPoints = pointDimensions[r];
2141
2142 // It may be worth considering the possibility that some of these components point to the same data -- if so, we could possibly get better data locality by making the corresponding componentIntegral entries point to the same location as well. (And we could avoid some computations here.)
2143
2144 Data<Scalar,DeviceType> leftTensorComponent = leftComponent.getTensorComponent(r);
2145 Data<Scalar,DeviceType> rightTensorComponent = rightComponent.getTensorComponent(r);
2146
2147 const int leftTensorComponentDimSpan = leftTensorComponent.extent_int(2);
2148 const int leftTensorComponentFields = leftTensorComponent.extent_int(0);
2149 const int rightTensorComponentDimSpan = rightTensorComponent.extent_int(2);
2150 const int rightTensorComponentFields = rightTensorComponent.extent_int(0);
2151
2152 INTREPID2_TEST_FOR_EXCEPTION(( leftTensorComponentDimSpan != rightTensorComponentDimSpan), std::invalid_argument, "left and right components must span the same number of dimensions.");
2153
2154 for (int d=d_start; d<d_end; d++)
2155 {
2156 ScalarView<Scalar,DeviceType> componentIntegralView;
2157
2158 const bool allocateFadStorage = !(std::is_standard_layout<Scalar>::value && std::is_trivial<Scalar>::value);
2159 if (allocateFadStorage)
2160 {
2161 auto fad_size_output = dimension_scalar(integrals.getUnderlyingView());
2162 componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields, fad_size_output);
2163 }
2164 else
2165 {
2166 componentIntegralView = ScalarView<Scalar,DeviceType>("componentIntegrals for tensor component " + std::to_string(r) + ", in dimension " + std::to_string(d), leftTensorComponentFields, rightTensorComponentFields);
2167 }
2168
2169 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftTensorComponentFields,rightTensorComponentFields});
2170
2171 Impl::F_RefSpaceIntegral<Scalar, DeviceType> refSpaceIntegralFunctor(componentIntegralView, leftTensorComponent, rightTensorComponent, quadratureWeights,
2172 leftTensorComponentDimSpan);
2173 Kokkos::parallel_for("compute componentIntegrals", policy, refSpaceIntegralFunctor);
2174
2175 componentIntegrals[r][d] = componentIntegralView;
2176
2177 if (approximateFlops != NULL)
2178 {
2179 *approximateFlops += leftTensorComponentFields*rightTensorComponentFields*numPoints*(3); // two multiplies, one add in innermost loop
2180 }
2181 } // d
2182 } // r
2183
2184 ExecutionSpace().fence();
2185
2186 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount}; // separately declared in an effort to work around Intel 17.0.1 compiler weirdness.
2187 Kokkos::Array<int,3> lowerBounds {0,0,0};
2188 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2189 // TODO: note that for best performance, especially with Fad types, we should replace this parallel for with a Functor and use hierarchical parallelism
2190 Kokkos::parallel_for("compute field integrals", policy,
2191 KOKKOS_LAMBDA (const int &cellDataOrdinal, const int &leftFieldOrdinal, const int &rightFieldOrdinal) {
2192 const Scalar & cellMeasureWeight = cellMeasures.getTensorComponent(0)(cellDataOrdinal);
2193
2194 TensorArgumentIterator leftTensorIterator(leftComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2195 leftTensorIterator.setEnumerationIndex(leftFieldOrdinal);
2196
2197 TensorArgumentIterator rightTensorIterator(rightComponent, 0); // shape is (F,P), and we walk the F dimension of the container
2198 rightTensorIterator.setEnumerationIndex(rightFieldOrdinal);
2199
2200 Scalar integralSum = 0.0;
2201 for (int d=d_start; d<d_end; d++)
2202 {
2203 const Scalar & transformLeft_d = basisValuesLeft.transformWeight(cellDataOrdinal,0,d,d);
2204 const Scalar & transformRight_d = basisValuesRight.transformWeight(cellDataOrdinal,0,d,d);
2205
2206 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2207 // approximateFlopCount++;
2208
2209 Scalar integral_d = 1.0;
2210
2211 for (int r=0; r<numPointTensorComponents; r++)
2212 {
2213 integral_d *= componentIntegrals[r][d](leftTensorIterator.argument(r),rightTensorIterator.argument(r));
2214 // approximateFlopCount++; // product
2215 }
2216 integralSum += leftRightTransform_d * integral_d;
2217 // approximateFlopCount += 2; // multiply and sum
2218
2219 const int i = leftFieldOrdinal + leftFieldOffset;
2220 const int j = rightFieldOrdinal + rightFieldOffset;
2221
2222 if (integralViewRank == 3)
2223 {
2224 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2225 }
2226 else
2227 {
2228 integralView2(i,j) += cellMeasureWeight * integralSum;
2229 }
2230 }
2231 });
2232 b_offset += rightDimSpan;
2233 } // rightVectorComponentOrdinal
2234 } // rightFamilyOrdinal
2235 a_offset += leftDimSpan;
2236 } // leftVectorComponentOrdinal
2237 } // leftFamilyOrdinal
2238
2239 if (approximateFlops != NULL)
2240 {
2241 // TODO: check the accuracy of this
2242 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2243 }
2244 }
2245 else // general case (not axis-aligned + affine tensor-product structure)
2246 {
2247 // prepare composed transformation matrices
2248 const Data<Scalar,DeviceType> & leftTransform = basisValuesLeft.transform();
2249 const Data<Scalar,DeviceType> & rightTransform = basisValuesRight.transform();
2250 const bool transposeLeft = true;
2251 const bool transposeRight = false;
2252// auto timer = Teuchos::TimeMonitor::getNewTimer("mat-mat");
2253// timer->start();
2254 // transforms can be matrices -- (C,P,D,D): rank 4 -- or scalar weights -- (C,P): rank 2 -- or vector weights -- (C,P,D): rank 3
2255 Data<Scalar,DeviceType> composedTransform;
2256 // invalid/empty transforms are used when the identity is intended.
2257 const int leftRank = leftTransform.rank();
2258 const int rightRank = rightTransform.rank();
2259
2260 if (leftTransform.isValid() && rightTransform.isValid())
2261 {
2262 const bool bothRank4 = (leftRank == 4) && (rightRank == 4);
2263 const bool bothRank3 = (leftRank == 3) && (rightRank == 3);
2264 const bool bothRank2 = (leftRank == 2) && (rightRank == 2);
2265 const bool ranks32 = ((leftRank == 3) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 3));
2266 const bool ranks42 = ((leftRank == 4) && (rightRank == 2)) || ((leftRank == 2) && (rightRank == 4));
2267
2268 if (bothRank4) // (C,P,D,D)
2269 {
2270 composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransform, transposeRight, rightTransform);
2271 composedTransform.storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2272
2273 // if the composedTransform matrices are full, the following is a good estimate. If they have some diagonal portions, this will overcount.
2274 if (approximateFlops != NULL)
2275 {
2276 *approximateFlops += composedTransform.getUnderlyingViewSize() * (spaceDim - 1) * 2;
2277 }
2278 }
2279 else if (bothRank3) // (C,P,D)
2280 {
2281 // re-cast leftTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2282 const int newRank = 4;
2283 auto extents = leftTransform.getExtents();
2284 auto variationTypes = leftTransform.getVariationTypes();
2285 extents[3] = extents[2];
2286 extents[2] = 1;
2287 variationTypes[3] = variationTypes[2];
2288 variationTypes[2] = CONSTANT;
2289 auto leftTransformMatrix = leftTransform.shallowCopy(newRank, extents, variationTypes);
2290
2291 // re-cast rightTransform as a rank 4 (C,P,1,D) object -- a 1 x D matrix at each (C,P)
2292 extents = rightTransform.getExtents();
2293 variationTypes = rightTransform.getVariationTypes();
2294 extents[3] = extents[2];
2295 extents[2] = 1;
2296 variationTypes[3] = variationTypes[2];
2297 variationTypes[2] = CONSTANT;
2298 auto rightTransformMatrix = rightTransform.shallowCopy(newRank, extents, variationTypes);
2299
2300 composedTransform = Data<Scalar,DeviceType>::allocateMatMatResult(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix); // transposeLeft = true, transposeRight = false
2301 composedTransform.storeMatMat(transposeLeft, leftTransformMatrix, transposeRight, rightTransformMatrix);
2302
2303 if (approximateFlops != NULL)
2304 {
2305 *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2306 }
2307 }
2308 else if (bothRank2)
2309 {
2310 composedTransform = leftTransform.allocateInPlaceCombinationResult(leftTransform, rightTransform);
2311 composedTransform.storeInPlaceProduct(leftTransform, rightTransform);
2312
2313 // re-cast composedTransform as a rank 4 (C,P,1,1) object -- a 1 x 1 matrix at each (C,P).
2314 const int newRank = 4;
2315 auto extents = composedTransform.getExtents();
2316 auto variationTypes = composedTransform.getVariationTypes();
2317 composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2318 if (approximateFlops != NULL)
2319 {
2320 *approximateFlops += composedTransform.getUnderlyingViewSize(); // one multiply per entry
2321 }
2322 }
2323 else if (ranks32) // rank 2 / rank 3 combination.
2324 {
2325 const auto & rank3Transform = (leftRank == 3) ? leftTransform : rightTransform;
2326 const auto & rank2Transform = (leftRank == 2) ? leftTransform : rightTransform;
2327
2328 composedTransform = DataTools::multiplyByCPWeights(rank3Transform, rank2Transform);
2329
2330 // re-cast composedTransform as a rank 4 object:
2331 // logically, the original rank-3 transform can be understood as a 1xD matrix. The composed transform is leftTransform^T * rightTransform, so:
2332 // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2333 // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2334 const int newRank = 4;
2335 auto extents = composedTransform.getExtents();
2336 auto variationTypes = composedTransform.getVariationTypes();
2337 if (leftRank == 3)
2338 {
2339 // extents[3] and variationTypes[3] will already be 1 and CONSTANT, respectively
2340 // extents[3] = 1;
2341 // variationTypes[3] = CONSTANT;
2342 }
2343 else
2344 {
2345 extents[3] = extents[2];
2346 extents[2] = 1;
2347 variationTypes[3] = variationTypes[2];
2348 variationTypes[2] = CONSTANT;
2349 }
2350 composedTransform = composedTransform.shallowCopy(newRank, extents, variationTypes);
2351 }
2352 else if (ranks42) // rank 4 / rank 2 combination.
2353 {
2354 if (leftRank == 4)
2355 {
2356 // want to transpose left matrix, and multiply by the values from rightTransform
2357 // start with the multiplication:
2358 auto composedTransformTransposed = DataTools::multiplyByCPWeights(leftTransform, rightTransform);
2359 composedTransform = DataTools::transposeMatrix(composedTransformTransposed);
2360 }
2361 else // (leftRank == 2)
2362 {
2363 composedTransform = DataTools::multiplyByCPWeights(rightTransform, leftTransform);
2364 }
2365 }
2366 else
2367 {
2368 INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2369 }
2370 }
2371 else if (leftTransform.isValid())
2372 {
2373 // rightTransform is the identity
2374 switch (leftRank)
2375 {
2376 case 4: composedTransform = DataTools::transposeMatrix(leftTransform); break;
2377 case 3:
2378 {
2379 // - if left has the rank-3 transform, composedTransform should be a (C,P,D,1) object -- a D x 1 matrix at each (C,P).
2380 const int newRank = 4;
2381 auto extents = leftTransform.getExtents();
2382 auto variationTypes = leftTransform.getVariationTypes();
2383
2384 composedTransform = leftTransform.shallowCopy(newRank, extents, variationTypes);
2385 }
2386 break;
2387 case 2: composedTransform = leftTransform; break;
2388 default:
2389 INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2390 }
2391 }
2392 else if (rightTransform.isValid())
2393 {
2394 // leftTransform is the identity
2395 composedTransform = rightTransform;
2396 switch (rightRank)
2397 {
2398 case 4: composedTransform = rightTransform; break;
2399 case 3:
2400 {
2401 // - if right has the rank-3 transform, composedTransform should be a (C,P,1,D) object -- a 1 x D matrix at each (C,P).
2402 const int newRank = 4;
2403 auto extents = rightTransform.getExtents();
2404 auto variationTypes = rightTransform.getVariationTypes();
2405 extents[3] = extents[2];
2406 variationTypes[3] = variationTypes[2];
2407 extents[2] = 1;
2408 variationTypes[2] = CONSTANT;
2409
2410 composedTransform = rightTransform.shallowCopy(newRank, extents, variationTypes);
2411 }
2412 break;
2413 case 2: composedTransform = rightTransform; break;
2414 default:
2415 INTREPID2_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Unsupported transform combination");
2416 }
2417 }
2418 else
2419 {
2420 // both left and right transforms are identity
2421 Kokkos::Array<ordinal_type,4> extents {basisValuesLeft.numCells(),basisValuesLeft.numPoints(),spaceDim,spaceDim};
2422 Kokkos::Array<DataVariationType,4> variationTypes {CONSTANT,CONSTANT,BLOCK_PLUS_DIAGONAL,BLOCK_PLUS_DIAGONAL};
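// With these variation types, a length-D view filled with 1.0 suffices to represent the D x D identity
// at every (cell, point) pair, without materializing a full (C,P,D,D) allocation.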
2423
2424 Kokkos::View<Scalar*,DeviceType> identityUnderlyingView("Intrepid2::FST::integrate() - identity view",spaceDim);
2425 Kokkos::deep_copy(identityUnderlyingView, 1.0);
2426 composedTransform = Data<Scalar,DeviceType>(identityUnderlyingView,extents,variationTypes);
2427 }
2428
2429// timer->stop();
2430// std::cout << "Completed mat-mat in " << timer->totalElapsedTime() << " seconds.\n";
2431// timer->reset();
2432
2433 const int leftFamilyCount = basisValuesLeft. basisValues().numFamilies();
2434 const int rightFamilyCount = basisValuesRight.basisValues().numFamilies();
2435 const int leftComponentCount = leftIsVectorValued ? basisValuesLeft. vectorData().numComponents() : 1;
2436 const int rightComponentCount = rightIsVectorValued ? basisValuesRight.vectorData().numComponents() : 1;
2437
2438 int leftFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2439 for (int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2440 {
2441 // "a" keeps track of the spatial dimension over which we are integrating in the left vector
2442 // components are allowed to span several dimensions; we keep track of the offset for the component in a_offset
2443 int a_offset = 0;
2444 bool haveLaunchedContributionToCurrentFamilyLeft = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2445 for (int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2446 {
2447 TensorData<Scalar,DeviceType> leftComponent = leftIsVectorValued ? basisValuesLeft.vectorData().getComponent(leftFamilyOrdinal, leftComponentOrdinal)
2448 : basisValuesLeft.basisValues().tensorData(leftFamilyOrdinal);
2449 if (!leftComponent.isValid())
2450 {
2451 // represents zero
2452 a_offset += basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal);
2453 continue;
2454 }
2455
2456 int rightFieldOrdinalOffset = 0; // keeps track of the number of fields in prior families
2457 for (int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2458 {
2459 // "b" keeps track of the spatial dimension over which we are integrating in the right vector
2460 // components are allowed to span several dimensions; we keep track of the offset for the component in b_offset
2461 bool haveLaunchedContributionToCurrentFamilyRight = false; // helps to track whether we need a Kokkos::fence before launching a kernel.
2462 int b_offset = 0;
2463 for (int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2464 {
2465 TensorData<Scalar,DeviceType> rightComponent = rightIsVectorValued ? basisValuesRight.vectorData().getComponent(rightFamilyOrdinal, rightComponentOrdinal)
2466 : basisValuesRight.basisValues().tensorData(rightFamilyOrdinal);
2467 if (!rightComponent.isValid())
2468 {
2469 // represents zero
2470 b_offset += basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal);
2471 continue;
2472 }
2473
2474 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.numTensorComponents() != rightComponent.numTensorComponents(), std::invalid_argument, "left TensorData and right TensorData have different number of tensor components. This is not supported.");
2475
2476 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2477 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
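// The Kokkos::AUTO() team size here is provisional: this policy is only used to query a recommended
// team size for the chosen functor, and is re-created with the final team size before each launch below.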
2478
2479 // TODO: expose the options for forceNonSpecialized and usePointCacheForRank3Tensor through an IntegrationAlgorithm enumeration.
2480 // AUTOMATIC: let Intrepid2 choose an algorithm based on the inputs (and, perhaps, the execution space)
2481 // STANDARD: don't use sum factorization or axis alignment -- just do the simple contraction, a (p+1)^9 algorithm in 3D
2482 // SUM_FACTORIZATION: (p+1)^7 algorithm in 3D
2483 // SUM_FACTORIZATION_AXIS_ALIGNED: (p+1)^6 algorithm in 3D
2484 // SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION: mainly intended for testing purposes (specialized implementations perform better when they are provided)
2485 // SUM_FACTORIZATION_WITH_POINT_CACHE: novel (p+1)^7 (in 3D) algorithm in Intrepid2; unclear as yet when and whether this may be a superior approach
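// (In 3D, a tensor-product basis has O((p+1)^3) fields and O((p+1)^3) quadrature points per cell, so the
// naive contraction over left fields x right fields x points costs O((p+1)^9); factorizing the sums over
// the three tensorial directions reduces this to O((p+1)^7), and an axis-aligned -- diagonal -- transform
// reduces it further to O((p+1)^6).)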
2486 bool forceNonSpecialized = false; // We might expose this in the integrate() arguments in the future; if so, it should default to false.
2487 bool usePointCacheForRank3Tensor = true; // EXPERIMENTAL; has better performance under CUDA, but slightly worse performance than standard on serial CPU
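// A possible shape for the IntegrationAlgorithm enumeration mentioned in the TODO above; this is an
// illustrative sketch only -- the type and enumerator names below are not part of the current interface:
// enum class IntegrationAlgorithm : int
// {
//   AUTOMATIC,                                      // let Intrepid2 choose, based on the inputs
//   STANDARD,                                       // simple contraction, no sum factorization
//   SUM_FACTORIZATION,                              // generic sum-factorized algorithm
//   SUM_FACTORIZATION_AXIS_ALIGNED,                 // exploits diagonal (axis-aligned) transforms
//   SUM_FACTORIZATION_FORCE_GENERIC_IMPLEMENTATION, // bypass specialized implementations (testing)
//   SUM_FACTORIZATION_WITH_POINT_CACHE              // point-cache variant
// };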
2488
2489 // in one branch or another below, we will launch a parallel kernel that contributes to (leftFamily, rightFamily) field ordinal pairs.
2490 // if we have already launched something that contributes to that part of the integral container, we need a Kokkos fence() to ensure that these do not interfere with each other.
2491 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2492 {
2493 ExecutionSpace().fence();
2494 }
2495 haveLaunchedContributionToCurrentFamilyLeft = true;
2496 haveLaunchedContributionToCurrentFamilyRight = true;
2497
2498 if (integralViewRank == 2)
2499 {
2500 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2501 {
2502 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2503
2504 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2505 const int teamSize = functor.teamSize(recommendedTeamSize);
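// functor.teamSize() takes the Kokkos recommendation as an upper bound and returns the team size the
// functor will actually use; the policy is then re-created with that size.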
2506
2507 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2508
2509 Kokkos::parallel_for("F_IntegratePointValueCache rank 2", policy, functor);
2510
2511 if (approximateFlops != NULL)
2512 {
2513 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2514 }
2515 }
2516 else
2517 {
2518 auto functor = Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2519
2520 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2521 const int teamSize = functor.teamSize(recommendedTeamSize);
2522
2523 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2524
2525 Kokkos::parallel_for("F_Integrate rank 2", policy, functor);
2526
2527 if (approximateFlops != NULL)
2528 {
2529 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2530 }
2531 }
2532 }
2533 else if (integralViewRank == 3)
2534 {
2535 if (usePointCacheForRank3Tensor && (leftComponent.numTensorComponents() == 3))
2536 {
2537 auto functor = Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2538
2539 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2540 const int teamSize = functor.teamSize(recommendedTeamSize);
2541
2542 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2543
2544 Kokkos::parallel_for("F_IntegratePointValueCache rank 3", policy, functor);
2545
2546 if (approximateFlops != NULL)
2547 {
2548 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2549 }
2550 }
2551 else
2552 {
2553 auto functor = Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2554
2555 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2556 const int teamSize = functor.teamSize(recommendedTeamSize);
2557
2558 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2559
2560 Kokkos::parallel_for("F_Integrate rank 3", policy, functor);
2561
2562 if (approximateFlops != NULL)
2563 {
2564 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2565 }
2566 }
2567 }
2568 b_offset += rightIsVectorValued ? basisValuesRight.vectorData().numDimsForComponent(rightComponentOrdinal) : 1;
2569 }
2570 rightFieldOrdinalOffset += rightIsVectorValued ? basisValuesRight.vectorData().numFieldsInFamily(rightFamilyOrdinal) : basisValuesRight.basisValues().numFieldsInFamily(rightFamilyOrdinal);
2571 }
2572 a_offset += leftIsVectorValued ? basisValuesLeft.vectorData().numDimsForComponent(leftComponentOrdinal) : 1;
2573 }
2574 leftFieldOrdinalOffset += leftIsVectorValued ? basisValuesLeft.vectorData().numFieldsInFamily(leftFamilyOrdinal) : basisValuesLeft.basisValues().numFieldsInFamily(leftFamilyOrdinal);
2575 }
2576 }
2577// if (approximateFlops != NULL)
2578// {
2579// std::cout << "Approximate flop count (new): " << *approximateFlops << std::endl;
2580// }
2581 ExecutionSpace().fence(); // make sure we've finished writing to integrals container before we return
2582}
2583
2584} // end namespace Intrepid2
2585
2586#endif