49 #ifndef __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__ 50 #define __INTREPID2_INTEGRATIONTOOLS_DEF_HPP__ 55 #include "Teuchos_TimeMonitor.hpp" 64 template<
class Scalar,
class DeviceType,
int integralViewRank>
67 using ExecutionSpace =
typename DeviceType::execution_space;
68 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
69 using TeamMember =
typename TeamPolicy::member_type;
71 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
72 IntegralViewType integralView_;
79 int leftComponentSpan_;
80 int rightComponentSpan_;
81 int numTensorComponents_;
82 int leftFieldOrdinalOffset_;
83 int rightFieldOrdinalOffset_;
84 bool forceNonSpecialized_;
86 size_t fad_size_output_ = 0;
88 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
92 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
93 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
94 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
96 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
97 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
110 int leftFieldOrdinalOffset,
111 int rightFieldOrdinalOffset,
112 bool forceNonSpecialized)
114 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
115 leftComponent_(leftComponent),
116 composedTransform_(composedTransform),
117 rightComponent_(rightComponent),
118 cellMeasures_(cellMeasures),
121 leftComponentSpan_(leftComponent.
extent_int(2)),
122 rightComponentSpan_(rightComponent.
extent_int(2)),
124 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
125 rightFieldOrdinalOffset_(rightFieldOrdinalOffset),
126 forceNonSpecialized_(forceNonSpecialized)
128 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.
numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
131 const int FIELD_DIM = 0;
132 const int POINT_DIM = 1;
136 for (
int r=0; r<numTensorComponents_; r++)
139 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
141 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
143 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
147 int leftRelativeEnumerationSpan = 1;
148 int rightRelativeEnumerationSpan = 1;
149 for (
int startingComponent=numTensorComponents_-2; startingComponent>=0; startingComponent--)
151 leftRelativeEnumerationSpan *= leftFieldBounds_[startingComponent];
152 rightRelativeEnumerationSpan *= rightFieldBounds_[startingComponent];
153 leftFieldRelativeEnumerationSpans_ [startingComponent] = leftRelativeEnumerationSpan;
154 rightFieldRelativeEnumerationSpans_[startingComponent] = rightRelativeEnumerationSpan;
160 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
161 if (allocateFadStorage)
166 const int R = numTensorComponents_ - 1;
169 int allocationSoFar = 0;
170 offsetsForComponentOrdinal_[R] = allocationSoFar;
173 for (
int r=R-1; r>0; r--)
178 num_ij *= leftFields * rightFields;
180 offsetsForComponentOrdinal_[r] = allocationSoFar;
181 allocationSoFar += num_ij;
183 offsetsForComponentOrdinal_[0] = allocationSoFar;
186 template<
size_t maxComponents,
size_t numComponents = maxComponents>
187 KOKKOS_INLINE_FUNCTION
188 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
189 const Kokkos::Array<int,maxComponents> &bounds)
const 191 if (numComponents == 0)
197 int r =
static_cast<int>(numComponents - 1);
198 while (arguments[r] + 1 >= bounds[r])
204 if (r >= 0) ++arguments[r];
210 KOKKOS_INLINE_FUNCTION
212 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
213 const int &numComponents)
const 215 if (numComponents == 0)
return -1;
216 int r =
static_cast<int>(numComponents - 1);
217 while (arguments[r] + 1 >= bounds[r])
223 if (r >= 0) ++arguments[r];
227 template<
size_t maxComponents,
size_t numComponents = maxComponents>
228 KOKKOS_INLINE_FUNCTION
229 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
230 const Kokkos::Array<int,maxComponents> &bounds)
const 232 if (numComponents == 0)
238 int r =
static_cast<int>(numComponents - 1);
239 while (arguments[r] + 1 >= bounds[r])
249 KOKKOS_INLINE_FUNCTION
251 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
252 const int &numComponents)
const 254 if (numComponents == 0)
return -1;
255 int r = numComponents - 1;
256 while (arguments[r] + 1 >= bounds[r])
264 template<
size_t maxComponents,
size_t numComponents = maxComponents>
265 KOKKOS_INLINE_FUNCTION
266 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
267 const Kokkos::Array<int,maxComponents> &bounds,
268 const int startIndex)
const 271 if (numComponents == 0)
277 int enumerationIndex = 0;
278 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
280 enumerationIndex += arguments[r];
281 enumerationIndex *= bounds[r-1];
283 enumerationIndex += arguments[startIndex];
284 return enumerationIndex;
298 KOKKOS_INLINE_FUNCTION
301 constexpr
int numTensorComponents = 3;
303 Kokkos::Array<int,numTensorComponents> pointBounds;
304 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
305 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
306 for (
unsigned r=0; r<numTensorComponents; r++)
308 pointBounds[r] = pointBounds_[r];
309 leftFieldBounds[r] = leftFieldBounds_[r];
310 rightFieldBounds[r] = rightFieldBounds_[r];
313 const int cellDataOrdinal = teamMember.league_rank();
314 const int numThreads = teamMember.team_size();
315 const int scratchViewSize = offsetsForComponentOrdinal_[0];
317 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
318 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
319 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, leftFields_y, leftFields_z, rightFields_x, rightFields_y, rightFields_z;
320 if (fad_size_output_ > 0) {
321 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
322 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
323 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0], fad_size_output_);
324 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0], fad_size_output_);
325 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1], fad_size_output_);
326 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1], fad_size_output_);
327 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2], fad_size_output_);
328 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2], fad_size_output_);
331 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads);
332 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
333 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[0], pointBounds[0]);
334 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[0], pointBounds[0]);
335 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[1], pointBounds[1]);
336 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[1], pointBounds[1]);
337 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds[2], pointBounds[2]);
338 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds[2], pointBounds[2]);
344 constexpr
int R = numTensorComponents - 1;
346 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
348 const int a = a_offset_ + a_component;
349 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
351 const int b = b_offset_ + b_component;
356 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
357 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
366 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
367 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields * maxPointCount_), [&] (
const int& fieldOrdinalPointOrdinal) {
368 const int fieldOrdinal = fieldOrdinalPointOrdinal % maxFields;
369 const int pointOrdinal = fieldOrdinalPointOrdinal / maxFields;
370 if ((fieldOrdinal < leftTensorComponent_x.
extent_int(0)) && (pointOrdinal < leftTensorComponent_x.
extent_int(1)))
372 const int leftRank = leftTensorComponent_x.
rank();
373 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
375 if ((fieldOrdinal < leftTensorComponent_y.
extent_int(0)) && (pointOrdinal < leftTensorComponent_y.
extent_int(1)))
377 const int leftRank = leftTensorComponent_y.
rank();
378 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
380 if ((fieldOrdinal < leftTensorComponent_z.
extent_int(0)) && (pointOrdinal < leftTensorComponent_z.
extent_int(1)))
382 const int leftRank = leftTensorComponent_z.
rank();
383 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
385 if ((fieldOrdinal < rightTensorComponent_x.
extent_int(0)) && (pointOrdinal < rightTensorComponent_x.
extent_int(1)))
387 const int rightRank = rightTensorComponent_x.
rank();
388 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
390 if ((fieldOrdinal < rightTensorComponent_y.
extent_int(0)) && (pointOrdinal < rightTensorComponent_y.
extent_int(1)))
392 const int rightRank = rightTensorComponent_y.
rank();
393 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
395 if ((fieldOrdinal < rightTensorComponent_z.
extent_int(0)) && (pointOrdinal < rightTensorComponent_z.
extent_int(1)))
397 const int rightRank = rightTensorComponent_z.
rank();
398 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
405 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransformView.extent_int(1)), [&] (
const int& pointOrdinal) {
406 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
411 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
412 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
419 teamMember.team_barrier();
422 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
423 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
425 scratchView(i) = 0.0;
429 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
430 const int iz = leftRightFieldEnumeration % numLeftFieldsFinal;
431 const int jz = leftRightFieldEnumeration / numLeftFieldsFinal;
435 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
436 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
437 rightFieldArguments[R] = jz;
438 leftFieldArguments[R] = iz;
440 Kokkos::Array<int,numTensorComponents> pointArguments;
441 for (
int i=0; i<numTensorComponents; i++)
443 pointArguments[i] = 0;
446 for (
int lx=0; lx<pointBounds[0]; lx++)
448 pointArguments[0] = lx;
452 const int Gy_start_index = scratchOffsetForThread + offsetsForComponentOrdinal_[1];
453 const int Gy_end_index = scratchOffsetForThread + offsetsForComponentOrdinal_[0];
455 for (
int Gy_index=Gy_start_index; Gy_index < Gy_end_index; Gy_index++)
457 scratchView(Gy_index) = 0;
460 for (
int ly=0; ly<pointBounds[1]; ly++)
462 pointArguments[1] = ly;
464 Scalar * Gz = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
467 for (
int lz=0; lz < pointBounds[2]; lz++)
469 const Scalar & leftValue = leftFields_z(iz,lz);
470 const Scalar & rightValue = rightFields_z(jz,lz);
472 pointArguments[2] = lz;
473 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
475 *Gz += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
480 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
482 leftFieldArguments[1] = iy;
483 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
485 const Scalar & leftValue = leftFields_y(iy,ly);
487 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
489 rightFieldArguments[1] = jy;
491 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
492 const Scalar & rightValue = rightFields_y(jy,ly);
494 const int & rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
495 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
497 const int Gz_index = 0;
498 const Scalar & Gz = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[2] + Gz_index);
500 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
502 Gy += leftValue * Gz * rightValue;
507 for (
int ix=0; ix<leftFieldBounds_[0]; ix++)
509 leftFieldArguments[0] = ix;
510 const Scalar & leftValue = leftFields_x(ix,lx);
512 for (
int iy=0; iy<leftFieldBounds_[1]; iy++)
514 leftFieldArguments[1] = iy;
516 const int leftEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, 1);
518 for (
int jx=0; jx<rightFieldBounds_[0]; jx++)
520 rightFieldArguments[0] = jx;
521 const Scalar & rightValue = rightFields_x(jx,lx);
523 for (
int jy=0; jy<rightFieldBounds_[1]; jy++)
525 rightFieldArguments[1] = jy;
526 const int rightEnumerationIndex_y = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, 1);
528 const int rightEnumerationSpan_y = rightFieldRelativeEnumerationSpans_[1];
530 const int Gy_index = leftEnumerationIndex_y * rightEnumerationSpan_y + rightEnumerationIndex_y;
531 Scalar & Gy = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[1] + Gy_index);
534 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
535 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
536 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
537 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
539 if (integralViewRank == 3)
561 integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex) += leftValue * Gy * rightValue;
566 integralView_.access(leftFieldIndex,rightFieldIndex,0) += leftValue * Gy * rightValue;
580 template<
size_t numTensorComponents>
581 KOKKOS_INLINE_FUNCTION
582 void run(
const TeamMember & teamMember )
const 584 Kokkos::Array<int,numTensorComponents> pointBounds;
585 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
586 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
587 for (
unsigned r=0; r<numTensorComponents; r++)
589 pointBounds[r] = pointBounds_[r];
590 leftFieldBounds[r] = leftFieldBounds_[r];
591 rightFieldBounds[r] = rightFieldBounds_[r];
594 const int cellDataOrdinal = teamMember.league_rank();
595 const int numThreads = teamMember.team_size();
596 const int scratchViewSize = offsetsForComponentOrdinal_[0];
597 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
598 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
599 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
600 if (fad_size_output_ > 0) {
601 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
602 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
603 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
604 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
607 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
608 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
609 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
610 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
616 constexpr
int R = numTensorComponents - 1;
618 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
620 const int a = a_offset_ + a_component;
621 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
623 const int b = b_offset_ + b_component;
625 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.
getTensorComponent(R);
626 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.
getTensorComponent(R);
628 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
629 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
631 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
632 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.
getTensorComponent(r);
633 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.
getTensorComponent(r);
634 const int leftFieldCount = leftTensorComponent.
extent_int(0);
635 const int pointCount = leftTensorComponent.extent_int(1);
636 const int leftRank = leftTensorComponent.rank();
637 const int rightFieldCount = rightTensorComponent.extent_int(0);
638 const int rightRank = rightTensorComponent.rank();
639 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
642 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
643 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
649 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
652 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
653 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
655 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
656 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
661 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
662 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
667 teamMember.team_barrier();
669 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
670 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
671 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
672 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
676 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
677 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
678 rightFieldArguments[R] = j_R;
679 leftFieldArguments[R] = i_R;
682 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
684 scratchView(i) = 0.0;
686 Kokkos::Array<int,numTensorComponents> pointArguments;
687 for (
unsigned i=0; i<numTensorComponents; i++)
689 pointArguments[i] = 0;
696 const int pointBounds_R = pointBounds[R];
697 int & pointArgument_R = pointArguments[R];
698 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
703 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
707 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
708 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
710 if (integralViewRank == 3)
713 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
718 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
722 const int & pointIndex_R = pointArguments[R];
724 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
725 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
727 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
729 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
734 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
735 const int r_min = (r_next >= 0) ? r_next : 0;
737 for (
int s=R-1; s>=r_min; s--)
739 const int & pointIndex_s = pointArguments[s];
742 for (
int s1=s; s1<R; s1++)
744 leftFieldArguments[s1] = 0;
748 const int & i_s = leftFieldArguments[s];
749 const int & j_s = rightFieldArguments[s];
752 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
753 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
757 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
760 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
762 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
764 for (
int s1=s; s1<R; s1++)
766 rightFieldArguments[s1] = 0;
771 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
774 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
776 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
778 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
784 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
786 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
791 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
796 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
797 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
798 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
799 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
801 if (integralViewRank == 3)
804 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
809 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
813 *G_s += leftValue * G_sp * rightValue;
818 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
822 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
829 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
830 for (
int i=scratchOffsetForThread; i<endIndex; i++)
832 scratchView(i) = 0.0;
839 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
847 KOKKOS_INLINE_FUNCTION
848 void operator()(
const TeamMember & teamMember )
const 850 switch (numTensorComponents_)
852 case 1: run<1>(teamMember);
break;
853 case 2: run<2>(teamMember);
break;
855 if (forceNonSpecialized_)
860 case 4: run<4>(teamMember);
break;
861 case 5: run<5>(teamMember);
break;
862 case 6: run<6>(teamMember);
break;
863 case 7: run<7>(teamMember);
break;
864 #ifdef INTREPID2_HAVE_DEBUG 866 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(
true,std::invalid_argument,
"Unsupported component count");
877 const int R = numTensorComponents_ - 1;
883 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
885 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
890 const int numLeftFieldsFinal = leftFinalComponent.
extent_int(0);
891 const int numRightFieldsFinal = rightFinalComponent.
extent_int(0);
893 flopCount += flopsPerPoint_ab * cellMeasures_.
extent_int(1);
895 int flopsPer_i_R_j_R = 0;
899 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldArguments;
900 leftFieldArguments[R] = 0;
902 Kokkos::Array<int,Parameters::MaxTensorComponents> pointArguments;
903 for (
int i=0; i<numTensorComponents_; i++)
905 pointArguments[i] = 0;
910 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldArguments;
911 rightFieldArguments[R] = 0;
917 for (pointArguments[R]=0; pointArguments[R] < pointBounds_[R]; pointArguments[R]++)
919 flopsPer_i_R_j_R += 4;
927 const int r_next = nextIncrementResult(pointArguments, pointBounds_, numTensorComponents_);
928 const int r_min = (r_next >= 0) ? r_next : 0;
930 for (
int s=R-1; s>=r_min; s--)
933 int numLeftIterations = leftFieldRelativeEnumerationSpans_[s];
934 int numRightIterations = rightFieldRelativeEnumerationSpans_[s];
936 flopsPer_i_R_j_R += numLeftIterations * numRightIterations * 3;
940 r = incrementArgument(pointArguments, pointBounds_, numTensorComponents_);
943 flopCount += flopsPer_i_R_j_R * numLeftFieldsFinal * numRightFieldsFinal;
951 int teamSize(
const int &maxTeamSizeFromKokkos)
const 953 const int R = numTensorComponents_ - 1;
954 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
955 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
962 size_t shmem_size = 0;
964 if (fad_size_output_ > 0)
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] * team_size, fad_size_output_);
967 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1), fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
969 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
973 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] * team_size);
974 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.
extent_int(1));
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
976 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
992 template<
class Scalar,
class DeviceType,
int integralViewRank>
995 using ExecutionSpace =
typename DeviceType::execution_space;
996 using TeamPolicy = Kokkos::TeamPolicy<DeviceType>;
997 using TeamMember =
typename TeamPolicy::member_type;
999 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
1000 IntegralViewType integralView_;
1007 int leftComponentSpan_;
1008 int rightComponentSpan_;
1009 int numTensorComponents_;
1010 int leftFieldOrdinalOffset_;
1011 int rightFieldOrdinalOffset_;
1013 size_t fad_size_output_ = 0;
1017 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
1018 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
1019 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
1022 int maxFieldsRight_;
1032 int leftFieldOrdinalOffset,
1033 int rightFieldOrdinalOffset)
1035 integralView_(integralData.template getUnderlyingView<integralViewRank>()),
1036 leftComponent_(leftComponent),
1037 composedTransform_(composedTransform),
1038 rightComponent_(rightComponent),
1039 cellMeasures_(cellMeasures),
1040 a_offset_(a_offset),
1041 b_offset_(b_offset),
1042 leftComponentSpan_(leftComponent.
extent_int(2)),
1043 rightComponentSpan_(rightComponent.
extent_int(2)),
1045 leftFieldOrdinalOffset_(leftFieldOrdinalOffset),
1046 rightFieldOrdinalOffset_(rightFieldOrdinalOffset)
1048 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.
numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
1050 const int FIELD_DIM = 0;
1051 const int POINT_DIM = 1;
1053 maxFieldsRight_ = 0;
1055 for (
int r=0; r<numTensorComponents_; r++)
1058 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[r]);
1060 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[r]);
1062 maxPointCount_ = std::max(maxPointCount_, pointBounds_[r]);
1068 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1069 if (allocateFadStorage)
1075 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1076 KOKKOS_INLINE_FUNCTION
1077 int incrementArgument( Kokkos::Array<int,maxComponents> &arguments,
1078 const Kokkos::Array<int,maxComponents> &bounds)
const 1080 if (numComponents == 0)
return -1;
1081 int r =
static_cast<int>(numComponents - 1);
1082 while (arguments[r] + 1 >= bounds[r])
1088 if (r >= 0) ++arguments[r];
1093 KOKKOS_INLINE_FUNCTION
1095 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1096 const int &numComponents)
const 1098 if (numComponents == 0)
return -1;
1099 int r =
static_cast<int>(numComponents - 1);
1100 while (arguments[r] + 1 >= bounds[r])
1106 if (r >= 0) ++arguments[r];
1110 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1111 KOKKOS_INLINE_FUNCTION
1112 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &arguments,
1113 const Kokkos::Array<int,maxComponents> &bounds)
const 1115 if (numComponents == 0)
return -1;
1116 int r =
static_cast<int>(numComponents - 1);
1117 while (arguments[r] + 1 >= bounds[r])
1126 KOKKOS_INLINE_FUNCTION
1128 const Kokkos::Array<int,Parameters::MaxTensorComponents> &bounds,
1129 const int &numComponents)
const 1131 if (numComponents == 0)
return -1;
1132 int r = numComponents - 1;
1133 while (arguments[r] + 1 >= bounds[r])
1141 template<
size_t maxComponents,
size_t numComponents = maxComponents>
1142 KOKKOS_INLINE_FUNCTION
1143 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &arguments,
1144 const Kokkos::Array<int,maxComponents> &bounds,
1145 const int startIndex)
const 1148 if (numComponents == 0)
1152 int enumerationIndex = 0;
1153 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
1155 enumerationIndex += arguments[r];
1156 enumerationIndex *= bounds[r-1];
1158 enumerationIndex += arguments[startIndex];
1159 return enumerationIndex;
1163 KOKKOS_INLINE_FUNCTION
1164 enable_if_t<rank==3 && rank==integralViewRank, Scalar &>
1165 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const 1167 return integralView(cellDataOrdinal,i,j);
1171 KOKKOS_INLINE_FUNCTION
1172 enable_if_t<rank==2 && rank==integralViewRank, Scalar &>
1173 integralViewEntry(
const IntegralViewType& integralView,
const int &cellDataOrdinal,
const int &i,
const int &j)
const 1175 return integralView(i,j);
1179 KOKKOS_INLINE_FUNCTION
1182 constexpr
int numTensorComponents = 3;
1184 const int pointBounds_x = pointBounds_[0];
1185 const int pointBounds_y = pointBounds_[1];
1186 const int pointBounds_z = pointBounds_[2];
1187 const int pointsInNonzeroComponentDimensions = pointBounds_y * pointBounds_z;
1189 const int leftFieldBounds_x = leftFieldBounds_[0];
1190 const int rightFieldBounds_x = rightFieldBounds_[0];
1191 const int leftFieldBounds_y = leftFieldBounds_[1];
1192 const int rightFieldBounds_y = rightFieldBounds_[1];
1193 const int leftFieldBounds_z = leftFieldBounds_[2];
1194 const int rightFieldBounds_z = rightFieldBounds_[2];
1196 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1197 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1198 for (
unsigned r=0; r<numTensorComponents; r++)
1200 leftFieldBounds[r] = leftFieldBounds_[r];
1201 rightFieldBounds[r] = rightFieldBounds_[r];
1204 const auto integralView = integralView_;
1205 const auto leftFieldOrdinalOffset = leftFieldOrdinalOffset_;
1206 const auto rightFieldOrdinalOffset = rightFieldOrdinalOffset_;
1208 const int cellDataOrdinal = teamMember.league_rank();
1209 const int threadNumber = teamMember.team_rank();
1211 const int numThreads = teamMember.team_size();
1212 const int GyEntryCount = pointBounds_z;
1213 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GxIntegrals;
1214 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GyIntegrals;
1215 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> GzIntegral;
1216 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
1218 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_x, rightFields_x;
1219 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_y, rightFields_y;
1220 Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged> leftFields_z, rightFields_z;
1221 if (fad_size_output_ > 0) {
1222 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions, fad_size_output_);
1223 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads, fad_size_output_);
1224 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads, fad_size_output_);
1225 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1), fad_size_output_);
1227 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x, fad_size_output_);
1228 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x, fad_size_output_);
1229 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y, fad_size_output_);
1230 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y, fad_size_output_);
1231 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z, fad_size_output_);
1232 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z, fad_size_output_);
1235 GxIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), pointsInNonzeroComponentDimensions);
1236 GyIntegrals = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), GyEntryCount * numThreads);
1237 GzIntegral = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numThreads);
1238 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.
extent_int(1));
1240 leftFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_x, pointBounds_x);
1241 rightFields_x = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_x, pointBounds_x);
1242 leftFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_y, pointBounds_y);
1243 rightFields_y = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_y, pointBounds_y);
1244 leftFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), leftFieldBounds_z, pointBounds_z);
1245 rightFields_z = Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), rightFieldBounds_z, pointBounds_z);
1254 teamMember.team_barrier();
1256 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1258 const int a = a_offset_ + a_component;
1259 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1261 const int b = b_offset_ + b_component;
1270 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
1271 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,maxFields), [&] (
const int& fieldOrdinal) {
1272 if (fieldOrdinal < leftTensorComponent_x.
extent_int(0))
1274 const int pointCount = leftTensorComponent_x.
extent_int(1);
1275 const int leftRank = leftTensorComponent_x.
rank();
1276 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1278 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1281 if (fieldOrdinal < leftTensorComponent_y.
extent_int(0))
1283 const int pointCount = leftTensorComponent_y.
extent_int(1);
1284 const int leftRank = leftTensorComponent_y.
rank();
1285 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1287 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1290 if (fieldOrdinal < leftTensorComponent_z.
extent_int(0))
1292 const int pointCount = leftTensorComponent_z.
extent_int(1);
1293 const int leftRank = leftTensorComponent_z.
rank();
1294 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1296 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1299 if (fieldOrdinal < rightTensorComponent_x.
extent_int(0))
1301 const int pointCount = rightTensorComponent_x.
extent_int(1);
1302 const int rightRank = rightTensorComponent_x.
rank();
1303 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1305 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
1308 if (fieldOrdinal < rightTensorComponent_y.
extent_int(0))
1310 const int pointCount = rightTensorComponent_y.
extent_int(1);
1311 const int rightRank = rightTensorComponent_y.
rank();
1312 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1314 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
1317 if (fieldOrdinal < rightTensorComponent_z.
extent_int(0))
1319 const int pointCount = rightTensorComponent_z.
extent_int(1);
1320 const int rightRank = rightTensorComponent_z.
rank();
1321 for (
int pointOrdinal=0; pointOrdinal<pointCount; pointOrdinal++)
1323 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
1328 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.
extent_int(1)), [&] (
const int& pointOrdinal) {
1329 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
1333 teamMember.team_barrier();
1335 for (
int i0=0; i0<leftFieldBounds_x; i0++)
1337 for (
int j0=0; j0<rightFieldBounds_x; j0++)
1339 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,pointsInNonzeroComponentDimensions), [&] (
const int& pointEnumerationIndexLaterDimensions) {
1341 const int tensorPointEnumerationOffset = pointBounds_x * pointEnumerationIndexLaterDimensions;
1343 Scalar & Gx = GxIntegrals(pointEnumerationIndexLaterDimensions);
1346 if (fad_size_output_ == 0)
1349 Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(teamMember, pointBounds_x), [&] (
const int &x_pointOrdinal, Scalar &integralThusFar)
1351 integralThusFar += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1356 for (
int x_pointOrdinal=0; x_pointOrdinal<pointBounds_x; x_pointOrdinal++)
1358 Gx += leftFields_x(i0,x_pointOrdinal) * rightFields_x(j0,x_pointOrdinal) * pointWeights(tensorPointEnumerationOffset + x_pointOrdinal);
1364 teamMember.team_barrier();
1366 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,leftFieldBounds_y * rightFieldBounds_y), [&] (
const int& i1j1) {
1367 const int i1 = i1j1 % leftFieldBounds_y;
1368 const int j1 = i1j1 / leftFieldBounds_y;
1370 int Gy_index = GyEntryCount * threadNumber;
1372 int pointEnumerationIndex = 0;
1373 for (
int lz=0; lz<pointBounds_z; lz++)
1375 Scalar & Gy = GyIntegrals(Gy_index);
1378 for (
int ly=0; ly<pointBounds_y; ly++)
1380 const Scalar & leftValue = leftFields_y(i1,ly);
1381 const Scalar & rightValue = rightFields_y(j1,ly);
1383 Gy += leftValue * rightValue * GxIntegrals(pointEnumerationIndex);
1385 pointEnumerationIndex++;
1390 Scalar & Gz = GzIntegral(threadNumber);
1391 for (
int i2=0; i2<leftFieldBounds_z; i2++)
1393 for (
int j2=0; j2<rightFieldBounds_z; j2++)
1397 int Gy_index = GyEntryCount * threadNumber;
1399 for (
int lz=0; lz<pointBounds_z; lz++)
1401 const Scalar & leftValue = leftFields_z(i2,lz);
1402 const Scalar & rightValue = rightFields_z(j2,lz);
1404 Gz += leftValue * rightValue * GyIntegrals(Gy_index);
1409 const int i = leftFieldOrdinalOffset + i0 + (i1 + i2 * leftFieldBounds_y) * leftFieldBounds_x;
1410 const int j = rightFieldOrdinalOffset + j0 + (j1 + j2 * rightFieldBounds_y) * rightFieldBounds_x;
1415 integralViewEntry<integralViewRank>(integralView, cellDataOrdinal, i, j) += Gz;
1420 teamMember.team_barrier();
1427 template<
size_t numTensorComponents>
1428 KOKKOS_INLINE_FUNCTION
1429 void run(
const TeamMember & teamMember )
const 1431 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(
true, std::logic_error,
"implementation incomplete");
1567 KOKKOS_INLINE_FUNCTION
1568 void operator()(
const TeamMember & teamMember )
const 1570 switch (numTensorComponents_)
1572 case 1: run<1>(teamMember);
break;
1573 case 2: run<2>(teamMember);
break;
1575 case 4: run<4>(teamMember);
break;
1576 case 5: run<5>(teamMember);
break;
1577 case 6: run<6>(teamMember);
break;
1578 case 7: run<7>(teamMember);
break;
1579 #ifdef INTREPID2_HAVE_DEBUG 1581 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(
true,std::invalid_argument,
"Unsupported component count");
1592 constexpr
int numTensorComponents = 3;
1593 Kokkos::Array<int,numTensorComponents> pointBounds;
1594 Kokkos::Array<int,numTensorComponents> leftFieldBounds;
1595 Kokkos::Array<int,numTensorComponents> rightFieldBounds;
1597 int pointsInNonzeroComponentDimensions = 1;
1598 for (
unsigned r=0; r<numTensorComponents; r++)
1600 pointBounds[r] = pointBounds_[r];
1601 if (r > 0) pointsInNonzeroComponentDimensions *= pointBounds[r];
1602 leftFieldBounds[r] = leftFieldBounds_[r];
1603 rightFieldBounds[r] = rightFieldBounds_[r];
1606 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
1608 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
1615 for (
int i0=0; i0<leftFieldBounds[0]; i0++)
1617 for (
int j0=0; j0<rightFieldBounds[0]; j0++)
1619 flopCount += pointsInNonzeroComponentDimensions * pointBounds[0] * 3;
1659 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * pointBounds[1] * pointBounds[2] * 3;
1660 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * pointBounds[2] * 3;
1661 flopCount += leftFieldBounds[1] * rightFieldBounds[1] * leftFieldBounds[2] * rightFieldBounds[2] * 1;
1737 const int R = numTensorComponents_ - 1;
1738 const int threadParallelismExpressed = leftFieldBounds_[R] * rightFieldBounds_[R];
1739 return std::min(maxTeamSizeFromKokkos, threadParallelismExpressed);
1746 size_t shmem_size = 0;
1748 const int GyEntryCount = pointBounds_[2];
1750 int pointsInNonzeroComponentDimensions = 1;
1751 for (
int d=1; d<numTensorComponents_; d++)
1753 pointsInNonzeroComponentDimensions *= pointBounds_[d];
1756 if (fad_size_output_ > 0)
1758 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions, fad_size_output_);
1759 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads, fad_size_output_);
1760 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads, fad_size_output_);
1761 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1), fad_size_output_);
1763 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0], fad_size_output_);
1764 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0], fad_size_output_);
1765 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1], fad_size_output_);
1766 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1], fad_size_output_);
1767 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2], fad_size_output_);
1768 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2], fad_size_output_);
1772 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(pointsInNonzeroComponentDimensions);
1773 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(GyEntryCount * numThreads);
1774 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( 1 * numThreads);
1775 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size (composedTransform_.
extent_int(1));
1777 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[0], pointBounds_[0]);
1778 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[0], pointBounds_[0]);
1779 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[1], pointBounds_[1]);
1780 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[1], pointBounds_[1]);
1781 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( leftFieldBounds_[2], pointBounds_[2]);
1782 shmem_size += Kokkos::View<Scalar**, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size( rightFieldBounds_[2], pointBounds_[2]);
1790 template<
typename DeviceType>
1791 template<
class Scalar>
1802 const int CELL_DIM = 0;
1804 const auto leftTransform = vectorDataLeft.
transform();
1806 DimensionInfo combinedCellDimInfo = cellMeasureData.getDimensionInfo(CELL_DIM);
1811 int cellDataExtent = combinedCellDimInfo.dataExtent;
1813 const int numCells = vectorDataLeft.
numCells();
1814 const int numFieldsLeft = vectorDataLeft.
numFields();
1815 const int numFieldsRight = vectorDataRight.
numFields();
1817 Kokkos::Array<int,3> extents {numCells, numFieldsLeft, numFieldsRight};
1818 Kokkos::Array<DataVariationType,3> variationTypes {cellVariationType,
GENERAL,
GENERAL};
1822 Kokkos::View<Scalar***,DeviceType> data(
"Intrepid2 integral data",cellDataExtent,numFieldsLeft,numFieldsRight);
1827 Kokkos::View<Scalar**,DeviceType> data(
"Intrepid2 integral data",numFieldsLeft,numFieldsRight);
1835 template<
typename DeviceType>
1836 template<
class Scalar>
1840 double* approximateFlops)
1842 using ExecutionSpace =
typename DeviceType::execution_space;
1844 if (approximateFlops != NULL)
1846 *approximateFlops = 0;
1858 const int numFieldsLeft = vectorDataLeft.
numFields();
1859 const int numFieldsRight = vectorDataRight.
numFields();
1860 const int spaceDim = vectorDataLeft.
spaceDim();
1862 INTREPID2_TEST_FOR_EXCEPTION(vectorDataLeft.
spaceDim() != vectorDataRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
1868 int numTensorComponentsLeft = -1;
1869 const auto &refVectorLeft = vectorDataLeft.
vectorData();
1870 int numFamiliesLeft = refVectorLeft.numFamilies();
1871 int numVectorComponentsLeft = refVectorLeft.numComponents();
1872 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1873 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1874 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1876 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1881 if (numTensorComponentsLeft == -1)
1885 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1886 for (
int r=0; r<numTensorComponentsLeft; r++)
1893 int numTensorComponentsRight = -1;
1894 const auto &refVectorRight = vectorDataRight.
vectorData();
1895 int numFamiliesRight = refVectorRight.numFamilies();
1896 int numVectorComponentsRight = refVectorRight.numComponents();
1897 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
1899 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
1901 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
1902 if (tensorData.numTensorComponents() > 0)
1904 if (numTensorComponentsRight == -1)
1906 numTensorComponentsRight = tensorData.numTensorComponents();
1908 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
1909 for (
int r=0; r<numTensorComponentsRight; r++)
1911 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
1916 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument,
"Left and right vector entries must have the same number of tensorial components");
1919 if ((numPointTensorComponents == numTensorComponentsLeft) && vectorDataLeft.
axisAligned() && vectorDataRight.
axisAligned())
1928 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
1929 for (
int r=0; r<numPointTensorComponents; r++)
1937 for (
int r=0; r<numPointTensorComponents; r++)
1939 for (
int d=0; d<spaceDim; d++)
1962 int leftComponentCount = leftComponent.
extent_int(0);
1963 int rightComponentCount = rightComponent.
extent_int(0);
1965 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1966 if (allocateFadStorage)
1969 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount, fad_size_output);
1973 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount);
1976 Kokkos::deep_copy(componentIntegrals[r][d], 0.0);
1978 const int & numPoints = pointDimensions[r];
1979 const int leftComponentRank = leftComponent.
rank();
1980 const int rightComponentRank = rightComponent.
rank();
1982 const int leftComponentDimSpan = leftComponent.
extent_int(2);
1983 const int rightComponentDimSpan = rightComponent.
extent_int(2);
1984 INTREPID2_TEST_FOR_EXCEPTION(( leftComponentDimSpan != rightComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
1990 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftComponentCount,rightComponentCount});
1992 for (
int a=0; a<leftComponentDimSpan; a++)
1994 Kokkos::parallel_for(
"compute componentIntegrals", policy,
1995 KOKKOS_LAMBDA (
const int &i,
const int &j) {
1996 Scalar refSpaceIntegral = 0.0;
1997 for (
int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
1999 const Scalar & leftValue = ( leftComponentRank == 2) ? leftComponent(i,ptOrdinal) : leftComponent(i,ptOrdinal,a);
2000 const Scalar & rightValue = (rightComponentRank == 2) ? rightComponent(j,ptOrdinal) : rightComponent(j,ptOrdinal,a);
2001 refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
2003 componentIntegrals[r][d](i,j) = refSpaceIntegral;
2007 if (approximateFlops != NULL)
2009 *approximateFlops += leftComponentCount*rightComponentCount*numPoints*(3);
2013 ExecutionSpace().fence();
2016 Kokkos::View<Scalar**, DeviceType> integralView2;
2017 Kokkos::View<Scalar***, DeviceType> integralView3;
2018 if (integralViewRank == 3)
2027 const int leftFamilyCount = vectorDataLeft.
vectorData().numFamilies();
2028 const int rightFamilyCount = vectorDataRight.
vectorData().numFamilies();
2030 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2032 bool haveLaunchedContributionToLeftFamily =
false;
2035 int leftFieldOffset = vectorDataLeft.
vectorData().familyFieldOrdinalOffset(leftFamilyOrdinal);
2037 const int leftVectorComponentCount = vectorDataLeft.
vectorData().numComponents();
2038 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2040 const auto & leftComponent = vectorDataLeft.
vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal);
2046 const int leftDimSpan = leftComponent.
extent_int(2);
2048 const int leftComponentFieldCount = leftComponent.
extent_int(0);
2050 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2052 bool haveLaunchedContributionToRightFamily =
false;
2055 int rightFieldOffset = vectorDataRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2057 const int rightVectorComponentCount = vectorDataRight.
vectorData().numComponents();
2058 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2060 const auto & rightComponent = vectorDataRight.
vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal);
2061 if (!rightComponent.
isValid())
2066 const int rightDimSpan = rightComponent.
extent_int(2);
2068 const int rightComponentFieldCount = rightComponent.
extent_int(0);
2071 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2073 b_offset += rightDimSpan;
2079 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2080 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2082 const int d_start = a_offset;
2083 const int d_end = d_start + leftDimSpan;
2085 if (haveLaunchedContributionToLeftFamily && haveLaunchedContributionToRightFamily)
2088 ExecutionSpace().fence();
2090 haveLaunchedContributionToLeftFamily =
true;
2091 haveLaunchedContributionToRightFamily =
true;
2093 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2094 Kokkos::Array<int,3> lowerBounds {0,0,0};
2095 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2097 Kokkos::parallel_for(
"compute field integrals", policy,
2098 KOKKOS_LAMBDA (
const int &cellDataOrdinal,
const int &leftFieldOrdinal,
const int &rightFieldOrdinal) {
2099 const Scalar & cellMeasureWeight = cellMeasures.
getTensorComponent(0)(cellDataOrdinal);
2107 Scalar integralSum = 0.0;
2108 for (
int d=d_start; d<d_end; d++)
2110 const Scalar & transformLeft_d = vectorDataLeft.
transformWeight(cellDataOrdinal,0,d,d);
2111 const Scalar & transformRight_d = vectorDataRight.
transformWeight(cellDataOrdinal,0,d,d);
2113 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2116 Scalar integral_d = 1.0;
2118 for (
int r=0; r<numPointTensorComponents; r++)
2120 integral_d *= componentIntegrals[r][d](leftTensorIterator.
argument(r),rightTensorIterator.
argument(r));
2123 integralSum += leftRightTransform_d * integral_d;
2126 const int i = leftFieldOrdinal + leftFieldOffset;
2127 const int j = rightFieldOrdinal + rightFieldOffset;
2129 if (integralViewRank == 3)
2131 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2135 integralView2(i,j) += cellMeasureWeight * integralSum;
2140 b_offset += rightDimSpan;
2144 a_offset += leftDimSpan;
2148 if (approximateFlops != NULL)
2151 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2159 const bool transposeLeft =
true;
2160 const bool transposeRight =
false;
2164 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2170 if (approximateFlops != NULL)
2175 const int leftFamilyCount = vectorDataLeft. vectorData().numFamilies();
2176 const int rightFamilyCount = vectorDataRight.
vectorData().numFamilies();
2177 const int leftComponentCount = vectorDataLeft. vectorData().numComponents();
2178 const int rightComponentCount = vectorDataRight.
vectorData().numComponents();
2180 int leftFieldOrdinalOffset = 0;
2181 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2186 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2187 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2193 a_offset += vectorDataLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2197 int rightFieldOrdinalOffset = 0;
2198 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2202 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2204 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2207 if (!rightComponent.
isValid())
2210 b_offset += vectorDataRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2214 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.
numTensorComponents() != rightComponent.
numTensorComponents(), std::invalid_argument,
"left TensorData and right TensorData have different number of tensor components. This is not supported.");
2216 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2217 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2226 bool forceNonSpecialized =
false;
2227 bool usePointCacheForRank3Tensor =
true;
2231 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2233 ExecutionSpace().fence();
2235 haveLaunchedContributionToCurrentFamilyLeft =
true;
2236 haveLaunchedContributionToCurrentFamilyRight =
true;
2238 if (integralViewRank == 2)
2242 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2244 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2245 const int teamSize = functor.
teamSize(recommendedTeamSize);
2247 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2249 Kokkos::parallel_for( policy , functor,
"F_IntegratePointValueCache rank 2");
2251 if (approximateFlops != NULL)
2253 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2258 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2260 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2261 const int teamSize = functor.
teamSize(recommendedTeamSize);
2263 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2265 Kokkos::parallel_for( policy , functor,
"F_Integrate rank 2");
2267 if (approximateFlops != NULL)
2269 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2273 else if (integralViewRank == 3)
2277 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2279 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2280 const int teamSize = functor.
teamSize(recommendedTeamSize);
2282 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2284 Kokkos::parallel_for( policy , functor,
"F_IntegratePointValueCache rank 3");
2286 if (approximateFlops != NULL)
2288 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2293 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2295 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2296 const int teamSize = functor.
teamSize(recommendedTeamSize);
2298 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2300 Kokkos::parallel_for( policy , functor,
"F_Integrate rank 3");
2302 if (approximateFlops != NULL)
2304 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.getDataExtent(0);
2308 b_offset += vectorDataRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2310 rightFieldOrdinalOffset += vectorDataRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal);
2312 a_offset += vectorDataLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2314 leftFieldOrdinalOffset += vectorDataLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal);
2321 ExecutionSpace().fence();
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
Hand-coded 3-component version.
KOKKOS_INLINE_FUNCTION DimensionInfo getDimensionInfo(const int &dim) const
Returns an object fully specifying the indicated dimension. This is used in determining appropriate s...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
KOKKOS_INLINE_FUNCTION void runSpecialized3(const TeamMember &teamMember) const
runSpecialized implementations are hand-coded variants of run() for a particular number of components...
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewRank() const
returns the rank of the View that stores the unique data
Implementation of a general sum factorization algorithm, using a novel approach developed by Roberts...
size_t team_shmem_size(int team_size) const
Provide the shared memory capacity.
KOKKOS_INLINE_FUNCTION enable_if_t< rank==1, const Kokkos::View< typename RankExpander< DataScalar, rank >::value_type, DeviceType > & > getUnderlyingView() const
Returns the underlying view. Throws an exception if the underlying view is not rank 1...
void clear() const
Copies 0.0 to the underlying View.
KOKKOS_INLINE_FUNCTION constexpr bool isValid() const
Returns true for containers that have data; false for those that don't (e.g., those that have been co...
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION bool underlyingMatchesLogical() const
Returns true if the underlying container has exactly the same rank and extents as the logical contain...
Defines TensorArgumentIterator, which allows systematic enumeration of a TensorData object...
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
Struct expressing all variation information about a Data object in a single dimension, including its logical extent and storage extent.
KOKKOS_INLINE_FUNCTION unsigned rank() const
Returns the logical rank of the Data container.
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< std::is_pod< T >::value, unsigned >::type dimension_scalar(const Kokkos::DynRankView< T, P... >)
specialization of functions for pod types, returning the scalar dimension (1 for pod types) of a view...
size_t team_shmem_size(int numThreads) const
Provide the shared memory capacity.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
KOKKOS_INLINE_FUNCTION void setEnumerationIndex(const ordinal_type &enumerationIndex)
Sets the enumeration index; this refers to a 1D enumeration of the possible in-bound arguments...
KOKKOS_INLINE_FUNCTION int extent_int(const int &r) const
Returns the logical extent in the specified dimension.
KOKKOS_INLINE_FUNCTION const ordinal_type & argument(const ordinal_type &r) const
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ****, DeviceType > & getUnderlyingView4() const
returns the View that stores the unique data. For rank-4 underlying containers.
void storeMatMat(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION std::enable_if< std::is_integral< iType >::value, ordinal_type >::type extent_int(const iType &d) const
Returns the logical extent in the requested dimension.
KOKKOS_INLINE_FUNCTION int incrementArgument(Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of incrementArgument; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION const Data< Scalar, DeviceType > & getTensorComponent(const ordinal_type &r) const
Returns the requested tensor component.
KOKKOS_INLINE_FUNCTION DimensionInfo combinedDimensionInfo(const DimensionInfo &myData, const DimensionInfo &otherData)
Returns DimensionInfo for a Data container that combines (through multiplication, say...
Defines the Intrepid2::FunctorIterator class, as well as the Intrepid2::functor_returns_ref SFINAE he...
KOKKOS_INLINE_FUNCTION ordinal_type numTensorComponents() const
Return the number of tensorial components.
KOKKOS_INLINE_FUNCTION int nextIncrementResult(const Kokkos::Array< int, Parameters::MaxTensorComponents > &arguments, const Kokkos::Array< int, Parameters::MaxTensorComponents > &bounds, const int &numComponents) const
runtime-sized variant of nextIncrementResult; gets used by approximate flop count.
KOKKOS_INLINE_FUNCTION int getDataExtent(const ordinal_type &d) const
returns the true extent of the data corresponding to the logical dimension provided; if the data does...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar **, DeviceType > & getUnderlyingView2() const
returns the View that stores the unique data. For rank-2 underlying containers.
View-like interface to tensor data; tensor components are stored separately and multiplied together a...
Allows systematic enumeration of all entries in a TensorData object, tracking indices for each tensor...
DataVariationType
Enumeration to indicate how data varies in a particular dimension of an Intrepid2::Data object...
long approximateFlopCountPerCell() const
returns an estimate of the number of floating point operations per cell (counting sums...
KOKKOS_INLINE_FUNCTION const Kokkos::View< DataScalar ***, DeviceType > & getUnderlyingView3() const
returns the View that stores the unique data. For rank-3 underlying containers.
int teamSize(const int &maxTeamSizeFromKokkos) const
returns the team size that should be provided to the policy constructor, based on the Kokkos maximum ...
Implementation of a general sum factorization algorithm, abstracted from the algorithm described by M...
static Data< DataScalar, DeviceType > allocateMatMatResult(const bool transposeA, const Data< DataScalar, DeviceType > &A_MatData, const bool transposeB, const Data< DataScalar, DeviceType > &B_MatData)
KOKKOS_INLINE_FUNCTION ordinal_type getUnderlyingViewSize() const
returns the number of entries in the View that stores the unique data
static constexpr ordinal_type MaxTensorComponents
Maximum number of tensor/Cartesian products that can be taken: this allows hypercube basis in 7D to b...