42 #ifndef KOKKOS_PARALLEL_MP_VECTOR_HPP 43 #define KOKKOS_PARALLEL_MP_VECTOR_HPP 46 #include "Kokkos_Core.hpp" 57 template<
class ExecSpace >
69 const size_t shared_ = 0 ) :
#if defined( KOKKOS_HAVE_PTHREAD )

// Specialization of ParallelFor<> for MPVectorWorkConfig on Threads.
// The team size is ignored: the configuration is forwarded to the standard
// [0, range) RangePolicy work-range dispatch of the base class.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< Threads > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< Threads > > {
  typedef Kokkos::RangePolicy< Threads > Policy ;
public:
  // Delegate to the plain range-policy ParallelFor over [0, range).
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< Threads > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};

#endif // defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_OPENMP )

// Specialization of ParallelFor<> for MPVectorWorkConfig on OpenMP.
// The team size is ignored: the configuration is forwarded to the standard
// [0, range) RangePolicy work-range dispatch of the base class.
template< class FunctorType >
class ParallelFor< FunctorType , MPVectorWorkConfig< OpenMP > > :
  public ParallelFor< FunctorType , Kokkos::RangePolicy< OpenMP > > {
  typedef Kokkos::RangePolicy< OpenMP > Policy ;
public:
  // Delegate to the plain range-policy ParallelFor over [0, range).
  ParallelFor( const FunctorType & functor ,
               const MPVectorWorkConfig< OpenMP > & work_config ) :
    ParallelFor< FunctorType , Policy >( functor ,
                                         Policy( 0, work_config.range ) ) {}
};

#endif // defined( KOKKOS_HAVE_OPENMP )
117 #if defined(KOKKOS_HAVE_SERIAL) 126 template<
class FunctorType >
127 class ParallelFor< FunctorType , MPVectorWorkConfig< Serial > > :
128 public ParallelFor< FunctorType , Kokkos::RangePolicy< Serial > > {
129 typedef Kokkos::RangePolicy< Serial > Policy ;
131 ParallelFor(
const FunctorType & functor ,
132 const MPVectorWorkConfig< Serial > & work_config ) :
133 ParallelFor< FunctorType , Policy >( functor ,
134 Policy( 0, work_config.range ) ) {}
136 #endif // defined(KOKKOS_HAVE_SERIAL) 138 #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) 142 template<
class FunctorType >
143 class ParallelFor< FunctorType , MPVectorWorkConfig< Cuda > > {
146 const FunctorType m_functor ;
147 const MPVectorWorkConfig< Cuda > m_config;
148 const Cuda::size_type m_work ;
152 void operator()(
void)
const 154 const Cuda::size_type work_stride = blockDim.y * gridDim.x ;
156 for ( Cuda::size_type iwork = threadIdx.y + blockDim.y * blockIdx.x ;
158 iwork += work_stride ) {
159 m_functor( iwork , threadIdx.x );
163 ParallelFor(
const FunctorType & functor ,
164 const MPVectorWorkConfig< Cuda > & work_config )
165 : m_functor( functor ) ,
166 m_config( work_config ) ,
167 m_work( work_config.range )
177 Cuda::size_type nwarp = 0;
178 if (m_config.team > CudaTraits::WarpSize) {
179 const Cuda::size_type warps_per_team =
180 ( m_config.team + CudaTraits::WarpSize-1 ) / CudaTraits::WarpSize;
181 nwarp = cuda_internal_maximum_warp_count() / warps_per_team;
184 const Cuda::size_type teams_per_warp =
185 CudaTraits::WarpSize / m_config.team ;
186 nwarp = cuda_internal_maximum_warp_count() * teams_per_warp;
188 const dim3 block( m_config.team , nwarp , 1 );
190 Cuda::size_type nblock =
191 std::min( (m_work + block.y - 1 ) / block.y ,
192 cuda_internal_maximum_grid_count() );
193 const dim3 grid( nblock , 1 , 1 );
195 const Cuda::size_type shared = m_config.shared;
196 CudaParallelLaunch< ParallelFor >( *this , grid , block , shared );
MPVectorWorkConfig(const size_t range_, const size_t team_, const size_t shared_=0)
ExecSpace execution_space
KOKKOS_INLINE_FUNCTION PCE< Storage > min(const typename PCE< Storage >::value_type &a, const PCE< Storage > &b)
Team-based parallel work configuration for Sacado::MP::Vector.
MPVectorWorkConfig execution_policy