#ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP
#define KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP

#if defined( __CUDACC__ )

#include <iostream>

#include "Kokkos_CrsMatrix_MP_Vector.hpp"
#include "Kokkos_Core.hpp"
#include "Stokhos_Cuda_DeviceProp.hpp"
#include "Teuchos_TestForException.hpp"

//----------------------------------------------------------------------------
// Specializations of MPMultiply for Sacado::MP::Vector scalars on Cuda
//----------------------------------------------------------------------------

namespace Stokhos {
namespace details {

// Kernel launch wrapper.  For NumPerThread == 1 the kernel fits in few
// enough registers that we can request full occupancy through
// __launch_bounds__; larger NumPerThread uses the default launch path.
template <typename Kernel>
__global__ void
#if __CUDA_ARCH__ >= 300
__launch_bounds__(1024,2)
#endif
FullOccupancyKernelLaunch(Kernel kernel) {
  kernel();
}
// Kernel implementing y = A * x where
//   A is Kokkos::CrsMatrix< Sacado::MP::Vector<...>, ... >,
//   x and y are rank-1 Kokkos::View< Sacado::MP::Vector<...>*, ... >.
// Currently only works for statically sized MP::Vector.
template <typename MatrixStorage, typename MatrixOrdinal,
          typename MatrixMemory, typename MatrixSize,
          typename InputStorage, typename ... InputP,
          typename OutputStorage, typename ... OutputP,
          typename Update>
class MPMultiply<
  Kokkos::CrsMatrix<Sacado::MP::Vector<MatrixStorage>, MatrixOrdinal,
                    Kokkos::Cuda, MatrixMemory, MatrixSize>,
  Kokkos::View< Sacado::MP::Vector<InputStorage>*, InputP... >,
  Kokkos::View< Sacado::MP::Vector<OutputStorage>*, OutputP... >,
  Update >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage>  InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Cuda Device;
  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::CrsMatrix<MatrixValue, MatrixOrdinal, Device,
                            MatrixMemory, MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< InputVectorValue*,  InputP... >  input_vector_type;
  typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type;
  typedef Update update_type;
  // Kernel that loads sparse row offsets and column indices through shared
  // memory to get coalesced global-memory access
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type        m_update;
    const size_type          m_row_count;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type & update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.dimension_0()-1 )
      {}
    __device__ inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in a coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        for (size_type e=0; e<NumPerThread; ++e)
          sum[e] = 0;

        for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
             col_block+=blockDim.x) {
          const size_type num_col =
            col_block+blockDim.x <= iEntryEnd ?
              blockDim.x : iEntryEnd-col_block;

          // Read blockDim.x column indices at a time to maintain coalesced
          // accesses; no __syncthreads() needed if blockDim.x <= warp size
          if (threadIdx.x < num_col)
            sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
          if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
            __syncthreads();

          for ( size_type col = 0; col < num_col; ++col ) {
            const size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

            for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                 ++e, ee+=blockDim.x) {
              sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                        m_x(iCol).fastAccessCoeff(ee);
            }
          }
        }

        for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
             ++e, ee+=blockDim.x) {
          m_update( m_y(iRow).fastAccessCoeff(ee), sum[e] );
        }
      }
    }
  }; // struct Kernel
  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid dimensions.  If the user-specified block
    // dimensions are zero, default to one thread per vector entry and a
    // 256-thread block.
    const size_type value_dimension = dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.dimension_0()-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides the number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Check threads_per_vector does not exceed the warp size (the kernel
    // assumes this when skipping __syncthreads())
    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    TEUCHOS_TEST_FOR_EXCEPTION(
      threads_per_vector > warp_size, std::logic_error,
      "Threads/vector cannot exceed Cuda warp size");

    // Launch the kernel specialized on the static number of entries per
    // thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error,
        "Invalid num_per_thread == " << num_per_thread);
  }
private:

  // Launch the kernel, templated on the static number of entries per thread
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           const dim3 & block,
                           const dim3 & grid )
  {
    typedef Kernel<num_per_thread> Krnl;

    // Optionally print occupancy for tuning (64 warps/SM is 100% on Kepler)
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    // Shared memory holds blockDim.y+1 row offsets plus one column index per
    // thread; match the bank size to sizeof(size_type)
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};
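// Usage sketch for the rank-1 specialization above (comment only, not
// compiled; AddAssign and the sizes below are hypothetical, not part of
// this header):
//
//   struct AddAssign {  // applied per ensemble coefficient: y_coeff += sum
//     template <typename Scalar>
//     KOKKOS_INLINE_FUNCTION
//     void operator()(Scalar& y_coeff, const Scalar& sum) const {
//       y_coeff += sum;
//     }
//   };
//
//   typedef Stokhos::StaticFixedStorage<int,double,16,Kokkos::Cuda> Storage;
//   typedef Sacado::MP::Vector<Storage> Scalar;
//   typedef Kokkos::CrsMatrix<Scalar,int,Kokkos::Cuda> Matrix;
//   typedef Kokkos::View<Scalar*,Kokkos::Cuda> Vector;
//
//   Matrix A = ...;  // assembled elsewhere, with A.dev_config set
//   Vector x("x",nCol), y("y",nRow);
//   MPMultiply<Matrix,Vector,Vector,AddAssign>::apply(A, x, y, AddAssign());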
// Kernel implementing y = A * x where
//   A is Kokkos::CrsMatrix< Sacado::MP::Vector<...>, ... >,
//   x and y are rank-2 Kokkos::View< Sacado::MP::Vector<...>**, ... >
//   (multiple right-hand sides).
// Currently only works for statically sized MP::Vector.
template <typename MatrixStorage, typename MatrixOrdinal,
          typename MatrixMemory, typename MatrixSize,
          typename InputStorage, typename ... InputP,
          typename OutputStorage, typename ... OutputP,
          typename Update>
class MPMultiply<
  Kokkos::CrsMatrix<Sacado::MP::Vector<MatrixStorage>, MatrixOrdinal,
                    Kokkos::Cuda, MatrixMemory, MatrixSize>,
  Kokkos::View< Sacado::MP::Vector<InputStorage>**, InputP... >,
  Kokkos::View< Sacado::MP::Vector<OutputStorage>**, OutputP... >,
  Update >
{
public:

  typedef Sacado::MP::Vector<MatrixStorage> MatrixValue;
  typedef Sacado::MP::Vector<InputStorage>  InputVectorValue;
  typedef Sacado::MP::Vector<OutputStorage> OutputVectorValue;

  typedef Kokkos::Cuda Device;
  typedef Device execution_space;
  typedef typename execution_space::size_type size_type;

  typedef Kokkos::CrsMatrix<MatrixValue, MatrixOrdinal, Device,
                            MatrixMemory, MatrixSize> matrix_type;
  typedef typename matrix_type::values_type matrix_values_type;
  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
  typedef Kokkos::View< InputVectorValue**,  InputP... >  input_vector_type;
  typedef Kokkos::View< OutputVectorValue**, OutputP... > output_vector_type;
  typedef Update update_type;
  // Kernel that loads sparse row offsets and column indices through shared
  // memory to get coalesced global-memory access, looping over the columns
  // of x and y
  template <unsigned NumPerThread>
  struct Kernel {
    typedef typename output_vector_type::value_type output_vector_value;
    typedef typename output_vector_value::value_type scalar_type;

    const matrix_graph_type  m_Agraph;
    const matrix_values_type m_Avals;
    const input_vector_type  m_x;
    const output_vector_type m_y;
    const update_type        m_update;
    const size_type          m_row_count;
    const size_type          m_num_vec_cols;

    Kernel( const matrix_type & A,
            const input_vector_type & x,
            const output_vector_type & y,
            const update_type & update )
      : m_Agraph( A.graph )
      , m_Avals( A.values )
      , m_x( x )
      , m_y( y )
      , m_update( update )
      , m_row_count( A.graph.row_map.dimension_0()-1 )
      , m_num_vec_cols( x.dimension_1() )
      {}
    __device__ inline void operator()(void) const
    {
      volatile size_type * const sh_row =
        kokkos_impl_cuda_shared_memory<size_type>();
      volatile size_type * const sh_col = sh_row + blockDim.y+1;

      const size_type tid = blockDim.x*threadIdx.y + threadIdx.x;
      const size_type nid = blockDim.x*blockDim.y;

      const size_type block_row = blockDim.y*blockIdx.x;

      // Read blockDim.y+1 row offsets in a coalesced manner
      const size_type num_row =
        block_row+blockDim.y+1 <= m_row_count+1 ? blockDim.y+1 :
        m_row_count+1 - block_row;
      for (size_type i=tid; i<num_row; i+=nid)
        sh_row[i] = m_Agraph.row_map[block_row+i];
      __syncthreads();

      const size_type iRow = block_row + threadIdx.y;
      if (iRow < m_row_count) {
        scalar_type sum[NumPerThread];
        const size_type iEntryBegin = sh_row[threadIdx.y];
        const size_type iEntryEnd   = sh_row[threadIdx.y+1];

        // Loop over the columns of x and y, reloading the sparse entries for
        // each column to keep the shared-memory footprint small
        for (size_type vec_col=0; vec_col<m_num_vec_cols; vec_col++) {

          for (size_type e=0; e<NumPerThread; ++e)
            sum[e] = 0;

          for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
               col_block+=blockDim.x) {
            const size_type num_col =
              col_block+blockDim.x <= iEntryEnd ?
                blockDim.x : iEntryEnd-col_block;

            // Read blockDim.x column indices at a time to maintain coalesced
            // accesses; no __syncthreads() needed if blockDim.x <= warp size
            if (threadIdx.x < num_col)
              sh_col[tid] = m_Agraph.entries(col_block+threadIdx.x);
            if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
              __syncthreads();

            for ( size_type col = 0; col < num_col; ++col ) {
              const size_type iCol = sh_col[blockDim.x*threadIdx.y + col];

              for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
                   ++e, ee+=blockDim.x) {
                sum[e] += m_Avals(col_block+col).fastAccessCoeff(ee) *
                          m_x(iCol, vec_col).fastAccessCoeff(ee);
              }
            }
          }

          for (size_type e=0, ee=threadIdx.x; e<NumPerThread;
               ++e, ee+=blockDim.x) {
            m_update( m_y(iRow, vec_col).fastAccessCoeff(ee), sum[e] );
          }
        }
      }
    }
  }; // struct Kernel
  static void apply( const matrix_type & A,
                     const input_vector_type & x,
                     const output_vector_type & y,
                     const update_type & update )
  {
    // Compute CUDA block and grid dimensions.  If the user-specified block
    // dimensions are zero, default to one thread per vector entry and a
    // 256-thread block.
    const size_type value_dimension = dimension_scalar(x);

    size_type threads_per_vector = A.dev_config.block_dim.x;
    if (threads_per_vector == 0)
      threads_per_vector = value_dimension;
    size_type rows_per_block = A.dev_config.block_dim.y;
    if (rows_per_block == 0)
      rows_per_block = 256 / threads_per_vector;
    const size_type row_count = A.graph.row_map.dimension_0()-1;
    const size_type num_blocks = (row_count+rows_per_block-1)/rows_per_block;
    const dim3 block( threads_per_vector, rows_per_block, 1 );
    const dim3 grid( num_blocks, 1 );

    // Check threads_per_vector evenly divides the number of vector entries
    size_type num_per_thread = value_dimension / threads_per_vector;
    TEUCHOS_TEST_FOR_EXCEPTION(
      num_per_thread * threads_per_vector != value_dimension, std::logic_error,
      "Entries/thread * threads/vector must equal number of vector entries");

    // Launch the kernel specialized on the static number of entries per
    // thread
    if (num_per_thread == 1) {
      launch_impl<1>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 2) {
      launch_impl<2>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 3) {
      launch_impl<3>( A, x, y, update, block, grid );
    }
    else if (num_per_thread == 4) {
      launch_impl<4>( A, x, y, update, block, grid );
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(
        true, std::logic_error,
        "Invalid num_per_thread == " << num_per_thread);
  }
private:

  // Launch the kernel, templated on the static number of entries per thread
  template <unsigned num_per_thread>
  static void launch_impl( const matrix_type & A,
                           const input_vector_type & x,
                           const output_vector_type & y,
                           const update_type & update,
                           const dim3 & block,
                           const dim3 & grid )
  {
    typedef Kernel<num_per_thread> Krnl;

    // Optionally print occupancy for tuning (64 warps/SM is 100% on Kepler)
    const bool occupancy_check = false;
    if (occupancy_check) {
      DeviceProp device_prop;
      size_type warps_per_sm;
      if (num_per_thread == 1)
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          FullOccupancyKernelLaunch<Krnl>);
      else
        warps_per_sm = device_prop.get_resident_warps_per_sm(
          Kokkos::Impl::cuda_parallel_launch_local_memory<Krnl>);
      std::cout << "warps_per_sm = " << warps_per_sm
                << " max_warps_per_sm = " << device_prop.max_warps_per_sm
                << std::endl;
    }

    // Shared memory holds blockDim.y+1 row offsets plus one column index per
    // thread; match the bank size to sizeof(size_type)
    const size_t shared = (block.y+1 + block.x*block.y)*sizeof(size_type);
    if (sizeof(size_type) <= 4)
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
    else
      cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Launch
    if (num_per_thread == 1)
      FullOccupancyKernelLaunch<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< grid, block, shared >>>
        ( Krnl( A, x, y, update ) );
  }
};

} // namespace details
} // namespace Stokhos

#endif /* #if defined( __CUDACC__ ) */

#endif /* #ifndef KOKKOS_CRSMATRIX_MP_VECTOR_CUDA_HPP */