Stokhos Package Browser (Single Doxygen Collection)
Kokkos_CrsMatrix_UQ_PCE.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP
43 #define KOKKOS_CRSMATRIX_UQ_PCE_HPP
44 
45 #include "Sacado_UQ_PCE.hpp"
46 #include "Kokkos_View_UQ_PCE.hpp"
48 #include "Kokkos_Sparse.hpp"
49 #include "Kokkos_Blas1_UQ_PCE.hpp" // for some utilities
50 
51 #include "Stokhos_Multiply.hpp"
53 
54 namespace Stokhos {
55 
56 //----------------------------------------------------------------------------
57 // Specialization of KokkosSparse::CrsMatrix for Sacado::UQ::PCE scalar type
58 //----------------------------------------------------------------------------
59 
60 // Kernel implementing y = A * x where
61 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
62 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
63 // x and y are rank 1
64 template <typename MatrixDevice,
65  typename MatrixStorage,
66  typename MatrixOrdinal,
67  typename MatrixMemory,
68  typename MatrixSize,
69  typename InputStorage,
70  typename ... InputP,
71  typename OutputStorage,
72  typename ... OutputP>
73 class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
74  MatrixOrdinal,
75  MatrixDevice,
76  MatrixMemory,
77  MatrixSize>,
78  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
79  InputP... >,
80  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
81  OutputP... >
82  >
83 {
84 public:
85  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
86  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
87  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
88 
89  typedef MatrixDevice execution_space;
90 
91  typedef KokkosSparse::CrsMatrix< MatrixValue,
92  MatrixOrdinal,
93  MatrixDevice,
94  MatrixMemory,
95  MatrixSize> matrix_type;
96  typedef typename matrix_type::values_type matrix_values_type;
97  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
98  typedef typename tensor_type::size_type size_type;
99  typedef Kokkos::View< InputVectorValue*,
100  InputP... > input_vector_type;
101  typedef Kokkos::View< OutputVectorValue*,
102  OutputP... > output_vector_type;
103 
104 private:
105 
106  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
107  typedef typename matrix_values_type::array_type matrix_array_type;
108  typedef typename input_vector_type::array_type input_array_type;
109  typedef typename output_vector_type::array_type output_array_type;
110 
111  typedef typename MatrixValue::value_type matrix_scalar;
112  typedef typename InputVectorValue::value_type input_scalar;
113  typedef typename OutputVectorValue::value_type output_scalar;
114  typedef typename tensor_type::value_type tensor_scalar;
115 
116  const matrix_array_type m_A_values ;
117  const matrix_graph_type m_A_graph ;
118  const input_array_type m_x ;
119  const output_array_type m_y ;
120  const tensor_type m_tensor ;
121  const input_scalar m_a ;
122  const output_scalar m_b ;
123 
124  Multiply( const matrix_type & A ,
125  const input_vector_type & x ,
126  const output_vector_type & y ,
127  const input_scalar & a ,
128  const output_scalar & b )
129  : m_A_values( A.values )
130  , m_A_graph( A.graph )
131  , m_x( x )
132  , m_y( y )
133  , m_tensor( Kokkos::cijk(A.values) )
134  , m_a( a )
135  , m_b( b )
136  {}
137 
138 public:
139 
140  //
141  // Non-team functor interface -- no threads within PCE multiply
142  //
143  // Note: Remember that the matrix is currently always LayoutRight!
144  //
145  KOKKOS_INLINE_FUNCTION
146  void operator()( const size_type iBlockRow ) const
147  {
148  // Prefer that y[ m_tensor.dimension() ] be scratch space
149  // on the local thread, but cannot dynamically allocate
150  output_scalar * const y = & m_y(0,iBlockRow);
151 
152  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
153  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
154 
155  // Leading dimension guaranteed contiguous for LayoutLeft
156  if ( m_b == output_scalar(0) )
157  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
158  y[j] = 0 ;
159  else
160  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
161  y[j] = m_b * y[j] ;
162  // Loop over columns of A
163  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
164  const input_scalar * const x = & m_x( 0 , m_A_graph.entries(iEntry) );
165  const matrix_scalar * const A = & m_A_values( iEntry , 0 );
166 
167  BlockMultiply< tensor_type >::apply(
168  m_tensor , A , x , y , m_a );
169  }
170 
171  }
172 
173 #if defined(__MIC__)
174 
175  //
176  // Team functor interface with threading within PCE multiply
177  //
178  // Note: Remember that the matrix is currently always LayoutRight!
179  //
180  // This is a MIC-specific version that processes multiple FEM columns
181  // at a time to reduce tensor reads
182  //
183  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
184  KOKKOS_INLINE_FUNCTION
185  void operator()( const team_member & device ) const
186  {
187  const size_type iBlockRow = device.league_rank();
188 
189  // Check for valid row
190  const size_type row_count = m_A_graph.row_map.dimension_0()-1;
191  if (iBlockRow >= row_count)
192  return;
193 
194  const size_type num_thread = device.team_size();
195  const size_type thread_idx = device.team_rank();
196  const Kokkos::pair<size_type,size_type> work_range =
197  details::compute_work_range<output_scalar>(
198  device, m_tensor.dimension(), num_thread, thread_idx);
199 
200  // Prefer that y[ m_tensor.dimension() ] be scratch space
201  // on the local thread, but cannot dynamically allocate
202  output_scalar * const y = & m_y(0,iBlockRow);
203 
204  // Leading dimension guaranteed contiguous for LayoutLeft
205  if ( m_b == output_scalar(0) )
206  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
207  y[j] = 0 ;
208  else
209  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
210  y[j] = m_b * y[j] ;
211 
212  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
213  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
214  const size_type BlockSize = 9;
215  const size_type numBlock =
216  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
217 
218  const matrix_scalar* sh_A[BlockSize];
219  const input_scalar* sh_x[BlockSize];
220 
221  size_type iBlockEntry = iBlockEntryBeg;
222  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
223  const size_type block_size =
224  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
225 
226  for ( size_type col = 0; col < block_size; ++col ) {
227  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
228  sh_x[col] = & m_x( 0 , iBlockColumn );
229  sh_A[col] = & m_A_values( iBlockEntry + col , 0);
230  }
231 
232  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
233 
234  const size_type nEntry = m_tensor.num_entry(iy);
235  const size_type iEntryBeg = m_tensor.entry_begin(iy);
236  const size_type iEntryEnd = iEntryBeg + nEntry;
237  size_type iEntry = iEntryBeg;
238 
239  output_scalar ytmp = 0 ;
240 
241  // Do entries with a blocked loop of size blocksize
242  const size_type nBlock = nEntry / tensor_type::vectorsize;
243  const size_type nEntryB = nBlock * tensor_type::vectorsize;
244  const size_type iEnd = iEntryBeg + nEntryB;
245 
246  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
247  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
248  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
249  VecTV vy;
250  vy.zero();
251  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
252  const size_type *j = &m_tensor.coord(iEntry,0);
253  const size_type *k = &m_tensor.coord(iEntry,1);
254  ValTV c(&(m_tensor.value(iEntry)));
255 
256  for ( size_type col = 0; col < block_size; ++col ) {
257  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
258  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
259 
260  // vy += c * ( aj * xk + ak * xj)
261  aj.times_equal(xk);
262  aj.multiply_add(ak, xj);
263  vy.multiply_add(c, aj);
264  }
265  }
266  ytmp += vy.sum();
267 
268  // The number of nonzeros is always constrained to be a multiple of 8
269 
270  const size_type rem = iEntryEnd-iEntry;
271  if (rem >= 8) {
272  typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
273  typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
274  typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
275  const size_type *j = &m_tensor.coord(iEntry,0);
276  const size_type *k = &m_tensor.coord(iEntry,1);
277  ValTV2 c(&(m_tensor.value(iEntry)));
278 
279  for ( size_type col = 0; col < block_size; ++col ) {
280  MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
281  VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
282 
283  // vy += c * ( aj * xk + ak * xj)
284  aj.times_equal(xk);
285  aj.multiply_add(ak, xj);
286  aj.times_equal(c);
287  ytmp += aj.sum();
288  }
289  }
290 
291  y[iy] += m_a * ytmp ;
292  }
293 
294  // Add a team barrier to keep the thread team in-sync before going on
295  // to the next block
296  device.team_barrier();
297  }
298 
299  }
300 
301 #else
302 
303  //
304  // Team functor interface with threading within PCE multiply
305  //
306  // Note: Remember that the matrix is currently always LayoutRight!
307  //
308  // This is a general, hand-vectorized version that processes multiple FEM
309  // columns at a time to reduce tensor reads. Note that auto-vectorization
310  // doesn't work here because of the inner-loop over FEM columns.
311  //
312  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
313  KOKKOS_INLINE_FUNCTION
314  void operator()( const team_member & device ) const
315  {
316  const size_type iBlockRow = device.league_rank();
317 
318  // Check for valid row
319  const size_type row_count = m_A_graph.row_map.dimension_0()-1;
320  if (iBlockRow >= row_count)
321  return;
322 
323  const size_type num_thread = device.team_size();
324  const size_type thread_idx = device.team_rank();
325  const Kokkos::pair<size_type,size_type> work_range =
326  details::compute_work_range<output_scalar>(
327  device, m_tensor.dimension(), num_thread, thread_idx);
328 
329  // Prefer that y[ m_tensor.dimension() ] be scratch space
330  // on the local thread, but cannot dynamically allocate
331  output_scalar * const y = & m_y(0,iBlockRow);
332 
333  // Leading dimension guaranteed contiguous for LayoutLeft
334  if ( m_b == output_scalar(0) )
335  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
336  y[j] = 0 ;
337  else
338  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
339  y[j] = m_b * y[j] ;
340 
341  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
342  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
343  const size_type BlockSize = 14;
344  const size_type numBlock =
345  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
346 
347  const matrix_scalar* sh_A[BlockSize];
348  const input_scalar* sh_x[BlockSize];
349 
350  size_type iBlockEntry = iBlockEntryBeg;
351  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
352  const size_type block_size =
353  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
354 
355  for ( size_type col = 0; col < block_size; ++col ) {
356  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
357  sh_x[col] = & m_x( 0 , iBlockColumn );
358  sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
359  }
360 
361  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ) {
362 
363  const size_type nEntry = m_tensor.num_entry(iy);
364  const size_type iEntryBeg = m_tensor.entry_begin(iy);
365  const size_type iEntryEnd = iEntryBeg + nEntry;
366  size_type iEntry = iEntryBeg;
367 
368  output_scalar ytmp = 0 ;
369 
370  // Do entries with a blocked loop of size blocksize
371  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize) {
372  const size_type nBlock = nEntry / tensor_type::vectorsize;
373  const size_type nEntryB = nBlock * tensor_type::vectorsize;
374  const size_type iEnd = iEntryBeg + nEntryB;
375 
376  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
377  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
378  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
379  VecTV vy;
380  vy.zero();
381  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
382  const size_type *j = &m_tensor.coord(iEntry,0);
383  const size_type *k = &m_tensor.coord(iEntry,1);
384  ValTV c(&(m_tensor.value(iEntry)));
385 
386  for ( size_type col = 0; col < block_size; ++col ) {
387  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
388  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
389 
390  // vy += c * ( aj * xk + ak * xj)
391  aj.times_equal(xk);
392  aj.multiply_add(ak, xj);
393  vy.multiply_add(c, aj);
394  }
395  }
396  ytmp += vy.sum();
397  }
398 
399  // Do remaining entries with a scalar loop
400  for ( ; iEntry<iEntryEnd; ++iEntry) {
401  const size_type j = m_tensor.coord(iEntry,0);
402  const size_type k = m_tensor.coord(iEntry,1);
403  tensor_scalar cijk = m_tensor.value(iEntry);
404 
405  for ( size_type col = 0; col < block_size; ++col ) {
406  ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
407  sh_A[col][k] * sh_x[col][j] );
408  }
409 
410  }
411 
412  y[iy] += m_a * ytmp ;
413  }
414 
415  // Add a team barrier to keep the thread team in-sync before going on
416  // to the next block
417  device.team_barrier();
418  }
419 
420  }
421 
422 #endif
423 
424  static void apply( const matrix_type & A ,
425  const input_vector_type & x ,
426  const output_vector_type & y ,
427  const input_scalar & a = input_scalar(1) ,
428  const output_scalar & b = output_scalar(0) )
429  {
430  // Generally the block algorithm seems to perform better on the MIC,
431  // as long as the stochastic size isn't too big, but doesn't perform
432  // any better on the CPU (probably because the CPU has a fat L3 cache
433  // to store the sparse 3 tensor).
434 #ifdef __MIC__
435  const bool use_block_algorithm = true;
436 #else
437  const bool use_block_algorithm = false;
438 #endif
439 
440  const size_t row_count = A.graph.row_map.dimension_0() - 1 ;
441  if (use_block_algorithm) {
442 #ifdef __MIC__
443  const size_t team_size = 4; // 4 hyperthreads for MIC
444 #else
445  const size_t team_size = 2; // 2 for everything else
446 #endif
447  const size_t league_size = row_count;
448  Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
449  Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
450  }
451  else {
452  Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
453  }
454  }
455 };
456 
457 // Kernel implementing y = A * x where
458 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
459 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
460 // x and y are rank 2
461 template <typename MatrixDevice,
462  typename MatrixStorage,
463  typename MatrixOrdinal,
464  typename MatrixMemory,
465  typename MatrixSize,
466  typename InputStorage,
467  typename ... InputP,
468  typename OutputStorage,
469  typename ... OutputP>
470 class Multiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
471  MatrixOrdinal,
472  MatrixDevice,
473  MatrixMemory,
474  MatrixSize>,
475  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
476  InputP... >,
477  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
478  OutputP... >
479  >
480 {
481 public:
482  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
483  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
484  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
485 
486  typedef MatrixDevice execution_space;
487 
488  typedef KokkosSparse::CrsMatrix< MatrixValue,
489  MatrixOrdinal,
490  MatrixDevice,
491  MatrixMemory,
492  MatrixSize> matrix_type;
493  typedef typename matrix_type::values_type matrix_values_type;
494  typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
495  typedef typename tensor_type::size_type size_type;
496  typedef Kokkos::View< InputVectorValue**,
497  InputP... > input_vector_type;
498  typedef Kokkos::View< OutputVectorValue**,
499  OutputP... > output_vector_type;
500 
501 private:
502 
503  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
504  typedef typename matrix_values_type::array_type matrix_array_type;
505  typedef typename input_vector_type::array_type input_array_type;
506  typedef typename output_vector_type::array_type output_array_type;
507 
508  typedef typename MatrixValue::value_type matrix_scalar;
509  typedef typename InputVectorValue::value_type input_scalar;
510  typedef typename OutputVectorValue::value_type output_scalar;
511  typedef typename tensor_type::value_type tensor_scalar;
512 
513  const matrix_array_type m_A_values ;
514  const matrix_graph_type m_A_graph ;
515  const input_array_type m_x ;
516  const output_array_type m_y ;
517  const tensor_type m_tensor ;
518  const input_scalar m_a ;
519  const output_scalar m_b ;
520 
521  Multiply( const matrix_type & A ,
522  const input_vector_type & x ,
523  const output_vector_type & y ,
524  const input_scalar & a ,
525  const output_scalar & b )
526  : m_A_values( A.values )
527  , m_A_graph( A.graph )
528  , m_x( x )
529  , m_y( y )
530  , m_tensor( Kokkos::cijk(A.values) )
531  , m_a( a )
532  , m_b( b )
533  {}
534 
535 public:
536 
537  //
538  // Non-team functor interface -- no threads within PCE multiply
539  //
540  // Note: Remember that the matrix is currently always LayoutRight!
541  //
542  KOKKOS_INLINE_FUNCTION
543  void operator()( const size_type iBlockRow ) const
544  {
545  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
546  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
547 
548  const size_type num_col = m_y.dimension_2();
549 
550  // Leading dimension guaranteed contiguous for LayoutLeft
551  if ( m_b == output_scalar(0) )
552  for (size_type col=0; col<num_col; ++col)
553  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
554  m_y(j, iBlockRow, col) = 0 ;
555  else
556  for (size_type col=0; col<num_col; ++col)
557  for ( size_type j = 0 ; j < m_tensor.dimension() ; ++j )
558  m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
559 
560  // Put the x-column loop inside the A-column loop to reuse entries in A.
561  // This way all of the entries for that particular column of A should stay
562  // in L1 cache for all of the columns of x.
563 
564  for ( size_type iEntry = iEntryBegin ; iEntry < iEntryEnd ; ++iEntry ) {
565  const matrix_scalar * const A = &m_A_values( iEntry, 0 );
566  const size_type iBlockCol = m_A_graph.entries(iEntry);
567 
568  for (size_type col=0; col<num_col; ++col) {
569  output_scalar * const y = &m_y( 0, iBlockRow, col );
570  const input_scalar * const x = &m_x( 0, iBlockCol, col );
571  BlockMultiply< tensor_type >::apply( m_tensor , A , x , y , m_a );
572  }
573 
574  }
575 
576  }
577 
578 #if defined(__MIC__)
579 
580  //
581  // Team functor interface with threading within PCE multiply
582  //
583  // Note: Remember that the matrix is currently always LayoutRight!
584  //
585  // This is a MIC-specific version that processes multiple FEM columns
586  // at a time to reduce tensor reads
587  //
588  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
589 
590  KOKKOS_INLINE_FUNCTION
591  void operator()( const team_member & device ) const
592  {
593  const size_type iBlockRow = device.league_rank();
594 
595  // Check for valid row
596  const size_type row_count = m_A_graph.row_map.dimension_0()-1;
597  if (iBlockRow >= row_count)
598  return;
599 
600  const size_type num_thread = device.team_size();
601  const size_type thread_idx = device.team_rank();
602  const Kokkos::pair<size_type,size_type> work_range =
603  details::compute_work_range<output_scalar>(
604  device, m_tensor.dimension(), num_thread, thread_idx);
605 
606  const size_type num_col = m_y.dimension_2();
607 
608  // Leading dimension guaranteed contiguous for LayoutLeft
609  if ( m_b == output_scalar(0) )
610  for (size_type col=0; col<num_col; ++col)
611  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
612  m_y(j, iBlockRow, col) = 0 ;
613  else
614  for (size_type col=0; col<num_col; ++col)
615  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
616  m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
617 
618  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
619  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
620  const size_type BlockSize = 9;
621  const size_type numBlock =
622  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
623 
624  const matrix_scalar* sh_A[BlockSize];
625  const input_scalar* sh_x[BlockSize];
626 
627  size_type iBlockEntry = iBlockEntryBeg;
628  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
629  const size_type block_size =
630  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
631 
632  // Loop over columns of x, y
633  for (size_type vec_col=0; vec_col<num_col; ++vec_col) {
634 
635  output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );
636 
637  for ( size_type col = 0; col < block_size; ++col ) {
638  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
639  sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
640  sh_A[col] = & m_A_values( iBlockEntry + col , 0);
641  }
642 
643  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
644 
645  const size_type nEntry = m_tensor.num_entry(iy);
646  const size_type iEntryBeg = m_tensor.entry_begin(iy);
647  const size_type iEntryEnd = iEntryBeg + nEntry;
648  size_type iEntry = iEntryBeg;
649 
650  output_scalar ytmp = 0 ;
651 
652  // Do entries with a blocked loop of size blocksize
653  const size_type nBlock = nEntry / tensor_type::vectorsize;
654  const size_type nEntryB = nBlock * tensor_type::vectorsize;
655  const size_type iEnd = iEntryBeg + nEntryB;
656 
657  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
658  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
659  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
660  VecTV vy;
661  vy.zero();
662  for (size_type block=0; block<nBlock; ++block, iEntry+=tensor_type::vectorsize) {
663  const size_type *j = &m_tensor.coord(iEntry,0);
664  const size_type *k = &m_tensor.coord(iEntry,1);
665  ValTV c(&(m_tensor.value(iEntry)));
666 
667  for ( size_type col = 0; col < block_size; ++col ) {
668  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
669  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
670 
671  // vy += c * ( aj * xk + ak * xj)
672  aj.times_equal(xk);
673  aj.multiply_add(ak, xj);
674  vy.multiply_add(c, aj);
675  }
676  }
677  ytmp += vy.sum();
678 
679  // The number of nonzeros is always constrained to be a multiple of 8
680 
681  const size_type rem = iEntryEnd-iEntry;
682  if (rem >= 8) {
683  typedef TinyVec<tensor_scalar,8,tensor_type::use_intrinsics> ValTV2;
684  typedef TinyVec<matrix_scalar,8,tensor_type::use_intrinsics> MatTV2;
685  typedef TinyVec<output_scalar,8,tensor_type::use_intrinsics> VecTV2;
686  const size_type *j = &m_tensor.coord(iEntry,0);
687  const size_type *k = &m_tensor.coord(iEntry,1);
688  ValTV2 c(&(m_tensor.value(iEntry)));
689 
690  for ( size_type col = 0; col < block_size; ++col ) {
691  MatTV2 aj(sh_A[col], j), ak(sh_A[col], k);
692  VecTV2 xj(sh_x[col], j), xk(sh_x[col], k);
693 
694  // vy += c * ( aj * xk + ak * xj)
695  aj.times_equal(xk);
696  aj.multiply_add(ak, xj);
697  aj.times_equal(c);
698  ytmp += aj.sum();
699  }
700  }
701 
702  y[iy] += m_a * ytmp ;
703  }
704 
705  }
706 
707  // Add a team barrier to keep the thread team in-sync before going on
708  // to the next block
709  device.team_barrier();
710 
711  }
712 
713  }
714 
715 #else
716 
717  //
718  // Team functor interface with threading within PCE multiply
719  //
720  // Note: Remember that the matrix is currently always LayoutRight!
721  //
722  // This is a general, hand-vectorized version that processes multiple FEM
723  // columns at a time to reduce tensor reads. Note that auto-vectorization
724  // doesn't work here because of the inner-loop over FEM columns.
725  //
726  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member ;
727 
728  KOKKOS_INLINE_FUNCTION
729  void operator()( const team_member & device ) const
730  {
731  const size_type iBlockRow = device.league_rank();
732 
733  // Check for valid row
734  const size_type row_count = m_A_graph.row_map.dimension_0()-1;
735  if (iBlockRow >= row_count)
736  return;
737 
738  const size_type num_thread = device.team_size();
739  const size_type thread_idx = device.team_rank();
740  const Kokkos::pair<size_type,size_type> work_range =
741  details::compute_work_range<output_scalar>(
742  device, m_tensor.dimension(), num_thread, thread_idx);
743 
744  const size_type num_col = m_y.dimension_2();
745 
746  // Leading dimension guaranteed contiguous for LayoutLeft
747  if ( m_b == output_scalar(0) )
748  for (size_type col=0; col<num_col; ++col)
749  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
750  m_y(j, iBlockRow, col) = 0 ;
751  else
752  for (size_type col=0; col<num_col; ++col)
753  for ( size_type j = work_range.first ; j < work_range.second ; ++j )
754  m_y(j, iBlockRow, col) = m_b * m_y(j, iBlockRow, col) ;
755 
756  const size_type iBlockEntryBeg = m_A_graph.row_map[ iBlockRow ];
757  const size_type iBlockEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
758  const size_type BlockSize = 14;
759  const size_type numBlock =
760  (iBlockEntryEnd-iBlockEntryBeg+BlockSize-1) / BlockSize;
761 
762  const matrix_scalar* sh_A[BlockSize];
763  const input_scalar* sh_x[BlockSize];
764 
765  size_type iBlockEntry = iBlockEntryBeg;
766  for (size_type block = 0; block<numBlock; ++block, iBlockEntry+=BlockSize) {
767  const size_type block_size =
768  block == numBlock-1 ? iBlockEntryEnd-iBlockEntry : BlockSize;
769 
770  // Loop over columns of x, y
771  for (size_type vec_col=0; vec_col<num_col; ++vec_col) {
772 
773  output_scalar * const y = & m_y( 0 , iBlockRow , vec_col );
774 
775  for ( size_type col = 0; col < block_size; ++col ) {
776  const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
777  sh_x[col] = & m_x( 0 , iBlockColumn , vec_col );
778  sh_A[col] = & m_A_values( iBlockEntry + col , 0 );
779  }
780 
781  for ( size_type iy = work_range.first ; iy < work_range.second ; ++iy ){
782 
783  const size_type nEntry = m_tensor.num_entry(iy);
784  const size_type iEntryBeg = m_tensor.entry_begin(iy);
785  const size_type iEntryEnd = iEntryBeg + nEntry;
786  size_type iEntry = iEntryBeg;
787 
788  output_scalar ytmp = 0 ;
789 
790  // Do entries with a blocked loop of size blocksize
791  if (tensor_type::vectorsize > 1 && nEntry >= tensor_type::vectorsize){
792  const size_type nBlock = nEntry / tensor_type::vectorsize;
793  const size_type nEntryB = nBlock * tensor_type::vectorsize;
794  const size_type iEnd = iEntryBeg + nEntryB;
795 
796  typedef TinyVec<tensor_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> ValTV;
797  typedef TinyVec<matrix_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> MatTV;
798  typedef TinyVec<output_scalar,tensor_type::vectorsize,tensor_type::use_intrinsics> VecTV;
799  VecTV vy;
800  vy.zero();
801  for (; iEntry<iEnd; iEntry+=tensor_type::vectorsize) {
802  const size_type *j = &m_tensor.coord(iEntry,0);
803  const size_type *k = &m_tensor.coord(iEntry,1);
804  ValTV c(&(m_tensor.value(iEntry)));
805 
806  for ( size_type col = 0; col < block_size; ++col ) {
807  MatTV aj(sh_A[col], j), ak(sh_A[col], k);
808  VecTV xj(sh_x[col], j), xk(sh_x[col], k);
809 
810  // vy += c * ( aj * xk + ak * xj)
811  aj.times_equal(xk);
812  aj.multiply_add(ak, xj);
813  vy.multiply_add(c, aj);
814  }
815  }
816  ytmp += vy.sum();
817  }
818 
819  // Do remaining entries with a scalar loop
820  for ( ; iEntry<iEntryEnd; ++iEntry) {
821  const size_type j = m_tensor.coord(iEntry,0);
822  const size_type k = m_tensor.coord(iEntry,1);
823  tensor_scalar cijk = m_tensor.value(iEntry);
824 
825  for ( size_type col = 0; col < block_size; ++col ) {
826  ytmp += cijk * ( sh_A[col][j] * sh_x[col][k] +
827  sh_A[col][k] * sh_x[col][j] );
828  }
829 
830  }
831 
832  y[iy] += m_a * ytmp ;
833  }
834 
835  }
836 
837  // Add a team barrier to keep the thread team in-sync before going on
838  // to the next block
839  device.team_barrier();
840  }
841 
842  }
843 
844 #endif
845 
846  static void apply( const matrix_type & A ,
847  const input_vector_type & x ,
848  const output_vector_type & y ,
849  const input_scalar & a = input_scalar(1) ,
850  const output_scalar & b = output_scalar(0) )
851  {
852  // Generally the block algorithm seems to perform better on the MIC,
853  // as long as the stochastic size isn't too big, but doesn't perform
854  // any better on the CPU (probably because the CPU has a fat L3 cache
855  // to store the sparse 3 tensor).
856 #ifdef __MIC__
857  const bool use_block_algorithm = true;
858 #else
859  const bool use_block_algorithm = false;
860 #endif
861 
862  const size_t row_count = A.graph.row_map.dimension_0() - 1 ;
863  if (use_block_algorithm) {
864 #ifdef __MIC__
865  const size_t team_size = 4; // 4 hyperthreads for MIC
866 #else
867  const size_t team_size = 2; // 2 for everything else
868 #endif
869  const size_t league_size = row_count;
870  Kokkos::TeamPolicy< execution_space > config(league_size, team_size);
871  Kokkos::parallel_for( config , Multiply(A,x,y,a,b) );
872  }
873  else {
874  Kokkos::parallel_for( row_count , Multiply(A,x,y,a,b) );
875  }
876  }
877 };
878 
879 template <typename MatrixType, typename InputViewType, typename OutputViewType>
880 class MeanMultiply {};
881 
882 // Kernel implementing y = A * x where PCE size of A is 1
883 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
884 // x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
885 // x and y are rank 1
886 template <typename MatrixDevice,
887  typename MatrixStorage,
888  typename MatrixOrdinal,
889  typename MatrixMemory,
890  typename MatrixSize,
891  typename InputStorage,
892  typename ... InputP,
893  typename OutputStorage,
894  typename ... OutputP>
895 class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
896  MatrixOrdinal,
897  MatrixDevice,
898  MatrixMemory,
899  MatrixSize>,
900  Kokkos::View< Sacado::UQ::PCE<InputStorage>*,
901  InputP... >,
902  Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
903  OutputP... >
904  >
905 {
906 public:
907  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
908  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
909  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
910 
911  typedef KokkosSparse::CrsMatrix< MatrixValue,
912  MatrixOrdinal,
913  MatrixDevice,
914  MatrixMemory,
915  MatrixSize> matrix_type;
916  typedef typename matrix_type::values_type matrix_values_type;
917  typedef typename MatrixValue::ordinal_type size_type;
918  typedef Kokkos::View< InputVectorValue*,
919  InputP... > input_vector_type;
920  typedef Kokkos::View< OutputVectorValue*,
921  OutputP... > output_vector_type;
922 
923  typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
924  typedef typename MatrixValue::value_type matrix_scalar;
925  typedef typename InputVectorValue::value_type input_scalar;
926  typedef typename OutputVectorValue::value_type output_scalar;
927 
928  template <int BlockSize>
929  struct BlockKernel {
930  typedef MatrixDevice execution_space;
932  typedef typename input_vector_type::array_type input_array_type;
933  typedef typename output_vector_type::array_type output_array_type;
934 
941  const size_type dim ;
942  const size_type numBlocks ;
943  const size_type rem ;
944  const size_type dim_block ;
945 
946  BlockKernel( const matrix_type & A ,
947  const input_vector_type & x ,
948  const output_vector_type & y ,
949  const input_scalar & a ,
950  const output_scalar & b )
951  : m_A_values( A.values )
952  , m_A_graph( A.graph )
953  , v_y( y )
954  , v_x( x )
955  , m_a( a )
956  , m_b( b )
957  , dim( dimension_scalar(x) )
958  , numBlocks( dim / BlockSize )
959  , rem( dim % BlockSize )
960  , dim_block( numBlocks*BlockSize )
961  {}
962 
963  KOKKOS_INLINE_FUNCTION
964  void operator()( const size_type iBlockRow ) const
965  {
966 #if defined(__INTEL_COMPILER)&& ! defined(__CUDA_ARCH__)
967  output_scalar s[BlockSize] __attribute__((aligned(64))) = {};
968 #else
969  output_scalar s[BlockSize] = {};
970 #endif
971 
972  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
973  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
974  size_type pce_block = 0;
975  for (; pce_block < dim_block; pce_block+=BlockSize) {
976  output_scalar * const y = &v_y(pce_block, iBlockRow);
977  if (m_b == output_scalar(0))
978 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
979 #pragma ivdep
980 //#pragma vector aligned
981 #endif
982  for (size_type k = 0; k < BlockSize; ++k)
983  s[k] = 0.0;
984  else
985 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
986 #pragma ivdep
987 //#pragma vector aligned
988 #endif
989  for (size_type k = 0; k < BlockSize; ++k)
990  s[k] = m_b*y[k];
991  for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
992  const matrix_scalar aA = m_a*m_A_values(iEntry);
993  const size_type col = m_A_graph.entries(iEntry);
994  const input_scalar * const x = &v_x(pce_block, col);
995 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
996 #pragma ivdep
997 //#pragma vector aligned
998 #endif
999  for (size_type k = 0; k < BlockSize; ++k)
1000  s[k] += aA*x[k];
1001  }
1002 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1003 #pragma ivdep
1004 //#pragma vector aligned
1005 #endif
1006  for (size_type k = 0; k < BlockSize; ++k) {
1007  y[k] = s[k];
1008  }
1009  }
1010 
1011  // Remaining coeffs
1012  if (rem > 0) {
1013  output_scalar * const y = &v_y(pce_block, iBlockRow);
1014  if (m_b == output_scalar(0))
1015 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1016 #pragma ivdep
1017 //#pragma vector aligned
1018 #endif
1019  for (size_type k = 0; k < rem; ++k)
1020  s[k] = 0.0;
1021  else
1022 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1023 #pragma ivdep
1024 //#pragma vector aligned
1025 #endif
1026  for (size_type k = 0; k < rem; ++k)
1027  s[k] = m_b*y[k];
1028  for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1029  const matrix_scalar aA = m_a*m_A_values(iEntry);
1030  const size_type col = m_A_graph.entries(iEntry);
1031  const input_scalar * const x = &v_x(pce_block, col);
1032 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1033 #pragma ivdep
1034 //#pragma vector aligned
1035 #endif
1036  for (size_type k = 0; k < rem; ++k)
1037  s[k] += aA*x[k];
1038  }
1039 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1040 #pragma ivdep
1041 //#pragma vector aligned
1042 #endif
1043  for (size_type k = 0; k < rem; ++k) {
1044  y[k] = s[k];
1045  }
1046  }
1047 
1048  }
1049 
1050  };
1051 
1052  struct Kernel {
1053  typedef MatrixDevice execution_space;
1055  typedef typename input_vector_type::array_type input_array_type;
1056  typedef typename output_vector_type::array_type output_array_type;
1057 
1064  const size_type dim ;
1065 
1066  Kernel( const matrix_type & A ,
1067  const input_vector_type & x ,
1068  const output_vector_type & y ,
1069  const input_scalar & a ,
1070  const output_scalar & b )
1071  : m_A_values( A.values )
1072  , m_A_graph( A.graph )
1073  , v_y( y )
1074  , v_x( x )
1075  , m_a( a )
1076  , m_b( b )
1077  , dim( dimension_scalar(x) )
1078  {}
1079 
1080  KOKKOS_INLINE_FUNCTION
1081  void operator()( const size_type iBlockRow ) const
1082  {
1083  const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
1084  const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
1085  output_scalar * const y = &v_y(0, iBlockRow);
1086  if (m_b == output_scalar(0))
1087 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1088 #pragma ivdep
1089 //#pragma vector aligned
1090 #endif
1091  for (size_type k = 0; k < dim; ++k)
1092  y[k] = 0.0;
1093  else
1094 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1095 #pragma ivdep
1096 //#pragma vector aligned
1097 #endif
1098  for (size_type k = 0; k < dim; ++k)
1099  y[k] = m_b*y[k];
1100  for (size_type iEntry = iEntryBegin; iEntry < iEntryEnd; ++iEntry) {
1101  const matrix_scalar aA = m_a*m_A_values(iEntry);
1102  const size_type col = m_A_graph.entries(iEntry);
1103  const input_scalar * const x = &v_x(0, col);
1104 #if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
1105 #pragma ivdep
1106 //#pragma vector aligned
1107 #endif
1108  for (size_type k = 0; k < dim; ++k)
1109  y[k] += aA*x[k];
1110  }
1111  }
1112 
1113  };
1114 
1115  static void apply( const matrix_type & A ,
1116  const input_vector_type & x ,
1117  const output_vector_type & y ,
1118  const input_scalar & a = input_scalar(1) ,
1119  const output_scalar & b = output_scalar(0) )
1120  {
1121  const size_t row_count = A.graph.row_map.dimension_0() - 1 ;
1122  const size_type dim = Kokkos::dimension_scalar(x);
1123 
1124  // Choose block size appropriately for PCE dimension
1125 #if defined (__MIC__)
1126  if (dim >= 128)
1127  Kokkos::parallel_for( row_count , Kernel(A,x,y,a,b) );
1128  else if (dim >= 64)
1129  Kokkos::parallel_for( row_count , BlockKernel<64>(A,x,y,a,b) );
1130  else if (dim >= 32)
1131  Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1132  else if (dim >= 16)
1133  Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1134  else if (dim >= 8)
1135  Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1136  else
1137  Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1138 #else
1139  if (dim >= 32)
1140  Kokkos::parallel_for( row_count , BlockKernel<32>(A,x,y,a,b) );
1141  else if (dim >= 16)
1142  Kokkos::parallel_for( row_count , BlockKernel<16>(A,x,y,a,b) );
1143  else if (dim >= 8)
1144  Kokkos::parallel_for( row_count , BlockKernel<8>(A,x,y,a,b) );
1145  else
1146  Kokkos::parallel_for( row_count , BlockKernel<4>(A,x,y,a,b) );
1147 #endif
1148  }
1149 };
1150 
1151 // Kernel implementing y = A * x where A has PCE size = 1
1152 // A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
1153 // x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
1154 // x and y are rank 2
1155 template <typename MatrixDevice,
1156  typename MatrixStorage,
1157  typename MatrixOrdinal,
1158  typename MatrixMemory,
1159  typename MatrixSize,
1160  typename InputStorage,
1161  typename ... InputP,
1162  typename OutputStorage,
1163  typename ... OutputP>
1164 class MeanMultiply< KokkosSparse::CrsMatrix< Sacado::UQ::PCE<MatrixStorage>,
1165  MatrixOrdinal,
1166  MatrixDevice,
1167  MatrixMemory,
1168  MatrixSize>,
1169  Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
1170  InputP... >,
1171  Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
1172  OutputP... >
1173  >
1174 {
1175 public:
1176  typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
1177  typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
1178  typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
1179 
1180  typedef KokkosSparse::CrsMatrix< MatrixValue,
1181  MatrixOrdinal,
1182  MatrixDevice,
1183  MatrixMemory,
1184  MatrixSize> matrix_type;
1185  typedef Kokkos::View< InputVectorValue**,
1186  InputP... > input_vector_type;
1187  typedef Kokkos::View< OutputVectorValue**,
1188  OutputP... > output_vector_type;
1189 
1190  typedef MatrixDevice execution_space;
1191  typedef typename MatrixValue::ordinal_type size_type;
1192  typedef typename InputVectorValue::value_type input_scalar;
1193  typedef typename OutputVectorValue::value_type output_scalar;
1194 
1195  static void apply( const matrix_type & A ,
1196  const input_vector_type & x ,
1197  const output_vector_type & y ,
1198  const input_scalar & a = input_scalar(1) ,
1199  const output_scalar & b = output_scalar(0) )
1200  {
1201  typedef Kokkos::View< InputVectorValue*,
1202  InputP... > input_vector_1d_type;
1203  typedef Kokkos::View< OutputVectorValue*,
1204  OutputP... > output_vector_1d_type;
1205  typedef MeanMultiply<matrix_type,input_vector_1d_type,
1206  output_vector_1d_type> MeanMultiply1D;
1207  const size_type num_col = x.dimension_1();
1208  for (size_type i=0; i<num_col; ++i) {
1209  input_vector_1d_type x_col =
1210  Kokkos::subview(x, Kokkos::ALL(), i);
1211  output_vector_1d_type y_col =
1212  Kokkos::subview(y, Kokkos::ALL(), i);
1213  MeanMultiply1D::apply( A, x_col, y_col, a, b );
1214  }
1215  }
1216 };
1217 
1218 } // namespace Stokhos
1219 
1220 namespace KokkosSparse {
1221 
1222 template <typename AlphaType,
1223  typename BetaType,
1224  typename MatrixType,
1225  typename InputType,
1226  typename ... InputP,
1227  typename OutputType,
1228  typename ... OutputP>
1229 typename std::enable_if<
1230  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
1231  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
1232  >::type
1233 spmv(
1234  const char mode[],
1235  const AlphaType& a,
1236  const MatrixType& A,
1237  const Kokkos::View< InputType, InputP... >& x,
1238  const BetaType& b,
1239  const Kokkos::View< OutputType, OutputP... >& y,
1240  const RANK_ONE)
1241 {
1242  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1243  typedef Kokkos::View< InputType, InputP... > InputVectorType;
1244  typedef Stokhos::Multiply<MatrixType,InputVectorType,
1245  OutputVectorType> multiply_type;
1246  typedef Stokhos::MeanMultiply<MatrixType,InputVectorType,
1247  OutputVectorType> mean_multiply_type;
1248 
1249  if(mode[0]!='N') {
1250  Kokkos::Impl::raise_error(
1251  "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1252  }
1253 
1254  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
1255  Kokkos::Impl::raise_error(
1256  "Stokhos spmv not implemented for non-constant a or b");
1257  }
1258  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
1259  mean_multiply_type::apply( A, x, y,
1260  Sacado::Value<AlphaType>::eval(a),
1261  Sacado::Value<BetaType>::eval(b) );
1262  }
1263  else
1264  multiply_type::apply( A, x, y,
1265  Sacado::Value<AlphaType>::eval(a),
1266  Sacado::Value<BetaType>::eval(b) );
1267 }
1268 
1269 template <typename AlphaType,
1270  typename BetaType,
1271  typename MatrixType,
1272  typename InputType,
1273  typename ... InputP,
1274  typename OutputType,
1275  typename ... OutputP>
1276 typename std::enable_if<
1277  Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&
1278  Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value
1279  >::type
1280 spmv(
1281  const char mode[],
1282  const AlphaType& a,
1283  const MatrixType& A,
1284  const Kokkos::View< InputType, InputP... >& x,
1285  const BetaType& b,
1286  const Kokkos::View< OutputType, OutputP... >& y,
1287  const RANK_TWO)
1288 {
1289  if(mode[0]!='N') {
1290  Kokkos::Impl::raise_error(
1291  "Stokhos spmv not implemented for transposed or conjugated matrix-vector multiplies");
1292  }
1293  if (y.dimension_1() == 1) {
1294  auto y_1D = subview(y, Kokkos::ALL(), 0);
1295  auto x_1D = subview(x, Kokkos::ALL(), 0);
1296  spmv(mode, a, A, x_1D, b, y_1D, RANK_ONE());
1297  }
1298  else {
1299  typedef Kokkos::View< OutputType, OutputP... > OutputVectorType;
1300  typedef Kokkos::View< InputType, InputP... > InputVectorType;
1301  typedef Stokhos::Multiply<MatrixType,InputVectorType,
1302  OutputVectorType> multiply_type;
1303  typedef Stokhos::MeanMultiply<MatrixType,InputVectorType,
1304  OutputVectorType> mean_multiply_type;
1305 
1306  if (!Sacado::is_constant(a) || !Sacado::is_constant(b)) {
1307  Kokkos::Impl::raise_error(
1308  "Stokhos spmv not implemented for non-constant a or b");
1309  }
1310  if (dimension_scalar(A.values) == 1 && dimension_scalar(x) != 1) {
1311  mean_multiply_type::apply( A, x, y,
1312  Sacado::Value<AlphaType>::eval(a),
1313  Sacado::Value<BetaType>::eval(b));
1314  }
1315  else
1316  multiply_type::apply( A, x, y,
1317  Sacado::Value<AlphaType>::eval(a),
1318  Sacado::Value<BetaType>::eval(b));
1319  }
1320 }
1321 
1322 }
1323 
1324 #endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_HPP */
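
The overloads above hook into the generic KokkosSparse::spmv entry point, so application code normally never calls Stokhos::Multiply or Stokhos::MeanMultiply directly. Below is a minimal illustrative sketch (not part of this header) of that call pattern; it assumes the matrix and vectors were built elsewhere with a Sacado::UQ::PCE scalar type and a consistent Cijk tensor attached to their allocations, and the function name apply_pce_spmv is a placeholder invented for this example.

#include "Kokkos_CrsMatrix_UQ_PCE.hpp"

// Sketch only: Scalar is assumed to be a Sacado::UQ::PCE<Storage> type, and
// A, x, y are assumed to share the same Cijk tensor.  Alpha/beta are plain
// doubles, which satisfies the Sacado::is_constant() checks in spmv().
template <typename Scalar, typename Ordinal, typename Device>
void apply_pce_spmv(const KokkosSparse::CrsMatrix<Scalar,Ordinal,Device>& A,
                    const Kokkos::View<Scalar*,Device>& x,
                    const Kokkos::View<Scalar*,Device>& y)
{
  // "N" = no transpose; the specializations above raise an error for any
  // other mode.  If A holds only mean (PCE size 1) coefficients while x does
  // not, the call dispatches to MeanMultiply, otherwise to the full
  // block-tensor Multiply kernel.
  KokkosSparse::spmv("N", 1.0, A, x, 0.0, y);
}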