#ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
#define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
// ...
#ifdef VIENNACL_WITH_OPENMP
// ...
template<typename ScalarType>
// ...
template<typename ScalarType>
// ...
template<typename ScalarType>
// ...
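// The low-level kernels _axpy, _nrm2 and _dotc used throughout this file are
// elided from this excerpt. For reference, a plain scalar sketch of the
// semantics implied by their signatures and call sites is given below; this is
// an illustration only, not the SSE2/SSE3 implementation:
//
//   template<typename T>
//   void _axpy(const T *x, T *y, vcl_size_t n, T a)  // y <- y + a*x
//   {
//     for (vcl_size_t i = 0; i < n; i++)
//       y[i] += a * x[i];
//   }
//
//   template<typename T>
//   T _nrm2(const T *x, vcl_size_t n)                // Euclidean norm of x
//   {
//     T sum = T(0);
//     for (vcl_size_t i = 0; i < n; i++)
//       sum += x[i] * x[i];
//     return std::sqrt(sum);
//   }
//
//   template<typename T>
//   T _dotc(vcl_size_t n, const T *x, const T *y)    // dot product (conjugated for complex T)
//   {
//     T sum = T(0);
//     for (vcl_size_t i = 0; i < n; i++)
//       sum += x[i] * y[i];
//     return sum;
//   }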
s = -s / A[row][row+from];
_axpy(&A[row][row+from], &A[j][row+from], to-from, s);
// ...
ss[i] = -ss[i] / A[row][row+from];
// ...
_axpy(ss, &A[row+i][row], width, A[row][row+i]);
// ...
for (vcl_size_t col = row+from; col < row+to; col++)
// ...
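// A note on the elimination step above: with the Householder vector v stored in
// row `row` starting at column row+from, and s previously set to _dotc(v, a_j)
// (the analogous line is visible further down in the band reduction), the pair
//
//   s = -s / A[row][row+from];
//   _axpy(&A[row][row+from], &A[j][row+from], to-from, s);
//
// performs the rank-1 update a_j <- a_j - (v^T a_j / v_0) * v. Under the common
// scaling convention v_0 = v^T v / 2 this is exactly one column of the
// reflection (I - v v^T / v_0) * A; the normalization lines are elided here, so
// the convention is inferred rather than verified.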
template<typename ScalarType>
// ...
for (; k < n-belowDiagonal; k++)
// ...
for (vcl_size_t bulgeStart = k+1; bulgeStart < n-belowDiagonal; bulgeStart += belowDiagonal)
// ...
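// The nested loops above are the bulge-chasing pattern of a band-to-tridiagonal
// reduction: annihilating an entry below the first subdiagonal at column k
// creates a fill-in "bulge" belowDiagonal rows further down, which is then
// chased off the bottom of the band in strides of belowDiagonal (the inner
// bulgeStart loop). The elimination bodies between these headers are elided.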
template<typename ScalarType>
// ...
for (vcl_size_t k = 0; k < n-block_size; k += block_size)
// ...
norms[bi] = _nrm2(&A[k+bi][k+bi+block_size], n-k-bi-block_size);
// ...
if (std::abs(A[k+bi][k+bi+block_size] - ScalarType(1)) > std::abs(A[k+bi][k+bi+block_size] + ScalarType(1)))
  norms[bi] = -norms[bi];
// ...
A[k+bi][i] /= norms[bi];
// ...
ScalarType s = _dotc(n-k-bi-block_size, &A[k+bi][k+bi+block_size], &A[j][k+bi+block_size]);
s = -s / A[k+bi][k+bi+block_size];
_axpy(&A[k+bi][k+bi+block_size], &A[j][k+bi+block_size], n-k-bi-block_size, s);
// ...
#ifdef VIENNACL_WITH_OPENMP
#pragma omp parallel for
for (int j = k+block_size; j < (int)n; j++)
// ...
ScalarType s = _dotc(n-k-bi-block_size, &A[k+bi][k+bi+block_size], &A[j][k+bi+block_size]);
s = -s / A[k+bi][k+bi+block_size];
_axpy(&A[k+bi][k+bi+block_size], &A[j][k+bi+block_size], n-k-bi-block_size, s);
// ...
#ifdef VIENNACL_WITH_OPENMP
#pragma omp parallel for
for (int section = 0; section < (int)num_threads; section++)
#else
for (vcl_size_t section = 0; section < num_threads; section++)
#endif
// ...
vcl_size_t end = ((n-k)*(section+1))/num_threads + k;
// ...
ss[i] = -ss[i] / A[k+bi][k+bi+block_size];
// ...
_axpy(ss+start, &A[i][start], length, A[k+bi][i]);
// ...
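// Threading note: the trailing columns are split across num_threads contiguous
// sections using integer arithmetic of the form
//
//   start = ((n-k) *  section   ) / num_threads + k;
//   end   = ((n-k) * (section+1)) / num_threads + k;
//
// For example, n-k = 10 remaining columns with num_threads = 4 yields the
// sections [0,2), [2,5), [5,7), [7,10) relative to k: adjacent,
// non-overlapping, and covering every column even when num_threads does not
// divide n-k evenly.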
template<typename ScalarType>
// ...
std::cerr << "ViennaCL: Warning in inplace_tred2(): Matrix is not Hermitian (or real symmetric)" << std::endl;
// ...
if (bandwidth*bandwidth*num_threads < n*4 || 2*block_size+1 > bandwidth)
// ...
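// The condition above appears to gate the two-stage reduction path: if the
// detected band is small relative to the problem size and thread count
// (bandwidth^2 * num_threads < 4n), or too narrow to accommodate the requested
// block size (2*block_size+1 > bandwidth), the blocked banded route is skipped.
// The branch bodies are elided in this excerpt, so this reading is inferred
// from the operands rather than from the surrounding code.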
template<typename ScalarType>
// ...
block_size = std::min(std::min(m-j, n-j), block_size);
// ...
if (std::abs(A[i][j+bi]) > std::abs(A[p][j+bi]))
// ...
_axpy(&(A[j+bi_][j+block_size]), &(A[row][j+block_size]), n-j-block_size, -A[row][j+bi_]);
// ...
for (vcl_size_t col = j+bi; col < j+block_size; col++)
  A[row][col] -= multiplier * A[j+bi][col];
A[row][j+bi] = multiplier;
// ...
_axpy(&(A[j+bi][j+block_size]), &(A[row][j+block_size]), n-j-block_size, -A[row][j+bi]);
// ...
#ifdef VIENNACL_WITH_OPENMP
#pragma omp parallel for
for (int row = j+block_size; row < (int)m; row++)
// ...
_axpy(&(A[j+bi][j+block_size]), &(A[row][j+block_size]), n-j-block_size, -A[row][j+bi]);
// ...
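// Recap of the blocked LU step above: within pivot column j+bi the row p with
// the largest |A[i][j+bi]| is selected and swapped in (partial pivoting), then
// each remaining row is eliminated with
//
//   multiplier    = A[row][j+bi] / A[j+bi][j+bi];  // inferred from the stored result
//   A[row][col]  -= multiplier * A[j+bi][col];     // columns inside the current block
//   A[row][j+bi]  = multiplier;                    // L factor stored in place of the zero
//
// while the trailing columns (col >= j+block_size) receive the same update in
// one _axpy call, i.e. the delayed blocked update that keeps the SSE kernel on
// long contiguous runs.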
template<typename ScalarType>
// ...
std::vector<ScalarType> betas(std::min(m, n));
// ...
block_size = std::min(std::min(m-k, n-k), block_size);
// ...
norms[bi] = _nrm2(&A[k+bi][k+bi], m-k-bi);
// ...
A[k+bi][i] /= norms[bi];
// ...
for (vcl_size_t j = k+bi+1; j < k+block_size; j++)
// ...
s = -s / A[k+bi][k+bi];
_axpy(&A[k+bi][k+bi], &A[j][k+bi], m-k-bi, s);
// ...
betas[k+bi] = -norms[bi];
// ...
#ifdef VIENNACL_WITH_OPENMP
#pragma omp parallel for
for (int j = k+block_size; j < (int)n; j++)
// ...
s = -s / A[k+bi][k+bi];
_axpy(&A[k+bi][k+bi], A[j]+k+bi, m-k-bi, s);
// ...
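// In the QR elimination above, each pivot column is scaled by 1/norms[bi] and
// the value recorded for the R diagonal is betas[k+bi] = -norms[bi]; the sign
// of norms[bi] is fixed earlier (elided here), presumably in the usual
// Householder fashion that avoids cancellation. The betas vector is what the
// inplace QR routines return to the caller.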
template<typename ScalarType>
// ...
std::vector<ScalarType> betas(std::min(m, n));
// ...
block_size = std::min(std::min(m-k, n-k), block_size);
// ...
block_cols[bi][i] = A[k+i][k+bi];
// ...
norms[bi] = _nrm2(&block_cols[bi][bi], m-k-bi);
// ...
block_cols[bi][i] /= norms[bi];
// ...
s = -s / block_cols[bi][bi];
_axpy(&block_cols[bi][bi], &block_cols[j][bi], m-k-bi, s);
// ...
betas[k+bi] = -norms[bi];
// ...
A[k+i][k+bi] = block_cols[bi][i];
// ...
#ifdef VIENNACL_WITH_OPENMP
#pragma omp parallel for
for (int section = 0; section < (int)num_threads; section++)
#else
for (vcl_size_t section = 0; section < num_threads; section++)
#endif
// ...
vcl_size_t start = ((n-k-block_size)*(section+0))/num_threads + k+block_size;
vcl_size_t end   = ((n-k-block_size)*(section+1))/num_threads + k+block_size;
// ...
_axpy(&A[i][start], ss+start, length, A[i][k+bi]);
// ...
ss[i] = -ss[i] / A[k+bi][k+bi];
// ...
_axpy(ss+start, &A[i][start], length, A[i][k+bi]);
// ...
delete [] block_cols[i];
delete [] block_cols;
// ...
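// Design note: in a row-major layout a matrix column is strided in memory, so
// the block above first gathers each pivot column into a contiguous buffer
// (block_cols[bi][i] = A[k+i][k+bi]), runs _nrm2/_axpy on the unit-stride
// copies, and scatters the results back (A[k+i][k+bi] = block_cols[bi][i]).
// This costs one copy in and one copy out per block but keeps the vectorized
// kernels on contiguous data.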
Function index for sse_kernels.hpp (optimized BLAS functions using SSE2 and SSE3 intrinsic functions):
std::vector<ScalarType> inplace_qr_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8, vcl_size_t num_threads=1)
    Inplace QR factorization of an m x n dense row-major matrix, returning the Householder normalization ...
void _axpy(const T *, T *, vcl_size_t, T)
std::vector<ScalarType> inplace_qr_col_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size=8)
    Inplace QR factorization of an m x n dense column-major matrix, returning the Householder normalization ...
void eliminateHermitian(ScalarType **A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType *ss)
void reduceHermitianToBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads)
bool lu_factorize_row_major(ScalarType **A, vcl_size_t m, vcl_size_t n, vcl_size_t *piv=NULL, vcl_size_t block_size=8)
    Inplace LU factorization of an m x n dense row-major matrix with optional partial pivoting ...
T _nrm2(const T *, vcl_size_t)
T _dotc(vcl_size_t, const T *, const T *)
bool isHermitian(ScalarType **const A, vcl_size_t n)
vcl_size_t getHermitianBandwidth(ScalarType **const A, vcl_size_t n)
void tridiagonalizeHermitianBandedMatrix(ScalarType **A, vcl_size_t n, vcl_size_t bandwidth)
void inplace_tred2(ScalarType **A, vcl_size_t n, vcl_size_t block_size=1, vcl_size_t num_threads=1)
    Inplace reduction of a dense n x n row-major or column-major Hermitian (or real symmetric) matrix to ...
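A minimal usage sketch for the row-major QR routine follows. It assumes vcl_size_t is an alias for std::size_t, that inplace_qr_row_major is reachable from the including scope (the exact namespace is not shown in this excerpt), and that, as is usual for in-place Householder QR, R ends up in the upper triangle with the Householder vectors stored below the diagonal:

#include <cstddef>
#include <iostream>
#include <vector>
// #include "viennacl/linalg/host_based/sse_kernels.hpp"  // provides inplace_qr_row_major

int main()
{
  const std::size_t m = 4, n = 3;

  // Row-major storage plus an array of row pointers, matching the ScalarType** interface.
  std::vector<double> storage = { 1, 2, 3,
                                  4, 5, 6,
                                  7, 8, 10,
                                  2, 1, 0 };
  std::vector<double*> A(m);
  for (std::size_t i = 0; i < m; i++)
    A[i] = &storage[i * n];

  // Factorize in place with the default block size and a single thread;
  // the returned vector holds the Householder normalization coefficients.
  std::vector<double> betas = inplace_qr_row_major(A.data(), m, n);

  for (std::size_t i = 0; i < betas.size(); i++)
    std::cout << "betas[" << i << "] = " << betas[i] << '\n';
  return 0;
}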