17#include "../_c_arithmetics/c_arithmetics.h"
18#include "../_definitions/definitions.h"
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
25#if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
69template <
typename DataType>
79 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
84 CBLAS_TRANSPOSE transpose = CblasNoTrans;
88 layout = CblasRowMajor;
94 layout = CblasColMajor;
101 transpose = CblasTrans;
107 DataType alpha = 1.0;
112 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
117 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
118 lda, b, incb, beta, c, incc);
127 const long int chunk = 5;
128 const long int num_columns_chunked = num_columns - (num_columns % chunk);
131 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
132 (void) num_columns_chunked;
140 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
141 #pragma omp parallel for \
143 if (!omp_in_parallel()) \
145 shared(A, b, c, jump, num_rows, num_columns, num_columns_chunked, \
149 for (
long int i=0; i < num_rows; ++i)
152 jump = i * num_columns;
153 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
154 for (j=0; j < num_columns_chunked; j+= chunk)
157 sum += A[jump + j] * b[j] +
158 A[jump + j+1] * b[j+1] +
159 A[jump + j+2] * b[j+2] +
160 A[jump + j+3] * b[j+3] +
161 A[jump + j+4] * b[j+4];
165 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
166 for (j=num_columns_chunked; j < num_columns; ++j)
168 for (j=0; j < num_columns; ++j)
171 sum += A[jump + j] * b[j];
174 c[i] =
static_cast<DataType
>(sum);
177 else if (A_is_symmetric)
183 A, b, num_rows, num_columns, A_is_row_major, A_is_symmetric, c);
188 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
189 #pragma omp parallel for \
191 if (!omp_in_parallel()) \
193 shared(A, b, c, num_rows, num_columns) \
196 for (
long int i=0; i < num_rows; ++i)
199 for (j=0; j < num_columns; ++j)
202 sum += A[i + num_rows*j] * b[j];
204 c[i] =
static_cast<DataType
>(sum);
253template <
typename DataType>
257 const DataType alpha,
264 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
269 CBLAS_TRANSPOSE transpose = CblasNoTrans;
273 layout = CblasRowMajor;
279 layout = CblasColMajor;
286 transpose = CblasTrans;
296 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
301 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
302 lda, b, incb, beta, c, incc);
317 const long int chunk = 5;
318 const long int num_columns_chunked = num_columns - (num_columns % chunk);
321 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
322 (void) num_columns_chunked;
330 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
331 #pragma omp parallel for \
333 if (!omp_in_parallel()) \
335 shared(A, b, c, jump, alpha, num_rows, num_columns, chunk, \
336 num_columns_chunked) \
339 for (
long int i=0; i < num_rows; ++i)
342 jump = i * num_columns;
343 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
344 for (j=0; j < num_columns_chunked; j+= chunk)
346 sum += A[jump + j] * b[j] +
347 A[jump + j+1] * b[j+1] +
348 A[jump + j+2] * b[j+2] +
349 A[jump + j+3] * b[j+3] +
350 A[jump + j+4] * b[j+4];
354 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
355 for (j=num_columns_chunked; j < num_columns; ++j)
357 for (j=0; j < num_columns; ++j)
360 sum += A[jump + j] * b[j];
363 c[i] += alpha *
static_cast<DataType
>(sum);
366 else if (A_is_symmetric)
372 A, b, alpha, num_rows, num_columns, A_is_row_major, A_is_symmetric,
378 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
379 #pragma omp parallel for \
381 if (!omp_in_parallel()) \
383 shared(A, b, c, alpha, num_rows, num_columns) \
386 for (
long int i=0; i < num_rows; ++i)
389 for (j=0; j < num_columns; ++j)
391 sum += A[i + num_rows*j] * b[j];
393 c[i] += alpha*
static_cast<DataType
>(sum);
441template <
typename DataType>
451 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
456 CBLAS_TRANSPOSE transpose = CblasTrans;
460 layout = CblasRowMajor;
466 layout = CblasColMajor;
473 transpose = CblasNoTrans;
479 DataType alpha = 1.0;
484 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
489 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
490 lda, b, incb, beta, c, incc);
499 const long int chunk = 5;
500 const long int num_rows_chunked = num_rows - (num_rows % chunk);
503 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
504 (void) num_rows_chunked;
513 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
514 #pragma omp parallel for \
516 if (!omp_in_parallel()) \
518 shared(A, b, c, jump, num_rows, num_columns, num_rows_chunked, \
522 for (
long int j=0; j < num_columns; ++j)
527 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
528 for (i=0; i < num_rows_chunked; i += chunk)
530 sum += A[i + jump] * b[i] +
531 A[i+1 + jump] * b[i+1] +
532 A[i+2 + jump] * b[i+2] +
533 A[i+3 + jump] * b[i+3] +
534 A[i+4 + jump] * b[i+4];
538 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
539 for (i=num_rows_chunked; i < num_rows; ++i)
541 for (i=0; i < num_rows; ++i)
544 sum += A[i + jump] * b[i];
547 c[j] =
static_cast<DataType
>(sum);
550 else if (A_is_symmetric)
556 A, b, num_rows, num_columns, A_is_row_major, A_is_symmetric, c);
561 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
562 #pragma omp parallel for \
564 if (!omp_in_parallel()) \
566 shared(A, b, c, num_rows, num_columns) \
569 for (
long int j=0; j < num_columns; ++j)
572 for (i=0; i < num_rows; ++i)
574 sum += A[i*num_columns + j] * b[i];
576 c[j] =
static_cast<DataType
>(sum);
626template <
typename DataType>
630 const DataType alpha,
637 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
641 CBLAS_TRANSPOSE transpose = CblasTrans;
646 layout = CblasRowMajor;
652 layout = CblasColMajor;
659 transpose = CblasNoTrans;
669 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
674 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
675 lda, b, incb, beta, c, incc);
690 const long int chunk = 5;
691 const long int num_rows_chunked = num_rows - (num_rows % chunk);
694 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
695 (void) num_rows_chunked;
704 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
705 #pragma omp parallel for \
707 if (!omp_in_parallel()) \
709 shared(A, b, c, jump, alpha, num_rows, num_columns, \
710 num_rows_chunked, chunk) \
713 for (
long int j=0; j < num_columns; ++j)
717 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
718 for (i=0; i < num_rows_chunked; i += chunk)
720 sum += A[i + jump] * b[i] +
721 A[i+1 + jump] * b[i+1] +
722 A[i+2 + jump] * b[i+2] +
723 A[i+3 + jump] * b[i+3] +
724 A[i+4 + jump] * b[i+4];
728 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
729 for (i=num_rows_chunked; i < num_rows; ++i)
731 for (i=0; i < num_rows; ++i)
734 sum += A[i + jump] * b[i];
737 c[j] += alpha *
static_cast<DataType
>(sum);
740 else if (A_is_symmetric)
746 A, b, alpha, num_rows, num_columns, A_is_row_major, A_is_symmetric,
752 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
753 #pragma omp parallel for \
755 if (!omp_in_parallel()) \
757 shared(A, b, c, alpha, num_rows, num_columns) \
760 for (
long int j=0; j < num_columns; ++j)
763 for (i=0; i < num_rows; ++i)
765 sum += A[i*num_columns + j] * b[i];
767 c[j] += alpha *
static_cast<DataType
>(sum);
813template <
typename DataType>
827 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
828 #pragma omp parallel for \
830 if (!omp_in_parallel()) \
832 shared(A_data, A_column_indices, A_index_pointer, b, c, num_rows) \
833 private(index_pointer, column, sum)
835 for (row=0; row < num_rows; ++row)
838 for (index_pointer=A_index_pointer[row];
839 index_pointer < A_index_pointer[row+1];
842 column = A_column_indices[index_pointer];
843 sum += A_data[index_pointer] * b[column];
845 c[row] =
static_cast<DataType
>(sum);
892template <
typename DataType>
894 const DataType* A_data,
898 const DataType alpha,
913 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
914 #pragma omp parallel for \
916 if (!omp_in_parallel()) \
918 shared(A_data, A_column_indices, A_index_pointer, b, c, alpha, \
920 private(index_pointer, column, sum)
922 for (row=0; row < num_rows; ++row)
925 for (index_pointer=A_index_pointer[row];
926 index_pointer < A_index_pointer[row+1];
929 column = A_column_indices[index_pointer];
930 sum += A_data[index_pointer] * b[column];
932 c[row] += alpha *
static_cast<DataType
>(sum);
971template <
typename DataType>
986 A_data, A_column_indices, A_index_pointer, b, num_rows, c);
996 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
997 #pragma omp parallel for \
999 if ((!omp_in_parallel()) && (num_columns >= LARGE_ARRAY_SIZE)) \
1000 default(none) shared(c, num_columns)
1002 for (column=0; column < num_columns; ++column)
1007 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1008 #pragma omp parallel for \
1010 if (!omp_in_parallel()) \
1012 shared(A_data, A_column_indices, A_index_pointer, b, c, num_rows) \
1013 private(index_pointer, column)
1015 for (row=0; row < num_rows; ++row)
1017 for (index_pointer=A_index_pointer[row];
1018 index_pointer < A_index_pointer[row+1];
1021 column = A_column_indices[index_pointer];
1023 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1024 #pragma omp atomic update
1026 c[column] += A_data[index_pointer] * b[row];
1069template <
typename DataType>
1076 const DataType alpha,
1080 DataType zero = 0.0;
1090 A_data, A_column_indices, A_index_pointer, b, alpha, num_rows, c);
1099 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1100 #pragma omp parallel for \
1102 if (!omp_in_parallel()) \
1104 shared(A_data, A_column_indices, A_index_pointer, b, c, alpha, \
1106 private(index_pointer, column)
1108 for (row=0; row < num_rows; ++row)
1110 for (index_pointer=A_index_pointer[row];
1111 index_pointer < A_index_pointer[row+1];
1114 column = A_column_indices[index_pointer];
1116 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1117 #pragma omp atomic update
1119 c[column] += alpha * A_data[index_pointer] * b[row];
1160template <
typename DataType>
1175 A_data, A_row_indices, A_index_pointer, b, num_columns, c);
1185 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1186 #pragma omp parallel for \
1188 if (!omp_in_parallel() && (num_rows >= LARGE_ARRAY_SIZE)) \
1192 for (row=0; row < num_rows; ++row)
1197 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1198 #pragma omp parallel for \
1200 if (!omp_in_parallel()) \
1202 shared(A_data, A_row_indices, A_index_pointer, b, c, num_columns) \
1203 private(index_pointer, row)
1205 for (column=0; column < num_columns; ++column)
1207 for (index_pointer=A_index_pointer[column];
1208 index_pointer < A_index_pointer[column+1];
1211 row = A_row_indices[index_pointer];
1213 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1214 #pragma omp atomic update
1216 c[row] += A_data[index_pointer] * b[column];
1259template <
typename DataType>
1266 const DataType alpha,
1270 DataType zero = 0.0;
1280 A_data, A_row_indices, A_index_pointer, b, alpha, num_columns, c);
1289 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1290 #pragma omp parallel for \
1292 if (!omp_in_parallel()) \
1294 shared(A_data, A_row_indices, A_index_pointer, b, c, alpha, \
1296 private(index_pointer, row)
1298 for (column=0; column < num_columns; ++column)
1300 for (index_pointer=A_index_pointer[column];
1301 index_pointer < A_index_pointer[column+1];
1304 row = A_row_indices[index_pointer];
1306 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1307 #pragma omp atomic update
1309 c[row] += alpha * A_data[index_pointer] * b[column];
1355template <
typename DataType>
1369 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1370 #pragma omp parallel for \
1372 if (!omp_in_parallel()) \
1374 shared(A_data, A_row_indices, A_index_pointer, b, c, num_columns) \
1375 private(index_pointer, row, sum)
1377 for (column=0; column < num_columns; ++column)
1380 for (index_pointer=A_index_pointer[column];
1381 index_pointer < A_index_pointer[column+1];
1384 row = A_row_indices[index_pointer];
1385 sum += A_data[index_pointer] * b[row];
1387 c[column] =
static_cast<DataType
>(sum);
1434template <
typename DataType>
1440 const DataType alpha,
1444 DataType zero = 0.0;
1455 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1456 #pragma omp parallel for \
1458 if (!omp_in_parallel()) \
1460 shared(A_data, A_row_indices, A_index_pointer, b, c, alpha, \
1462 private(index_pointer, row, sum)
1464 for (column=0; column < num_columns; ++column)
1467 for (index_pointer=A_index_pointer[column];
1468 index_pointer < A_index_pointer[column+1];
1471 row = A_row_indices[index_pointer];
1472 sum += A_data[index_pointer] * b[row];
1474 c[column] +=
static_cast<DataType
>(alpha * sum);
1535template <
typename DataType>
1541 const DataType*
RESTRICT diagonals,
1542 const DataType*
RESTRICT supdiagonals,
1549 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1550 #pragma omp parallel for \
1552 if (!omp_in_parallel() && (non_zero_size >= LARGE_ARRAY_SIZE)) \
1554 shared(A, diagonals, supdiagonals, non_zero_size, tridiagonal, \
1557 for (
IndexType j=0; j < non_zero_size; ++j)
1560 A[j*num_columns + j] = diagonals[j];
1563 if (j < non_zero_size-1)
1566 A[j*num_columns + j+1] = supdiagonals[j];
1571 A[(j+1)*num_columns + j] = supdiagonals[j];
1579 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1580 #pragma omp parallel for \
1582 if (!omp_in_parallel() && (non_zero_size >= LARGE_ARRAY_SIZE)) \
1584 shared(A, diagonals, supdiagonals, non_zero_size, tridiagonal, \
1587 for (
IndexType j=0; j < non_zero_size; ++j)
1590 A[j + num_rows*j] = diagonals[j];
1593 if (j < non_zero_size-1)
1596 A[j + (j+1)*num_rows] = supdiagonals[j];
1601 A[j+1 + j*num_rows] = supdiagonals[j];
A static class for matrix-vector operations, which are similar to the level-2 operations of the BLAS ...
static void csc_transposed_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void dense_matvec(const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes the matrix vector multiplication where is a dense matrix.
static void csr_transposed_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
static void dense_matvec_plus(const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes the operation where is a dense matrix.
static void create_band_matrix(DataType *RESTRICT A, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const DataType *RESTRICT diagonals, const DataType *RESTRICT supdiagonals, const IndexType non_zero_size, const FlagType tridiagonal)
Creates bi-diagonal or symmetric tri-diagonal matrix from the diagonal array (diagonals) and off-diag...
static void csc_transposed_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void csc_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void dense_transposed_matvec(const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes matrix vector multiplication where is dense, and is the transpose of the matrix .
static void csr_transposed_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
static void csr_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
static void dense_transposed_matvec_plus(const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes where is dense, and is the transpose of the matrix .
static void csc_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void csr_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.