imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cMatrixOperations< DataType > Class Template Reference

A static class for matrix-vector operations, which are similar to the level-2 operations of the BLAS library. This class acts as a templated namespace, where all member methods are public and static. More...

#include <c_matrix_operations.h>

Static Public Member Functions

static void dense_matvec (const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
 Computes the matrix vector multiplication \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is a dense matrix.
 
static void dense_matvec_plus (const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
 Computes the operation \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is a dense matrix.
 
static void dense_transposed_matvec (const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
 Computes matrix vector multiplication \(\boldsymbol{c} = \mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is dense, and \( \mathbf{A}^{\intercal} \) is the transpose of the matrix \( \mathbf{A} \).
 
static void dense_transposed_matvec_plus (const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is dense, and \( \mathbf{A}^{\intercal} \) is the transpose of the matrix \( \mathbf{A} \).
 
static void csr_matvec (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_rows, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csr_matvec_plus (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csr_transposed_matvec (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, DataType *RESTRICT c)
 Computes \(\boldsymbol{c} =\mathbf{A}^{\intercal} \boldsymbol{b}\) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csr_transposed_matvec_plus (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b}\) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csc_matvec (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csc_matvec_plus (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csc_transposed_matvec (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_columns, DataType *RESTRICT c)
 Computes \(\boldsymbol{c} =\mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void csc_transposed_matvec_plus (const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
 Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.
 
static void create_band_matrix (DataType *RESTRICT A, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const DataType *RESTRICT diagonals, const DataType *RESTRICT supdiagonals, const IndexType non_zero_size, const FlagType tridiagonal)
 Creates bi-diagonal or symmetric tri-diagonal matrix from the diagonal array (diagonals) and off-diagonal array (supdiagonals).
 

Detailed Description

template<typename DataType>
class cMatrixOperations< DataType >

A static class for matrix-vector operations, which are similar to the level-2 operations of the BLAS library. This class acts as a templated namespace, where all member methods are public and static.

This class implements matrix-ector multiplication for three types of matrices:

  • Dense matrix (both row major and column major)
  • Compressed sparse row matrix (CSR)
  • Compressed sparse column matrix (CSC)

For each of the above matrix types, there are four kinds of matrix vector multiplications implemented.

  1. dot : performs \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \).
  2. dot_plus : performs \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \).
  3. transpose_dot : performs \( \boldsymbol{c} = \mathbf{A}^{\intercal} \boldsymbol{b} \).
  4. transpose_dot_plus : performs \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b} \).
See also
cVectorOperations

Definition at line 67 of file c_matrix_operations.h.

Member Function Documentation

◆ create_band_matrix()

template<typename DataType >
void cMatrixOperations< DataType >::create_band_matrix ( DataType *RESTRICT  A,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
const FlagType  A_is_row_major,
const DataType *RESTRICT  diagonals,
const DataType *RESTRICT  supdiagonals,
const IndexType  non_zero_size,
const FlagType  tridiagonal 
)
static

Creates bi-diagonal or symmetric tri-diagonal matrix from the diagonal array (diagonals) and off-diagonal array (supdiagonals).

The output is written in place (in A). The output is only written up to the non_zero_size element, that is: A[:non_zero_size,:non_zero_size] is filled, and the rest is assumed to be zero.

Depending on tridiagonal, the matrix is upper bi-diagonal or symmetric tri-diagonal.

Note
The matrix A is assumed to be initialized to zero before calling this function.
Parameters
[out]A1D array that represents a 2D dense array with either C (row) major ordering or Fortran (column) major ordering. The major ordering should de defined by A_is_row_major flag.
[in]num_rowsNumber of rows of A
[in]num_columnsNumber of columns of A
[in]A_is_row_majorBoolean, can be 0 or 1 as follows:
  • If A is row major (C ordering where the last index is contiguous) this value should be 1.
  • If A is column major (Fortran ordering where the first index is contiguous), this value should be set to 0.
[in]diagonalsAn array of length n. All elements diagonals create the diagonals of A.
[in]supdiagonalsAn array of length n. Elements supdiagonals[0:-1] create the upper off-diagonal of A, making A an upper bi-diagonal matrix. In addition, if tridiagonal is set to 1, the lower off-diagonal is also created similar to the upper off-diagonal, making A a symmetric tri-diagonal matrix.
[in]non_zero_sizeUp to the A[:non_zero_size,:non_zero_size] of A will be written. At most, non_zero_size can be n, which is the size of diagonals array and the size of the square matrix. If non_zero_size is less than n, it is due to the fact that either diagonals or supdiagonals has zero elements after the size element (possibly due to early termination of Lanczos iterations method).
[in]tridiagonalBoolean. If set to 0, the matrix T becomes upper bi-diagonal. If set to 1, the matrix becomes symmetric tri-diagonal.

Definition at line 1536 of file c_matrix_operations.cpp.

1545{
1546 if (A_is_row_major)
1547 {
1548 // A is row-major
1549 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1550 #pragma omp parallel for \
1551 schedule(static) \
1552 if (!omp_in_parallel() && (non_zero_size >= LARGE_ARRAY_SIZE)) \
1553 default(none) \
1554 shared(A, diagonals, supdiagonals, non_zero_size, tridiagonal, \
1555 num_columns)
1556 #endif
1557 for (IndexType j=0; j < non_zero_size; ++j)
1558 {
1559 // Diagonals
1560 A[j*num_columns + j] = diagonals[j];
1561
1562 // Off diagonals
1563 if (j < non_zero_size-1)
1564 {
1565 // Sup-diagonal
1566 A[j*num_columns + j+1] = supdiagonals[j];
1567
1568 // Sub-diagonal, making symmetric tri-diagonal matrix
1569 if (tridiagonal)
1570 {
1571 A[(j+1)*num_columns + j] = supdiagonals[j];
1572 }
1573 }
1574 }
1575 }
1576 else
1577 {
1578 // A is column-major
1579 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1580 #pragma omp parallel for \
1581 schedule(static) \
1582 if (!omp_in_parallel() && (non_zero_size >= LARGE_ARRAY_SIZE)) \
1583 default(none) \
1584 shared(A, diagonals, supdiagonals, non_zero_size, tridiagonal, \
1585 num_rows)
1586 #endif
1587 for (IndexType j=0; j < non_zero_size; ++j)
1588 {
1589 // Diagonals
1590 A[j + num_rows*j] = diagonals[j];
1591
1592 // Off diagonals
1593 if (j < non_zero_size-1)
1594 {
1595 // Sup-diagonal
1596 A[j + (j+1)*num_rows] = supdiagonals[j];
1597
1598 // Sub-diagonal, making symmetric tri-diagonal matrix
1599 if (tridiagonal)
1600 {
1601 A[j+1 + j*num_rows] = supdiagonals[j];
1602 }
1603 }
1604 }
1605 }
1606}
int IndexType
Definition types.h:65

◆ csc_matvec()

template<typename DataType >
void cMatrixOperations< DataType >::csc_matvec ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_row_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const FlagType  A_is_symmetric,
const DataType *RESTRICT  b,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

Parameters
[in]A_dataCSC format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_row_indicesCSC format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSC format index pointer. The length of this array is one plus the number of columns of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[in]bColumn vector with same size of the number of columns of A.
[in]num_rowsNumber of rows of the matrix A.
[in]num_columnsNumber of columns of the matrix A. This is essentially the size of A_index_pointer array minus one.
[out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 1161 of file c_matrix_operations.cpp.

1170{
1171 if (A_is_symmetric)
1172 {
1173 // For symmetric A, use transposed operation instead for efficiency
1175 A_data, A_row_indices, A_index_pointer, b, num_columns, c);
1176 }
1177 else
1178 {
1179 // A is non-symmetric, use non-transposed product operation
1180 LongIndexType index_pointer;
1181 LongIndexType row;
1182 LongIndexType column;
1183
1184 // Initialize output to zero
1185 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1186 #pragma omp parallel for \
1187 schedule(static) \
1188 if (!omp_in_parallel() && (num_rows >= LARGE_ARRAY_SIZE)) \
1189 default(none) \
1190 shared(c, num_rows)
1191 #endif
1192 for (row=0; row < num_rows; ++row)
1193 {
1194 c[row] = 0.0;
1195 }
1196
1197 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1198 #pragma omp parallel for \
1199 schedule(static) \
1200 if (!omp_in_parallel()) \
1201 default(none) \
1202 shared(A_data, A_row_indices, A_index_pointer, b, c, num_columns) \
1203 private(index_pointer, row)
1204 #endif
1205 for (column=0; column < num_columns; ++column)
1206 {
1207 for (index_pointer=A_index_pointer[column];
1208 index_pointer < A_index_pointer[column+1];
1209 ++index_pointer)
1210 {
1211 row = A_row_indices[index_pointer];
1212
1213 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1214 #pragma omp atomic update
1215 #endif
1216 c[row] += A_data[index_pointer] * b[column];
1217 }
1218 }
1219 }
1220}
static void csc_transposed_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
int LongIndexType
Definition types.h:60

References cMatrixOperations< DataType >::csc_transposed_matvec().

Referenced by cCSCMatrix< DataType >::dot().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ csc_matvec_plus()

template<typename DataType >
void cMatrixOperations< DataType >::csc_matvec_plus ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_row_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const FlagType  A_is_symmetric,
const DataType *RESTRICT  b,
const DataType  alpha,
const LongIndexType  num_columns,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

Parameters
[in]A_dataCSC format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_row_indicesCSC format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSC format index pointer. The length of this array is one plus the number of columns of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[in]bColumn vector with same size of the number of columns of A.
[in]alphaA scalar that scales the matrix vector multiplication.
[in]num_columnsNumber of columns of the matrix A. This is essentially the size of A_index_pointer array minus one.
[in,out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 1260 of file c_matrix_operations.cpp.

1269{
1270 DataType zero = 0.0;
1271 if (c_arithmetics::is_equal(alpha, zero))
1272 {
1273 return;
1274 }
1275
1276 if (A_is_symmetric)
1277 {
1278 // For symmetric A, use transposed operation instead for efficiency
1280 A_data, A_row_indices, A_index_pointer, b, alpha, num_columns, c);
1281
1282 }
1283 else
1284 {
1285 LongIndexType index_pointer;
1286 LongIndexType row;
1287 LongIndexType column;
1288
1289 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1290 #pragma omp parallel for \
1291 schedule(static) \
1292 if (!omp_in_parallel()) \
1293 default(none) \
1294 shared(A_data, A_row_indices, A_index_pointer, b, c, alpha, \
1295 num_columns) \
1296 private(index_pointer, row)
1297 #endif
1298 for (column=0; column < num_columns; ++column)
1299 {
1300 for (index_pointer=A_index_pointer[column];
1301 index_pointer < A_index_pointer[column+1];
1302 ++index_pointer)
1303 {
1304 row = A_row_indices[index_pointer];
1305
1306 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1307 #pragma omp atomic update
1308 #endif
1309 c[row] += alpha * A_data[index_pointer] * b[column];
1310 }
1311 }
1312 }
1313}
static void csc_transposed_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
Definition _c_is_equal.h:49

References cMatrixOperations< DataType >::csc_transposed_matvec_plus(), and c_arithmetics::is_equal().

Referenced by cCSCMatrix< DataType >::dot_plus().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ csc_transposed_matvec()

template<typename DataType >
void cMatrixOperations< DataType >::csc_transposed_matvec ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_row_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const DataType *RESTRICT  b,
const LongIndexType  num_columns,
DataType *RESTRICT  c 
)
static

Computes \(\boldsymbol{c} =\mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float, the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A_dataCSC format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_row_indicesCSC format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSC format index pointer. The length of this array is one plus the number of columns of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]bColumn vector with same size of the number of columns of A.
num_columnsNumber of columns of the matrix A. This is essentially the size of A_index_pointer array minus one.
[out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 1356 of file c_matrix_operations.cpp.

1363{
1364 LongIndexType index_pointer;
1365 LongIndexType row;
1366 LongIndexType column;
1367 long double sum;
1368
1369 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1370 #pragma omp parallel for \
1371 schedule(static) \
1372 if (!omp_in_parallel()) \
1373 default(none) \
1374 shared(A_data, A_row_indices, A_index_pointer, b, c, num_columns) \
1375 private(index_pointer, row, sum)
1376 #endif
1377 for (column=0; column < num_columns; ++column)
1378 {
1379 sum = 0.0;
1380 for (index_pointer=A_index_pointer[column];
1381 index_pointer < A_index_pointer[column+1];
1382 ++index_pointer)
1383 {
1384 row = A_row_indices[index_pointer];
1385 sum += A_data[index_pointer] * b[row];
1386 }
1387 c[column] = static_cast<DataType>(sum);
1388 }
1389}

Referenced by cMatrixOperations< DataType >::csc_matvec(), and cCSCMatrix< DataType >::transpose_dot().

Here is the caller graph for this function:

◆ csc_transposed_matvec_plus()

template<typename DataType >
void cMatrixOperations< DataType >::csc_transposed_matvec_plus ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_row_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const DataType *RESTRICT  b,
const DataType  alpha,
const LongIndexType  num_columns,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse column (CSC) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A_dataCSC format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_row_indicesCSC format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSC format index pointer. The length of this array is one plus the number of columns of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]bColumn vector with same size of the number of columns of A.
[in]alphaA scalar that scales the matrix vector multiplication.
num_columnsNumber of columns of the matrix A. This is essentially the size of A_index_pointer array minus one.
[in,out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 1435 of file c_matrix_operations.cpp.

1443{
1444 DataType zero = 0.0;
1445 if (c_arithmetics::is_equal(alpha, zero))
1446 {
1447 return;
1448 }
1449
1450 LongIndexType index_pointer;
1451 LongIndexType row;
1452 LongIndexType column;
1453 long double sum;
1454
1455 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1456 #pragma omp parallel for \
1457 schedule(static) \
1458 if (!omp_in_parallel()) \
1459 default(none) \
1460 shared(A_data, A_row_indices, A_index_pointer, b, c, alpha, \
1461 num_columns) \
1462 private(index_pointer, row, sum)
1463 #endif
1464 for (column=0; column < num_columns; ++column)
1465 {
1466 sum = 0.0;
1467 for (index_pointer=A_index_pointer[column];
1468 index_pointer < A_index_pointer[column+1];
1469 ++index_pointer)
1470 {
1471 row = A_row_indices[index_pointer];
1472 sum += A_data[index_pointer] * b[row];
1473 }
1474 c[column] += static_cast<DataType>(alpha * sum);
1475 }
1476}

References c_arithmetics::is_equal().

Referenced by cMatrixOperations< DataType >::csc_matvec_plus(), and cCSCMatrix< DataType >::transpose_dot_plus().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ csr_matvec()

template<typename DataType >
void cMatrixOperations< DataType >::csr_matvec ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_column_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const DataType *RESTRICT  b,
const LongIndexType  num_rows,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float, the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A_dataCSR format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_column_indicesCSR format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSR format index pointer. The length of this array is one plus the number of rows of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]bColumn vector with same size of the number of columns of A.
[in]num_rowsNumber of rows of the matrix A. This is essentially the size of A_index_pointer array minus one.
[out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 814 of file c_matrix_operations.cpp.

821{
822 LongIndexType index_pointer;
823 LongIndexType row;
824 LongIndexType column;
825 long double sum;
826
827 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
828 #pragma omp parallel for \
829 schedule(static) \
830 if (!omp_in_parallel()) \
831 default(none) \
832 shared(A_data, A_column_indices, A_index_pointer, b, c, num_rows) \
833 private(index_pointer, column, sum)
834 #endif
835 for (row=0; row < num_rows; ++row)
836 {
837 sum = 0.0;
838 for (index_pointer=A_index_pointer[row];
839 index_pointer < A_index_pointer[row+1];
840 ++index_pointer)
841 {
842 column = A_column_indices[index_pointer];
843 sum += A_data[index_pointer] * b[column];
844 }
845 c[row] = static_cast<DataType>(sum);
846 }
847}

Referenced by cMatrixOperations< DataType >::csr_transposed_matvec(), and cCSRMatrix< DataType >::dot().

Here is the caller graph for this function:

◆ csr_matvec_plus()

template<typename DataType >
void cMatrixOperations< DataType >::csr_matvec_plus ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_column_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const DataType *RESTRICT  b,
const DataType  alpha,
const LongIndexType  num_rows,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A_dataCSR format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_column_indicesCSR format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSR format index pointer. The length of this array is one plus the number of rows of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]bColumn vector with same size of the number of columns of A.
[in]alphaA scalar that scales the matrix vector multiplication.
[in]num_rowsNumber of rows of the matrix A. This is essentially the size of A_index_pointer array minus one.
[in,out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 893 of file c_matrix_operations.cpp.

901{
902 DataType zero = 0.0;
903 if (c_arithmetics::is_equal(alpha, zero))
904 {
905 return;
906 }
907
908 LongIndexType index_pointer;
909 LongIndexType row;
910 LongIndexType column;
911 long double sum;
912
913 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
914 #pragma omp parallel for \
915 schedule(static) \
916 if (!omp_in_parallel()) \
917 default(none) \
918 shared(A_data, A_column_indices, A_index_pointer, b, c, alpha, \
919 num_rows) \
920 private(index_pointer, column, sum)
921 #endif
922 for (row=0; row < num_rows; ++row)
923 {
924 sum = 0.0;
925 for (index_pointer=A_index_pointer[row];
926 index_pointer < A_index_pointer[row+1];
927 ++index_pointer)
928 {
929 column = A_column_indices[index_pointer];
930 sum += A_data[index_pointer] * b[column];
931 }
932 c[row] += alpha * static_cast<DataType>(sum);
933 }
934}

References c_arithmetics::is_equal().

Referenced by cMatrixOperations< DataType >::csr_transposed_matvec_plus(), and cCSRMatrix< DataType >::dot_plus().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ csr_transposed_matvec()

template<typename DataType >
void cMatrixOperations< DataType >::csr_transposed_matvec ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_column_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const FlagType  A_is_symmetric,
const DataType *RESTRICT  b,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
DataType *RESTRICT  c 
)
static

Computes \(\boldsymbol{c} =\mathbf{A}^{\intercal} \boldsymbol{b}\) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

Parameters
[in]A_dataCSR format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_column_indicesCSR format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSR format index pointer. The length of this array is one plus the number of rows of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[in]bColumn vector with same size of the number of columns of A.
[in]num_rowsNumber of rows of the matrix A. This is essentially the size of A_index_pointer array minus one.
[in]num_columnsNumber of columns of the matrix A.
[out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 972 of file c_matrix_operations.cpp.

981{
982 if (A_is_symmetric)
983 {
984 // For symmetric A, use non-transposed operation instead for efficiency
986 A_data, A_column_indices, A_index_pointer, b, num_rows, c);
987 }
988 else
989 {
990 // A is non-symmetric, use transposed product operation
991 LongIndexType index_pointer;
992 LongIndexType row;
993 LongIndexType column;
994
995 // Initialize output to zero
996 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
997 #pragma omp parallel for \
998 schedule(static) \
999 if ((!omp_in_parallel()) && (num_columns >= LARGE_ARRAY_SIZE)) \
1000 default(none) shared(c, num_columns)
1001 #endif
1002 for (column=0; column < num_columns; ++column)
1003 {
1004 c[column] = 0.0;
1005 }
1006
1007 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1008 #pragma omp parallel for \
1009 schedule(static) \
1010 if (!omp_in_parallel()) \
1011 default(none) \
1012 shared(A_data, A_column_indices, A_index_pointer, b, c, num_rows) \
1013 private(index_pointer, column)
1014 #endif
1015 for (row=0; row < num_rows; ++row)
1016 {
1017 for (index_pointer=A_index_pointer[row];
1018 index_pointer < A_index_pointer[row+1];
1019 ++index_pointer)
1020 {
1021 column = A_column_indices[index_pointer];
1022
1023 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1024 #pragma omp atomic update
1025 #endif
1026 c[column] += A_data[index_pointer] * b[row];
1027 }
1028 }
1029 }
1030}
static void csr_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...

References cMatrixOperations< DataType >::csr_matvec().

Referenced by cCSRMatrix< DataType >::transpose_dot().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ csr_transposed_matvec_plus()

template<typename DataType >
void cMatrixOperations< DataType >::csr_transposed_matvec_plus ( const DataType *RESTRICT  A_data,
const LongIndexType *RESTRICT  A_column_indices,
const LongIndexType *RESTRICT  A_index_pointer,
const FlagType  A_is_symmetric,
const DataType *RESTRICT  b,
const DataType  alpha,
const LongIndexType  num_rows,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b}\) where \( \mathbf{A} \) is compressed sparse row (CSR) matrix and \( \boldsymbol{b} \) is a dense vector. The output \( \boldsymbol{c} \) is a dense vector.

Parameters
[in]A_dataCSR format data array of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_column_indicesCSR format column indices of the sparse matrix. The length of this array is the nnz of the matrix.
[in]A_index_pointerCSR format index pointer. The length of this array is one plus the number of rows of the matrix. Also, the first element of this array is 0, and the last element is the nnz of the matrix.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[in]bColumn vector with same size of the number of columns of A.
[in]alphaA scalar that scales the matrix vector multiplication.
[in]num_rowsNumber of rows of the matrix A. This is essentially the size of A_index_pointer array minus one.
[in,out]cOutput column vector with the same size as b. This array is written in-place.

Definition at line 1070 of file c_matrix_operations.cpp.

1079{
1080 DataType zero = 0.0;
1081 if (c_arithmetics::is_equal(alpha, zero))
1082 {
1083 return;
1084 }
1085
1086 if (A_is_symmetric)
1087 {
1088 // For symmetric A, use non-transposed operation instead for efficiency
1090 A_data, A_column_indices, A_index_pointer, b, alpha, num_rows, c);
1091 }
1092 else
1093 {
1094 // A is non-symmetric, use transposed product operation
1095 LongIndexType index_pointer;
1096 LongIndexType row;
1097 LongIndexType column;
1098
1099 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1100 #pragma omp parallel for \
1101 schedule(static) \
1102 if (!omp_in_parallel()) \
1103 default(none) \
1104 shared(A_data, A_column_indices, A_index_pointer, b, c, alpha, \
1105 num_rows) \
1106 private(index_pointer, column)
1107 #endif
1108 for (row=0; row < num_rows; ++row)
1109 {
1110 for (index_pointer=A_index_pointer[row];
1111 index_pointer < A_index_pointer[row+1];
1112 ++index_pointer)
1113 {
1114 column = A_column_indices[index_pointer];
1115
1116 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1117 #pragma omp atomic update
1118 #endif
1119 c[column] += alpha * A_data[index_pointer] * b[row];
1120 }
1121 }
1122 }
1123}
static void csr_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...

References cMatrixOperations< DataType >::csr_matvec_plus(), and c_arithmetics::is_equal().

Referenced by cCSRMatrix< DataType >::transpose_dot_plus().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ dense_matvec()

template<typename DataType >
void cMatrixOperations< DataType >::dense_matvec ( const DataType *RESTRICT  A,
const DataType *RESTRICT  b,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
const FlagType  A_is_row_major,
const FlagType  A_is_symmetric,
DataType *RESTRICT  c 
)
static

Computes the matrix vector multiplication \( \boldsymbol{c} = \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is a dense matrix.

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float, the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A1D array that represents a 2D dense array with either C (row) major ordering or Fortran (column) major ordering. The major ordering should de defined by A_is_row_major flag.
[in]bColumn vector
[in]num_rowsNumber of rows of A
[in]num_columnsNumber of columns of A
[in]A_is_row_majorBoolean, can be 0 or 1 as follows:
  • If A is row major (C ordering where the last index is contiguous) this value should be 1.
  • If A is column major (Fortran ordering where the first index is contiguous), this value should be set to 0.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[out]cThe output column vector (written in-place).

Definition at line 70 of file c_matrix_operations.cpp.

78{
79 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
80
81 // Using BLAS
82 CBLAS_LAYOUT layout;
83 CBLAS_UPLO uplo;
84 CBLAS_TRANSPOSE transpose = CblasNoTrans;
85 int lda;
86 if (A_is_row_major)
87 {
88 layout = CblasRowMajor;
89 uplo = CblasUpper;
90 lda = num_columns;
91 }
92 else
93 {
94 layout = CblasColMajor;
95 uplo = CblasLower;
96 lda = num_rows;
97
98 // For efficiency, use transpose op for symmetric column-major matrices
99 if (A_is_symmetric)
100 {
101 transpose = CblasTrans;
102 }
103 }
104
105 int incb = 1;
106 int incc = 1;
107 DataType alpha = 1.0;
108 DataType beta = 0.0;
109
110 if (A_is_symmetric)
111 {
112 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
113 beta, c, incc);
114 }
115 else
116 {
117 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
118 lda, b, incb, beta, c, incc);
119 }
120
121 #else
122
123 // Not using BLAS
124 long int j; // This should be long int to avoid multiplication overflow
125 long int jump;
126 long double sum;
127 const long int chunk = 5;
128 const long int num_columns_chunked = num_columns - (num_columns % chunk);
129
130 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
131 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
132 (void) num_columns_chunked;
133 (void) chunk;
134 #endif
135
136 // Determine major order of A
137 if (A_is_row_major)
138 {
139 // For row-major (C ordering) matrix A (symmetric or non-symmetric)
140 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
141 #pragma omp parallel for \
142 schedule(static) \
143 if (!omp_in_parallel()) \
144 default(none) \
145 shared(A, b, c, jump, num_rows, num_columns, num_columns_chunked, \
146 chunk) \
147 private(j, sum)
148 #endif
149 for (long int i=0; i < num_rows; ++i)
150 {
151 sum = 0.0;
152 jump = i * num_columns;
153 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
154 for (j=0; j < num_columns_chunked; j+= chunk)
155 {
156 // Loop unrolling
157 sum += A[jump + j] * b[j] +
158 A[jump + j+1] * b[j+1] +
159 A[jump + j+2] * b[j+2] +
160 A[jump + j+3] * b[j+3] +
161 A[jump + j+4] * b[j+4];
162 }
163 #endif
164
165 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
166 for (j=num_columns_chunked; j < num_columns; ++j)
167 #else
168 for (j=0; j < num_columns; ++j)
169 #endif
170 {
171 sum += A[jump + j] * b[j];
172 }
173
174 c[i] = static_cast<DataType>(sum);
175 }
176 }
177 else if (A_is_symmetric)
178 {
179 // A is column-major, but symmetric, so we can use its transposed
180 // operation instead, which is more efficient for column-major
181 // matrices.
183 A, b, num_rows, num_columns, A_is_row_major, A_is_symmetric, c);
184 }
185 else
186 {
187 // For column-major (Fortran ordering) non-symmetric matrix A
188 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
189 #pragma omp parallel for \
190 schedule(static) \
191 if (!omp_in_parallel()) \
192 default(none) \
193 shared(A, b, c, num_rows, num_columns) \
194 private(j, sum)
195 #endif
196 for (long int i=0; i < num_rows; ++i)
197 {
198 sum = 0.0;
199 for (j=0; j < num_columns; ++j)
200 {
201 // Make sure j is long int to avoid overflow in num_rows*j
202 sum += A[i + num_rows*j] * b[j];
203 }
204 c[i] = static_cast<DataType>(sum);
205 }
206 }
207
208 #endif
209}
static void dense_transposed_matvec(const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes matrix vector multiplication where is dense, and is the transpose of the matrix .

References cMatrixOperations< DataType >::dense_transposed_matvec().

Referenced by cMatrixOperations< DataType >::dense_transposed_matvec(), and cDenseMatrix< DataType >::dot().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ dense_matvec_plus()

template<typename DataType >
void cMatrixOperations< DataType >::dense_matvec_plus ( const DataType *RESTRICT  A,
const DataType *RESTRICT  b,
const DataType  alpha,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
const FlagType  A_is_row_major,
const FlagType  A_is_symmetric,
DataType *RESTRICT  c 
)
static

Computes the operation \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} \) where \( \mathbf{A} \) is a dense matrix.

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A1D array that represents a 2D dense array with either C (row) major ordering or Fortran (column) major ordering. The major ordering should de defined by A_is_row_major flag.
[in]bColumn vector
[in]alphaA scalar that scales the matrix vector multiplication.
[in]num_rowsNumber of rows of A
[in]num_columnsNumber of columns of A
[in]A_is_row_majorBoolean, can be 0 or 1 as follows:
  • If A is row major (C ordering where the last index is contiguous) this value should be 1.
  • If A is column major (Fortran ordering where the first index is contiguous), this value should be set to 0.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[in,out]cThe output column vector (written in-place).

Definition at line 254 of file c_matrix_operations.cpp.

263{
264 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
265
266 // Using BLAS
267 CBLAS_LAYOUT layout;
268 CBLAS_UPLO uplo;
269 CBLAS_TRANSPOSE transpose = CblasNoTrans;
270 int lda;
271 if (A_is_row_major)
272 {
273 layout = CblasRowMajor;
274 uplo = CblasUpper;
275 lda = num_columns;
276 }
277 else
278 {
279 layout = CblasColMajor;
280 uplo = CblasLower;
281 lda = num_rows;
282
283 // For efficiency, use transpose op for symmetric column-major matrices
284 if (A_is_symmetric)
285 {
286 transpose = CblasTrans;
287 }
288 }
289
290 int incb = 1;
291 int incc = 1;
292 DataType beta = 1.0;
293
294 if (A_is_symmetric)
295 {
296 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
297 beta, c, incc);
298 }
299 else
300 {
301 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
302 lda, b, incb, beta, c, incc);
303 }
304
305 #else
306
307 // Not using BLAS
308 DataType zero = 0.0;
309 if (c_arithmetics::is_equal(alpha, zero))
310 {
311 return;
312 }
313
314 long int j;
315 long int jump;
316 long double sum;
317 const long int chunk = 5;
318 const long int num_columns_chunked = num_columns - (num_columns % chunk);
319
320 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
321 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
322 (void) num_columns_chunked;
323 (void) chunk;
324 #endif
325
326 // Determine major order of A
327 if (A_is_row_major)
328 {
329 // For row-major (C ordering) matrix A (symmetric or non-symmetric)
330 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
331 #pragma omp parallel for \
332 schedule(static) \
333 if (!omp_in_parallel()) \
334 default(none) \
335 shared(A, b, c, jump, alpha, num_rows, num_columns, chunk, \
336 num_columns_chunked) \
337 private(j, sum)
338 #endif
339 for (long int i=0; i < num_rows; ++i)
340 {
341 sum = 0.0;
342 jump = i * num_columns;
343 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
344 for (j=0; j < num_columns_chunked; j+= chunk)
345 {
346 sum += A[jump + j] * b[j] +
347 A[jump + j+1] * b[j+1] +
348 A[jump + j+2] * b[j+2] +
349 A[jump + j+3] * b[j+3] +
350 A[jump + j+4] * b[j+4];
351 }
352 #endif
353
354 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
355 for (j=num_columns_chunked; j < num_columns; ++j)
356 #else
357 for (j=0; j < num_columns; ++j)
358 #endif
359 {
360 sum += A[jump + j] * b[j];
361 }
362
363 c[i] += alpha * static_cast<DataType>(sum);
364 }
365 }
366 else if (A_is_symmetric)
367 {
368 // A is column-major, but symmetric, so we can use its transposed
369 // operation instead, which is more efficient for column-major
370 // matrices.
372 A, b, alpha, num_rows, num_columns, A_is_row_major, A_is_symmetric,
373 c);
374 }
375 else
376 {
377 // For column-major (Fortran ordering) non-symmetric matrix A
378 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
379 #pragma omp parallel for \
380 schedule(static) \
381 if (!omp_in_parallel()) \
382 default(none) \
383 shared(A, b, c, alpha, num_rows, num_columns) \
384 private(j, sum)
385 #endif
386 for (long int i=0; i < num_rows; ++i)
387 {
388 sum = 0.0;
389 for (j=0; j < num_columns; ++j)
390 {
391 sum += A[i + num_rows*j] * b[j];
392 }
393 c[i] += alpha* static_cast<DataType>(sum);
394 }
395 }
396
397 #endif
398}
static void dense_transposed_matvec_plus(const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes where is dense, and is the transpose of the matrix .

References cMatrixOperations< DataType >::dense_transposed_matvec_plus(), and c_arithmetics::is_equal().

Referenced by cMatrixOperations< DataType >::dense_transposed_matvec_plus(), and cDenseMatrix< DataType >::dot_plus().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ dense_transposed_matvec()

template<typename DataType >
void cMatrixOperations< DataType >::dense_transposed_matvec ( const DataType *RESTRICT  A,
const DataType *RESTRICT  b,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
const FlagType  A_is_row_major,
const FlagType  A_is_symmetric,
DataType *RESTRICT  c 
)
static

Computes matrix vector multiplication \(\boldsymbol{c} = \mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is dense, and \( \mathbf{A}^{\intercal} \) is the transpose of the matrix \( \mathbf{A} \).

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float, the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A1D array that represents a 2D dense array with either C (row) major ordering or Fortran (column) major ordering. The major ordering should de defined by A_is_row_major flag.
[in]bColumn vector
[in]num_rowsNumber of rows of A
[in]num_columnsNumber of columns of A
[in]A_is_row_majorBoolean, can be 0 or 1 as follows:
  • If A is row major (C ordering where the last index is contiguous) this value should be 1.
  • f A is column major (Fortran ordering where the first index is contiguous), this value should be set to 0.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[out]cThe output column vector (written in-place).

Definition at line 442 of file c_matrix_operations.cpp.

450{
451 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
452
453 // Using BLAS
454 CBLAS_LAYOUT layout;
455 CBLAS_UPLO uplo;
456 CBLAS_TRANSPOSE transpose = CblasTrans;
457 int lda;
458 if (A_is_row_major)
459 {
460 layout = CblasRowMajor;
461 uplo = CblasUpper;
462 lda = num_columns;
463 }
464 else
465 {
466 layout = CblasColMajor;
467 uplo = CblasLower;
468 lda = num_rows;
469
470 // For efficiency, use transpose op for symmetric column-major matrices
471 if (A_is_symmetric)
472 {
473 transpose = CblasNoTrans;
474 }
475 }
476
477 int incb = 1;
478 int incc = 1;
479 DataType alpha = 1.0;
480 DataType beta = 0.0;
481
482 if (A_is_symmetric)
483 {
484 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
485 beta, c, incc);
486 }
487 else
488 {
489 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
490 lda, b, incb, beta, c, incc);
491 }
492
493 #else
494
495 // Not using BLAS
496 long int i;
497 long int jump;
498 long double sum;
499 const long int chunk = 5;
500 const long int num_rows_chunked = num_rows - (num_rows % chunk);
501
502 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
503 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
504 (void) num_rows_chunked;
505 (void) chunk;
506 #endif
507
508 // Determine major order of A
509 if (!A_is_row_major)
510 {
511 // For column-major (Fortran ordering) matrix A (symmetric or
512 // non-symmetric)
513 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
514 #pragma omp parallel for \
515 schedule(static) \
516 if (!omp_in_parallel()) \
517 default(none) \
518 shared(A, b, c, jump, num_rows, num_columns, num_rows_chunked, \
519 chunk) \
520 private(i, sum)
521 #endif
522 for (long int j=0; j < num_columns; ++j)
523 {
524 // Loop unrolling
525 sum = 0.0;
526 jump = num_rows * j;
527 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
528 for (i=0; i < num_rows_chunked; i += chunk)
529 {
530 sum += A[i + jump] * b[i] +
531 A[i+1 + jump] * b[i+1] +
532 A[i+2 + jump] * b[i+2] +
533 A[i+3 + jump] * b[i+3] +
534 A[i+4 + jump] * b[i+4];
535 }
536 #endif
537
538 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
539 for (i=num_rows_chunked; i < num_rows; ++i)
540 #else
541 for (i=0; i < num_rows; ++i)
542 #endif
543 {
544 sum += A[i + jump] * b[i];
545 }
546
547 c[j] = static_cast<DataType>(sum);
548 }
549 }
550 else if (A_is_symmetric)
551 {
552 // A is row-major, but symmetric, so we can use its non-transposed
553 // operation instead, which is more efficient for row-major
554 // matrices.
556 A, b, num_rows, num_columns, A_is_row_major, A_is_symmetric, c);
557 }
558 else
559 {
560 // For row-major (C ordering) non-symmetric matrix A
561 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
562 #pragma omp parallel for \
563 schedule(static) \
564 if (!omp_in_parallel()) \
565 default(none) \
566 shared(A, b, c, num_rows, num_columns) \
567 private(i, sum)
568 #endif
569 for (long int j=0; j < num_columns; ++j)
570 {
571 sum = 0.0;
572 for (i=0; i < num_rows; ++i)
573 {
574 sum += A[i*num_columns + j] * b[i];
575 }
576 c[j] = static_cast<DataType>(sum);
577 }
578 }
579
580 #endif
581}
static void dense_matvec(const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes the matrix vector multiplication where is a dense matrix.

References cMatrixOperations< DataType >::dense_matvec().

Referenced by cMatrixOperations< DataType >::dense_matvec(), and cDenseMatrix< DataType >::transpose_dot().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ dense_transposed_matvec_plus()

template<typename DataType >
void cMatrixOperations< DataType >::dense_transposed_matvec_plus ( const DataType *RESTRICT  A,
const DataType *RESTRICT  b,
const DataType  alpha,
const LongIndexType  num_rows,
const LongIndexType  num_columns,
const FlagType  A_is_row_major,
const FlagType  A_is_symmetric,
DataType *RESTRICT  c 
)
static

Computes \( \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b} \) where \( \mathbf{A} \) is dense, and \( \mathbf{A}^{\intercal} \) is the transpose of the matrix \( \mathbf{A} \).

The reduction variable (here, sum ) is of the type long double. This is becase when DataType is float the summation loses the precision, especially when the vector size is large. It seems that using long double is slightly faster than using double. The advantage of using a type with larger bits for the reduction variable is only sensible if the compiler is optimized with -O2 or -O3 flags.

Parameters
[in]A1D array that represents a 2D dense array with either C (row) major ordering or Fortran (column) major ordering. The major ordering should de defined by A_is_row_major flag.
[in]bColumn vector
[in]alphaA scalar that scales the matrix vector multiplication.
[in]num_rowsNumber of rows of A
[in]num_columnsNumber of columns of A
[in]A_is_row_majorBoolean, can be 0 or 1 as follows:
  • If A is row major (C ordering where the last index is contiguous) this value should be 1.
  • f A is column major (Fortran ordering where the first index is contiguous), this value should be set to 0.
[in]A_is_symmetricBoolean. If A is symmetric, set this value to 1, otherwise 0.
[in,out]cThe output column vector (written in-place).

Definition at line 627 of file c_matrix_operations.cpp.

636{
637 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
638
639 // Using BLAS
640 CBLAS_LAYOUT layout;
641 CBLAS_TRANSPOSE transpose = CblasTrans;
642 CBLAS_UPLO uplo;
643 int lda;
644 if (A_is_row_major)
645 {
646 layout = CblasRowMajor;
647 uplo = CblasUpper;
648 lda = num_columns;
649 }
650 else
651 {
652 layout = CblasColMajor;
653 uplo = CblasLower;
654 lda = num_rows;
655
656 // For efficiency, use transpose op for symmetric column-major matrices
657 if (A_is_symmetric)
658 {
659 transpose = CblasNoTrans;
660 }
661 }
662
663 int incb = 1;
664 int incc = 1;
665 DataType beta = 1.0;
666
667 if (A_is_symmetric)
668 {
669 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
670 beta, c, incc);
671 }
672 else
673 {
674 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
675 lda, b, incb, beta, c, incc);
676 }
677
678 #else
679
680 // Not using BLAS
681 DataType zero = 0.0;
682 if (c_arithmetics::is_equal(alpha, zero))
683 {
684 return;
685 }
686
687 long int i;
688 long int jump;
689 long double sum;
690 const long int chunk = 5;
691 const long int num_rows_chunked = num_rows - (num_rows % chunk);
692
693 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
694 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
695 (void) num_rows_chunked;
696 (void) chunk;
697 #endif
698
699 // Determine major order of A
700 if (!A_is_row_major)
701 {
702 // For column-major (Fortran ordering) matrix A (symmetric or
703 // non-symmetric)
704 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
705 #pragma omp parallel for \
706 schedule(static) \
707 if (!omp_in_parallel()) \
708 default(none) \
709 shared(A, b, c, jump, alpha, num_rows, num_columns, \
710 num_rows_chunked, chunk) \
711 private(i, sum)
712 #endif
713 for (long int j=0; j < num_columns; ++j)
714 {
715 sum = 0.0;
716 jump = num_rows * j;
717 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
718 for (i=0; i < num_rows_chunked; i += chunk)
719 {
720 sum += A[i + jump] * b[i] +
721 A[i+1 + jump] * b[i+1] +
722 A[i+2 + jump] * b[i+2] +
723 A[i+3 + jump] * b[i+3] +
724 A[i+4 + jump] * b[i+4];
725 }
726 #endif
727
728 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
729 for (i=num_rows_chunked; i < num_rows; ++i)
730 #else
731 for (i=0; i < num_rows; ++i)
732 #endif
733 {
734 sum += A[i + jump] * b[i];
735 }
736
737 c[j] += alpha * static_cast<DataType>(sum);
738 }
739 }
740 else if (A_is_symmetric)
741 {
742 // A is row-major, but symmetric, so we can use its non-transposed
743 // operation instead, which is more efficient for row-major
744 // matrices.
746 A, b, alpha, num_rows, num_columns, A_is_row_major, A_is_symmetric,
747 c);
748 }
749 else
750 {
751 // For row-major (C ordering) non-symmetric matrix A
752 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
753 #pragma omp parallel for \
754 schedule(static) \
755 if (!omp_in_parallel()) \
756 default(none) \
757 shared(A, b, c, alpha, num_rows, num_columns) \
758 private(i, sum)
759 #endif
760 for (long int j=0; j < num_columns; ++j)
761 {
762 sum = 0.0;
763 for (i=0; i < num_rows; ++i)
764 {
765 sum += A[i*num_columns + j] * b[i];
766 }
767 c[j] += alpha * static_cast<DataType>(sum);
768 }
769 }
770
771 #endif
772}
static void dense_matvec_plus(const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes the operation where is a dense matrix.

References cMatrixOperations< DataType >::dense_matvec_plus(), and c_arithmetics::is_equal().

Referenced by cMatrixOperations< DataType >::dense_matvec_plus(), and cDenseMatrix< DataType >::transpose_dot_plus().

Here is the call graph for this function:
Here is the caller graph for this function:

The documentation for this class was generated from the following files: