17#include "../_definitions/definitions.h"
18#include "../_cu_definitions/cu_types.h"
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
27#include "../_cu_arithmetics/cu_arithmetics.h"
28#include "../_cu_basic_algebra/cu_matrix_operations.h"
29#include "../_cuda_utilities/cuda_api.h"
39template <
typename DataType>
76template <
typename DataType>
83 const int num_gpu_devices_):
93 A_is_row_major(A_is_row_major_)
107template <
typename DataType>
111 if (this->copied_host_to_device)
114 for (
int device_id = 0; device_id < this->num_gpu_devices; ++device_id)
123 delete[] this->device_A;
124 this->device_A = NULL;
136template <
typename DataType>
139 if (!this->copied_host_to_device)
142 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
147 this->device_A =
new DataType*[this->num_gpu_devices];
150 size_t A_size =
static_cast<size_t>(this->num_rows) * \
151 static_cast<size_t>(this->num_columns);
153 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
158 unsigned int thread_id;
159 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
170 this->device_A[thread_id]);
174 this->copied_host_to_device =
true;
192template <
typename DataType>
196 DataType matrix_element;
197 const DataType diagonal = 1.0;
198 const DataType off_diagonal = 0.0;
201 if (this->A_is_row_major)
207 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
208 #pragma omp parallel for \
210 if (!omp_in_parallel()) \
212 shared(matrix_is_identity, diagonal, off_diagonal) \
213 private(column, num_checking_columns, matrix_element)
217 if (matrix_is_identity)
219 if (this->A_is_symmetric)
222 num_checking_columns = row + 1;
226 num_checking_columns = this->num_columns;
229 for (column=0; column < num_checking_columns; ++column)
232 matrix_element = this->A[row * this->num_columns + column];
235 if (((row == column) && \
238 ((row != column) && \
242 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
243 #pragma omp atomic write
245 matrix_is_identity = 0;
259 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
260 #pragma omp parallel for \
262 if (!omp_in_parallel()) \
264 shared(matrix_is_identity, diagonal, off_diagonal) \
265 private(row, num_checking_rows, matrix_element)
267 for (
LongIndexType column=0; column < this-> num_columns; ++column)
269 if (matrix_is_identity)
271 if (this->A_is_symmetric)
274 num_checking_rows = column + 1;
278 num_checking_rows = this->num_rows;
281 for (row=0; row < num_checking_rows; ++row)
284 matrix_element = this->A[column * this->num_rows + row];
287 if (((row == column) && \
290 ((row != column) && \
294 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
295 #pragma omp atomic write
297 matrix_is_identity = 0;
306 return matrix_is_identity;
332template <
typename DataType>
334 const DataType* device_vector,
335 DataType* device_product)
337 assert(this->copied_host_to_device);
343 this->cublas_handle[device_id],
344 this->device_A[device_id],
348 this->A_is_row_major,
377template <
typename DataType>
379 const DataType* device_vector,
380 const DataType alpha,
381 DataType* device_product)
383 assert(this->copied_host_to_device);
389 this->cublas_handle[device_id],
390 this->device_A[device_id],
395 this->A_is_row_major,
422template <
typename DataType>
424 const DataType* device_vector,
425 DataType* device_product)
427 assert(this->copied_host_to_device);
433 this->cublas_handle[device_id],
434 this->device_A[device_id],
438 this->A_is_row_major,
468template <
typename DataType>
470 const DataType* device_vector,
471 const DataType alpha,
472 DataType* device_product)
474 assert(this->copied_host_to_device);
480 this->cublas_handle[device_id],
481 this->device_A[device_id],
486 this->A_is_row_major,
495#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
499#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
503#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
507#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
511#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
515#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
static ArrayType * alloc(const size_t array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static int get_device()
Gets the current device in multi-gpu applications.
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Base class for cLinearOperator and cuLinearOperator. This class is not templated so that both cpp an...
Container for dense matrices.
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Transposed-matrix vector product written in place.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
Transposed-matrix vector product.
virtual FlagType is_identity_matrix() const
Checks whether the matrix is identity.
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Matrix vector product written in place.
cuDenseMatrix()
Default constructor.
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
virtual ~cuDenseMatrix()
Destructor. This function removes data from GPU devices.
virtual void dot(const DataType *device_vector, DataType *device_product)
Matrix vector product.
Base class for linear operators. This class serves as interface for all derived classes.
void initialize_cublas_handle()
Creates a cublasHandle_t object, if not created already.
static void dense_matvec(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
Computes the matrix vector multiplication $ \boldsymbol{c} = \mathbf{A} \boldsymbol{b} $ where $ \mathbf{A} $ is a dense matrix.
static void dense_transposed_matvec(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
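Computes matrix vector multiplication $ \boldsymbol{c} = \mathbf{A}^{\intercal} \boldsymbol{b} $ where $ \mathbf{A} $ is dense, and $ \mathbf{A}^{\intercal} $ is the transpose of the matrix $ \mathbf{A} $.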
static void dense_transposed_matvec_plus(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
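Computes $ \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b} $ where $ \mathbf{A} $ is dense, and $ \mathbf{A}^{\intercal} $ is the transpose of the matrix $ \mathbf{A} $.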
static void dense_matvec_plus(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
Computes the operation $ \boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b} $ where $ \mathbf{A} $ is a dense matrix.
Base class for constant matrices.
void omp_set_num_threads(int num_threads)
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.