17#include "../_definitions/definitions.h"
18#include "../_cu_definitions/cu_types.h"
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
27#include "../_cu_basic_algebra/cu_matrix_operations.h"
28#include "../_cu_basic_algebra/cusparse_api.h"
29#include "../_cuda_utilities/cuda_api.h"
30#include "../_cu_arithmetics/cu_arithmetics.h"
40template <
typename DataType>
44 A_index_pointer(NULL),
46 device_A_indices(NULL),
47 device_A_index_pointer(NULL),
49 device_buffer_num_bytes(NULL),
50 cusparse_matrix_A(NULL)
82template <
typename DataType>
84 const DataType* A_data_,
90 const int num_gpu_devices_):
99 A_indices(A_indices_),
100 A_index_pointer(A_index_pointer_),
102 device_A_indices(NULL),
103 device_A_index_pointer(NULL),
105 cusparse_matrix_A(NULL)
128template <
typename DataType>
132 if (this->copied_host_to_device)
135 for (
int device_id=0; device_id < this->num_gpu_devices; ++device_id)
143 this->device_A_indices[device_id]);
145 this->device_A_index_pointer[device_id]);
148 this->cusparse_matrix_A[device_id]);
153 if (this->device_A_data != NULL)
155 delete[] this->device_A_data;
156 this->device_A_data = NULL;
159 if (this->device_A_indices != NULL)
161 delete[] this->device_A_indices;
162 this->device_A_indices = NULL;
165 if (this->device_A_index_pointer != NULL)
167 delete[] this->device_A_index_pointer;
168 this->device_A_index_pointer = NULL;
171 if (this->device_buffer != NULL)
173 delete[] this->device_buffer;
174 this->device_buffer = NULL;
177 if (this->device_buffer_num_bytes != NULL)
179 delete[] this->device_buffer_num_bytes;
180 this->device_buffer_num_bytes = NULL;
183 if (this->cusparse_matrix_A != NULL)
185 delete[] this->cusparse_matrix_A;
186 this->cusparse_matrix_A = NULL;
198template <
typename DataType>
201 if (!this->copied_host_to_device)
204 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
215 this->device_A_data =
new DataType*[this->num_gpu_devices];
216 this->device_A_indices =
new LongIndexType*[this->num_gpu_devices];
217 this->device_A_index_pointer = \
219 this->cusparse_matrix_A = \
220 new cusparseSpMatDescr_t[this->num_gpu_devices];
222 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
227 unsigned int thread_id;
228 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
240 this->A_data, A_data_size, this->device_A_data[thread_id]);
244 this->device_A_indices[thread_id], A_indices_size);
246 this->A_indices, A_indices_size,
247 this->device_A_indices[thread_id]);
251 this->device_A_index_pointer[thread_id],
252 A_index_pointer_size);
254 this->A_index_pointer, A_index_pointer_size,
255 this->device_A_index_pointer[thread_id]);
259 this->cusparse_matrix_A[thread_id], this->num_rows,
260 this->num_columns, A_nnz, this->device_A_data[thread_id],
261 this->device_A_indices[thread_id],
262 this->device_A_index_pointer[thread_id]);
266 this->copied_host_to_device =
true;
305template <
typename DataType>
308 cusparseOperation_t cusparse_operation,
309 const DataType alpha,
311 cusparseDnVecDescr_t& cusparse_input_vector,
312 cusparseDnVecDescr_t& cusparse_output_vector,
313 cusparseSpMVAlg_t algorithm)
316 size_t required_buffer_size;
318 this->cusparse_handle[device_id], cusparse_operation, alpha,
319 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
320 cusparse_output_vector, algorithm, &required_buffer_size);
322 if (this->device_buffer_num_bytes[device_id] != required_buffer_size)
325 this->device_buffer_num_bytes[device_id] = required_buffer_size;
332 this->device_buffer[device_id],
333 this->device_buffer_num_bytes[device_id]);
351template <
typename DataType>
357 DataType matrix_element;
358 const DataType diagonal = 1.0;
359 const DataType off_diagonal = 0.0;
362 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
363 #pragma omp parallel for \
365 if (!omp_in_parallel()) \
367 shared(matrix_is_identity, diagonal, off_diagonal) \
368 private(index_pointer, column, matrix_element)
372 if (matrix_is_identity)
374 for (index_pointer=this->A_index_pointer[row];
375 index_pointer < this->A_index_pointer[row+1];
378 column = this->A_indices[index_pointer];
380 if (!((this->A_is_symmetric) && (column >= row)))
382 matrix_element = this->A_data[index_pointer];
384 if (((row == column) && \
387 ((row != column) && \
391 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
392 #pragma omp atomic write
394 matrix_is_identity = 0;
403 return matrix_is_identity;
419template <
typename DataType>
422 return this->A_index_pointer[this->num_rows];
448template <
typename DataType>
450 const DataType* device_vector,
451 DataType* device_product)
453 assert(this->copied_host_to_device);
456 cusparseDnVecDescr_t cusparse_input_vector;
458 cusparse_input_vector, this->num_columns,
459 const_cast<DataType*
>(device_vector));
462 cusparseDnVecDescr_t cusparse_output_vector;
464 cusparse_output_vector, this->num_rows, device_product);
469 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
476 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
477 cusparse_input_vector, cusparse_output_vector,
482 this->cusparse_handle[device_id], cusparse_operation, alpha,
483 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
484 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
516template <
typename DataType>
518 const DataType* device_vector,
519 const DataType alpha,
520 DataType* device_product)
522 assert(this->copied_host_to_device);
525 cusparseDnVecDescr_t cusparse_input_vector;
527 cusparse_input_vector, this->num_columns,
528 const_cast<DataType*
>(device_vector));
531 cusparseDnVecDescr_t cusparse_output_vector;
533 cusparse_output_vector, this->num_rows, device_product);
537 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
544 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
545 cusparse_input_vector, cusparse_output_vector,
550 this->cusparse_handle[device_id], cusparse_operation, alpha,
551 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
552 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
582template <
typename DataType>
584 const DataType* device_vector,
585 DataType* device_product)
587 assert(this->copied_host_to_device);
590 cusparseDnVecDescr_t cusparse_input_vector;
592 cusparse_input_vector, this->num_columns,
593 const_cast<DataType*
>(device_vector));
596 cusparseDnVecDescr_t cusparse_output_vector;
598 cusparse_output_vector, this->num_rows, device_product);
603 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
610 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
611 cusparse_input_vector, cusparse_output_vector,
616 this->cusparse_handle[device_id], cusparse_operation, alpha,
617 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
618 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
651template <
typename DataType>
653 const DataType* device_vector,
654 const DataType alpha,
655 DataType* device_product)
657 assert(this->copied_host_to_device);
660 cusparseDnVecDescr_t cusparse_input_vector;
662 cusparse_input_vector, this->num_columns,
663 const_cast<DataType*
>(device_vector));
666 cusparseDnVecDescr_t cusparse_output_vector;
668 cusparse_output_vector, this->num_rows, device_product);
672 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
679 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
680 cusparse_input_vector, cusparse_output_vector,
685 this->cusparse_handle[device_id], cusparse_operation, alpha,
686 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
687 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
699#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
703#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
707#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
711#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
715#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
719#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
static ArrayType * alloc(const size_t array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static void alloc_bytes(void *&device_array, const size_t num_bytes)
Allocates memory on gpu device. This function uses an existing given pointer.
static int get_device()
Gets the current device in multi-gpu applications.
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Base class for cLinearOperator and cuLinearOperator . This class is not templated so that both cpp an...
Container for CSR matrices.
size_t * device_buffer_num_bytes
cuCSRMatrix()
Default constructor.
virtual ~cuCSRMatrix()
Destructor.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
Transposed-matrix vector product.
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Matrix vector product written in place.
LongIndexType get_nnz() const
Returns the number of non-zero elements of the sparse matrix.
virtual void dot(const DataType *device_vector, DataType *device_product)
Matrix vector product.
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Transposed-matrix vector product written in place.
void allocate_buffer(const int device_id, cusparseOperation_t cusparse_operation, const DataType alpha, const DataType beta, cusparseDnVecDescr_t &cusparse_input_vector, cusparseDnVecDescr_t &cusparse_output_vector, cusparseSpMVAlg_t algorithm)
Allocates an external buffer for matrix-vector multiplication using cusparseSpMV function.
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
virtual FlagType is_identity_matrix() const
Checks whether the matrix is identity.
Base class for linear operators. This class serves as interface for all derived classes.
void initialize_cusparse_handle()
Creates a cusparseHandle_t object, if not created already.
Base class for constant matrices.
void omp_set_num_threads(int num_threads)
#define CUSPARSE_SPMV_ALG_DEFAULT
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
void cusparse_matvec(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, void *external_buffer)
void destroy_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix)
Destroy cusparse matrix.
void create_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector, const LongIndexType vector_size, DataType *RESTRICT device_vector)
void destroy_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector)
Destroys cusparse vector.
void cusparse_matrix_buffer_size(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, size_t *buffer_size)
void create_cusparse_csr_matrix(cusparseSpMatDescr_t &cusparse_matrix, const DataIndexType num_rows, const DataIndexType num_columns, const DataIndexType nnz, DataType *RESTRICT device_A_data, DataIndexType *RESTRICT device_A_indices, DataIndexType *RESTRICT device_A_index_pointer)