17#include "../_definitions/definitions.h"
18#include "../_cu_definitions/cu_types.h"
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
28#include "../_cu_basic_algebra/cu_matrix_operations.h"
29#include "../_cu_basic_algebra/cusparse_api.h"
30#include "../_cuda_utilities/cuda_api.h"
31#include "../_cu_arithmetics/cu_arithmetics.h"
41template <
typename DataType>
45 A_index_pointer(NULL),
47 device_A_indices(NULL),
48 device_A_index_pointer(NULL),
50 device_buffer_num_bytes(NULL),
51 cusparse_matrix_A(NULL)
83template <
typename DataType>
85 const DataType* A_data_,
91 const int num_gpu_devices_):
100 A_indices(A_indices_),
101 A_index_pointer(A_index_pointer_),
103 device_A_indices(NULL),
104 device_A_index_pointer(NULL),
106 cusparse_matrix_A(NULL)
129template <
typename DataType>
133 if (this->copied_host_to_device)
136 for (
int device_id=0; device_id < this->num_gpu_devices; ++device_id)
144 this->device_A_indices[device_id]);
146 this->device_A_index_pointer[device_id]);
149 this->cusparse_matrix_A[device_id]);
154 if (this->device_A_data != NULL)
156 delete[] this->device_A_data;
157 this->device_A_data = NULL;
160 if (this->device_A_indices != NULL)
162 delete[] this->device_A_indices;
163 this->device_A_indices = NULL;
166 if (this->device_A_index_pointer != NULL)
168 delete[] this->device_A_index_pointer;
169 this->device_A_index_pointer = NULL;
172 if (this->device_buffer != NULL)
174 delete[] this->device_buffer;
175 this->device_buffer = NULL;
178 if (this->device_buffer_num_bytes != NULL)
180 delete[] this->device_buffer_num_bytes;
181 this->device_buffer_num_bytes = NULL;
184 if (this->cusparse_matrix_A != NULL)
186 delete[] this->cusparse_matrix_A;
187 this->cusparse_matrix_A = NULL;
204template <
typename DataType>
207 if (!this->copied_host_to_device)
210 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
222 #error CUDA_VERSION Undefined!
223 #elif CUDA_VERSION < 12000
231 this->device_A_data =
new DataType*[this->num_gpu_devices];
232 this->device_A_indices =
new LongIndexType*[this->num_gpu_devices];
233 this->device_A_index_pointer = \
235 this->cusparse_matrix_A = \
236 new cusparseSpMatDescr_t[this->num_gpu_devices];
238 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
243 unsigned int thread_id;
244 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
256 this->A_data, A_data_size, this->device_A_data[thread_id]);
260 this->device_A_indices[thread_id], A_indices_size);
262 this->A_indices, A_indices_size,
263 this->device_A_indices[thread_id]);
267 this->device_A_index_pointer[thread_id],
268 A_index_pointer_size);
270 this->A_index_pointer, A_index_pointer_size,
271 this->device_A_index_pointer[thread_id]);
275 #error CUDA_VERSION Undefined!
276 #elif CUDA_VERSION < 12000
279 this->cusparse_matrix_A[thread_id], csc_num_rows,
280 csc_num_columns, A_nnz, this->device_A_data[thread_id],
281 this->device_A_indices[thread_id],
282 this->device_A_index_pointer[thread_id]);
286 this->cusparse_matrix_A[thread_id], this->num_rows,
287 this->num_columns, A_nnz,
288 this->device_A_data[thread_id],
289 this->device_A_indices[thread_id],
290 this->device_A_index_pointer[thread_id]);
295 this->copied_host_to_device =
true;
334template <
typename DataType>
337 cusparseOperation_t cusparse_operation,
338 const DataType alpha,
340 cusparseDnVecDescr_t& cusparse_input_vector,
341 cusparseDnVecDescr_t& cusparse_output_vector,
342 cusparseSpMVAlg_t algorithm)
345 size_t required_buffer_size;
347 this->cusparse_handle[device_id], cusparse_operation, alpha,
348 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
349 cusparse_output_vector, algorithm, &required_buffer_size);
351 if (this->device_buffer_num_bytes[device_id] != required_buffer_size)
354 this->device_buffer_num_bytes[device_id] = required_buffer_size;
361 this->device_buffer[device_id],
362 this->device_buffer_num_bytes[device_id]);
380template <
typename DataType>
386 DataType matrix_element;
387 const DataType diagonal = 1.0;
388 const DataType off_diagonal = 0.0;
391 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
392 #pragma omp parallel for \
394 if (!omp_in_parallel()) \
396 shared(matrix_is_identity, diagonal, off_diagonal) \
397 private(index_pointer, row, matrix_element)
399 for (
LongIndexType column=0; column < this->num_columns; ++column)
401 if (matrix_is_identity)
403 for (index_pointer=this->A_index_pointer[column];
404 index_pointer < this->A_index_pointer[column+1];
407 row = this->A_indices[index_pointer];
409 if (!((this->A_is_symmetric) && (column >= row)))
411 matrix_element = this->A_data[index_pointer];
413 if (((row == column) && \
416 ((row != column) && \
420 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
421 #pragma omp atomic write
423 matrix_is_identity = 0;
432 return matrix_is_identity;
448template <
typename DataType>
451 return this->A_index_pointer[this->num_columns];
477template <
typename DataType>
479 const DataType* device_vector,
480 DataType* device_product)
482 assert(this->copied_host_to_device);
485 cusparseDnVecDescr_t cusparse_input_vector;
487 cusparse_input_vector, this->num_columns,
488 const_cast<DataType*
>(device_vector));
491 cusparseDnVecDescr_t cusparse_output_vector;
493 cusparse_output_vector, this->num_rows, device_product);
500 #error CUDA_VERSION Undefined!
501 #elif CUDA_VERSION < 12000
503 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
505 cusparseOperation_t cusparse_operation = \
506 CUSPARSE_OPERATION_NON_TRANSPOSE;
515 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
516 cusparse_input_vector, cusparse_output_vector,
521 this->cusparse_handle[device_id], cusparse_operation, alpha,
522 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
523 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
555template <
typename DataType>
557 const DataType* device_vector,
558 const DataType alpha,
559 DataType* device_product)
561 assert(this->copied_host_to_device);
564 cusparseDnVecDescr_t cusparse_input_vector;
566 cusparse_input_vector, this->num_columns,
567 const_cast<DataType*
>(device_vector));
570 cusparseDnVecDescr_t cusparse_output_vector;
572 cusparse_output_vector, this->num_rows, device_product);
578 #error CUDA_VERSION Undefined!
579 #elif CUDA_VERSION < 12000
581 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
583 cusparseOperation_t cusparse_operation = \
584 CUSPARSE_OPERATION_NON_TRANSPOSE;
593 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
594 cusparse_input_vector, cusparse_output_vector,
599 this->cusparse_handle[device_id], cusparse_operation, alpha,
600 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
601 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
631template <
typename DataType>
633 const DataType* device_vector,
634 DataType* device_product)
636 assert(this->copied_host_to_device);
639 cusparseDnVecDescr_t cusparse_input_vector;
641 cusparse_input_vector, this->num_columns,
642 const_cast<DataType*
>(device_vector));
645 cusparseDnVecDescr_t cusparse_output_vector;
647 cusparse_output_vector, this->num_rows, device_product);
654 #error CUDA_VERSION Undefined!
655 #elif CUDA_VERSION < 12000
657 cusparseOperation_t cusparse_operation = \
658 CUSPARSE_OPERATION_NON_TRANSPOSE;
660 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
669 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
670 cusparse_input_vector, cusparse_output_vector,
675 this->cusparse_handle[device_id], cusparse_operation, alpha,
676 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
677 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
710template <
typename DataType>
712 const DataType* device_vector,
713 const DataType alpha,
714 DataType* device_product)
716 assert(this->copied_host_to_device);
719 cusparseDnVecDescr_t cusparse_input_vector;
721 cusparse_input_vector, this->num_columns,
722 const_cast<DataType*
>(device_vector));
725 cusparseDnVecDescr_t cusparse_output_vector;
727 cusparse_output_vector, this->num_rows, device_product);
733 #error CUDA_VERSION Undefined!
734 #elif CUDA_VERSION < 12000
736 cusparseOperation_t cusparse_operation = \
737 CUSPARSE_OPERATION_NON_TRANSPOSE;
739 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
748 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
749 cusparse_input_vector, cusparse_output_vector,
754 this->cusparse_handle[device_id], cusparse_operation, alpha,
755 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
756 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
768#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
772#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
776#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
780#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
784#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
788#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
static ArrayType * alloc(const size_t array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static void alloc_bytes(void *&device_array, const size_t num_bytes)
Allocates memory on gpu device. This function uses an existing given pointer.
static int get_device()
Gets the current device in multi-gpu applications.
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Base class for cLinearOperator and cuLinearOperator . This class is not templated so that both cpp an...
Container for CSC matrices.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
Transposed-matrix vector product.
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Transposed-matrix vector product written in place.
virtual void dot(const DataType *device_vector, DataType *device_product)
Matrix vector product.
LongIndexType get_nnz() const
Returns the number of non-zero elements of the sparse matrix.
virtual FlagType is_identity_matrix() const
Checks whether the matrix is identity.
size_t * device_buffer_num_bytes
cuCSCMatrix()
Default constructor.
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Matrix vector product written in place.
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
void allocate_buffer(const int device_id, cusparseOperation_t cusparse_operation, const DataType alpha, const DataType beta, cusparseDnVecDescr_t &cusparse_input_vector, cusparseDnVecDescr_t &cusparse_output_vector, cusparseSpMVAlg_t algorithm)
Allocates an external buffer for matrix-vector multiplication using cusparseSpMV function.
virtual ~cuCSCMatrix()
Destructor.
Base class for linear operators. This class serves as interface for all derived classes.
void initialize_cusparse_handle()
Creates a cusparseHandle_t object, if not created already.
Base class for constant matrices.
void omp_set_num_threads(int num_threads)
#define CUSPARSE_SPMV_ALG_DEFAULT
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
void create_cusparse_csc_matrix(cusparseSpMatDescr_t &cusparse_matrix, const DataIndexType num_rows, const DataIndexType num_columns, const DataIndexType nnz, DataType *RESTRICT device_A_data, DataIndexType *RESTRICT device_A_indices, DataIndexType *RESTRICT device_A_index_pointer)
void cusparse_matvec(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, void *external_buffer)
void destroy_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix)
Destroy cusparse matrix.
void create_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector, const LongIndexType vector_size, DataType *RESTRICT device_vector)
void destroy_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector)
Destroys cusparse vector.
void cusparse_matrix_buffer_size(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, size_t *buffer_size)
void create_cusparse_csr_matrix(cusparseSpMatDescr_t &cusparse_matrix, const DataIndexType num_rows, const DataIndexType num_columns, const DataIndexType nnz, DataType *RESTRICT device_A_data, DataIndexType *RESTRICT device_A_indices, DataIndexType *RESTRICT device_A_index_pointer)