imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cu_csr_matrix.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
16#include "./cu_csr_matrix.h"
17#include "../_definitions/definitions.h" // USE_OPENMP
18#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
19 // __half, __nv_bfloat16
20
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
22 #include <omp.h> // omp_set_num_threads
23#endif
24
25#include <cstddef> // NULL
26#include <cassert> // assert
27#include "../_cu_basic_algebra/cu_matrix_operations.h" // cuMatrixOperations
28#include "../_cu_basic_algebra/cusparse_api.h" // cusparse_api
29#include "../_cuda_utilities/cuda_api.h" // CudaAPI
30#include "../_cu_arithmetics/cu_arithmetics.h" // cu_arithmetics
31
32
33// =============
34// constructor 1
35// =============
36
39
40template <typename DataType>
42 A_data(NULL),
43 A_indices(NULL),
44 A_index_pointer(NULL),
45 device_A_data(NULL),
46 device_A_indices(NULL),
47 device_A_index_pointer(NULL),
48 device_buffer(NULL),
49 device_buffer_num_bytes(NULL),
50 cusparse_matrix_A(NULL)
51{
52}
53
54
55// =============
56// constructor 2
57// =============
58
81
82template <typename DataType>
84 const DataType* A_data_,
85 const LongIndexType* A_indices_,
86 const LongIndexType* A_index_pointer_,
87 const LongIndexType num_rows_,
88 const LongIndexType num_columns_,
89 const FlagType A_is_symmetric_,
90 const int num_gpu_devices_):
91
92 // Base class constructor
93 cLinearOperatorBase(num_rows_, num_columns_),
94 cuLinearOperator<DataType>(num_gpu_devices_),
95 cuMatrix<DataType>(A_is_symmetric_),
96
97 // Initializer list
98 A_data(A_data_),
99 A_indices(A_indices_),
100 A_index_pointer(A_index_pointer_),
101 device_A_data(NULL),
102 device_A_indices(NULL),
103 device_A_index_pointer(NULL),
104 device_buffer(NULL),
105 cusparse_matrix_A(NULL)
106{
108 this->copy_host_to_device();
109
110 // Initialize device buffer
111 this->device_buffer = new void*[this->num_gpu_devices];
112 this->device_buffer_num_bytes = new size_t[this->num_gpu_devices];
113 for (int device_id=0; device_id < this->num_gpu_devices; ++device_id)
114 {
115 this->device_buffer[device_id] = NULL;
116 this->device_buffer_num_bytes[device_id] = 0;
117 }
118}
119
120
121// ==========
122// destructor
123// ==========
124
127
128template <typename DataType>
130{
131 // Member objects exist if the second constructor was called.
132 if (this->copied_host_to_device)
133 {
134 // Deallocate arrays of data on gpu
135 for (int device_id=0; device_id < this->num_gpu_devices; ++device_id)
136 {
137 // Switch to a device
139
140 // Deallocate
141 CudaAPI<DataType>::del(this->device_A_data[device_id]);
143 this->device_A_indices[device_id]);
145 this->device_A_index_pointer[device_id]);
146 CudaAPI<LongIndexType>::del(this->device_buffer[device_id]);
148 this->cusparse_matrix_A[device_id]);
149 }
150 }
151
152 // Deallocate arrays of pointers on cpu
153 if (this->device_A_data != NULL)
154 {
155 delete[] this->device_A_data;
156 this->device_A_data = NULL;
157 }
158
159 if (this->device_A_indices != NULL)
160 {
161 delete[] this->device_A_indices;
162 this->device_A_indices = NULL;
163 }
164
165 if (this->device_A_index_pointer != NULL)
166 {
167 delete[] this->device_A_index_pointer;
168 this->device_A_index_pointer = NULL;
169 }
170
171 if (this->device_buffer != NULL)
172 {
173 delete[] this->device_buffer;
174 this->device_buffer = NULL;
175 }
176
177 if (this->device_buffer_num_bytes != NULL)
178 {
179 delete[] this->device_buffer_num_bytes;
180 this->device_buffer_num_bytes = NULL;
181 }
182
183 if (this->cusparse_matrix_A != NULL)
184 {
185 delete[] this->cusparse_matrix_A;
186 this->cusparse_matrix_A = NULL;
187 }
188}
189
190
191// ===================
192// copy host to device
193// ===================
194
197
198template <typename DataType>
200{
201 if (!this->copied_host_to_device)
202 {
203 // Set the number of threads
204 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
205 omp_set_num_threads(this->num_gpu_devices);
206 #endif
207
208 // Array sizes
209 LongIndexType A_data_size = this->get_nnz();
210 LongIndexType A_indices_size = A_data_size;
211 LongIndexType A_index_pointer_size = this->num_rows + 1;
212 LongIndexType A_nnz = this->get_nnz();
213
214 // Create array of pointers for data on each gpu device
215 this->device_A_data = new DataType*[this->num_gpu_devices];
216 this->device_A_indices = new LongIndexType*[this->num_gpu_devices];
217 this->device_A_index_pointer = \
218 new LongIndexType*[this->num_gpu_devices];
219 this->cusparse_matrix_A = \
220 new cusparseSpMatDescr_t[this->num_gpu_devices];
221
222 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
223 #pragma omp parallel
224 #endif
225 {
226 // Switch to a device with the same device id as the cpu thread id
227 unsigned int thread_id;
228 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
229 thread_id = omp_get_thread_num();
230 #else
231 thread_id = 0;
232 #endif
233
235
236 // A_data
237 CudaAPI<DataType>::alloc(this->device_A_data[thread_id],
238 A_data_size);
240 this->A_data, A_data_size, this->device_A_data[thread_id]);
241
242 // A_indices
244 this->device_A_indices[thread_id], A_indices_size);
246 this->A_indices, A_indices_size,
247 this->device_A_indices[thread_id]);
248
249 // A_index_pointer
251 this->device_A_index_pointer[thread_id],
252 A_index_pointer_size);
254 this->A_index_pointer, A_index_pointer_size,
255 this->device_A_index_pointer[thread_id]);
256
257 // Create cusparse matrix
259 this->cusparse_matrix_A[thread_id], this->num_rows,
260 this->num_columns, A_nnz, this->device_A_data[thread_id],
261 this->device_A_indices[thread_id],
262 this->device_A_index_pointer[thread_id]);
263 }
264
265 // Flag to prevent reinitialization
266 this->copied_host_to_device = true;
267 }
268}
269
270
271// ===============
272// allocate buffer
273// ===============
274
304
305template <typename DataType>
307 const int device_id,
308 cusparseOperation_t cusparse_operation,
309 const DataType alpha,
310 const DataType beta,
311 cusparseDnVecDescr_t& cusparse_input_vector,
312 cusparseDnVecDescr_t& cusparse_output_vector,
313 cusparseSpMVAlg_t algorithm)
314{
315 // Find the buffer size needed for matrix-vector multiplication
316 size_t required_buffer_size;
318 this->cusparse_handle[device_id], cusparse_operation, alpha,
319 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
320 cusparse_output_vector, algorithm, &required_buffer_size);
321
322 if (this->device_buffer_num_bytes[device_id] != required_buffer_size)
323 {
324 // Update the buffer size
325 this->device_buffer_num_bytes[device_id] = required_buffer_size;
326
327 // Delete buffer if it was allocated previously
328 CudaAPI<DataType>::del(this->device_buffer[device_id]);
329
330 // Allocate (or reallocate) buffer on device.
332 this->device_buffer[device_id],
333 this->device_buffer_num_bytes[device_id]);
334 }
335}
336
337
338// ==================
339// is identity matrix
340// ==================
341
350
351template <typename DataType>
353{
354 FlagType matrix_is_identity = 1;
355 LongIndexType index_pointer;
356 LongIndexType column;
357 DataType matrix_element;
358 const DataType diagonal = 1.0;
359 const DataType off_diagonal = 0.0;
360
361 // Check matrix element-wise
362 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
363 #pragma omp parallel for \
364 schedule(static) \
365 if (!omp_in_parallel()) \
366 default(none) \
367 shared(matrix_is_identity, diagonal, off_diagonal) \
368 private(index_pointer, column, matrix_element)
369 #endif
370 for (LongIndexType row=0; row < this->num_rows; ++row)
371 {
372 if (matrix_is_identity)
373 {
374 for (index_pointer=this->A_index_pointer[row];
375 index_pointer < this->A_index_pointer[row+1];
376 ++index_pointer)
377 {
378 column = this->A_indices[index_pointer];
379
380 if (!((this->A_is_symmetric) && (column >= row)))
381 {
382 matrix_element = this->A_data[index_pointer];
383
384 if (((row == column) && \
385 (!cu_arithmetics::is_equal(matrix_element,
386 diagonal))) || \
387 ((row != column) && \
388 (!cu_arithmetics::is_equal(matrix_element,
389 off_diagonal))))
390 {
391 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
392 #pragma omp atomic write
393 #endif
394 matrix_is_identity = 0;
395
396 break;
397 }
398 }
399 }
400 }
401 }
402
403 return matrix_is_identity;
404}
405
406
407// =======
408// get nnz
409// =======
410
418
419template <typename DataType>
421{
422 return this->A_index_pointer[this->num_rows];
423}
424
425
426// ===
427// dot
428// ===
429
447
448template <typename DataType>
450 const DataType* device_vector,
451 DataType* device_product)
452{
453 assert(this->copied_host_to_device);
454
455 // Create cusparse vector for the input vector
456 cusparseDnVecDescr_t cusparse_input_vector;
458 cusparse_input_vector, this->num_columns,
459 const_cast<DataType*>(device_vector));
460
461 // Create cusparse vector for the output vector
462 cusparseDnVecDescr_t cusparse_output_vector;
464 cusparse_output_vector, this->num_rows, device_product);
465
466 // Matrix vector settings
467 DataType alpha = cu_arithmetics::cast<float, DataType>(1.0f);
468 DataType beta = cu_arithmetics::cast<float, DataType>(0.0f);
469 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
470 cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
471
472 // Get device id
473 int device_id = CudaAPI<DataType>::get_device();
474
475 // Allocate device buffer (or reallocation if needed)
476 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
477 cusparse_input_vector, cusparse_output_vector,
478 algorithm);
479
480 // Matrix vector multiplication
482 this->cusparse_handle[device_id], cusparse_operation, alpha,
483 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
484 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
485
486 // Destroy cusparse vectors
487 cusparse_api::destroy_cusparse_vector(cusparse_input_vector);
488 cusparse_api::destroy_cusparse_vector(cusparse_output_vector);
489}
490
491
492// ========
493// dot plus
494// ========
495
515
516template <typename DataType>
518 const DataType* device_vector,
519 const DataType alpha,
520 DataType* device_product)
521{
522 assert(this->copied_host_to_device);
523
524 // Create cusparse vector for the input vector
525 cusparseDnVecDescr_t cusparse_input_vector;
527 cusparse_input_vector, this->num_columns,
528 const_cast<DataType*>(device_vector));
529
530 // Create cusparse vector for the output vector
531 cusparseDnVecDescr_t cusparse_output_vector;
533 cusparse_output_vector, this->num_rows, device_product);
534
535 // Matrix vector settings
536 DataType beta = cu_arithmetics::cast<float, DataType>(1.0f);
537 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
538 cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
539
540 // Get device id
541 int device_id = CudaAPI<DataType>::get_device();
542
543 // Allocate device buffer (or reallocation if needed)
544 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
545 cusparse_input_vector, cusparse_output_vector,
546 algorithm);
547
548 // Matrix vector multiplication
550 this->cusparse_handle[device_id], cusparse_operation, alpha,
551 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
552 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
553
554 // Destroy cusparse vectors
555 cusparse_api::destroy_cusparse_vector(cusparse_input_vector);
556 cusparse_api::destroy_cusparse_vector(cusparse_output_vector);
557}
558
559
560// =============
561// transpose dot
562// =============
563
581
582template <typename DataType>
584 const DataType* device_vector,
585 DataType* device_product)
586{
587 assert(this->copied_host_to_device);
588
589 // Create cusparse vector for the input vector
590 cusparseDnVecDescr_t cusparse_input_vector;
592 cusparse_input_vector, this->num_columns,
593 const_cast<DataType*>(device_vector));
594
595 // Create cusparse vector for the output vector
596 cusparseDnVecDescr_t cusparse_output_vector;
598 cusparse_output_vector, this->num_rows, device_product);
599
600 // Matrix vector settings
601 DataType alpha = cu_arithmetics::cast<float, DataType>(1.0f);
602 DataType beta = cu_arithmetics::cast<float, DataType>(0.0f);
603 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
604 cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
605
606 // Get device id
607 int device_id = CudaAPI<DataType>::get_device();
608
609 // Allocate device buffer (or reallocation if needed)
610 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
611 cusparse_input_vector, cusparse_output_vector,
612 algorithm);
613
614 // Matrix vector multiplication
616 this->cusparse_handle[device_id], cusparse_operation, alpha,
617 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
618 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
619
620 // Destroy cusparse vectors
621 cusparse_api::destroy_cusparse_vector(cusparse_input_vector);
622 cusparse_api::destroy_cusparse_vector(cusparse_output_vector);
623}
624
625
626// ==================
627// transpose dot plus
628// ==================
629
650
651template <typename DataType>
653 const DataType* device_vector,
654 const DataType alpha,
655 DataType* device_product)
656{
657 assert(this->copied_host_to_device);
658
659 // Create cusparse vector for the input vector
660 cusparseDnVecDescr_t cusparse_input_vector;
662 cusparse_input_vector, this->num_columns,
663 const_cast<DataType*>(device_vector));
664
665 // Create cusparse vector for the output vector
666 cusparseDnVecDescr_t cusparse_output_vector;
668 cusparse_output_vector, this->num_rows, device_product);
669
670 // Matrix vector settings
671 DataType beta = cu_arithmetics::cast<float, DataType>(1.0f);
672 cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
673 cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
674
675 // Get device id
676 int device_id = CudaAPI<DataType>::get_device();
677
678 // Allocate device buffer (or reallocation if needed)
679 this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
680 cusparse_input_vector, cusparse_output_vector,
681 algorithm);
682
683 // Matrix vector multiplication
685 this->cusparse_handle[device_id], cusparse_operation, alpha,
686 this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
687 cusparse_output_vector, algorithm, this->device_buffer[device_id]);
688
689 // Destroy cusparse vectors
690 cusparse_api::destroy_cusparse_vector(cusparse_input_vector);
691 cusparse_api::destroy_cusparse_vector(cusparse_output_vector);
692}
693
694
695// ===============================
696// Explicit template instantiation
697// ===============================
698
699#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
700 template class cuCSRMatrix<__nv_fp8_e5m2>;
701#endif
702
703#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
704 template class cuCSRMatrix<__nv_fp8_e4m3>;
705#endif
706
707#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
708 template class cuCSRMatrix<__half>;
709#endif
710
711#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
712 template class cuCSRMatrix<__nv_bfloat16>;
713#endif
714
715#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
716 template class cuCSRMatrix<float>;
717#endif
718
719#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
720 template class cuCSRMatrix<double>;
721#endif
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Definition cuda_api.cu:191
static ArrayType * alloc(const size_t array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
Definition cuda_api.cu:39
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
Definition cuda_api.cu:169
static void alloc_bytes(void *&device_array, const size_t num_bytes)
Allocates memory on gpu device. This function uses an existing given pointer.
Definition cuda_api.cu:118
static int get_device()
Gets the current device in multi-gpu applications.
Definition cuda_api.cu:209
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Definition cuda_api.cu:145
Base class for cLinearOperator and cuLinearOperator . This class is not templated so that both cpp an...
Container for CSR matrices.
size_t * device_buffer_num_bytes
cuCSRMatrix()
Default constructor.
virtual ~cuCSRMatrix()
Destructor.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
Transposed-matrix vector product.
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Matrix vector product written in place.
LongIndexType get_nnz() const
Returns the number of non-zero elements of the sparse matrix.
virtual void dot(const DataType *device_vector, DataType *device_product)
Matrix vector product.
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Transposed-matrix vector product written in place.
void allocate_buffer(const int device_id, cusparseOperation_t cusparse_operation, const DataType alpha, const DataType beta, cusparseDnVecDescr_t &cusparse_input_vector, cusparseDnVecDescr_t &cusparse_output_vector, cusparseSpMVAlg_t algorithm)
Allocates an external buffer for matrix-vector multiplication using cusparseSpMV function.
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
void ** device_buffer
virtual FlagType is_identity_matrix() const
Checks whether the matrix is identity.
Base class for linear operators. This class serves as interface for all derived classes.
void initialize_cusparse_handle()
Creates a cusparseHandle_t object, if not created already.
Base class for constant matrices.
Definition cu_matrix.h:45
void omp_set_num_threads(int num_threads)
int omp_get_thread_num()
#define CUSPARSE_SPMV_ALG_DEFAULT
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
void cusparse_matvec(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, void *external_buffer)
void destroy_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix)
Destroy cusparse matrix.
void create_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector, const LongIndexType vector_size, DataType *RESTRICT device_vector)
void destroy_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector)
Destroys cusparse vector.
void cusparse_matrix_buffer_size(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, size_t *buffer_size)
void create_cusparse_csr_matrix(cusparseSpMatDescr_t &cusparse_matrix, const DataIndexType num_rows, const DataIndexType num_columns, const DataIndexType nnz, DataType *RESTRICT device_A_data, DataIndexType *RESTRICT device_A_indices, DataIndexType *RESTRICT device_A_index_pointer)
int LongIndexType
Definition types.h:60
int FlagType
Definition types.h:68