imate
C++/CUDA Reference
cu_csc_matrix.cu
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3  * SPDX-License-Identifier: BSD-3-Clause
4  * SPDX-FileType: SOURCE
5  *
6  * This program is free software: you can redistribute it and/or modify it
7  * under the terms of the license found in the LICENSE.txt file in the root
8  * directory of this source tree.
9  */
10 
11 
12 // =======
13 // Headers
14 // =======
15 
16 #include "./cu_csc_matrix.h"
17 #include <omp.h> // omp_set_num_threads
18 #include <cstddef> // NULL
19 #include <cassert> // assert
20 #include "../_cu_basic_algebra/cu_matrix_operations.h" // cuMatrixOperations
21 #include "../_cu_basic_algebra/cusparse_interface.h" // cusparse_interface
22 #include "../_cuda_utilities/cuda_interface.h" // CudaInterface
23 
24 
25 // =============
26 // constructor 1
27 // =============
28 
31 
32 template <typename DataType>
34  device_A_data(NULL),
35  device_A_indices(NULL),
36  device_A_index_pointer(NULL),
37  device_buffer(NULL),
38  device_buffer_num_bytes(NULL),
39  cusparse_matrix_A(NULL)
40 {
41 }
42 
43 
44 // =============
45 // constructor 2
46 // =============
47 
50 
51 template <typename DataType>
53  const DataType* A_data_,
54  const LongIndexType* A_indices_,
55  const LongIndexType* A_index_pointer_,
56  const LongIndexType num_rows_,
57  const LongIndexType num_columns_,
58  const int num_gpu_devices_):
59 
60  // Base class constructor
61  cLinearOperator<DataType>(num_rows_, num_columns_),
62  cCSCMatrix<DataType>(A_data_, A_indices_, A_index_pointer_, num_rows_,
63  num_columns_),
64  cuMatrix<DataType>(num_gpu_devices_),
65 
66  // Initializer list
67  device_A_data(NULL),
68  device_A_indices(NULL),
69  device_A_index_pointer(NULL),
70  device_buffer(NULL),
71  cusparse_matrix_A(NULL)
72 {
74  this->copy_host_to_device();
75 
76  // Initialize device buffer
77  this->device_buffer = new void*[this->num_gpu_devices];
78  this->device_buffer_num_bytes = new size_t[this->num_gpu_devices];
79  for (int device_id=0; device_id < this->num_gpu_devices; ++device_id)
80  {
81  this->device_buffer[device_id] = NULL;
82  this->device_buffer_num_bytes[device_id] = 0;
83  }
84 }
85 
86 
87 // ==========
88 // destructor
89 // ==========
90 
93 
94 template <typename DataType>
96 {
97  // Member objects exist if the second constructor was called.
98  if (this->copied_host_to_device)
99  {
100  // Deallocate arrays of data on gpu
101  for (int device_id=0; device_id < this->num_gpu_devices; ++device_id)
102  {
103  // Switch to a device
105 
106  // Deallocate
107  CudaInterface<DataType>::del(this->device_A_data[device_id]);
109  this->device_A_indices[device_id]);
111  this->device_A_index_pointer[device_id]);
112  CudaInterface<LongIndexType>::del(this->device_buffer[device_id]);
114  this->cusparse_matrix_A[device_id]);
115  }
116  }
117 
118  // Deallocate arrays of pointers on cpu
119  if (this->device_A_data != NULL)
120  {
121  delete[] this->device_A_data;
122  this->device_A_data = NULL;
123  }
124 
125  if (this->device_A_indices != NULL)
126  {
127  delete[] this->device_A_indices;
128  this->device_A_indices = NULL;
129  }
130 
131  if (this->device_A_index_pointer != NULL)
132  {
133  delete[] this->device_A_index_pointer;
134  this->device_A_index_pointer = NULL;
135  }
136 
137  if (this->device_buffer != NULL)
138  {
139  delete[] this->device_buffer;
140  this->device_buffer = NULL;
141  }
142 
143  if (this->device_buffer_num_bytes != NULL)
144  {
145  delete[] this->device_buffer_num_bytes;
146  this->device_buffer_num_bytes = NULL;
147  }
148 
149  if (this->cusparse_matrix_A != NULL)
150  {
151  delete[] this->cusparse_matrix_A;
152  this->cusparse_matrix_A = NULL;
153  }
154 }
155 
156 
157 // ===================
158 // copy host to device
159 // ===================
160 
167 
168 template <typename DataType>
170 {
171  if (!this->copied_host_to_device)
172  {
173  // Set the number of threads
174  omp_set_num_threads(this->num_gpu_devices);
175 
176  // Array sizes
177  LongIndexType A_data_size = this->get_nnz();
178  LongIndexType A_indices_size = A_data_size;
179  LongIndexType A_index_pointer_size = this->num_rows + 1;
180  LongIndexType A_nnz = this->get_nnz();
181 
182  // Swapping the number of rows and columns to treat the input CSC
183  // matrix as a CSR matrix.
184  LongIndexType csc_num_rows = this->num_columns;
185  LongIndexType csc_num_columns = this->num_rows;
186 
187  // Create array of pointers for data on each gpu device
188  this->device_A_data = new DataType*[this->num_gpu_devices];
189  this->device_A_indices = new LongIndexType*[this->num_gpu_devices];
190  this->device_A_index_pointer = \
191  new LongIndexType*[this->num_gpu_devices];
192  this->cusparse_matrix_A = \
193  new cusparseSpMatDescr_t[this->num_gpu_devices];
194 
195  #pragma omp parallel
196  {
197  // Switch to a device with the same device id as the cpu thread id
198  unsigned int thread_id = omp_get_thread_num();
200 
201  // A_data
202  CudaInterface<DataType>::alloc(this->device_A_data[thread_id],
203  A_data_size);
205  this->A_data, A_data_size, this->device_A_data[thread_id]);
206 
207  // A_indices
209  this->device_A_indices[thread_id], A_indices_size);
211  this->A_indices, A_indices_size,
212  this->device_A_indices[thread_id]);
213 
214  // A_index_pointer
216  this->device_A_index_pointer[thread_id],
217  A_index_pointer_size);
219  this->A_index_pointer, A_index_pointer_size,
220  this->device_A_index_pointer[thread_id]);
221 
222  // Create cusparse matrix
224  this->cusparse_matrix_A[thread_id], csc_num_rows,
225  csc_num_columns, A_nnz, this->device_A_data[thread_id],
226  this->device_A_indices[thread_id],
227  this->device_A_index_pointer[thread_id]);
228  }
229 
230  // Flag to prevent reinitialization
231  this->copied_host_to_device = true;
232  }
233 }
234 
235 
236 // ===============
237 // allocate buffer
238 // ===============
239 
248 
249 template <typename DataType>
251  const int device_id,
252  cusparseOperation_t cusparse_operation,
253  const DataType alpha,
254  const DataType beta,
255  cusparseDnVecDescr_t& cusparse_input_vector,
256  cusparseDnVecDescr_t& cusparse_output_vector,
257  cusparseSpMVAlg_t algorithm)
258 {
259  // Find the buffer size needed for matrix-vector multiplication
260  size_t required_buffer_size;
262  this->cusparse_handle[device_id], cusparse_operation, alpha,
263  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
264  cusparse_output_vector, algorithm, &required_buffer_size);
265 
266  if (this->device_buffer_num_bytes[device_id] != required_buffer_size)
267  {
268  // Update the buffer size
269  this->device_buffer_num_bytes[device_id] = required_buffer_size;
270 
271  // Delete buffer if it was allocated previously
272  CudaInterface<DataType>::del(this->device_buffer[device_id]);
273 
274  // Allocate (or reallocate) buffer on device.
276  this->device_buffer[device_id],
277  this->device_buffer_num_bytes[device_id]);
278  }
279 }
280 
281 
282 // ===
283 // dot
284 // ===
285 
286 template <typename DataType>
288  const DataType* device_vector,
289  DataType* device_product)
290 {
291  assert(this->copied_host_to_device);
292 
293  // Create cusparse vector for the input vector
294  cusparseDnVecDescr_t cusparse_input_vector;
296  cusparse_input_vector, this->num_columns,
297  const_cast<DataType*>(device_vector));
298 
299  // Create cusparse vector for the output vector
300  cusparseDnVecDescr_t cusparse_output_vector;
302  cusparse_output_vector, this->num_rows, device_product);
303 
304  // Matrix vector settings
305  DataType alpha = 1.0;
306  DataType beta = 0.0;
307 
308  // Using transpose operation since we treat CSC matrix as CSR
309  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
310  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
311 
312  // Get device id
313  int device_id = CudaInterface<DataType>::get_device();
314 
315  // Allocate device buffer (or reallocation if needed)
316  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
317  cusparse_input_vector, cusparse_output_vector,
318  algorithm);
319 
320  // Matrix vector multiplication
322  this->cusparse_handle[device_id], cusparse_operation, alpha,
323  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
324  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
325 
326  // Destroy cusparse vectors
327  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
328  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
329 }
330 
331 
332 // ========
333 // dot plus
334 // ========
335 
336 template <typename DataType>
338  const DataType* device_vector,
339  const DataType alpha,
340  DataType* device_product)
341 {
342  assert(this->copied_host_to_device);
343 
344  // Create cusparse vector for the input vector
345  cusparseDnVecDescr_t cusparse_input_vector;
347  cusparse_input_vector, this->num_columns,
348  const_cast<DataType*>(device_vector));
349 
350  // Create cusparse vector for the output vector
351  cusparseDnVecDescr_t cusparse_output_vector;
353  cusparse_output_vector, this->num_rows, device_product);
354 
355  // Matrix vector settings
356  DataType beta = 1.0;
357 
358  // Using transpose operation since we treat CSC matrix as CSR
359  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
360  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
361 
362  // Get device id
363  int device_id = CudaInterface<DataType>::get_device();
364 
365  // Allocate device buffer (or reallocation if needed)
366  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
367  cusparse_input_vector, cusparse_output_vector,
368  algorithm);
369 
370  // Matrix vector multiplication
372  this->cusparse_handle[device_id], cusparse_operation, alpha,
373  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
374  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
375 
376  // Destroy cusparse vectors
377  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
378  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
379 }
380 
381 
382 // =============
383 // transpose dot
384 // =============
385 
386 template <typename DataType>
388  const DataType* device_vector,
389  DataType* device_product)
390 {
391  assert(this->copied_host_to_device);
392 
393  // Create cusparse vector for the input vector
394  cusparseDnVecDescr_t cusparse_input_vector;
396  cusparse_input_vector, this->num_columns,
397  const_cast<DataType*>(device_vector));
398 
399  // Create cusparse vector for the output vector
400  cusparseDnVecDescr_t cusparse_output_vector;
402  cusparse_output_vector, this->num_rows, device_product);
403 
404  // Matrix vector settings
405  DataType alpha = 1.0;
406  DataType beta = 0.0;
407 
408  // Using non-transpose operation since we treat CSC matrix as CSR
409  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
410  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
411 
412  // Get device id
413  int device_id = CudaInterface<DataType>::get_device();
414 
415  // Allocate device buffer (or reallocation if needed)
416  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
417  cusparse_input_vector, cusparse_output_vector,
418  algorithm);
419 
420  // Matrix vector multiplication
422  this->cusparse_handle[device_id], cusparse_operation, alpha,
423  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
424  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
425 
426  // Destroy cusparse vectors
427  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
428  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
429 }
430 
431 
432 // ==================
433 // transpose dot plus
434 // ==================
435 
436 template <typename DataType>
438  const DataType* device_vector,
439  const DataType alpha,
440  DataType* device_product)
441 {
442  assert(this->copied_host_to_device);
443 
444  // Create cusparse vector for the input vector
445  cusparseDnVecDescr_t cusparse_input_vector;
447  cusparse_input_vector, this->num_columns,
448  const_cast<DataType*>(device_vector));
449 
450  // Create cusparse vector for the output vector
451  cusparseDnVecDescr_t cusparse_output_vector;
453  cusparse_output_vector, this->num_rows, device_product);
454 
455  // Matrix vector settings
456  DataType beta = 1.0;
457 
458  // Using non-transpose operation since we treat CSC matrix as CSR
459  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
460  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
461 
462  // Get device id
463  int device_id = CudaInterface<DataType>::get_device();
464 
465  // Allocate device buffer (or reallocation if needed)
466  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
467  cusparse_input_vector, cusparse_output_vector,
468  algorithm);
469 
470  // Matrix vector multiplication
472  this->cusparse_handle[device_id], cusparse_operation, alpha,
473  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
474  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
475 
476  // Destroy cusparse vectors
477  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
478  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
479 }
480 
481 
482 // ===============================
483 // Explicit template instantiation
484 // ===============================
485 
486 template class cuCSCMatrix<float>;
487 template class cuCSCMatrix<double>;
static int get_device()
Gets the current device in multi-gpu applications.
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static void alloc_bytes(void *&device_array, const size_t num_bytes)
Allocates memory on gpu device. This function uses an existing given pointer.
static ArrayType * alloc(const LongIndexType array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void copy_to_device(const ArrayType *host_array, const LongIndexType array_size, ArrayType *device_array)
Copies memory on host to device memory.
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Base class for linear operators. This class serves as interface for all derived classes.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
virtual void dot(const DataType *device_vector, DataType *device_product)
size_t * device_buffer_num_bytes
Definition: cu_csc_matrix.h:86
cuCSCMatrix()
Default constructor.
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
void ** device_buffer
Definition: cu_csc_matrix.h:85
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
void allocate_buffer(const int device_id, cusparseOperation_t cusparse_operation, const DataType alpha, const DataType beta, cusparseDnVecDescr_t &cusparse_input_vector, cusparseDnVecDescr_t &cusparse_output_vector, cusparseSpMVAlg_t algorithm)
Allocates an external buffer for matrix-vector multiplication using cusparseSpMV function.
virtual ~cuCSCMatrix()
Virtual desructor.
void initialize_cusparse_handle()
Creates a cusparseHandle_t object, if not created already.
Base class for constant matrices.
Definition: cu_matrix.h:41
#define CUSPARSE_SPMV_ALG_DEFAULT
void create_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix, const LongIndexType num_rows, const LongIndexType num_columns, const LongIndexType nnz, DataType *device_A_data, LongIndexType *device_A_indices, LongIndexType *device_A_index_pointer)
void destroy_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix)
Destroys cusparse matrix.
void destroy_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector)
Destroys cusparse vector.
void cusparse_matvec(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, void *external_buffer)
void create_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector, const LongIndexType vector_size, DataType *device_vector)
void cusparse_matrix_buffer_size(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, size_t *buffer_size)
int LongIndexType
Definition: types.h:60