imate
C++/CUDA Reference
cu_csr_matrix.cu
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3  * SPDX-License-Identifier: BSD-3-Clause
4  * SPDX-FileType: SOURCE
5  *
6  * This program is free software: you can redistribute it and/or modify it
7  * under the terms of the license found in the LICENSE.txt file in the root
8  * directory of this source tree.
9  */
10 
11 
12 // =======
13 // Headers
14 // =======
15 
16 #include "./cu_csr_matrix.h"
17 #include <omp.h> // omp_set_num_threads
18 #include <cstddef> // NULL
19 #include <cassert> // assert
20 #include "../_cu_basic_algebra/cu_matrix_operations.h" // cuMatrixOperations
21 #include "../_cu_basic_algebra/cusparse_interface.h" // cusparse_interface
22 #include "../_cuda_utilities/cuda_interface.h" // CudaInterface
23 
24 
25 // =============
26 // constructor 1
27 // =============
28 
31 
32 template <typename DataType>
34  device_A_data(NULL),
35  device_A_indices(NULL),
36  device_A_index_pointer(NULL),
37  device_buffer(NULL),
38  device_buffer_num_bytes(NULL),
39  cusparse_matrix_A(NULL)
40 {
41 }
42 
43 
44 // =============
45 // constructor 2
46 // =============
47 
50 
51 template <typename DataType>
53  const DataType* A_data_,
54  const LongIndexType* A_indices_,
55  const LongIndexType* A_index_pointer_,
56  const LongIndexType num_rows_,
57  const LongIndexType num_columns_,
58  const int num_gpu_devices_):
59 
60  // Base class constructor
61  cLinearOperator<DataType>(num_rows_, num_columns_),
62  cCSRMatrix<DataType>(A_data_, A_indices_, A_index_pointer_, num_rows_,
63  num_columns_),
64  cuMatrix<DataType>(num_gpu_devices_),
65 
66  // Initializer list
67  device_A_data(NULL),
68  device_A_indices(NULL),
69  device_A_index_pointer(NULL),
70  device_buffer(NULL),
71  cusparse_matrix_A(NULL)
72 {
74  this->copy_host_to_device();
75 
76  // Initialize device buffer
77  this->device_buffer = new void*[this->num_gpu_devices];
78  this->device_buffer_num_bytes = new size_t[this->num_gpu_devices];
79  for (int device_id=0; device_id < this->num_gpu_devices; ++device_id)
80  {
81  this->device_buffer[device_id] = NULL;
82  this->device_buffer_num_bytes[device_id] = 0;
83  }
84 }
85 
86 
87 // ==========
88 // destructor
89 // ==========
90 
93 
94 template <typename DataType>
96 {
97  // Member objects exist if the second constructor was called.
98  if (this->copied_host_to_device)
99  {
100  // Deallocate arrays of data on gpu
101  for (int device_id=0; device_id < this->num_gpu_devices; ++device_id)
102  {
103  // Switch to a device
105 
106  // Deallocate
107  CudaInterface<DataType>::del(this->device_A_data[device_id]);
109  this->device_A_indices[device_id]);
111  this->device_A_index_pointer[device_id]);
112  CudaInterface<LongIndexType>::del(this->device_buffer[device_id]);
114  this->cusparse_matrix_A[device_id]);
115  }
116  }
117 
118  // Deallocate arrays of pointers on cpu
119  if (this->device_A_data != NULL)
120  {
121  delete[] this->device_A_data;
122  this->device_A_data = NULL;
123  }
124 
125  if (this->device_A_indices != NULL)
126  {
127  delete[] this->device_A_indices;
128  this->device_A_indices = NULL;
129  }
130 
131  if (this->device_A_index_pointer != NULL)
132  {
133  delete[] this->device_A_index_pointer;
134  this->device_A_index_pointer = NULL;
135  }
136 
137  if (this->device_buffer != NULL)
138  {
139  delete[] this->device_buffer;
140  this->device_buffer = NULL;
141  }
142 
143  if (this->device_buffer_num_bytes != NULL)
144  {
145  delete[] this->device_buffer_num_bytes;
146  this->device_buffer_num_bytes = NULL;
147  }
148 
149  if (this->cusparse_matrix_A != NULL)
150  {
151  delete[] this->cusparse_matrix_A;
152  this->cusparse_matrix_A = NULL;
153  }
154 }
155 
156 
157 // ===================
158 // copy host to device
159 // ===================
160 
163 
164 template <typename DataType>
166 {
167  if (!this->copied_host_to_device)
168  {
169  // Set the number of threads
170  omp_set_num_threads(this->num_gpu_devices);
171 
172  // Array sizes
173  LongIndexType A_data_size = this->get_nnz();
174  LongIndexType A_indices_size = A_data_size;
175  LongIndexType A_index_pointer_size = this->num_rows + 1;
176  LongIndexType A_nnz = this->get_nnz();
177 
178  // Create array of pointers for data on each gpu device
179  this->device_A_data = new DataType*[this->num_gpu_devices];
180  this->device_A_indices = new LongIndexType*[this->num_gpu_devices];
181  this->device_A_index_pointer = \
182  new LongIndexType*[this->num_gpu_devices];
183  this->cusparse_matrix_A = \
184  new cusparseSpMatDescr_t[this->num_gpu_devices];
185 
186  #pragma omp parallel
187  {
188  // Switch to a device with the same device id as the cpu thread id
189  unsigned int thread_id = omp_get_thread_num();
191 
192  // A_data
193  CudaInterface<DataType>::alloc(this->device_A_data[thread_id],
194  A_data_size);
196  this->A_data, A_data_size, this->device_A_data[thread_id]);
197 
198  // A_indices
200  this->device_A_indices[thread_id], A_indices_size);
202  this->A_indices, A_indices_size,
203  this->device_A_indices[thread_id]);
204 
205  // A_index_pointer
207  this->device_A_index_pointer[thread_id],
208  A_index_pointer_size);
210  this->A_index_pointer, A_index_pointer_size,
211  this->device_A_index_pointer[thread_id]);
212 
213  // Create cusparse matrix
215  this->cusparse_matrix_A[thread_id], this->num_rows,
216  this->num_columns, A_nnz, this->device_A_data[thread_id],
217  this->device_A_indices[thread_id],
218  this->device_A_index_pointer[thread_id]);
219  }
220 
221  // Flag to prevent reinitialization
222  this->copied_host_to_device = true;
223  }
224 }
225 
226 
227 // ===============
228 // allocate buffer
229 // ===============
230 
239 
240 template <typename DataType>
242  const int device_id,
243  cusparseOperation_t cusparse_operation,
244  const DataType alpha,
245  const DataType beta,
246  cusparseDnVecDescr_t& cusparse_input_vector,
247  cusparseDnVecDescr_t& cusparse_output_vector,
248  cusparseSpMVAlg_t algorithm)
249 {
250  // Find the buffer size needed for matrix-vector multiplication
251  size_t required_buffer_size;
253  this->cusparse_handle[device_id], cusparse_operation, alpha,
254  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
255  cusparse_output_vector, algorithm, &required_buffer_size);
256 
257  if (this->device_buffer_num_bytes[device_id] != required_buffer_size)
258  {
259  // Update the buffer size
260  this->device_buffer_num_bytes[device_id] = required_buffer_size;
261 
262  // Delete buffer if it was allocated previously
263  CudaInterface<DataType>::del(this->device_buffer[device_id]);
264 
265  // Allocate (or reallocate) buffer on device.
267  this->device_buffer[device_id],
268  this->device_buffer_num_bytes[device_id]);
269  }
270 }
271 
272 
273 // ===
274 // dot
275 // ===
276 
277 template <typename DataType>
279  const DataType* device_vector,
280  DataType* device_product)
281 {
282  assert(this->copied_host_to_device);
283 
284  // Create cusparse vector for the input vector
285  cusparseDnVecDescr_t cusparse_input_vector;
287  cusparse_input_vector, this->num_columns,
288  const_cast<DataType*>(device_vector));
289 
290  // Create cusparse vector for the output vector
291  cusparseDnVecDescr_t cusparse_output_vector;
293  cusparse_output_vector, this->num_rows, device_product);
294 
295  // Matrix vector settings
296  DataType alpha = 1.0;
297  DataType beta = 0.0;
298  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
299  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
300 
301  // Get device id
302  int device_id = CudaInterface<DataType>::get_device();
303 
304  // Allocate device buffer (or reallocation if needed)
305  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
306  cusparse_input_vector, cusparse_output_vector,
307  algorithm);
308 
309  // Matrix vector multiplication
311  this->cusparse_handle[device_id], cusparse_operation, alpha,
312  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
313  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
314 
315  // Destroy cusparse vectors
316  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
317  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
318 }
319 
320 
321 // ========
322 // dot plus
323 // ========
324 
325 template <typename DataType>
327  const DataType* device_vector,
328  const DataType alpha,
329  DataType* device_product)
330 {
331  assert(this->copied_host_to_device);
332 
333  // Create cusparse vector for the input vector
334  cusparseDnVecDescr_t cusparse_input_vector;
336  cusparse_input_vector, this->num_columns,
337  const_cast<DataType*>(device_vector));
338 
339  // Create cusparse vector for the output vector
340  cusparseDnVecDescr_t cusparse_output_vector;
342  cusparse_output_vector, this->num_rows, device_product);
343 
344  // Matrix vector settings
345  DataType beta = 1.0;
346  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
347  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
348 
349  // Get device id
350  int device_id = CudaInterface<DataType>::get_device();
351 
352  // Allocate device buffer (or reallocation if needed)
353  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
354  cusparse_input_vector, cusparse_output_vector,
355  algorithm);
356 
357  // Matrix vector multiplication
359  this->cusparse_handle[device_id], cusparse_operation, alpha,
360  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
361  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
362 
363  // Destroy cusparse vectors
364  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
365  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
366 }
367 
368 
369 // =============
370 // transpose dot
371 // =============
372 
373 template <typename DataType>
375  const DataType* device_vector,
376  DataType* device_product)
377 {
378  assert(this->copied_host_to_device);
379 
380  // Create cusparse vector for the input vector
381  cusparseDnVecDescr_t cusparse_input_vector;
383  cusparse_input_vector, this->num_columns,
384  const_cast<DataType*>(device_vector));
385 
386  // Create cusparse vector for the output vector
387  cusparseDnVecDescr_t cusparse_output_vector;
389  cusparse_output_vector, this->num_rows, device_product);
390 
391  // Matrix vector settings
392  DataType alpha = 1.0;
393  DataType beta = 0.0;
394  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
395  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
396 
397  // Get device id
398  int device_id = CudaInterface<DataType>::get_device();
399 
400  // Allocate device buffer (or reallocation if needed)
401  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
402  cusparse_input_vector, cusparse_output_vector,
403  algorithm);
404 
405  // Matrix vector multiplication
407  this->cusparse_handle[device_id], cusparse_operation, alpha,
408  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
409  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
410 
411  // Destroy cusparse vectors
412  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
413  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
414 }
415 
416 
417 // ==================
418 // transpose dot plus
419 // ==================
420 
421 template <typename DataType>
423  const DataType* device_vector,
424  const DataType alpha,
425  DataType* device_product)
426 {
427  assert(this->copied_host_to_device);
428 
429  // Create cusparse vector for the input vector
430  cusparseDnVecDescr_t cusparse_input_vector;
432  cusparse_input_vector, this->num_columns,
433  const_cast<DataType*>(device_vector));
434 
435  // Create cusparse vector for the output vector
436  cusparseDnVecDescr_t cusparse_output_vector;
438  cusparse_output_vector, this->num_rows, device_product);
439 
440  // Matrix vector settings
441  DataType beta = 1.0;
442  cusparseOperation_t cusparse_operation = CUSPARSE_OPERATION_TRANSPOSE;
443  cusparseSpMVAlg_t algorithm = CUSPARSE_SPMV_ALG_DEFAULT;
444 
445  // Get device id
446  int device_id = CudaInterface<DataType>::get_device();
447 
448  // Allocate device buffer (or reallocation if needed)
449  this->allocate_buffer(device_id, cusparse_operation, alpha, beta,
450  cusparse_input_vector, cusparse_output_vector,
451  algorithm);
452 
453  // Matrix vector multiplication
455  this->cusparse_handle[device_id], cusparse_operation, alpha,
456  this->cusparse_matrix_A[device_id], cusparse_input_vector, beta,
457  cusparse_output_vector, algorithm, this->device_buffer[device_id]);
458 
459  // Destroy cusparse vectors
460  cusparse_interface::destroy_cusparse_vector(cusparse_input_vector);
461  cusparse_interface::destroy_cusparse_vector(cusparse_output_vector);
462 }
463 
464 
465 // ===============================
466 // Explicit template instantiation
467 // ===============================
468 
469 template class cuCSRMatrix<float>;
470 template class cuCSRMatrix<double>;
static int get_device()
Gets the current device in multi-gpu applications.
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static void alloc_bytes(void *&device_array, const size_t num_bytes)
Allocates memory on gpu device. This function uses an existing given pointer.
static ArrayType * alloc(const LongIndexType array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void copy_to_device(const ArrayType *host_array, const LongIndexType array_size, ArrayType *device_array)
Copies memory on host to device memory.
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Base class for linear operators. This class serves as interface for all derived classes.
size_t * device_buffer_num_bytes
Definition: cu_csr_matrix.h:86
cuCSRMatrix()
Default constructor.
virtual ~cuCSRMatrix()
Virtual desructor.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
virtual void dot(const DataType *device_vector, DataType *device_product)
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
void allocate_buffer(const int device_id, cusparseOperation_t cusparse_operation, const DataType alpha, const DataType beta, cusparseDnVecDescr_t &cusparse_input_vector, cusparseDnVecDescr_t &cusparse_output_vector, cusparseSpMVAlg_t algorithm)
Allocates an external buffer for matrix-vector multiplication using cusparseSpMV function.
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
void ** device_buffer
Definition: cu_csr_matrix.h:85
void initialize_cusparse_handle()
Creates a cusparseHandle_t object, if not created already.
Base class for constant matrices.
Definition: cu_matrix.h:41
#define CUSPARSE_SPMV_ALG_DEFAULT
void create_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix, const LongIndexType num_rows, const LongIndexType num_columns, const LongIndexType nnz, DataType *device_A_data, LongIndexType *device_A_indices, LongIndexType *device_A_index_pointer)
void destroy_cusparse_matrix(cusparseSpMatDescr_t &cusparse_matrix)
Destroys cusparse matrix.
void destroy_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector)
Destroys cusparse vector.
void cusparse_matvec(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, void *external_buffer)
void create_cusparse_vector(cusparseDnVecDescr_t &cusparse_vector, const LongIndexType vector_size, DataType *device_vector)
void cusparse_matrix_buffer_size(cusparseHandle_t cusparse_handle, cusparseOperation_t cusparse_operation, const DataType alpha, cusparseSpMatDescr_t cusparse_matrix, cusparseDnVecDescr_t cusparse_input_vector, const DataType beta, cusparseDnVecDescr_t cusparse_output_vector, cusparseSpMVAlg_t algorithm, size_t *buffer_size)
int LongIndexType
Definition: types.h:60