imate
C++/CUDA Reference
cu_dense_matrix.cu
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3  * SPDX-License-Identifier: BSD-3-Clause
4  * SPDX-FileType: SOURCE
5  *
6  * This program is free software: you can redistribute it and/or modify it
7  * under the terms of the license found in the LICENSE.txt file in the root
8  * directory of this source tree.
9  */
10 
11 
12 // =======
13 // Headers
14 // =======
15 
16 #include "./cu_dense_matrix.h"
17 #include <omp.h> // omp_set_num_threads
18 #include <cstddef> // NULL
19 #include <cassert> // assert
20 #include "../_cu_basic_algebra/cu_matrix_operations.h" // cuMatrixOperations
21 #include "../_cuda_utilities/cuda_interface.h" // alloc, copy_to_device, del
22 
23 
24 // =============
25 // constructor 1
26 // =============
27 
28 template <typename DataType>
30  device_A(NULL)
31 {
32 }
33 
34 
35 // =============
36 // constructor 2
37 // =============
38 
39 template <typename DataType>
41  const DataType* A_,
42  const LongIndexType num_rows_,
43  const LongIndexType num_columns_,
44  const FlagType A_is_row_major_,
45  const int num_gpu_devices_):
46 
47  // Base class constructor
48  cLinearOperator<DataType>(num_rows_, num_columns_),
49  cDenseMatrix<DataType>(A_, num_rows_, num_columns_, A_is_row_major_),
50  cuMatrix<DataType>(num_gpu_devices_),
51 
52  // Initializer list
53  device_A(NULL)
54 {
56  this->copy_host_to_device();
57 }
58 
59 
60 // ==========
61 // destructor
62 // ==========
63 
64 
65 template <typename DataType>
67 {
68  // Member objects exist if the second constructor was called.
69  if (this->copied_host_to_device)
70  {
71  // Deallocate arrays of data on gpu
72  for (int device_id = 0; device_id < this->num_gpu_devices; ++device_id)
73  {
74  // Switch to a device
76 
77  // Deallocate
78  CudaInterface<DataType>::del(this->device_A[device_id]);
79  }
80 
81  delete[] this->device_A;
82  this->device_A = NULL;
83  }
84 }
85 
86 
87 // ===================
88 // copy host to device
89 // ===================
90 
93 
94 template <typename DataType>
96 {
97  if (!this->copied_host_to_device)
98  {
99  // Set the number of threads
100  omp_set_num_threads(this->num_gpu_devices);
101 
102  // Create array of pointers for data on each gpu device
103  this->device_A = new DataType*[this->num_gpu_devices];
104 
105  // Size of data
106  LongIndexType A_size = this->num_rows * this->num_columns;
107 
108  #pragma omp parallel
109  {
110  // Switch to a device with the same device id as the cpu thread id
111  unsigned int thread_id = omp_get_thread_num();
113 
114  // Allocate device memory and copy data from host
115  CudaInterface<DataType>::alloc(this->device_A[thread_id], A_size);
117  this->device_A[thread_id]);
118  }
119 
120  // Flag to prevent reinitialization
121  this->copied_host_to_device = true;
122  }
123 }
124 
125 
126 // ===
127 // dot
128 // ===
129 
130 template <typename DataType>
132  const DataType* device_vector,
133  DataType* device_product)
134 {
135  assert(this->copied_host_to_device);
136 
137  // Get device id
138  int device_id = CudaInterface<DataType>::get_device();
139 
141  this->cublas_handle[device_id],
142  this->device_A[device_id],
143  device_vector,
144  this->num_rows,
145  this->num_columns,
146  this->A_is_row_major,
147  device_product);
148 }
149 
150 
151 // ========
152 // dot plus
153 // ========
154 
155 template <typename DataType>
157  const DataType* device_vector,
158  const DataType alpha,
159  DataType* device_product)
160 {
161  assert(this->copied_host_to_device);
162 
163  // Get device id
164  int device_id = CudaInterface<DataType>::get_device();
165 
167  this->cublas_handle[device_id],
168  this->device_A[device_id],
169  device_vector,
170  alpha,
171  this->num_rows,
172  this->num_columns,
173  this->A_is_row_major,
174  device_product);
175 }
176 
177 
178 // =============
179 // transpose dot
180 // =============
181 
182 template <typename DataType>
184  const DataType* device_vector,
185  DataType* device_product)
186 {
187  assert(this->copied_host_to_device);
188 
189  // Get device id
190  int device_id = CudaInterface<DataType>::get_device();
191 
193  this->cublas_handle[device_id],
194  this->device_A[device_id],
195  device_vector,
196  this->num_rows,
197  this->num_columns,
198  this->A_is_row_major,
199  device_product);
200 }
201 
202 
203 // ==================
204 // transpose dot plus
205 // ==================
206 
207 template <typename DataType>
209  const DataType* device_vector,
210  const DataType alpha,
211  DataType* device_product)
212 {
213  assert(this->copied_host_to_device);
214 
215  // Get device id
216  int device_id = CudaInterface<DataType>::get_device();
217 
219  this->cublas_handle[device_id],
220  this->device_A[device_id],
221  device_vector,
222  alpha,
223  this->num_rows,
224  this->num_columns,
225  this->A_is_row_major,
226  device_product);
227 }
228 
229 
230 // ===============================
231 // Explicit template instantiation
232 // ===============================
233 
234 template class cuDenseMatrix<float>;
235 template class cuDenseMatrix<double>;
static int get_device()
Gets the current device in multi-gpu applications.
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static ArrayType * alloc(const LongIndexType array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void copy_to_device(const ArrayType *host_array, const LongIndexType array_size, ArrayType *device_array)
Copies memory on host to device memory.
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Base class for linear operators. This class serves as interface for all derived classes.
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
virtual ~cuDenseMatrix()
virtual void dot(const DataType *device_vector, DataType *device_product)
void initialize_cublas_handle()
Creates a cublasHandle_t object, if not created already.
static void dense_matvec(cublasHandle_t cublas_handle, const DataType *A, const DataType *b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *c)
Computes the matrix vector multiplication $\boldsymbol{c} = \mathbf{A} \boldsymbol{b}$ where $\mathbf{A}$ is a dense matrix.
static void dense_transposed_matvec_plus(cublasHandle_t cublas_handle, const DataType *A, const DataType *b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *c)
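Computes $\boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A}^{\intercal} \boldsymbol{b}$ where $\mathbf{A}$ is dense, and $\mathbf{A}^{\intercal}$ is the transpose of the matrix $\mathbf{A}$.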
static void dense_transposed_matvec(cublasHandle_t cublas_handle, const DataType *A, const DataType *b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *c)
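Computes matrix vector multiplication $\boldsymbol{c} = \mathbf{A}^{\intercal} \boldsymbol{b}$ where $\mathbf{A}$ is dense, and $\mathbf{A}^{\intercal}$ is the transpose of the matrix $\mathbf{A}$.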
static void dense_matvec_plus(cublasHandle_t cublas_handle, const DataType *A, const DataType *b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *c)
Computes the operation $\boldsymbol{c} = \boldsymbol{c} + \alpha \mathbf{A} \boldsymbol{b}$ where $\mathbf{A}$ is a dense matrix.
Base class for constant matrices.
Definition: cu_matrix.h:41
int LongIndexType
Definition: types.h:60
int FlagType
Definition: types.h:68