imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cu_dense_matrix.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
16#include "./cu_dense_matrix.h"
17#include "../_definitions/definitions.h" // USE_OPENMP
18#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
19 // __half, __nv_bfloat16
20
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
22 #include <omp.h> // omp_set_num_threads
23#endif
24
25#include <cstddef> // NULL
26#include <cassert> // assert
27#include "../_cu_arithmetics/cu_arithmetics.h" // cu_arithmetics
28#include "../_cu_basic_algebra/cu_matrix_operations.h" // cuMatrixOperations
29#include "../_cuda_utilities/cuda_api.h" // alloc, copy_to_device, del
30
31
32// =============
33// constructor 1
34// =============
35
38
39template <typename DataType>
41
42 // Initializer list
43 A(NULL),
44 device_A(NULL),
45 A_is_row_major(0)
46{
47}
48
49
50// =============
51// constructor 2
52// =============
53
75
76template <typename DataType>
78 const DataType* A_,
79 const LongIndexType num_rows_,
80 const LongIndexType num_columns_,
81 const FlagType A_is_row_major_,
82 const FlagType A_is_symmetric_,
83 const int num_gpu_devices_):
84
85 // Base class constructor
86 cLinearOperatorBase(num_rows_, num_columns_),
87 cuLinearOperator<DataType>(num_gpu_devices_),
88 cuMatrix<DataType>(A_is_row_major_),
89
90 // Initializer list
91 A(A_),
92 device_A(NULL),
93 A_is_row_major(A_is_row_major_)
94{
96 this->copy_host_to_device();
97}
98
99
100// ==========
101// destructor
102// ==========
103
106
107template <typename DataType>
109{
110 // Member objects exist if the second constructor was called.
111 if (this->copied_host_to_device)
112 {
113 // Deallocate arrays of data on gpu
114 for (int device_id = 0; device_id < this->num_gpu_devices; ++device_id)
115 {
116 // Switch to a device
118
119 // Deallocate
120 CudaAPI<DataType>::del(this->device_A[device_id]);
121 }
122
123 delete[] this->device_A;
124 this->device_A = NULL;
125 }
126}
127
128
129// ===================
130// copy host to device
131// ===================
132
135
136template <typename DataType>
138{
139 if (!this->copied_host_to_device)
140 {
141 // Set the number of threads
142 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
143 omp_set_num_threads(this->num_gpu_devices);
144 #endif
145
146 // Create array of pointers for data on each gpu device
147 this->device_A = new DataType*[this->num_gpu_devices];
148
149 // Size of data
150 size_t A_size = static_cast<size_t>(this->num_rows) * \
151 static_cast<size_t>(this->num_columns);
152
153 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
154 #pragma omp parallel
155 #endif
156 {
157 // Switch to a device with the same device id as the cpu thread id
158 unsigned int thread_id;
159 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
160 thread_id = omp_get_thread_num();
161 #else
162 thread_id = 0;
163 #endif
164
166
167 // Allocate device memory and copy data from host
168 CudaAPI<DataType>::alloc(this->device_A[thread_id], A_size);
169 CudaAPI<DataType>::copy_to_device(this->A, A_size,
170 this->device_A[thread_id]);
171 }
172
173 // Flag to prevent reinitialization
174 this->copied_host_to_device = true;
175 }
176}
177
178
179// ==================
180// is identity matrix
181// ==================
182
191
192template <typename DataType>
194{
195 FlagType matrix_is_identity = 1;
196 DataType matrix_element;
197 const DataType diagonal = 1.0;
198 const DataType off_diagonal = 0.0;
199
200 // Check matrix element-wise
201 if (this->A_is_row_major)
202 {
203 // Row-major matrix
204 LongIndexType column;
205 LongIndexType num_checking_columns;
206
207 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
208 #pragma omp parallel for \
209 schedule(static) \
210 if (!omp_in_parallel()) \
211 default(none) \
212 shared(matrix_is_identity, diagonal, off_diagonal) \
213 private(column, num_checking_columns, matrix_element)
214 #endif
215 for (LongIndexType row=0; row < this->num_rows; ++row)
216 {
217 if (matrix_is_identity)
218 {
219 if (this->A_is_symmetric)
220 {
221 // Check only half of the columns up to diagonal element
222 num_checking_columns = row + 1;
223 }
224 else
225 {
226 num_checking_columns = this->num_columns;
227 }
228
229 for (column=0; column < num_checking_columns; ++column)
230 {
231 // Get an element of the matrix
232 matrix_element = this->A[row * this->num_columns + column];
233
234 // Check the value of element with identity matrix
235 if (((row == column) && \
236 (!cu_arithmetics::is_equal(matrix_element,
237 diagonal))) || \
238 ((row != column) && \
239 (!cu_arithmetics::is_equal(matrix_element,
240 off_diagonal))))
241 {
242 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
243 #pragma omp atomic write
244 #endif
245 matrix_is_identity = 0;
246
247 break;
248 }
249 }
250 }
251 }
252 }
253 else
254 {
255 // Column-major matrix
256 LongIndexType row;
257 LongIndexType num_checking_rows;
258
259 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
260 #pragma omp parallel for \
261 schedule(static) \
262 if (!omp_in_parallel()) \
263 default(none) \
264 shared(matrix_is_identity, diagonal, off_diagonal) \
265 private(row, num_checking_rows, matrix_element)
266 #endif
267 for (LongIndexType column=0; column < this-> num_columns; ++column)
268 {
269 if (matrix_is_identity)
270 {
271 if (this->A_is_symmetric)
272 {
273 // Check only half of the rows up to diagonal element
274 num_checking_rows = column + 1;
275 }
276 else
277 {
278 num_checking_rows = this->num_rows;
279 }
280
281 for (row=0; row < num_checking_rows; ++row)
282 {
283 // Get an element of the matrix
284 matrix_element = this->A[column * this->num_rows + row];
285
286 // Check the value of element with identity matrix
287 if (((row == column) && \
288 (!cu_arithmetics::is_equal(matrix_element,
289 diagonal))) || \
290 ((row != column) && \
291 (!cu_arithmetics::is_equal(matrix_element,
292 off_diagonal))))
293 {
294 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
295 #pragma omp atomic write
296 #endif
297 matrix_is_identity = 0;
298
299 break;
300 }
301 }
302 }
303 }
304 }
305
306 return matrix_is_identity;
307}
308
309
310// ===
311// dot
312// ===
313
331
332template <typename DataType>
334 const DataType* device_vector,
335 DataType* device_product)
336{
337 assert(this->copied_host_to_device);
338
339 // Get device id
340 int device_id = CudaAPI<DataType>::get_device();
341
343 this->cublas_handle[device_id],
344 this->device_A[device_id],
345 device_vector,
346 this->num_rows,
347 this->num_columns,
348 this->A_is_row_major,
349 device_product);
350}
351
352
353// ========
354// dot plus
355// ========
356
376
377template <typename DataType>
379 const DataType* device_vector,
380 const DataType alpha,
381 DataType* device_product)
382{
383 assert(this->copied_host_to_device);
384
385 // Get device id
386 int device_id = CudaAPI<DataType>::get_device();
387
389 this->cublas_handle[device_id],
390 this->device_A[device_id],
391 device_vector,
392 alpha,
393 this->num_rows,
394 this->num_columns,
395 this->A_is_row_major,
396 device_product);
397}
398
399
400// =============
401// transpose dot
402// =============
403
421
422template <typename DataType>
424 const DataType* device_vector,
425 DataType* device_product)
426{
427 assert(this->copied_host_to_device);
428
429 // Get device id
430 int device_id = CudaAPI<DataType>::get_device();
431
433 this->cublas_handle[device_id],
434 this->device_A[device_id],
435 device_vector,
436 this->num_rows,
437 this->num_columns,
438 this->A_is_row_major,
439 device_product);
440}
441
442
443// ==================
444// transpose dot plus
445// ==================
446
467
468template <typename DataType>
470 const DataType* device_vector,
471 const DataType alpha,
472 DataType* device_product)
473{
474 assert(this->copied_host_to_device);
475
476 // Get device id
477 int device_id = CudaAPI<DataType>::get_device();
478
480 this->cublas_handle[device_id],
481 this->device_A[device_id],
482 device_vector,
483 alpha,
484 this->num_rows,
485 this->num_columns,
486 this->A_is_row_major,
487 device_product);
488}
489
490
491// ===============================
492// Explicit template instantiation
493// ===============================
494
495#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
496 template class cuDenseMatrix<__nv_fp8_e5m2>;
497#endif
498
499#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
500 template class cuDenseMatrix<__nv_fp8_e4m3>;
501#endif
502
503#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
504 template class cuDenseMatrix<__half>;
505#endif
506
507#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
508 template class cuDenseMatrix<__nv_bfloat16>;
509#endif
510
511#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
512 template class cuDenseMatrix<float>;
513#endif
514
515#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
516 template class cuDenseMatrix<double>;
517#endif
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Definition cuda_api.cu:191
static ArrayType * alloc(const size_t array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
Definition cuda_api.cu:39
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
Definition cuda_api.cu:169
static int get_device()
Gets the current device in multi-gpu applications.
Definition cuda_api.cu:209
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Definition cuda_api.cu:145
Base class for cLinearOperator and cuLinearOperator . This class is not templated so that both cpp an...
Container for dense matrices.
virtual void transpose_dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Transposed-matrix vector product written in place.
virtual void transpose_dot(const DataType *device_vector, DataType *device_product)
Transposed-matrix vector product.
virtual FlagType is_identity_matrix() const
Checks whether the matrix is identity.
virtual void dot_plus(const DataType *device_vector, const DataType alpha, DataType *device_product)
Matrix vector product written in place.
cuDenseMatrix()
Default constructor.
virtual void copy_host_to_device()
Copies the member data from the host memory to the device memory.
virtual ~cuDenseMatrix()
Destructor. This function removes data from GPU devices.
virtual void dot(const DataType *device_vector, DataType *device_product)
Matrix vector product.
Base class for linear operators. This class serves as interface for all derived classes.
void initialize_cublas_handle()
Creates a cublasHandle_t object, if not created already.
static void dense_matvec(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
Computes the matrix vector multiplication where is a dense matrix.
static void dense_transposed_matvec(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
Computes matrix vector multiplication where is dense, and is the transpose of the matrix .
static void dense_transposed_matvec_plus(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
Computes where is dense, and is the transpose of the matrix .
static void dense_matvec_plus(cublasHandle_t cublas_handle, const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, DataType *RESTRICT c)
Computes the operation where is a dense matrix.
Base class for constant matrices.
Definition cu_matrix.h:45
void omp_set_num_threads(int num_threads)
int omp_get_thread_num()
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
int LongIndexType
Definition types.h:60
int FlagType
Definition types.h:68