imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cu_linear_operator.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
17#include "../_definitions/definitions.h" // USE_OPENMP
18#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
19 // __half, __nv_bfloat16
20
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
22 #include <omp.h> // omp_set_num_threads
23#endif
24
25#include <cstddef> // NULL
26#include <cassert> // assert
27#include <cstdlib> // abort
28#include <iostream>
29#include "../_cuda_utilities/cuda_api.h" // CudaAPI
30
31
32// =============
33// constructor 1
34// =============
35
38
39template <typename DataType>
41
42 // Initializer list
43 num_gpu_devices(0),
44 copied_host_to_device(false),
45 cublas_handle(NULL),
46 cusparse_handle(NULL)
47{
48 // Check any gpu device exists
49 this->num_gpu_devices = this->query_gpu_devices();
50
51 // Regardless of using dense (cublas) or sparse (cusparse) matrices, the
52 // cublas handle should be initialized, since it is needed for the methods
53 // in cuVectorOperations
55}
56
57
58// =============
59// constructor 2
60// =============
61
71
72template <typename DataType>
74 const int num_gpu_devices_):
75
76 // Initializer list
77 num_gpu_devices(0),
78 copied_host_to_device(false),
79 cublas_handle(NULL),
80 cusparse_handle(NULL)
81{
82 // Check any gpu device exists
83 int device_count = this->query_gpu_devices();
84
85 // Set number of gpu devices
86 if (num_gpu_devices_ == 0)
87 {
88 this->num_gpu_devices = device_count;
89 }
90 else if (num_gpu_devices_ > device_count)
91 {
92 std::cerr << "ERROR: Number of requested gpu devices exceeds the " \
93 << "number of available gpu devices. Nummber of detected " \
94 << "devices are " << device_count << " while the " \
95 << "requested number of devices are " << num_gpu_devices_ \
96 << "." << std::endl;
97 abort();
98 }
99 else
100 {
101 this->num_gpu_devices = num_gpu_devices_;
102 }
103
104 // Regardless of using dense (cublas) or sparse (cusparse) matrices, the
105 // cublas handle should be initialized, since it is needed for the methods
106 // in cuVectorOperations
108}
109
110
111// ==========
112// destructor
113// ==========
114
118
119template <typename DataType>
121{
122 // cublas handle
123 if (this->cublas_handle != NULL)
124 {
125 // Set the number of threads
126 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
127 omp_set_num_threads(this->num_gpu_devices);
128 #endif
129
130 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
131 #pragma omp parallel
132 #endif
133 {
134 // Switch to a device with the same device id as the cpu thread id
135 unsigned int thread_id;
136 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
137 thread_id = omp_get_thread_num();
138 #else
139 thread_id = 0;
140 #endif
141
143
144 cublasStatus_t status = cublasDestroy(
145 this->cublas_handle[thread_id]);
146 assert(status == CUBLAS_STATUS_SUCCESS);
147 }
148
149 // Deallocate arrays of pointers on cpu
150 delete[] this->cublas_handle;
151 this->cublas_handle = NULL;
152 }
153
154 // cusparse handle
155 if (this->cusparse_handle != NULL)
156 {
157 // Set the number of threads
158 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
159 omp_set_num_threads(this->num_gpu_devices);
160 #endif
161
162 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
163 #pragma omp parallel
164 #endif
165 {
166 // Switch to a device with the same device id as the cpu thread id
167 unsigned int thread_id;
168 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
169 thread_id = omp_get_thread_num();
170 #else
171 thread_id = 0;
172 #endif
173
175
176 cusparseStatus_t status = cusparseDestroy(
177 this->cusparse_handle[thread_id]);
178 assert(status == CUSPARSE_STATUS_SUCCESS);
179 }
180
181 // Deallocate arrays of pointers on cpu
182 delete[] this->cusparse_handle;
183 this->cusparse_handle = NULL;
184 }
185}
186
187
188// =================
189// get cublas handle
190// =================
191
203
204template <typename DataType>
206{
207 // Get device id
208 int device_id = CudaAPI<DataType>::get_device();
209
210 return this->cublas_handle[device_id];
211}
212
213
214// ========================
215// initialize cublas handle
216// ========================
217
220
221template <typename DataType>
223{
224 if (this->cublas_handle == NULL)
225 {
226 // Allocate pointers for each gpu device
227 this->cublas_handle = new cublasHandle_t[this->num_gpu_devices];
228
229 // Set the number of threads
230 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
231 omp_set_num_threads(this->num_gpu_devices);
232 #endif
233
234 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
235 #pragma omp parallel
236 #endif
237 {
238 // Switch to a device with the same device id as the cpu thread id
239 unsigned int thread_id;
240 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
241 thread_id = omp_get_thread_num();
242 #else
243 thread_id = 0;
244 #endif
245
247
248 cublasStatus_t status_create = cublasCreate(
249 &this->cublas_handle[thread_id]);
250 assert(status_create == CUBLAS_STATUS_SUCCESS);
251
252 // Set tensor core whenever possible (usually for cublasXgemm)
253 cublasStatus_t status_set = cublasSetMathMode(
254 this->cublas_handle[thread_id], CUBLAS_TENSOR_OP_MATH);
255 assert(status_set == CUBLAS_STATUS_SUCCESS);
256 }
257 }
258}
259
260
261// ==========================
262// initialize cusparse handle
263// ==========================
264
267
268template <typename DataType>
270{
271 if (this->cusparse_handle == NULL)
272 {
273 // Allocate pointers for each gpu device
274 this->cusparse_handle = new cusparseHandle_t[this->num_gpu_devices];
275
276 // Set the number of threads
277 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
278 omp_set_num_threads(this->num_gpu_devices);
279 #endif
280
281 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
282 #pragma omp parallel
283 #endif
284 {
285 // Switch to a device with the same device id as the cpu thread id
286 unsigned int thread_id;
287 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
288 thread_id = omp_get_thread_num();
289 #else
290 thread_id = 0;
291 #endif
292
294
295 cusparseStatus_t status = cusparseCreate(
296 &this->cusparse_handle[thread_id]);
297 assert(status == CUSPARSE_STATUS_SUCCESS);
298 }
299 }
300}
301
302
303// =================
304// query gpu devices
305// =================
306
312
313template <typename DataType>
315{
316 int device_count = 0;
317 cudaError_t error = cudaGetDeviceCount(&device_count);
318
319 // Error code 38 means no cuda-capable device was detected.
320 if ((error != cudaSuccess) || (device_count < 1))
321 {
322 std::cerr << "ERROR: No cuda-capable GPU device was detected on " \
323 << "this machine. If a cuda-capable GPU device exists, " \
324 << "install its cuda driver. Alternatively, set " \
325 << "'gpu=False' to use cpu instead." \
326 << std::endl;
327
328 abort();
329 }
330
331 return device_count;
332}
333
334
335// ==============
336// set parameters
337// ==============
338
345
346template <typename DataType>
348{
349 this->parameters = parameters_;
350}
351
352
353// ===============================
354// Explicit template instantiation
355// ===============================
356
// One explicit instantiation of cuLinearOperator per data type. Each one is
// compiled only when the corresponding USE_CUDA_* macro is defined as 1.

#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
    template class cuLinearOperator<__nv_fp8_e5m2>;
#endif

#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
    template class cuLinearOperator<__nv_fp8_e4m3>;
#endif

#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
    template class cuLinearOperator<__half>;
#endif

#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
    template class cuLinearOperator<__nv_bfloat16>;
#endif

#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
    template class cuLinearOperator<float>;
#endif

#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
    template class cuLinearOperator<double>;
#endif
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Definition cuda_api.cu:191
static int get_device()
Gets the current device in multi-gpu applications.
Definition cuda_api.cu:209
Base class for linear operators. This class serves as interface for all derived classes.
cuLinearOperator()
Default constructor.
void initialize_cusparse_handle()
Creates a cusparseHandle_t object, if not created already.
int query_gpu_devices() const
Before any numerical computation, this method checks if any gpu device is available on the machine,...
cublasHandle_t get_cublas_handle() const
This function returns a reference to the cublasHandle_t object. The object will be created,...
virtual ~cuLinearOperator()
Destructor.
void initialize_cublas_handle()
Creates a cublasHandle_t object, if not created already.
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
void omp_set_num_threads(int num_threads)
int omp_get_thread_num()
cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode)
Definition of CUDA's cublasSetMathMode function using dynamically loaded cublas library.
cudaError_t cudaGetDeviceCount(int *count)
Definition of CUDA's cudaGetDeviceCount function using dynamically loaded cudart library.
cusparseStatus_t cusparseDestroy(cusparseHandle_t handle)
Definition of CUDA's cusparseDestroy function using dynamically loaded cusparse library.
cusparseStatus_t cusparseCreate(cusparseHandle_t *handle)
Definition of CUDA's cusparseCreate function using dynamically loaded cusparse library.