imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cublas_impl.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
16#include "./cublas_impl.h"
17#include "./cublas_impl_kernels.h" // cublas_impl_kernels
18#include "../_cu_arithmetics/cu_arithmetics.h" // cu_arithmetics
19#include <cuda_runtime.h>
20#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
21 // __half, __nv_bfloat16
22#include <stdexcept> // std::invalid_argument
23
24
25// ===========
26// cublas impl
27// ===========
28
29namespace cublas_impl
30{
31 // ===========
32 // cublasTgemv
33 // ===========
34
75
76 template <typename DataType, typename ComputeType>
77 cudaError_t cublasTgemv(
78 cublasOperation_t trans,
79 int m,
80 int n,
81 const DataType* RESTRICT alpha,
82 const DataType* RESTRICT A,
83 int lda,
84 const DataType* RESTRICT x,
85 int incx,
86 const DataType* RESTRICT beta,
87 DataType* RESTRICT y,
88 int incy)
89 {
90 // Determine array sizes based on operation of A
91 bool trans_;
92 int x_size;
93 int y_size;
94
95 if (trans == CUBLAS_OP_N)
96 {
97 // A is not transposed
98 trans_ = false;
99 y_size = m;
100 x_size = n;
101 }
102 else if (trans == CUBLAS_OP_T)
103 {
104 // A is transposed
105 trans_ = true;
106 y_size = n;
107 x_size = m;
108 }
109 else
110 {
111 throw std::invalid_argument(
112 "'trans' argument must be CUBLAS_OP_N or CUBLAS_OP_T.");
113 }
114
115 // The optimal number of threads per block (here 640) is obtained by
116 // calling cudaOccupancyMaxPotentialBlockSize() in a separate
117 // benchmark.
118 const int threads_per_block = 640;
119 dim3 dim_block(threads_per_block);
120
121 // We assume each thread represents one element of y. That is, the
122 // total number of threads is the size of y.
123 int blocks_per_grid = \
124 (y_size + threads_per_block - 1) / threads_per_block;
125 dim3 dim_grid(blocks_per_grid);
126
127 // Calling kernel code
129 DataType, ComputeType, threads_per_block>
130 <<<dim_grid, dim_block>>>(
131 trans_, y_size, x_size, *alpha, A, lda, x, incx, *beta, y,
132 incy);
133
134 cudaError_t error = cudaDeviceSynchronize();
135
136 return error;
137 }
138
139
140 // ===========
141 // cublasTcopy
142 // ===========
143
167
168 template <typename DataType>
169 cudaError_t cublasTcopy(
170 int n,
171 const DataType* RESTRICT x,
172 int incx,
173 DataType* RESTRICT y,
174 int incy)
175 {
176 // Set number of device threads and blocks
177 const int threads_per_block = 256;
178 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
179
180 // Call device code
181 cublas_impl_kernels::cublasTcopy_kernel<DataType><<<
182 blocks_per_grid, threads_per_block>>>(
183 n, x, incx, y, incy);
184
185 cudaError_t error = cudaDeviceSynchronize();
186
187 return error;
188 }
189
190
191 // ===========
192 // cublasTaxpy
193 // ===========
194
221
222 template <typename DataType>
223 cudaError_t cublasTaxpy(
224 int n,
225 const DataType* RESTRICT alpha,
226 const DataType* RESTRICT x,
227 int incx,
228 DataType* RESTRICT y,
229 int incy)
230 {
231 // Set number of device threads and blocks
232 const int threads_per_block = 256;
233 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
234
235 // Call device code
236 cublas_impl_kernels::cublasTaxpy_kernel<DataType><<<
237 blocks_per_grid, threads_per_block>>>(
238 n, *alpha, x, incx, y, incy);
239
240 cudaError_t error = cudaDeviceSynchronize();
241
242 return error;
243 }
244
245
246 // ==========
247 // cublasTdot
248 // ==========
249
275
276 template <typename DataType, typename ComputeType>
277 cudaError_t cublasTdot(
278 int n,
279 const DataType* RESTRICT x,
280 int incx,
281 const DataType* RESTRICT y,
282 int incy,
283 DataType* RESTRICT result)
284 {
285 // device pointer to store the result (this is a scalar value)
286 ComputeType *device_result;
287 cudaMalloc(&device_result, sizeof(ComputeType));
288 cudaMemset(device_result, static_cast<ComputeType>(0.0f),
289 sizeof(ComputeType));
290
291 // Set number of device threads and blocks
292 const int threads_per_block = 256;
293 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
294
295 // Call device code
297 DataType, ComputeType, threads_per_block><<<
298 blocks_per_grid, threads_per_block>>>(
299 n, x, incx, y, incy, device_result);
300
301 cudaError_t error = cudaDeviceSynchronize();
302
303 // Return back result from device and store as higher precision type
304 ComputeType host_result_comp;
305 cudaMemcpy(&host_result_comp, device_result, sizeof(ComputeType),
306 cudaMemcpyDeviceToHost);
307
308 // Convert type to match output type
310 host_result_comp);
311
312 cudaFree(device_result);
313
314 return error;
315 }
316
317
318 // ===========
319 // cublasTnrm2
320 // ===========
321
342
343 template <typename DataType, typename ComputeType>
344 cudaError_t cublasTnrm2(
345 int n,
346 const DataType* RESTRICT x,
347 int incx,
348 DataType* RESTRICT result)
349 {
350 // device pointer to store the result (this is a scalar value)
351 ComputeType *device_result;
352 cudaMalloc(&device_result, sizeof(ComputeType));
353 cudaMemset(device_result, static_cast<ComputeType>(0.0f),
354 sizeof(ComputeType));
355
356 // Set number of device threads and blocks
357 const int threads_per_block = 256;
358 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
359
360 // Call device code
362 DataType, ComputeType, threads_per_block><<<
363 blocks_per_grid, threads_per_block>>>(
364 n, x, incx, device_result);
365
366 cudaError_t error = cudaDeviceSynchronize();
367
368 // Return back result from device and store as higher precision type
369 ComputeType host_result_comp;
370 cudaMemcpy(&host_result_comp, device_result, sizeof(ComputeType),
371 cudaMemcpyDeviceToHost);
372
373 // Convert type to match output type
375 host_result_comp);
376
377 cudaFree(device_result);
378
379 return error;
380 }
381
382
383 // ===========
384 // cublasTscal
385 // ===========
386
409
410 template <typename DataType>
411 cudaError_t cublasTscal(
412 int n,
413 const DataType* RESTRICT alpha,
414 DataType* RESTRICT x,
415 int incx)
416 {
417 // Set number of device threads and blocks
418 int threads_per_block = 256;
419 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
420
421 // Call device code
422 cublas_impl_kernels::cublasTscal_kernel<DataType><<<
423 blocks_per_grid, threads_per_block>>>(
424 n, *alpha, x, incx);
425
426 cudaError_t error = cudaDeviceSynchronize();
427
428 return error;
429 }
430
431} // namespace cublas_impl
432
433
434// ===============================
435// Explicit template instantiation
436// ===============================
437
438// cublasTgemv (__nv_fp8_e5m2)
439#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
440 template
441 cudaError_t cublas_impl::cublasTgemv<__nv_fp8_e5m2, float>(
442 cublasOperation_t trans,
443 int m,
444 int n,
445 const __nv_fp8_e5m2* RESTRICT alpha,
446 const __nv_fp8_e5m2* RESTRICT A,
447 int lda,
448 const __nv_fp8_e5m2* RESTRICT x,
449 int incx,
450 const __nv_fp8_e5m2* RESTRICT beta,
452 int incy);
453#endif
454
455// cublasTgemv (__nv_fp8_e4m3)
456#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
457 template
458 cudaError_t cublas_impl::cublasTgemv<__nv_fp8_e4m3, float>(
459 cublasOperation_t trans,
460 int m,
461 int n,
462 const __nv_fp8_e4m3* RESTRICT alpha,
463 const __nv_fp8_e4m3* RESTRICT A,
464 int lda,
465 const __nv_fp8_e4m3* RESTRICT x,
466 int incx,
467 const __nv_fp8_e4m3* RESTRICT beta,
469 int incy);
470#endif
471
472// cublasTgemv (__half)
473#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
474 template
475 cudaError_t cublas_impl::cublasTgemv<__half, float>(
476 cublasOperation_t trans,
477 int m,
478 int n,
479 const __half* RESTRICT alpha,
480 const __half* RESTRICT A,
481 int lda,
482 const __half* RESTRICT x,
483 int incx,
484 const __half* RESTRICT beta,
485 __half* RESTRICT y,
486 int incy);
487#endif
488
489// cublasTgemv (__nv_bfloat16)
490#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
491 template
492 cudaError_t cublas_impl::cublasTgemv<__nv_bfloat16, float>(
493 cublasOperation_t trans,
494 int m,
495 int n,
496 const __nv_bfloat16* RESTRICT alpha,
497 const __nv_bfloat16* RESTRICT A,
498 int lda,
499 const __nv_bfloat16* RESTRICT x,
500 int incx,
501 const __nv_bfloat16* RESTRICT beta,
502 __nv_bfloat16* RESTRICT y,
503 int incy);
504#endif
505
506// cublasTgemv (float)
507#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
508#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
509 template
510 cudaError_t cublas_impl::cublasTgemv<float, float>(
511 cublasOperation_t trans,
512 int m,
513 int n,
514 const float* RESTRICT alpha,
515 const float* RESTRICT A,
516 int lda,
517 const float* RESTRICT x,
518 int incx,
519 const float* RESTRICT beta,
520 float* RESTRICT y,
521 int incy);
522#endif
523#endif
524
525// cublasTgemv (double)
526#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
527#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
528 template
529 cudaError_t cublas_impl::cublasTgemv<double, double>(
530 cublasOperation_t trans,
531 int m,
532 int n,
533 const double* RESTRICT alpha,
534 const double* RESTRICT A,
535 int lda,
536 const double* RESTRICT x,
537 int incx,
538 const double* RESTRICT beta,
539 double* RESTRICT y,
540 int incy);
541#endif
542#endif
543
544// cublasTcopy (__half)
545#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
546 template
547 cudaError_t cublas_impl::cublasTcopy<__half>(
548 int n,
549 const __half* RESTRICT x,
550 int incx,
551 __half* RESTRICT y,
552 int incy);
553#endif
554
555// cublasTcopy (__nv_bfloat16)
556#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
557 template
558 cudaError_t cublas_impl::cublasTcopy<__nv_bfloat16>(
559 int n,
560 const __nv_bfloat16* RESTRICT x,
561 int incx,
562 __nv_bfloat16* RESTRICT y,
563 int incy);
564#endif
565
566// cublasTcopy (float)
567#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
568#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
569 template
570 cudaError_t cublas_impl::cublasTcopy<float>(
571 int n,
572 const float* RESTRICT x,
573 int incx,
574 float* RESTRICT y,
575 int incy);
576#endif
577#endif
578
579// cublasTcopy (double)
580#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
581#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
582 template
583 cudaError_t cublas_impl::cublasTcopy<double>(
584 int n,
585 const double* RESTRICT x,
586 int incx,
587 double* RESTRICT y,
588 int incy);
589#endif
590#endif
591
592// cublasTaxpy (__half)
593#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
594 template
595 cudaError_t cublas_impl::cublasTaxpy<__half>(
596 int n,
597 const __half* RESTRICT alpha,
598 const __half* RESTRICT x,
599 int incx,
600 __half* RESTRICT y,
601 int incy);
602#endif
603
604// cublasTaxpy (__nv_bfloat16)
605#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
606 template
607 cudaError_t cublas_impl::cublasTaxpy<__nv_bfloat16>(
608 int n,
609 const __nv_bfloat16* RESTRICT alpha,
610 const __nv_bfloat16* RESTRICT x,
611 int incx,
612 __nv_bfloat16* RESTRICT y,
613 int incy);
614#endif
615
616// cublasTaxpy (float)
617#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
618#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
619 template
620 cudaError_t cublas_impl::cublasTaxpy<float>(
621 int n,
622 const float* RESTRICT alpha,
623 const float* RESTRICT x,
624 int incx,
625 float* RESTRICT y,
626 int incy);
627#endif
628#endif
629
630// cublasTaxpy (double)
631#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
632#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
633 template
634 cudaError_t cublas_impl::cublasTaxpy<double>(
635 int n,
636 const double* RESTRICT alpha,
637 const double* RESTRICT x,
638 int incx,
639 double* RESTRICT y,
640 int incy);
641#endif
642#endif
643
644// cublasTdot (__half)
645#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
646 template
647 cudaError_t cublas_impl::cublasTdot<__half, float>(
648 int n,
649 const __half* RESTRICT x,
650 int incx,
651 const __half* RESTRICT y,
652 int incy,
653 __half* RESTRICT result);
654#endif
655
656// cublasTdot (__nv_bfloat16)
657#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
658 template
659 cudaError_t cublas_impl::cublasTdot<__nv_bfloat16, float>(
660 int n,
661 const __nv_bfloat16* RESTRICT x,
662 int incx,
663 const __nv_bfloat16* RESTRICT y,
664 int incy,
665 __nv_bfloat16* RESTRICT result);
666#endif
667
668// cublasTdot (float)
669#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
670#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
671 template
672 cudaError_t cublas_impl::cublasTdot<float, float>(
673 int n,
674 const float* RESTRICT x,
675 int incx,
676 const float* RESTRICT y,
677 int incy,
678 float* RESTRICT result);
679#endif
680#endif
681
682// cublasTdot (double)
683#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
684#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
685 template
686 cudaError_t cublas_impl::cublasTdot<double, double>(
687 int n,
688 const double* RESTRICT x,
689 int incx,
690 const double* RESTRICT y,
691 int incy,
692 double* RESTRICT result);
693#endif
694#endif
695
696// cublasTnrm2 (__half)
697#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
698 template
699 cudaError_t cublas_impl::cublasTnrm2<__half, float>(
700 int n,
701 const __half* RESTRICT x,
702 int incx,
703 __half* RESTRICT result);
704#endif
705
706// cublasTnrm2 (__nv_bfloat16)
707#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
708 template
709 cudaError_t cublas_impl::cublasTnrm2<__nv_bfloat16, float>(
710 int n,
711 const __nv_bfloat16* RESTRICT x,
712 int incx,
713 __nv_bfloat16* RESTRICT result);
714#endif
715
716// cublasTnrm2 (float)
717#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
718#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
719template
720 cudaError_t cublas_impl::cublasTnrm2<float, float>(
721 int n,
722 const float* RESTRICT x,
723 int incx,
724 float* RESTRICT result);
725#endif
726#endif
727
728// cublasTnrm2 (double)
729#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
730#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
731 template
732 cudaError_t cublas_impl::cublasTnrm2<double, double>(
733 int n,
734 const double* RESTRICT x,
735 int incx,
736 double* RESTRICT result);
737#endif
738#endif
739
740// cublasTscal (__half)
741#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
742 template
743 cudaError_t cublas_impl::cublasTscal<__half>(
744 int n,
745 const __half* RESTRICT alpha,
746 __half* RESTRICT x,
747 int incx);
748#endif
749
750// cublasTscal (__nv_bfloat16)
751#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
752 template
753 cudaError_t cublas_impl::cublasTscal<__nv_bfloat16>(
754 int n,
755 const __nv_bfloat16* RESTRICT alpha,
756 __nv_bfloat16* RESTRICT x,
757 int incx);
758#endif
759
760// cublasTscal (float)
761#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
762#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
763 template
764 cudaError_t cublas_impl::cublasTscal<float>(
765 int n,
766 const float* RESTRICT alpha,
767 float* RESTRICT x,
768 int incx);
769#endif
770#endif
771
772// cublasTscal (double)
773#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
774#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
775 template
776 cudaError_t cublas_impl::cublasTscal<double>(
777 int n,
778 const double* RESTRICT alpha,
779 double* RESTRICT x,
780 int incx);
781#endif
782#endif
#define RESTRICT
cudaError_t cudaFree(void *devPtr)
Definition of CUDA's cudaFree function using dynamically loaded cudart library.
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind)
Definition of CUDA's cudaMemcpy function using dynamically loaded cudart library.
cudaError_t cudaMalloc(void **devPtr, size_t size)
Definition of CUDA's cudaMalloc function using dynamically loaded cudart library.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
__global__ void cublasTnrm2_kernel(const int n, const DataType *RESTRICT x, const int incx, ComputeType *RESTRICT result)
Computes .
__global__ void cublasTdot_kernel(const int n, const DataType *RESTRICT x, const int incx, const DataType *RESTRICT y, const int incy, ComputeType *RESTRICT result)
Computes .
__global__ void cublasTgemv_kernel(const bool trans, const int m, const int n, const DataType alpha, const DataType *RESTRICT A, const int lda, const DataType *RESTRICT x, const int incx, const DataType beta, DataType *RESTRICT y, const int incy)
Performs the operation .
Templated implenentations of several BLAS-type functions in CUDA.
cudaError_t cublasTaxpy(int n, const DataType *RESTRICT alpha, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy)
Performs .
cudaError_t cublasTgemv(cublasOperation_t trans, int m, int n, const DataType *RESTRICT alpha, const DataType *RESTRICT A, int lda, const DataType *RESTRICT x, int incx, const DataType *RESTRICT beta, DataType *RESTRICT y, int incy)
Performs .
cudaError_t cublasTdot(int n, const DataType *RESTRICT x, int incx, const DataType *RESTRICT y, int incy, DataType *RESTRICT result)
Computes .
cudaError_t cublasTcopy(int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy)
Performs .
cudaError_t cublasTnrm2(int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT result)
Computes .
cudaError_t cublasTscal(int n, const DataType *RESTRICT alpha, DataType *RESTRICT x, int incx)
Performs .