imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cublas_api.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
16#include "./cublas_api.h"
17#include <cuda_runtime.h> // cudaError_t, cudaSuccess
18#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
19 // __half, __nv_bfloat16
20#include "./cublas_impl.h" // cublas_impl
21
22
23// ==========
24// cublas api
25// ==========
26
32
33namespace cublas_api
34{
35 // ===========
36 // cublasXgemv (__nv_fp8_e5m2)
37 // ===========
38
79
#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
/// \brief      Specialization of \c cublasXgemv for \c __nv_fp8_e5m2 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTgemv<__nv_fp8_e5m2, float> and the
///             returned \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  trans, m, n, alpha, A, lda, x, incx, beta, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTgemv.
/// \param      y       Non-const vector pointer, forwarded unchanged to
///                     \c cublas_impl::cublasTgemv.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXgemv<__nv_fp8_e5m2>(
        cublasHandle_t handle,
        cublasOperation_t trans,
        int m,
        int n,
        const __nv_fp8_e5m2* RESTRICT alpha,
        const __nv_fp8_e5m2* RESTRICT A,
        int lda,
        const __nv_fp8_e5m2* RESTRICT x,
        int incx,
        const __nv_fp8_e5m2* RESTRICT beta,
        __nv_fp8_e5m2* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTgemv<__nv_fp8_e5m2, float>(
            trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
113
114
115 // ===========
116 // cublasXgemv (__nv_fp8_e4m3)
117 // ===========
118
159
#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
/// \brief      Specialization of \c cublasXgemv for \c __nv_fp8_e4m3 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTgemv<__nv_fp8_e4m3, float> and the
///             returned \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  trans, m, n, alpha, A, lda, x, incx, beta, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTgemv.
/// \param      y       Non-const vector pointer, forwarded unchanged to
///                     \c cublas_impl::cublasTgemv.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXgemv<__nv_fp8_e4m3>(
        cublasHandle_t handle,
        cublasOperation_t trans,
        int m,
        int n,
        const __nv_fp8_e4m3* RESTRICT alpha,
        const __nv_fp8_e4m3* RESTRICT A,
        int lda,
        const __nv_fp8_e4m3* RESTRICT x,
        int incx,
        const __nv_fp8_e4m3* RESTRICT beta,
        __nv_fp8_e4m3* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTgemv<__nv_fp8_e4m3, float>(
            trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
193
194
195 // ===========
196 // cublasXgemv (__half)
197 // ===========
198
239
#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
/// \brief      Specialization of \c cublasXgemv for \c __half data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTgemv<__half, float> and the returned
///             \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  trans, m, n, alpha, A, lda, x, incx, beta, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTgemv.
/// \param      y       Non-const vector pointer, forwarded unchanged to
///                     \c cublas_impl::cublasTgemv.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXgemv<__half>(
        cublasHandle_t handle,
        cublasOperation_t trans,
        int m,
        int n,
        const __half* RESTRICT alpha,
        const __half* RESTRICT A,
        int lda,
        const __half* RESTRICT x,
        int incx,
        const __half* RESTRICT beta,
        __half* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTgemv<__half, float>(
            trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
273
274
275 // ===========
276 // cublasXgemv (__nv_bfloat16)
277 // ===========
278
319
#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
/// \brief      Specialization of \c cublasXgemv for \c __nv_bfloat16 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTgemv<__nv_bfloat16, float> and the
///             returned \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  trans, m, n, alpha, A, lda, x, incx, beta, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTgemv.
/// \param      y       Non-const vector pointer, forwarded unchanged to
///                     \c cublas_impl::cublasTgemv.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXgemv<__nv_bfloat16>(
        cublasHandle_t handle,
        cublasOperation_t trans,
        int m,
        int n,
        const __nv_bfloat16* RESTRICT alpha,
        const __nv_bfloat16* RESTRICT A,
        int lda,
        const __nv_bfloat16* RESTRICT x,
        int incx,
        const __nv_bfloat16* RESTRICT beta,
        __nv_bfloat16* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTgemv<__nv_bfloat16, float>(
            trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
353
354
355 // ===========
356 // cublasXgemv (float)
357 // ===========
358
396
#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
/// \brief      Specialization of \c cublasXgemv for \c float data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasSgemv with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTgemv<float, float> is
///             called and its \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  trans, m, n, alpha, A, lda, x, incx, beta, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasSgemv.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXgemv<float>(
        cublasHandle_t handle,
        cublasOperation_t trans,
        int m,
        int n,
        const float* RESTRICT alpha,
        const float* RESTRICT A,
        int lda,
        const float* RESTRICT x,
        int incx,
        const float* RESTRICT beta,
        float* RESTRICT y,
        int incy)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTgemv<float, float>(
                trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx,
                           beta, y, incy);
    #endif
}
#endif
435
436
437 // ===========
438 // cublasXgemv (double)
439 // ===========
440
478
#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
/// \brief      Specialization of \c cublasXgemv for \c double data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasDgemv with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTgemv<double, double> is
///             called and its \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  trans, m, n, alpha, A, lda, x, incx, beta, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasDgemv.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXgemv<double>(
        cublasHandle_t handle,
        cublasOperation_t trans,
        int m,
        int n,
        const double* RESTRICT alpha,
        const double* RESTRICT A,
        int lda,
        const double* RESTRICT x,
        int incx,
        const double* RESTRICT beta,
        double* RESTRICT y,
        int incy)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTgemv<double, double>(
                trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx,
                           beta, y, incy);
    #endif
}
#endif
516
517
518 // ===========
519 // cublasXcopy (__half)
520 // ===========
521
546
#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
/// \brief      Specialization of \c cublasXcopy for \c __half data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house \c cublas_impl::cublasTcopy<__half>
///             and the returned \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, x, incx, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTcopy.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXcopy<__half>(
        cublasHandle_t handle,
        int n,
        const __half* RESTRICT x,
        int incx,
        __half* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTcopy<__half>(
            n, x, incx, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
574
575
576 // ===========
577 // cublasXcopy (__nv_bfloat16)
578 // ===========
579
604
#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
/// \brief      Specialization of \c cublasXcopy for \c __nv_bfloat16 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTcopy<__nv_bfloat16> and the returned
///             \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, x, incx, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTcopy.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXcopy<__nv_bfloat16>(
        cublasHandle_t handle,
        int n,
        const __nv_bfloat16* RESTRICT x,
        int incx,
        __nv_bfloat16* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTcopy<__nv_bfloat16>(
            n, x, incx, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
632
633
634 // ===========
635 // cublasXcopy (float)
636 // ===========
637
657
#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
/// \brief      Specialization of \c cublasXcopy for \c float data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasScopy with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTcopy<float> is called and
///             its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, x, incx, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasScopy.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXcopy<float>(
        cublasHandle_t handle,
        int n,
        const float* RESTRICT x,
        int incx,
        float* RESTRICT y,
        int incy)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTcopy<float>(
                n, x, incx, y, incy);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasScopy(handle, n, x, incx, y, incy);
    #endif
}
#endif
688
689
690 // ===========
691 // cublasXcopy (double)
692 // ===========
693
713
#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
/// \brief      Specialization of \c cublasXcopy for \c double data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasDcopy with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTcopy<double> is called
///             and its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, x, incx, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasDcopy.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXcopy<double>(
        cublasHandle_t handle,
        int n,
        const double* RESTRICT x,
        int incx,
        double* RESTRICT y,
        int incy)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTcopy<double>(
                n, x, incx, y, incy);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasDcopy(handle, n, x, incx, y, incy);
    #endif
}
#endif
744
745
746 // ===========
747 // cublasXaxpy (__half)
748 // ===========
749
774
#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
/// \brief      Specialization of \c cublasXaxpy for \c __half data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house \c cublas_impl::cublasTaxpy<__half>
///             and the returned \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, alpha, x, incx, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTaxpy.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXaxpy<__half>(
        cublasHandle_t handle,
        int n,
        const __half* RESTRICT alpha,
        const __half* RESTRICT x,
        int incx,
        __half* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTaxpy<__half>(
            n, alpha, x, incx, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
803
804
805 // ===========
806 // cublasXaxpy (__nv_bfloat16)
807 // ===========
808
833
#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
/// \brief      Specialization of \c cublasXaxpy for \c __nv_bfloat16 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTaxpy<__nv_bfloat16> and the returned
///             \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, alpha, x, incx, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTaxpy.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXaxpy<__nv_bfloat16>(
        cublasHandle_t handle,
        int n,
        const __nv_bfloat16* RESTRICT alpha,
        const __nv_bfloat16* RESTRICT x,
        int incx,
        __nv_bfloat16* RESTRICT y,
        int incy)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTaxpy<__nv_bfloat16>(
            n, alpha, x, incx, y, incy);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
862
863
864 // ===========
865 // cublasXaxpy (float)
866 // ===========
867
892
#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
/// \brief      Specialization of \c cublasXaxpy for \c float data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasSaxpy with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTaxpy<float> is called and
///             its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, alpha, x, incx, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasSaxpy.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXaxpy<float>(
        cublasHandle_t handle,
        int n,
        const float* RESTRICT alpha,
        const float* RESTRICT x,
        int incx,
        float* RESTRICT y,
        int incy)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTaxpy<float>(
                n, alpha, x, incx, y, incy);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
    #endif
}
#endif
923
924
925 // ===========
926 // cublasXaxpy (double)
927 // ===========
928
953
#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
/// \brief      Specialization of \c cublasXaxpy for \c double data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasDaxpy with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTaxpy<double> is called
///             and its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, alpha, x, incx, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      y       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasDaxpy.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXaxpy<double>(
        cublasHandle_t handle,
        int n,
        const double* RESTRICT alpha,
        const double* RESTRICT x,
        int incx,
        double* RESTRICT y,
        int incy)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTaxpy<double>(
                n, alpha, x, incx, y, incy);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
    #endif
}
#endif
984
985
986 // ==========
987 // cublasXdot (__half)
988 // ==========
989
1014
#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
/// \brief      Specialization of \c cublasXdot for \c __half data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTdot<__half, float> and the returned
///             \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, x, incx, y, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTdot.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXdot<__half>(
        cublasHandle_t handle,
        int n,
        const __half* RESTRICT x,
        int incx,
        const __half* RESTRICT y,
        int incy,
        __half* RESTRICT result)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTdot<__half, float>(
            n, x, incx, y, incy, result);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
1043
1044
1045 // ==========
1046 // cublasXdot (__nv_bfloat16)
1047 // ==========
1048
1073
#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
/// \brief      Specialization of \c cublasXdot for \c __nv_bfloat16 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTdot<__nv_bfloat16, float> and the
///             returned \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, x, incx, y, incy
///                     Forwarded unchanged to \c cublas_impl::cublasTdot.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXdot<__nv_bfloat16>(
        cublasHandle_t handle,
        int n,
        const __nv_bfloat16* RESTRICT x,
        int incx,
        const __nv_bfloat16* RESTRICT y,
        int incy,
        __nv_bfloat16* RESTRICT result)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTdot<__nv_bfloat16, float>(
            n, x, incx, y, incy, result);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
1102
1103
1104 // ==========
1105 // cublasXdot (float)
1106 // ==========
1107
1132
#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
/// \brief      Specialization of \c cublasXdot for \c float data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasSdot with the same arguments. Otherwise the
///             in-house \c cublas_impl::cublasTdot<float, float> is called
///             and its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, x, incx, y, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasSdot.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXdot<float>(
        cublasHandle_t handle,
        int n,
        const float* RESTRICT x,
        int incx,
        const float* RESTRICT y,
        int incy,
        float* RESTRICT result)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTdot<float, float>(
                n, x, incx, y, incy, result);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasSdot(handle, n, x, incx, y, incy, result);
    #endif
}
#endif
1163
1164
1165 // ==========
1166 // cublasXdot (double)
1167 // ==========
1168
1193
#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
/// \brief      Specialization of \c cublasXdot for \c double data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasDdot with the same arguments. Otherwise the
///             in-house \c cublas_impl::cublasTdot<double, double> is called
///             and its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, x, incx, y, incy
///                     Forwarded unchanged to the selected implementation.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasDdot.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXdot<double>(
        cublasHandle_t handle,
        int n,
        const double* RESTRICT x,
        int incx,
        const double* RESTRICT y,
        int incy,
        double* RESTRICT result)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTdot<double, double>(
                n, x, incx, y, incy, result);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasDdot(handle, n, x, incx, y, incy, result);
    #endif
}
#endif
1224
1225
1226 // ===========
1227 // cublasXnrm2 (__half)
1228 // ===========
1229
1249
#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
/// \brief      Specialization of \c cublasXnrm2 for \c __half data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTnrm2<__half, float> and the returned
///             \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, x, incx
///                     Forwarded unchanged to \c cublas_impl::cublasTnrm2.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXnrm2<__half>(
        cublasHandle_t handle,
        int n,
        const __half* RESTRICT x,
        int incx,
        __half* RESTRICT result)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTnrm2<__half, float>(
            n, x, incx, result);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
1276
1277
1278 // ===========
1279 // cublasXnrm2 (__nv_bfloat16)
1280 // ===========
1281
1301
#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
/// \brief      Specialization of \c cublasXnrm2 for \c __nv_bfloat16 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTnrm2<__nv_bfloat16, float> and the
///             returned \c cudaError_t is translated to a \c cublasStatus_t.
///             NOTE(review): the second template argument (\c float) is
///             presumably the accumulation type; confirm in cublas_impl.h.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, x, incx
///                     Forwarded unchanged to \c cublas_impl::cublasTnrm2.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXnrm2<__nv_bfloat16>(
        cublasHandle_t handle,
        int n,
        const __nv_bfloat16* RESTRICT x,
        int incx,
        __nv_bfloat16* RESTRICT result)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTnrm2<__nv_bfloat16, float>(
            n, x, incx, result);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
1328
1329
1330 // ===========
1331 // cublasXnrm2 (float)
1332 // ===========
1333
1353
#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
/// \brief      Specialization of \c cublasXnrm2 for \c float data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasSnrm2 with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTnrm2<float, float> is
///             called and its \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, x, incx
///                     Forwarded unchanged to the selected implementation.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasSnrm2.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXnrm2<float>(
        cublasHandle_t handle,
        int n,
        const float* RESTRICT x,
        int incx,
        float* RESTRICT result)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTnrm2<float, float>(
                n, x, incx, result);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasSnrm2(handle, n, x, incx, result);
    #endif
}
#endif
1382
1383
1384 // ===========
1385 // cublasXnrm2 (double)
1386 // ===========
1387
1407
#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
/// \brief      Specialization of \c cublasXnrm2 for \c double data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasDnrm2 with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTnrm2<double, double> is
///             called and its \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, x, incx
///                     Forwarded unchanged to the selected implementation.
/// \param      result  Non-const pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasDnrm2.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXnrm2<double>(
        cublasHandle_t handle,
        int n,
        const double* RESTRICT x,
        int incx,
        double* RESTRICT result)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTnrm2<double, double>(
                n, x, incx, result);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasDnrm2(handle, n, x, incx, result);
    #endif
}
#endif
1436
1437
1438 // ===========
1439 // cublasXscal (__half)
1440 // ===========
1441
1462
#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
/// \brief      Specialization of \c cublasXscal for \c __half data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house \c cublas_impl::cublasTscal<__half>
///             and the returned \c cudaError_t is translated to a
///             \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, alpha, incx
///                     Forwarded unchanged to \c cublas_impl::cublasTscal.
/// \param      x       Non-const vector pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXscal<__half>(
        cublasHandle_t handle,
        int n,
        const __half* RESTRICT alpha,
        __half* RESTRICT x,
        int incx)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTscal<__half>(
            n, alpha, x, incx);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
1489
1490
1491 // ===========
1492 // cublasXscal (__nv_bfloat16)
1493 // ===========
1494
1515
#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
/// \brief      Specialization of \c cublasXscal for \c __nv_bfloat16 data.
///
/// \details    No cuBLAS routine is called for this type. The call is
///             forwarded to the in-house
///             \c cublas_impl::cublasTscal<__nv_bfloat16> and the returned
///             \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Not used by this specialization.
/// \param[in]  n, alpha, incx
///                     Forwarded unchanged to \c cublas_impl::cublasTscal.
/// \param      x       Non-const vector pointer, forwarded unchanged.
/// \return     \c CUBLAS_STATUS_SUCCESS if the implementation returned
///             \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXscal<__nv_bfloat16>(
        cublasHandle_t handle,
        int n,
        const __nv_bfloat16* RESTRICT alpha,
        __nv_bfloat16* RESTRICT x,
        int incx)
{
    // Void unused variables to avoid compiler warnings
    // (-Wno-unused-parameter)
    (void) handle;

    cudaError_t error = cublas_impl::cublasTscal<__nv_bfloat16>(
            n, alpha, x, incx);

    // Any CUDA error is reported to the caller as an internal error.
    if (error != cudaSuccess)
    {
        return CUBLAS_STATUS_INTERNAL_ERROR;
    }

    return CUBLAS_STATUS_SUCCESS;
}
#endif
1542
1543
1544 // ===========
1545 // cublasXscal (float)
1546 // ===========
1547
1568
#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
/// \brief      Specialization of \c cublasXscal for \c float data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasSscal with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTscal<float> is called and
///             its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, alpha, incx
///                     Forwarded unchanged to the selected implementation.
/// \param      x       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasSscal.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXscal<float>(
        cublasHandle_t handle,
        int n,
        const float* RESTRICT alpha,
        float* RESTRICT x,
        int incx)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTscal<float>(
                n, alpha, x, incx);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasSscal(handle, n, alpha, x, incx);
    #endif
}
#endif
1597
1598
1599 // ===========
1600 // cublasXscal (double)
1601 // ===========
1602
1623
#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
/// \brief      Specialization of \c cublasXscal for \c double data.
///
/// \details    When \c USE_CUBLAS is defined as 1, the call is delegated to
///             Nvidia's \c cublasDscal with the same arguments. Otherwise
///             the in-house \c cublas_impl::cublasTscal<double> is called
///             and its \c cudaError_t is translated to a \c cublasStatus_t.
///
/// \param[in]  handle  cuBLAS handle. Only used on the \c USE_CUBLAS path.
/// \param[in]  n, alpha, incx
///                     Forwarded unchanged to the selected implementation.
/// \param      x       Non-const vector pointer, forwarded unchanged.
/// \return     On the cuBLAS path, the status returned by \c cublasDscal.
///             On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
///             implementation returned \c cudaSuccess, otherwise
///             \c CUBLAS_STATUS_INTERNAL_ERROR.
template<>
cublasStatus_t cublasXscal<double>(
        cublasHandle_t handle,
        int n,
        const double* RESTRICT alpha,
        double* RESTRICT x,
        int incx)
{
    #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
        // Use in-house implementation. The handle is not needed on this
        // path, so void it to avoid an unused-parameter warning.
        (void) handle;

        cudaError_t error = cublas_impl::cublasTscal<double>(
                n, alpha, x, incx);

        // Any CUDA error is reported to the caller as an internal error.
        if (error != cudaSuccess)
        {
            return CUBLAS_STATUS_INTERNAL_ERROR;
        }

        return CUBLAS_STATUS_SUCCESS;

    #else
        // Use Nvidia's CuBLAS
        return cublasDscal(handle, n, alpha, x, incx);
    #endif
}
#endif
1652
1653} // namespace cublas_api
#define RESTRICT
cublasStatus_t cublasDcopy(cublasHandle_t handle, int n, const double *x, int incx, double *y, int incy)
Definition of CUDA's cublasDcopy function using dynamically loaded cublas library.
cublasStatus_t cublasSscal(cublasHandle_t handle, int n, const float *alpha, float *x, int incx)
Definition of CUDA's cublasSscal function using dynamically loaded cublas library.
cublasStatus_t cublasDscal(cublasHandle_t handle, int n, const double *alpha, double *x, int incx)
Definition of CUDA's cublasDscal function using dynamically loaded cublas library.
cublasStatus_t cublasSdot(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)
Definition of CUDA's cublasSdot function using dynamically loaded cublas library.
cublasStatus_t cublasSnrm2(cublasHandle_t handle, int n, const float *x, int incx, float *result)
Definition of CUDA's cublasSnrm2 function using dynamically loaded cublas library.
cublasStatus_t cublasSaxpy(cublasHandle_t handle, int n, const float *alpha, const float *x, int incx, float *y, int incy)
Definition of CUDA's cublasSaxpy function using dynamically loaded cublas library.
cublasStatus_t cublasDaxpy(cublasHandle_t handle, int n, const double *alpha, const double *x, int incx, double *y, int incy)
Definition of CUDA's cublasDaxpy function using dynamically loaded cublas library.
cublasStatus_t cublasScopy(cublasHandle_t handle, int n, const float *x, int incx, float *y, int incy)
Definition of CUDA's cublasScopy function using dynamically loaded cublas library.
cublasStatus_t cublasDnrm2(cublasHandle_t handle, int n, const double *x, int incx, double *result)
Definition of CUDA's cublasDnrm2 function using dynamically loaded cublas library.
cublasStatus_t cublasDdot(cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)
Definition of CUDA's cublasDdot function using dynamically loaded cublas library.
A collection of templates that wrap cublas functions.
Definition cublas_api.cu:34
cublasStatus_t cublasXaxpy< double >(cublasHandle_t handle, int n, const double *RESTRICT alpha, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
Performs $\boldsymbol{y} \leftarrow \alpha \boldsymbol{x} + \boldsymbol{y}$ in double precision.
cublasStatus_t cublasXgemv< double >(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *RESTRICT alpha, const double *RESTRICT A, int lda, const double *RESTRICT x, int incx, const double *RESTRICT beta, double *RESTRICT y, int incy)
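Performs $\boldsymbol{y} \leftarrow \alpha \, \mathrm{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y}$ in double precision.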
cublasStatus_t cublasXdot< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, const float *RESTRICT y, int incy, float *RESTRICT result)
Performs the dot product $\boldsymbol{x}^{\intercal} \boldsymbol{y}$ in float (single) precision.
cublasStatus_t cublasXscal< float >(cublasHandle_t handle, int n, const float *RESTRICT alpha, float *RESTRICT x, int incx)
Performs $\boldsymbol{x} \leftarrow \alpha \boldsymbol{x}$ in float (single) precision.
cublasStatus_t cublasXcopy< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
Performs the copy $\boldsymbol{y} \leftarrow \boldsymbol{x}$ in double type.
cublasStatus_t cublasXdot< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, const double *RESTRICT y, int incy, double *RESTRICT result)
Performs the dot product $\boldsymbol{x}^{\intercal} \boldsymbol{y}$ in double precision.
cublasStatus_t cublasXaxpy< float >(cublasHandle_t handle, int n, const float *RESTRICT alpha, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
Performs $\boldsymbol{y} \leftarrow \alpha \boldsymbol{x} + \boldsymbol{y}$ in float (single) precision.
cublasStatus_t cublasXnrm2< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT result)
Computes the Euclidean norm $\Vert \boldsymbol{x} \Vert_2$ in double precision.
cublasStatus_t cublasXnrm2< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT result)
Computes the Euclidean norm $\Vert \boldsymbol{x} \Vert_2$ in float (single) precision.
cublasStatus_t cublasXgemv< float >(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *RESTRICT alpha, const float *RESTRICT A, int lda, const float *RESTRICT x, int incx, const float *RESTRICT beta, float *RESTRICT y, int incy)
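Performs $\boldsymbol{y} \leftarrow \alpha \, \mathrm{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y}$ in float (single) precision.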
cublasStatus_t cublasXscal< double >(cublasHandle_t handle, int n, const double *RESTRICT alpha, double *RESTRICT x, int incx)
Performs $\boldsymbol{x} \leftarrow \alpha \boldsymbol{x}$ in double precision.
cublasStatus_t cublasXcopy< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
Performs the copy $\boldsymbol{y} \leftarrow \boldsymbol{x}$ in float type.