17#include <cuda_runtime.h>
18#include "../_cu_definitions/cu_types.h"
80 #if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
82 cublasStatus_t cublasXgemv<__nv_fp8_e5m2>(
83 cublasHandle_t handle,
84 cublasOperation_t trans,
100 cudaError_t error = cublas_impl::cublasTgemv<__nv_fp8_e5m2, float>(
101 trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
103 if (error != cudaSuccess)
105 return CUBLAS_STATUS_SUCCESS;
109 return CUBLAS_STATUS_INTERNAL_ERROR;
160 #if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
162 cublasStatus_t cublasXgemv<__nv_fp8_e4m3>(
163 cublasHandle_t handle,
164 cublasOperation_t trans,
180 cudaError_t error = cublas_impl::cublasTgemv<__nv_fp8_e4m3, float>(
181 trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
183 if (error != cudaSuccess)
185 return CUBLAS_STATUS_SUCCESS;
189 return CUBLAS_STATUS_INTERNAL_ERROR;
240 #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
242 cublasStatus_t cublasXgemv<__half>(
243 cublasHandle_t handle,
244 cublasOperation_t trans,
260 cudaError_t error = cublas_impl::cublasTgemv<__half, float>(
261 trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
263 if (error != cudaSuccess)
265 return CUBLAS_STATUS_SUCCESS;
269 return CUBLAS_STATUS_INTERNAL_ERROR;
320 #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
322 cublasStatus_t cublasXgemv<__nv_bfloat16>(
323 cublasHandle_t handle,
324 cublasOperation_t trans,
327 const __nv_bfloat16*
RESTRICT alpha,
340 cudaError_t error = cublas_impl::cublasTgemv<__nv_bfloat16, float>(
341 trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
343 if (error != cudaSuccess)
345 return CUBLAS_STATUS_SUCCESS;
349 return CUBLAS_STATUS_INTERNAL_ERROR;
397 #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
400 cublasHandle_t handle,
401 cublasOperation_t trans,
414 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
416 cudaError_t error = cublas_impl::cublasTgemv<float, float>(
417 trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
419 if (error != cudaSuccess)
421 return CUBLAS_STATUS_SUCCESS;
425 return CUBLAS_STATUS_INTERNAL_ERROR;
430 return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx,
479 #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
482 cublasHandle_t handle,
483 cublasOperation_t trans,
495 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
497 cudaError_t error = cublas_impl::cublasTgemv<double, double>(
498 trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
500 if (error != cudaSuccess)
502 return CUBLAS_STATUS_SUCCESS;
506 return CUBLAS_STATUS_INTERNAL_ERROR;
511 return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx,
547 #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
549 cublasStatus_t cublasXcopy<__half>(
550 cublasHandle_t handle,
561 cudaError_t error = cublas_impl::cublasTcopy<__half>(
562 n, x, incx, y, incy);
564 if (error != cudaSuccess)
566 return CUBLAS_STATUS_SUCCESS;
570 return CUBLAS_STATUS_INTERNAL_ERROR;
605 #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
607 cublasStatus_t cublasXcopy<__nv_bfloat16>(
608 cublasHandle_t handle,
619 cudaError_t error = cublas_impl::cublasTcopy<__nv_bfloat16>(
620 n, x, incx, y, incy);
622 if (error != cudaSuccess)
624 return CUBLAS_STATUS_SUCCESS;
628 return CUBLAS_STATUS_INTERNAL_ERROR;
658 #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
661 cublasHandle_t handle,
668 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
670 cudaError_t error = cublas_impl::cublasTcopy<float>(
671 n, x, incx, y, incy);
673 if (error != cudaSuccess)
675 return CUBLAS_STATUS_SUCCESS;
679 return CUBLAS_STATUS_INTERNAL_ERROR;
714 #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
717 cublasHandle_t handle,
724 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
726 cudaError_t error = cublas_impl::cublasTcopy<double>(
727 n, x, incx, y, incy);
729 if (error != cudaSuccess)
731 return CUBLAS_STATUS_SUCCESS;
735 return CUBLAS_STATUS_INTERNAL_ERROR;
775 #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
777 cublasStatus_t cublasXaxpy<__half>(
778 cublasHandle_t handle,
790 cudaError_t error = cublas_impl::cublasTaxpy<__half>(
791 n, alpha, x, incx, y, incy);
793 if (error != cudaSuccess)
795 return CUBLAS_STATUS_SUCCESS;
799 return CUBLAS_STATUS_INTERNAL_ERROR;
834 #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
836 cublasStatus_t cublasXaxpy<__nv_bfloat16>(
837 cublasHandle_t handle,
839 const __nv_bfloat16*
RESTRICT alpha,
849 cudaError_t error = cublas_impl::cublasTaxpy<__nv_bfloat16>(
850 n, alpha, x, incx, y, incy);
852 if (error != cudaSuccess)
854 return CUBLAS_STATUS_SUCCESS;
858 return CUBLAS_STATUS_INTERNAL_ERROR;
893 #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
896 cublasHandle_t handle,
904 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
906 cudaError_t error = cublas_impl::cublasTaxpy<float>(
907 n, alpha, x, incx, y, incy);
909 if (error != cudaSuccess)
911 return CUBLAS_STATUS_SUCCESS;
915 return CUBLAS_STATUS_INTERNAL_ERROR;
919 return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
954 #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
957 cublasHandle_t handle,
965 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
967 cudaError_t error = cublas_impl::cublasTaxpy<double>(
968 n, alpha, x, incx, y, incy);
970 if (error != cudaSuccess)
972 return CUBLAS_STATUS_SUCCESS;
976 return CUBLAS_STATUS_INTERNAL_ERROR;
980 return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
1015 #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
1017 cublasStatus_t cublasXdot<__half>(
1018 cublasHandle_t handle,
1030 cudaError_t error = cublas_impl::cublasTdot<__half, float>(
1031 n, x, incx, y, incy, result);
1033 if (error != cudaSuccess)
1035 return CUBLAS_STATUS_SUCCESS;
1039 return CUBLAS_STATUS_INTERNAL_ERROR;
1074 #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
1076 cublasStatus_t cublasXdot<__nv_bfloat16>(
1077 cublasHandle_t handle,
1089 cudaError_t error = cublas_impl::cublasTdot<__nv_bfloat16, float>(
1090 n, x, incx, y, incy, result);
1092 if (error != cudaSuccess)
1094 return CUBLAS_STATUS_SUCCESS;
1098 return CUBLAS_STATUS_INTERNAL_ERROR;
1133 #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
1136 cublasHandle_t handle,
1144 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
1146 cudaError_t error = cublas_impl::cublasTdot<float, float>(
1147 n, x, incx, y, incy, result);
1149 if (error != cudaSuccess)
1151 return CUBLAS_STATUS_SUCCESS;
1155 return CUBLAS_STATUS_INTERNAL_ERROR;
1159 return cublasSdot(handle, n, x, incx, y, incy, result);
1194 #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
1197 cublasHandle_t handle,
1205 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
1207 cudaError_t error = cublas_impl::cublasTdot<double, double>(
1208 n, x, incx, y, incy, result);
1210 if (error != cudaSuccess)
1212 return CUBLAS_STATUS_SUCCESS;
1216 return CUBLAS_STATUS_INTERNAL_ERROR;
1220 return cublasDdot(handle, n, x, incx, y, incy, result);
1250 #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
1252 cublasStatus_t cublasXnrm2<__half>(
1253 cublasHandle_t handle,
1263 cudaError_t error = cublas_impl::cublasTnrm2<__half, float>(
1264 n, x, incx, result);
1266 if (error != cudaSuccess)
1268 return CUBLAS_STATUS_SUCCESS;
1272 return CUBLAS_STATUS_INTERNAL_ERROR;
1302 #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
1304 cublasStatus_t cublasXnrm2<__nv_bfloat16>(
1305 cublasHandle_t handle,
1315 cudaError_t error = cublas_impl::cublasTnrm2<__nv_bfloat16, float>(
1316 n, x, incx, result);
1318 if (error != cudaSuccess)
1320 return CUBLAS_STATUS_SUCCESS;
1324 return CUBLAS_STATUS_INTERNAL_ERROR;
1354 #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
1357 cublasHandle_t handle,
1363 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
1365 cudaError_t error = cublas_impl::cublasTnrm2<float, float>(
1366 n, x, incx, result);
1368 if (error != cudaSuccess)
1370 return CUBLAS_STATUS_SUCCESS;
1374 return CUBLAS_STATUS_INTERNAL_ERROR;
1408 #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
1411 cublasHandle_t handle,
1417 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
1419 cudaError_t error = cublas_impl::cublasTnrm2<double, double>(
1420 n, x, incx, result);
1422 if (error != cudaSuccess)
1424 return CUBLAS_STATUS_SUCCESS;
1428 return CUBLAS_STATUS_INTERNAL_ERROR;
1463 #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
1465 cublasStatus_t cublasXscal<__half>(
1466 cublasHandle_t handle,
1476 cudaError_t error = cublas_impl::cublasTscal<__half>(
1479 if (error != cudaSuccess)
1481 return CUBLAS_STATUS_SUCCESS;
1485 return CUBLAS_STATUS_INTERNAL_ERROR;
1516 #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
1518 cublasStatus_t cublasXscal<__nv_bfloat16>(
1519 cublasHandle_t handle,
1521 const __nv_bfloat16*
RESTRICT alpha,
1529 cudaError_t error = cublas_impl::cublasTscal<__nv_bfloat16>(
1532 if (error != cudaSuccess)
1534 return CUBLAS_STATUS_SUCCESS;
1538 return CUBLAS_STATUS_INTERNAL_ERROR;
1569 #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
1572 cublasHandle_t handle,
1578 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
1580 cudaError_t error = cublas_impl::cublasTscal<float>(
1583 if (error != cudaSuccess)
1585 return CUBLAS_STATUS_SUCCESS;
1589 return CUBLAS_STATUS_INTERNAL_ERROR;
1624 #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
1627 cublasHandle_t handle,
1633 #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
1635 cudaError_t error = cublas_impl::cublasTscal<double>(
1638 if (error != cudaSuccess)
1640 return CUBLAS_STATUS_SUCCESS;
1644 return CUBLAS_STATUS_INTERNAL_ERROR;
cublasStatus_t cublasDcopy(cublasHandle_t handle, int n, const double *x, int incx, double *y, int incy)
Definition of CUDA's cublasDcopy function using dynamically loaded cublas library.
cublasStatus_t cublasSscal(cublasHandle_t handle, int n, const float *alpha, float *x, int incx)
Definition of CUDA's cublasSscal function using dynamically loaded cublas library.
cublasStatus_t cublasDscal(cublasHandle_t handle, int n, const double *alpha, double *x, int incx)
Definition of CUDA's cublasDscal function using dynamically loaded cublas library.
cublasStatus_t cublasSdot(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)
Definition of CUDA's cublasSdot function using dynamically loaded cublas library.
cublasStatus_t cublasSnrm2(cublasHandle_t handle, int n, const float *x, int incx, float *result)
Definition of CUDA's cublasSnrm2 function using dynamically loaded cublas library.
cublasStatus_t cublasSaxpy(cublasHandle_t handle, int n, const float *alpha, const float *x, int incx, float *y, int incy)
Definition of CUDA's cublasSaxpy function using dynamically loaded cublas library.
cublasStatus_t cublasDaxpy(cublasHandle_t handle, int n, const double *alpha, const double *x, int incx, double *y, int incy)
Definition of CUDA's cublasDaxpy function using dynamically loaded cublas library.
cublasStatus_t cublasScopy(cublasHandle_t handle, int n, const float *x, int incx, float *y, int incy)
Definition of CUDA's cublasScopy function using dynamically loaded cublas library.
cublasStatus_t cublasDnrm2(cublasHandle_t handle, int n, const double *x, int incx, double *result)
Definition of CUDA's cublasDnrm2 function using dynamically loaded cublas library.
cublasStatus_t cublasDdot(cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)
Definition of CUDA's cublasDdot function using dynamically loaded cublas library.
A collection of templates that wrap cuBLAS functions.
cublasStatus_t cublasXaxpy< double >(cublasHandle_t handle, int n, const double *RESTRICT alpha, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y}$ in double precision.
cublasStatus_t cublasXgemv< double >(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *RESTRICT alpha, const double *RESTRICT A, int lda, const double *RESTRICT x, int incx, const double *RESTRICT beta, double *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \alpha \, \mathrm{op}(\mathbf{A}) \, \boldsymbol{x} + \beta \boldsymbol{y}$.
cublasStatus_t cublasXdot< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, const float *RESTRICT y, int incy, float *RESTRICT result)
Performs $\mathrm{result} = \boldsymbol{x}^{T} \boldsymbol{y}$ in float precision.
cublasStatus_t cublasXscal< float >(cublasHandle_t handle, int n, const float *RESTRICT alpha, float *RESTRICT x, int incx)
Performs $\boldsymbol{x} = \alpha \boldsymbol{x}$ in float precision.
cublasStatus_t cublasXcopy< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \boldsymbol{x}$ in double type.
cublasStatus_t cublasXdot< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, const double *RESTRICT y, int incy, double *RESTRICT result)
Performs $\mathrm{result} = \boldsymbol{x}^{T} \boldsymbol{y}$ in double precision.
cublasStatus_t cublasXaxpy< float >(cublasHandle_t handle, int n, const float *RESTRICT alpha, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y}$ in float precision.
cublasStatus_t cublasXnrm2< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT result)
Performs $\mathrm{result} = \lVert \boldsymbol{x} \rVert_{2}$ in double precision.
cublasStatus_t cublasXnrm2< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT result)
Performs $\mathrm{result} = \lVert \boldsymbol{x} \rVert_{2}$ in float precision.
cublasStatus_t cublasXgemv< float >(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *RESTRICT alpha, const float *RESTRICT A, int lda, const float *RESTRICT x, int incx, const float *RESTRICT beta, float *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \alpha \, \mathrm{op}(\mathbf{A}) \, \boldsymbol{x} + \beta \boldsymbol{y}$.
cublasStatus_t cublasXscal< double >(cublasHandle_t handle, int n, const double *RESTRICT alpha, double *RESTRICT x, int incx)
Performs $\boldsymbol{x} = \alpha \boldsymbol{x}$ in double precision.
cublasStatus_t cublasXcopy< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \boldsymbol{x}$ in float type.