18#include "../_cu_arithmetics/cu_arithmetics.h"
19#include <cuda_runtime.h>
20#include "../_cu_definitions/cu_types.h"
76 template <
typename DataType,
typename ComputeType>
78 cublasOperation_t trans,
95 if (trans == CUBLAS_OP_N)
102 else if (trans == CUBLAS_OP_T)
111 throw std::invalid_argument(
112 "'trans' argument must be CUBLAS_OP_N or CUBLAS_OP_T.");
118 const int threads_per_block = 640;
119 dim3 dim_block(threads_per_block);
123 int blocks_per_grid = \
124 (y_size + threads_per_block - 1) / threads_per_block;
125 dim3 dim_grid(blocks_per_grid);
129 DataType, ComputeType, threads_per_block>
130 <<<dim_grid, dim_block>>>(
131 trans_, y_size, x_size, *alpha, A, lda, x, incx, *beta, y,
134 cudaError_t error = cudaDeviceSynchronize();
168 template <
typename DataType>
177 const int threads_per_block = 256;
178 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
181 cublas_impl_kernels::cublasTcopy_kernel<DataType><<<
182 blocks_per_grid, threads_per_block>>>(
183 n, x, incx, y, incy);
185 cudaError_t error = cudaDeviceSynchronize();
222 template <
typename DataType>
232 const int threads_per_block = 256;
233 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
236 cublas_impl_kernels::cublasTaxpy_kernel<DataType><<<
237 blocks_per_grid, threads_per_block>>>(
238 n, *alpha, x, incx, y, incy);
240 cudaError_t error = cudaDeviceSynchronize();
276 template <
typename DataType,
typename ComputeType>
286 ComputeType *device_result;
287 cudaMalloc(&device_result,
sizeof(ComputeType));
288 cudaMemset(device_result,
static_cast<ComputeType
>(0.0f),
289 sizeof(ComputeType));
292 const int threads_per_block = 256;
293 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
297 DataType, ComputeType, threads_per_block><<<
298 blocks_per_grid, threads_per_block>>>(
299 n, x, incx, y, incy, device_result);
301 cudaError_t error = cudaDeviceSynchronize();
304 ComputeType host_result_comp;
305 cudaMemcpy(&host_result_comp, device_result,
sizeof(ComputeType),
306 cudaMemcpyDeviceToHost);
343 template <
typename DataType,
typename ComputeType>
351 ComputeType *device_result;
352 cudaMalloc(&device_result,
sizeof(ComputeType));
353 cudaMemset(device_result,
static_cast<ComputeType
>(0.0f),
354 sizeof(ComputeType));
357 const int threads_per_block = 256;
358 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
362 DataType, ComputeType, threads_per_block><<<
363 blocks_per_grid, threads_per_block>>>(
364 n, x, incx, device_result);
366 cudaError_t error = cudaDeviceSynchronize();
369 ComputeType host_result_comp;
370 cudaMemcpy(&host_result_comp, device_result,
sizeof(ComputeType),
371 cudaMemcpyDeviceToHost);
410 template <
typename DataType>
418 int threads_per_block = 256;
419 int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
422 cublas_impl_kernels::cublasTscal_kernel<DataType><<<
423 blocks_per_grid, threads_per_block>>>(
426 cudaError_t error = cudaDeviceSynchronize();
439#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
441 cudaError_t cublas_impl::cublasTgemv<__nv_fp8_e5m2, float>(
442 cublasOperation_t trans,
456#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
458 cudaError_t cublas_impl::cublasTgemv<__nv_fp8_e4m3, float>(
459 cublasOperation_t trans,
473#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
475 cudaError_t cublas_impl::cublasTgemv<__half, float>(
476 cublasOperation_t trans,
490#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
492 cudaError_t cublas_impl::cublasTgemv<__nv_bfloat16, float>(
493 cublasOperation_t trans,
496 const __nv_bfloat16*
RESTRICT alpha,
507#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
508#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
510 cudaError_t cublas_impl::cublasTgemv<float, float>(
511 cublasOperation_t trans,
526#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
527#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
529 cudaError_t cublas_impl::cublasTgemv<double, double>(
530 cublasOperation_t trans,
545#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
547 cudaError_t cublas_impl::cublasTcopy<__half>(
556#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
558 cudaError_t cublas_impl::cublasTcopy<__nv_bfloat16>(
567#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
568#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
570 cudaError_t cublas_impl::cublasTcopy<float>(
580#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
581#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
583 cudaError_t cublas_impl::cublasTcopy<double>(
593#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
595 cudaError_t cublas_impl::cublasTaxpy<__half>(
605#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
607 cudaError_t cublas_impl::cublasTaxpy<__nv_bfloat16>(
609 const __nv_bfloat16*
RESTRICT alpha,
617#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
618#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
620 cudaError_t cublas_impl::cublasTaxpy<float>(
631#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
632#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
634 cudaError_t cublas_impl::cublasTaxpy<double>(
645#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
647 cudaError_t cublas_impl::cublasTdot<__half, float>(
657#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
659 cudaError_t cublas_impl::cublasTdot<__nv_bfloat16, float>(
669#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
670#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
672 cudaError_t cublas_impl::cublasTdot<float, float>(
683#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
684#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
686 cudaError_t cublas_impl::cublasTdot<double, double>(
697#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
699 cudaError_t cublas_impl::cublasTnrm2<__half, float>(
707#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
709 cudaError_t cublas_impl::cublasTnrm2<__nv_bfloat16, float>(
717#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
718#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
720 cudaError_t cublas_impl::cublasTnrm2<float, float>(
729#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
730#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
732 cudaError_t cublas_impl::cublasTnrm2<double, double>(
741#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
743 cudaError_t cublas_impl::cublasTscal<__half>(
751#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
753 cudaError_t cublas_impl::cublasTscal<__nv_bfloat16>(
755 const __nv_bfloat16*
RESTRICT alpha,
761#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
762#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
764 cudaError_t cublas_impl::cublasTscal<float>(
773#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
774#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
776 cudaError_t cublas_impl::cublasTscal<double>(
cudaError_t cudaFree(void *devPtr)
Definition of CUDA's cudaFree function using dynamically loaded cudart library.
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind)
Definition of CUDA's cudaMemcpy function using dynamically loaded cudart library.
cudaError_t cudaMalloc(void **devPtr, size_t size)
Definition of CUDA's cudaMalloc function using dynamically loaded cudart library.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
__global__ void cublasTnrm2_kernel(const int n, const DataType *RESTRICT x, const int incx, ComputeType *RESTRICT result)
Computes .
__global__ void cublasTdot_kernel(const int n, const DataType *RESTRICT x, const int incx, const DataType *RESTRICT y, const int incy, ComputeType *RESTRICT result)
Computes .
__global__ void cublasTgemv_kernel(const bool trans, const int m, const int n, const DataType alpha, const DataType *RESTRICT A, const int lda, const DataType *RESTRICT x, const int incx, const DataType beta, DataType *RESTRICT y, const int incy)
Performs the operation .
Templated implenentations of several BLAS-type functions in CUDA.
cudaError_t cublasTaxpy(int n, const DataType *RESTRICT alpha, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy)
Performs .
cudaError_t cublasTgemv(cublasOperation_t trans, int m, int n, const DataType *RESTRICT alpha, const DataType *RESTRICT A, int lda, const DataType *RESTRICT x, int incx, const DataType *RESTRICT beta, DataType *RESTRICT y, int incy)
Performs .
cudaError_t cublasTdot(int n, const DataType *RESTRICT x, int incx, const DataType *RESTRICT y, int incy, DataType *RESTRICT result)
Computes .
cudaError_t cublasTcopy(int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy)
Performs .
cudaError_t cublasTnrm2(int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT result)
Computes .
cudaError_t cublasTscal(int n, const DataType *RESTRICT alpha, DataType *RESTRICT x, int incx)
Performs .