17#include <cuda_runtime.h>
18#include "../_cu_definitions/cu_types.h"
21#include "../_cu_arithmetics/cu_arithmetics.h"
78 typename DataType,
typename ComputeType,
unsigned int block_size>
93 const unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
98 __shared__ DataType x_shared[block_size];
103 ComputeType sum = 0.0f;
106 const unsigned int num_blocks = (n + block_size - 1) / block_size;
111 for (
unsigned long int block_counter = 0;
112 block_counter < num_blocks;
117 unsigned long int j = threadIdx.x + \
118 block_counter *
static_cast<unsigned long int>(block_size);
124 x_shared[threadIdx.x] = x[j * incx];
129 x_shared[threadIdx.x] = \
130 cu_arithmetics::cast<ComputeType, DataType>(0.0f);
140 for (
unsigned int e = 0; e < block_size; ++e)
145 unsigned long int e_j = e + \
146 block_counter *
static_cast<unsigned long int>(block_size);
151 if ((i < m) && (e_j < n))
159 A[i * lda + e_j]) * \
160 cu_arithmetics::cast<DataType, ComputeType>(
166 A[i + e_j * lda]) * \
167 cu_arithmetics::cast<DataType, ComputeType>(
184 cu_arithmetics::add<DataType>(
222 template <
typename DataType>
230 int i = threadIdx.x + blockIdx.x * blockDim.x;
234 y[i * incy] = x[i * incx];
266 template <
typename DataType>
269 const DataType alpha,
275 const int i = threadIdx.x + blockIdx.x * blockDim.x;
280 cu_arithmetics::add<DataType>(
315 typename DataType,
typename ComputeType,
unsigned int block_size>
326 __shared__ ComputeType partial_sum[block_size];
328 const int tid = threadIdx.x;
329 int i = blockIdx.x * blockDim.x + threadIdx.x;
331 ComputeType sum =
static_cast<ComputeType
>(0.0f);
335 cu_arithmetics::cast<DataType, ComputeType>(y[i * incy]);
337 i += blockDim.x * gridDim.x;
340 partial_sum[tid] = sum;
345 for (
int stride = blockDim.x / 2; stride > 0; stride >>= 1)
349 partial_sum[tid] += partial_sum[tid + stride];
357 atomicAdd(result, partial_sum[0]);
384 typename DataType,
typename ComputeType,
unsigned int block_size>
393 __shared__ ComputeType partial_sum[block_size];
395 const int tid = threadIdx.x;
396 int i = blockIdx.x * blockDim.x + threadIdx.x;
398 ComputeType sum =
static_cast<ComputeType
>(0.0f);
404 i += blockDim.x * gridDim.x;
407 partial_sum[tid] = sum;
412 for (
int stride = blockDim.x / 2; stride > 0; stride >>= 1)
416 partial_sum[tid] += partial_sum[tid + stride];
424 atomicAdd(result, partial_sum[0]);
452 template <
typename DataType>
455 const DataType alpha,
459 const int i = threadIdx.x + blockIdx.x * blockDim.x;
475#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
493#if defined(USE_CUDA_FP8_e4m3) && (USE_CUDA_FP8_e4m3 == 1)
511#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
529#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
532 __nv_bfloat16, float, 640>(
536 const __nv_bfloat16 alpha,
541 const __nv_bfloat16 beta,
547#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
548#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
567#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
568#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
571 double, double, 640>(
587#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
589 __global__
void cublas_impl_kernels::cublasTcopy_kernel<__half>(
598#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
600 __global__
void cublas_impl_kernels::cublasTcopy_kernel<__nv_bfloat16>(
609#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
610#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
612 __global__
void cublas_impl_kernels::cublasTcopy_kernel<float>(
622#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
623#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
625 __global__
void cublas_impl_kernels::cublasTcopy_kernel<double>(
635#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
637 __global__
void cublas_impl_kernels::cublasTaxpy_kernel<__half>(
647#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
649 __global__
void cublas_impl_kernels::cublasTaxpy_kernel<__nv_bfloat16>(
651 const __nv_bfloat16 alpha,
659#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
660#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
662 __global__
void cublas_impl_kernels::cublasTaxpy_kernel<float>(
673#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
674#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
676 __global__
void cublas_impl_kernels::cublasTaxpy_kernel<double>(
687#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
700#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
703 __nv_bfloat16, float, 256>(
713#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
714#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
728#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
729#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
732 double, double, 256>(
743#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
754#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
757 __nv_bfloat16, float, 256>(
765#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
766#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
778#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
779#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
782 double, double, 256>(
791#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
793 __global__
void cublas_impl_kernels::cublasTscal_kernel<__half>(
801#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
803 __global__
void cublas_impl_kernels::cublasTscal_kernel<__nv_bfloat16>(
805 const __nv_bfloat16 alpha,
811#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
812#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
814 __global__
void cublas_impl_kernels::cublasTscal_kernel<float>(
823#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
824#if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
826 __global__
void cublas_impl_kernels::cublasTscal_kernel<double>(
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
Templated kernel code for implenentations of several BLAS-type functions in CUDA.
__global__ void cublasTscal_kernel(const int n, const DataType alpha, DataType *RESTRICT x, const int incx)
Performs .
__global__ void cublasTaxpy_kernel(const int n, const DataType alpha, const DataType *RESTRICT x, const int incx, DataType *RESTRICT y, const int incy)
Performs .
__global__ void cublasTnrm2_kernel(const int n, const DataType *RESTRICT x, const int incx, ComputeType *RESTRICT result)
Computes .
__global__ void cublasTcopy_kernel(const int n, const DataType *RESTRICT x, const int incx, DataType *RESTRICT y, const int incy)
Performs .
__global__ void cublasTdot_kernel(const int n, const DataType *RESTRICT x, const int incx, const DataType *RESTRICT y, const int incy, ComputeType *RESTRICT result)
Computes .
__global__ void cublasTgemv_kernel(const bool trans, const int m, const int n, const DataType alpha, const DataType *RESTRICT A, const int lda, const DataType *RESTRICT x, const int incx, const DataType beta, DataType *RESTRICT y, const int incy)
Performs the operation .