A collection of templates that wrap cuBLAS functions. More...

Functions
template<>
cublasStatus_t	cublasXgemv< float > (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *RESTRICT alpha, const float *RESTRICT A, int lda, const float *RESTRICT x, int incx, const float *RESTRICT beta, float *RESTRICT y, int incy)
	Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

template<>
cublasStatus_t	cublasXgemv< double > (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *RESTRICT alpha, const double *RESTRICT A, int lda, const double *RESTRICT x, int incx, const double *RESTRICT beta, double *RESTRICT y, int incy)
	Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

template<>
cublasStatus_t	cublasXcopy< float > (cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
	Performs \( \boldsymbol{y} = \boldsymbol{x} \) in `float` type.

template<>
cublasStatus_t	cublasXcopy< double > (cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
	Performs \( \boldsymbol{y} = \boldsymbol{x} \) in `double` type.

template<>
cublasStatus_t	cublasXaxpy< float > (cublasHandle_t handle, int n, const float *RESTRICT alpha, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
	Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y} \) on `float` precision.

template<>
cublasStatus_t	cublasXaxpy< double > (cublasHandle_t handle, int n, const double *RESTRICT alpha, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
	Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y} \) on `double` precision.

template<>
cublasStatus_t	cublasXdot< float > (cublasHandle_t handle, int n, const float *RESTRICT x, int incx, const float *RESTRICT y, int incy, float *RESTRICT result)
	Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{y} \) on `float` precision.

template<>
cublasStatus_t	cublasXdot< double > (cublasHandle_t handle, int n, const double *RESTRICT x, int incx, const double *RESTRICT y, int incy, double *RESTRICT result)
	Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{y} \) on `double` precision.

template<>
cublasStatus_t	cublasXnrm2< float > (cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT result)
	Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{x} \) on `float` precision.

template<>
cublasStatus_t	cublasXnrm2< double > (cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT result)
	Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{x} \) on `double` precision.

template<>
cublasStatus_t	cublasXscal< float > (cublasHandle_t handle, int n, const float *RESTRICT alpha, float *RESTRICT x, int incx)
	Performs \( \boldsymbol{x} = \alpha \boldsymbol{x} \) on `float` precision.

template<>
cublasStatus_t	cublasXscal< double > (cublasHandle_t handle, int n, const double *RESTRICT alpha, double *RESTRICT x, int incx)
	Performs \( \boldsymbol{x} = \alpha \boldsymbol{x} \) on `double` precision.

template<typename DataType >
cublasStatus_t	cublasXgemv (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const DataType *RESTRICT alpha, const DataType *RESTRICT A, int lda, const DataType *RESTRICT x, int incx, const DataType *RESTRICT beta, DataType *RESTRICT y, int incy)

template<typename DataType >
cublasStatus_t	cublasXcopy (cublasHandle_t handle, int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy)

template<typename DataType >
cublasStatus_t	cublasXaxpy (cublasHandle_t handle, int n, const DataType *RESTRICT alpha, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy)

template<typename DataType >
cublasStatus_t	cublasXdot (cublasHandle_t handle, int n, const DataType *RESTRICT x, int incx, const DataType *RESTRICT y, int incy, DataType *RESTRICT result)

template<typename DataType >
cublasStatus_t	cublasXnrm2 (cublasHandle_t handle, int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT result)

template<typename DataType >
cublasStatus_t	cublasXscal (cublasHandle_t handle, int n, const DataType *RESTRICT alpha, DataType *RESTRICT x, int incx)

Detailed Description

A collection of templates that wrap cuBLAS functions.

Note: The implementation in the .cu file is wrapped inside a namespace block. This is not necessary in general; however, it is needed to work around an error from older gcc compilers (a gcc bug), which complain "no instance of function template matches the argument list const float".

Function Documentation

◆ cublasXaxpy()

template<typename DataType >

cublasStatus_t cublas_api::cublasXaxpy	(	cublasHandle_t	handle,
		int	n,
		const DataType *RESTRICT	alpha,
		const DataType *RESTRICT	x,
		int	incx,
		DataType *RESTRICT	y,
		int	incy
	)

Referenced by cuVectorOperations< DataType >::subtract_scaled_vector().

Here is the caller graph for this function:

◆ cublasXaxpy< double >()

template<>

cublasStatus_t cublas_api::cublasXaxpy< double >	(	cublasHandle_t	handle,
		int	n,
		const double *RESTRICT	alpha,
		const double *RESTRICT	x,
		int	incx,
		double *RESTRICT	y,
		int	incy
	)

Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y} \) on double precision.

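This function is the double-precision specialization: it calls cuBLAS's cublasDaxpy when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTaxpy<double> implementation otherwise.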

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXgemv

Definition at line 956 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTaxpy<double>(
                    n, alpha, x, incx, y, incy);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
        #endif
    }

References cublasDaxpy().

Here is the call graph for this function:

◆ cublasXaxpy< float >()

template<>

cublasStatus_t cublas_api::cublasXaxpy< float >	(	cublasHandle_t	handle,
		int	n,
		const float *RESTRICT	alpha,
		const float *RESTRICT	x,
		int	incx,
		float *RESTRICT	y,
		int	incy
	)

Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y} \) on __half precision.

This function is a half type implementation similar to cuBLAS's cublasSaxpy.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXgemv

Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y} \) on __nv_bfloat16 precision.

This function is a half type implementation similar to cuBLAS's cublasSaxpy.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXgemv

Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y} \) on float precision.

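This function is the single-precision specialization: it calls cuBLAS's cublasSaxpy when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTaxpy<float> implementation otherwise.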

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXgemv

Definition at line 895 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTaxpy<float>(
                    n, alpha, x, incx, y, incy);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
        #endif
    }

References cublasSaxpy().

Here is the call graph for this function:

◆ cublasXcopy()

template<typename DataType >

cublasStatus_t cublas_api::cublasXcopy	(	cublasHandle_t	handle,
		int	n,
		const DataType *RESTRICT	x,
		int	incx,
		DataType *RESTRICT	y,
		int	incy
	)

Referenced by cuVectorOperations< DataType >::copy_scaled_vector(), and cuVectorOperations< DataType >::copy_vector().

Here is the caller graph for this function:

◆ cublasXcopy< double >()

template<>

cublasStatus_t cublas_api::cublasXcopy< double >	(	cublasHandle_t	handle,
		int	n,
		const double *RESTRICT	x,
		int	incx,
		double *RESTRICT	y,
		int	incy
	)

Performs \( \boldsymbol{y} = \boldsymbol{x} \) in double type.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of the array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Definition at line 716 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTcopy<double>(
                    n, x, incx, y, incy);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasDcopy(handle, n, x, incx, y, incy);
        #endif
    }

References cublasDcopy().

Here is the call graph for this function:

◆ cublasXcopy< float >()

template<>

cublasStatus_t cublas_api::cublasXcopy< float >	(	cublasHandle_t	handle,
		int	n,
		const float *RESTRICT	x,
		int	incx,
		float *RESTRICT	y,
		int	incy
	)

Performs \( \boldsymbol{y} = \boldsymbol{x} \) in __half type.

This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHcopy (where H is used for __half type). As such, this function is implemented with CUDA, rather than from CuBLAS.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of the array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Performs \( \boldsymbol{y} = \boldsymbol{x} \) in __nv_bfloat16 type.

This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHcopy (where H is used for __nv_bfloat16 type). As such, this function is implemented with CUDA, rather than from CuBLAS.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of the array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Performs \( \boldsymbol{y} = \boldsymbol{x} \) in float type.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of the array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Definition at line 660 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTcopy<float>(
                    n, x, incx, y, incy);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasScopy(handle, n, x, incx, y, incy);
        #endif
    }

References cublasScopy().

Here is the call graph for this function:

◆ cublasXdot()

template<typename DataType >

cublasStatus_t cublas_api::cublasXdot	(	cublasHandle_t	handle,
		int	n,
		const DataType *RESTRICT	x,
		int	incx,
		const DataType *RESTRICT	y,
		int	incy,
		DataType *RESTRICT	result
	)

Referenced by cuVectorOperations< DataType >::inner_product().

Here is the caller graph for this function:

◆ cublasXdot< double >()

template<>

cublasStatus_t cublas_api::cublasXdot< double >	(	cublasHandle_t	handle,
		int	n,
		const double *RESTRICT	x,
		int	incx,
		const double *RESTRICT	y,
		int	incy,
		double *RESTRICT	result
	)

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{y} \) on double precision.

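This function is the double-precision specialization: it calls cuBLAS's cublasDdot when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTdot<double, double> implementation otherwise.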

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	y	Input vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).
[out]	result	The dot product of two vectors.

See also: cublasHaxpy

Definition at line 1196 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTdot<double, double>(
                    n, x, incx, y, incy, result);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasDdot(handle, n, x, incx, y, incy, result);
        #endif
    }

References cublasDdot().

Here is the call graph for this function:

◆ cublasXdot< float >()

template<>

cublasStatus_t cublas_api::cublasXdot< float >	(	cublasHandle_t	handle,
		int	n,
		const float *RESTRICT	x,
		int	incx,
		const float *RESTRICT	y,
		int	incy,
		float *RESTRICT	result
	)

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{y} \) on __half precision.

This function is a half type implementation similar to cuBLAS's cublasSdot.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	y	Input vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).
[out]	result	The dot product of two vectors.

See also: cublasHaxpy

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{y} \) on __nv_bfloat16 precision.

This function is a half type implementation similar to cuBLAS's cublasSdot.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	y	Input vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).
[out]	result	The dot product of two vectors.

See also: cublasHaxpy

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{y} \) on float precision.

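This function is the single-precision specialization: it calls cuBLAS's cublasSdot when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTdot<float, float> implementation otherwise.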

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	y	Input vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).
[out]	result	The dot product of two vectors.

See also: cublasHaxpy

Definition at line 1135 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTdot<float, float>(
                    n, x, incx, y, incy, result);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasSdot(handle, n, x, incx, y, incy, result);
        #endif
    }

References cublasSdot().

Here is the call graph for this function:

◆ cublasXgemv()

template<typename DataType >

cublasStatus_t cublas_api::cublasXgemv	(	cublasHandle_t	handle,
		cublasOperation_t	trans,
		int	m,
		int	n,
		const DataType *RESTRICT	alpha,
		const DataType *RESTRICT	A,
		int	lda,
		const DataType *RESTRICT	x,
		int	incx,
		const DataType *RESTRICT	beta,
		DataType *RESTRICT	y,
		int	incy
	)

◆ cublasXgemv< double >()

template<>

cublasStatus_t cublas_api::cublasXgemv< double >	(	cublasHandle_t	handle,
		cublasOperation_t	trans,
		int	m,
		int	n,
		const double *RESTRICT	alpha,
		const double *RESTRICT	A,
		int	lda,
		const double *RESTRICT	x,
		int	incx,
		const double *RESTRICT	beta,
		double *RESTRICT	y,
		int	incy
	)

Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

This function is a template wrapper for cublasDgemv.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	trans	If set to `CUBLAS_OP_N` or `CUBLAS_OP_T`, the operator \( \mathbf{A} \) is not transposed or transposed, respectively.
[in]	m	Number of rows of matrix \( \mathbf{A} \).
[in]	n	Number of columns of matrix \( \mathbf{A} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	A	Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering.
[in]	lda	Leading dimension of two-dimensional matrix \( \mathbf{A} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	beta	The scalar parameter \( \beta \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Definition at line 481 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTgemv<double, double>(
                    trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx,
                               beta, y, incy);
        #endif
    }

◆ cublasXgemv< float >()

template<>

cublasStatus_t cublas_api::cublasXgemv< float >	(	cublasHandle_t	handle,
		cublasOperation_t	trans,
		int	m,
		int	n,
		const float *RESTRICT	alpha,
		const float *RESTRICT	A,
		int	lda,
		const float *RESTRICT	x,
		int	incx,
		const float *RESTRICT	beta,
		float *RESTRICT	y,
		int	incy
	)

Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __nv_fp8_e5m2 type). As such, this function is implemented with CUDA, rather than from CuBLAS.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	trans	If set to `CUBLAS_OP_N` or `CUBLAS_OP_T`, the operator \( \mathbf{A} \) is not transposed or transposed, respectively.
[in]	m	Number of rows of matrix \( \mathbf{A} \).
[in]	n	Number of columns of matrix \( \mathbf{A} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	A	Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering.
[in]	lda	Leading dimension of two-dimensional matrix \( \mathbf{A} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	beta	The scalar parameter \( \beta \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __nv_fp8_e4m3 type). As such, this function is implemented with CUDA, rather than from CuBLAS.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	trans	If set to `CUBLAS_OP_N` or `CUBLAS_OP_T`, the operator \( \mathbf{A} \) is not transposed or transposed, respectively.
[in]	m	Number of rows of matrix \( \mathbf{A} \).
[in]	n	Number of columns of matrix \( \mathbf{A} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	A	Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering.
[in]	lda	Leading dimension of two-dimensional matrix \( \mathbf{A} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	beta	The scalar parameter \( \beta \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __half type). As such, this function is implemented with CUDA, rather than from CuBLAS.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	trans	If set to `CUBLAS_OP_N` or `CUBLAS_OP_T`, the operator \( \mathbf{A} \) is not transposed or transposed, respectively.
[in]	m	Number of rows of matrix \( \mathbf{A} \).
[in]	n	Number of columns of matrix \( \mathbf{A} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	A	Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering.
[in]	lda	Leading dimension of two-dimensional matrix \( \mathbf{A} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	beta	The scalar parameter \( \beta \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __nv_bfloat16 type). As such, this function is implemented with CUDA, rather than from CuBLAS.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	trans	If set to `CUBLAS_OP_N` or `CUBLAS_OP_T`, the operator \( \mathbf{A} \) is not transposed or transposed, respectively.
[in]	m	Number of rows of matrix \( \mathbf{A} \).
[in]	n	Number of columns of matrix \( \mathbf{A} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	A	Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering.
[in]	lda	Leading dimension of two-dimensional matrix \( \mathbf{A} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	beta	The scalar parameter \( \beta \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).

This function is a template wrapper for cublasSgemv.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	trans	If set to `CUBLAS_OP_N` or `CUBLAS_OP_T`, the operator \( \mathbf{A} \) is not transposed or transposed, respectively.
[in]	m	Number of rows of matrix \( \mathbf{A} \).
[in]	n	Number of columns of matrix \( \mathbf{A} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in]	A	Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering.
[in]	lda	Leading dimension of two-dimensional matrix \( \mathbf{A} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[in]	beta	The scalar parameter \( \beta \).
[out]	y	Output vector \( \boldsymbol{y} \) stored on GPU device.
[in]	incy	Stride between consecutive elements of \( \boldsymbol{y} \).

See also: cublasXaxpy

Definition at line 399 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTgemv<float, float>(
                    trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx,
                               beta, y, incy);
        #endif
    }

◆ cublasXnrm2()

template<typename DataType >

cublasStatus_t cublas_api::cublasXnrm2	(	cublasHandle_t	handle,
		int	n,
		const DataType *RESTRICT	x,
		int	incx,
		DataType *RESTRICT	result
	)

Referenced by cuVectorOperations< DataType >::euclidean_norm().

Here is the caller graph for this function:

◆ cublasXnrm2< double >()

template<>

cublasStatus_t cublas_api::cublasXnrm2< double >	(	cublasHandle_t	handle,
		int	n,
		const double *RESTRICT	x,
		int	incx,
		double *RESTRICT	result
	)

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{x} \) on double precision.

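This function is the double-precision specialization: it calls cuBLAS's cublasDnrm2 when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTnrm2<double, double> implementation otherwise.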

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	result	The norm squared of a vector.

See also: cublasHdot

Definition at line 1410 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTnrm2<double, double>(
                    n, x, incx, result);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasDnrm2(handle, n, x, incx, result);
        #endif
    }

References cublasDnrm2().

Here is the call graph for this function:

◆ cublasXnrm2< float >()

template<>

cublasStatus_t cublas_api::cublasXnrm2< float >	(	cublasHandle_t	handle,
		int	n,
		const float *RESTRICT	x,
		int	incx,
		float *RESTRICT	result
	)

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{x} \) on __half precision.

This function is a half type implementation similar to cuBLAS's cublasSnrm2.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	result	The norm squared of a vector.

See also: cublasHdot

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{x} \) on __nv_bfloat16 precision.

This function is a half type implementation similar to cuBLAS's cublasSnrm2.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	result	The norm squared of a vector.

See also: cublasHdot

Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot \boldsymbol{x} \) on float precision.

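This function is the single-precision specialization: it calls cuBLAS's cublasSnrm2 when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTnrm2<float, float> implementation otherwise.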

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	x	Input vector \( \boldsymbol{x} \) stored on GPU device.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).
[out]	result	The norm squared of a vector.

See also: cublasHdot

Definition at line 1356 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTnrm2<float, float>(
                    n, x, incx, result);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasSnrm2(handle, n, x, incx, result);
        #endif
    }

References cublasSnrm2().

Here is the call graph for this function:

◆ cublasXscal()

template<typename DataType >

cublasStatus_t cublas_api::cublasXscal	(	cublasHandle_t	handle,
		int	n,
		const DataType *RESTRICT	alpha,
		DataType *RESTRICT	x,
		int	incx
	)

Referenced by cuVectorOperations< DataType >::copy_scaled_vector(), and cuVectorOperations< DataType >::normalize_vector_in_place().

Here is the caller graph for this function:

◆ cublasXscal< double >()

template<>

cublasStatus_t cublas_api::cublasXscal< double >	(	cublasHandle_t	handle,
		int	n,
		const double *RESTRICT	alpha,
		double *RESTRICT	x,
		int	incx
	)

Performs \( \boldsymbol{x} = \alpha \boldsymbol{x} \) on double precision.

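This function is the double-precision specialization: it calls cuBLAS's cublasDscal when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTscal<double> implementation otherwise.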

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in,out]	x	Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).

See also: cublasHcopy

Definition at line 1626 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTscal<double>(
                    n, alpha, x, incx);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasDscal(handle, n, alpha, x, incx);
        #endif
    }

References cublasDscal().

Here is the call graph for this function:

◆ cublasXscal< float >()

template<>

cublasStatus_t cublas_api::cublasXscal< float >	(	cublasHandle_t	handle,
		int	n,
		const float *RESTRICT	alpha,
		float *RESTRICT	x,
		int	incx
	)

Performs \( \boldsymbol{x} = \alpha \boldsymbol{x} \) on __half precision.

This function is a half type implementation similar to cuBLAS's cublasSscal.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in,out]	x	Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).

See also: cublasHcopy

Performs \( \boldsymbol{x} = \alpha \boldsymbol{x} \) on __nv_bfloat16 precision.

This function is a half type implementation similar to cuBLAS's cublasSscal.

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in,out]	x	Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).

See also: cublasHcopy

Performs \( \boldsymbol{x} = \alpha \boldsymbol{x} \) on float precision.

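This function is the single-precision specialization: it calls cuBLAS's cublasSscal when USE_CUBLAS is defined as 1, and the in-house cublas_impl::cublasTscal<float> implementation otherwise.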

Parameters

[in]	handle	Handle object for CuBLAS library context.
[in]	n	Size of array \( \boldsymbol{x} \).
[in]	alpha	The scalar parameter \( \alpha \).
[in,out]	x	Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place.
[in]	incx	Stride between consecutive elements of \( \boldsymbol{x} \).

See also: cublasHcopy

Definition at line 1571 of file cublas_api.cu.

    {
        // Compile-time dispatch: the in-house kernel is used unless USE_CUBLAS
        // is defined as 1, in which case Nvidia's cuBLAS routine is called.
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // Use in-house implementation (handle is not used on this path)
            cudaError_t error = cublas_impl::cublasTscal<float>(
                    n, alpha, x, incx);

            // Map the CUDA runtime status onto a cuBLAS status so that both
            // paths report through the same return type. Success must map
            // to success; any CUDA error is reported as an internal error.
            if (error == cudaSuccess)
            {
                return CUBLAS_STATUS_SUCCESS;
            }
            else
            {
                return CUBLAS_STATUS_INTERNAL_ERROR;
            }

        #else
            // Use Nvidia's CuBLAS
            return cublasSscal(handle, n, alpha, x, incx);
        #endif
    }

References cublasSscal().

Here is the call graph for this function:

Functions

Detailed Description

Function Documentation

◆ cublasXaxpy()

◆ cublasXaxpy< double >()

◆ cublasXaxpy< float >()

◆ cublasXcopy()

◆ cublasXcopy< double >()

◆ cublasXcopy< float >()

◆ cublasXdot()

◆ cublasXdot< double >()

◆ cublasXdot< float >()

◆ cublasXgemv()

◆ cublasXgemv< double >()

◆ cublasXgemv< float >()

◆ cublasXnrm2()

◆ cublasXnrm2< double >()

◆ cublasXnrm2< float >()

◆ cublasXscal()

◆ cublasXscal< double >()

◆ cublasXscal< float >()