![]() |
imate
C++/CUDA Reference
|
A collection of templates to wrapper cublas functions. More...
Functions | |
| template<> | |
| cublasStatus_t | cublasXgemv< float > (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *RESTRICT alpha, const float *RESTRICT A, int lda, const float *RESTRICT x, int incx, const float *RESTRICT beta, float *RESTRICT y, int incy) |
| Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A})
\boldsymbol{x} + \beta \boldsymbol{y} \). | |
| template<> | |
| cublasStatus_t | cublasXgemv< double > (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *RESTRICT alpha, const double *RESTRICT A, int lda, const double *RESTRICT x, int incx, const double *RESTRICT beta, double *RESTRICT y, int incy) |
| Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A})
\boldsymbol{x} + \beta \boldsymbol{y} \). | |
| template<> | |
| cublasStatus_t | cublasXcopy< float > (cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT y, int incy) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \) in __half type. | |
| template<> | |
| cublasStatus_t | cublasXcopy< double > (cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT y, int incy) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \) in double type. | |
| template<> | |
| cublasStatus_t | cublasXaxpy< float > (cublasHandle_t handle, int n, const float *RESTRICT alpha, const float *RESTRICT x, int incx, float *RESTRICT y, int incy) |
Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} +
\boldsymbol{y} \) on __half precision. | |
| template<> | |
| cublasStatus_t | cublasXaxpy< double > (cublasHandle_t handle, int n, const double *RESTRICT alpha, const double *RESTRICT x, int incx, double *RESTRICT y, int incy) |
Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} +
\boldsymbol{y} \) on double precision. | |
| template<> | |
| cublasStatus_t | cublasXdot< float > (cublasHandle_t handle, int n, const float *RESTRICT x, int incx, const float *RESTRICT y, int incy, float *RESTRICT result) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{y} \) on __half precision. | |
| template<> | |
| cublasStatus_t | cublasXdot< double > (cublasHandle_t handle, int n, const double *RESTRICT x, int incx, const double *RESTRICT y, int incy, double *RESTRICT result) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{y} \) on double precision. | |
| template<> | |
| cublasStatus_t | cublasXnrm2< float > (cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT result) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{x} \) on __half precision. | |
| template<> | |
| cublasStatus_t | cublasXnrm2< double > (cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT result) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{x} \) on double precision. | |
| template<> | |
| cublasStatus_t | cublasXscal< float > (cublasHandle_t handle, int n, const float *RESTRICT alpha, float *RESTRICT x, int incx) |
Performs \( \boldsymbol{x} = \alpha \boldsymbol{x}
\) on __half precision. | |
| template<> | |
| cublasStatus_t | cublasXscal< double > (cublasHandle_t handle, int n, const double *RESTRICT alpha, double *RESTRICT x, int incx) |
Performs \( \boldsymbol{x} = \alpha \boldsymbol{x}
\) on double precision. | |
| template<typename DataType > | |
| cublasStatus_t | cublasXgemv (cublasHandle_t handle, cublasOperation_t trans, int m, int n, const DataType *RESTRICT alpha, const DataType *RESTRICT A, int lda, const DataType *RESTRICT x, int incx, const DataType *RESTRICT beta, DataType *RESTRICT y, int incy) |
| template<typename DataType > | |
| cublasStatus_t | cublasXcopy (cublasHandle_t handle, int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy) |
| template<typename DataType > | |
| cublasStatus_t | cublasXaxpy (cublasHandle_t handle, int n, const DataType *RESTRICT alpha, const DataType *RESTRICT x, int incx, DataType *RESTRICT y, int incy) |
| template<typename DataType > | |
| cublasStatus_t | cublasXdot (cublasHandle_t handle, int n, const DataType *RESTRICT x, int incx, const DataType *RESTRICT y, int incy, DataType *RESTRICT result) |
| template<typename DataType > | |
| cublasStatus_t | cublasXnrm2 (cublasHandle_t handle, int n, const DataType *RESTRICT x, int incx, DataType *RESTRICT result) |
| template<typename DataType > | |
| cublasStatus_t | cublasXscal (cublasHandle_t handle, int n, const DataType *RESTRICT alpha, DataType *RESTRICT x, int incx) |
A collection of templates to wrapper cublas functions.
cu file is wrapped inside the namepsace clause. This is not necessary in general, however, it is needed to avoid the old gcc compiler error (this is a gcc bug) which complains "no instance of function template matches
the argument list const float". | cublasStatus_t cublas_api::cublasXaxpy | ( | cublasHandle_t | handle, |
| int | n, | ||
| const DataType *RESTRICT | alpha, | ||
| const DataType *RESTRICT | x, | ||
| int | incx, | ||
| DataType *RESTRICT | y, | ||
| int | incy | ||
| ) |
Referenced by cuVectorOperations< DataType >::subtract_scaled_vector().

| cublasStatus_t cublas_api::cublasXaxpy< double > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const double *RESTRICT | alpha, | ||
| const double *RESTRICT | x, | ||
| int | incx, | ||
| double *RESTRICT | y, | ||
| int | incy | ||
| ) |
Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} +
\boldsymbol{y} \) on double precision.
This function is a half type implementation similar to cuBLAS's cublasSaxpy.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Definition at line 956 of file cublas_api.cu.
References cublasDaxpy().

| cublasStatus_t cublas_api::cublasXaxpy< float > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const float *RESTRICT | alpha, | ||
| const float *RESTRICT | x, | ||
| int | incx, | ||
| float *RESTRICT | y, | ||
| int | incy | ||
| ) |
Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} +
\boldsymbol{y} \) on __half precision.
This function is a half type implementation similar to cuBLAS's cublasSaxpy.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} +
\boldsymbol{y} \) on __nv_bfloat16 precision.
This function is a half type implementation similar to cuBLAS's cublasSaxpy.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \alpha \boldsymbol{x} +
\boldsymbol{y} \) on float precision.
This function is a half type implementation similar to cuBLAS's cublasSaxpy.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Definition at line 895 of file cublas_api.cu.
References cublasSaxpy().

| cublasStatus_t cublas_api::cublasXcopy | ( | cublasHandle_t | handle, |
| int | n, | ||
| const DataType *RESTRICT | x, | ||
| int | incx, | ||
| DataType *RESTRICT | y, | ||
| int | incy | ||
| ) |
Referenced by cuVectorOperations< DataType >::copy_scaled_vector(), and cuVectorOperations< DataType >::copy_vector().

| cublasStatus_t cublas_api::cublasXcopy< double > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const double *RESTRICT | x, | ||
| int | incx, | ||
| double *RESTRICT | y, | ||
| int | incy | ||
| ) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \) in double type.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of the array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Definition at line 716 of file cublas_api.cu.
References cublasDcopy().

| cublasStatus_t cublas_api::cublasXcopy< float > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const float *RESTRICT | x, | ||
| int | incx, | ||
| float *RESTRICT | y, | ||
| int | incy | ||
| ) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \) in __half type.
This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHcopy (where H is used for __half type). As such, this function is implemented with CUDA, rather than from CuBLAS.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of the array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \boldsymbol{x} \) in __nv_bfloat16 type.
This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHcopy (where H is used for __nv_bfloat16 type). As such, this function is implemented with CUDA, rather than from CuBLAS.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of the array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \boldsymbol{x} \) in float type.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of the array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Definition at line 660 of file cublas_api.cu.
References cublasScopy().

| cublasStatus_t cublas_api::cublasXdot | ( | cublasHandle_t | handle, |
| int | n, | ||
| const DataType *RESTRICT | x, | ||
| int | incx, | ||
| const DataType *RESTRICT | y, | ||
| int | incy, | ||
| DataType *RESTRICT | result | ||
| ) |
Referenced by cuVectorOperations< DataType >::inner_product().

| cublasStatus_t cublas_api::cublasXdot< double > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const double *RESTRICT | x, | ||
| int | incx, | ||
| const double *RESTRICT | y, | ||
| int | incy, | ||
| double *RESTRICT | result | ||
| ) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{y} \) on double precision.
This function is a half type implementation similar to cuBLAS's cublasSdot.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
| [out] | result | The dot product of two vectors. |
Definition at line 1196 of file cublas_api.cu.
References cublasDdot().

| cublasStatus_t cublas_api::cublasXdot< float > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const float *RESTRICT | x, | ||
| int | incx, | ||
| const float *RESTRICT | y, | ||
| int | incy, | ||
| float *RESTRICT | result | ||
| ) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{y} \) on __half precision.
This function is a half type implementation similar to cuBLAS's cublasSdot.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
| [out] | result | The dot product of two vectors. |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{y} \) on __nv_bfloat16 precision.
This function is a half type implementation similar to cuBLAS's cublasSdot.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
| [out] | result | The dot product of two vectors. |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{y} \) on float precision.
This function is a half type implementation similar to cuBLAS's cublasSdot.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
| [out] | result | The dot product of two vectors. |
Definition at line 1135 of file cublas_api.cu.
References cublasSdot().

| cublasStatus_t cublas_api::cublasXgemv | ( | cublasHandle_t | handle, |
| cublasOperation_t | trans, | ||
| int | m, | ||
| int | n, | ||
| const DataType *RESTRICT | alpha, | ||
| const DataType *RESTRICT | A, | ||
| int | lda, | ||
| const DataType *RESTRICT | x, | ||
| int | incx, | ||
| const DataType *RESTRICT | beta, | ||
| DataType *RESTRICT | y, | ||
| int | incy | ||
| ) |
| cublasStatus_t cublas_api::cublasXgemv< double > | ( | cublasHandle_t | handle, |
| cublasOperation_t | trans, | ||
| int | m, | ||
| int | n, | ||
| const double *RESTRICT | alpha, | ||
| const double *RESTRICT | A, | ||
| int | lda, | ||
| const double *RESTRICT | x, | ||
| int | incx, | ||
| const double *RESTRICT | beta, | ||
| double *RESTRICT | y, | ||
| int | incy | ||
| ) |
Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).
This function is a template wrapper for cublasDgemv.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | trans | If set to CUBLAS_OP_N or CUBLAS_OP_T, the operator \( \mathbf{A} \) is not transposed or transposed, respectively. |
| [in] | m | Number of rows of matrix \( \mathbf{A} \). |
| [in] | n | Number of columns of matrix \( \mathbf{A} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | A | Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering. |
| [in] | lda | Leading dimension of two-dimensional matrix \( \mathbf{A} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [in] | beta | The scalar parameter \( \beta \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Definition at line 481 of file cublas_api.cu.
| cublasStatus_t cublas_api::cublasXgemv< float > | ( | cublasHandle_t | handle, |
| cublasOperation_t | trans, | ||
| int | m, | ||
| int | n, | ||
| const float *RESTRICT | alpha, | ||
| const float *RESTRICT | A, | ||
| int | lda, | ||
| const float *RESTRICT | x, | ||
| int | incx, | ||
| const float *RESTRICT | beta, | ||
| float *RESTRICT | y, | ||
| int | incy | ||
| ) |
Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).
This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __nv_fp8_e5m2 type). As such, this function is implemented with CUDA, rather than from CuBLAS.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | trans | If set to CUBLAS_OP_N or CUBLAS_OP_T, the operator \( \mathbf{A} \) is not transposed or transposed, respectively. |
| [in] | m | Number of rows of matrix \( \mathbf{A} \). |
| [in] | n | Number of columns of matrix \( \mathbf{A} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | A | Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering. |
| [in] | lda | Leading dimension of two-dimensional matrix \( \mathbf{A} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [in] | beta | The scalar parameter \( \beta \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).
This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __nv_fp8_e4m3 type). As such, this function is implemented with CUDA, rather than from CuBLAS.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | trans | If set to CUBLAS_OP_N or CUBLAS_OP_T, the operator \( \mathbf{A} \) is not transposed or transposed, respectively. |
| [in] | m | Number of rows of matrix \( \mathbf{A} \). |
| [in] | n | Number of columns of matrix \( \mathbf{A} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | A | Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering. |
| [in] | lda | Leading dimension of two-dimensional matrix \( \mathbf{A} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [in] | beta | The scalar parameter \( \beta \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).
This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __half type). As such, this function is implemented with CUDA, rather than from CuBLAS.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | trans | If set to CUBLAS_OP_N or CUBLAS_OP_T, the operator \( \mathbf{A} \) is not transposed or transposed, respectively. |
| [in] | m | Number of rows of matrix \( \mathbf{A} \). |
| [in] | n | Number of columns of matrix \( \mathbf{A} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | A | Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering. |
| [in] | lda | Leading dimension of two-dimensional matrix \( \mathbf{A} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [in] | beta | The scalar parameter \( \beta \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).
This function is not a template wrapper for CuBLAS, since CuBLAS API does not have cublasHgemv (where H is used for __nv_bfloat16 type). As such, this function is implemented with CUDA, rather than from CuBLAS.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | trans | If set to CUBLAS_OP_N or CUBLAS_OP_T, the operator \( \mathbf{A} \) is not transposed or transposed, respectively. |
| [in] | m | Number of rows of matrix \( \mathbf{A} \). |
| [in] | n | Number of columns of matrix \( \mathbf{A} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | A | Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering. |
| [in] | lda | Leading dimension of two-dimensional matrix \( \mathbf{A} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [in] | beta | The scalar parameter \( \beta \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Performs \( \boldsymbol{y} = \alpha \text{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y} \).
This function is a template wrapper for cublasSgemv.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | trans | If set to CUBLAS_OP_N or CUBLAS_OP_T, the operator \( \mathbf{A} \) is not transposed or transposed, respectively. |
| [in] | m | Number of rows of matrix \( \mathbf{A} \). |
| [in] | n | Number of columns of matrix \( \mathbf{A} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in] | A | Two-dimensional matrix \( \mathbf{A} \) stored on GPU device as one-dimensional array with column-major ordering. |
| [in] | lda | Leading dimension of two-dimensional matrix \( \mathbf{A} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [in] | beta | The scalar parameter \( \beta \). |
| [out] | y | Output vector \( \boldsymbol{y} \) stored on GPU device. |
| [in] | incy | Stride between consecutive elements of \( \boldsymbol{y} \). |
Definition at line 399 of file cublas_api.cu.
| cublasStatus_t cublas_api::cublasXnrm2 | ( | cublasHandle_t | handle, |
| int | n, | ||
| const DataType *RESTRICT | x, | ||
| int | incx, | ||
| DataType *RESTRICT | result | ||
| ) |
Referenced by cuVectorOperations< DataType >::euclidean_norm().

| cublasStatus_t cublas_api::cublasXnrm2< double > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const double *RESTRICT | x, | ||
| int | incx, | ||
| double *RESTRICT | result | ||
| ) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{x} \) on double precision.
This function is a half type implementation similar to cuBLAS's cublasSnrm2.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | result | The norm squared of a vector. |
Definition at line 1410 of file cublas_api.cu.
References cublasDnrm2().

| cublasStatus_t cublas_api::cublasXnrm2< float > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const float *RESTRICT | x, | ||
| int | incx, | ||
| float *RESTRICT | result | ||
| ) |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{x} \) on __half precision.
This function is a half type implementation similar to cuBLAS's cublasSnrm2.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | result | The norm squared of a vector. |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{x} \) on __nv_bfloat16 precision.
This function is a half type implementation similar to cuBLAS's cublasSnrm2.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | result | The norm squared of a vector. |
Performs \( \boldsymbol{y} = \boldsymbol{x} \cdot
\boldsymbol{x} \) on float precision.
This function is a half type implementation similar to cuBLAS's cublasSnrm2.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | x | Input vector \( \boldsymbol{x} \) stored on GPU device. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
| [out] | result | The norm squared of a vector. |
Definition at line 1356 of file cublas_api.cu.
References cublasSnrm2().

| cublasStatus_t cublas_api::cublasXscal | ( | cublasHandle_t | handle, |
| int | n, | ||
| const DataType *RESTRICT | alpha, | ||
| DataType *RESTRICT | x, | ||
| int | incx | ||
| ) |
Referenced by cuVectorOperations< DataType >::copy_scaled_vector(), and cuVectorOperations< DataType >::normalize_vector_in_place().

| cublasStatus_t cublas_api::cublasXscal< double > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const double *RESTRICT | alpha, | ||
| double *RESTRICT | x, | ||
| int | incx | ||
| ) |
Performs \( \boldsymbol{x} = \alpha \boldsymbol{x}
\) on double precision.
This function is a half type implementation similar to cuBLAS's cublasSscal.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in,out] | x | Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
Definition at line 1626 of file cublas_api.cu.
References cublasDscal().

| cublasStatus_t cublas_api::cublasXscal< float > | ( | cublasHandle_t | handle, |
| int | n, | ||
| const float *RESTRICT | alpha, | ||
| float *RESTRICT | x, | ||
| int | incx | ||
| ) |
Performs \( \boldsymbol{x} = \alpha \boldsymbol{x}
\) on __half precision.
This function is a half type implementation similar to cuBLAS's cublasSscal.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in,out] | x | Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
Performs \( \boldsymbol{x} = \alpha \boldsymbol{x}
\) on __nv_bfloat16 precision.
This function is a half type implementation similar to cuBLAS's cublasSscal.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in,out] | x | Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
Performs \( \boldsymbol{x} = \alpha \boldsymbol{x}
\) on float precision.
This function is a half type implementation similar to cuBLAS's cublasSscal.
| [in] | handle | Handle object for CuBLAS library context. |
| [in] | n | Size of array \( \boldsymbol{x} \). |
| [in] | alpha | The scalar parameter \( \alpha \). |
| [in,out] | x | Input and output vector \( \boldsymbol{x} \) stored on GPU device. This vector is written in-place. |
| [in] | incx | Stride between consecutive elements of \( \boldsymbol{x} \). |
Definition at line 1571 of file cublas_api.cu.
References cublasSscal().
