doxygen/html/cublas__api_8cu_source.html

/*

 *  SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>

 *  SPDX-License-Identifier: BSD-3-Clause

 *  SPDX-FileType: SOURCE

 *

 *  This program is free software: you can redistribute it and/or modify it

 *  under the terms of the license found in the LICENSE.txt file in the root

 *  directory of this source tree.

 */


// =======

// Headers

// =======


#include "./cublas_api.h"

#include <cuda_runtime.h>  // cudaError_t, cudaSuccess

#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,

                                         // __half, __nv_bfloat16

#include "./cublas_impl.h"  // cublas_impl


// ==========

// cublas api

// ==========


namespace cublas_api

{

    // ===========

    // cublasXgemv (__nv_fp8_e5m2)

    // ===========


    #if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)

    /// \brief   Specialization of \c cublasXgemv for \c __nv_fp8_e5m2 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTgemv<__nv_fp8_e5m2, float> and converts
    ///          the returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXgemv<__nv_fp8_e5m2>(
            cublasHandle_t handle,
            cublasOperation_t trans,
            int m,
            int n,
            const __nv_fp8_e5m2* RESTRICT alpha,
            const __nv_fp8_e5m2* RESTRICT A,
            int lda,
            const __nv_fp8_e5m2* RESTRICT x,
            int incx,
            const __nv_fp8_e5m2* RESTRICT beta,
            __nv_fp8_e5m2* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTgemv<__nv_fp8_e5m2, float>(
                trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXgemv (__nv_fp8_e4m3)

    // ===========


    #if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)

    /// \brief   Specialization of \c cublasXgemv for \c __nv_fp8_e4m3 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTgemv<__nv_fp8_e4m3, float> and converts
    ///          the returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXgemv<__nv_fp8_e4m3>(
            cublasHandle_t handle,
            cublasOperation_t trans,
            int m,
            int n,
            const __nv_fp8_e4m3* RESTRICT alpha,
            const __nv_fp8_e4m3* RESTRICT A,
            int lda,
            const __nv_fp8_e4m3* RESTRICT x,
            int incx,
            const __nv_fp8_e4m3* RESTRICT beta,
            __nv_fp8_e4m3* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTgemv<__nv_fp8_e4m3, float>(
                trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXgemv (__half)

    // ===========


    #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)

    /// \brief   Specialization of \c cublasXgemv for \c __half data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTgemv<__half, float> and converts the
    ///          returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXgemv<__half>(
            cublasHandle_t handle,
            cublasOperation_t trans,
            int m,
            int n,
            const __half* RESTRICT alpha,
            const __half* RESTRICT A,
            int lda,
            const __half* RESTRICT x,
            int incx,
            const __half* RESTRICT beta,
            __half* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTgemv<__half, float>(
                trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXgemv (__nv_bfloat16)

    // ===========


    #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)

    /// \brief   Specialization of \c cublasXgemv for \c __nv_bfloat16 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTgemv<__nv_bfloat16, float> and converts
    ///          the returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXgemv<__nv_bfloat16>(
            cublasHandle_t handle,
            cublasOperation_t trans,
            int m,
            int n,
            const __nv_bfloat16* RESTRICT alpha,
            const __nv_bfloat16* RESTRICT A,
            int lda,
            const __nv_bfloat16* RESTRICT x,
            int incx,
            const __nv_bfloat16* RESTRICT beta,
            __nv_bfloat16* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTgemv<__nv_bfloat16, float>(
                trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXgemv (float)

    // ===========


    #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)

    /// \brief   Specialization of \c cublasXgemv for \c float data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTgemv<float, float> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasSgemv.
    ///
    /// \param[in] handle  Passed to \c cublasSgemv; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasSgemv.
    template<>
    cublasStatus_t cublasXgemv<float>(
            cublasHandle_t handle,
            cublasOperation_t trans,
            int m,
            int n,
            const float* RESTRICT alpha,
            const float* RESTRICT A,
            int lda,
            const float* RESTRICT x,
            int incx,
            const float* RESTRICT beta,
            float* RESTRICT y,
            int incy)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTgemv<float, float>(
                    trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasSgemv(handle, trans, m, n, alpha, A, lda, x, incx,
                               beta, y, incy);
        #endif
    }


    #endif


    // ===========

    // cublasXgemv (double)

    // ===========


    #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)

    /// \brief   Specialization of \c cublasXgemv for \c double data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTgemv<double, double> and
    ///          the returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasDgemv.
    ///
    /// \param[in] handle  Passed to \c cublasDgemv; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasDgemv.
    template<>
    cublasStatus_t cublasXgemv<double>(
            cublasHandle_t handle,
            cublasOperation_t trans,
            int m,
            int n,
            const double* RESTRICT alpha,
            const double* RESTRICT A,
            int lda,
            const double* RESTRICT x,
            int incx,
            const double* RESTRICT beta,
            double* RESTRICT y,
            int incy)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTgemv<double, double>(
                    trans, m, n, alpha, A, lda, x, incx, beta, y, incy);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasDgemv(handle, trans, m, n, alpha, A, lda, x, incx,
                               beta, y, incy);
        #endif
    }


    #endif


    // ===========

    // cublasXcopy (__half)

    // ===========


    #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)

    /// \brief   Specialization of \c cublasXcopy for \c __half data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTcopy<__half> and converts the returned
    ///          \c cudaError_t into a \c cublasStatus_t.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXcopy<__half>(
            cublasHandle_t handle,
            int n,
            const __half* RESTRICT x,
            int incx,
            __half* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTcopy<__half>(
                n, x, incx, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXcopy (__nv_bfloat16)

    // ===========


    #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)

    /// \brief   Specialization of \c cublasXcopy for \c __nv_bfloat16 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTcopy<__nv_bfloat16> and converts the
    ///          returned \c cudaError_t into a \c cublasStatus_t.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXcopy<__nv_bfloat16>(
            cublasHandle_t handle,
            int n,
            const __nv_bfloat16* RESTRICT x,
            int incx,
            __nv_bfloat16* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTcopy<__nv_bfloat16>(
                n, x, incx, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXcopy (float)

    // ===========


    #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)

    /// \brief   Specialization of \c cublasXcopy for \c float data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTcopy<float> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasScopy.
    ///
    /// \param[in] handle  Passed to \c cublasScopy; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasScopy.
    template<>
    cublasStatus_t cublasXcopy<float>(
            cublasHandle_t handle,
            int n,
            const float* RESTRICT x,
            int incx,
            float* RESTRICT y,
            int incy)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTcopy<float>(
                    n, x, incx, y, incy);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasScopy(handle, n, x, incx, y, incy);
        #endif
    }


    #endif


    // ===========

    // cublasXcopy (double)

    // ===========


    #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)

    /// \brief   Specialization of \c cublasXcopy for \c double data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTcopy<double> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasDcopy.
    ///
    /// \param[in] handle  Passed to \c cublasDcopy; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasDcopy.
    template<>
    cublasStatus_t cublasXcopy<double>(
            cublasHandle_t handle,
            int n,
            const double* RESTRICT x,
            int incx,
            double* RESTRICT y,
            int incy)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTcopy<double>(
                    n, x, incx, y, incy);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasDcopy(handle, n, x, incx, y, incy);
        #endif
    }


    #endif


    // ===========

    // cublasXaxpy (__half)

    // ===========


    #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)

    /// \brief   Specialization of \c cublasXaxpy for \c __half data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTaxpy<__half> and converts the returned
    ///          \c cudaError_t into a \c cublasStatus_t.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXaxpy<__half>(
            cublasHandle_t handle,
            int n,
            const __half* RESTRICT alpha,
            const __half* RESTRICT x,
            int incx,
            __half* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTaxpy<__half>(
                n, alpha, x, incx, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXaxpy (__nv_bfloat16)

    // ===========


    #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)

    /// \brief   Specialization of \c cublasXaxpy for \c __nv_bfloat16 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTaxpy<__nv_bfloat16> and converts the
    ///          returned \c cudaError_t into a \c cublasStatus_t.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXaxpy<__nv_bfloat16>(
            cublasHandle_t handle,
            int n,
            const __nv_bfloat16* RESTRICT alpha,
            const __nv_bfloat16* RESTRICT x,
            int incx,
            __nv_bfloat16* RESTRICT y,
            int incy)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTaxpy<__nv_bfloat16>(
                n, alpha, x, incx, y, incy);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXaxpy (float)

    // ===========


    #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)

    /// \brief   Specialization of \c cublasXaxpy for \c float data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTaxpy<float> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasSaxpy.
    ///
    /// \param[in] handle  Passed to \c cublasSaxpy; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasSaxpy.
    template<>
    cublasStatus_t cublasXaxpy<float>(
            cublasHandle_t handle,
            int n,
            const float* RESTRICT alpha,
            const float* RESTRICT x,
            int incx,
            float* RESTRICT y,
            int incy)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTaxpy<float>(
                    n, alpha, x, incx, y, incy);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
        #endif
    }


    #endif


    // ===========

    // cublasXaxpy (double)

    // ===========


    #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)

    /// \brief   Specialization of \c cublasXaxpy for \c double data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTaxpy<double> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasDaxpy.
    ///
    /// \param[in] handle  Passed to \c cublasDaxpy; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasDaxpy.
    template<>
    cublasStatus_t cublasXaxpy<double>(
            cublasHandle_t handle,
            int n,
            const double* RESTRICT alpha,
            const double* RESTRICT x,
            int incx,
            double* RESTRICT y,
            int incy)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTaxpy<double>(
                    n, alpha, x, incx, y, incy);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
        #endif
    }


    #endif


    // ==========

    // cublasXdot (__half)

    // ==========


    #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)

    /// \brief   Specialization of \c cublasXdot for \c __half data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTdot<__half, float> and converts the
    ///          returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXdot<__half>(
            cublasHandle_t handle,
            int n,
            const __half* RESTRICT x,
            int incx,
            const __half* RESTRICT y,
            int incy,
            __half* RESTRICT result)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTdot<__half, float>(
                n, x, incx, y, incy, result);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ==========

    // cublasXdot (__nv_bfloat16)

    // ==========


    #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)

    /// \brief   Specialization of \c cublasXdot for \c __nv_bfloat16 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTdot<__nv_bfloat16, float> and converts
    ///          the returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXdot<__nv_bfloat16>(
            cublasHandle_t handle,
            int n,
            const __nv_bfloat16* RESTRICT x,
            int incx,
            const __nv_bfloat16* RESTRICT y,
            int incy,
            __nv_bfloat16* RESTRICT result)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTdot<__nv_bfloat16, float>(
                n, x, incx, y, incy, result);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ==========

    // cublasXdot (float)

    // ==========


    #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)

    /// \brief   Specialization of \c cublasXdot for \c float data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTdot<float, float> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasSdot.
    ///
    /// \param[in] handle  Passed to \c cublasSdot; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasSdot.
    template<>
    cublasStatus_t cublasXdot<float>(
            cublasHandle_t handle,
            int n,
            const float* RESTRICT x,
            int incx,
            const float* RESTRICT y,
            int incy,
            float* RESTRICT result)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTdot<float, float>(
                    n, x, incx, y, incy, result);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasSdot(handle, n, x, incx, y, incy, result);
        #endif
    }


    #endif


    // ==========

    // cublasXdot (double)

    // ==========


    #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)

    /// \brief   Specialization of \c cublasXdot for \c double data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTdot<double, double> and
    ///          the returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasDdot.
    ///
    /// \param[in] handle  Passed to \c cublasDdot; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasDdot.
    template<>
    cublasStatus_t cublasXdot<double>(
            cublasHandle_t handle,
            int n,
            const double* RESTRICT x,
            int incx,
            const double* RESTRICT y,
            int incy,
            double* RESTRICT result)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTdot<double, double>(
                    n, x, incx, y, incy, result);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasDdot(handle, n, x, incx, y, incy, result);
        #endif
    }


    #endif


    // ===========

    // cublasXnrm2 (__half)

    // ===========


    #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)

    /// \brief   Specialization of \c cublasXnrm2 for \c __half data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTnrm2<__half, float> and converts the
    ///          returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXnrm2<__half>(
            cublasHandle_t handle,
            int n,
            const __half* RESTRICT x,
            int incx,
            __half* RESTRICT result)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTnrm2<__half, float>(
                n, x, incx, result);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXnrm2 (__nv_bfloat16)

    // ===========


    #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)

    /// \brief   Specialization of \c cublasXnrm2 for \c __nv_bfloat16 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTnrm2<__nv_bfloat16, float> and converts
    ///          the returned \c cudaError_t into a \c cublasStatus_t.
    ///          NOTE(review): the second template argument (\c float) is
    ///          presumably the accumulation type; confirm in cublas_impl.h.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXnrm2<__nv_bfloat16>(
            cublasHandle_t handle,
            int n,
            const __nv_bfloat16* RESTRICT x,
            int incx,
            __nv_bfloat16* RESTRICT result)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTnrm2<__nv_bfloat16, float>(
                n, x, incx, result);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXnrm2 (float)

    // ===========


    #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)

    /// \brief   Specialization of \c cublasXnrm2 for \c float data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTnrm2<float, float> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasSnrm2.
    ///
    /// \param[in] handle  Passed to \c cublasSnrm2; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasSnrm2.
    template<>
    cublasStatus_t cublasXnrm2<float>(
            cublasHandle_t handle,
            int n,
            const float* RESTRICT x,
            int incx,
            float* RESTRICT result)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTnrm2<float, float>(
                    n, x, incx, result);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasSnrm2(handle, n, x, incx, result);
        #endif
    }


    #endif


    // ===========

    // cublasXnrm2 (double)

    // ===========


    #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)

    /// \brief   Specialization of \c cublasXnrm2 for \c double data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTnrm2<double, double> and
    ///          the returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasDnrm2.
    ///
    /// \param[in] handle  Passed to \c cublasDnrm2; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasDnrm2.
    template<>
    cublasStatus_t cublasXnrm2<double>(
            cublasHandle_t handle,
            int n,
            const double* RESTRICT x,
            int incx,
            double* RESTRICT result)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTnrm2<double, double>(
                    n, x, incx, result);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasDnrm2(handle, n, x, incx, result);
        #endif
    }


    #endif


    // ===========

    // cublasXscal (__half)

    // ===========


    #if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)

    /// \brief   Specialization of \c cublasXscal for \c __half data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTscal<__half> and converts the returned
    ///          \c cudaError_t into a \c cublasStatus_t.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXscal<__half>(
            cublasHandle_t handle,
            int n,
            const __half* RESTRICT alpha,
            __half* RESTRICT x,
            int incx)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTscal<__half>(
                n, alpha, x, incx);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXscal (__nv_bfloat16)

    // ===========


    #if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)

    /// \brief   Specialization of \c cublasXscal for \c __nv_bfloat16 data.
    ///
    /// \details Forwards every argument except \c handle to
    ///          \c cublas_impl::cublasTscal<__nv_bfloat16> and converts the
    ///          returned \c cudaError_t into a \c cublasStatus_t.
    ///
    /// \param[in] handle  Not used by this specialization.
    /// \return  \c CUBLAS_STATUS_SUCCESS if the implementation reports
    ///          \c cudaSuccess, otherwise \c CUBLAS_STATUS_INTERNAL_ERROR.
    template<>
    cublasStatus_t cublasXscal<__nv_bfloat16>(
            cublasHandle_t handle,
            int n,
            const __nv_bfloat16* RESTRICT alpha,
            __nv_bfloat16* RESTRICT x,
            int incx)
    {
        // Void unused variables to avoid compiler warnings
        // (-Wno-unused-parameter)
        (void) handle;

        cudaError_t error = cublas_impl::cublasTscal<__nv_bfloat16>(
                n, alpha, x, incx);

        // Map the CUDA runtime status onto the cuBLAS status convention:
        // only cudaSuccess may be reported as success.
        return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                      : CUBLAS_STATUS_INTERNAL_ERROR;
    }

    #endif


    // ===========

    // cublasXscal (float)

    // ===========


    #if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)

    /// \brief   Specialization of \c cublasXscal for \c float data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTscal<float> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasSscal.
    ///
    /// \param[in] handle  Passed to \c cublasSscal; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasSscal.
    template<>
    cublasStatus_t cublasXscal<float>(
            cublasHandle_t handle,
            int n,
            const float* RESTRICT alpha,
            float* RESTRICT x,
            int incx)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTscal<float>(
                    n, alpha, x, incx);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasSscal(handle, n, alpha, x, incx);
        #endif
    }


    #endif


    // ===========

    // cublasXscal (double)

    // ===========


    #if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)

    /// \brief   Specialization of \c cublasXscal for \c double data.
    ///
    /// \details When \c USE_CUBLAS is undefined or not equal to 1, the call is
    ///          forwarded to \c cublas_impl::cublasTscal<double> and the
    ///          returned \c cudaError_t is converted into a
    ///          \c cublasStatus_t. Otherwise the call is forwarded unchanged
    ///          to \c cublasDscal.
    ///
    /// \param[in] handle  Passed to \c cublasDscal; unused on the in-house
    ///                    path.
    /// \return  On the in-house path, \c CUBLAS_STATUS_SUCCESS if the
    ///          implementation reports \c cudaSuccess, otherwise
    ///          \c CUBLAS_STATUS_INTERNAL_ERROR. On the cuBLAS path, the
    ///          status returned by \c cublasDscal.
    template<>
    cublasStatus_t cublasXscal<double>(
            cublasHandle_t handle,
            int n,
            const double* RESTRICT alpha,
            double* RESTRICT x,
            int incx)
    {
        #if !defined(USE_CUBLAS) || (USE_CUBLAS != 1)
            // The in-house implementation takes no handle; void it to avoid
            // an unused-parameter warning.
            (void) handle;

            // Use in-house implementation
            cudaError_t error = cublas_impl::cublasTscal<double>(
                    n, alpha, x, incx);

            // Map the CUDA runtime status onto the cuBLAS status convention:
            // only cudaSuccess may be reported as success.
            return (error == cudaSuccess) ? CUBLAS_STATUS_SUCCESS
                                          : CUBLAS_STATUS_INTERNAL_ERROR;
        #else
            // Use Nvidia's CuBLAS
            return cublasDscal(handle, n, alpha, x, incx);
        #endif
    }


    #endif


}  // namespace cublas_api


RESTRICT
#define RESTRICT
Definition c_matrix_operations.h:29

cublas_api.h

cublas_impl.h

cublasDcopy
cublasStatus_t cublasDcopy(cublasHandle_t handle, int n, const double *x, int incx, double *y, int incy)
Definition of CUDA's cublasDcopy function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:293

cublasSscal
cublasStatus_t cublasSscal(cublasHandle_t handle, int n, const float *alpha, float *x, int incx)
Definition of CUDA's cublasSscal function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:504

cublasDscal
cublasStatus_t cublasDscal(cublasHandle_t handle, int n, const double *alpha, double *x, int incx)
Definition of CUDA's cublasDscal function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:533

cublasSdot
cublasStatus_t cublasSdot(cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result)
Definition of CUDA's cublasSdot function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:384

cublasSnrm2
cublasStatus_t cublasSnrm2(cublasHandle_t handle, int n, const float *x, int incx, float *result)
Definition of CUDA's cublasSnrm2 function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:446

cublasSaxpy
cublasStatus_t cublasSaxpy(cublasHandle_t handle, int n, const float *alpha, const float *x, int incx, float *y, int incy)
Definition of CUDA's cublasSaxpy function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:322

cublasDaxpy
cublasStatus_t cublasDaxpy(cublasHandle_t handle, int n, const double *alpha, const double *x, int incx, double *y, int incy)
Definition of CUDA's cublasDaxpy function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:353

cublasScopy
cublasStatus_t cublasScopy(cublasHandle_t handle, int n, const float *x, int incx, float *y, int incy)
Definition of CUDA's cublasScopy function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:264

cublasDnrm2
cublasStatus_t cublasDnrm2(cublasHandle_t handle, int n, const double *x, int incx, double *result)
Definition of CUDA's cublasDnrm2 function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:475

cublasDdot
cublasStatus_t cublasDdot(cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result)
Definition of CUDA's cublasDdot function using dynamically loaded cublas library.
Definition cublas_symbols.cpp:415

cublas_api
A collection of templates that wrap cublas functions.
Definition cublas_api.cu:34

cublas_api::cublasXaxpy< double >
cublasStatus_t cublasXaxpy< double >(cublasHandle_t handle, int n, const double *RESTRICT alpha, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y}$ in double precision.
Definition cublas_api.cu:956

cublas_api::cublasXgemv< double >
cublasStatus_t cublasXgemv< double >(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *RESTRICT alpha, const double *RESTRICT A, int lda, const double *RESTRICT x, int incx, const double *RESTRICT beta, double *RESTRICT y, int incy)
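Performs $\boldsymbol{y} = \alpha \, \mathrm{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y}$ in double precision.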
Definition cublas_api.cu:481

cublas_api::cublasXdot< float >
cublasStatus_t cublasXdot< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, const float *RESTRICT y, int incy, float *RESTRICT result)
Computes the inner product $\boldsymbol{x}^{\intercal} \boldsymbol{y}$ in single precision.
Definition cublas_api.cu:1135

cublas_api::cublasXscal< float >
cublasStatus_t cublasXscal< float >(cublasHandle_t handle, int n, const float *RESTRICT alpha, float *RESTRICT x, int incx)
Performs $\boldsymbol{x} = \alpha \boldsymbol{x}$ in single precision.
Definition cublas_api.cu:1571

cublas_api::cublasXcopy< double >
cublasStatus_t cublasXcopy< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \boldsymbol{x}$ in double type.
Definition cublas_api.cu:716

cublas_api::cublasXdot< double >
cublasStatus_t cublasXdot< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, const double *RESTRICT y, int incy, double *RESTRICT result)
Computes the inner product $\boldsymbol{x}^{\intercal} \boldsymbol{y}$ in double precision.
Definition cublas_api.cu:1196

cublas_api::cublasXaxpy< float >
cublasStatus_t cublasXaxpy< float >(cublasHandle_t handle, int n, const float *RESTRICT alpha, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \alpha \boldsymbol{x} + \boldsymbol{y}$ in single precision.
Definition cublas_api.cu:895

cublas_api::cublasXnrm2< double >
cublasStatus_t cublasXnrm2< double >(cublasHandle_t handle, int n, const double *RESTRICT x, int incx, double *RESTRICT result)
Computes the Euclidean norm $\Vert \boldsymbol{x} \Vert_2$ in double precision.
Definition cublas_api.cu:1410

cublas_api::cublasXnrm2< float >
cublasStatus_t cublasXnrm2< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT result)
Computes the Euclidean norm $\Vert \boldsymbol{x} \Vert_2$ in single precision.
Definition cublas_api.cu:1356

cublas_api::cublasXgemv< float >
cublasStatus_t cublasXgemv< float >(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float *RESTRICT alpha, const float *RESTRICT A, int lda, const float *RESTRICT x, int incx, const float *RESTRICT beta, float *RESTRICT y, int incy)
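Performs $\boldsymbol{y} = \alpha \, \mathrm{op}(\mathbf{A}) \boldsymbol{x} + \beta \boldsymbol{y}$ in single precision.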
Definition cublas_api.cu:399

cublas_api::cublasXscal< double >
cublasStatus_t cublasXscal< double >(cublasHandle_t handle, int n, const double *RESTRICT alpha, double *RESTRICT x, int incx)
Performs $\boldsymbol{x} = \alpha \boldsymbol{x}$ in double precision.
Definition cublas_api.cu:1626

cublas_api::cublasXcopy< float >
cublasStatus_t cublasXcopy< float >(cublasHandle_t handle, int n, const float *RESTRICT x, int incx, float *RESTRICT y, int incy)
Performs $\boldsymbol{y} = \boldsymbol{x}$ in float type.
Definition cublas_api.cu:660

__nv_fp8_e4m3
Definition cu_types.h:34

__nv_fp8_e5m2
Definition cu_types.h:27