17#include "../_cu_definitions/cu_types.h"
21#include "../_cu_basic_algebra/cu_vector_operations.h"
22#include "../_cuda_utilities/cuda_api.h"
23#include "../_cu_arithmetics/cu_arithmetics.h"
27 #pragma warning(push, 0)
28 #include <cublas_v2.h>
30#elif defined(__INTEL_LLVM_COMPILER) || defined(__INTEL_COMPILER)
31 #pragma warning(push, 0)
32 #include <cublas_v2.h>
34#elif defined(__GNUC__) || defined(__clang__)
35 #pragma GCC diagnostic push
36 #pragma GCC diagnostic ignored "-Wswitch-enum"
37 #include <cublas_v2.h>
38 #pragma GCC diagnostic pop
40 #include <cublas_v2.h>
138template <
typename DataType>
144 const DataType lanczos_tol,
154 if (orthogonalize == 0 || orthogonalize == 1)
159 else if ((orthogonalize < 0) ||
160 (orthogonalize >
static_cast<FlagType>(m)))
168 buffer_size = orthogonalize;
184 cublas_handle, device_r, n);
192 for (j=0; j < m; ++j)
201 cublas_handle, device_r, n,
203 &device_V[(j % buffer_size)*n]);
208 cublas_handle, device_r, n,
210 &device_V[(j % buffer_size)*n]);
214 A->
dot(&device_V[(j % buffer_size)*n], device_r);
218 cublas_handle, &device_V[(j % buffer_size)*n], device_r, n);
222 cublas_handle, &device_V[(j % buffer_size)*n], n, alpha[j],
229 cublas_handle, &device_V[((j-1) % buffer_size)*n], n,
230 beta[j-1], device_r);
234 if (orthogonalize != 0)
243 num_ortho = buffer_size;
248 cublas_handle, &device_V[0], n, buffer_size, j%buffer_size,
249 num_ortho, device_r);
254 cublas_handle, device_r, n);
262 static_cast<double>(std::sqrt(n)))
283#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
284 template IndexType cu_lanczos_tridiagonalization<__nv_fp8_e5m2>(
296#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
297 template IndexType cu_lanczos_tridiagonalization<__nv_fp8_e4m3>(
309#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
310 template IndexType cu_lanczos_tridiagonalization<__half>(
315 const __half lanczos_tol,
322#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
323 template IndexType cu_lanczos_tridiagonalization<__nv_bfloat16>(
325 const __nv_bfloat16* v,
328 const __nv_bfloat16 lanczos_tol,
330 __nv_bfloat16* alpha,
331 __nv_bfloat16* beta);
335#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
341 const float lanczos_tol,
348#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
354 const double lanczos_tol,
static ArrayType * alloc(const size_t array_size)
Allocates memory on the GPU device. This function creates a pointer and returns it.
static void del(void *device_array)
Deletes memory on the GPU device if its pointer is not NULL, then sets the pointer to NULL.
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory from the host to device memory.
Base class for linear operators. This class serves as interface for all derived classes.
virtual void dot(const DataType *vector, DataType *product)=0
cublasHandle_t get_cublas_handle() const
This function returns a reference to the cublasHandle_t object. The object will be created,...
static void gram_schmidt_process(cublasHandle_t cublas_handle, const DataType *V, const LongIndexType vector_size, const IndexType num_vectors, const IndexType last_vector, const FlagType num_ortho, DataType *r)
Modified Gram-Schmidt orthogonalization process to orthogonalize the vector v against a subset of the...
static void copy_scaled_vector(cublasHandle_t cublas_handle, const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Scales a vector and stores to a new vector.
static void subtract_scaled_vector(cublasHandle_t cublas_handle, const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Subtracts the scaled input vector from the output vector.
static DataType inner_product(cublasHandle_t cublas_handle, const DataType *RESTRICT vector1, const DataType *RESTRICT vector2, const LongIndexType vector_size)
Computes Euclidean inner product of two vectors.
static DataType euclidean_norm(cublasHandle_t cublas_handle, const DataType *RESTRICT vector, const LongIndexType vector_size)
Computes the Euclidean 2-norm of a 1D array.
template IndexType cu_lanczos_tridiagonalization< double >(cuLinearOperator< double > *A, const double *v, const LongIndexType n, const IndexType m, const double lanczos_tol, const FlagType orthogonalize, double *alpha, double *beta)
template IndexType cu_lanczos_tridiagonalization< float >(cuLinearOperator< float > *A, const float *v, const LongIndexType n, const IndexType m, const float lanczos_tol, const FlagType orthogonalize, float *alpha, float *beta)
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. Here, m is the Lanczos degree, which will be the siz...
__host__ __device__ DataType mul(const DataType x, const DataType y)
Multiply two floating point numbers in round-to-nearest-even mode.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.