doxygen/html/cu__golub__kahn__bidiagonalization_8cu_source.html

 /*

  *  SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>

  *  SPDX-License-Identifier: BSD-3-Clause

  *  SPDX-FileType: SOURCE

  *

  *  This program is free software: you can redistribute it and/or modify it

  *  under the terms of the license found in the LICENSE.txt file in the root

  *  directory of this source tree.

  */


 // =======

 // Headers

 // =======


 #include "./cu_golub_kahn_bidiagonalization.h"

 #include <cublas_v2.h>  // cublasHandle_t

 #include <cmath>  // sqrt

 #include "../_cu_basic_algebra/cu_vector_operations.h"  // cuVectorOperations

 #include "../_cu_trace_estimator/cu_orthogonalization.h"  // cuOrthogonaliza...

 #include "../_cuda_utilities/cuda_interface.h"  // alloc, copy_to_device, del


 // ============================

 // golub-kahn bidiagonalization

 // ============================


 /// \brief      Bi-diagonalizes the linear operator \c A using the
 ///             Golub-Kahn-Lanczos iteration started from the vector \c v.
 ///
 /// \details    Two device buffers, \c U and \c V, each hold the last
 ///             \c buffer_size vectors of length \c n. New vectors replace old
 ///             ones by cycling through the buffer slots. At iteration \c j:
 ///             - <tt>u_new = A * v_old - beta[j-1] * u_old</tt> (the second
 ///               term only for <tt>j > 0</tt>), optionally orthogonalized
 ///               against the previous \c u vectors, then normalized; its norm
 ///               is stored in \c alpha[j].
 ///             - <tt>v_new = A^T * u_new - alpha[j] * v_old</tt>, optionally
 ///               orthogonalized against the previous \c v vectors, then
 ///               normalized; its norm is stored in \c beta[j].
 ///             The iteration stops early once
 ///             <tt>beta[j] < lanczos_tol * sqrt(n)</tt>.
 ///
 /// \param[in]  A
 ///             Linear operator providing \c dot, \c transpose_dot, and the
 ///             cuBLAS handle used for all vector operations.
 /// \param[in]  v
 ///             Start vector of length \c n. It is copied to the device with
 ///             \c copy_to_device and normalized there; \c v itself is not
 ///             modified.
 /// \param[in]  n
 ///             Length of the vectors.
 /// \param[in]  m
 ///             Maximum number of iterations. \c alpha and \c beta must hold at
 ///             least \c m entries.
 /// \param[in]  lanczos_tol
 ///             Per-component tolerance; the stopping threshold on \c beta[j]
 ///             is \c lanczos_tol times the square root of \c n.
 /// \param[in]  orthogonalize
 ///             - \c 0: no re-orthogonalization (two vectors are buffered).
 ///             - negative, or larger than <tt>m-1</tt>: orthogonalize against
 ///               all previous vectors (\c m vectors are buffered).
 ///             - otherwise: orthogonalize against the last \c orthogonalize
 ///               vectors (<tt>orthogonalize+1</tt> vectors are buffered).
 /// \param[out] alpha
 ///             Receives the norm of \c u_new for each performed iteration.
 ///             Written directly by host code.
 /// \param[out] beta
 ///             Receives the norm of \c v_new for each performed iteration.
 ///             Written directly by host code.
 /// \return     Number of iterations performed, i.e. the number of leading
 ///             entries of \c alpha and \c beta that were written.
 template <typename DataType>
 IndexType cu_golub_kahn_bidiagonalization(
         cuLinearOperator<DataType>* A,
         const DataType* v,
         const LongIndexType n,
         const IndexType m,
         const DataType lanczos_tol,
         const FlagType orthogonalize,
         DataType* alpha,
         DataType* beta)
 {
     // Get cublas handle
     cublasHandle_t cublas_handle = A->get_cublas_handle();

     // buffer_size is number of last orthogonal vectors to keep in buffers U, V
     IndexType buffer_size;
     if (orthogonalize == 0)
     {
         // At least two vectors must be stored in buffer for Lanczos recursion
         buffer_size = 2;
     }
     else if ((orthogonalize < 0) ||
              (orthogonalize > static_cast<FlagType>(m) - 1))
     {
         // Using full reorthogonalization, keep all of the m vectors in buffer
         buffer_size = m;
     }
     else
     {
         // Orthogonalize with less than m vectors (0 < orthogonalize < m-1)
         // plus one vector for the latest (the j-th) vector
         buffer_size = orthogonalize + 1;
     }

     // The recursion reads v_old from slot (j % buffer_size) while writing
     // v_new to slot ((j+1) % buffer_size). With fewer than two slots (full
     // reorthogonalization and m < 2) these would be the same memory, and for
     // m <= 0 the buffer would be too small to even hold the start vector. So
     // always keep at least two slots. This does not change the m >= 2 cases.
     if (buffer_size < 2)
     {
         buffer_size = 2;
     }

     // Allocate 2D array (as 1D array, and coalesced row-wise) to store
     // the last buffer_size of orthogonalized vectors of length n. New vectors
     // are stored by cycling through the buffer to replace with old ones.
     DataType* device_U = CudaInterface<DataType>::alloc(n * buffer_size);
     DataType* device_V = CudaInterface<DataType>::alloc(n * buffer_size);

     // Copy vector v to the first slot of V and normalize it there
     CudaInterface<DataType>::copy_to_device(v, n, &device_V[0]);
     cuVectorOperations<DataType>::normalize_vector_in_place(
             cublas_handle, &device_V[0], n);

     // Exit threshold on the norm of v_new. If each component of a zero vector
     // has the tolerance epsilon (called lanczos_tol here), the tolerance of
     // its norm is epsilon times sqrt of n. This is loop-invariant, and is
     // evaluated in double precision exactly as the in-loop expression was.
     const double beta_tol = static_cast<double>(lanczos_tol) *
                             std::sqrt(static_cast<double>(n));

     // Declare iterators
     IndexType j;
     IndexType lanczos_size = 0;
     IndexType num_ortho = 0;

     // Golub-Kahn iteration
     for (j=0; j < m; ++j)
     {
         // Counter for the non-zero size of alpha and beta
         ++lanczos_size;

         // Slots of the cyclic buffers that are used in this iteration
         DataType* u_new = &device_U[(j % buffer_size)*n];
         DataType* v_old = &device_V[(j % buffer_size)*n];
         DataType* v_new = &device_V[((j+1) % buffer_size)*n];

         // u_new = A.dot(v_old)
         A->dot(v_old, u_new);

         // Performing: u_new[i] = u_new[i] - beta[j-1] * u_old[i]
         if (j > 0)
         {
             cuVectorOperations<DataType>::subtract_scaled_vector(
                     cublas_handle,
                     &device_U[((j-1) % buffer_size)*n], n, beta[j-1],
                     u_new);
         }

         // orthogonalize u_new against previous vectors
         if (orthogonalize != 0)
         {
             // Find how many column vectors are filled so far in the buffer.
             // The same count is used below for v_new.
             if (j < buffer_size)
             {
                 num_ortho = j;
             }
             else
             {
                 num_ortho = buffer_size - 1;
             }

             // Gram-Schmidt process (there is no previous u when j is zero)
             if (j > 0)
             {
                 cuOrthogonalization<DataType>::gram_schmidt_process(
                         cublas_handle, &device_U[0], n, buffer_size,
                         (j-1)%buffer_size, num_ortho, u_new);
             }
         }

         // Normalize u_new and set its norm to alpha[j]
         alpha[j] = cuVectorOperations<DataType>::normalize_vector_in_place(
                 cublas_handle, u_new, n);

         // Performing: v_new = A.T.dot(u_new)
         A->transpose_dot(u_new, v_new);

         // Performing: v_new[i] = v_new[i] - alpha[j] * v_old[i]
         cuVectorOperations<DataType>::subtract_scaled_vector(
                 cublas_handle, v_old, n, alpha[j], v_new);

         // orthogonalize v_new against previous vectors
         if (orthogonalize != 0)
         {
             cuOrthogonalization<DataType>::gram_schmidt_process(
                     cublas_handle, &device_V[0], n, buffer_size, j%buffer_size,
                     num_ortho, v_new);
         }

         // Update beta as the norm of v_new
         beta[j] = cuVectorOperations<DataType>::normalize_vector_in_place(
                 cublas_handle, v_new, n);

         // Exit criterion when the vector v_new is (numerically) zero
         if (beta[j] < beta_tol)
         {
             break;
         }
     }

     // Free dynamic memory
     CudaInterface<DataType>::del(device_U);
     CudaInterface<DataType>::del(device_V);

     return lanczos_size;
 }


 // ===============================

 // Explicit template instantiation

 // ===============================


 // golub kahn bidiagonalization

 // Single-precision instantiation, so that the template definition in this
 // translation unit is emitted for DataType = float.
 template IndexType cu_golub_kahn_bidiagonalization<float>(
         cuLinearOperator<float>* A, const float* v,
         const LongIndexType n, const IndexType m,
         const float lanczos_tol, const FlagType orthogonalize,
         float* alpha, float* beta);


 // Double-precision instantiation, so that the template definition in this
 // translation unit is emitted for DataType = double.
 template IndexType cu_golub_kahn_bidiagonalization<double>(
         cuLinearOperator<double>* A, const double* v,
         const LongIndexType n, const IndexType m,
         const double lanczos_tol, const FlagType orthogonalize,
         double* alpha, double* beta);

CudaInterface::del
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
Definition: cuda_interface.cu:166

CudaInterface::alloc
static ArrayType * alloc(const LongIndexType array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
Definition: cuda_interface.cu:36

CudaInterface::copy_to_device
static void copy_to_device(const ArrayType *host_array, const LongIndexType array_size, ArrayType *device_array)
Copies memory on host to device memory.
Definition: cuda_interface.cu:142

cLinearOperator::transpose_dot
virtual void transpose_dot(const DataType *vector, DataType *product)=0

cLinearOperator::dot
virtual void dot(const DataType *vector, DataType *product)=0

cuLinearOperator
Base class for linear operators. This class serves as interface for all derived classes.
Definition: cu_linear_operator.h:44

cuLinearOperator::get_cublas_handle
cublasHandle_t get_cublas_handle() const
This function returns a reference to the cublasHandle_t object. The object will be created,...
Definition: cu_linear_operator.cu:168

cuOrthogonalization::gram_schmidt_process
static void gram_schmidt_process(cublasHandle_t cublas_handle, const DataType *V, const LongIndexType vector_size, const IndexType num_vectors, const IndexType last_vector, const FlagType num_ortho, DataType *r)
Modified Gram-Schmidt orthogonalization process to orthogonalize the vector v against a subset of the...
Definition: cu_orthogonalization.cu:128

cuVectorOperations::subtract_scaled_vector
static void subtract_scaled_vector(cublasHandle_t cublas_handle, const DataType *input_vector, const LongIndexType vector_size, const DataType scale, DataType *output_vector)
Subtracts the scaled input vector from the output vector.
Definition: cu_vector_operations.cu:126

cuVectorOperations::normalize_vector_in_place
static DataType normalize_vector_in_place(cublasHandle_t cublas_handle, DataType *vector, const LongIndexType vector_size)
Normalizes a vector based on Euclidean 2-norm. The result is written in-place.
Definition: cu_vector_operations.cu:234

cu_golub_kahn_bidiagonalization< double >
template IndexType cu_golub_kahn_bidiagonalization< double >(cuLinearOperator< double > *A, const double *v, const LongIndexType n, const IndexType m, const double lanczos_tol, const FlagType orthogonalize, double *alpha, double *beta)

cu_golub_kahn_bidiagonalization
IndexType cu_golub_kahn_bidiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using the Golub-Kahn-Lanczos method.
Definition: cu_golub_kahn_bidiagonalization.cu:113

cu_golub_kahn_bidiagonalization< float >
template IndexType cu_golub_kahn_bidiagonalization< float >(cuLinearOperator< float > *A, const float *v, const LongIndexType n, const IndexType m, const float lanczos_tol, const FlagType orthogonalize, float *alpha, float *beta)

cu_golub_kahn_bidiagonalization.h

LongIndexType
int LongIndexType
Definition: types.h:60

FlagType
int FlagType
Definition: types.h:68

IndexType
int IndexType
Definition: types.h:65