#include "./cu_golub_kahn_bidiagonalization.h"
#include <cublas_v2.h>
#include <cmath>
#include "../_cu_basic_algebra/cu_vector_operations.h"
#include "../_cu_trace_estimator/cu_orthogonalization.h"
#include "../_cuda_utilities/cuda_interface.h"

Include dependency graph for cu_golub_kahn_bidiagonalization.cu:

Functions
template<typename DataType >
IndexType	cu_golub_kahn_bidiagonalization (cuLinearOperator< DataType > A, const DataType v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType alpha, DataType beta)
	Bi-diagonalizes the positive-definite matrix `A` using Golub-Kahn-Lanczos method. More...

template IndexType	cu_golub_kahn_bidiagonalization< float > (cuLinearOperator< float > A, const float v, const LongIndexType n, const IndexType m, const float lanczos_tol, const FlagType orthogonalize, float alpha, float beta)

template IndexType	cu_golub_kahn_bidiagonalization< double > (cuLinearOperator< double > A, const double v, const LongIndexType n, const IndexType m, const double lanczos_tol, const FlagType orthogonalize, double alpha, double beta)

Function Documentation

◆ cu_golub_kahn_bidiagonalization()

template<typename DataType >

IndexType cu_golub_kahn_bidiagonalization	(	cuLinearOperator< DataType > *	A,
		const DataType *	v,
		const LongIndexType	n,
		const IndexType	m,
		const DataType	lanczos_tol,
		const FlagType	orthogonalize,
		DataType *	alpha,
		DataType *	beta
	)

Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.

This method bi-diagonalizes the matrix A to B using the start vector v. m is the Lanczos degree, which will be the size of the square matrix B.

The outputs of this function are alpha (of length m) and beta (of length m), which are the diagonal (alpha[:]) and off-diagonal (beta[:]) elements of the bi-diagonal (m,m) symmetric and positive-definite matrix B.

Lanczos tridiagonalization vs Golub-Kahn Bidiagonalization

The Lanczos tri-diagonalization is twice as fast (in runtime), as it performs only one matrix-vector multiplication per iteration, whereas the Golub-Kahn bi-diagonalization performs two matrix-vector multiplications per iteration.
The Lanczos tri-diagonalization can only be applied to symmetric matrices. Whereas the Golub-Kahn bi-diagonalization can be applied to any matrix.

Reference

NetLib Algorithm 6.27, netlib.org/utk/people/JackDongarra/etemplates/node198.html
Matrix Computations, Golub, p. 495
Demmel, J., Templates for Solution of Algebraic Eigenvalue Problem, p. 143

Warning: When the matrix A is very close to the identity matrix, the Golub-Kahn bi-diagonalization method cannot find beta, as beta becomes zero. If A is not exactly the identity, you may decrease the tolerance lanczos_tol to a very small number. However, if A is almost the identity matrix, decreasing lanczos_tol will not help, and this function cannot be used.

See also: lanczos_tridiagonalization

Parameters

[in]	A	A linear operator that represents a matrix of size (n,n) and can perform matrix-vector operation with `dot()` method and transposed matrix-vector operation with `transpose_dot()` method. This matrix should be positive-definite.
[in]	v	Start vector for the Golub-Kahn bi-diagonalization. Column vector of size `n`. It could be generated randomly. Often it is generated by the Rademacher distribution with entries `+1` and `-1`.
[in]	n	Size of the square matrix `A`, which is also the size of the vector `v`.
[in]	m	Lanczos degree, which is the number of Lanczos iterations.
[in]	lanczos_tol	The tolerance of the residual error of the Lanczos iteration.
[in]	orthogonalize	Indicates whether to orthogonalize the orthogonal eigenvectors during Lanczos recursive iterations. If set to `0`, no orthogonalization is performed. If set to a negative integer, a newly computed eigenvector is orthogonalized against all the previous eigenvectors (full reorthogonalization). If set to a positive integer, say `q` less than `lanczos_degree`, the newly computed eigenvector is orthogonalized against the last `q` previous eigenvectors (partial reorthogonalization). If set to an integer larger than `lanczos_degree`, it is cut to `lanczos_degree`, which effectively orthogonalizes against all previous eigenvectors (full reorthogonalization).
[out]	alpha	This is a 1D array of size `m` and `alpha`[:] constitute the diagonal elements of the bi-diagonal matrix `B`. This is the output and written in place.
[out]	beta	This is a 1D array of size `m`, and the elements `beta`[:] constitute the sup-diagonals of the bi-diagonal matrix `B`. This array is the output and written in place.

Returns: Counter for the Lanczos iterations. Normally, the size of the output matrix should be (m,m), where m is the Lanczos degree. However, if the algorithm terminates early, the size of alpha and beta, and hence of the output bi-diagonal matrix, is smaller. This counter keeps track of the non-zero size of alpha and beta.

Definition at line 113 of file cu_golub_kahn_bidiagonalization.cu.

 {
     // Get cublas handle
     cublasHandle_t cublas_handle = A->get_cublas_handle();
  
     // buffer_size is number of last orthogonal vectors to keep in buffers U, V
     IndexType buffer_size;
     if (orthogonalize == 0)
     {
         // At least two vectors must be stored in buffer for Lanczos recursion
         buffer_size = 2;
     }
     else if ((orthogonalize < 0) ||
              (orthogonalize > static_cast<FlagType>(m) - 1))
     {
         // Using full reorthogonalization, keep all of the m vectors in buffer
         buffer_size = m;
     }
     else
     {
         // Orthogonalize with less than m vectors (0 < orthogonalize < m-1)
         // plus one vector for the latest (the j-th) vector
         buffer_size = orthogonalize + 1;
     }
  
     // Allocate 2D array (as 1D array, and coalesced row-wise) to store
     // the last buffer_size of orthogonalized vectors of length n. New vectors
     // are stored by cycling through the buffer to replace with old ones.
     DataType* device_U = CudaInterface<DataType>::alloc(n * buffer_size);
     DataType* device_V = CudaInterface<DataType>::alloc(n * buffer_size);
  
     // Normalize vector v and copy to v_old
     CudaInterface<DataType>::copy_to_device(v, n, &device_V[0]);
     cuVectorOperations<DataType>::normalize_vector_in_place(
             cublas_handle, &device_V[0], n);
  
     // Declare iterators
     IndexType j;
     IndexType lanczos_size = 0;
     IndexType num_ortho;
  
     // Golub-Kahn iteration
     for (j=0; j < m; ++j)
     {
         // Counter for the non-zero size of alpha and beta
         ++lanczos_size;
  
         // u_new = A.dot(v_old)
         A->dot(&device_V[(j % buffer_size)*n], &device_U[(j % buffer_size)*n]);
  
         // Performing: u_new[i] = u_new[i] - beta[j] * u_old[i]
         if (j > 0)
         {
             cuVectorOperations<DataType>::subtract_scaled_vector(
                     cublas_handle,
                     &device_U[((j-1) % buffer_size)*n], n, beta[j-1],
                     &device_U[(j % buffer_size)*n]);
         }
  
         // orthogonalize u_new against previous vectors
         if (orthogonalize != 0)
         {
             // Find how many column vectors are filled so far in the buffer V
             if (j < buffer_size)
             {
                 num_ortho = j;
             }
             else
             {
                 num_ortho = buffer_size - 1;
             }
  
             // Gram-Schmidt process
             if (j > 0)
             {
                 cuOrthogonalization<DataType>::gram_schmidt_process(
                         cublas_handle, &device_U[0], n, buffer_size,
                         (j-1)%buffer_size, num_ortho,
                         &device_U[(j % buffer_size)*n]);
             }
         }
  
         // Normalize u_new and set its norm to alpha[j]
         alpha[j] = cuVectorOperations<DataType>::normalize_vector_in_place(
                 cublas_handle, &device_U[(j % buffer_size)*n], n);
  
         // Performing: v_new = A.T.dot(u_new) - alpha[j] * v_old
         A->transpose_dot(&device_U[(j % buffer_size)*n],
                &device_V[((j+1) % buffer_size)*n]);
  
         // Performing: v_new[i] = v_new[i] - alpha[j] * v_old[i]
         cuVectorOperations<DataType>::subtract_scaled_vector(
                 cublas_handle, &device_V[(j % buffer_size)*n], n, alpha[j],
                 &device_V[((j+1) % buffer_size)*n]);
  
         // orthogonalize v_new against previous vectors
         if (orthogonalize != 0)
         {
             cuOrthogonalization<DataType>::gram_schmidt_process(
                     cublas_handle, &device_V[0], n, buffer_size, j%buffer_size,
                     num_ortho, &device_V[((j+1) % buffer_size)*n]);
         }
  
         // Update beta as the norm of v_new
         beta[j] = cuVectorOperations<DataType>::normalize_vector_in_place(
                 cublas_handle, &device_V[((j+1) % buffer_size)*n], n);
  
         // Exit criterion when the vector r is zero. If each component of a
         // zero vector has the tolerance epsilon, (which is called lanczos_tol
         // here), the tolerance of norm of r is epsilon times sqrt of n.
         if (beta[j] < lanczos_tol * sqrt(n))
         {
             break;
         }
     }
  
     // Free dynamic memory
     CudaInterface<DataType>::del(device_U);
     CudaInterface<DataType>::del(device_V);
  
     return lanczos_size;
 }

References CudaInterface< ArrayType >::alloc(), CudaInterface< ArrayType >::copy_to_device(), CudaInterface< ArrayType >::del(), cLinearOperator< DataType >::dot(), cuLinearOperator< DataType >::get_cublas_handle(), cuOrthogonalization< DataType >::gram_schmidt_process(), cuVectorOperations< DataType >::normalize_vector_in_place(), cuVectorOperations< DataType >::subtract_scaled_vector(), and cLinearOperator< DataType >::transpose_dot().

Referenced by cuTraceEstimator< DataType >::_cu_stochastic_lanczos_quadrature().

Here is the call graph for this function:

Here is the caller graph for this function:

◆ cu_golub_kahn_bidiagonalization< double >()

template IndexType cu_golub_kahn_bidiagonalization< double >	(	cuLinearOperator< double > *	A,
		const double *	v,
		const LongIndexType	n,
		const IndexType	m,
		const double	lanczos_tol,
		const FlagType	orthogonalize,
		double *	alpha,
		double *	beta
	)

◆ cu_golub_kahn_bidiagonalization< float >()