imate
C++/CUDA Reference
cu_golub_kahn_bidiagonalization.cu
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3  * SPDX-License-Identifier: BSD-3-Clause
4  * SPDX-FileType: SOURCE
5  *
6  * This program is free software: you can redistribute it and/or modify it
7  * under the terms of the license found in the LICENSE.txt file in the root
8  * directory of this source tree.
9  */
10 
11 
12 // =======
13 // Headers
14 // =======
15 
17 #include <cublas_v2.h> // cublasHandle_t
18 #include <cmath> // sqrt
19 #include "../_cu_basic_algebra/cu_vector_operations.h" // cuVectorOperations
20 #include "../_cu_trace_estimator/cu_orthogonalization.h" // cuOrthogonaliza...
21 #include "../_cuda_utilities/cuda_interface.h" // alloc, copy_to_device, del
22 
23 
24 // ============================
25 // golub-kahn bidiagonalization
26 // ============================
27 
111 
112 template <typename DataType>
115  const DataType* v,
116  const LongIndexType n,
117  const IndexType m,
118  const DataType lanczos_tol,
119  const FlagType orthogonalize,
120  DataType* alpha,
121  DataType* beta)
122 {
123  // Get cublas handle
124  cublasHandle_t cublas_handle = A->get_cublas_handle();
125 
126  // buffer_size is number of last orthogonal vectors to keep in buffers U, V
127  IndexType buffer_size;
128  if (orthogonalize == 0)
129  {
130  // At least two vectors must be stored in buffer for Lanczos recursion
131  buffer_size = 2;
132  }
133  else if ((orthogonalize < 0) ||
134  (orthogonalize > static_cast<FlagType>(m) - 1))
135  {
136  // Using full reorthogonalization, keep all of the m vectors in buffer
137  buffer_size = m;
138  }
139  else
140  {
141  // Orthogonalize with less than m vectors (0 < orthogonalize < m-1)
142  // plus one vector for the latest (the j-th) vector
143  buffer_size = orthogonalize + 1;
144  }
145 
146  // Allocate 2D array (as 1D array, and coalesced row-wise) to store
147  // the last buffer_size of orthogonalized vectors of length n. New vectors
148  // are stored by cycling through the buffer to replace with old ones.
149  DataType* device_U = CudaInterface<DataType>::alloc(n * buffer_size);
150  DataType* device_V = CudaInterface<DataType>::alloc(n * buffer_size);
151 
152  // Normalize vector v and copy to v_old
153  CudaInterface<DataType>::copy_to_device(v, n, &device_V[0]);
155  cublas_handle, &device_V[0], n);
156 
157  // Declare iterators
158  IndexType j;
159  IndexType lanczos_size = 0;
160  IndexType num_ortho;
161 
162  // Golub-Kahn iteration
163  for (j=0; j < m; ++j)
164  {
165  // Counter for the non-zero size of alpha and beta
166  ++lanczos_size;
167 
168  // u_new = A.dot(v_old)
169  A->dot(&device_V[(j % buffer_size)*n], &device_U[(j % buffer_size)*n]);
170 
171  // Performing: u_new[i] = u_new[i] - beta[j] * u_old[i]
172  if (j > 0)
173  {
175  cublas_handle,
176  &device_U[((j-1) % buffer_size)*n], n, beta[j-1],
177  &device_U[(j % buffer_size)*n]);
178  }
179 
180  // orthogonalize u_new against previous vectors
181  if (orthogonalize != 0)
182  {
183  // Find how many column vectors are filled so far in the buffer V
184  if (j < buffer_size)
185  {
186  num_ortho = j;
187  }
188  else
189  {
190  num_ortho = buffer_size - 1;
191  }
192 
193  // Gram-Schmidt process
194  if (j > 0)
195  {
197  cublas_handle, &device_U[0], n, buffer_size,
198  (j-1)%buffer_size, num_ortho,
199  &device_U[(j % buffer_size)*n]);
200  }
201  }
202 
203  // Normalize u_new and set its norm to alpha[j]
205  cublas_handle, &device_U[(j % buffer_size)*n], n);
206 
207  // Performing: v_new = A.T.dot(u_new) - alpha[j] * v_old
208  A->transpose_dot(&device_U[(j % buffer_size)*n],
209  &device_V[((j+1) % buffer_size)*n]);
210 
211  // Performing: v_new[i] = v_new[i] - alpha[j] * v_old[i]
213  cublas_handle, &device_V[(j % buffer_size)*n], n, alpha[j],
214  &device_V[((j+1) % buffer_size)*n]);
215 
216  // orthogonalize v_new against previous vectors
217  if (orthogonalize != 0)
218  {
220  cublas_handle, &device_V[0], n, buffer_size, j%buffer_size,
221  num_ortho, &device_V[((j+1) % buffer_size)*n]);
222  }
223 
224  // Update beta as the norm of v_new
226  cublas_handle, &device_V[((j+1) % buffer_size)*n], n);
227 
228  // Exit criterion when the vector r is zero. If each component of a
229  // zero vector has the tolerance epsilon, (which is called lanczos_tol
230  // here), the tolerance of norm of r is epsilon times sqrt of n.
231  if (beta[j] < lanczos_tol * sqrt(n))
232  {
233  break;
234  }
235  }
236 
237  // Free dynamic memory
240 
241  return lanczos_size;
242 }
243 
244 
245 // ===============================
246 // Explicit template instantiation
247 // ===============================
248 
249 // golub kahn bidiagonalization
252  const float* v,
253  const LongIndexType n,
254  const IndexType m,
255  const float lanczos_tol,
256  const FlagType orthogonalize,
257  float* alpha,
258  float* beta);
259 
262  const double* v,
263  const LongIndexType n,
264  const IndexType m,
265  const double lanczos_tol,
266  const FlagType orthogonalize,
267  double* alpha,
268  double* beta);
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static ArrayType * alloc(const LongIndexType array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void copy_to_device(const ArrayType *host_array, const LongIndexType array_size, ArrayType *device_array)
Copies memory on host to device memory.
virtual void transpose_dot(const DataType *vector, DataType *product)=0
virtual void dot(const DataType *vector, DataType *product)=0
Base class for linear operators. This class serves as interface for all derived classes.
cublasHandle_t get_cublas_handle() const
This function returns a reference to the cublasHandle_t object. The object will be created,...
static void gram_schmidt_process(cublasHandle_t cublas_handle, const DataType *V, const LongIndexType vector_size, const IndexType num_vectors, const IndexType last_vector, const FlagType num_ortho, DataType *r)
Modified Gram-Schmidt orthogonalization process to orthogonalize the vector v against a subset of the...
static void subtract_scaled_vector(cublasHandle_t cublas_handle, const DataType *input_vector, const LongIndexType vector_size, const DataType scale, DataType *output_vector)
Subtracts the scaled input vector from the output vector.
static DataType normalize_vector_in_place(cublasHandle_t cublas_handle, DataType *vector, const LongIndexType vector_size)
Normalizes a vector based on Euclidean 2-norm. The result is written in-place.
template IndexType cu_golub_kahn_bidiagonalization< double >(cuLinearOperator< double > *A, const double *v, const LongIndexType n, const IndexType m, const double lanczos_tol, const FlagType orthogonalize, double *alpha, double *beta)
IndexType cu_golub_kahn_bidiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
template IndexType cu_golub_kahn_bidiagonalization< float >(cuLinearOperator< float > *A, const float *v, const LongIndexType n, const IndexType m, const float lanczos_tol, const FlagType orthogonalize, float *alpha, float *beta)
int LongIndexType
Definition: types.h:60
int FlagType
Definition: types.h:68
int IndexType
Definition: types.h:65