imate
C++/CUDA Reference
cu_lanczos_tridiagonalization.cu
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3  * SPDX-License-Identifier: BSD-3-Clause
4  * SPDX-FileType: SOURCE
5  *
6  * This program is free software: you can redistribute it and/or modify it
7  * under the terms of the license found in the LICENSE.txt file in the root
8  * directory of this source tree.
9  */
10 
11 
12 // =======
13 // Headers
14 // =======
15 
17 #include <cublas_v2.h> // cublasHandle_t
18 #include <cmath> // sqrt
19 #include "./cu_orthogonalization.h" // cuOrthogonalization
20 #include "../_cu_basic_algebra/cu_vector_operations.h" // cuVectorOperations
21 #include "../_cuda_utilities/cuda_interface.h" // alloc, copy_to_device, del
22 
23 
24 // ============================
25 // c lanczos tridiagonalization
26 // ============================
27 
117 
118 template <typename DataType>
121  const DataType* v,
122  const LongIndexType n,
123  const IndexType m,
124  const DataType lanczos_tol,
125  const FlagType orthogonalize,
126  DataType* alpha,
127  DataType* beta)
128 {
129  // Get cublas handle
130  cublasHandle_t cublas_handle = A->get_cublas_handle();
131 
132  // buffer_size is number of last orthogonal vectors to keep in the buffer V
133  IndexType buffer_size;
134  if (orthogonalize == 0 || orthogonalize == 1)
135  {
136  // At least two vectors must be stored in buffer for Lanczos recursion
137  buffer_size = 2;
138  }
139  else if ((orthogonalize < 0) ||
140  (orthogonalize > static_cast<FlagType>(m)))
141  {
142  // Using full reorthogonalization, keep all of the m vectors in buffer
143  buffer_size = m;
144  }
145  else
146  {
147  // Orthogonalize with less than m vectors (0 < orthogonalize < m)
148  buffer_size = orthogonalize;
149  }
150 
151  // Allocate 2D array (as 1D array, and coalesced row-wise) to store
152  // the last buffer_size of orthogonalized vectors of length n. New vectors
153  // are stored by cycling through the buffer to replace with old ones.
154  DataType* device_V = CudaInterface<DataType>::alloc(n * buffer_size);
155 
156  // Allocate vector r
157  DataType* device_r = CudaInterface<DataType>::alloc(n);
158 
159  // Copy v into r
161 
162  // Initial beta
163  DataType initial_beta = cuVectorOperations<DataType>::euclidean_norm(
164  cublas_handle, device_r, n);
165 
166  // Declare iterators
167  IndexType j;
168  IndexType lanczos_size = 0;
169  IndexType num_ortho;
170 
171  // In the following, beta[j] means beta[j-1] in the Demmel text
172  for (j=0; j < m; ++j)
173  {
174  // Update the size of Lanczos tridiagonal matrix
175  ++lanczos_size;
176 
177  // Normalize r and copy to the j-th column of V
178  if (j == 0)
179  {
181  cublas_handle, device_r, n, 1.0/initial_beta,
182  &device_V[(j % buffer_size)*n]);
183  }
184  else
185  {
187  cublas_handle, device_r, n, 1.0/beta[j-1],
188  &device_V[(j % buffer_size)*n]);
189  }
190 
191  // Multiply A to the j-th column of V, write into r
192  A->dot(&device_V[(j % buffer_size)*n], device_r);
193 
194  // alpha[j] is V[:, j] dot r
196  cublas_handle, &device_V[(j % buffer_size)*n], device_r, n);
197 
198  // Subtract V[:,j] * alpha[j] from r
200  cublas_handle, &device_V[(j % buffer_size)*n], n, alpha[j],
201  device_r);
202 
203  // Subtract V[:,j-1] * beta[j] from r
204  if (j > 0)
205  {
207  cublas_handle, &device_V[((j-1) % buffer_size)*n], n,
208  beta[j-1], device_r);
209  }
210 
211  // Gram-Schmidt process (full re-orthogonalization)
212  if (orthogonalize != 0)
213  {
214  // Find how many column vectors are filled so far in the buffer V
215  if (j < buffer_size)
216  {
217  num_ortho = j+1;
218  }
219  else
220  {
221  num_ortho = buffer_size;
222  }
223 
224  // Gram-Schmidt process
226  cublas_handle, &device_V[0], n, buffer_size, j%buffer_size,
227  num_ortho, device_r);
228  }
229 
230  // beta is norm of r
232  cublas_handle, device_r, n);
233 
234  // Exit criterion when the vector r is zero. If each component of a
235  // zero vector has the tolerance epsilon, (which is called lanczos_tol
236  // here), the tolerance of norm of r is epsilon times sqrt of n.
237  if (beta[j] < lanczos_tol * sqrt(n))
238  {
239  break;
240  }
241  }
242 
243  // Free dynamic memory
246 
247  return lanczos_size;
248 }
249 
250 
251 // ===============================
252 // Explicit template instantiation
253 // ===============================
254 
255 // lanczos tridiagonalization
258  const float* v,
259  const LongIndexType n,
260  const IndexType m,
261  const float lanczos_tol,
262  const FlagType orthogonalize,
263  float* alpha,
264  float* beta);
265 
268  const double* v,
269  const LongIndexType n,
270  const IndexType m,
271  const double lanczos_tol,
272  const FlagType orthogonalize,
273  double* alpha,
274  double* beta);
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
static ArrayType * alloc(const LongIndexType array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
static void copy_to_device(const ArrayType *host_array, const LongIndexType array_size, ArrayType *device_array)
Copies memory on host to device memory.
virtual void dot(const DataType *vector, DataType *product)=0
Base class for linear operators. This class serves as interface for all derived classes.
cublasHandle_t get_cublas_handle() const
This function returns a reference to the cublasHandle_t object. The object will be created,...
static void gram_schmidt_process(cublasHandle_t cublas_handle, const DataType *V, const LongIndexType vector_size, const IndexType num_vectors, const IndexType last_vector, const FlagType num_ortho, DataType *r)
Modified Gram-Schmidt orthogonalization process to orthogonalize the vector v against a subset of the...
static void copy_scaled_vector(cublasHandle_t cublas_handle, const DataType *input_vector, const LongIndexType vector_size, const DataType scale, DataType *output_vector)
Scales a vector and stores to a new vector.
static void subtract_scaled_vector(cublasHandle_t cublas_handle, const DataType *input_vector, const LongIndexType vector_size, const DataType scale, DataType *output_vector)
Subtracts the scaled input vector from the output vector.
static DataType inner_product(cublasHandle_t cublas_handle, const DataType *vector1, const DataType *vector2, const LongIndexType vector_size)
Computes Euclidean inner product of two vectors.
static DataType euclidean_norm(cublasHandle_t cublas_handle, const DataType *vector, const LongIndexType vector_size)
Computes the Euclidean 2-norm of a 1D array.
template IndexType cu_lanczos_tridiagonalization< double >(cuLinearOperator< double > *A, const double *v, const LongIndexType n, const IndexType m, const double lanczos_tol, const FlagType orthogonalize, double *alpha, double *beta)
template IndexType cu_lanczos_tridiagonalization< float >(cuLinearOperator< float > *A, const float *v, const LongIndexType n, const IndexType m, const float lanczos_tol, const FlagType orthogonalize, float *alpha, float *beta)
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...
int LongIndexType
Definition: types.h:60
int FlagType
Definition: types.h:68
int IndexType
Definition: types.h:65