imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cu_lanczos_tridiagonalization.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
17#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
18 // __half, __nv_bfloat16
19#include <cmath> // std::sqrt
20#include "./cu_orthogonalization.h" // cuOrthogonalization
21#include "../_cu_basic_algebra/cu_vector_operations.h" // cuVectorOperations
22#include "../_cuda_utilities/cuda_api.h" // CudaAPI
23#include "../_cu_arithmetics/cu_arithmetics.h" // cu_arithmetics
24
25// Avoid CUBLAS numeration value not handled in switch [-Wswitch-enum] warning
26#ifdef _MSC_VER
27 #pragma warning(push, 0) // Suppress all warnings from the followings
28 #include <cublas_v2.h> // cublasHandle_t
29 #pragma warning(pop) // Restore previous warning level
30#elif defined(__INTEL_LLVM_COMPILER) || defined(__INTEL_COMPILER)
31 #pragma warning(push, 0)
32 #include <cublas_v2.h> // cublasHandle_t
33 #pragma warning(pop)
34#elif defined(__GNUC__) || defined(__clang__)
35 #pragma GCC diagnostic push
36 #pragma GCC diagnostic ignored "-Wswitch-enum"
37 #include <cublas_v2.h> // cublasHandle_t
38 #pragma GCC diagnostic pop
39#else
40 #include <cublas_v2.h> // cublasHandle_t
41#endif
42
43
44// ============================
45// c lanczos tridiagonalization
46// ============================
47
137
138template <typename DataType>
141 const DataType* v,
142 const LongIndexType n,
143 const IndexType m,
144 const DataType lanczos_tol,
145 const FlagType orthogonalize,
146 DataType* alpha,
147 DataType* beta)
148{
149 // Get cublas handle
150 cublasHandle_t cublas_handle = A->get_cublas_handle();
151
152 // buffer_size is number of last orthogonal vectors to keep in the buffer V
153 IndexType buffer_size;
154 if (orthogonalize == 0 || orthogonalize == 1)
155 {
156 // At least two vectors must be stored in buffer for Lanczos recursion
157 buffer_size = 2;
158 }
159 else if ((orthogonalize < 0) ||
160 (orthogonalize > static_cast<FlagType>(m)))
161 {
162 // Using full re-orthogonalization, keep all of the m vectors in buffer
163 buffer_size = m;
164 }
165 else
166 {
167 // Orthogonalize with less than m vectors (0 < orthogonalize < m)
168 buffer_size = orthogonalize;
169 }
170
171 // Allocate 2D array (as 1D array, and coalesced row-wise) to store
172 // the last buffer_size of orthogonalized vectors of length n. New vectors
173 // are stored by cycling through the buffer to replace with old ones.
174 DataType* device_V = CudaAPI<DataType>::alloc(n * buffer_size);
175
176 // Allocate vector r
177 DataType* device_r = CudaAPI<DataType>::alloc(n);
178
179 // Copy v into r
180 CudaAPI<DataType>::copy_to_device(v, n, device_r);
181
182 // Initial beta
183 DataType initial_beta = cuVectorOperations<DataType>::euclidean_norm(
184 cublas_handle, device_r, n);
185
186 // Declare iterators
187 IndexType j;
188 IndexType lanczos_size = 0;
189 IndexType num_ortho;
190
191 // In the following, beta[j] means beta[j-1] in the Demmel text
192 for (j=0; j < m; ++j)
193 {
194 // Update the size of Lanczos tridiagonal matrix
195 ++lanczos_size;
196
197 // Normalize r and copy to the j-th column of V
198 if (j == 0)
199 {
201 cublas_handle, device_r, n,
203 &device_V[(j % buffer_size)*n]);
204 }
205 else
206 {
208 cublas_handle, device_r, n,
210 &device_V[(j % buffer_size)*n]);
211 }
212
213 // Multiply A to the j-th column of V, write into r
214 A->dot(&device_V[(j % buffer_size)*n], device_r);
215
216 // alpha[j] is V[:, j] dot r
218 cublas_handle, &device_V[(j % buffer_size)*n], device_r, n);
219
220 // Subtract V[:,j] * alpha[j] from r
222 cublas_handle, &device_V[(j % buffer_size)*n], n, alpha[j],
223 device_r);
224
225 // Subtract V[:,j-1] * beta[j] from r
226 if (j > 0)
227 {
229 cublas_handle, &device_V[((j-1) % buffer_size)*n], n,
230 beta[j-1], device_r);
231 }
232
233 // Gram-Schmidt process (full re-orthogonalization)
234 if (orthogonalize != 0)
235 {
236 // Find how many column vectors are filled so far in the buffer V
237 if (j < buffer_size)
238 {
239 num_ortho = j+1;
240 }
241 else
242 {
243 num_ortho = buffer_size;
244 }
245
246 // Gram-Schmidt process
248 cublas_handle, &device_V[0], n, buffer_size, j%buffer_size,
249 num_ortho, device_r);
250 }
251
252 // beta is norm of r
254 cublas_handle, device_r, n);
255
256 // Exit criterion when the vector r is zero. If each component of a
257 // zero vector has the tolerance epsilon, (which is called lanczos_tol
258 // here), the tolerance of norm of r is epsilon times sqrt of n.
259 if (beta[j] < cu_arithmetics::mul(
260 lanczos_tol,
262 static_cast<double>(std::sqrt(n)))
263 )
264 )
265 {
266 break;
267 }
268 }
269
270 // Free dynamic memory
271 CudaAPI<DataType>::del(device_V);
272 CudaAPI<DataType>::del(device_r);
273
274 return lanczos_size;
275}
276
277
278// ===============================
279// Explicit template instantiation
280// ===============================
281
282// lanczos tridiagonalization (__nv_fp8_e5m2)
283#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
284 template IndexType cu_lanczos_tridiagonalization<__nv_fp8_e5m2>(
286 const __nv_fp8_e5m2* v,
287 const LongIndexType n,
288 const IndexType m,
289 const __nv_fp8_e5m2 lanczos_tol,
290 const FlagType orthogonalize,
291 __nv_fp8_e5m2* alpha,
292 __nv_fp8_e5m2* beta);
293#endif
294
295// lanczos tridiagonalization (__nv_fp8_e4m3)
296#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
297 template IndexType cu_lanczos_tridiagonalization<__nv_fp8_e4m3>(
299 const __nv_fp8_e4m3* v,
300 const LongIndexType n,
301 const IndexType m,
302 const __nv_fp8_e4m3 lanczos_tol,
303 const FlagType orthogonalize,
304 __nv_fp8_e4m3* alpha,
305 __nv_fp8_e4m3* beta);
306#endif
307
308// lanczos tridiagonalization (__half)
309#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
310 template IndexType cu_lanczos_tridiagonalization<__half>(
312 const __half* v,
313 const LongIndexType n,
314 const IndexType m,
315 const __half lanczos_tol,
316 const FlagType orthogonalize,
317 __half* alpha,
318 __half* beta);
319#endif
320
321// lanczos tridiagonalization (__nv_bfloat16)
322#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
323 template IndexType cu_lanczos_tridiagonalization<__nv_bfloat16>(
325 const __nv_bfloat16* v,
326 const LongIndexType n,
327 const IndexType m,
328 const __nv_bfloat16 lanczos_tol,
329 const FlagType orthogonalize,
330 __nv_bfloat16* alpha,
331 __nv_bfloat16* beta);
332#endif
333
334// lanczos tridiagonalization (float)
335#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
338 const float* v,
339 const LongIndexType n,
340 const IndexType m,
341 const float lanczos_tol,
342 const FlagType orthogonalize,
343 float* alpha,
344 float* beta);
345#endif
346
347// lanczos tridiagonalization (double)
348#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
351 const double* v,
352 const LongIndexType n,
353 const IndexType m,
354 const double lanczos_tol,
355 const FlagType orthogonalize,
356 double* alpha,
357 double* beta);
358#endif
static ArrayType * alloc(const size_t array_size)
Allocates memory on gpu device. This function creates a pointer and returns it.
Definition cuda_api.cu:39
static void del(void *device_array)
Deletes memory on gpu device if its pointer is not NULL, then sets the pointer to NULL.
Definition cuda_api.cu:169
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Definition cuda_api.cu:145
Base class for linear operators. This class serves as interface for all derived classes.
virtual void dot(const DataType *vector, DataType *product)=0
cublasHandle_t get_cublas_handle() const
This function returns a reference to the cublasHandle_t object. The object will be created,...
static void gram_schmidt_process(cublasHandle_t cublas_handle, const DataType *V, const LongIndexType vector_size, const IndexType num_vectors, const IndexType last_vector, const FlagType num_ortho, DataType *r)
Modified Gram-Schmidt orthogonalization process to orthogonalize the vector v against a subset of the...
static void copy_scaled_vector(cublasHandle_t cublas_handle, const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Scales a vector and stores to a new vector.
static void subtract_scaled_vector(cublasHandle_t cublas_handle, const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Subtracts the scaled input vector from the output vector.
static DataType inner_product(cublasHandle_t cublas_handle, const DataType *RESTRICT vector1, const DataType *RESTRICT vector2, const LongIndexType vector_size)
Computes Euclidean inner product of two vectors.
static DataType euclidean_norm(cublasHandle_t cublas_handle, const DataType *RESTRICT vector, const LongIndexType vector_size)
Computes the Euclidean 2-norm of a 1D array.
template IndexType cu_lanczos_tridiagonalization< double >(cuLinearOperator< double > *A, const double *v, const LongIndexType n, const IndexType m, const double lanczos_tol, const FlagType orthogonalize, double *alpha, double *beta)
template IndexType cu_lanczos_tridiagonalization< float >(cuLinearOperator< float > *A, const float *v, const LongIndexType n, const IndexType m, const float lanczos_tol, const FlagType orthogonalize, float *alpha, float *beta)
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...
__host__ __device__ DataType mul(const DataType x, const DataType y)
Multiply two floating point numbers in round-to-nearest-even mode.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
int LongIndexType
Definition types.h:60
int FlagType
Definition types.h:68
int IndexType
Definition types.h:65