imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cu_orthogonalization.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Imports
14// =======
15
17#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
18 // __half, __nv_bfloat16
19#include <cstdlib> // abort, NULL
20#include <iostream> // std::cerr, std::endl
21#include <cmath> // std::sqrt, std::fabs
22#include <limits> // std::numeric_limits
23#include "../_cu_basic_algebra/cu_vector_operations.h" // cuVectorOperations
24#include "../_random_generator/random_array_generator.h" // RandomArrayGene...
25#include "../_random_generator/random_number_generator.h" // RandomNumberGe...
26#include "../_cuda_utilities/cuda_api.h" // CudaAPI
27#include "../_cu_arithmetics/cu_arithmetics.h" // cu_arithmetics
28
29
30// ====================
31// gram schmidt process
32// ====================
33
129
130template <typename DataType>
132 cublasHandle_t cublas_handle,
133 const DataType* V,
134 const LongIndexType vector_size,
135 const IndexType num_vectors,
136 const IndexType last_vector,
137 const FlagType num_ortho,
138 DataType* v)
139{
140 // Determine how many previous vectors to orthogonalize against
141 IndexType num_steps;
142 if ((num_ortho == 0) || (num_vectors < 2))
143 {
144 // No orthogonalization is performed
145 return;
146 }
147 else if ((num_ortho < 0) ||
148 (num_ortho > static_cast<FlagType>(num_vectors)))
149 {
150 // Orthogonalize against all vectors
151 num_steps = num_vectors;
152 }
153 else
154 {
155 // Orthogonalize against only the last num_ortho vectors
156 num_steps = num_ortho;
157 }
158
159 // Vectors can be orthogonalized at most to the full basis of the vector
160 // space. Thus, num_steps cannot be larger than the dimension of vector
161 // space, which is vector_size.
162 if (num_steps > static_cast<IndexType>(vector_size))
163 {
164 num_steps = vector_size;
165 }
166
167 IndexType i;
168 DataType inner_prod;
169 DataType norm;
170 DataType norm_v;
171 DataType epsilon = cu_arithmetics::epsilon<DataType>();
172 DataType distance2;
173
174 // Iterate over vectors
175 for (IndexType step=0; step < num_steps; ++step)
176 {
177 // i is the index of a column vector in V to orthogonalize v against it
178 if ((last_vector % num_vectors) >= step)
179 {
180 i = (last_vector % num_vectors) - step;
181 }
182 else
183 {
184 // Wrap around negative indices from the end of column index
185 i = (last_vector % num_vectors) - step + num_vectors;
186 }
187
188 // Norm of j-th vector
190 cublas_handle, &V[vector_size*i], vector_size);
191
192 // Check norm
193 if (norm < \
195 epsilon,
197 std::sqrt(vector_size))
198 )
199 )
200 {
201 std::cerr << "WARNING: norm of the given vector is too small. " \
202 << "Cannot orthogonalize against zero vector. " \
203 << "Skipping." << std::endl;
204 continue;
205 }
206
207 // Projection
209 cublas_handle, &V[vector_size*i], v, vector_size);
210
211 // scale for subtraction
212 DataType scale = cu_arithmetics::div(
213 inner_prod,
214 cu_arithmetics::mul(norm, norm));
215 DataType scale_abs = cu_arithmetics::abs(scale);
216 DataType one = cu_arithmetics::cast<double, DataType>(1.0);
217 DataType two = cu_arithmetics::cast<double, DataType>(2.0);
218
219 // If scale is 1, it is possible that vector v and j-th vector are
220 // identical (or close).
221 if (cu_arithmetics::abs(cu_arithmetics::sub(scale_abs, one)) <= \
222 cu_arithmetics::mul(two, epsilon))
223 {
224 // Norm of the vector v
226 cublas_handle, v, vector_size);
227
228 // Compute distance between the j-th vector and vector v
229 distance2 = \
230 cu_arithmetics::add(
231 cu_arithmetics::mul(norm_v, norm_v),
232 cu_arithmetics::mul(-two, inner_prod),
233 cu_arithmetics::mul(norm, norm)
234 );
235
236 // If distance is zero, do not re-orthogonalize i-th against
237 // the j-th vector.
238 if (distance2 < \
240 two,
241 epsilon,
243 static_cast<unsigned long long int>(vector_size))
244 )
245 )
246 {
247 continue;
248 }
249 }
250
251 // Subtraction
253 cublas_handle, &V[vector_size*i], vector_size, scale, v);
254 }
255}
256
257
258// =====================
259// orthogonalize vectors
260// =====================
261
300
301template <typename DataType>
303 cublasHandle_t cublas_handle,
304 DataType* vectors,
305 const LongIndexType vector_size,
306 const IndexType num_vectors)
307{
308 // Do nothing if there is only one vector
309 if (num_vectors < 2)
310 {
311 return;
312 }
313
314 IndexType i = 0;
315 IndexType j;
316 IndexType start_j;
317 DataType inner_prod;
318 DataType norm_j;
319 DataType norm_i;
320 DataType epsilon = cu_arithmetics::epsilon<DataType>();
321 IndexType success = 1;
322 IndexType max_num_trials = 20;
323 IndexType num_trials = 0;
324 IndexType num_threads = 1;
325 RandomNumberGenerator random_number_generator(num_threads);
326 DataType* buffer = NULL;
327
328 while (i < num_vectors)
329 {
330 if ((success == 0) && (num_trials >= max_num_trials))
331 {
332 std::cerr << "ERROR: Cannot orthogonalize vectors after " \
333 << num_trials << " trials. Aborting." \
334 << std::endl;
335 abort();
336 }
337
338 // Reset on new trial (if it was set to 0 before to start a new trial)
339 success = 1;
340
341 // j iterates on previous vectors in a window of at most vector_size
342 if (static_cast<LongIndexType>(i) > vector_size)
343 {
344 // When vector_size is smaller than i, it is fine to cast to signed
345 start_j = i - static_cast<IndexType>(vector_size);
346 }
347 else
348 {
349 start_j = 0;
350 }
351
352 // Re-orthogonalize against previous vectors
353 for (j=start_j; j < i; ++j)
354 {
355 // Norm of the j-th vector
357 cublas_handle, &vectors[j*vector_size], vector_size);
358
359 // Check norm
360 if (norm_j < \
362 epsilon,
364 std::sqrt(vector_size))
365 )
366 )
367 {
368 std::cerr << "WARNING: norm of the given vector is too " \
369 << " small. Cannot re-orthogonalize against zero" \
370 << "vector. Skipping."
371 << std::endl;
372 continue;
373 }
374
375 // Projecting i-th vector to j-th vector
377 cublas_handle, &vectors[i*vector_size],
378 &vectors[j*vector_size], vector_size);
379
380 // Scale of subtraction
381 DataType scale = cu_arithmetics::div(
382 inner_prod,
383 cu_arithmetics::mul(norm_j, norm_j));
384
385 // Subtraction
387 cublas_handle, &vectors[vector_size*j], vector_size, scale,
388 &vectors[vector_size*i]);
389
390 // Norm of the i-th vector
392 cublas_handle, &vectors[i*vector_size], vector_size);
393
394 // If the norm is too small, regenerate the i-th vector randomly
395 if (norm_i < \
397 epsilon,
399 std::sqrt(vector_size))
400 )
401 )
402 {
403 // Allocate buffer
404 if (buffer == NULL)
405 {
406 buffer = new DataType[vector_size];
407 }
408
409 // Regenerate new random vector on buffer
411 random_number_generator, buffer,
412 vector_size, num_threads);
413
414 // Copy buffer to the i-th vector on device
416 buffer, vector_size, &vectors[i*vector_size]);
417
418 // Repeat the re-orthogonalization for i-th vector against
419 // all previous vectors again.
420 success = 0;
421 ++num_trials;
422 break;
423 }
424 }
425
426 if (success == 1)
427 {
428 ++i;
429
430 // Reset if num_trials was incremented before.
431 num_trials = 0;
432 }
433 }
434
435 // Deallocate buffer
436 if (buffer != NULL)
437 {
438 delete[] buffer;
439 buffer = NULL;
440 }
441}
442
443
444// ===============================
445// Explicit template instantiation
446// ===============================
447
448#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
450#endif
451
452#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
454#endif
455
456#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
457 template class cuOrthogonalization<__half>;
458#endif
459
460#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
462#endif
463
464#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
465 template class cuOrthogonalization<float>;
466#endif
467
468#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
469 template class cuOrthogonalization<double>;
470#endif
static void copy_to_device(const ArrayType *host_array, const size_t array_size, ArrayType *device_array)
Copies memory on host to device memory.
Definition cuda_api.cu:145
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
Generates 64-bit integers on multiple parallel threads.
A static class for orthogonalization of vector bases. This class acts as a templated namespace,...
static void gram_schmidt_process(cublasHandle_t cublas_handle, const DataType *V, const LongIndexType vector_size, const IndexType num_vectors, const IndexType last_vector, const FlagType num_ortho, DataType *r)
Modified Gram-Schmidt orthogonalization process to orthogonalize the vector v against a subset of the...
static void orthogonalize_vectors(cublasHandle_t cublas_handle, DataType *vectors, const LongIndexType vector_size, const IndexType num_vectors)
Orthogonalizes set of vectors mutually using modified Gram-Schmidt process.
static void subtract_scaled_vector(cublasHandle_t cublas_handle, const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Subtracts the scaled input vector from the output vector.
static DataType inner_product(cublasHandle_t cublas_handle, const DataType *RESTRICT vector1, const DataType *RESTRICT vector2, const LongIndexType vector_size)
Computes Euclidean inner product of two vectors.
static DataType euclidean_norm(cublasHandle_t cublas_handle, const DataType *RESTRICT vector, const LongIndexType vector_size)
Computes the Euclidean 2-norm of a 1D array.
__host__ __device__ DataType mul(const DataType x, const DataType y)
Multiply two floating point numbers in round-to-nearest-even mode.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
__host__ __device__ DataType sub(const DataType x, const DataType y)
Subtract two floating point numbers in round-to-nearest-even mode.
__host__ __device__ DataType div(const DataType x, const DataType y)
Divide two floating point numbers in round-to-nearest-even mode.
int LongIndexType
Definition types.h:60
int FlagType
Definition types.h:68
int IndexType
Definition types.h:65