imate
C++/CUDA Reference
Loading...
Searching...
No Matches
cu_trace_estimator.cu
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
17#include "../_definitions/definitions.h" // USE_OPENMP
18#include "../_cu_definitions/cu_types.h" // __nv_fp8_e5m2, __nv_fp8_e4m3,
19 // __half, __nv_bfloat16
20
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
22 #include <omp.h> // omp_set_num_threads
23#endif
24
25#include <cmath> // std::sqrt, std::pow
26#include <cstddef> // NULL
27#include "./cu_lanczos_tridiagonalization.h" // cu_lanczos_tridiagonalization
28#include "./cu_golub_kahn_bidiagonalization.h" // cu_golub_kahn_bidiagonali...
29#include "../_random_generator/random_array_generator.h" // RandomArrayGene...
30#include "../_c_trace_estimator/diagonalization.h" // Diagonalization
31#include "../_c_trace_estimator/convergence_tools.h" // check_convergence, ...
32#include "../_cuda_utilities/cuda_timer.h" // CudaTimer
33#include "../_cuda_utilities/cuda_api.h" // CudaAPI
34#include "../_cu_arithmetics/cu_arithmetics.h" // cu_arithmetics
35
36
37// ==================
38// cu trace estimator
39// ==================
40
203
204template <typename DataType>
207 DataType* parameters,
208 const IndexType num_inquiries,
209 const Function* matrix_function,
210 const FlagType gram,
211 const DataType exponent,
212 const FlagType orthogonalize,
213 const int64_t seed,
214 const IndexType lanczos_degree,
215 const DataType lanczos_tol,
216 const IndexType min_num_samples,
217 const IndexType max_num_samples,
218 const DataType error_atol,
219 const DataType error_rtol,
220 const DataType confidence_level,
221 const DataType outlier_significance_level,
222 const IndexType num_threads,
223 const IndexType num_gpu_devices,
224 DataType* trace,
225 DataType* error,
226 DataType** samples,
227 IndexType* processed_samples_indices,
228 IndexType* num_samples_used,
229 IndexType* num_outliers,
230 FlagType* converged,
231 double& alg_wall_time)
232{
233 // Matrix size
234 IndexType matrix_size = A->get_num_rows();
235
236 // Set the number of threads
237 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
238 omp_set_num_threads(num_gpu_devices);
239 #else
240 num_threads = 1;
241 #endif
242
243 // Allocate 1D array of random vectors We only allocate a random vector
244 // per parallel thread. Thus, the total size of the random vectors is
245 // matrix_size*num_threads. On each iteration in parallel threads, the
246 // allocated memory is reused. That is, in each iteration, a new random
247 // vector is generated for that specific thread id.
248 IndexType random_vectors_size = matrix_size * num_gpu_devices;
249 DataType* random_vectors = new DataType[random_vectors_size];
250
251 // Initialize random number generator to generate in parallel threads
252 // independently.
253 RandomNumberGenerator random_number_generator(num_gpu_devices, seed);
254
255 // The counter of filled size of processed_samples_indices array
256 // This scalar variable is defined as array to be shared among al threads
257 IndexType num_processed_samples = 0;
258
259 // Criterion for early termination of iterations if convergence reached
260 // This scalar variable is defined as array to be shared among al threads
261 FlagType all_converged = 0;
262
263 // Using square-root of max possible chunk size for parallel schedules
264 unsigned int chunk_size = static_cast<int>(
265 std::sqrt(static_cast<double>(max_num_samples) / \
266 static_cast<double>(num_gpu_devices)));
267 if (chunk_size < 1)
268 {
269 chunk_size = 1;
270 }
271
272 // Timing elapsed time of algorithm
273 CudaTimer cuda_timer;
274 cuda_timer.start();
275
276 // Shared-memory parallelism over Monte-Carlo ensemble sampling
277 IndexType i;
278 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
279 #pragma omp parallel for \
280 schedule(dynamic, chunk_size)
281 #endif
282 for (i=0; i < max_num_samples; ++i)
283 {
284 if (!static_cast<bool>(all_converged))
285 {
286 // Switch to a device with the same device id as the cpu thread id
287 unsigned int thread_id = omp_get_thread_num();
289
290 // Perform one Monte-Carlo sampling to estimate trace
292 A, parameters, num_inquiries, matrix_function, gram,
293 exponent, orthogonalize, lanczos_degree, lanczos_tol,
294 random_number_generator,
295 &random_vectors[matrix_size*thread_id], converged,
296 samples[i]);
297
298 // Critical section
299 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
300 #pragma omp critical
301 #endif
302 {
303 // Store the index of processed samples
304 processed_samples_indices[num_processed_samples] = i;
305 ++num_processed_samples;
306
307 // Check whether convergence criterion has been met to stop.
308 // This check can also be done after another parallel thread
309 // set all_converged to "1", but we continue to update error.
311 samples, min_num_samples, num_inquiries,
312 processed_samples_indices, num_processed_samples,
313 confidence_level, error_atol, error_rtol, error,
314 num_samples_used, converged);
315 }
316 }
317 }
318
319 // Elapsed wall time of the algorithm (computation only, not array i/o)
320 cuda_timer.stop();
321 alg_wall_time = cuda_timer.elapsed();
322
323 // Remove outliers from trace estimates and average trace estimates
325 confidence_level, outlier_significance_level, num_inquiries,
326 max_num_samples, num_samples_used, processed_samples_indices,
327 samples, num_outliers, trace, error);
328
329 // Deallocate memory
330 delete[] random_vectors;
331
332 return all_converged;
333}
334
335
336// ================================
337// cu stochastic lanczos quadrature
338// ================================
339
435
436template <typename DataType>
439 DataType* parameters,
440 const IndexType num_inquiries,
441 const Function* matrix_function,
442 const FlagType gram,
443 const DataType exponent,
444 const FlagType orthogonalize,
445 const IndexType lanczos_degree,
446 const DataType lanczos_tol,
447 RandomNumberGenerator& random_number_generator,
448 DataType* random_vector,
449 FlagType* converged,
450 DataType* trace_estimate)
451{
452 // Matrix size
453 IndexType matrix_size = A->get_num_rows();
454
455 // Fill random vectors with Rademacher distribution (+1, -1), normalized
456 // but not orthogonalized. Setting num_threads to zero indicates to not
457 // create any new threads in RandomNumbrGenerator since the current
458 // function is inside a parallel thread.
459 IndexType num_threads = 0;
461 random_number_generator, random_vector, matrix_size, num_threads);
462
463 // Allocate diagonals (alpha) and supdiagonals (beta) of Lanczos matrix
464 DataType* alpha = new DataType[lanczos_degree];
465 DataType* beta = new DataType[lanczos_degree];
466
467 // Define 2D arrays needed to decomposition. All these arrays are
468 // defined as 1D array with Fortran ordering
469 DataType* eigenvectors = NULL;
470 DataType* left_singularvectors = NULL;
471 DataType* right_singularvectors_transposed = NULL;
472
473 // Actual number of inquiries
474 IndexType required_num_inquiries = num_inquiries;
476 {
477 // When a relation between eigenvalues and the parameters of the linear
478 // operator is known, to compute eigenvalues of for each inquiry, only
479 // computing one inquiry is enough. This is because an eigenvalue for
480 // one parameter setting is enough to compute eigenvalue of another set
481 // of parameters.
482 required_num_inquiries = 1;
483 }
484
485 // Allocate and initialize theta
486 IndexType i;
487 IndexType j;
488 DataType** theta = new DataType*[num_inquiries];
489 for (j=0; j < num_inquiries; ++j)
490 {
491 theta[j] = new DataType[lanczos_degree];
492
493 // Initialize components to zero
494 for (i=0; i < lanczos_degree; ++i)
495 {
496 theta[j][i] = 0.0;
497 }
498 }
499
500 // Allocate and initialize tau
501 DataType** tau = new DataType*[num_inquiries];
502 for (j=0; j < num_inquiries; ++j)
503 {
504 tau[j] = new DataType[lanczos_degree];
505
506 // Initialize components to zero
507 for (i=0; i < lanczos_degree; ++i)
508 {
509 tau[j][i] = 0.0;
510 }
511 }
512
513 // Allocate lanczos size for each inquiry. This variable keeps the non-zero
514 // size of the tri-diagonal (or bi-diagonal) matrix. Ideally, this matrix
515 // is of the size lanczos_degree. But, due to the early termination, this
516 // size might be smaller.
517 IndexType* lanczos_size = new IndexType[num_inquiries];
518
519 // Number of parameters of linear operator A
520 IndexType num_parameters = A->get_num_parameters();
521
522 // Lanczos iterations, computes theta and tau for each inquiry parameter
523 for (j=0; j < required_num_inquiries; ++j)
524 {
525 // If trace is already converged, do not compute on the new sample.
526 // However, exclude the case where required_num_inquiries is not the
527 // same as num_inquiries, since in this case, we compute one inquiry
528 // for multiple parameters.
529 if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
530 {
531 continue;
532 }
533
534 // Set parameter of linear operator A
535 A->set_parameters(&parameters[j*num_parameters]);
536
537 if (gram)
538 {
539 // Use Golub-Kahn-Lanczos Bi-diagonalization
540 lanczos_size[j] = cu_golub_kahn_bidiagonalization(
541 A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
542 orthogonalize, alpha, beta);
543
544 // Allocate matrix of singular vectors (1D array, Fortran ordering)
545 left_singularvectors = \
546 new DataType[lanczos_size[j] * lanczos_size[j]];
547 right_singularvectors_transposed = \
548 new DataType[lanczos_size[j] * lanczos_size[j]];
549
550 // Note: alpha is written in-place with singular values
552 alpha, beta, left_singularvectors,
553 right_singularvectors_transposed, lanczos_size[j]);
554
555 // theta and tau from singular values and vectors
556 for (i=0; i < lanczos_size[j]; ++i)
557 {
558 theta[j][i] = cu_arithmetics::mul(alpha[i], alpha[i]);
559 tau[j][i] = right_singularvectors_transposed[i];
560 }
561 }
562 else
563 {
564 // Use Lanczos Tri-diagonalization
565 lanczos_size[j] = cu_lanczos_tridiagonalization(
566 A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
567 orthogonalize, alpha, beta);
568
569 // Allocate eigenvectors matrix (1D array with Fortran ordering)
570 eigenvectors = new DataType[lanczos_size[j] * lanczos_size[j]];
571
572 // Note: alpha is written in-place with eigenvalues
574 alpha, beta, eigenvectors, lanczos_size[j]);
575
576 // theta and tau from singular values and vectors
577 for (i=0; i < lanczos_size[j]; ++i)
578 {
579 theta[j][i] = alpha[i];
580 tau[j][i] = eigenvectors[i * lanczos_size[j]];
581 }
582 }
583 }
584
585 // If an eigenvalue relation is known, compute the rest of eigenvalues
586 // using the eigenvalue relation given in the operator A for its
587 // eigenvalues. If no eigenvalue relation is not known, the rest of
588 // eigenvalues were already computed in the above loop and no other
589 // computation is needed.
590 if (A->is_eigenvalue_relation_known() && num_inquiries > 1)
591 {
592 // When the code execution reaches this function, at least one of the
593 // inquiries is not converged, but some others might have been
594 // converged already. Here, we force-update those that are even
595 // converged already by setting converged to false. The extra update is
596 // free of charge when a relation for the eigenvalues are known.
597 for (j=0; j < num_inquiries; ++j)
598 {
599 converged[j] = 0;
600 }
601
602 // Compute theta and tau for the rest of inquiry parameters
603 for (j=1; j < num_inquiries; ++j)
604 {
605 // Only j=0 was iterated before. Set the same size for other j-s
606 lanczos_size[j] = lanczos_size[0];
607
608 for (i=0; i < lanczos_size[j]; ++i)
609 {
610 // Shift eigenvalues by the old and new parameters
611 theta[j][i] = A->get_eigenvalue(
612 &parameters[0],
613 theta[0][i],
614 &parameters[j*num_parameters]);
615
616 // tau is the same (at least for the affine operator)
617 tau[j][i] = tau[0][i];
618 }
619 }
620 }
621
622 // Estimate trace using quadrature method
623 DataType quadrature_sum;
624 for (j=0; j < num_inquiries; ++j)
625 {
626 // If the j-th inquiry is already converged, skip.
627 if (converged[j] == 1)
628 {
629 continue;
630 }
631
632 // Initialize sum for the integral of quadrature
633 quadrature_sum = 0.0;
634
635 // Important: This loop should iterate till lanczos_size[j], but not
636 // lanczos_degree. Otherwise the computation is wrong for certain
637 // matrices, such as if the input matrix is identity, or rank
638 // deficient. By using lanczos_size[j] instead of lanczos_degree, all
639 // issues with special matrices will resolve.
640 for (i=0; i < lanczos_size[j]; ++i)
641 {
642 quadrature_sum += cu_arithmetics::mul(
643 tau[j][i],
644 tau[j][i],
646 matrix_function->function(std::pow(
649 )
650 )
651 )
652 );
653 }
654
655 trace_estimate[j] = \
656 cu_arithmetics::mul(
657 quadrature_sum,
659 static_cast<unsigned long long int>(matrix_size)
660 )
661 );
662 }
663
664 // Release dynamic memory
665 delete[] alpha;
666 delete[] beta;
667 delete[] lanczos_size;
668
669 for (j=0; j < required_num_inquiries; ++j)
670 {
671 delete[] theta[j];
672 }
673 delete[] theta;
674
675 for (j=0; j < required_num_inquiries; ++j)
676 {
677 delete[] tau[j];
678 }
679 delete[] tau;
680
681 if (eigenvectors != NULL)
682 {
683 delete[] eigenvectors;
684 }
685
686 if (left_singularvectors != NULL)
687 {
688 delete[] left_singularvectors;
689 }
690
691 if (right_singularvectors_transposed != NULL)
692 {
693 delete[] right_singularvectors_transposed;
694 }
695}
696
697
698// ===============================
699// Explicit template instantiation
700// ===============================
701
702#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
703 template class cuTraceEstimator<__nv_fp8_e5m2>;
704#endif
705
706#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
707 template class cuTraceEstimator<__nv_fp8_e4m3>;
708#endif
709
710#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
711 template class cuTraceEstimator<__half>;
712#endif
713
714#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
715 template class cuTraceEstimator<__nv_bfloat16>;
716#endif
717
718#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
719 template class cuTraceEstimator<float>;
720#endif
721
722#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
723 template class cuTraceEstimator<double>;
724#endif
static FlagType check_convergence(DataType **samples, const IndexType min_num_samples, const IndexType num_inquiries, const IndexType *processed_samples_indices, const IndexType num_processed_samples, const DataType confidence_level, const DataType error_atol, const DataType error_rtol, DataType *error, IndexType *num_samples_used, FlagType *converged)
Checks if the standard deviation of the set of the cumulative averages of trace estimators converged ...
static void average_estimates(const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_inquiries, const IndexType max_num_samples, const IndexType *num_samples_used, const IndexType *processed_samples_indices, DataType **samples, IndexType *num_outliers, DataType *trace, DataType *error)
Averages the estimates of trace. Removes outliers and reevaluates the error to take into account for ...
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Definition cuda_api.cu:191
Records elasped time between two CUDA events.
Definition cuda_timer.h:62
void stop()
Stops the timer.
Definition cuda_timer.cu:67
void start()
Starts the timer.
Definition cuda_timer.cu:54
float elapsed() const
Returns the elapsed time in seconds.
Definition cuda_timer.cu:80
static int eigh_tridiagonal(DataType *diagonals, DataType *subdiagonals, DataType *eigenvectors, IndexType matrix_size)
Computes all eigenvalues and eigenvectors of a real and symmetric tridiagonal matrix.
static int svd_bidiagonal(DataType *diagonals, DataType *subdiagonals, DataType *U, DataType *Vt, IndexType matrix_size)
Computes all singular-values and left and right eigenvectors of a real and symmetric upper bi-diagona...
Defines the function .
Definition functions.h:38
virtual float function(const float lambda_) const =0
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
Generates 64-bit integers on multiple parallel threads.
IndexType get_num_parameters() const
Returns the number of parameters of the linear operator.
LongIndexType get_num_rows() const
Returns the number of rows of the matrix.
FlagType is_eigenvalue_relation_known() const
Returns a flag that determines whether a relation between the parameters of the operator and its eige...
Base class for linear operators. This class serves as interface for all derived classes.
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
virtual DataType get_eigenvalue(const DataType *known_parameters, const DataType known_eigenvalue, const DataType *inquiry_parameters) const =0
A static class to compute the trace of implicit matrix functions using stochastic Lanczos quadrature ...
static void _cu_stochastic_lanczos_quadrature(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const IndexType lanczos_degree, const DataType lanczos_tol, RandomNumberGenerator &random_number_generator, DataType *random_vector, FlagType *converged, DataType *trace_estimate)
For a given random input vector, computes one Monte-Carlo sample to estimate trace using Lanczos quad...
static FlagType cu_trace_estimator(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const int64_t seed, const IndexType lanczos_degree, const DataType lanczos_tol, const IndexType min_num_samples, const IndexType max_num_samples, const DataType error_atol, const DataType error_rtol, const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_threads, const IndexType num_gpu_devices, DataType *trace, DataType *error, DataType **samples, IndexType *processed_samples_indices, IndexType *num_samples_used, IndexType *num_outliers, FlagType *converged, double &alg_wall_time)
Stochastic Lanczos quadrature method to estimate trace of a function of a linear operator....
void omp_set_num_threads(int num_threads)
int omp_get_thread_num()
IndexType cu_golub_kahn_bidiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...
__host__ __device__ DataType mul(const DataType x, const DataType y)
Multiply two floating point numbers in round-to-nearest-even mode.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.
int FlagType
Definition types.h:68
int IndexType
Definition types.h:65