17#include "../_definitions/definitions.h"
18#include "../_cu_definitions/cu_types.h"
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
29#include "../_random_generator/random_array_generator.h"
30#include "../_c_trace_estimator/diagonalization.h"
31#include "../_c_trace_estimator/convergence_tools.h"
32#include "../_cuda_utilities/cuda_timer.h"
33#include "../_cuda_utilities/cuda_api.h"
34#include "../_cu_arithmetics/cu_arithmetics.h"
204template <
typename DataType>
207 DataType* parameters,
211 const DataType exponent,
215 const DataType lanczos_tol,
218 const DataType error_atol,
219 const DataType error_rtol,
220 const DataType confidence_level,
221 const DataType outlier_significance_level,
231 double& alg_wall_time)
237 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
248 IndexType random_vectors_size = matrix_size * num_gpu_devices;
249 DataType* random_vectors =
new DataType[random_vectors_size];
264 unsigned int chunk_size =
static_cast<int>(
265 std::sqrt(
static_cast<double>(max_num_samples) / \
266 static_cast<double>(num_gpu_devices)));
278 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
279 #pragma omp parallel for \
280 schedule(dynamic, chunk_size)
282 for (i=0; i < max_num_samples; ++i)
284 if (!
static_cast<bool>(all_converged))
292 A, parameters, num_inquiries, matrix_function, gram,
293 exponent, orthogonalize, lanczos_degree, lanczos_tol,
294 random_number_generator,
295 &random_vectors[matrix_size*thread_id], converged,
299 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
304 processed_samples_indices[num_processed_samples] = i;
305 ++num_processed_samples;
311 samples, min_num_samples, num_inquiries,
312 processed_samples_indices, num_processed_samples,
313 confidence_level, error_atol, error_rtol, error,
314 num_samples_used, converged);
321 alg_wall_time = cuda_timer.
elapsed();
325 confidence_level, outlier_significance_level, num_inquiries,
326 max_num_samples, num_samples_used, processed_samples_indices,
327 samples, num_outliers, trace, error);
330 delete[] random_vectors;
332 return all_converged;
436template <
typename DataType>
439 DataType* parameters,
443 const DataType exponent,
446 const DataType lanczos_tol,
448 DataType* random_vector,
450 DataType* trace_estimate)
461 random_number_generator, random_vector, matrix_size, num_threads);
464 DataType* alpha =
new DataType[lanczos_degree];
465 DataType* beta =
new DataType[lanczos_degree];
469 DataType* eigenvectors = NULL;
470 DataType* left_singularvectors = NULL;
471 DataType* right_singularvectors_transposed = NULL;
474 IndexType required_num_inquiries = num_inquiries;
482 required_num_inquiries = 1;
488 DataType** theta =
new DataType*[num_inquiries];
489 for (j=0; j < num_inquiries; ++j)
491 theta[j] =
new DataType[lanczos_degree];
494 for (i=0; i < lanczos_degree; ++i)
501 DataType** tau =
new DataType*[num_inquiries];
502 for (j=0; j < num_inquiries; ++j)
504 tau[j] =
new DataType[lanczos_degree];
507 for (i=0; i < lanczos_degree; ++i)
523 for (j=0; j < required_num_inquiries; ++j)
529 if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
541 A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
542 orthogonalize, alpha, beta);
545 left_singularvectors = \
546 new DataType[lanczos_size[j] * lanczos_size[j]];
547 right_singularvectors_transposed = \
548 new DataType[lanczos_size[j] * lanczos_size[j]];
552 alpha, beta, left_singularvectors,
553 right_singularvectors_transposed, lanczos_size[j]);
556 for (i=0; i < lanczos_size[j]; ++i)
559 tau[j][i] = right_singularvectors_transposed[i];
566 A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
567 orthogonalize, alpha, beta);
570 eigenvectors =
new DataType[lanczos_size[j] * lanczos_size[j]];
574 alpha, beta, eigenvectors, lanczos_size[j]);
577 for (i=0; i < lanczos_size[j]; ++i)
579 theta[j][i] = alpha[i];
580 tau[j][i] = eigenvectors[i * lanczos_size[j]];
597 for (j=0; j < num_inquiries; ++j)
603 for (j=1; j < num_inquiries; ++j)
606 lanczos_size[j] = lanczos_size[0];
608 for (i=0; i < lanczos_size[j]; ++i)
614 ¶meters[j*num_parameters]);
617 tau[j][i] = tau[0][i];
623 DataType quadrature_sum;
624 for (j=0; j < num_inquiries; ++j)
627 if (converged[j] == 1)
633 quadrature_sum = 0.0;
640 for (i=0; i < lanczos_size[j]; ++i)
655 trace_estimate[j] = \
659 static_cast<unsigned long long int>(matrix_size)
667 delete[] lanczos_size;
669 for (j=0; j < required_num_inquiries; ++j)
675 for (j=0; j < required_num_inquiries; ++j)
681 if (eigenvectors != NULL)
683 delete[] eigenvectors;
686 if (left_singularvectors != NULL)
688 delete[] left_singularvectors;
691 if (right_singularvectors_transposed != NULL)
693 delete[] right_singularvectors_transposed;
702#if defined(USE_CUDA_FP8_E5M2) && (USE_CUDA_FP8_E5M2 == 1)
706#if defined(USE_CUDA_FP8_E4M3) && (USE_CUDA_FP8_E4M3 == 1)
710#if defined(USE_CUDA_FP16) && (USE_CUDA_FP16 == 1)
714#if defined(USE_CUDA_BF16) && (USE_CUDA_BF16 == 1)
718#if defined(USE_CUDA_FP32) && (USE_CUDA_FP32 == 1)
722#if defined(USE_CUDA_FP64) && (USE_CUDA_FP64 == 1)
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Records elasped time between two CUDA events.
void stop()
Stops the timer.
void start()
Starts the timer.
float elapsed() const
Returns the elapsed time in seconds.
static int eigh_tridiagonal(DataType *diagonals, DataType *subdiagonals, DataType *eigenvectors, IndexType matrix_size)
Computes all eigenvalues and eigenvectors of a real and symmetric tridiagonal matrix.
static int svd_bidiagonal(DataType *diagonals, DataType *subdiagonals, DataType *U, DataType *Vt, IndexType matrix_size)
Computes all singular-values and left and right eigenvectors of a real and symmetric upper bi-diagona...
virtual float function(const float lambda_) const =0
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
Generates 64-bit integers on multiple parallel threads.
IndexType get_num_parameters() const
Returns the number of parameters of the linear operator.
LongIndexType get_num_rows() const
Returns the number of rows of the matrix.
FlagType is_eigenvalue_relation_known() const
Returns a flag that determines whether a relation between the parameters of the operator and its eige...
Base class for linear operators. This class serves as interface for all derived classes.
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
virtual DataType get_eigenvalue(const DataType *known_parameters, const DataType known_eigenvalue, const DataType *inquiry_parameters) const =0
A static class to compute the trace of implicit matrix functions using stochastic Lanczos quadrature ...
static void _cu_stochastic_lanczos_quadrature(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const IndexType lanczos_degree, const DataType lanczos_tol, RandomNumberGenerator &random_number_generator, DataType *random_vector, FlagType *converged, DataType *trace_estimate)
For a given random input vector, computes one Monte-Carlo sample to estimate trace using Lanczos quad...
static FlagType cu_trace_estimator(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const int64_t seed, const IndexType lanczos_degree, const DataType lanczos_tol, const IndexType min_num_samples, const IndexType max_num_samples, const DataType error_atol, const DataType error_rtol, const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_threads, const IndexType num_gpu_devices, DataType *trace, DataType *error, DataType **samples, IndexType *processed_samples_indices, IndexType *num_samples_used, IndexType *num_outliers, FlagType *converged, double &alg_wall_time)
Stochastic Lanczos quadrature method to estimate trace of a function of a linear operator....
void omp_set_num_threads(int num_threads)
IndexType cu_golub_kahn_bidiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...
__host__ __device__ DataType mul(const DataType x, const DataType y)
Multiply two floating point numbers in round-to-nearest-even mode.
__host__ __device__ DataType abs(const DataType x)
Absolute value of a floating point number.