doxygen/html/cu__trace__estimator_8cu_source.html

 /*

  *  SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>

  *  SPDX-License-Identifier: BSD-3-Clause

  *  SPDX-FileType: SOURCE

  *

  *  This program is free software: you can redistribute it and/or modify it

  *  under the terms of the license found in the LICENSE.txt file in the root

  *  directory of this source tree.

  */


 // =======

 // Headers

 // =======


 #include "./cu_trace_estimator.h"

 #include <omp.h>  // omp_set_num_threads

 #include <cmath>  // sqrt, pow

 #include <cstddef>  // NULL

 #include "./cu_lanczos_tridiagonalization.h"  // cu_lanczos_tridiagonalization

 #include "./cu_golub_kahn_bidiagonalization.h"  // cu_golub_kahn_bidiagonali...

 #include "../_random_generator/random_array_generator.h"  // RandomArrayGene...

 #include "../_c_trace_estimator/diagonalization.h"  // Diagonalization

 #include "../_c_trace_estimator/convergence_tools.h"  // check_convergence, ...

 #include "../_cuda_utilities/cuda_timer.h"  // CudaTimer

 #include "../_cuda_utilities/cuda_interface.h"  // CudaInterface


 // ==================

 // cu trace estimator

 // ==================


 // Estimates the trace of a function of the linear operator A by Monte-Carlo
 // sampling, where each sample is produced by
 // _cu_stochastic_lanczos_quadrature. Samples are processed in an OpenMP
 // parallel loop with one CPU thread per GPU device.
 //
 // Inputs forwarded unchanged to _cu_stochastic_lanczos_quadrature:
 //     A, parameters, num_inquiries, matrix_function, gram, exponent,
 //     orthogonalize, lanczos_degree, lanczos_tol.
 // Inputs forwarded unchanged to ConvergenceTools:
 //     min_num_samples, error_atol, error_rtol, confidence_level,
 //     outlier_significance_level.
 // seed:             passed to the RandomNumberGenerator constructor.
 // max_num_samples:  upper bound of the sampling loop.
 // num_threads:      not used by this function; the thread count is taken
 //                   from num_gpu_devices.
 // num_gpu_devices:  number of OpenMP threads. Each thread selects the device
 //                   whose id equals its OpenMP thread id.
 //
 // Outputs:
 //     samples[i]                 written by the i-th Monte-Carlo sample.
 //     processed_samples_indices  indices i in the order they were completed.
 //     error, num_samples_used, converged
 //                                updated by check_convergence after each
 //                                sample; error is also passed to
 //                                average_estimates.
 //     trace, num_outliers        filled by average_estimates.
 //     alg_wall_time              elapsed time reported by CudaTimer around
 //                                the sampling loop only.
 //
 // Returns the last value assigned to all_converged by check_convergence
 // (zero if no sample was processed).
 //
 // NOTE(review): all_converged and the converged array are read by threads
 // outside the critical section while another thread may update them inside
 // it. This looks like a deliberate best-effort early exit; confirm.
 template <typename DataType>
 FlagType cuTraceEstimator<DataType>::cu_trace_estimator(
         cuLinearOperator<DataType>* A,
         DataType* parameters,
         const IndexType num_inquiries,
         const Function* matrix_function,
         const FlagType gram,
         const DataType exponent,
         const FlagType orthogonalize,
         const int64_t seed,
         const IndexType lanczos_degree,
         const DataType lanczos_tol,
         const IndexType min_num_samples,
         const IndexType max_num_samples,
         const DataType error_atol,
         const DataType error_rtol,
         const DataType confidence_level,
         const DataType outlier_significance_level,
         const IndexType num_threads,
         const IndexType num_gpu_devices,
         DataType* trace,
         DataType* error,
         DataType** samples,
         IndexType* processed_samples_indices,
         IndexType* num_samples_used,
         IndexType* num_outliers,
         FlagType* converged,
         float& alg_wall_time)
 {
     // Matrix size
     IndexType matrix_size = A->get_num_rows();

     // Use one CPU thread per GPU device
     omp_set_num_threads(num_gpu_devices);

     // Allocate a 1D array holding one random vector per parallel thread, so
     // its total length is matrix_size*num_gpu_devices. Each thread reuses its
     // own slice: on every iteration a new random vector is generated into
     // the slice that belongs to that thread id.
     //
     // The length and the per-thread offset are computed in std::size_t.
     // IndexType is a 32-bit int, and the product of the matrix size and the
     // number of devices can exceed its range for large matrices.
     const std::size_t vector_stride = static_cast<std::size_t>(matrix_size);
     const std::size_t random_vectors_size =
         vector_stride * static_cast<std::size_t>(num_gpu_devices);
     DataType* random_vectors = new DataType[random_vectors_size];

     // Initialize random number generator to generate in parallel threads
     // independently.
     RandomNumberGenerator random_number_generator(num_gpu_devices, seed);

     // Number of filled entries of processed_samples_indices. Shared among
     // all threads and only modified inside the critical section.
     IndexType num_processed_samples = 0;

     // Criterion for early termination of iterations once convergence is
     // reached. Shared among all threads.
     FlagType all_converged = 0;

     // Using square-root of max possible chunk size for parallel schedules
     unsigned int chunk_size = static_cast<int>(
             sqrt(static_cast<DataType>(max_num_samples) / num_gpu_devices));
     if (chunk_size < 1)
     {
         chunk_size = 1;
     }

     // Timing elapsed time of algorithm
     CudaTimer cuda_timer;
     cuda_timer.start();

     // Shared-memory parallelism over Monte-Carlo ensemble sampling
     IndexType i;
     #pragma omp parallel for schedule(dynamic, chunk_size)
     for (i=0; i < max_num_samples; ++i)
     {
         if (!static_cast<bool>(all_converged))
         {
             // Switch to a device with the same device id as the cpu thread id
             unsigned int thread_id = omp_get_thread_num();
             CudaInterface<DataType>::set_device(thread_id);

             // Perform one Monte-Carlo sampling to estimate trace. The random
             // vector slice of this thread starts at vector_stride*thread_id.
             cuTraceEstimator<DataType>::_cu_stochastic_lanczos_quadrature(
                     A, parameters, num_inquiries, matrix_function, gram,
                     exponent, orthogonalize, lanczos_degree, lanczos_tol,
                     random_number_generator,
                     &random_vectors[vector_stride * thread_id], converged,
                     samples[i]);

             // Critical section
             #pragma omp critical
             {
                 // Store the index of processed samples
                 processed_samples_indices[num_processed_samples] = i;
                 ++num_processed_samples;

                 // Check whether convergence criterion has been met to stop.
                 // This check can also be done after another parallel thread
                 // set all_converged to "1", but we continue to update error.
                 all_converged = ConvergenceTools<DataType>::check_convergence(
                         samples, min_num_samples, num_inquiries,
                         processed_samples_indices, num_processed_samples,
                         confidence_level, error_atol, error_rtol, error,
                         num_samples_used, converged);
             }
         }
     }

     // Elapsed wall time of the algorithm (computation only, not array i/o)
     cuda_timer.stop();
     alg_wall_time = cuda_timer.elapsed();

     // Remove outliers from trace estimates and average trace estimates
     ConvergenceTools<DataType>::average_estimates(
             confidence_level, outlier_significance_level, num_inquiries,
             max_num_samples, num_samples_used, processed_samples_indices,
             samples, num_outliers, trace, error);

     // Deallocate memory
     delete[] random_vectors;

     return all_converged;
 }


 // ================================

 // cu stochastic lanczos quadrature

 // ================================


 // Computes one Monte-Carlo sample of the trace estimate for each inquiry.
 //
 // A new random vector is generated into random_vector. For every inquiry
 // that has to be iterated, the operator parameters are set, A is reduced
 // with Golub-Kahn bi-diagonalization (gram != 0) or Lanczos
 // tri-diagonalization (gram == 0), and the small matrix is diagonalized.
 // The resulting nodes (theta) and weights (tau) are combined into
 //
 //     trace_estimate[j] = matrix_size * sum_i tau[j][i]^2 *
 //                         matrix_function->function(theta[j][i]^exponent)
 //
 // A                        linear operator; its parameters are changed via
 //                          set_parameters.
 // parameters               num_inquiries consecutive groups of
 //                          A->get_num_parameters() values each.
 // num_inquiries            number of parameter groups / output entries.
 // matrix_function          function applied to theta^exponent.
 // gram                     selects bi-diagonalization when non-zero.
 // exponent                 power applied to theta before matrix_function.
 // orthogonalize, lanczos_degree, lanczos_tol
 //                          forwarded to the (bi/tri)-diagonalization.
 // random_number_generator  source of randomness for the random vector.
 // random_vector            work array of at least matrix_size entries,
 //                          overwritten by this call.
 // converged                per-inquiry flags. Inquiries flagged 1 are
 //                          skipped. When A reports a known eigenvalue
 //                          relation and num_inquiries > 1, all flags are
 //                          reset to 0 by this call.
 // trace_estimate           output, one entry per non-skipped inquiry.
 template <typename DataType>
 void cuTraceEstimator<DataType>::_cu_stochastic_lanczos_quadrature(
         cuLinearOperator<DataType>* A,
         DataType* parameters,
         const IndexType num_inquiries,
         const Function* matrix_function,
         const FlagType gram,
         const DataType exponent,
         const FlagType orthogonalize,
         const IndexType lanczos_degree,
         const DataType lanczos_tol,
         RandomNumberGenerator& random_number_generator,
         DataType* random_vector,
         FlagType* converged,
         DataType* trace_estimate)
 {
     // Matrix size
     IndexType matrix_size = A->get_num_rows();

     // Fill the random vector with Rademacher distribution (+1, -1). Setting
     // num_threads to zero indicates to not create any new threads in
     // RandomNumberGenerator since the current function is inside a parallel
     // thread.
     IndexType num_threads = 0;
     RandomArrayGenerator<DataType>::generate_random_array(
             random_number_generator, random_vector, matrix_size, num_threads);

     // Allocate diagonals (alpha) and off-diagonals (beta) of Lanczos matrix
     DataType* alpha = new DataType[lanczos_degree];
     DataType* beta = new DataType[lanczos_degree];

     // Actual number of inquiries that have to be iterated. When a relation
     // between the eigenvalues and the parameters of the linear operator is
     // known, iterating the first inquiry is enough, since the eigenvalues of
     // the other parameter settings follow from that relation. The condition
     // num_inquiries > 1 leaves the count unchanged for a single inquiry and
     // avoids touching zero-length arrays when there is no inquiry at all.
     IndexType required_num_inquiries = num_inquiries;
     if (A->is_eigenvalue_relation_known() && (num_inquiries > 1))
     {
         required_num_inquiries = 1;
     }

     IndexType i;
     IndexType j;

     // Allocate theta and tau. The trailing () value-initializes every
     // component to zero.
     DataType** theta = new DataType*[num_inquiries];
     DataType** tau = new DataType*[num_inquiries];
     for (j=0; j < num_inquiries; ++j)
     {
         theta[j] = new DataType[lanczos_degree]();
         tau[j] = new DataType[lanczos_degree]();
     }

     // Lanczos size for each inquiry. This variable keeps the non-zero size
     // of the tri-diagonal (or bi-diagonal) matrix. Ideally, this matrix is
     // of the size lanczos_degree. But, due to the early termination, this
     // size might be smaller. Entries start at zero so that an inquiry which
     // is skipped below never carries an indeterminate size.
     IndexType* lanczos_size = new IndexType[num_inquiries]();

     // Number of parameters of linear operator A
     IndexType num_parameters = A->get_num_parameters();

     // Lanczos iterations, computes theta and tau for each inquiry parameter
     for (j=0; j < required_num_inquiries; ++j)
     {
         // If trace is already converged, do not compute on the new sample.
         // However, exclude the case where required_num_inquiries is not the
         // same as num_inquiries, since in this case, we compute one inquiry
         // for multiple parameters.
         if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
         {
             continue;
         }

         // Set parameter of linear operator A
         A->set_parameters(&parameters[j*num_parameters]);

         if (gram)
         {
             // Use Golub-Kahn-Lanczos Bi-diagonalization
             lanczos_size[j] = cu_golub_kahn_bidiagonalization(
                     A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
                     orthogonalize, alpha, beta);
             const IndexType size = lanczos_size[j];

             // Allocate matrices of singular vectors (1D arrays with Fortran
             // ordering). They are local to this iteration and released
             // below, so nothing accumulates over several inquiries.
             DataType* left_singularvectors = new DataType[size * size];
             DataType* right_singularvectors_transposed = \
                 new DataType[size * size];

             // Note: alpha is written in-place with singular values
             Diagonalization<DataType>::svd_bidiagonal(
                     alpha, beta, left_singularvectors,
                     right_singularvectors_transposed, size);

             // theta and tau from singular values and vectors
             for (i=0; i < size; ++i)
             {
                 theta[j][i] = alpha[i] * alpha[i];
                 tau[j][i] = right_singularvectors_transposed[i];
             }

             delete[] left_singularvectors;
             delete[] right_singularvectors_transposed;
         }
         else
         {
             // Use Lanczos Tri-diagonalization
             lanczos_size[j] = cu_lanczos_tridiagonalization(
                     A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
                     orthogonalize, alpha, beta);
             const IndexType size = lanczos_size[j];

             // Allocate eigenvectors matrix (1D array with Fortran ordering).
             // It is local to this iteration and released below.
             DataType* eigenvectors = new DataType[size * size];

             // Note: alpha is written in-place with eigenvalues
             Diagonalization<DataType>::eigh_tridiagonal(
                     alpha, beta, eigenvectors, size);

             // theta and tau from eigenvalues and eigenvectors
             for (i=0; i < size; ++i)
             {
                 theta[j][i] = alpha[i];
                 tau[j][i] = eigenvectors[i * size];
             }

             delete[] eigenvectors;
         }
     }

     // If an eigenvalue relation is known, compute the rest of eigenvalues
     // using the eigenvalue relation given in the operator A for its
     // eigenvalues. If no eigenvalue relation is known, the rest of
     // eigenvalues were already computed in the above loop and no other
     // computation is needed.
     if (A->is_eigenvalue_relation_known() && num_inquiries > 1)
     {
         // When the code execution reaches this function, at least one of the
         // inquiries is not converged, but some others might have been
         // converged already. Here, we force-update those that are even
         // converged already by setting converged to false. The extra update
         // is free of charge when a relation for the eigenvalues is known.
         for (j=0; j < num_inquiries; ++j)
         {
             converged[j] = 0;
         }

         // Compute theta and tau for the rest of inquiry parameters
         for (j=1; j < num_inquiries; ++j)
         {
             // Only j=0 was iterated before. Set the same size for other j-s
             lanczos_size[j] = lanczos_size[0];

             for (i=0; i < lanczos_size[j]; ++i)
             {
                 // Shift eigenvalues by the old and new parameters
                 theta[j][i] = A->get_eigenvalue(
                         &parameters[0],
                         theta[0][i],
                         &parameters[j*num_parameters]);

                 // tau is the same (at least for the affine operator)
                 tau[j][i] = tau[0][i];
             }
         }
     }

     // Estimate trace using quadrature method
     DataType quadrature_sum;
     for (j=0; j < num_inquiries; ++j)
     {
         // If the j-th inquiry is already converged, skip.
         if (converged[j] == 1)
         {
             continue;
         }

         // Initialize sum for the integral of quadrature
         quadrature_sum = 0.0;

         // Important: This loop should iterate till lanczos_size[j], but not
         // lanczos_degree. Otherwise the computation is wrong for certain
         // matrices, such as if the input matrix is identity, or rank
         // deficient. By using lanczos_size[j] instead of lanczos_degree, all
         // issues with special matrices will resolve.
         for (i=0; i < lanczos_size[j]; ++i)
         {
             quadrature_sum += tau[j][i] * tau[j][i] * \
                     matrix_function->function(pow(theta[j][i], exponent));
         }

         trace_estimate[j] = matrix_size * quadrature_sum;
     }

     // Release dynamic memory
     delete[] alpha;
     delete[] beta;
     delete[] lanczos_size;

     // theta and tau own num_inquiries rows each (not only the
     // required_num_inquiries rows that were iterated), so all of them have
     // to be released.
     for (j=0; j < num_inquiries; ++j)
     {
         delete[] theta[j];
         delete[] tau[j];
     }
     delete[] theta;
     delete[] tau;
 }


 // ===============================

 // Explicit template instantiation

 // ===============================


 // Explicit instantiations of cuTraceEstimator for float and double types.
 template class cuTraceEstimator<float>;

 template class cuTraceEstimator<double>;

ConvergenceTools::check_convergence
static FlagType check_convergence(DataType **samples, const IndexType min_num_samples, const IndexType num_inquiries, const IndexType *processed_samples_indices, const IndexType num_processed_samples, const DataType confidence_level, const DataType error_atol, const DataType error_rtol, DataType *error, IndexType *num_samples_used, FlagType *converged)
Checks if the standard deviation of the set of the cumulative averages of trace estimators converged ...
Definition: convergence_tools.cpp:95

ConvergenceTools::average_estimates
static void average_estimates(const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_inquiries, const IndexType max_num_samples, const IndexType *num_samples_used, const IndexType *processed_samples_indices, DataType **samples, IndexType *num_outliers, DataType *trace, DataType *error)
Averages the estimates of trace. Removes outliers and reevaluates the error to take into account for ...
Definition: convergence_tools.cpp:256

CudaInterface::set_device
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
Definition: cuda_interface.cu:188

CudaTimer
Records elapsed time between two CUDA events.
Definition: cuda_timer.h:62

CudaTimer::stop
void stop()
Stops the timer.
Definition: cuda_timer.cu:67

CudaTimer::start
void start()
Starts the timer.
Definition: cuda_timer.cu:54

CudaTimer::elapsed
float elapsed() const
Returns the elapsed time in seconds.
Definition: cuda_timer.cu:80

Diagonalization::eigh_tridiagonal
static int eigh_tridiagonal(DataType *diagonals, DataType *subdiagonals, DataType *eigenvectors, IndexType matrix_size)
Computes all eigenvalues and eigenvectors of a real and symmetric tridiagonal matrix.
Definition: diagonalization.cpp:89

Diagonalization::svd_bidiagonal
static int svd_bidiagonal(DataType *diagonals, DataType *subdiagonals, DataType *U, DataType *Vt, IndexType matrix_size)
Computes all singular-values and left and right eigenvectors of a real and symmetric upper bi-diagona...
Definition: diagonalization.cpp:190

Function
Defines the function .
Definition: functions.h:38

RandomArrayGenerator::generate_random_array
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
Definition: random_array_generator.cpp:63

RandomNumberGenerator
Generates 64-bit integers on multiple parallel threads.
Definition: random_number_generator.h:104

cLinearOperator::get_eigenvalue
virtual DataType get_eigenvalue(const DataType *known_parameters, const DataType known_eigenvalue, const DataType *inquiry_parameters) const =0

cLinearOperator::set_parameters
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
Definition: c_linear_operator.cpp:108

cLinearOperator::get_num_rows
LongIndexType get_num_rows() const
Definition: c_linear_operator.cpp:79

cLinearOperator::is_eigenvalue_relation_known
FlagType is_eigenvalue_relation_known() const
Returns a flag that determines whether a relation between the parameters of the operator and its eige...
Definition: c_linear_operator.cpp:136

cLinearOperator::get_num_parameters
IndexType get_num_parameters() const
Definition: c_linear_operator.cpp:119

cuLinearOperator
Base class for linear operators. This class serves as interface for all derived classes.
Definition: cu_linear_operator.h:44

cuTraceEstimator
A static class to compute the trace of implicit matrix functions using stochastic Lanczos quadrature ...
Definition: cu_trace_estimator.h:40

cuTraceEstimator::_cu_stochastic_lanczos_quadrature
static void _cu_stochastic_lanczos_quadrature(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const IndexType lanczos_degree, const DataType lanczos_tol, RandomNumberGenerator &random_number_generator, DataType *random_vector, FlagType *converged, DataType *trace_estimate)
For a given random input vector, computes one Monte-Carlo sample to estimate trace using Lanczos quad...
Definition: cu_trace_estimator.cu:419

cuTraceEstimator::cu_trace_estimator
static FlagType cu_trace_estimator(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const int64_t seed, const IndexType lanczos_degree, const DataType lanczos_tol, const IndexType min_num_samples, const IndexType max_num_samples, const DataType error_atol, const DataType error_rtol, const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_threads, const IndexType num_gpu_devices, DataType *trace, DataType *error, DataType **samples, IndexType *processed_samples_indices, IndexType *num_samples_used, IndexType *num_outliers, FlagType *converged, float &alg_wall_time)
Stochastic Lanczos quadrature method to estimate trace of a function of a linear operator....
Definition: cu_trace_estimator.cu:197

cu_golub_kahn_bidiagonalization
IndexType cu_golub_kahn_bidiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
Definition: cu_golub_kahn_bidiagonalization.cu:113

cu_golub_kahn_bidiagonalization.h

cu_lanczos_tridiagonalization
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. m is the Lanczos degree, which will be the siz...
Definition: cu_lanczos_tridiagonalization.cu:119

cu_lanczos_tridiagonalization.h

cu_trace_estimator.h

FlagType
int FlagType
Definition: types.h:68

IndexType
int IndexType
Definition: types.h:65