22 #include "../_random_generator/random_array_generator.h"   
   23 #include "../_c_trace_estimator/diagonalization.h"   
   24 #include "../_c_trace_estimator/convergence_tools.h"   
   25 #include "../_cuda_utilities/cuda_timer.h"   
   26 #include "../_cuda_utilities/cuda_interface.h"   
  196 template <
typename DataType>
 
  199         DataType* parameters,
 
  203         const DataType exponent,
 
  207         const DataType lanczos_tol,
 
  210         const DataType error_atol,
 
  211         const DataType error_rtol,
 
  212         const DataType confidence_level,
 
  213         const DataType outlier_significance_level,
 
  223         float& alg_wall_time)
 
  229     omp_set_num_threads(num_gpu_devices);
 
  236     IndexType random_vectors_size = matrix_size * num_gpu_devices;
 
  237     DataType* random_vectors = 
new DataType[random_vectors_size];
 
  252     unsigned int chunk_size = 
static_cast<int>(
 
  253             sqrt(
static_cast<DataType
>(max_num_samples) / num_gpu_devices));
 
  265     #pragma omp parallel for schedule(dynamic, chunk_size) 
  266     for (i=0; i < max_num_samples; ++i)
 
  268         if (!
static_cast<bool>(all_converged))
 
  271             unsigned int thread_id = omp_get_thread_num();
 
  276                     A, parameters, num_inquiries, matrix_function, gram,
 
  277                     exponent, orthogonalize, lanczos_degree, lanczos_tol,
 
  278                     random_number_generator,
 
  279                     &random_vectors[matrix_size*thread_id], converged,
 
  286                 processed_samples_indices[num_processed_samples] = i;
 
  287                 ++num_processed_samples;
 
  293                         samples, min_num_samples, num_inquiries,
 
  294                         processed_samples_indices, num_processed_samples,
 
  295                         confidence_level, error_atol, error_rtol, error,
 
  296                         num_samples_used, converged);
 
  303     alg_wall_time = cuda_timer.
elapsed();
 
  307             confidence_level, outlier_significance_level, num_inquiries,
 
  308             max_num_samples, num_samples_used, processed_samples_indices,
 
  309             samples, num_outliers, trace, error);
 
  312     delete[] random_vectors;
 
  314     return all_converged;
 
  418 template <
typename DataType>
 
  421         DataType* parameters,
 
  425         const DataType exponent,
 
  428         const DataType lanczos_tol,
 
  430         DataType* random_vector,
 
  432         DataType* trace_estimate)
 
  443             random_number_generator, random_vector, matrix_size, num_threads);
 
  446     DataType* alpha = 
new DataType[lanczos_degree];
 
  447     DataType* beta = 
new DataType[lanczos_degree];
 
  451     DataType* eigenvectors = NULL;
 
  452     DataType* left_singularvectors = NULL;
 
  453     DataType* right_singularvectors_transposed = NULL;
 
  456     IndexType required_num_inquiries = num_inquiries;
 
  464         required_num_inquiries = 1;
 
  470     DataType** theta = 
new DataType*[num_inquiries];
 
  471     for (j=0; j < num_inquiries; ++j)
 
  473         theta[j] = 
new DataType[lanczos_degree];
 
  476         for (i=0; i < lanczos_degree; ++i)
 
  483     DataType** tau = 
new DataType*[num_inquiries];
 
  484     for (j=0; j < num_inquiries; ++j)
 
  486         tau[j] = 
new DataType[lanczos_degree];
 
  489         for (i=0; i < lanczos_degree; ++i)
 
  505     for (j=0; j < required_num_inquiries; ++j)
 
  511         if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
 
  523                     A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
 
  524                     orthogonalize, alpha, beta);
 
  527             left_singularvectors = \
 
  528                 new DataType[lanczos_size[j] * lanczos_size[j]];
 
  529             right_singularvectors_transposed = \
 
  530                 new DataType[lanczos_size[j] * lanczos_size[j]];
 
  534                     alpha, beta, left_singularvectors,
 
  535                     right_singularvectors_transposed, lanczos_size[j]);
 
  538             for (i=0; i < lanczos_size[j]; ++i)
 
  540                 theta[j][i] = alpha[i] * alpha[i];
 
  541                 tau[j][i] = right_singularvectors_transposed[i];
 
  548                     A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
 
  549                     orthogonalize, alpha, beta);
 
  552             eigenvectors = 
new DataType[lanczos_size[j] * lanczos_size[j]];
 
  556                     alpha, beta, eigenvectors, lanczos_size[j]);
 
  559             for (i=0; i < lanczos_size[j]; ++i)
 
  561                 theta[j][i] = alpha[i];
 
  562                 tau[j][i] = eigenvectors[i * lanczos_size[j]];
 
  579         for (j=0; j < num_inquiries; ++j)
 
  585         for (j=1; j < num_inquiries; ++j)
 
  588             lanczos_size[j] = lanczos_size[0];
 
  590             for (i=0; i < lanczos_size[j]; ++i)
 
  596                         ¶meters[j*num_parameters]);
 
  599                 tau[j][i] = tau[0][i];
 
  605     DataType quadrature_sum;
 
  606     for (j=0; j < num_inquiries; ++j)
 
  609         if (converged[j] == 1)
 
  615         quadrature_sum = 0.0;
 
  622         for (i=0; i < lanczos_size[j]; ++i)
 
  624             quadrature_sum += tau[j][i] * tau[j][i] * \
 
  625                     matrix_function->function(pow(theta[j][i], exponent));
 
  628         trace_estimate[j] = matrix_size * quadrature_sum;
 
  634     delete[] lanczos_size;
 
  636     for (j=0; j < required_num_inquiries; ++j)
 
  642     for (j=0; j < required_num_inquiries; ++j)
 
  648     if (eigenvectors != NULL)
 
  650         delete[] eigenvectors;
 
  653     if (left_singularvectors != NULL)
 
  655         delete[] left_singularvectors;
 
  658     if (right_singularvectors_transposed != NULL)
 
  660         delete[] right_singularvectors_transposed;
 
static void set_device(int device_id)
Sets the current device in multi-gpu applications.
 
Records elasped time between two CUDA events.
 
void stop()
Stops the timer.
 
void start()
Starts the timer.
 
float elapsed() const
Returns the elapsed time in seconds.
 
static int eigh_tridiagonal(DataType *diagonals, DataType *subdiagonals, DataType *eigenvectors, IndexType matrix_size)
Computes all eigenvalues and eigenvectors of a real and symmetric tridiagonal matrix.
 
static int svd_bidiagonal(DataType *diagonals, DataType *subdiagonals, DataType *U, DataType *Vt, IndexType matrix_size)
Computes all singular-values and left and right eigenvectors of a real and symmetric upper bi-diagona...
 
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
 
Generates 64-bit integers on multiple parallel threads.
 
virtual DataType get_eigenvalue(const DataType *known_parameters, const DataType known_eigenvalue, const DataType *inquiry_parameters) const =0
 
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
 
LongIndexType get_num_rows() const
 
FlagType is_eigenvalue_relation_known() const
Returns a flag that determines whether a relation between the parameters of the operator and its eige...
 
IndexType get_num_parameters() const
 
Base class for linear operators. This class serves as interface for all derived classes.
 
A static class to compute the trace of implicit matrix functions using stochastic Lanczos quadrature ...
 
static void _cu_stochastic_lanczos_quadrature(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const IndexType lanczos_degree, const DataType lanczos_tol, RandomNumberGenerator &random_number_generator, DataType *random_vector, FlagType *converged, DataType *trace_estimate)
For a given random input vector, computes one Monte-Carlo sample to estimate trace using Lanczos quad...
 
static FlagType cu_trace_estimator(cuLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const int64_t seed, const IndexType lanczos_degree, const DataType lanczos_tol, const IndexType min_num_samples, const IndexType max_num_samples, const DataType error_atol, const DataType error_rtol, const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_threads, const IndexType num_gpu_devices, DataType *trace, DataType *error, DataType **samples, IndexType *processed_samples_indices, IndexType *num_samples_used, IndexType *num_outliers, FlagType *converged, float &alg_wall_time)
Stochastic Lanczos quadrature method to estimate trace of a function of a linear operator....
 
IndexType cu_golub_kahn_bidiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
 
IndexType cu_lanczos_tridiagonalization(cuLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...