cu_trace_estimator.cu
/*
 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
 * SPDX-License-Identifier: BSD-3-Clause
 * SPDX-FileType: SOURCE
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the license found in the LICENSE.txt file in the root
 * directory of this source tree.
 */


// =======
// Headers
// =======

#include "./cu_trace_estimator.h"
#include <omp.h>  // omp_set_num_threads
#include <cmath>  // sqrt, pow
#include <cstddef>  // NULL
#include "./cu_lanczos_tridiagonalization.h"  // cu_lanczos_tridiagonalization
#include "./cu_golub_kahn_bidiagonalization.h"  // cu_golub_kahn_bidiagonali...
#include "../_random_generator/random_array_generator.h"  // RandomArrayGene...
#include "../_c_trace_estimator/diagonalization.h"  // Diagonalization
#include "../_c_trace_estimator/convergence_tools.h"  // check_convergence, ...
#include "../_cuda_utilities/cuda_timer.h"  // CudaTimer
#include "../_cuda_utilities/cuda_interface.h"  // CudaInterface


// ==================
// cu trace estimator
// ==================


template <typename DataType>
FlagType cuTraceEstimator<DataType>::cu_trace_estimator(
        cuLinearOperator<DataType>* A,
        DataType* parameters,
        const IndexType num_inquiries,
        const Function* matrix_function,
        const FlagType gram,
        const DataType exponent,
        const FlagType orthogonalize,
        const int64_t seed,
        const IndexType lanczos_degree,
        const DataType lanczos_tol,
        const IndexType min_num_samples,
        const IndexType max_num_samples,
        const DataType error_atol,
        const DataType error_rtol,
        const DataType confidence_level,
        const DataType outlier_significance_level,
        const IndexType num_threads,
        const IndexType num_gpu_devices,
        DataType* trace,
        DataType* error,
        DataType** samples,
        IndexType* processed_samples_indices,
        IndexType* num_samples_used,
        IndexType* num_outliers,
        FlagType* converged,
        float& alg_wall_time)
{
    // Matrix size
    IndexType matrix_size = A->get_num_rows();

    // Set the number of OpenMP threads, one thread per GPU device
    omp_set_num_threads(num_gpu_devices);

    // Allocate a 1D array of random vectors. Only one random vector is
    // allocated per parallel thread, so the total size of the random vectors
    // is matrix_size*num_gpu_devices. The allocated memory is reused across
    // iterations; that is, in each iteration a new random vector is
    // generated in place for that specific thread id.
    IndexType random_vectors_size = matrix_size * num_gpu_devices;
    DataType* random_vectors = new DataType[random_vectors_size];

    // Initialize the random number generator so that parallel threads
    // generate independently.
    RandomNumberGenerator random_number_generator(num_gpu_devices, seed);

    // Counter for the filled size of the processed_samples_indices array.
    // This variable is shared among all threads.
    IndexType num_processed_samples = 0;

    // Criterion for early termination of the iterations once convergence is
    // reached. This variable is shared among all threads.
    FlagType all_converged = 0;

    // Use the square root of the maximum possible chunk size for the dynamic
    // parallel schedule.
    unsigned int chunk_size = static_cast<int>(
            sqrt(static_cast<DataType>(max_num_samples) / num_gpu_devices));
    if (chunk_size < 1)
    {
        chunk_size = 1;
    }

    // Timing the elapsed time of the algorithm
    CudaTimer cuda_timer;
    cuda_timer.start();

    // Shared-memory parallelism over Monte-Carlo ensemble sampling
    IndexType i;
    #pragma omp parallel for schedule(dynamic, chunk_size)
    for (i=0; i < max_num_samples; ++i)
    {
        if (!static_cast<bool>(all_converged))
        {
            // Switch to the device whose id matches the cpu thread id
            unsigned int thread_id = omp_get_thread_num();
            CudaInterface<DataType>::set_device(thread_id);

            // Perform one Monte-Carlo sampling to estimate trace
            cuTraceEstimator<DataType>::_cu_stochastic_lanczos_quadrature(
                    A, parameters, num_inquiries, matrix_function, gram,
                    exponent, orthogonalize, lanczos_degree, lanczos_tol,
                    random_number_generator,
                    &random_vectors[matrix_size*thread_id], converged,
                    samples[i]);

            // Critical section
            #pragma omp critical
            {
                // Store the index of the processed sample
                processed_samples_indices[num_processed_samples] = i;
                ++num_processed_samples;

                // Check whether the convergence criterion has been met.
                // This check can also run after another parallel thread has
                // set all_converged to "1", but we continue to update error.
                all_converged = ConvergenceTools<DataType>::check_convergence(
                        samples, min_num_samples, num_inquiries,
                        processed_samples_indices, num_processed_samples,
                        confidence_level, error_atol, error_rtol, error,
                        num_samples_used, converged);
            }
        }
    }

    // Elapsed wall time of the algorithm (computation only, not array i/o)
    cuda_timer.stop();
    alg_wall_time = cuda_timer.elapsed();

    // Remove outliers from the trace estimates and average the estimates
    ConvergenceTools<DataType>::average_estimates(
            confidence_level, outlier_significance_level, num_inquiries,
            max_num_samples, num_samples_used, processed_samples_indices,
            samples, num_outliers, trace, error);

    // Deallocate memory
    delete[] random_vectors;

    return all_converged;
}


// ================================
// cu stochastic lanczos quadrature
// ================================


template <typename DataType>
void cuTraceEstimator<DataType>::_cu_stochastic_lanczos_quadrature(
        cuLinearOperator<DataType>* A,
        DataType* parameters,
        const IndexType num_inquiries,
        const Function* matrix_function,
        const FlagType gram,
        const DataType exponent,
        const FlagType orthogonalize,
        const IndexType lanczos_degree,
        const DataType lanczos_tol,
        RandomNumberGenerator& random_number_generator,
        DataType* random_vector,
        FlagType* converged,
        DataType* trace_estimate)
{
    // Matrix size
    IndexType matrix_size = A->get_num_rows();

    // Fill the random vector with the Rademacher distribution (+1, -1),
    // normalized but not orthogonalized. Setting num_threads to zero
    // indicates that no new threads should be created in
    // RandomNumberGenerator, since the current function already runs inside
    // a parallel thread.
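    //
    // Note: for a unit-norm Rademacher vector v, the expectation of
    // v^T f(A) v equals trace(f(A)) / matrix_size; the multiplication by
    // matrix_size at the end of this function recovers the trace itself.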
    IndexType num_threads = 0;
    RandomArrayGenerator<DataType>::generate_random_array(
            random_number_generator, random_vector, matrix_size, num_threads);

    // Allocate diagonals (alpha) and supdiagonals (beta) of the Lanczos matrix
    DataType* alpha = new DataType[lanczos_degree];
    DataType* beta = new DataType[lanczos_degree];

    // Define the 2D arrays needed for the decomposition. All these arrays
    // are stored as 1D arrays with Fortran ordering.
    DataType* eigenvectors = NULL;
    DataType* left_singularvectors = NULL;
    DataType* right_singularvectors_transposed = NULL;

    // Actual number of inquiries
    IndexType required_num_inquiries = num_inquiries;
    if (A->is_eigenvalue_relation_known())
    {
        // When a relation between the eigenvalues and the parameters of the
        // linear operator is known, computing only one inquiry is enough,
        // since the eigenvalues for one parameter setting determine the
        // eigenvalues for any other set of parameters.
        required_num_inquiries = 1;
    }

    // Allocate and initialize theta
    IndexType i;
    IndexType j;
    DataType** theta = new DataType*[num_inquiries];
    for (j=0; j < num_inquiries; ++j)
    {
        theta[j] = new DataType[lanczos_degree];

        // Initialize components to zero
        for (i=0; i < lanczos_degree; ++i)
        {
            theta[j][i] = 0.0;
        }
    }

    // Allocate and initialize tau
    DataType** tau = new DataType*[num_inquiries];
    for (j=0; j < num_inquiries; ++j)
    {
        tau[j] = new DataType[lanczos_degree];

        // Initialize components to zero
        for (i=0; i < lanczos_degree; ++i)
        {
            tau[j][i] = 0.0;
        }
    }

    // Allocate the Lanczos size for each inquiry. This variable holds the
    // non-zero size of the tri-diagonal (or bi-diagonal) matrix. Ideally
    // this size equals lanczos_degree, but it may be smaller due to early
    // termination.
    IndexType* lanczos_size = new IndexType[num_inquiries];

    // Number of parameters of the linear operator A
    IndexType num_parameters = A->get_num_parameters();

    // Lanczos iterations; compute theta and tau for each inquiry parameter
    for (j=0; j < required_num_inquiries; ++j)
    {
        // If the trace for this inquiry has already converged, do not
        // compute a new sample. However, exclude the case where
        // required_num_inquiries differs from num_inquiries, since then a
        // single inquiry is computed on behalf of multiple parameters.
        if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
        {
            continue;
        }

        // Set the parameters of the linear operator A
        A->set_parameters(&parameters[j*num_parameters]);

        if (gram)
        {
            // Use Golub-Kahn-Lanczos bi-diagonalization
            lanczos_size[j] = cu_golub_kahn_bidiagonalization(
                    A, random_vector, matrix_size, lanczos_degree,
                    lanczos_tol, orthogonalize, alpha, beta);

            // Allocate matrices of singular vectors (1D arrays, Fortran
            // ordering)
            left_singularvectors = \
                new DataType[lanczos_size[j] * lanczos_size[j]];
            right_singularvectors_transposed = \
                new DataType[lanczos_size[j] * lanczos_size[j]];

            // Note: alpha is overwritten in-place with the singular values
            Diagonalization<DataType>::svd_bidiagonal(
                    alpha, beta, left_singularvectors,
                    right_singularvectors_transposed, lanczos_size[j]);

            // theta and tau from the singular values and vectors
            for (i=0; i < lanczos_size[j]; ++i)
            {
                theta[j][i] = alpha[i] * alpha[i];
                tau[j][i] = right_singularvectors_transposed[i];
            }
        }
        else
        {
            // Use Lanczos tri-diagonalization
            lanczos_size[j] = cu_lanczos_tridiagonalization(
                    A, random_vector, matrix_size, lanczos_degree,
                    lanczos_tol, orthogonalize, alpha, beta);

            // Allocate eigenvectors matrix (1D array with Fortran ordering)
            eigenvectors = new DataType[lanczos_size[j] * lanczos_size[j]];

            // Note: alpha is overwritten in-place with the eigenvalues
            Diagonalization<DataType>::eigh_tridiagonal(
                    alpha, beta, eigenvectors, lanczos_size[j]);

            // theta and tau from the eigenvalues and eigenvectors
            for (i=0; i < lanczos_size[j]; ++i)
            {
                theta[j][i] = alpha[i];
                tau[j][i] = eigenvectors[i * lanczos_size[j]];
            }
        }
    }

    // If an eigenvalue relation is known, compute the remaining eigenvalues
    // using the relation provided by the operator A. If no eigenvalue
    // relation is known, the remaining eigenvalues were already computed in
    // the loop above and no further computation is needed.
    if (A->is_eigenvalue_relation_known() && num_inquiries > 1)
    {
        // When execution reaches this point, at least one of the inquiries
        // has not converged, but others might already have. Here, we
        // force-update even the already-converged inquiries by resetting
        // their converged flag. The extra update is free of charge when a
        // relation for the eigenvalues is known.
        for (j=0; j < num_inquiries; ++j)
        {
            converged[j] = 0;
        }

        // Compute theta and tau for the rest of the inquiry parameters
        for (j=1; j < num_inquiries; ++j)
        {
            // Only j=0 was iterated above. Use the same size for the other j
            lanczos_size[j] = lanczos_size[0];

            for (i=0; i < lanczos_size[j]; ++i)
            {
                // Shift eigenvalues from the old to the new parameters
                theta[j][i] = A->get_eigenvalue(
                        &parameters[0],
                        theta[0][i],
                        &parameters[j*num_parameters]);

                // tau is the same (at least for an affine operator)
                tau[j][i] = tau[0][i];
            }
        }
    }

    // Estimate the trace using the quadrature method
    DataType quadrature_sum;
    for (j=0; j < num_inquiries; ++j)
    {
        // If the j-th inquiry has already converged, skip it.
        if (converged[j] == 1)
        {
            continue;
        }

        // Initialize the sum for the quadrature integral
        quadrature_sum = 0.0;

        // Important: this loop must iterate up to lanczos_size[j], not
        // lanczos_degree. Otherwise the computation is wrong for certain
        // matrices, for example when the input matrix is the identity or is
        // rank deficient. Using lanczos_size[j] instead of lanczos_degree
        // resolves these issues.
        for (i=0; i < lanczos_size[j]; ++i)
        {
            quadrature_sum += tau[j][i] * tau[j][i] * \
                    matrix_function->function(pow(theta[j][i], exponent));
        }

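        // One-sample SLQ estimate: theta[j][i] are the Gauss quadrature
        // nodes (Ritz values) and tau[j][i]^2 the corresponding weights, so
        // quadrature_sum approximates v^T f(A^exponent) v for the unit-norm
        // start vector v. Scaling by matrix_size below yields one
        // Monte-Carlo sample of trace(f(A^exponent)).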
        trace_estimate[j] = matrix_size * quadrature_sum;
    }

    // Release dynamic memory
    delete[] alpha;
    delete[] beta;
    delete[] lanczos_size;

    // theta and tau were allocated with num_inquiries rows; release all rows
    for (j=0; j < num_inquiries; ++j)
    {
        delete[] theta[j];
    }
    delete[] theta;

    for (j=0; j < num_inquiries; ++j)
    {
        delete[] tau[j];
    }
    delete[] tau;

    if (eigenvectors != NULL)
    {
        delete[] eigenvectors;
    }

    if (left_singularvectors != NULL)
    {
        delete[] left_singularvectors;
    }

    if (right_singularvectors_transposed != NULL)
    {
        delete[] right_singularvectors_transposed;
    }
}


// ===============================
// Explicit template instantiation
// ===============================

template class cuTraceEstimator<float>;
template class cuTraceEstimator<double>;
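

For orientation, the sketch below shows one way driver code might call cu_trace_estimator for a single inquiry, based only on the signature listed above. It is not part of this file: the wrapper name estimate_single_trace, the caller-supplied operator and function objects (any concrete cuLinearOperator<double> and Function subclass), and all numeric settings are illustrative assumptions.

// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of cu_trace_estimator.cu)
// ---------------------------------------------------------------------------

#include "./cu_trace_estimator.h"  // cuTraceEstimator

// Hypothetical wrapper: one-inquiry trace estimate on a single GPU device.
FlagType estimate_single_trace(
        cuLinearOperator<double>* op,     // any concrete operator subclass
        const Function* func,             // any matrix function f
        double& trace_out,
        double& error_out)
{
    const IndexType num_inquiries = 1;
    const IndexType min_num_samples = 10;
    const IndexType max_num_samples = 50;

    // A single (unused) parameter slot for an operator with no parameters
    double parameters[1] = {0.0};

    // Output buffers expected by cu_trace_estimator
    double trace[num_inquiries];
    double error[num_inquiries];
    IndexType processed_samples_indices[max_num_samples];
    IndexType num_samples_used[num_inquiries] = {0};
    IndexType num_outliers[num_inquiries] = {0};
    FlagType converged[num_inquiries] = {0};
    float alg_wall_time = 0.0f;

    // samples has shape (max_num_samples, num_inquiries)
    double** samples = new double*[max_num_samples];
    for (IndexType s = 0; s < max_num_samples; ++s)
    {
        samples[s] = new double[num_inquiries];
    }

    FlagType all_converged = cuTraceEstimator<double>::cu_trace_estimator(
            op, parameters, num_inquiries, func,
            0,       // gram: use Lanczos tri-diagonalization
            1.0,     // exponent: estimate trace of f(A^1)
            0,       // orthogonalize: no reorthogonalization
            0,       // seed (illustrative)
            80,      // lanczos_degree
            1e-2,    // lanczos_tol
            min_num_samples, max_num_samples,
            0.0,     // error_atol
            1e-2,    // error_rtol
            0.95,    // confidence_level
            0.001,   // outlier_significance_level
            1,       // num_threads (not used on this GPU path)
            1,       // num_gpu_devices
            trace, error, samples, processed_samples_indices,
            num_samples_used, num_outliers, converged, alg_wall_time);

    trace_out = trace[0];
    error_out = error[0];

    for (IndexType s = 0; s < max_num_samples; ++s)
    {
        delete[] samples[s];
    }
    delete[] samples;

    return all_converged;
}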