imate
C++/CUDA Reference
c_trace_estimator.cpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3  * SPDX-License-Identifier: BSD-3-Clause
4  * SPDX-FileType: SOURCE
5  *
6  * This program is free software: you can redistribute it and/or modify it
7  * under the terms of the license found in the LICENSE.txt file in the root
8  * directory of this source tree.
9  */
10 
11 
12 // =======
13 // Headers
14 // =======
15 
16 #include "./c_trace_estimator.h"
17 #include <omp.h> // omp_set_num_threads
18 #include <cmath> // sqrt, pow
19 #include <cstddef> // NULL
20 #include "./c_lanczos_tridiagonalization.h" // c_lanczos_tridiagonalization
21 #include "./c_golub_kahn_bidiagonalization.h" // c_golub_kahn_bidiagonaliza...
22 #include "../_random_generator/random_array_generator.h" // RandomArrayGene...
23 #include "./diagonalization.h" // Diagonalization
24 #include "./convergence_tools.h" // check_convergence, average_estimates
25 #include "../_utilities/timer.h" // Timer
26 
27 
28 // =================
29 // c trace estimator
30 // =================
31 
190 
191 template <typename DataType>
194  DataType* parameters,
195  const IndexType num_inquiries,
196  const Function* matrix_function,
197  const FlagType gram,
198  const DataType exponent,
199  const FlagType orthogonalize,
200  const int64_t seed,
201  const IndexType lanczos_degree,
202  const DataType lanczos_tol,
203  const IndexType min_num_samples,
204  const IndexType max_num_samples,
205  const DataType error_atol,
206  const DataType error_rtol,
207  const DataType confidence_level,
208  const DataType outlier_significance_level,
209  const IndexType num_threads,
210  DataType* trace,
211  DataType* error,
212  DataType** samples,
213  IndexType* processed_samples_indices,
214  IndexType* num_samples_used,
215  IndexType* num_outliers,
216  FlagType* converged,
217  float& alg_wall_time)
218 {
219  // Matrix size
220  IndexType matrix_size = A->get_num_rows();
221 
222  // Set the number of threads
223  omp_set_num_threads(num_threads);
224 
225  // Allocate 1D array of random vectors We only allocate a random vector
226  // per parallel thread. Thus, the total size of the random vectors is
227  // matrix_size*num_threads. On each iteration in parallel threads, the
228  // allocated memory is reused. That is, in each iteration, a new random
229  // vector is generated for that specific thread id.
230  IndexType random_vectors_size = matrix_size * num_threads;
231  DataType* random_vectors = new DataType[random_vectors_size];
232 
233  // Initialize random number generator to generate in parallel threads
234  // independently.
235  RandomNumberGenerator random_number_generator(num_threads, seed);
236 
237  // The counter of filled size of processed_samples_indices array
238  // This scalar variable is defined as array to be shared among al threads
239  IndexType num_processed_samples = 0;
240 
241  // Criterion for early termination of iterations if convergence reached
242  // This scalar variable is defined as array to be shared among al threads
243  FlagType all_converged = 0;
244 
245  // Using square-root of max possible chunk size for parallel schedules
246  unsigned int chunk_size = static_cast<int>(
247  sqrt(static_cast<DataType>(max_num_samples) / num_threads));
248  if (chunk_size < 1)
249  {
250  chunk_size = 1;
251  }
252 
253  // Timing elapsed time of algorithm
254  Timer timer;
255  timer.start();
256 
257  // Shared-memory parallelism over Monte-Carlo ensemble sampling
258  IndexType i;
259  #pragma omp parallel for schedule(dynamic, chunk_size)
260  for (i=0; i < max_num_samples; ++i)
261  {
262  if (!static_cast<bool>(all_converged))
263  {
264  int thread_id = omp_get_thread_num();
265 
266  // Perform one Monte-Carlo sampling to estimate trace
268  A, parameters, num_inquiries, matrix_function, gram,
269  exponent, orthogonalize, lanczos_degree, lanczos_tol,
270  random_number_generator,
271  &random_vectors[matrix_size*thread_id], converged,
272  samples[i]);
273 
274  // Critical section
275  #pragma omp critical
276  {
277  // Store the index of processed samples
278  processed_samples_indices[num_processed_samples] = i;
279  ++num_processed_samples;
280 
281  // Check whether convergence criterion has been met to stop.
282  // This check can also be done after another parallel thread
283  // set all_converged to "1", but we continue to update error.
285  samples, min_num_samples, num_inquiries,
286  processed_samples_indices, num_processed_samples,
287  confidence_level, error_atol, error_rtol, error,
288  num_samples_used, converged);
289  }
290  }
291  }
292 
293  // Elapsed wall time of the algorithm (computation only, not array i/o)
294  timer.stop();
295  alg_wall_time = timer.elapsed();
296 
297  // Remove outliers from trace estimates and average trace estimates
299  confidence_level, outlier_significance_level, num_inquiries,
300  max_num_samples, num_samples_used, processed_samples_indices,
301  samples, num_outliers, trace, error);
302 
303  // Deallocate memory
304  delete[] random_vectors;
305 
306  return all_converged;
307 }
308 
309 
310 // ===============================
311 // c stochastic lanczos quadrature
312 // ===============================
313 
409 
410 template <typename DataType>
413  DataType* parameters,
414  const IndexType num_inquiries,
415  const Function* matrix_function,
416  const FlagType gram,
417  const DataType exponent,
418  const FlagType orthogonalize,
419  const IndexType lanczos_degree,
420  const DataType lanczos_tol,
421  RandomNumberGenerator& random_number_generator,
422  DataType* random_vector,
423  FlagType* converged,
424  DataType* trace_estimate)
425 {
426  // Matrix size
427  IndexType matrix_size = A->get_num_rows();
428 
429  // Fill random vectors with Rademacher distribution (+1, -1), normalized
430  // but not orthogonalized. Setting num_threads to zero indicates to not
431  // create any new threads in RandomNumbrGenerator since the current
432  // function is inside a parallel thread.
433  IndexType num_threads = 0;
435  random_number_generator, random_vector, matrix_size, num_threads);
436 
437  // Allocate diagonals (alpha) and supdiagonals (beta) of Lanczos matrix
438  DataType* alpha = new DataType[lanczos_degree];
439  DataType* beta = new DataType[lanczos_degree];
440 
441  // Define 2D arrays needed to decomposition. All these arrays are
442  // defined as 1D array with Fortran ordering
443  DataType* eigenvectors = NULL;
444  DataType* left_singularvectors = NULL;
445  DataType* right_singularvectors_transposed = NULL;
446 
447  // Actual number of inquiries
448  IndexType required_num_inquiries = num_inquiries;
450  {
451  // When a relation between eigenvalues and the parameters of the linear
452  // operator is known, to compute eigenvalues of for each inquiry, only
453  // computing one inquiry is enough. This is because an eigenvalue for
454  // one parameter setting is enough to compute eigenvalue of another set
455  // of parameters.
456  required_num_inquiries = 1;
457  }
458 
459  // Allocate and initialize theta
460  IndexType i;
461  IndexType j;
462  DataType** theta = new DataType*[num_inquiries];
463  for (j=0; j < num_inquiries; ++j)
464  {
465  theta[j] = new DataType[lanczos_degree];
466 
467  // Initialize components to zero
468  for (i=0; i < lanczos_degree; ++i)
469  {
470  theta[j][i] = 0.0;
471  }
472  }
473 
474  // Allocate and initialize tau
475  DataType** tau = new DataType*[num_inquiries];
476  for (j=0; j < num_inquiries; ++j)
477  {
478  tau[j] = new DataType[lanczos_degree];
479 
480  // Initialize components to zero
481  for (i=0; i < lanczos_degree; ++i)
482  {
483  tau[j][i] = 0.0;
484  }
485  }
486 
487  // Allocate lanczos size for each inquiry. This variable keeps the non-zero
488  // size of the tri-diagonal (or bi-diagonal) matrix. Ideally, this matrix
489  // is of the size lanczos_degree. But, due to the early termination, this
490  // size might be smaller.
491  IndexType* lanczos_size = new IndexType[num_inquiries];
492 
493  // Number of parameters of linear operator A
494  IndexType num_parameters = A->get_num_parameters();
495 
496  // Lanczos iterations, computes theta and tau for each inquiry parameter
497  for (j=0; j < required_num_inquiries; ++j)
498  {
499  // If trace is already converged, do not compute on the new sample.
500  // However, exclude the case where required_num_inquiries is not the
501  // same as num_inquiries, since in this case, we compute one inquiry
502  // for multiple parameters.
503  if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
504  {
505  continue;
506  }
507 
508  // Set parameter of linear operator A
509  A->set_parameters(&parameters[j*num_parameters]);
510 
511  if (gram)
512  {
513  // Use Golub-Kahn-Lanczos Bi-diagonalization
514  lanczos_size[j] = c_golub_kahn_bidiagonalization(
515  A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
516  orthogonalize, alpha, beta);
517 
518  // Allocate matrix of singular vectors (1D array, Fortran ordering)
519  left_singularvectors = \
520  new DataType[lanczos_size[j] * lanczos_size[j]];
521  right_singularvectors_transposed = \
522  new DataType[lanczos_size[j] * lanczos_size[j]];
523 
524  // Note: alpha is written in-place with singular values
526  alpha, beta, left_singularvectors,
527  right_singularvectors_transposed, lanczos_size[j]);
528 
529  // theta and tau from singular values and vectors
530  for (i=0; i < lanczos_size[j]; ++i)
531  {
532  theta[j][i] = alpha[i] * alpha[i];
533  tau[j][i] = right_singularvectors_transposed[i];
534  }
535  }
536  else
537  {
538  // Use Lanczos Tri-diagonalization
539  lanczos_size[j] = c_lanczos_tridiagonalization(
540  A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
541  orthogonalize, alpha, beta);
542 
543  // Allocate eigenvectors matrix (1D array with Fortran ordering)
544  eigenvectors = new DataType[lanczos_size[j] * lanczos_size[j]];
545 
546  // Note: alpha is written in-place with eigenvalues
548  alpha, beta, eigenvectors, lanczos_size[j]);
549 
550  // theta and tau from singular values and vectors
551  for (i=0; i < lanczos_size[j]; ++i)
552  {
553  theta[j][i] = alpha[i];
554  tau[j][i] = eigenvectors[i * lanczos_size[j]];
555  }
556  }
557  }
558 
559  // If an eigenvalue relation is known, compute the rest of eigenvalues
560  // using the eigenvalue relation given in the operator A for its
561  // eigenvalues. If no eigenvalue relation is not known, the rest of
562  // eigenvalues were already computed in the above loop and no other
563  // computation is needed.
564  if (A->is_eigenvalue_relation_known() && num_inquiries > 1)
565  {
566  // When the code execution reaches this function, at least one of the
567  // inquiries is not converged, but some others might have been
568  // converged already. Here, we force-update those that are even
569  // converged already by setting converged to false. The extra update is
570  // free of charge when a relation for the eigenvalues are known.
571  for (j=0; j < num_inquiries; ++j)
572  {
573  converged[j] = 0;
574  }
575 
576  // Compute theta and tau for the rest of inquiry parameters
577  for (j=1; j < num_inquiries; ++j)
578  {
579  // Only j=0 was iterated before. Set the same size for other j-s
580  lanczos_size[j] = lanczos_size[0];
581 
582  for (i=0; i < lanczos_size[j]; ++i)
583  {
584  // Shift eigenvalues by the old and new parameters
585  theta[j][i] = A->get_eigenvalue(
586  &parameters[0],
587  theta[0][i],
588  &parameters[j*num_parameters]);
589 
590  // tau is the same (at least for the affine operator)
591  tau[j][i] = tau[0][i];
592  }
593  }
594  }
595 
596  // Estimate trace using quadrature method
597  DataType quadrature_sum;
598  for (j=0; j < num_inquiries; ++j)
599  {
600  // If the j-th inquiry is already converged, skip.
601  if (converged[j] == 1)
602  {
603  continue;
604  }
605 
606  // Initialize sum for the integral of quadrature
607  quadrature_sum = 0.0;
608 
609  // Important: This loop should iterate till lanczos_size[j], but not
610  // lanczos_degree. Otherwise the computation is wrong for certain
611  // matrices, such as if the input matrix is identity, or rank
612  // deficient. By using lanczos_size[j] instead of lanczos_degree, all
613  // issues with special matrices will resolve.
614  for (i=0; i < lanczos_size[j]; ++i)
615  {
616  quadrature_sum += tau[j][i] * tau[j][i] * \
617  matrix_function->function(pow(theta[j][i], exponent));
618  }
619 
620  trace_estimate[j] = matrix_size * quadrature_sum;
621  }
622 
623  // Release dynamic memory
624  delete[] alpha;
625  delete[] beta;
626  delete[] lanczos_size;
627 
628  for (j=0; j < required_num_inquiries; ++j)
629  {
630  delete[] theta[j];
631  }
632  delete[] theta;
633 
634  for (j=0; j < required_num_inquiries; ++j)
635  {
636  delete[] tau[j];
637  }
638  delete[] tau;
639 
640  if (eigenvectors != NULL)
641  {
642  delete[] eigenvectors;
643  }
644 
645  if (left_singularvectors != NULL)
646  {
647  delete[] left_singularvectors;
648  }
649 
650  if (right_singularvectors_transposed != NULL)
651  {
652  delete[] right_singularvectors_transposed;
653  }
654 }
655 
656 
657 // ===============================
658 // Explicit template instantiation
659 // ===============================
660 
661 template class cTraceEstimator<float>;
662 template class cTraceEstimator<double>;
663 template class cTraceEstimator<long double>;
IndexType c_golub_kahn_bidiagonalization(cLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
IndexType c_lanczos_tridiagonalization(cLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...
static FlagType check_convergence(DataType **samples, const IndexType min_num_samples, const IndexType num_inquiries, const IndexType *processed_samples_indices, const IndexType num_processed_samples, const DataType confidence_level, const DataType error_atol, const DataType error_rtol, DataType *error, IndexType *num_samples_used, FlagType *converged)
Checks if the standard deviation of the set of the cumulative averages of trace estimators converged ...
static void average_estimates(const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_inquiries, const IndexType max_num_samples, const IndexType *num_samples_used, const IndexType *processed_samples_indices, DataType **samples, IndexType *num_outliers, DataType *trace, DataType *error)
Averages the estimates of trace. Removes outliers and reevaluates the error to take into account for ...
static int eigh_tridiagonal(DataType *diagonals, DataType *subdiagonals, DataType *eigenvectors, IndexType matrix_size)
Computes all eigenvalues and eigenvectors of a real and symmetric tridiagonal matrix.
static int svd_bidiagonal(DataType *diagonals, DataType *subdiagonals, DataType *U, DataType *Vt, IndexType matrix_size)
Computes all singular-values and left and right eigenvectors of a real and symmetric upper bi-diagona...
Defines the function .
Definition: functions.h:38
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
Generates 64-bit integers on multiple parallel threads.
Records elasped wall time between two events.
Definition: timer.h:50
void start()
Starts the timer.
Definition: timer.cpp:66
void stop()
Stops the timer.
Definition: timer.cpp:79
double elapsed() const
Returns the elapsed time in seconds.
Definition: timer.cpp:92
Base class for linear operators. This class serves as interface for all derived classes.
virtual DataType get_eigenvalue(const DataType *known_parameters, const DataType known_eigenvalue, const DataType *inquiry_parameters) const =0
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
LongIndexType get_num_rows() const
FlagType is_eigenvalue_relation_known() const
Returns a flag that determines whether a relation between the parameters of the operator and its eige...
IndexType get_num_parameters() const
A static class to compute the trace of implicit matrix functions using stochastic Lanczos quadrature ...
static void _c_stochastic_lanczos_quadrature(cLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const IndexType lanczos_degree, const DataType lanczos_tol, RandomNumberGenerator &random_number_generator, DataType *random_vector, FlagType *converged, DataType *trace_estimate)
For a given random input vector, computes one Monte-Carlo sample to estimate trace using Lanczos quad...
static FlagType c_trace_estimator(cLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const int64_t seed, const IndexType lanczos_degree, const DataType lanczos_tol, const IndexType min_num_samples, const IndexType max_num_samples, const DataType error_atol, const DataType error_rtol, const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_threads, DataType *trace, DataType *error, DataType **samples, IndexType *processed_samples_indices, IndexType *num_samples_used, IndexType *num_outliers, FlagType *converged, float &alg_wall_time)
Stochastic Lanczos quadrature method to estimate trace of a function of a linear operator....
int FlagType
Definition: types.h:68
int IndexType
Definition: types.h:65