imate
C++/CUDA Reference
Loading...
Searching...
No Matches
c_trace_estimator.cpp
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
16#include "./c_trace_estimator.h"
17#include "../_definitions/definitions.h" // USE_OPENMP
18#if defined(USE_OPENMP) && (USE_OPENMP == 1)
19 #include <omp.h> // omp_set_num_threads, omp_in_parallel
20#endif
21#include <cmath> // std::sqrt, std::pow
22#include <cstddef> // NULL
23#include "./c_lanczos_tridiagonalization.h" // c_lanczos_tridiagonalization
24#include "./c_golub_kahn_bidiagonalization.h" // c_golub_kahn_bidiagonaliza...
25#include "../_random_generator/random_array_generator.h" // RandomArrayGene...
26#include "./diagonalization.h" // Diagonalization
27#include "./convergence_tools.h" // check_convergence, average_estimates
28#include "../_utilities/timer.h" // Timer
29
30
31// =================
32// c trace estimator
33// =================
34
193
194template <typename DataType>
197 DataType* parameters,
198 const IndexType num_inquiries,
199 const Function* matrix_function,
200 const FlagType gram,
201 const DataType exponent,
202 const FlagType orthogonalize,
203 const int64_t seed,
204 const IndexType lanczos_degree,
205 const DataType lanczos_tol,
206 const IndexType min_num_samples,
207 const IndexType max_num_samples,
208 const DataType error_atol,
209 const DataType error_rtol,
210 const DataType confidence_level,
211 const DataType outlier_significance_level,
212 const IndexType num_threads,
213 DataType* trace,
214 DataType* error,
215 DataType** samples,
216 IndexType* processed_samples_indices,
217 IndexType* num_samples_used,
218 IndexType* num_outliers,
219 FlagType* converged,
220 double& alg_wall_time)
221{
222 // Matrix size
223 IndexType matrix_size = A->get_num_rows();
224
225 // Set the number of threads
226 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
227 omp_set_num_threads(num_threads);
228 #endif
229
230 // Allocate 1D array of random vectors We only allocate a random vector
231 // per parallel thread. Thus, the total size of the random vectors is
232 // matrix_size*num_threads. On each iteration in parallel threads, the
233 // allocated memory is reused. That is, in each iteration, a new random
234 // vector is generated for that specific thread id.
235 IndexType random_vectors_size = matrix_size * num_threads;
236 DataType* random_vectors = new DataType[random_vectors_size];
237
238 // Initialize random number generator to generate in parallel threads
239 // independently.
240 RandomNumberGenerator random_number_generator(num_threads, seed);
241
242 // The counter of filled size of processed_samples_indices array
243 // This scalar variable is defined as array to be shared among al threads
244 IndexType num_processed_samples = 0;
245
246 // Criterion for early termination of iterations if convergence reached
247 // This scalar variable is defined as array to be shared among al threads
248 FlagType all_converged = 0;
249
250 // Using square-root of max possible chunk size for parallel schedules
251 unsigned int chunk_size = static_cast<int>(
252 std::sqrt(static_cast<DataType>(max_num_samples) / \
253 static_cast<DataType>(num_threads)));
254 if (chunk_size < 1)
255 {
256 chunk_size = 1;
257 }
258
259 // Timing elapsed time of algorithm
260 Timer timer;
261 timer.start();
262
263 // Shared-memory parallelism over Monte-Carlo ensemble sampling
264 IndexType i;
265 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
266 #pragma omp parallel for \
267 schedule(dynamic, chunk_size) \
268 if (max_num_samples >= num_threads)
269 #endif
270 for (i=0; i < max_num_samples; ++i)
271 {
272 if (!static_cast<bool>(all_converged))
273 {
274 int thread_id;
275 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
276 thread_id = omp_get_thread_num();
277 #else
278 thread_id = 0;
279 #endif
280
281 // Perform one Monte-Carlo sampling to estimate trace
283 A, parameters, num_inquiries, matrix_function, gram,
284 exponent, orthogonalize, lanczos_degree, lanczos_tol,
285 random_number_generator,
286 &random_vectors[matrix_size*thread_id], converged,
287 samples[i]);
288
289 // Critical section
290 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
291 #pragma omp critical
292 #endif
293 {
294 // Store the index of processed samples
295 processed_samples_indices[num_processed_samples] = i;
296 ++num_processed_samples;
297
298 // Check whether convergence criterion has been met to stop.
299 // This check can also be done after another parallel thread
300 // set all_converged to "1", but we continue to update error.
302 samples, min_num_samples, num_inquiries,
303 processed_samples_indices, num_processed_samples,
304 confidence_level, error_atol, error_rtol, error,
305 num_samples_used, converged);
306 }
307 }
308 }
309
310 // Elapsed wall time of the algorithm (computation only, not array i/o)
311 timer.stop();
312 alg_wall_time = timer.elapsed();
313
314 // Remove outliers from trace estimates and average trace estimates
316 confidence_level, outlier_significance_level, num_inquiries,
317 max_num_samples, num_samples_used, processed_samples_indices,
318 samples, num_outliers, trace, error);
319
320 // Deallocate memory
321 delete[] random_vectors;
322
323 return all_converged;
324}
325
326
327// ===============================
328// c stochastic lanczos quadrature
329// ===============================
330
426
427template <typename DataType>
430 DataType* parameters,
431 const IndexType num_inquiries,
432 const Function* matrix_function,
433 const FlagType gram,
434 const DataType exponent,
435 const FlagType orthogonalize,
436 const IndexType lanczos_degree,
437 const DataType lanczos_tol,
438 RandomNumberGenerator& random_number_generator,
439 DataType* random_vector,
440 FlagType* converged,
441 DataType* trace_estimate)
442{
443 // Matrix size
444 IndexType matrix_size = A->get_num_rows();
445
446 // Fill random vectors with Rademacher distribution (+1, -1), normalized
447 // but not orthogonalized. Setting num_threads to zero indicates to not
448 // create any new threads in RandomNumbrGenerator since the current
449 // function is inside a parallel thread.
450 IndexType num_threads = 0;
452 random_number_generator, random_vector, matrix_size, num_threads);
453
454 // Allocate diagonals (alpha) and supdiagonals (beta) of Lanczos matrix
455 DataType* alpha = new DataType[lanczos_degree];
456 DataType* beta = new DataType[lanczos_degree];
457
458 // Define 2D arrays needed to decomposition. All these arrays are
459 // defined as 1D array with Fortran ordering
460 DataType* eigenvectors = NULL;
461 DataType* left_singularvectors = NULL;
462 DataType* right_singularvectors_transposed = NULL;
463
464 // Actual number of inquiries
465 IndexType required_num_inquiries = num_inquiries;
467 {
468 // When a relation between eigenvalues and the parameters of the linear
469 // operator is known, to compute eigenvalues of for each inquiry, only
470 // computing one inquiry is enough. This is because an eigenvalue for
471 // one parameter setting is enough to compute eigenvalue of another set
472 // of parameters.
473 required_num_inquiries = 1;
474 }
475
476 // Allocate and initialize theta
477 IndexType i;
478 IndexType j;
479 DataType** theta = new DataType*[num_inquiries];
480 for (j=0; j < num_inquiries; ++j)
481 {
482 theta[j] = new DataType[lanczos_degree];
483
484 // Initialize components to zero
485 for (i=0; i < lanczos_degree; ++i)
486 {
487 theta[j][i] = 0.0;
488 }
489 }
490
491 // Allocate and initialize tau
492 DataType** tau = new DataType*[num_inquiries];
493 for (j=0; j < num_inquiries; ++j)
494 {
495 tau[j] = new DataType[lanczos_degree];
496
497 // Initialize components to zero
498 for (i=0; i < lanczos_degree; ++i)
499 {
500 tau[j][i] = 0.0;
501 }
502 }
503
504 // Allocate lanczos size for each inquiry. This variable keeps the non-zero
505 // size of the tri-diagonal (or bi-diagonal) matrix. Ideally, this matrix
506 // is of the size lanczos_degree. But, due to the early termination, this
507 // size might be smaller.
508 IndexType* lanczos_size = new IndexType[num_inquiries];
509
510 // Number of parameters of linear operator A
511 IndexType num_parameters = A->get_num_parameters();
512
513 // Lanczos iterations, computes theta and tau for each inquiry parameter
514 for (j=0; j < required_num_inquiries; ++j)
515 {
516 // If trace is already converged, do not compute on the new sample.
517 // However, exclude the case where required_num_inquiries is not the
518 // same as num_inquiries, since in this case, we compute one inquiry
519 // for multiple parameters.
520 if ((converged[j] == 1) && (required_num_inquiries == num_inquiries))
521 {
522 continue;
523 }
524
525 // Set parameter of linear operator A
526 A->set_parameters(&parameters[j*num_parameters]);
527
528 if (gram)
529 {
530 // Use Golub-Kahn-Lanczos Bi-diagonalization
531 lanczos_size[j] = c_golub_kahn_bidiagonalization(
532 A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
533 orthogonalize, alpha, beta);
534
535 // Allocate matrix of singular vectors (1D array, Fortran ordering)
536 left_singularvectors = \
537 new DataType[lanczos_size[j] * lanczos_size[j]];
538 right_singularvectors_transposed = \
539 new DataType[lanczos_size[j] * lanczos_size[j]];
540
541 // Note: alpha is written in-place with singular values
543 alpha, beta, left_singularvectors,
544 right_singularvectors_transposed, lanczos_size[j]);
545
546 // theta and tau from singular values and vectors
547 for (i=0; i < lanczos_size[j]; ++i)
548 {
549 theta[j][i] = alpha[i] * alpha[i];
550 tau[j][i] = right_singularvectors_transposed[i];
551 }
552 }
553 else
554 {
555 // Use Lanczos Tri-diagonalization
556 lanczos_size[j] = c_lanczos_tridiagonalization(
557 A, random_vector, matrix_size, lanczos_degree, lanczos_tol,
558 orthogonalize, alpha, beta);
559
560 // Allocate eigenvectors matrix (1D array with Fortran ordering)
561 eigenvectors = new DataType[lanczos_size[j] * lanczos_size[j]];
562
563 // Note: alpha is written in-place with eigenvalues
565 alpha, beta, eigenvectors, lanczos_size[j]);
566
567 // theta and tau from singular values and vectors
568 for (i=0; i < lanczos_size[j]; ++i)
569 {
570 theta[j][i] = alpha[i];
571 tau[j][i] = eigenvectors[i * lanczos_size[j]];
572 }
573 }
574 }
575
576 // If an eigenvalue relation is known, compute the rest of eigenvalues
577 // using the eigenvalue relation given in the operator A for its
578 // eigenvalues. If no eigenvalue relation is not known, the rest of
579 // eigenvalues were already computed in the above loop and no other
580 // computation is needed.
581 if (A->is_eigenvalue_relation_known() && num_inquiries > 1)
582 {
583 // When the code execution reaches this function, at least one of the
584 // inquiries is not converged, but some others might have been
585 // converged already. Here, we force-update those that are even
586 // converged already by setting converged to false. The extra update is
587 // free of charge when a relation for the eigenvalues are known.
588 for (j=0; j < num_inquiries; ++j)
589 {
590 converged[j] = 0;
591 }
592
593 // Compute theta and tau for the rest of inquiry parameters
594 for (j=1; j < num_inquiries; ++j)
595 {
596 // Only j=0 was iterated before. Set the same size for other j-s
597 lanczos_size[j] = lanczos_size[0];
598
599 for (i=0; i < lanczos_size[j]; ++i)
600 {
601 // Shift eigenvalues by the old and new parameters
602 theta[j][i] = A->get_eigenvalue(
603 &parameters[0],
604 theta[0][i],
605 &parameters[j*num_parameters]);
606
607 // tau is the same (at least for the affine operator)
608 tau[j][i] = tau[0][i];
609 }
610 }
611 }
612
613 // Estimate trace using quadrature method
614 DataType quadrature_sum;
615 for (j=0; j < num_inquiries; ++j)
616 {
617 // If the j-th inquiry is already converged, skip.
618 if (converged[j] == 1)
619 {
620 continue;
621 }
622
623 // Initialize sum for the integral of quadrature
624 quadrature_sum = 0.0;
625
626 // Important: This loop should iterate till lanczos_size[j], but not
627 // lanczos_degree. Otherwise the computation is wrong for certain
628 // matrices, such as if the input matrix is identity, or rank
629 // deficient. By using lanczos_size[j] instead of lanczos_degree, all
630 // issues with special matrices will resolve.
631 for (i=0; i < lanczos_size[j]; ++i)
632 {
633 quadrature_sum += tau[j][i] * tau[j][i] * \
634 matrix_function->function(std::pow(theta[j][i], exponent));
635 }
636
637 trace_estimate[j] = \
638 static_cast<DataType>(matrix_size) * quadrature_sum;
639 }
640
641 // Release dynamic memory
642 delete[] alpha;
643 delete[] beta;
644 delete[] lanczos_size;
645
646 for (j=0; j < required_num_inquiries; ++j)
647 {
648 delete[] theta[j];
649 }
650 delete[] theta;
651
652 for (j=0; j < required_num_inquiries; ++j)
653 {
654 delete[] tau[j];
655 }
656 delete[] tau;
657
658 if (eigenvectors != NULL)
659 {
660 delete[] eigenvectors;
661 }
662
663 if (left_singularvectors != NULL)
664 {
665 delete[] left_singularvectors;
666 }
667
668 if (right_singularvectors_transposed != NULL)
669 {
670 delete[] right_singularvectors_transposed;
671 }
672}
673
674
675// ===============================
676// Explicit template instantiation
677// ===============================
678
679template class cTraceEstimator<float>;
680template class cTraceEstimator<double>;
681template class cTraceEstimator<long double>;
IndexType c_golub_kahn_bidiagonalization(cLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Bi-diagonalizes the positive-definite matrix A using Golub-Kahn-Lanczos method.
IndexType c_lanczos_tridiagonalization(cLinearOperator< DataType > *A, const DataType *v, const LongIndexType n, const IndexType m, const DataType lanczos_tol, const FlagType orthogonalize, DataType *alpha, DataType *beta)
Tri-diagonalizes matrix A to T using the start vector v. is the Lanczos degree, which will be the siz...
static FlagType check_convergence(DataType **samples, const IndexType min_num_samples, const IndexType num_inquiries, const IndexType *processed_samples_indices, const IndexType num_processed_samples, const DataType confidence_level, const DataType error_atol, const DataType error_rtol, DataType *error, IndexType *num_samples_used, FlagType *converged)
Checks if the standard deviation of the set of the cumulative averages of trace estimators converged ...
static void average_estimates(const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_inquiries, const IndexType max_num_samples, const IndexType *num_samples_used, const IndexType *processed_samples_indices, DataType **samples, IndexType *num_outliers, DataType *trace, DataType *error)
Averages the estimates of trace. Removes outliers and reevaluates the error to take into account for ...
static int eigh_tridiagonal(DataType *diagonals, DataType *subdiagonals, DataType *eigenvectors, IndexType matrix_size)
Computes all eigenvalues and eigenvectors of a real and symmetric tridiagonal matrix.
static int svd_bidiagonal(DataType *diagonals, DataType *subdiagonals, DataType *U, DataType *Vt, IndexType matrix_size)
Computes all singular-values and left and right eigenvectors of a real and symmetric upper bi-diagona...
Defines the function .
Definition functions.h:38
virtual float function(const float lambda_) const =0
static void generate_random_array(RandomNumberGenerator &random_number_generator, DataType *array, const LongIndexType array_size, const IndexType num_threads)
Generates a pseudo-random array with Rademacher distribution where elements are either +1 or -1.
Generates 64-bit integers on multiple parallel threads.
Records elasped wall time between two events.
Definition timer.h:50
void start()
Starts the timer.
Definition timer.cpp:66
void stop()
Stops the timer.
Definition timer.cpp:79
double elapsed() const
Returns the elapsed time in seconds.
Definition timer.cpp:92
IndexType get_num_parameters() const
Returns the number of parameters of the linear operator.
LongIndexType get_num_rows() const
Returns the number of rows of the matrix.
FlagType is_eigenvalue_relation_known() const
Returns a flag that determines whether a relation between the parameters of the operator and its eige...
Base class for linear operators. This class serves as interface for all derived classes.
virtual DataType get_eigenvalue(const DataType *known_parameters, const DataType known_eigenvalue, const DataType *inquiry_parameters) const =0
void set_parameters(DataType *parameters_)
Sets the scalar parameter this->parameters. Parameter is initialized to NULL. However,...
A static class to compute the trace of implicit matrix functions using stochastic Lanczos quadrature ...
static void _c_stochastic_lanczos_quadrature(cLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const IndexType lanczos_degree, const DataType lanczos_tol, RandomNumberGenerator &random_number_generator, DataType *random_vector, FlagType *converged, DataType *trace_estimate)
For a given random input vector, computes one Monte-Carlo sample to estimate trace using Lanczos quad...
static FlagType c_trace_estimator(cLinearOperator< DataType > *A, DataType *parameters, const IndexType num_inquiries, const Function *matrix_function, const FlagType gram, const DataType exponent, const FlagType orthogonalize, const int64_t seed, const IndexType lanczos_degree, const DataType lanczos_tol, const IndexType min_num_samples, const IndexType max_num_samples, const DataType error_atol, const DataType error_rtol, const DataType confidence_level, const DataType outlier_significance_level, const IndexType num_threads, DataType *trace, DataType *error, DataType **samples, IndexType *processed_samples_indices, IndexType *num_samples_used, IndexType *num_outliers, FlagType *converged, double &alg_wall_time)
Stochastic Lanczos quadrature method to estimate trace of a function of a linear operator....
void omp_set_num_threads(int num_threads)
int omp_get_thread_num()
int FlagType
Definition types.h:68
int IndexType
Definition types.h:65