imate
C++/CUDA Reference
Loading...
Searching...
No Matches
definitions.h
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12#ifndef _DEFINITIONS_DEFINITIONS_H_
13#define _DEFINITIONS_DEFINITIONS_H_
14
15
16// ===========
17// Definitions
18// ===========
19
20// To suppress warning: __STDC_VERSION__" is not defined, evaluates to 0
21// #ifndef __STDC_VERSION__
22// #define __STDC_VERSION__ 0
23// #endif
24
25// If set to 0, the LongIndexType is declared as 32-bit integer. Whereas if set
26// to 1, the LongIndexType is declared as 64-bit integer. The long integer will
27// slow down the performance on reading arrays of integers. Note that on some
28// platforms (e.g. Windows), "int" and "long int" are both 32-bit. To see the
29// real effect of the long type there, define the integer by "long long"
30// rather than "long int"; "long long" is at least 64-bit. Currently, the
31// long type in "./types.h" is defined as "long int". Hence, setting LONG_INT
32// to 1 makes no difference on such platforms unless "long long" is used.
33//
34// Note: The malloc and cudaMalloc can only allocate at maximum, an array of
35// the limit size of "size_t" (unsigned int). So, using "long long int" is
36// not indeed practical for malloc. Thus, it is better to set the type of array
37// indices as just "signed int".
// Fall back to 0 when the build has not supplied LONG_INT.
#if !defined(LONG_INT)
    #define LONG_INT 0
#endif  // !defined(LONG_INT)
41
42// If set to 0, the LongIndexType is declared as signed integer, whereas if set
43// to 1, the LongIndexType is declared as unsigned integer. The unsigned type
44// will double the limit of the largest integer index, while keeps the same
45// speed for index operations. Note that the indices and index pointers of
46// scipy sparse arrays are defined by "signed int". Hence, by setting
47// UNSIGNED_LONG_INT to 1, there is a one-time overhead of converting the numpy
48// int arrays (two matrices of scipy.sparse.csr_matrix.indices and
49// scipy.sparse.csr_matrix.indptr) from "int" to "unsigned int". This overhead
50// is only one-time and should be around half a second for moderate to large
51// arrays. But, on the positive side, the unsigned int can handle arrays of
52// up to twice the index size.
53//
54// Note: The malloc and cudaMalloc can only allocate at maximum, an array of
55// the limit size of "size_t" (unsigned int). So, using "unsigned int" for
56// index is not indeed practical since the array size in bytes is the size of
57// array times sizeof(DataType). That is, if DataType is double for instance,
58// the maximum array size could potentially be 8 times the size of maximum
59// of "size_t" (unsigned int) which is not possible for malloc. Thus, it is
60// better to set the type of array indices as just "signed int".
// Fall back to 0 when the build has not supplied UNSIGNED_LONG_INT.
#if !defined(UNSIGNED_LONG_INT)
    #define UNSIGNED_LONG_INT 0
#endif  // !defined(UNSIGNED_LONG_INT)
64
65// If USE_LOOP_UNROLLING is set to 1, the for loops in dense matrix-vector
66// multiplications and vector-vector multiplications use loop unrolling.
67// Otherwise set to 0. Default is 1.
// Fall back to 1 when the build has not supplied USE_LOOP_UNROLLING.
#if !defined(USE_LOOP_UNROLLING)
    #define USE_LOOP_UNROLLING 1
#endif  // !defined(USE_LOOP_UNROLLING)
71
72// Small tasks (e.g. vector operations) do not leverage OpenMP parallelization
73// unless the array sizes are large enough to overcome the OpenMP overhead.
74// Here we define the threshold of array size in which arrays smaller than this
75// size will have vector operations in serial, and arrays larger than this size
76// will have vector operations in parallel. A threshold between 100K and 1M
77// seems to be a fine spot to make this switch.
// Default array-size threshold for switching vector operations from serial to
// parallel. Guarded like the other flags in this file so that it can be
// overridden at build time (e.g. -DLARGE_ARRAY_SIZE=...) without causing a
// macro-redefinition diagnostic or silently discarding the override.
#ifndef LARGE_ARRAY_SIZE
    #define LARGE_ARRAY_SIZE 100000
#endif
79
80// If USE_OPENMP is set to 1, the OpenMP for shared-memory parallelization will
81// be enabled. Otherwise, set this to 0. You can also set this as an
82// environment variable, or in setup.py script.
// Fall back to 1 when the build has not supplied USE_OPENMP.
#if !defined(USE_OPENMP)
    #define USE_OPENMP 1
#endif  // !defined(USE_OPENMP)
86
87// If USE_CBLAS is set to 1, the OpenBLAS library is used for dense vector and
88// matrix operations. Note that OpenBLAS does not declare operations on "long
89// double" type, rather, only "float" and "double" types are supported. To use
90// "long double" type, set USE_CBLAS to 0. OpenBLAS is nearly twice as fast,
91// but it loses accuracy on large arrays of float type. This inaccuracy could
92// matter a lot when computing dot product and norm of very large vectors.
// Fall back to 0 when the build has not supplied USE_CBLAS.
#if !defined(USE_CBLAS)
    #define USE_CBLAS 0
#endif  // !defined(USE_CBLAS)
96
97// If USE_MKL is set to 1, the MKL library is used for dense vector and matrix
98// operations. Note that MKL does not declare operations on "long double" type,
99// rather, only "float" and "double" types are supported. To use "long double"
100// type, set USE_MKL to 0. MKL is nearly twice as fast, but it loses accuracy
101// on large arrays of float type. This inaccuracy could matter a lot when
102// computing dot product and norm of very large vectors.
// Fall back to 0 when the build has not supplied USE_MKL.
#if !defined(USE_MKL)
    #define USE_MKL 0
#endif  // !defined(USE_MKL)
106
107// If USE_CBLAS or USE_MKL is set to 1, USE_ANY_CBLAS is set to signal other
108// parts of the code to use CBLAS. Since the BLAS interface (regardless of
109// using OpenBLAS, MKL, etc) is the same, we use a unified USE_ANY_CBLAS flag.
// Define USE_ANY_CBLAS when either BLAS backend flag is enabled. It is left
// undefined otherwise.
#if defined(USE_CBLAS) && (USE_CBLAS == 1)
    #define USE_ANY_CBLAS 1
#elif defined(USE_MKL) && (USE_MKL == 1)
    #define USE_ANY_CBLAS 1
#endif
114
115
116#endif // _DEFINITIONS_DEFINITIONS_H_