imate
C++/CUDA Reference
Loading...
Searching...
No Matches
c_vector_operations.cpp
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
17#include <cmath> // sqrt
18#include "../_c_arithmetics/c_arithmetics.h" // c_arithmetics
19#include "../_definitions/definitions.h" // USE_OPENMP, USE_ANY_CBLAS,
20 // USE_LOOP_UNROLLING,
21 // LARGE_ARRAY_SIZE
22#if defined(USE_OPENMP) && (USE_OPENMP == 1)
23 #include <omp.h> // omp_in_parallel
24#endif
25
26#if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
27 #include "./cblas_api.h" // cblas_api
28#endif
29
30
31// ===========
32// copy vector
33// ===========
34
43
44template <typename DataType>
46 const DataType* RESTRICT input_vector,
47 const LongIndexType vector_size,
48 DataType* RESTRICT output_vector)
49{
50 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
51
52 // Using BLAS
53 int incx = 1;
54 int incy = 1;
55
56 cblas_api::xcopy(vector_size, input_vector, incx, output_vector, incy);
57
58 #else
59
60 // Not using BLAS
61 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
62 #pragma omp parallel for \
63 schedule(static) \
64 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
65 default(none) \
66 shared(input_vector, output_vector, vector_size)
67 #endif
68 for (LongIndexType i=0; i < vector_size; ++i)
69 {
70 output_vector[i] = input_vector[i];
71 }
72
73 #endif
74}
75
76// ==================
77// copy scaled vector
78// ==================
79
91
92template <typename DataType>
94 const DataType* RESTRICT input_vector,
95 const LongIndexType vector_size,
96 const DataType scale,
97 DataType* RESTRICT output_vector)
98{
99 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
100
101 // Using BLAS
102 int incx = 1;
103 int incy = 1;
104
105 cblas_api::xcopy(vector_size, input_vector, incx, output_vector, incy);
106
107 cblas_api::xscal(vector_size, scale, output_vector, incy);
108
109 #else
110
111 // Not using BLAS
112 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
113 #pragma omp parallel for \
114 schedule(static) \
115 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
116 default(none) \
117 shared(input_vector, output_vector, vector_size, scale)
118 #endif
119 for (LongIndexType i=0; i < vector_size; ++i)
120 {
121 output_vector[i] = scale * input_vector[i];
122 }
123
124 #endif
125}
126
127
128// ======================
129// subtract scaled vector
130// ======================
131
151
152template <typename DataType>
154 const DataType* RESTRICT input_vector,
155 const LongIndexType vector_size,
156 const DataType scale,
157 DataType* RESTRICT output_vector)
158{
159
160 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
161
162 // Using BLAS
163 int incx = 1;
164 int incy = 1;
165
166 DataType neg_scale = -scale;
167 cblas_api::xaxpy(vector_size, neg_scale, input_vector, incx, output_vector,
168 incy);
169
170 #else
171
172 // Not using BLAS
173 DataType zero = 0.0;
174 if (c_arithmetics::is_equal(scale, zero))
175 {
176 return;
177 }
178
179 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
180 #pragma omp parallel for \
181 schedule(static) \
182 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
183 default(none) \
184 shared(input_vector, output_vector, vector_size, scale)
185 #endif
186 for (LongIndexType i=0; i < vector_size; ++i)
187 {
188 output_vector[i] -= scale * input_vector[i];
189 }
190
191 #endif
192}
193
194
195// =============
196// inner product
197// =============
198
228
229template <typename DataType>
231 const DataType* RESTRICT vector1,
232 const DataType* RESTRICT vector2,
233 const LongIndexType vector_size)
234{
235 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
236
237 // Using BLAS
238 int incx = 1;
239 int incy = 1;
240
241 DataType inner_prod = cblas_api::xdot(vector_size, vector1, incx, vector2,
242 incy);
243
244 return inner_prod;
245
246 #else
247
248 // Not using BLAS
249 long double inner_prod = 0.0;
250 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
251 LongIndexType chunk = 5;
252 LongIndexType vector_size_chunked = vector_size - (vector_size % chunk);
253 #endif
254
255 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
256 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
257 #pragma omp parallel for \
258 schedule(static) \
259 if ((!omp_in_parallel()) && \
260 (vector_size_chunked >= LARGE_ARRAY_SIZE)) \
261 default(none) \
262 shared(vector1, vector2, vector_size, vector_size_chunked, chunk) \
263 reduction(+: inner_prod)
264 #endif
265 for (LongIndexType i=0; i < vector_size_chunked; i += chunk)
266 {
267 inner_prod += vector1[i] * vector2[i] +
268 vector1[i+1] * vector2[i+1] +
269 vector1[i+2] * vector2[i+2] +
270 vector1[i+3] * vector2[i+3] +
271 vector1[i+4] * vector2[i+4];
272 }
273 #endif
274
275 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
276 for (LongIndexType i=vector_size_chunked; i < vector_size; ++i)
277 #else
278 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
279 #pragma omp parallel for \
280 schedule(static) \
281 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
282 default(none) \
283 shared(vector1, vector2, vector_size) \
284 reduction(+: inner_prod)
285 #endif
286 for (LongIndexType i=0; i < vector_size; ++i)
287 #endif
288 {
289 inner_prod += vector1[i] * vector2[i];
290 }
291
292 return static_cast<DataType>(inner_prod);
293
294 #endif
295}
296
297
298// ==============
299// euclidean norm
300// ==============
301
330
331template <typename DataType>
333 const DataType* RESTRICT vector,
334 const LongIndexType vector_size)
335{
336 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
337
338 // Using BLAS
339 int incx = 1;
340
341 DataType norm = cblas_api::xnrm2(vector_size, vector, incx);
342
343 return norm;
344
345 #else
346
347 // Compute norm squared
348 long double norm2 = 0.0;
349 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
350 LongIndexType chunk = 5;
351 LongIndexType vector_size_chunked = vector_size - (vector_size % chunk);
352 #endif
353
354 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
355 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
356 #pragma omp parallel for \
357 schedule(static) \
358 if ((!omp_in_parallel()) && \
359 (vector_size_chunked >= LARGE_ARRAY_SIZE)) \
360 default(none) \
361 shared(vector, vector_size, vector_size_chunked, chunk) \
362 reduction(+: norm2)
363 #endif
364 for (LongIndexType i=0; i < vector_size_chunked; i += chunk)
365 {
366 norm2 += vector[i] * vector[i] +
367 vector[i+1] * vector[i+1] +
368 vector[i+2] * vector[i+2] +
369 vector[i+3] * vector[i+3] +
370 vector[i+4] * vector[i+4];
371 }
372 #endif
373
374 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
375 for (LongIndexType i=vector_size_chunked; i < vector_size; ++i)
376 #else
377 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
378 #pragma omp parallel for \
379 schedule(static) \
380 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
381 default(none) \
382 shared(vector, vector_size) \
383 reduction(+: norm2)
384 #endif
385 for (LongIndexType i=0; i < vector_size; ++i)
386 #endif
387 {
388 norm2 += vector[i] * vector[i];
389 }
390
391 // Norm
392 DataType norm = std::sqrt(static_cast<DataType>(norm2));
393
394 return norm;
395
396 #endif
397}
398
399
400// =========================
401// normalize vector in place
402// =========================
403
412
413template <typename DataType>
415 DataType* RESTRICT vector,
416 const LongIndexType vector_size)
417{
418 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
419
420 // Norm of vector
422 vector, vector_size);
423
424 // Normalize in place
425 DataType scale = 1.0 / norm;
426 int incx = 1;
427 cblas_api::xscal(vector_size, scale, vector, incx);
428
429 return norm;
430
431 #else
432
433 // Norm of vector
434 DataType norm = cVectorOperations<DataType>::euclidean_norm(vector,
435 vector_size);
436
437 // Normalize in place
438 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
439 #pragma omp parallel for \
440 schedule(static) \
441 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
442 default(none) \
443 shared(vector, vector_size, norm)
444 #endif
445 for (LongIndexType i=0; i < vector_size; ++i)
446 {
447 vector[i] /= norm;
448 }
449
450 return norm;
451
452 #endif
453}
454
455
456// =========================
457// normalize vector and copy
458// =========================
459
470
471template <typename DataType>
473 const DataType* RESTRICT vector,
474 const LongIndexType vector_size,
475 DataType* RESTRICT output_vector)
476{
477 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
478
479 // Norm of vector
481 vector, vector_size);
482
483 // Normalize to output
484 DataType scale = 1.0 / norm;
485 cVectorOperations<DataType>::copy_scaled_vector(vector, vector_size, scale,
486 output_vector);
487
488 return norm;
489
490 #else
491
492 // Norm of vector
493 DataType norm = cVectorOperations<DataType>::euclidean_norm(vector,
494 vector_size);
495
496 // Normalize to output
497 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
498 #pragma omp parallel for \
499 schedule(static) \
500 if ((!omp_in_parallel()) && (vector_size >= LARGE_ARRAY_SIZE)) \
501 default(none) \
502 shared(vector, output_vector, vector_size, norm)
503 #endif
504 for (LongIndexType i=0; i < vector_size; ++i)
505 {
506 output_vector[i] = vector[i] / norm;
507 }
508
509 return norm;
510
511 #endif
512}
513
514
515// ===============================
516// Explicit template instantiation
517// ===============================
518
519template class cVectorOperations<float>;
520template class cVectorOperations<double>;
521template class cVectorOperations<long double>;
#define RESTRICT
A static class for vector operations, similar to level-1 operations of the BLAS library....
static void copy_scaled_vector(const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Scales a vector and stores to a new vector.
static DataType inner_product(const DataType *RESTRICT vector1, const DataType *RESTRICT vector2, const LongIndexType vector_size)
Computes Euclidean inner product of two vectors.
static DataType normalize_vector_and_copy(const DataType *RESTRICT vector, const LongIndexType vector_size, DataType *RESTRICT output_vector)
Normalizes a vector based on Euclidean 2-norm. The result is written into another vector.
static void copy_vector(const DataType *RESTRICT input_vector, const LongIndexType vector_size, DataType *RESTRICT output_vector)
Copies a vector to a new vector. Result is written in-place.
static DataType normalize_vector_in_place(DataType *RESTRICT vector, const LongIndexType vector_size)
Normalizes a vector based on Euclidean 2-norm. The result is written in-place.
static void subtract_scaled_vector(const DataType *RESTRICT input_vector, const LongIndexType vector_size, const DataType scale, DataType *RESTRICT output_vector)
Subtracts the scaled input vector from the output vector.
static DataType euclidean_norm(const DataType *RESTRICT vector, const LongIndexType vector_size)
Computes the Euclidean norm of a 1D array.
#define USE_OPENMP
Definition definitions.h:84
#define LARGE_ARRAY_SIZE
Definition definitions.h:78
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
Definition _c_is_equal.h:49
int LongIndexType
Definition types.h:60