imate
C++/CUDA Reference
Loading...
Searching...
No Matches
c_matrix_operations.cpp
Go to the documentation of this file.
1/*
2 * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli <sameli@berkeley.edu>
3 * SPDX-License-Identifier: BSD-3-Clause
4 * SPDX-FileType: SOURCE
5 *
6 * This program is free software: you can redistribute it and/or modify it
7 * under the terms of the license found in the LICENSE.txt file in the root
8 * directory of this source tree.
9 */
10
11
12// =======
13// Headers
14// =======
15
17#include "../_c_arithmetics/c_arithmetics.h" // c_arithmetics
18#include "../_definitions/definitions.h" // USE_OPENMP, USE_ANY_CBLAS,
19 // USE_LOOP_UNROLLING,
20 // LARGE_ARRAY_SIZE
21#if defined(USE_OPENMP) && (USE_OPENMP == 1)
22 #include <omp.h> // omp_in_parallel
23#endif
24
25#if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
26 #include "./cblas_api.h" // cblas_api
27#endif
28
29
30// ============
31// dense matvec
32// ============
33
68
69template <typename DataType>
71 const DataType* RESTRICT A,
72 const DataType* RESTRICT b,
73 const LongIndexType num_rows,
74 const LongIndexType num_columns,
75 const FlagType A_is_row_major,
76 const FlagType A_is_symmetric,
77 DataType* RESTRICT c)
78{
79 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
80
81 // Using BLAS
82 CBLAS_LAYOUT layout;
83 CBLAS_UPLO uplo;
84 CBLAS_TRANSPOSE transpose = CblasNoTrans;
85 int lda;
86 if (A_is_row_major)
87 {
88 layout = CblasRowMajor;
89 uplo = CblasUpper;
90 lda = num_columns;
91 }
92 else
93 {
94 layout = CblasColMajor;
95 uplo = CblasLower;
96 lda = num_rows;
97
98 // For efficiency, use transpose op for symmetric column-major matrices
99 if (A_is_symmetric)
100 {
101 transpose = CblasTrans;
102 }
103 }
104
105 int incb = 1;
106 int incc = 1;
107 DataType alpha = 1.0;
108 DataType beta = 0.0;
109
110 if (A_is_symmetric)
111 {
112 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
113 beta, c, incc);
114 }
115 else
116 {
117 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
118 lda, b, incb, beta, c, incc);
119 }
120
121 #else
122
123 // Not using BLAS
124 long int j; // This should be long int to avoid multiplication overflow
125 long int jump;
126 long double sum;
127 const long int chunk = 5;
128 const long int num_columns_chunked = num_columns - (num_columns % chunk);
129
130 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
131 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
132 (void) num_columns_chunked;
133 (void) chunk;
134 #endif
135
136 // Determine major order of A
137 if (A_is_row_major)
138 {
139 // For row-major (C ordering) matrix A (symmetric or non-symmetric)
140 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
141 #pragma omp parallel for \
142 schedule(static) \
143 if (!omp_in_parallel()) \
144 default(none) \
145 shared(A, b, c, jump, num_rows, num_columns, num_columns_chunked, \
146 chunk) \
147 private(j, sum)
148 #endif
149 for (long int i=0; i < num_rows; ++i)
150 {
151 sum = 0.0;
152 jump = i * num_columns;
153 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
154 for (j=0; j < num_columns_chunked; j+= chunk)
155 {
156 // Loop unrolling
157 sum += A[jump + j] * b[j] +
158 A[jump + j+1] * b[j+1] +
159 A[jump + j+2] * b[j+2] +
160 A[jump + j+3] * b[j+3] +
161 A[jump + j+4] * b[j+4];
162 }
163 #endif
164
165 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
166 for (j=num_columns_chunked; j < num_columns; ++j)
167 #else
168 for (j=0; j < num_columns; ++j)
169 #endif
170 {
171 sum += A[jump + j] * b[j];
172 }
173
174 c[i] = static_cast<DataType>(sum);
175 }
176 }
177 else if (A_is_symmetric)
178 {
179 // A is column-major, but symmetric, so we can use its transposed
180 // operation instead, which is more efficient for column-major
181 // matrices.
183 A, b, num_rows, num_columns, A_is_row_major, A_is_symmetric, c);
184 }
185 else
186 {
187 // For column-major (Fortran ordering) non-symmetric matrix A
188 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
189 #pragma omp parallel for \
190 schedule(static) \
191 if (!omp_in_parallel()) \
192 default(none) \
193 shared(A, b, c, num_rows, num_columns) \
194 private(j, sum)
195 #endif
196 for (long int i=0; i < num_rows; ++i)
197 {
198 sum = 0.0;
199 for (j=0; j < num_columns; ++j)
200 {
201 // Make sure j is long int to avoid overflow in num_rows*j
202 sum += A[i + num_rows*j] * b[j];
203 }
204 c[i] = static_cast<DataType>(sum);
205 }
206 }
207
208 #endif
209}
210
211
212// =================
213// dense matvec plus
214// =================
215
252
253template <typename DataType>
255 const DataType* RESTRICT A,
256 const DataType* RESTRICT b,
257 const DataType alpha,
258 const LongIndexType num_rows,
259 const LongIndexType num_columns,
260 const FlagType A_is_row_major,
261 const FlagType A_is_symmetric,
262 DataType* RESTRICT c)
263{
264 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
265
266 // Using BLAS
267 CBLAS_LAYOUT layout;
268 CBLAS_UPLO uplo;
269 CBLAS_TRANSPOSE transpose = CblasNoTrans;
270 int lda;
271 if (A_is_row_major)
272 {
273 layout = CblasRowMajor;
274 uplo = CblasUpper;
275 lda = num_columns;
276 }
277 else
278 {
279 layout = CblasColMajor;
280 uplo = CblasLower;
281 lda = num_rows;
282
283 // For efficiency, use transpose op for symmetric column-major matrices
284 if (A_is_symmetric)
285 {
286 transpose = CblasTrans;
287 }
288 }
289
290 int incb = 1;
291 int incc = 1;
292 DataType beta = 1.0;
293
294 if (A_is_symmetric)
295 {
296 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
297 beta, c, incc);
298 }
299 else
300 {
301 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
302 lda, b, incb, beta, c, incc);
303 }
304
305 #else
306
307 // Not using BLAS
308 DataType zero = 0.0;
309 if (c_arithmetics::is_equal(alpha, zero))
310 {
311 return;
312 }
313
314 long int j;
315 long int jump;
316 long double sum;
317 const long int chunk = 5;
318 const long int num_columns_chunked = num_columns - (num_columns % chunk);
319
320 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
321 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
322 (void) num_columns_chunked;
323 (void) chunk;
324 #endif
325
326 // Determine major order of A
327 if (A_is_row_major)
328 {
329 // For row-major (C ordering) matrix A (symmetric or non-symmetric)
330 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
331 #pragma omp parallel for \
332 schedule(static) \
333 if (!omp_in_parallel()) \
334 default(none) \
335 shared(A, b, c, jump, alpha, num_rows, num_columns, chunk, \
336 num_columns_chunked) \
337 private(j, sum)
338 #endif
339 for (long int i=0; i < num_rows; ++i)
340 {
341 sum = 0.0;
342 jump = i * num_columns;
343 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
344 for (j=0; j < num_columns_chunked; j+= chunk)
345 {
346 sum += A[jump + j] * b[j] +
347 A[jump + j+1] * b[j+1] +
348 A[jump + j+2] * b[j+2] +
349 A[jump + j+3] * b[j+3] +
350 A[jump + j+4] * b[j+4];
351 }
352 #endif
353
354 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
355 for (j=num_columns_chunked; j < num_columns; ++j)
356 #else
357 for (j=0; j < num_columns; ++j)
358 #endif
359 {
360 sum += A[jump + j] * b[j];
361 }
362
363 c[i] += alpha * static_cast<DataType>(sum);
364 }
365 }
366 else if (A_is_symmetric)
367 {
368 // A is column-major, but symmetric, so we can use its transposed
369 // operation instead, which is more efficient for column-major
370 // matrices.
372 A, b, alpha, num_rows, num_columns, A_is_row_major, A_is_symmetric,
373 c);
374 }
375 else
376 {
377 // For column-major (Fortran ordering) non-symmetric matrix A
378 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
379 #pragma omp parallel for \
380 schedule(static) \
381 if (!omp_in_parallel()) \
382 default(none) \
383 shared(A, b, c, alpha, num_rows, num_columns) \
384 private(j, sum)
385 #endif
386 for (long int i=0; i < num_rows; ++i)
387 {
388 sum = 0.0;
389 for (j=0; j < num_columns; ++j)
390 {
391 sum += A[i + num_rows*j] * b[j];
392 }
393 c[i] += alpha* static_cast<DataType>(sum);
394 }
395 }
396
397 #endif
398}
399
400
401// =======================
402// dense transposed matvec
403// =======================
404
440
441template <typename DataType>
443 const DataType* RESTRICT A,
444 const DataType* RESTRICT b,
445 const LongIndexType num_rows,
446 const LongIndexType num_columns,
447 const FlagType A_is_row_major,
448 const FlagType A_is_symmetric,
449 DataType* RESTRICT c)
450{
451 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
452
453 // Using BLAS
454 CBLAS_LAYOUT layout;
455 CBLAS_UPLO uplo;
456 CBLAS_TRANSPOSE transpose = CblasTrans;
457 int lda;
458 if (A_is_row_major)
459 {
460 layout = CblasRowMajor;
461 uplo = CblasUpper;
462 lda = num_columns;
463 }
464 else
465 {
466 layout = CblasColMajor;
467 uplo = CblasLower;
468 lda = num_rows;
469
470 // For efficiency, use transpose op for symmetric column-major matrices
471 if (A_is_symmetric)
472 {
473 transpose = CblasNoTrans;
474 }
475 }
476
477 int incb = 1;
478 int incc = 1;
479 DataType alpha = 1.0;
480 DataType beta = 0.0;
481
482 if (A_is_symmetric)
483 {
484 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
485 beta, c, incc);
486 }
487 else
488 {
489 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
490 lda, b, incb, beta, c, incc);
491 }
492
493 #else
494
495 // Not using BLAS
496 long int i;
497 long int jump;
498 long double sum;
499 const long int chunk = 5;
500 const long int num_rows_chunked = num_rows - (num_rows % chunk);
501
502 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
503 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
504 (void) num_rows_chunked;
505 (void) chunk;
506 #endif
507
508 // Determine major order of A
509 if (!A_is_row_major)
510 {
511 // For column-major (Fortran ordering) matrix A (symmetric or
512 // non-symmetric)
513 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
514 #pragma omp parallel for \
515 schedule(static) \
516 if (!omp_in_parallel()) \
517 default(none) \
518 shared(A, b, c, jump, num_rows, num_columns, num_rows_chunked, \
519 chunk) \
520 private(i, sum)
521 #endif
522 for (long int j=0; j < num_columns; ++j)
523 {
524 // Loop unrolling
525 sum = 0.0;
526 jump = num_rows * j;
527 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
528 for (i=0; i < num_rows_chunked; i += chunk)
529 {
530 sum += A[i + jump] * b[i] +
531 A[i+1 + jump] * b[i+1] +
532 A[i+2 + jump] * b[i+2] +
533 A[i+3 + jump] * b[i+3] +
534 A[i+4 + jump] * b[i+4];
535 }
536 #endif
537
538 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
539 for (i=num_rows_chunked; i < num_rows; ++i)
540 #else
541 for (i=0; i < num_rows; ++i)
542 #endif
543 {
544 sum += A[i + jump] * b[i];
545 }
546
547 c[j] = static_cast<DataType>(sum);
548 }
549 }
550 else if (A_is_symmetric)
551 {
552 // A is row-major, but symmetric, so we can use its non-transposed
553 // operation instead, which is more efficient for row-major
554 // matrices.
556 A, b, num_rows, num_columns, A_is_row_major, A_is_symmetric, c);
557 }
558 else
559 {
560 // For row-major (C ordering) non-symmetric matrix A
561 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
562 #pragma omp parallel for \
563 schedule(static) \
564 if (!omp_in_parallel()) \
565 default(none) \
566 shared(A, b, c, num_rows, num_columns) \
567 private(i, sum)
568 #endif
569 for (long int j=0; j < num_columns; ++j)
570 {
571 sum = 0.0;
572 for (i=0; i < num_rows; ++i)
573 {
574 sum += A[i*num_columns + j] * b[i];
575 }
576 c[j] = static_cast<DataType>(sum);
577 }
578 }
579
580 #endif
581}
582
583
584// ============================
585// dense transposed matvec plus
586// ============================
587
625
626template <typename DataType>
628 const DataType* RESTRICT A,
629 const DataType* RESTRICT b,
630 const DataType alpha,
631 const LongIndexType num_rows,
632 const LongIndexType num_columns,
633 const FlagType A_is_row_major,
634 const FlagType A_is_symmetric,
635 DataType* RESTRICT c)
636{
637 #if defined(USE_ANY_CBLAS) && (USE_ANY_CBLAS == 1)
638
639 // Using BLAS
640 CBLAS_LAYOUT layout;
641 CBLAS_TRANSPOSE transpose = CblasTrans;
642 CBLAS_UPLO uplo;
643 int lda;
644 if (A_is_row_major)
645 {
646 layout = CblasRowMajor;
647 uplo = CblasUpper;
648 lda = num_columns;
649 }
650 else
651 {
652 layout = CblasColMajor;
653 uplo = CblasLower;
654 lda = num_rows;
655
656 // For efficiency, use transpose op for symmetric column-major matrices
657 if (A_is_symmetric)
658 {
659 transpose = CblasNoTrans;
660 }
661 }
662
663 int incb = 1;
664 int incc = 1;
665 DataType beta = 1.0;
666
667 if (A_is_symmetric)
668 {
669 cblas_api::xsymv(layout, uplo, num_columns, alpha, A, lda, b, incb,
670 beta, c, incc);
671 }
672 else
673 {
674 cblas_api::xgemv(layout, transpose, num_rows, num_columns, alpha, A,
675 lda, b, incb, beta, c, incc);
676 }
677
678 #else
679
680 // Not using BLAS
681 DataType zero = 0.0;
682 if (c_arithmetics::is_equal(alpha, zero))
683 {
684 return;
685 }
686
687 long int i;
688 long int jump;
689 long double sum;
690 const long int chunk = 5;
691 const long int num_rows_chunked = num_rows - (num_rows % chunk);
692
693 // Void unused variables to avoid compiler warnings (-Wno-unused-parameter)
694 #if !defined(USE_LOOP_UNROLLING) || (USE_LOOP_UNROLLING != 1)
695 (void) num_rows_chunked;
696 (void) chunk;
697 #endif
698
699 // Determine major order of A
700 if (!A_is_row_major)
701 {
702 // For column-major (Fortran ordering) matrix A (symmetric or
703 // non-symmetric)
704 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
705 #pragma omp parallel for \
706 schedule(static) \
707 if (!omp_in_parallel()) \
708 default(none) \
709 shared(A, b, c, jump, alpha, num_rows, num_columns, \
710 num_rows_chunked, chunk) \
711 private(i, sum)
712 #endif
713 for (long int j=0; j < num_columns; ++j)
714 {
715 sum = 0.0;
716 jump = num_rows * j;
717 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
718 for (i=0; i < num_rows_chunked; i += chunk)
719 {
720 sum += A[i + jump] * b[i] +
721 A[i+1 + jump] * b[i+1] +
722 A[i+2 + jump] * b[i+2] +
723 A[i+3 + jump] * b[i+3] +
724 A[i+4 + jump] * b[i+4];
725 }
726 #endif
727
728 #if defined(USE_LOOP_UNROLLING) && (USE_LOOP_UNROLLING == 1)
729 for (i=num_rows_chunked; i < num_rows; ++i)
730 #else
731 for (i=0; i < num_rows; ++i)
732 #endif
733 {
734 sum += A[i + jump] * b[i];
735 }
736
737 c[j] += alpha * static_cast<DataType>(sum);
738 }
739 }
740 else if (A_is_symmetric)
741 {
742 // A is row-major, but symmetric, so we can use its non-transposed
743 // operation instead, which is more efficient for row-major
744 // matrices.
746 A, b, alpha, num_rows, num_columns, A_is_row_major, A_is_symmetric,
747 c);
748 }
749 else
750 {
751 // For row-major (C ordering) non-symmetric matrix A
752 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
753 #pragma omp parallel for \
754 schedule(static) \
755 if (!omp_in_parallel()) \
756 default(none) \
757 shared(A, b, c, alpha, num_rows, num_columns) \
758 private(i, sum)
759 #endif
760 for (long int j=0; j < num_columns; ++j)
761 {
762 sum = 0.0;
763 for (i=0; i < num_rows; ++i)
764 {
765 sum += A[i*num_columns + j] * b[i];
766 }
767 c[j] += alpha * static_cast<DataType>(sum);
768 }
769 }
770
771 #endif
772}
773
774
775// ==========
776// csr matvec
777// ==========
778
812
813template <typename DataType>
815 const DataType* RESTRICT A_data,
816 const LongIndexType* RESTRICT A_column_indices,
817 const LongIndexType* RESTRICT A_index_pointer,
818 const DataType* RESTRICT b,
819 const LongIndexType num_rows,
820 DataType* RESTRICT c)
821{
822 LongIndexType index_pointer;
823 LongIndexType row;
824 LongIndexType column;
825 long double sum;
826
827 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
828 #pragma omp parallel for \
829 schedule(static) \
830 if (!omp_in_parallel()) \
831 default(none) \
832 shared(A_data, A_column_indices, A_index_pointer, b, c, num_rows) \
833 private(index_pointer, column, sum)
834 #endif
835 for (row=0; row < num_rows; ++row)
836 {
837 sum = 0.0;
838 for (index_pointer=A_index_pointer[row];
839 index_pointer < A_index_pointer[row+1];
840 ++index_pointer)
841 {
842 column = A_column_indices[index_pointer];
843 sum += A_data[index_pointer] * b[column];
844 }
845 c[row] = static_cast<DataType>(sum);
846 }
847}
848
849
850// ===============
851// csr matvec plus
852// ===============
853
891
892template <typename DataType>
894 const DataType* A_data,
895 const LongIndexType* RESTRICT A_column_indices,
896 const LongIndexType* RESTRICT A_index_pointer,
897 const DataType* RESTRICT b,
898 const DataType alpha,
899 const LongIndexType num_rows,
900 DataType* RESTRICT c)
901{
902 DataType zero = 0.0;
903 if (c_arithmetics::is_equal(alpha, zero))
904 {
905 return;
906 }
907
908 LongIndexType index_pointer;
909 LongIndexType row;
910 LongIndexType column;
911 long double sum;
912
913 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
914 #pragma omp parallel for \
915 schedule(static) \
916 if (!omp_in_parallel()) \
917 default(none) \
918 shared(A_data, A_column_indices, A_index_pointer, b, c, alpha, \
919 num_rows) \
920 private(index_pointer, column, sum)
921 #endif
922 for (row=0; row < num_rows; ++row)
923 {
924 sum = 0.0;
925 for (index_pointer=A_index_pointer[row];
926 index_pointer < A_index_pointer[row+1];
927 ++index_pointer)
928 {
929 column = A_column_indices[index_pointer];
930 sum += A_data[index_pointer] * b[column];
931 }
932 c[row] += alpha * static_cast<DataType>(sum);
933 }
934}
935
936
937// =====================
938// csr transposed matvec
939// =====================
940
970
971template <typename DataType>
973 const DataType* RESTRICT A_data,
974 const LongIndexType* RESTRICT A_column_indices,
975 const LongIndexType* RESTRICT A_index_pointer,
976 const FlagType A_is_symmetric,
977 const DataType* RESTRICT b,
978 const LongIndexType num_rows,
979 const LongIndexType num_columns,
980 DataType* RESTRICT c)
981{
982 if (A_is_symmetric)
983 {
984 // For symmetric A, use non-transposed operation instead for efficiency
986 A_data, A_column_indices, A_index_pointer, b, num_rows, c);
987 }
988 else
989 {
990 // A is non-symmetric, use transposed product operation
991 LongIndexType index_pointer;
992 LongIndexType row;
993 LongIndexType column;
994
995 // Initialize output to zero
996 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
997 #pragma omp parallel for \
998 schedule(static) \
999 if ((!omp_in_parallel()) && (num_columns >= LARGE_ARRAY_SIZE)) \
1000 default(none) shared(c, num_columns)
1001 #endif
1002 for (column=0; column < num_columns; ++column)
1003 {
1004 c[column] = 0.0;
1005 }
1006
1007 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1008 #pragma omp parallel for \
1009 schedule(static) \
1010 if (!omp_in_parallel()) \
1011 default(none) \
1012 shared(A_data, A_column_indices, A_index_pointer, b, c, num_rows) \
1013 private(index_pointer, column)
1014 #endif
1015 for (row=0; row < num_rows; ++row)
1016 {
1017 for (index_pointer=A_index_pointer[row];
1018 index_pointer < A_index_pointer[row+1];
1019 ++index_pointer)
1020 {
1021 column = A_column_indices[index_pointer];
1022
1023 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1024 #pragma omp atomic update
1025 #endif
1026 c[column] += A_data[index_pointer] * b[row];
1027 }
1028 }
1029 }
1030}
1031
1032
1033// ==========================
1034// csr transposed matvec plus
1035// ==========================
1036
1068
1069template <typename DataType>
1071 const DataType* RESTRICT A_data,
1072 const LongIndexType* RESTRICT A_column_indices,
1073 const LongIndexType* RESTRICT A_index_pointer,
1074 const FlagType A_is_symmetric,
1075 const DataType* RESTRICT b,
1076 const DataType alpha,
1077 const LongIndexType num_rows,
1078 DataType* RESTRICT c)
1079{
1080 DataType zero = 0.0;
1081 if (c_arithmetics::is_equal(alpha, zero))
1082 {
1083 return;
1084 }
1085
1086 if (A_is_symmetric)
1087 {
1088 // For symmetric A, use non-transposed operation instead for efficiency
1090 A_data, A_column_indices, A_index_pointer, b, alpha, num_rows, c);
1091 }
1092 else
1093 {
1094 // A is non-symmetric, use transposed product operation
1095 LongIndexType index_pointer;
1096 LongIndexType row;
1097 LongIndexType column;
1098
1099 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1100 #pragma omp parallel for \
1101 schedule(static) \
1102 if (!omp_in_parallel()) \
1103 default(none) \
1104 shared(A_data, A_column_indices, A_index_pointer, b, c, alpha, \
1105 num_rows) \
1106 private(index_pointer, column)
1107 #endif
1108 for (row=0; row < num_rows; ++row)
1109 {
1110 for (index_pointer=A_index_pointer[row];
1111 index_pointer < A_index_pointer[row+1];
1112 ++index_pointer)
1113 {
1114 column = A_column_indices[index_pointer];
1115
1116 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1117 #pragma omp atomic update
1118 #endif
1119 c[column] += alpha * A_data[index_pointer] * b[row];
1120 }
1121 }
1122 }
1123}
1124
1125
1126// ==========
1127// csc matvec
1128// ==========
1129
1159
1160template <typename DataType>
1162 const DataType* RESTRICT A_data,
1163 const LongIndexType* RESTRICT A_row_indices,
1164 const LongIndexType* RESTRICT A_index_pointer,
1165 const FlagType A_is_symmetric,
1166 const DataType* RESTRICT b,
1167 const LongIndexType num_rows,
1168 const LongIndexType num_columns,
1169 DataType* RESTRICT c)
1170{
1171 if (A_is_symmetric)
1172 {
1173 // For symmetric A, use transposed operation instead for efficiency
1175 A_data, A_row_indices, A_index_pointer, b, num_columns, c);
1176 }
1177 else
1178 {
1179 // A is non-symmetric, use non-transposed product operation
1180 LongIndexType index_pointer;
1181 LongIndexType row;
1182 LongIndexType column;
1183
1184 // Initialize output to zero
1185 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1186 #pragma omp parallel for \
1187 schedule(static) \
1188 if (!omp_in_parallel() && (num_rows >= LARGE_ARRAY_SIZE)) \
1189 default(none) \
1190 shared(c, num_rows)
1191 #endif
1192 for (row=0; row < num_rows; ++row)
1193 {
1194 c[row] = 0.0;
1195 }
1196
1197 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1198 #pragma omp parallel for \
1199 schedule(static) \
1200 if (!omp_in_parallel()) \
1201 default(none) \
1202 shared(A_data, A_row_indices, A_index_pointer, b, c, num_columns) \
1203 private(index_pointer, row)
1204 #endif
1205 for (column=0; column < num_columns; ++column)
1206 {
1207 for (index_pointer=A_index_pointer[column];
1208 index_pointer < A_index_pointer[column+1];
1209 ++index_pointer)
1210 {
1211 row = A_row_indices[index_pointer];
1212
1213 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1214 #pragma omp atomic update
1215 #endif
1216 c[row] += A_data[index_pointer] * b[column];
1217 }
1218 }
1219 }
1220}
1221
1222
1223// ===============
1224// csc matvec plus
1225// ===============
1226
1258
1259template <typename DataType>
1261 const DataType* RESTRICT A_data,
1262 const LongIndexType* RESTRICT A_row_indices,
1263 const LongIndexType* RESTRICT A_index_pointer,
1264 const FlagType A_is_symmetric,
1265 const DataType* RESTRICT b,
1266 const DataType alpha,
1267 const LongIndexType num_columns,
1268 DataType* RESTRICT c)
1269{
1270 DataType zero = 0.0;
1271 if (c_arithmetics::is_equal(alpha, zero))
1272 {
1273 return;
1274 }
1275
1276 if (A_is_symmetric)
1277 {
1278 // For symmetric A, use transposed operation instead for efficiency
1280 A_data, A_row_indices, A_index_pointer, b, alpha, num_columns, c);
1281
1282 }
1283 else
1284 {
1285 LongIndexType index_pointer;
1286 LongIndexType row;
1287 LongIndexType column;
1288
1289 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1290 #pragma omp parallel for \
1291 schedule(static) \
1292 if (!omp_in_parallel()) \
1293 default(none) \
1294 shared(A_data, A_row_indices, A_index_pointer, b, c, alpha, \
1295 num_columns) \
1296 private(index_pointer, row)
1297 #endif
1298 for (column=0; column < num_columns; ++column)
1299 {
1300 for (index_pointer=A_index_pointer[column];
1301 index_pointer < A_index_pointer[column+1];
1302 ++index_pointer)
1303 {
1304 row = A_row_indices[index_pointer];
1305
1306 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1307 #pragma omp atomic update
1308 #endif
1309 c[row] += alpha * A_data[index_pointer] * b[column];
1310 }
1311 }
1312 }
1313}
1314
1315
1316// =====================
1317// csc transposed matvec
1318// =====================
1319
1354
1355template <typename DataType>
1357 const DataType* RESTRICT A_data,
1358 const LongIndexType* RESTRICT A_row_indices,
1359 const LongIndexType* RESTRICT A_index_pointer,
1360 const DataType* RESTRICT b,
1361 const LongIndexType num_columns,
1362 DataType* RESTRICT c)
1363{
1364 LongIndexType index_pointer;
1365 LongIndexType row;
1366 LongIndexType column;
1367 long double sum;
1368
1369 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1370 #pragma omp parallel for \
1371 schedule(static) \
1372 if (!omp_in_parallel()) \
1373 default(none) \
1374 shared(A_data, A_row_indices, A_index_pointer, b, c, num_columns) \
1375 private(index_pointer, row, sum)
1376 #endif
1377 for (column=0; column < num_columns; ++column)
1378 {
1379 sum = 0.0;
1380 for (index_pointer=A_index_pointer[column];
1381 index_pointer < A_index_pointer[column+1];
1382 ++index_pointer)
1383 {
1384 row = A_row_indices[index_pointer];
1385 sum += A_data[index_pointer] * b[row];
1386 }
1387 c[column] = static_cast<DataType>(sum);
1388 }
1389}
1390
1391
1392// ==========================
1393// csc transposed matvec plus
1394// ==========================
1395
1433
1434template <typename DataType>
1436 const DataType* RESTRICT A_data,
1437 const LongIndexType* RESTRICT A_row_indices,
1438 const LongIndexType* RESTRICT A_index_pointer,
1439 const DataType* RESTRICT b,
1440 const DataType alpha,
1441 const LongIndexType num_columns,
1442 DataType* RESTRICT c)
1443{
1444 DataType zero = 0.0;
1445 if (c_arithmetics::is_equal(alpha, zero))
1446 {
1447 return;
1448 }
1449
1450 LongIndexType index_pointer;
1451 LongIndexType row;
1452 LongIndexType column;
1453 long double sum;
1454
1455 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1456 #pragma omp parallel for \
1457 schedule(static) \
1458 if (!omp_in_parallel()) \
1459 default(none) \
1460 shared(A_data, A_row_indices, A_index_pointer, b, c, alpha, \
1461 num_columns) \
1462 private(index_pointer, row, sum)
1463 #endif
1464 for (column=0; column < num_columns; ++column)
1465 {
1466 sum = 0.0;
1467 for (index_pointer=A_index_pointer[column];
1468 index_pointer < A_index_pointer[column+1];
1469 ++index_pointer)
1470 {
1471 row = A_row_indices[index_pointer];
1472 sum += A_data[index_pointer] * b[row];
1473 }
1474 c[column] += static_cast<DataType>(alpha * sum);
1475 }
1476}
1477
1478
1479// ==================
1480// create band matrix
1481// ==================
1482
1534
1535template <typename DataType>
1537 DataType* RESTRICT A,
1538 const LongIndexType num_rows,
1539 const LongIndexType num_columns,
1540 const FlagType A_is_row_major,
1541 const DataType* RESTRICT diagonals,
1542 const DataType* RESTRICT supdiagonals,
1543 const IndexType non_zero_size,
1544 const FlagType tridiagonal)
1545{
1546 if (A_is_row_major)
1547 {
1548 // A is row-major
1549 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1550 #pragma omp parallel for \
1551 schedule(static) \
1552 if (!omp_in_parallel() && (non_zero_size >= LARGE_ARRAY_SIZE)) \
1553 default(none) \
1554 shared(A, diagonals, supdiagonals, non_zero_size, tridiagonal, \
1555 num_columns)
1556 #endif
1557 for (IndexType j=0; j < non_zero_size; ++j)
1558 {
1559 // Diagonals
1560 A[j*num_columns + j] = diagonals[j];
1561
1562 // Off diagonals
1563 if (j < non_zero_size-1)
1564 {
1565 // Sup-diagonal
1566 A[j*num_columns + j+1] = supdiagonals[j];
1567
1568 // Sub-diagonal, making symmetric tri-diagonal matrix
1569 if (tridiagonal)
1570 {
1571 A[(j+1)*num_columns + j] = supdiagonals[j];
1572 }
1573 }
1574 }
1575 }
1576 else
1577 {
1578 // A is column-major
1579 #if defined(USE_OPENMP) && (USE_OPENMP == 1)
1580 #pragma omp parallel for \
1581 schedule(static) \
1582 if (!omp_in_parallel() && (non_zero_size >= LARGE_ARRAY_SIZE)) \
1583 default(none) \
1584 shared(A, diagonals, supdiagonals, non_zero_size, tridiagonal, \
1585 num_rows)
1586 #endif
1587 for (IndexType j=0; j < non_zero_size; ++j)
1588 {
1589 // Diagonals
1590 A[j + num_rows*j] = diagonals[j];
1591
1592 // Off diagonals
1593 if (j < non_zero_size-1)
1594 {
1595 // Sup-diagonal
1596 A[j + (j+1)*num_rows] = supdiagonals[j];
1597
1598 // Sub-diagonal, making symmetric tri-diagonal matrix
1599 if (tridiagonal)
1600 {
1601 A[j+1 + j*num_rows] = supdiagonals[j];
1602 }
1603 }
1604 }
1605 }
1606}
1607
1608
1609// ===============================
1610// Explicit template instantiation
1611// ===============================
1612
1613template class cMatrixOperations<float>;
1614template class cMatrixOperations<double>;
1615template class cMatrixOperations<long double>;
#define RESTRICT
A static class for matrix-vector operations, which are similar to the level-2 operations of the BLAS ...
static void csc_transposed_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void dense_matvec(const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes the matrix vector multiplication where is a dense matrix.
static void csr_transposed_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
static void dense_matvec_plus(const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes the operation where is a dense matrix.
static void create_band_matrix(DataType *RESTRICT A, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const DataType *RESTRICT diagonals, const DataType *RESTRICT supdiagonals, const IndexType non_zero_size, const FlagType tridiagonal)
Creates bi-diagonal or symmetric tri-diagonal matrix from the diagonal array (diagonals) and off-diag...
static void csc_transposed_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void csc_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void dense_transposed_matvec(const DataType *RESTRICT A, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes matrix vector multiplication where is dense, and is the transpose of the matrix .
static void csr_transposed_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
static void csr_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
static void dense_transposed_matvec_plus(const DataType *RESTRICT A, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, const LongIndexType num_columns, const FlagType A_is_row_major, const FlagType A_is_symmetric, DataType *RESTRICT c)
Computes where is dense, and is the transpose of the matrix .
static void csc_matvec(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_row_indices, const LongIndexType *RESTRICT A_index_pointer, const FlagType A_is_symmetric, const DataType *RESTRICT b, const LongIndexType num_rows, const LongIndexType num_columns, DataType *RESTRICT c)
Computes where is compressed sparse column (CSC) matrix and is a dense vector. The output is a de...
static void csr_matvec_plus(const DataType *RESTRICT A_data, const LongIndexType *RESTRICT A_column_indices, const LongIndexType *RESTRICT A_index_pointer, const DataType *RESTRICT b, const DataType alpha, const LongIndexType num_rows, DataType *RESTRICT c)
Computes where is compressed sparse row (CSR) matrix and is a dense vector. The output is a dense...
bool is_equal(DataType x, DataType y)
Check if two floating point numbers are equal within a tolerance.
Definition _c_is_equal.h:49
int LongIndexType
Definition types.h:60
int FlagType
Definition types.h:68
int IndexType
Definition types.h:65