ergo
mm_kernel_inner_sse2_A.h
Go to the documentation of this file.
1/* Ergo, version 3.8.2, a program for linear scaling electronic structure
2 * calculations.
3 * Copyright (C) 2023 Elias Rudberg, Emanuel H. Rubensson, Pawel Salek,
4 * and Anastasia Kruchinina.
5 *
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 *
19 * Primary academic reference:
20 * Ergo: An open-source program for linear-scaling electronic structure
21 * calculations,
22 * Elias Rudberg, Emanuel H. Rubensson, Pawel Salek, and Anastasia
23 * Kruchinina,
24 * SoftwareX 7, 107 (2018),
25 * <http://dx.doi.org/10.1016/j.softx.2018.03.005>
26 *
27 * For further information about Ergo, see <http://www.ergoscf.org>.
28 */
29
39#ifndef MM_KERNEL_INNER_SSE2_A_H
40#define MM_KERNEL_INNER_SSE2_A_H
41#include "common.h"
42#include "vector_intrin.h"
43
62template<typename T_real, typename T_reg, int T_M, int T_N, int T_K>
64 public:
65 typedef T_real real;
66 static int const M = T_M;
67 static int const N = T_N;
68 static int const K = T_K;
69 protected:
70 static int const floats_per_register = ( sizeof(T_reg) / sizeof(real) );
72
73 private:
75 template<int T_ROWS_kernel, int T_COLS_kernel, typename T_ordering_kernel, int T_repetitions>
76 class Pack;
77 public:
81
82
83
84 // Consider passing derived class from std::vector as arguments
85 // that have compile time constant length that can be checked at
86 // compile time using static asserts
90 static void exec( real const * const * const A,
91 real const * const * const B,
92 real * const C,
93 int const i = 1,
94 int const offset_A = 0,
95 int const offset_B = 0,
96 int const offset_C = 0 );
97
98 template<int T_offset_A, int T_offset_B, int T_offset_C>
99 static void exec( real const * const * const A,
100 real const * const * const B,
101 real * const C,
102 int const i = 1 );
103
104
105
106 protected:
107 template<int T_loop_index, int T_end>
108 struct Loop {
110 X_reg[T_loop_index].set_to_zero();
112 }
113 static inline void ALWAYS_INLINE inner( int const row_A_reg, // == row_C_reg
114 int const row_B,
115 Vector_intrin<real, T_reg> const & A_reg,
117 real const * B_packed ) {
119 B_reg.load_p( &B_packed[row_B * T_N * floats_per_register +
120 T_loop_index * floats_per_register] );
121 B_reg *= A_reg;
122 C_reg[row_A_reg + T_loop_index * T_M / floats_per_register] += B_reg;
123 Loop<T_loop_index+1, T_end>::inner( row_A_reg, row_B,
124 A_reg, C_reg,
125 B_packed );
126 }
127 static inline void ALWAYS_INLINE middle( int const col_A, // == row_B
129 real const * A,
130 real const * B_packed ) {
132 A_reg.load_p( &A[col_A * T_M + T_loop_index * floats_per_register] );
133 // Loop over cols of B
134 Loop<0, T_N>::inner( T_loop_index, // == row_A_reg == row_C_reg
135 col_A, // == row_B
136 A_reg,
137 C_reg,
138 B_packed );
139 Loop<T_loop_index+1, T_end>::middle( col_A, C_reg, A, B_packed );
140 }
141 static inline void ALWAYS_INLINE outer( int const start_i,
143 real const * A,
144 real const * B_packed ) {
145 // Loop over (register) rows of A and C
146 Loop<0, T_M/floats_per_register>::middle( start_i + T_loop_index,
147 C_reg,
148 A,
149 B_packed );
150 Loop<T_loop_index+1, T_end>::outer( start_i, C_reg, A, B_packed );
151 }
152 static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
153 real const * X ) {
154 X_reg[T_loop_index] += &X[T_loop_index * floats_per_register];
156 }
157 static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
158 real * X ) {
159 X_reg[T_loop_index].store_p( &X[T_loop_index * floats_per_register] );
161 }
162
164 real const * const * const A,
165 real const * const * const B ) {
166 // Loop over columns of A and rows of B^T
167 Loop<0, T_K>::outer( 0, C_reg, A[T_loop_index], B[T_loop_index] );
169 }
170 };
171
172 template<int T_end>
173 struct Loop<T_end, T_end> {
174 static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {}
175 static inline void ALWAYS_INLINE inner( int const row_A_reg, // == row_C_reg
176 int const row_B,
177 Vector_intrin<real, T_reg> const & A_reg,
179 real const * B_packed ) {}
180 static inline void ALWAYS_INLINE middle( int const col_A, // == row_B
182 real const * A,
183 real const * B_packed ) {}
184 static inline void ALWAYS_INLINE outer( int const start_i,
186 real const * A,
187 real const * B_packed ) {}
188 static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
189 real const * X ) {}
190 static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
191 real * X ) {}
193 real const * const * const A,
194 real const * const * const B ) {}
195 };
196};
197
198// Doesn't matter if inlined or not... same performance for 4, 4, 5
199// IT DOES MATTER WHEN OUTER CONTROL STRUCTURE IS ADDED ON TOP!!!
200// THEN, INLINING IS BAD
201template<typename real, typename T_reg, int T_M, int T_N, int T_K>
203 real const * const * const B,
204 real * const C,
205 int const i,
206 int const offset_A,
207 int const offset_B,
208 int const offset_C) {
209 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
212#if 1 // I loose a bit performance because of the offsets
213 for (int ind = 0; ind < i; ++ind)
214 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + offset_A, B[ind] + offset_B );
216 // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
219#else
220 for (int ind = 0; ind < i; ++ind)
223 // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
226#endif
227} // end exec
228
229template<typename real, typename T_reg, int T_M, int T_N, int T_K>
230 template<int T_offset_A, int T_offset_B, int T_offset_C>
232 real const * const * const B,
233 real * const C,
234 int const i ) {
235 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
238 for (int ind = 0; ind < i; ++ind)
239 MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + T_offset_A, B[ind] + T_offset_B );
241 // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
244} // end exec template
245
246
247
248
249
250
252template<typename real, typename T_reg, int T_M, int T_N, int T_K>
253 template<int T_rows, int T_cols, typename T_ordering_kernel, int T_repetitions>
254 class MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::Pack {
255 public:
256 static int const size_packed = T_rows * T_cols * T_repetitions;
257 static int const rows = T_rows;
258 static int const cols = T_cols;
259
260 template<typename T_ordering_matrix>
262 typedef real * PtrTypePacked;
263 typedef real const * const PtrType;
264 inline static void exec( PtrType X, PtrTypePacked X_packed,
265 int const row_k,
266 int const col_k,
267 int const rows_total_matrix,
268 int const cols_total_matrix ) {
269 for ( int ir = 0; ir < T_repetitions; ++ir)
270 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ]
271 = X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ];
272 }
273 };
274
275 template<typename T_ordering_matrix>
277 typedef real const * const PtrTypePacked;
278 typedef real * PtrType;
279 inline static void exec( PtrType X, PtrTypePacked X_packed,
280 int const row_k,
281 int const col_k,
282 int const rows_total_matrix,
283 int const cols_total_matrix ) {
284 for ( int ir = 0; ir < T_repetitions; ++ir)
285 X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ] =
286 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ];
287 }
288 };
289
290 template<template<typename T_ordering> class T_assign, typename T_ordering_matrix>
291 static void exec(typename T_assign<T_ordering_matrix>::PtrType X,
292 typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
293 int const rows_total_matrix, int const cols_total_matrix) {
294 // Loop over columns of kernel
295 for ( int col_k = 0; col_k < T_cols; ++col_k ) {
296 // Loop over rows of kernel
297 for ( int row_k = 0; row_k < T_rows; ++row_k ) {
298 T_assign<T_ordering_matrix>::exec( X, X_packed,
299 row_k, col_k,
300 rows_total_matrix, cols_total_matrix );
301 }
302 }
303 } // end exec()
304
309 template<typename T_ordering_matrix>
310 inline static void pack(real const * const X, real * X_packed,
311 int const rows_total_matrix, int const cols_total_matrix) {
312 exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
313 }
314
318 template<typename T_ordering_matrix>
319 inline static void unpack(real * X, real const * const X_packed,
320 int const rows_total_matrix, int const cols_total_matrix) {
321 exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
322 }
323};
324#endif
Template for packing of matrix elements.
Definition mm_kernel_inner_sse2_A.h:254
static int const rows
Definition mm_kernel_inner_sse2_A.h:257
static void exec(typename T_assign< T_ordering_matrix >::PtrType X, typename T_assign< T_ordering_matrix >::PtrTypePacked X_packed, int const rows_total_matrix, int const cols_total_matrix)
Definition mm_kernel_inner_sse2_A.h:291
static void pack(real const *const X, real *X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for assignments to packed matrix.
Definition mm_kernel_inner_sse2_A.h:310
static int const size_packed
Definition mm_kernel_inner_sse2_A.h:256
static int const cols
Definition mm_kernel_inner_sse2_A.h:258
static void unpack(real *X, real const *const X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for extracting matrix from packed matrix.
Definition mm_kernel_inner_sse2_A.h:319
Matrix multiplication template for architectures with SSE2 or higher and compilers that support C++ i...
Definition mm_kernel_inner_sse2_A.h:63
static int const K
Number of columns of A and rows of B.
Definition mm_kernel_inner_sse2_A.h:68
T_real real
Real number type (usually float or double)
Definition mm_kernel_inner_sse2_A.h:65
static int const floats_per_register
Number of real numbers that fit in one register.
Definition mm_kernel_inner_sse2_A.h:70
static int const N
Number of columns of B and C.
Definition mm_kernel_inner_sse2_A.h:67
Pack< K, N, Ordering_row_wise, floats_per_register > Pack_type_B
Type that can (should) be used to pack B.
Definition mm_kernel_inner_sse2_A.h:79
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1, int const offset_A=0, int const offset_B=0, int const offset_C=0)
Definition mm_kernel_inner_sse2_A.h:202
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1)
Definition mm_kernel_inner_sse2_A.h:231
Pack< M, K, Ordering_col_wise, 1 > Pack_type_A
Type that can (should) be used to pack A.
Definition mm_kernel_inner_sse2_A.h:78
Pack< M, N, Ordering_col_wise, 1 > Pack_type_C
Type that can (should) be used to pack C.
Definition mm_kernel_inner_sse2_A.h:80
static int const M
Number of rows of A and C.
Definition mm_kernel_inner_sse2_A.h:66
Vector class template for access to SIMD operations.
Definition vector_intrin.h:60
void ALWAYS_INLINE load_p(Treal const *ptr)
Definition vector_intrin.h:62
void ALWAYS_INLINE store_p(Treal *ptr) const
Definition vector_intrin.h:68
void ALWAYS_INLINE set_to_zero()
Definition vector_intrin.h:90
Macros for inlining and static assertions and structs for access to matrix elements specifying the la...
#define ALWAYS_INLINE
Definition common.h:45
#define STATIC_ASSERT_DEBUG(expr, msg)
Definition common.h:72
#define B
#define A
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition mm_kernel_inner_sse2_A.h:192
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition mm_kernel_inner_sse2_A.h:180
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition mm_kernel_inner_sse2_A.h:184
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition mm_kernel_inner_sse2_A.h:174
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition mm_kernel_inner_sse2_A.h:175
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition mm_kernel_inner_sse2_A.h:190
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition mm_kernel_inner_sse2_A.h:188
Definition mm_kernel_inner_sse2_A.h:108
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition mm_kernel_inner_sse2_A.h:113
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition mm_kernel_inner_sse2_A.h:141
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition mm_kernel_inner_sse2_A.h:127
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition mm_kernel_inner_sse2_A.h:157
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition mm_kernel_inner_sse2_A.h:163
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition mm_kernel_inner_sse2_A.h:152
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition mm_kernel_inner_sse2_A.h:109
Definition mm_kernel_inner_sse2_A.h:261
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition mm_kernel_inner_sse2_A.h:264
real * PtrTypePacked
Type of packed pointer - note the absence of const qualifiers.
Definition mm_kernel_inner_sse2_A.h:262
real const *const PtrType
Type of matrix pointer - note the presence of const qualifiers.
Definition mm_kernel_inner_sse2_A.h:263
Definition mm_kernel_inner_sse2_A.h:276
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition mm_kernel_inner_sse2_A.h:279
real const *const PtrTypePacked
Type of packed pointer - note the presence of const qualifiers.
Definition mm_kernel_inner_sse2_A.h:277
real * PtrType
Type of matrix pointer - note the absence of const qualifiers.
Definition mm_kernel_inner_sse2_A.h:278
Vector template for convenient access to SIMD operations.