Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

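/* A minimal usage sketch (an illustration, assuming the auto-generated VOLK
 * dispatcher volk_32f_expfast_32f() and the volk_malloc()/volk_free()
 * allocators from <volk/volk.h>; volk_malloc() returns buffers aligned for
 * the _a_ kernel variants):
 *
 *   unsigned int N = 1024;
 *   float* in  = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
 *   for (unsigned int i = 0; i < N; i++)
 *       in[i] = -4.f + 8.f * (float)i / (float)(N - 1);
 *   volk_32f_expfast_32f(out, in, N); // out[i] ~= expf(in[i])
 *   volk_free(in);
 *   volk_free(out);
 */
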
#include <inttypes.h>
#include <math.h>
#include <stdio.h>

#define Mln2 0.6931471805f /* ln(2) */
#define A 8388608.0f       /* 2^23: scale that shifts a value into the exponent bits */
#define B 1065353216.0f    /* 127 * 2^23: the IEEE-754 single-precision exponent bias, pre-shifted */
#define C 60801.0f         /* correction constant that reduces the approximation error */

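/* Fast exponential via the IEEE-754 bit layout (the Schraudolph fast-exp
 * trick): for a single-precision float, the integer bit pattern of 2^y is
 * approximately A*y + B with A = 2^23 and B = 127 * 2^23, so
 *
 *   expf(x) = 2^(x / ln(2))  ~=  bits_as_float( A/ln(2) * x + (B - C) )
 *
 * where C tunes the approximation error. Each SIMD kernel below evaluates
 * this affine map, rounds the result to int32, and reinterprets the integer
 * bits as a float. */
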
#ifndef INCLUDED_volk_32f_expfast_32f_a_H
#define INCLUDED_volk_32f_expfast_32f_a_H

#if LV_HAVE_AVX && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, a, b;
    __m256i exp;
    /* a = A/ln(2) and b = B - C, broadcast to all lanes; see the note on
       the constants above. */
    a = _mm256_set1_ps(A / Mln2);
    b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* One fused multiply-add forms the target bit pattern, the convert
           rounds it to int32, and the cast reinterprets the bits as floats. */
        exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
        bVal = _mm256_castsi256_ps(exp);

        _mm256_store_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    /* Finish the remaining num_points % 8 entries with exact expf(). */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

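/* Note: the vectorized loop bodies use the bit-trick approximation, while
 * each scalar tail loop falls back to exact expf(); the last num_points % 8
 * (or % 4) outputs are therefore computed differently from the rest. */
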
#ifdef LV_HAVE_AVX

#include <immintrin.h>

static inline void
volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, a, b;
    __m256i exp;
    a = _mm256_set1_ps(A / Mln2);
    b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
        bVal = _mm256_castsi256_ps(exp);

        _mm256_store_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
                                                 const float* aVector,
                                                 unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, a, b;
    __m128i exp;
    a = _mm_set1_ps(A / Mln2);
    b = _mm_set1_ps(B - C);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
        bVal = _mm_castsi128_ps(exp);

        _mm_store_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
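
/* The _u_ variants below mirror the _a_ kernels above, with the aligned
 * load/store intrinsics (_mm256_load_ps / _mm256_store_ps and the SSE
 * equivalents) swapped for their unaligned counterparts (_mm256_loadu_ps /
 * _mm256_storeu_ps), so they accept buffers without 32-byte (AVX) or
 * 16-byte (SSE) alignment. */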

#ifndef INCLUDED_volk_32f_expfast_32f_u_H
#define INCLUDED_volk_32f_expfast_32f_u_H

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, a, b;
    __m256i exp;
    a = _mm256_set1_ps(A / Mln2);
    b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
        bVal = _mm256_castsi256_ps(exp);

        _mm256_storeu_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, a, b;
    __m256i exp;
    a = _mm256_set1_ps(A / Mln2);
    b = _mm256_set1_ps(B - C);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
        bVal = _mm256_castsi256_ps(exp);

        _mm256_storeu_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
                                                 const float* aVector,
                                                 unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, a, b;
    __m128i exp;
    a = _mm_set1_ps(A / Mln2);
    b = _mm_set1_ps(B - C);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
        bVal = _mm_castsi128_ps(exp);

        _mm_storeu_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */


#ifdef LV_HAVE_GENERIC

static inline void volk_32f_expfast_32f_generic(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    /* Reference path: exact expf() for every point. */
    for (number = 0; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
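
/* For reference, a scalar sketch of the bit trick the SIMD kernels apply
 * (a hypothetical helper for illustration, not part of VOLK's API):
 *
 *   static inline float expfast_scalar(float x)
 *   {
 *       union { int32_t i; float f; } u;
 *       u.i = (int32_t)lrintf(A / Mln2 * x + (B - C)); // round like cvtps_epi32
 *       return u.f;
 *   }
 */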

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void
volk_32f_expfast_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(A / Mln2, vlmax);
    const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(B - C, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
        /* v*ca + cb, rounded to integer, then the bits reused as a float:
           the same trick as the x86 kernels, with no scalar tail needed
           because the vector length adapts to the remaining points. */
        v = __riscv_vfmadd(v, ca, cb, vl);
        v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl));
        __riscv_vse32(bVector, v, vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_expfast_32f_u_H */