Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
65 #include <inttypes.h>
66 #include <math.h>
67 #include <stdio.h>
68 
/*
 * Constants shared by the SIMD kernels in this file. The kernels compute
 * x * (A / Mln2) + (B - C), convert the result to a 32-bit integer and store
 * that integer's bits as the output float.
 *
 * NOTE(review): these are defined before the include guards and use very
 * short names, so they are visible to any code included after this header.
 */
/* Natural logarithm of 2 (~0.6931471805). */
69 #define Mln2 0.6931471805f
/* 8388608 = 2^23; divided by Mln2 to form the multiplier applied to each input. */
70 #define A 8388608.0f
/* 1065353216 = 127 * 2^23; forms the additive term together with C. */
71 #define B 1065353216.0f
/* Subtracted from B in the additive term; presumably an accuracy-tuning offset - TODO confirm against the algorithm's reference. */
72 #define C 60801.0f
73 
74 
75 #ifndef INCLUDED_volk_32f_expfast_32f_a_H
76 #define INCLUDED_volk_32f_expfast_32f_a_H
77 
78 #if LV_HAVE_AVX && LV_HAVE_FMA
79 
80 #include <immintrin.h>
81 
82 static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
83  const float* aVector,
84  unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  const unsigned int eighthPoints = num_points / 8;
91 
92  __m256 aVal, bVal, a, b;
93  __m256i exp;
94  a = _mm256_set1_ps(A / Mln2);
95  b = _mm256_set1_ps(B - C);
96 
97  for (; number < eighthPoints; number++) {
98  aVal = _mm256_load_ps(aPtr);
99  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
100  bVal = _mm256_castsi256_ps(exp);
101 
102  _mm256_store_ps(bPtr, bVal);
103  aPtr += 8;
104  bPtr += 8;
105  }
106 
107  number = eighthPoints * 8;
108  for (; number < num_points; number++) {
109  *bPtr++ = expf(*aPtr++);
110  }
111 }
112 
113 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
114 
115 #ifdef LV_HAVE_AVX
116 
117 #include <immintrin.h>
118 
119 static inline void
120 volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
121 {
122  float* bPtr = bVector;
123  const float* aPtr = aVector;
124 
125  unsigned int number = 0;
126  const unsigned int eighthPoints = num_points / 8;
127 
128  __m256 aVal, bVal, a, b;
129  __m256i exp;
130  a = _mm256_set1_ps(A / Mln2);
131  b = _mm256_set1_ps(B - C);
132 
133  for (; number < eighthPoints; number++) {
134  aVal = _mm256_load_ps(aPtr);
135  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
136  bVal = _mm256_castsi256_ps(exp);
137 
138  _mm256_store_ps(bPtr, bVal);
139  aPtr += 8;
140  bPtr += 8;
141  }
142 
143  number = eighthPoints * 8;
144  for (; number < num_points; number++) {
145  *bPtr++ = expf(*aPtr++);
146  }
147 }
148 
149 #endif /* LV_HAVE_AVX for aligned */
150 
151 #ifdef LV_HAVE_SSE4_1
152 #include <smmintrin.h>
153 
154 static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
155  const float* aVector,
156  unsigned int num_points)
157 {
158  float* bPtr = bVector;
159  const float* aPtr = aVector;
160 
161  unsigned int number = 0;
162  const unsigned int quarterPoints = num_points / 4;
163 
164  __m128 aVal, bVal, a, b;
165  __m128i exp;
166  a = _mm_set1_ps(A / Mln2);
167  b = _mm_set1_ps(B - C);
168 
169  for (; number < quarterPoints; number++) {
170  aVal = _mm_load_ps(aPtr);
171  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
172  bVal = _mm_castsi128_ps(exp);
173 
174  _mm_store_ps(bPtr, bVal);
175  aPtr += 4;
176  bPtr += 4;
177  }
178 
179  number = quarterPoints * 4;
180  for (; number < num_points; number++) {
181  *bPtr++ = expf(*aPtr++);
182  }
183 }
184 
185 #endif /* LV_HAVE_SSE4_1 for aligned */
186 
187 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
188 
189 #ifndef INCLUDED_volk_32f_expfast_32f_u_H
190 #define INCLUDED_volk_32f_expfast_32f_u_H
191 
192 #if LV_HAVE_AVX && LV_HAVE_FMA
193 #include <immintrin.h>
194 
195 static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
196  const float* aVector,
197  unsigned int num_points)
198 {
199  float* bPtr = bVector;
200  const float* aPtr = aVector;
201 
202  unsigned int number = 0;
203  const unsigned int eighthPoints = num_points / 8;
204 
205  __m256 aVal, bVal, a, b;
206  __m256i exp;
207  a = _mm256_set1_ps(A / Mln2);
208  b = _mm256_set1_ps(B - C);
209 
210  for (; number < eighthPoints; number++) {
211  aVal = _mm256_loadu_ps(aPtr);
212  exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
213  bVal = _mm256_castsi256_ps(exp);
214 
215  _mm256_storeu_ps(bPtr, bVal);
216  aPtr += 8;
217  bPtr += 8;
218  }
219 
220  number = eighthPoints * 8;
221  for (; number < num_points; number++) {
222  *bPtr++ = expf(*aPtr++);
223  }
224 }
225 
226 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
227 
228 #ifdef LV_HAVE_AVX
229 #include <immintrin.h>
230 
231 static inline void
232 volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
233 {
234  float* bPtr = bVector;
235  const float* aPtr = aVector;
236 
237  unsigned int number = 0;
238  const unsigned int eighthPoints = num_points / 8;
239 
240  __m256 aVal, bVal, a, b;
241  __m256i exp;
242  a = _mm256_set1_ps(A / Mln2);
243  b = _mm256_set1_ps(B - C);
244 
245  for (; number < eighthPoints; number++) {
246  aVal = _mm256_loadu_ps(aPtr);
247  exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
248  bVal = _mm256_castsi256_ps(exp);
249 
250  _mm256_storeu_ps(bPtr, bVal);
251  aPtr += 8;
252  bPtr += 8;
253  }
254 
255  number = eighthPoints * 8;
256  for (; number < num_points; number++) {
257  *bPtr++ = expf(*aPtr++);
258  }
259 }
260 
261 #endif /* LV_HAVE_AVX for unaligned */
262 
263 
264 #ifdef LV_HAVE_SSE4_1
265 #include <smmintrin.h>
266 
267 static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
268  const float* aVector,
269  unsigned int num_points)
270 {
271  float* bPtr = bVector;
272  const float* aPtr = aVector;
273 
274  unsigned int number = 0;
275  const unsigned int quarterPoints = num_points / 4;
276 
277  __m128 aVal, bVal, a, b;
278  __m128i exp;
279  a = _mm_set1_ps(A / Mln2);
280  b = _mm_set1_ps(B - C);
281 
282  for (; number < quarterPoints; number++) {
283  aVal = _mm_loadu_ps(aPtr);
284  exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
285  bVal = _mm_castsi128_ps(exp);
286 
287  _mm_storeu_ps(bPtr, bVal);
288  aPtr += 4;
289  bPtr += 4;
290  }
291 
292  number = quarterPoints * 4;
293  for (; number < num_points; number++) {
294  *bPtr++ = expf(*aPtr++);
295  }
296 }
297 
298 #endif /* LV_HAVE_SSE4_1 for unaligned */
299 
300 
301 #ifdef LV_HAVE_GENERIC
302 
303 static inline void volk_32f_expfast_32f_generic(float* bVector,
304  const float* aVector,
305  unsigned int num_points)
306 {
307  float* bPtr = bVector;
308  const float* aPtr = aVector;
309  unsigned int number = 0;
310 
311  for (number = 0; number < num_points; number++) {
312  *bPtr++ = expf(*aPtr++);
313  }
314 }
315 #endif /* LV_HAVE_GENERIC */
316 
317 #endif /* INCLUDED_volk_32f_expfast_32f_u_H */
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:232
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:120
#define B
Definition: volk_32f_expfast_32f.h:71
#define C
Definition: volk_32f_expfast_32f.h:72
#define Mln2
Definition: volk_32f_expfast_32f.h:69
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:303
#define A
Definition: volk_32f_expfast_32f.h:70