Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32fc_accumulator_s32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
51 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
52 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
53 
54 #include <inttypes.h>
55 #include <volk/volk_common.h>
56 
57 #ifdef LV_HAVE_GENERIC
59  const lv_32fc_t* inputBuffer,
60  unsigned int num_points)
61 {
62  const lv_32fc_t* aPtr = inputBuffer;
63  unsigned int number = 0;
64  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
65 
66  for (; number < num_points; number++) {
67  returnValue += (*aPtr++);
68  }
69  *result = returnValue;
70 }
71 #endif /* LV_HAVE_GENERIC */
72 
73 #ifdef LV_HAVE_AVX
74 #include <immintrin.h>
75 
76 static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result,
77  const lv_32fc_t* inputBuffer,
78  unsigned int num_points)
79 {
80  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
81  unsigned int number = 0;
82  const unsigned int quarterPoints = num_points / 4;
83 
84  const lv_32fc_t* aPtr = inputBuffer;
85  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
86 
87  __m256 accumulator = _mm256_setzero_ps();
88  __m256 aVal = _mm256_setzero_ps();
89 
90  for (; number < quarterPoints; number++) {
91  aVal = _mm256_loadu_ps((float*)aPtr);
92  accumulator = _mm256_add_ps(accumulator, aVal);
93  aPtr += 4;
94  }
95 
96  _mm256_store_ps(tempBuffer, accumulator);
97 
98  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
99  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
100  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
101  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
102 
103  number = quarterPoints * 4;
104  for (; number < num_points; number++) {
105  returnValue += (*aPtr++);
106  }
107  *result = returnValue;
108 }
109 #endif /* LV_HAVE_AVX */
110 
111 #ifdef LV_HAVE_SSE
112 #include <xmmintrin.h>
113 
114 static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result,
115  const lv_32fc_t* inputBuffer,
116  unsigned int num_points)
117 {
118  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
119  unsigned int number = 0;
120  const unsigned int halfPoints = num_points / 2;
121 
122  const lv_32fc_t* aPtr = inputBuffer;
123  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
124 
125  __m128 accumulator = _mm_setzero_ps();
126  __m128 aVal = _mm_setzero_ps();
127 
128  for (; number < halfPoints; number++) {
129  aVal = _mm_loadu_ps((float*)aPtr);
130  accumulator = _mm_add_ps(accumulator, aVal);
131  aPtr += 2;
132  }
133 
134  _mm_store_ps(tempBuffer, accumulator);
135 
136  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
137  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
138 
139  number = halfPoints * 2;
140  for (; number < num_points; number++) {
141  returnValue += (*aPtr++);
142  }
143  *result = returnValue;
144 }
145 #endif /* LV_HAVE_SSE */
146 
147 #ifdef LV_HAVE_AVX
148 #include <immintrin.h>
149 
150 static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result,
151  const lv_32fc_t* inputBuffer,
152  unsigned int num_points)
153 {
154  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
155  unsigned int number = 0;
156  const unsigned int quarterPoints = num_points / 4;
157 
158  const lv_32fc_t* aPtr = inputBuffer;
159  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
160 
161  __m256 accumulator = _mm256_setzero_ps();
162  __m256 aVal = _mm256_setzero_ps();
163 
164  for (; number < quarterPoints; number++) {
165  aVal = _mm256_load_ps((float*)aPtr);
166  accumulator = _mm256_add_ps(accumulator, aVal);
167  aPtr += 4;
168  }
169 
170  _mm256_store_ps(tempBuffer, accumulator);
171 
172  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
173  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
174  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
175  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
176 
177  number = quarterPoints * 4;
178  for (; number < num_points; number++) {
179  returnValue += (*aPtr++);
180  }
181  *result = returnValue;
182 }
183 #endif /* LV_HAVE_AVX */
184 
185 #ifdef LV_HAVE_SSE
186 #include <xmmintrin.h>
187 
188 static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result,
189  const lv_32fc_t* inputBuffer,
190  unsigned int num_points)
191 {
192  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
193  unsigned int number = 0;
194  const unsigned int halfPoints = num_points / 2;
195 
196  const lv_32fc_t* aPtr = inputBuffer;
197  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
198 
199  __m128 accumulator = _mm_setzero_ps();
200  __m128 aVal = _mm_setzero_ps();
201 
202  for (; number < halfPoints; number++) {
203  aVal = _mm_load_ps((float*)aPtr);
204  accumulator = _mm_add_ps(accumulator, aVal);
205  aPtr += 2;
206  }
207 
208  _mm_store_ps(tempBuffer, accumulator);
209 
210  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
211  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
212 
213  number = halfPoints * 2;
214  for (; number < num_points; number++) {
215  returnValue += (*aPtr++);
216  }
217  *result = returnValue;
218 }
219 #endif /* LV_HAVE_SSE */
220 
221 #ifdef LV_HAVE_NEON
222 #include <arm_neon.h>
223 static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
224  const lv_32fc_t* inputBuffer,
225  unsigned int num_points)
226 {
227  const lv_32fc_t* aPtr = inputBuffer;
228  unsigned int number = 0;
229  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
230  unsigned int eighthPoints = num_points / 8;
231  float32x4_t in_vec;
232  float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
233  float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
234  float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
235  float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
236  __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];
237 
238  for (; number < eighthPoints; number++) {
239  in_vec = vld1q_f32((float*)aPtr);
240  out_vec0 = vaddq_f32(in_vec, out_vec0);
241  aPtr += 2;
242 
243  in_vec = vld1q_f32((float*)aPtr);
244  out_vec1 = vaddq_f32(in_vec, out_vec1);
245  aPtr += 2;
246 
247  in_vec = vld1q_f32((float*)aPtr);
248  out_vec2 = vaddq_f32(in_vec, out_vec2);
249  aPtr += 2;
250 
251  in_vec = vld1q_f32((float*)aPtr);
252  out_vec3 = vaddq_f32(in_vec, out_vec3);
253  aPtr += 2;
254  }
255  vst1q_f32(tempBuffer, out_vec0);
256  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
257  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
258 
259  vst1q_f32(tempBuffer, out_vec1);
260  returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
261  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
262 
263  vst1q_f32(tempBuffer, out_vec2);
264  returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
265  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
266 
267  vst1q_f32(tempBuffer, out_vec3);
268  returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
269  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
270 
271  number = eighthPoints * 8;
272  for (; number < num_points; number++) {
273  returnValue += (*aPtr++);
274  }
275  *result = returnValue;
276 }
277 #endif /* LV_HAVE_NEON */
278 
279 #ifdef LV_HAVE_RVV
280 #include <riscv_vector.h>
282 
283 static inline void volk_32fc_accumulator_s32fc_rvv(lv_32fc_t* result,
284  const lv_32fc_t* inputBuffer,
285  unsigned int num_points)
286 {
287  size_t vlmax = __riscv_vsetvlmax_e32m8();
288  vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0, vlmax);
289  const float* in = (const float*)inputBuffer;
290  size_t n = num_points * 2;
291  for (size_t vl; n > 0; n -= vl, in += vl) {
292  vl = __riscv_vsetvl_e32m8(n < vlmax ? n : vlmax); /* force exact vl */
293  vfloat32m8_t v = __riscv_vle32_v_f32m8(in, vl);
294  vsum = __riscv_vfadd_tu(vsum, vsum, v, vl);
295  }
296  vuint64m8_t vsumu = __riscv_vreinterpret_u64m8(__riscv_vreinterpret_u32m8(vsum));
297  vfloat32m4_t vsum1 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 0, vlmax));
298  vfloat32m4_t vsum2 = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vsumu, 32, vlmax));
299  vlmax = __riscv_vsetvlmax_e32m1();
300  vfloat32m1_t vr = RISCV_SHRINK4(vfadd, f, 32, vsum1);
301  vfloat32m1_t vi = RISCV_SHRINK4(vfadd, f, 32, vsum2);
302  vfloat32m1_t z = __riscv_vfmv_s_f_f32m1(0, vlmax);
303  *result = lv_cmake(__riscv_vfmv_f(__riscv_vfredusum(vr, z, vlmax)),
304  __riscv_vfmv_f(__riscv_vfredusum(vi, z, vlmax)));
305 }
306 #endif /*LV_HAVE_RVV*/
307 
308 #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:58
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:188
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:114
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:150
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:223
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:76
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
#define lv_cmake(r, i)
Definition: volk_complex.h:77
float complex lv_32fc_t
Definition: volk_complex.h:74
#define RISCV_SHRINK4(op, T, S, v)
Definition: volk_rvv_intrinsics.h:24