Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
62 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
63 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
64 
65 #include <inttypes.h>
66 #include <stdio.h>
67 #include <volk/volk_common.h>
68 
69 #ifdef LV_HAVE_AVX2
70 #include <immintrin.h>
71 
72 static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
73  const float* iBuffer,
74  const float* qBuffer,
75  const float scalar,
76  unsigned int num_points)
77 {
78  unsigned int number = 0;
79  const float* iBufferPtr = iBuffer;
80  const float* qBufferPtr = qBuffer;
81 
82  __m256 vScalar = _mm256_set1_ps(scalar);
83 
84  const unsigned int eighthPoints = num_points / 8;
85 
86  __m256 iValue, qValue, cplxValue1, cplxValue2;
87  __m256i intValue1, intValue2;
88 
89  int16_t* complexVectorPtr = (int16_t*)complexVector;
90 
91  for (; number < eighthPoints; number++) {
92  iValue = _mm256_load_ps(iBufferPtr);
93  qValue = _mm256_load_ps(qBufferPtr);
94 
95  // Interleaves the lower two values in the i and q variables into one buffer
96  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
97  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
98 
99  // Interleaves the upper two values in the i and q variables into one buffer
100  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
101  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
102 
103  intValue1 = _mm256_cvtps_epi32(cplxValue1);
104  intValue2 = _mm256_cvtps_epi32(cplxValue2);
105 
106  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
107 
108  _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
109  complexVectorPtr += 16;
110 
111  iBufferPtr += 8;
112  qBufferPtr += 8;
113  }
114 
115  number = eighthPoints * 8;
116  complexVectorPtr = (int16_t*)(&complexVector[number]);
117  for (; number < num_points; number++) {
118  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
119  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
120  }
121 }
122 #endif /* LV_HAVE_AVX2 */
123 
124 
125 #ifdef LV_HAVE_SSE2
126 #include <emmintrin.h>
127 
128 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
129  const float* iBuffer,
130  const float* qBuffer,
131  const float scalar,
132  unsigned int num_points)
133 {
134  unsigned int number = 0;
135  const float* iBufferPtr = iBuffer;
136  const float* qBufferPtr = qBuffer;
137 
138  __m128 vScalar = _mm_set_ps1(scalar);
139 
140  const unsigned int quarterPoints = num_points / 4;
141 
142  __m128 iValue, qValue, cplxValue1, cplxValue2;
143  __m128i intValue1, intValue2;
144 
145  int16_t* complexVectorPtr = (int16_t*)complexVector;
146 
147  for (; number < quarterPoints; number++) {
148  iValue = _mm_load_ps(iBufferPtr);
149  qValue = _mm_load_ps(qBufferPtr);
150 
151  // Interleaves the lower two values in the i and q variables into one buffer
152  cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
153  cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
154 
155  // Interleaves the upper two values in the i and q variables into one buffer
156  cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
157  cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
158 
159  intValue1 = _mm_cvtps_epi32(cplxValue1);
160  intValue2 = _mm_cvtps_epi32(cplxValue2);
161 
162  intValue1 = _mm_packs_epi32(intValue1, intValue2);
163 
164  _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
165  complexVectorPtr += 8;
166 
167  iBufferPtr += 4;
168  qBufferPtr += 4;
169  }
170 
171  number = quarterPoints * 4;
172  complexVectorPtr = (int16_t*)(&complexVector[number]);
173  for (; number < num_points; number++) {
174  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
175  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
176  }
177 }
178 #endif /* LV_HAVE_SSE2 */
179 
180 
181 #ifdef LV_HAVE_SSE
182 #include <xmmintrin.h>
183 
184 static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
185  const float* iBuffer,
186  const float* qBuffer,
187  const float scalar,
188  unsigned int num_points)
189 {
190  unsigned int number = 0;
191  const float* iBufferPtr = iBuffer;
192  const float* qBufferPtr = qBuffer;
193 
194  __m128 vScalar = _mm_set_ps1(scalar);
195 
196  const unsigned int quarterPoints = num_points / 4;
197 
198  __m128 iValue, qValue, cplxValue;
199 
200  int16_t* complexVectorPtr = (int16_t*)complexVector;
201 
202  __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
203 
204  for (; number < quarterPoints; number++) {
205  iValue = _mm_load_ps(iBufferPtr);
206  qValue = _mm_load_ps(qBufferPtr);
207 
208  // Interleaves the lower two values in the i and q variables into one buffer
209  cplxValue = _mm_unpacklo_ps(iValue, qValue);
210  cplxValue = _mm_mul_ps(cplxValue, vScalar);
211 
212  _mm_store_ps(floatBuffer, cplxValue);
213 
214  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
215  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
216  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
217  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
218 
219  // Interleaves the upper two values in the i and q variables into one buffer
220  cplxValue = _mm_unpackhi_ps(iValue, qValue);
221  cplxValue = _mm_mul_ps(cplxValue, vScalar);
222 
223  _mm_store_ps(floatBuffer, cplxValue);
224 
225  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
226  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
227  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
228  *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
229 
230  iBufferPtr += 4;
231  qBufferPtr += 4;
232  }
233 
234  number = quarterPoints * 4;
235  complexVectorPtr = (int16_t*)(&complexVector[number]);
236  for (; number < num_points; number++) {
237  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
238  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
239  }
240 }
241 #endif /* LV_HAVE_SSE */
242 
243 
244 #ifdef LV_HAVE_GENERIC
245 
246 static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
247  const float* iBuffer,
248  const float* qBuffer,
249  const float scalar,
250  unsigned int num_points)
251 {
252  int16_t* complexVectorPtr = (int16_t*)complexVector;
253  const float* iBufferPtr = iBuffer;
254  const float* qBufferPtr = qBuffer;
255  unsigned int number = 0;
256 
257  for (number = 0; number < num_points; number++) {
258  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
259  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
260  }
261 }
262 #endif /* LV_HAVE_GENERIC */
263 
264 
265 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
266 
267 #ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
268 #define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
269 
270 #include <inttypes.h>
271 #include <stdio.h>
272 #include <volk/volk_common.h>
273 
274 #ifdef LV_HAVE_AVX2
275 #include <immintrin.h>
276 
277 static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
278  const float* iBuffer,
279  const float* qBuffer,
280  const float scalar,
281  unsigned int num_points)
282 {
283  unsigned int number = 0;
284  const float* iBufferPtr = iBuffer;
285  const float* qBufferPtr = qBuffer;
286 
287  __m256 vScalar = _mm256_set1_ps(scalar);
288 
289  const unsigned int eighthPoints = num_points / 8;
290 
291  __m256 iValue, qValue, cplxValue1, cplxValue2;
292  __m256i intValue1, intValue2;
293 
294  int16_t* complexVectorPtr = (int16_t*)complexVector;
295 
296  for (; number < eighthPoints; number++) {
297  iValue = _mm256_loadu_ps(iBufferPtr);
298  qValue = _mm256_loadu_ps(qBufferPtr);
299 
300  // Interleaves the lower two values in the i and q variables into one buffer
301  cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
302  cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
303 
304  // Interleaves the upper two values in the i and q variables into one buffer
305  cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
306  cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
307 
308  intValue1 = _mm256_cvtps_epi32(cplxValue1);
309  intValue2 = _mm256_cvtps_epi32(cplxValue2);
310 
311  intValue1 = _mm256_packs_epi32(intValue1, intValue2);
312 
313  _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
314  complexVectorPtr += 16;
315 
316  iBufferPtr += 8;
317  qBufferPtr += 8;
318  }
319 
320  number = eighthPoints * 8;
321  complexVectorPtr = (int16_t*)(&complexVector[number]);
322  for (; number < num_points; number++) {
323  *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
324  *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
325  }
326 }
327 #endif /* LV_HAVE_AVX2 */
328 
329 #ifdef LV_HAVE_RVV
330 #include <riscv_vector.h>
331 
/*!
 * \brief Scales two float vectors and interleaves them into 16-bit complex
 *        samples (RISC-V vector extension, no segment stores).
 *
 * Each pass handles as many samples as one e32/m8 vector configuration
 * allows. Both inputs are multiplied by \p scalar, narrowed to 16-bit
 * integers, and each I/Q pair is combined into one 32-bit word that is
 * stored to the output.
 *
 * \param complexVector Output buffer, written as one 32-bit word per sample.
 * \param iBuffer       Input values for the first (I) component.
 * \param qBuffer       Input values for the second (Q) component.
 * \param scalar        Factor applied to every input value before conversion.
 * \param num_points    Number of complex samples to produce.
 */
static inline void volk_32f_x2_s32f_interleave_16ic_rvv(lv_16sc_t* complexVector,
                                                        const float* iBuffer,
                                                        const float* qBuffer,
                                                        const float scalar,
                                                        unsigned int num_points)
{
    uint32_t* dst = (uint32_t*)complexVector;
    const float* inPhase = iBuffer;
    const float* quadrature = qBuffer;
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        // Load and scale both components.
        vfloat32m8_t scaledI =
            __riscv_vfmul(__riscv_vle32_v_f32m8(inPhase, vl), scalar, vl);
        vfloat32m8_t scaledQ =
            __riscv_vfmul(__riscv_vle32_v_f32m8(quadrature, vl), scalar, vl);

        // Narrow float32 -> int16 and view the results as raw 16-bit patterns.
        vint16m4_t narrowI = __riscv_vfncvt_x(scaledI, vl);
        vint16m4_t narrowQ = __riscv_vfncvt_x(scaledQ, vl);
        vuint16m4_t bitsI = __riscv_vreinterpret_u16m4(narrowI);
        vuint16m4_t bitsQ = __riscv_vreinterpret_u16m4(narrowQ);

        // (I + Q) + 0xFFFF * Q == I + 0x10000 * Q, so each 32-bit word holds
        // I in its low half and Q in its high half.
        vuint32m8_t packed = __riscv_vwaddu_vv(bitsI, bitsQ, vl);
        packed = __riscv_vwmaccu(packed, 0xFFFF, bitsQ, vl);

        __riscv_vse32(dst, packed, vl);

        remaining -= vl;
        dst += vl;
        inPhase += vl;
        quadrature += vl;
    }
}
352 #endif /*LV_HAVE_RVV*/
353 
354 #ifdef LV_HAVE_RVVSEG
355 #include <riscv_vector.h>
356 
/*!
 * \brief Scales two float vectors and interleaves them into 16-bit complex
 *        samples (RISC-V vector extension, two-field segment store).
 *
 * Each pass handles as many samples as one e32/m8 vector configuration
 * allows. Both inputs are multiplied by \p scalar and narrowed to 16-bit
 * integers; a two-field segment store then writes them as alternating
 * I, Q values.
 *
 * \param complexVector Output buffer, written as interleaved int16_t pairs.
 * \param iBuffer       Input values for the first (I) component.
 * \param qBuffer       Input values for the second (Q) component.
 * \param scalar        Factor applied to every input value before conversion.
 * \param num_points    Number of complex samples to produce.
 */
static inline void volk_32f_x2_s32f_interleave_16ic_rvvseg(lv_16sc_t* complexVector,
                                                           const float* iBuffer,
                                                           const float* qBuffer,
                                                           const float scalar,
                                                           unsigned int num_points)
{
    lv_16sc_t* dst = complexVector;
    const float* inPhase = iBuffer;
    const float* quadrature = qBuffer;
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);

        // Load and scale both components.
        vfloat32m8_t scaledI =
            __riscv_vfmul(__riscv_vle32_v_f32m8(inPhase, vl), scalar, vl);
        vfloat32m8_t scaledQ =
            __riscv_vfmul(__riscv_vle32_v_f32m8(quadrature, vl), scalar, vl);

        // Narrow float32 -> int16.
        vint16m4_t narrowI = __riscv_vfncvt_x(scaledI, vl);
        vint16m4_t narrowQ = __riscv_vfncvt_x(scaledQ, vl);

        // Field 0 = I, field 1 = Q: stored as I0 Q0 I1 Q1 ...
        __riscv_vsseg2e16(
            (int16_t*)dst, __riscv_vcreate_v_i16m4x2(narrowI, narrowQ), vl);

        remaining -= vl;
        dst += vl;
        inPhase += vl;
        quadrature += vl;
    }
}
374 #endif /*LV_HAVE_RVVSEG*/
375 
376 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
static float rintf(float x)
Definition: config.h:45
static void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:128
static void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:184
static void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:246
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:62
short complex lv_16sc_t
Definition: volk_complex.h:71