Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_8ic_x2_s32f_multiply_conjugate_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
44 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
45 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
46 
47 #include <inttypes.h>
48 #include <stdio.h>
49 #include <volk/volk_complex.h>
50 
51 #ifdef LV_HAVE_AVX2
52 #include <immintrin.h>
53 
/*!
 * Multiply the 8-bit complex vector a by the conjugate of the 8-bit complex
 * vector b, scale by 1/scalar, and emit 32-bit float complex results:
 *   c[i] = (a[i] * conj(b[i])) / scalar
 * Aligned (a_) variant: the 128-bit loads and 256-bit stores require the
 * input/output buffers to satisfy _mm_load_si128 / _mm256_store_ps alignment.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEigthPoints = num_points / 8; // 8 complex points per pass

    __m256i x, y, realz, imagz;
    __m256 ret, retlo, rethi;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;
    // Alternating (+1, -1) pattern used by _mm256_sign_epi16 to negate the
    // imaginary 16-bit lanes of b (forming conj(b)); _mm256_set_epi16 lists
    // lanes from most-significant down, so odd (imaginary) lanes get -1.
    __m256i conjugateSign =
        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);

    __m256 invScalar = _mm256_set1_ps(1.0 / scalar); // multiply instead of divide

    for (; number < oneEigthPoints; number++) {
        // Sign-extend 8 interleaved 8-bit complex values (16 bytes) to 16 bits
        x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
        y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));

        // madd of (ar, ai)·(br, bi) pairs: ar*br + ai*bi == Re(a * conj(b)),
        // accumulated in 32 bits (no overflow for 8-bit inputs)
        realz = _mm256_madd_epi16(x, y);

        // Negate the imaginary lanes of b: y = (br, -bi) == conj(b)
        y = _mm256_sign_epi16(y, conjugateSign);

        // Swap real/imag within each complex pair: y = (-bi, br)
        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                   _MM_SHUFFLE(2, 3, 0, 1));

        // madd of (ar, ai)·(-bi, br) pairs: ar*(-bi) + ai*br == Im(a * conj(b))
        imagz = _mm256_madd_epi16(x, y);

        // Interleave the low real/imag 32-bit halves (per 128-bit lane) and
        // convert to float
        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));

        // Normalize the floating point values
        retlo = _mm256_mul_ps(retlo, invScalar);

        // Interleave the high real/imag halves and convert to float
        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));

        // Normalize the floating point values
        rethi = _mm256_mul_ps(rethi, invScalar);

        // unpack{lo,hi}_epi32 operate within 128-bit lanes, so recombine the
        // lanes into sequential order before storing
        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
        _mm256_store_ps((float*)c, ret);
        c += 4;

        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
        _mm256_store_ps((float*)c, ret);
        c += 4;

        a += 8;
        b += 8;
    }

    // Scalar tail: process the remaining (num_points % 8) points
    number = oneEigthPoints * 8;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conj(b)
        lv_32fc_t temp = aVal * bVal;

        *cFloatPtr++ = lv_creal(temp) / scalar;
        *cFloatPtr++ = lv_cimag(temp) / scalar;
    }
}
133 #endif /* LV_HAVE_AVX2*/
134 
135 
136 #ifdef LV_HAVE_SSE4_1
137 #include <smmintrin.h>
138 
/*!
 * Multiply the 8-bit complex vector a by the conjugate of the 8-bit complex
 * vector b, scale by 1/scalar, and emit 32-bit float complex results:
 *   c[i] = (a[i] * conj(b[i])) / scalar
 * Aligned (a_) variant: cVector must satisfy the 16-byte alignment required
 * by _mm_store_ps.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
                                                  const lv_8sc_t* aVector,
                                                  const lv_8sc_t* bVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4; // 4 complex points per pass

    __m128i x, y, realz, imagz;
    __m128 ret;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;
    // Alternating (+1, -1) pattern used by _mm_sign_epi16 to negate the
    // imaginary 16-bit lanes of b (forming conj(b))
    __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);

    __m128 invScalar = _mm_set_ps1(1.0 / scalar); // multiply instead of divide

    for (; number < quarterPoints; number++) {
        // Sign-extend 4 interleaved 8-bit complex values (8 bytes) to 16 bits
        x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
        y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));

        // madd of (ar, ai)·(br, bi) pairs: ar*br + ai*bi == Re(a * conj(b)),
        // accumulated in 32 bits (no overflow for 8-bit inputs)
        realz = _mm_madd_epi16(x, y);

        // Negate the imaginary lanes of b: y = (br, -bi) == conj(b)
        y = _mm_sign_epi16(y, conjugateSign);

        // Swap real/imag within each complex pair: y = (-bi, br)
        y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                _MM_SHUFFLE(2, 3, 0, 1));

        // madd of (ar, ai)·(-bi, br) pairs: ar*(-bi) + ai*br == Im(a * conj(b))
        imagz = _mm_madd_epi16(x, y);

        // Interleave the low real/imag 32-bit halves and convert to float
        ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));

        // Normalize the floating point values
        ret = _mm_mul_ps(ret, invScalar);

        // Store two complex results (4 floats)
        _mm_store_ps((float*)c, ret);
        c += 2;

        // Interleave the high real/imag halves and convert to float
        ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));

        // Normalize the floating point values
        ret = _mm_mul_ps(ret, invScalar);

        // Store the remaining two complex results
        _mm_store_ps((float*)c, ret);
        c += 2;

        a += 4;
        b += 4;
    }

    // Scalar tail: process the remaining (num_points % 4) points
    number = quarterPoints * 4;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conj(b)
        lv_32fc_t temp = aVal * bVal;

        *cFloatPtr++ = lv_creal(temp) / scalar;
        *cFloatPtr++ = lv_cimag(temp) / scalar;
    }
}
217 #endif /* LV_HAVE_SSE4_1 */
218 
219 
220 #ifdef LV_HAVE_GENERIC
221 
222 static inline void
224  const lv_8sc_t* aVector,
225  const lv_8sc_t* bVector,
226  const float scalar,
227  unsigned int num_points)
228 {
229  unsigned int number = 0;
230  float* cPtr = (float*)cVector;
231  const float invScalar = 1.0 / scalar;
232  int8_t* a8Ptr = (int8_t*)aVector;
233  int8_t* b8Ptr = (int8_t*)bVector;
234  for (number = 0; number < num_points; number++) {
235  float aReal = (float)*a8Ptr++;
236  float aImag = (float)*a8Ptr++;
237  lv_32fc_t aVal = lv_cmake(aReal, aImag);
238  float bReal = (float)*b8Ptr++;
239  float bImag = (float)*b8Ptr++;
240  lv_32fc_t bVal = lv_cmake(bReal, -bImag);
241  lv_32fc_t temp = aVal * bVal;
242 
243  *cPtr++ = (lv_creal(temp) * invScalar);
244  *cPtr++ = (lv_cimag(temp) * invScalar);
245  }
246 }
247 #endif /* LV_HAVE_GENERIC */
248 
249 
250 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
251 
252 #ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
253 #define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
254 
255 #include <inttypes.h>
256 #include <stdio.h>
257 #include <volk/volk_complex.h>
258 
259 #ifdef LV_HAVE_AVX2
260 #include <immintrin.h>
261 
/*!
 * Multiply the 8-bit complex vector a by the conjugate of the 8-bit complex
 * vector b, scale by 1/scalar, and emit 32-bit float complex results:
 *   c[i] = (a[i] * conj(b[i])) / scalar
 * Unaligned (u_) variant: uses _mm_loadu_si128 / _mm256_storeu_ps, so no
 * alignment requirement on the buffers.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int oneEigthPoints = num_points / 8; // 8 complex points per pass

    __m256i x, y, realz, imagz;
    __m256 ret, retlo, rethi;
    lv_32fc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;
    // Alternating (+1, -1) pattern used by _mm256_sign_epi16 to negate the
    // imaginary 16-bit lanes of b (forming conj(b))
    __m256i conjugateSign =
        _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);

    __m256 invScalar = _mm256_set1_ps(1.0 / scalar); // multiply instead of divide

    for (; number < oneEigthPoints; number++) {
        // Sign-extend 8 interleaved 8-bit complex values (16 bytes) to 16 bits
        x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
        y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));

        // madd of (ar, ai)·(br, bi) pairs: ar*br + ai*bi == Re(a * conj(b)),
        // accumulated in 32 bits (no overflow for 8-bit inputs)
        realz = _mm256_madd_epi16(x, y);

        // Negate the imaginary lanes of b: y = (br, -bi) == conj(b)
        y = _mm256_sign_epi16(y, conjugateSign);

        // Swap real/imag within each complex pair: y = (-bi, br)
        y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
                                   _MM_SHUFFLE(2, 3, 0, 1));

        // madd of (ar, ai)·(-bi, br) pairs: ar*(-bi) + ai*br == Im(a * conj(b))
        imagz = _mm256_madd_epi16(x, y);

        // Interleave the low real/imag 32-bit halves (per 128-bit lane) and
        // convert to float
        retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));

        // Normalize the floating point values
        retlo = _mm256_mul_ps(retlo, invScalar);

        // Interleave the high real/imag halves and convert to float
        rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));

        // Normalize the floating point values
        rethi = _mm256_mul_ps(rethi, invScalar);

        // unpack{lo,hi}_epi32 operate within 128-bit lanes, so recombine the
        // lanes into sequential order before storing
        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
        _mm256_storeu_ps((float*)c, ret);
        c += 4;

        ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
        _mm256_storeu_ps((float*)c, ret);
        c += 4;

        a += 8;
        b += 8;
    }

    // Scalar tail: process the remaining (num_points % 8) points
    number = oneEigthPoints * 8;
    float* cFloatPtr = (float*)&cVector[number];
    int8_t* a8Ptr = (int8_t*)&aVector[number];
    int8_t* b8Ptr = (int8_t*)&bVector[number];
    for (; number < num_points; number++) {
        float aReal = (float)*a8Ptr++;
        float aImag = (float)*a8Ptr++;
        lv_32fc_t aVal = lv_cmake(aReal, aImag);
        float bReal = (float)*b8Ptr++;
        float bImag = (float)*b8Ptr++;
        lv_32fc_t bVal = lv_cmake(bReal, -bImag); // conj(b)
        lv_32fc_t temp = aVal * bVal;

        *cFloatPtr++ = lv_creal(temp) / scalar;
        *cFloatPtr++ = lv_cimag(temp) / scalar;
    }
}
341 #endif /* LV_HAVE_AVX2*/
342 
343 
344 #ifdef LV_HAVE_RVV
345 #include <riscv_vector.h>
346 
/*!
 * RISC-V Vector implementation of c[i] = (a[i] * conj(b[i])) / scalar for
 * interleaved 8-bit complex inputs and 32-bit float complex output.
 */
static inline void volk_8ic_x2_s32f_multiply_conjugate_32fc_rvv(lv_32fc_t* cVector,
                                                                const lv_8sc_t* aVector,
                                                                const lv_8sc_t* bVector,
                                                                const float scalar,
                                                                unsigned int num_points)
{
    size_t n = num_points;
    // Strip-mined loop: vsetvl picks how many complex points fit per pass
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e8m1(n);
        // Load each interleaved (re, im) int8 pair as one int16 element;
        // assumes little-endian so the low byte is the real part — the
        // narrowing shifts below rely on this
        vint16m2_t va = __riscv_vle16_v_i16m2((const int16_t*)aVector, vl);
        vint16m2_t vb = __riscv_vle16_v_i16m2((const int16_t*)bVector, vl);
        // Narrowing shift-right splits the pair: >>0 keeps real, >>8 keeps imag
        vint8m1_t var = __riscv_vnsra(va, 0, vl), vai = __riscv_vnsra(va, 8, vl);
        vint8m1_t vbr = __riscv_vnsra(vb, 0, vl), vbi = __riscv_vnsra(vb, 8, vl);
        // real = ar*br + ai*bi == Re(a * conj(b)).
        // NOTE(review): accumulated in int16; (-128)*(-128)*2 == 32768 wraps —
        // confirm inputs never hit -128 in all four components
        vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
        // imag = ai*br - ar*bi == Im(a * conj(b))
        vint16m2_t vi =
            __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
        // Widen to float and scale by 1/scalar
        vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
        vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
        // Re-interleave (re, im) by packing each pair into one u64:
        // re + im + (2^32 - 1)*im == re | (im << 32), then store as u64
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vrf);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vif);
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64((uint64_t*)cVector, v, vl);
    }
}
372 #endif /*LV_HAVE_RVV*/
373 
374 #ifdef LV_HAVE_RVVSEG
375 #include <riscv_vector.h>
376 
/*!
 * RISC-V Vector (segment load/store) implementation of
 * c[i] = (a[i] * conj(b[i])) / scalar; segment ops deinterleave/reinterleave
 * the (re, im) pairs instead of shift/pack tricks.
 */
static inline void
volk_8ic_x2_s32f_multiply_conjugate_32fc_rvvseg(lv_32fc_t* cVector,
                                                const lv_8sc_t* aVector,
                                                const lv_8sc_t* bVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    size_t n = num_points;
    // Strip-mined loop: vsetvl picks how many complex points fit per pass
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e8m1(n);
        // Segment-2 loads split interleaved (re, im) int8 pairs into two
        // separate registers
        vint8m1x2_t va = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)aVector, vl);
        vint8m1x2_t vb = __riscv_vlseg2e8_v_i8m1x2((const int8_t*)bVector, vl);
        vint8m1_t var = __riscv_vget_i8m1(va, 0), vai = __riscv_vget_i8m1(va, 1);
        vint8m1_t vbr = __riscv_vget_i8m1(vb, 0), vbi = __riscv_vget_i8m1(vb, 1);
        // real = ar*br + ai*bi == Re(a * conj(b)).
        // NOTE(review): accumulated in int16; (-128)*(-128)*2 == 32768 wraps —
        // confirm inputs never hit -128 in all four components
        vint16m2_t vr = __riscv_vwmacc(__riscv_vwmul(var, vbr, vl), vai, vbi, vl);
        // imag = ai*br - ar*bi == Im(a * conj(b))
        vint16m2_t vi =
            __riscv_vsub(__riscv_vwmul(vai, vbr, vl), __riscv_vwmul(var, vbi, vl), vl);
        // Widen to float and scale by 1/scalar
        vfloat32m4_t vrf = __riscv_vfmul(__riscv_vfwcvt_f(vr, vl), 1.0 / scalar, vl);
        vfloat32m4_t vif = __riscv_vfmul(__riscv_vfwcvt_f(vi, vl), 1.0 / scalar, vl);
        // Segment-2 store re-interleaves (re, im) into the output
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vrf, vif), vl);
    }
}
400 
401 #endif /*LV_HAVE_RVVSEG*/
402 
403 #endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_x2_s32f_multiply_conjugate_32fc.h:223
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_cmake(r, i)
Definition: volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74