/*
 * Aligned AVX2 kernel: for every complex sample (a pair of 16-bit integers)
 * write the high byte of the first 16-bit component to iBuffer.
 *
 * iBuffer     - destination, one int8_t per complex input point.
 * num_points  - number of complex points to process.
 * The source pointer `complexVector` is used below; its parameter line is
 * not present in this listing.
 *
 * Uses _mm256_load_si256 / _mm256_store_si256, i.e. the aligned load/store
 * intrinsics, so both buffers are accessed as 32-byte aligned vectors.
 *
 * NOTE(review): this text is a source listing with embedded line numbers and
 * missing lines (parameter line, braces, mask arguments, output pointer
 * advance). Confirm every gap flagged below against the original header.
 */
54 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H 55 #define INCLUDED_volk_16ic_deinterleave_real_8i_a_H 62 #include <immintrin.h> 64 static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
66 unsigned int num_points)
68 unsigned int number = 0;
/* The input is walked as raw bytes so each 32-byte vector load can step the
 * pointer by 32. */
69 const int8_t* complexVectorPtr = (int8_t*)complexVector;
70 int8_t* iBufferPtr = iBuffer;
/* Byte-shuffle control vectors. Only the first argument of each is visible in
 * this listing. NOTE(review): presumably they select the first 16-bit
 * component of each pair into one 64-bit half of every 128-bit lane and zero
 * the other half (as the SSSE3 masks in this file do) - confirm. */
71 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
103 __m256i iMoveMask2 = _mm256_set_epi8(13,
135 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Each vector iteration consumes 4 x 32 bytes = 32 complex points. */
137 unsigned int thirtysecondPoints = num_points / 32;
139 for (number = 0; number < thirtysecondPoints; number++) {
/* Load four consecutive 32-byte vectors of interleaved input. */
140 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
141 complexVectorPtr += 32;
142 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
143 complexVectorPtr += 32;
145 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
146 complexVectorPtr += 32;
147 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
148 complexVectorPtr += 32;
/* Shuffle each pair of vectors with complementary masks and OR them into one
 * vector. _mm256_shuffle_epi8 works within each 128-bit lane, so the 0xd8
 * permute then reorders the 64-bit quarters as 0,2,1,3. */
150 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
151 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
153 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
154 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
156 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
157 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
159 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
160 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
/* Arithmetic shift right by 8 keeps the sign-extended high byte of every
 * 16-bit element, so each value now fits in int8_t. */
162 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
163 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
/* Narrow 16-bit -> 8-bit with signed saturation (never triggered after the
 * shift above). The pack is also per 128-bit lane, hence the second 0xd8
 * permute of the 64-bit quarters. */
165 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
166 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
/* Store 32 output bytes. NOTE(review): the advance of iBufferPtr and the
 * loop's closing brace are not present in this listing (embedded numbering
 * jumps from 168 to 173) - confirm against the original file. */
168 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
/* Scalar tail: handle the num_points % 32 points left after the vector loop. */
173 number = thirtysecondPoints * 32;
/* NOTE(review): this cast drops the const qualifier that complexVectorPtr
 * carries. */
174 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
175 for (; number < num_points; number++) {
/* Emit the high byte of the first 16-bit component, then skip the second
 * component of the pair. */
176 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
177 int16ComplexVectorPtr++;
/*
 * Aligned SSSE3 kernel: for every complex sample (a pair of 16-bit integers)
 * write the high byte of the first 16-bit component to iBuffer.
 *
 * NOTE(review): the line carrying the function name and first parameters is
 * missing from this listing; the cross-reference at the end of the listing
 * places volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
 * const lv_16sc_t* complexVector, unsigned int num_points) at this position.
 * Braces and the output pointer advance are also not visible.
 *
 * Uses _mm_load_si128 / _mm_store_si128, the aligned 16-byte load/store
 * intrinsics.
 */
184 #include <tmmintrin.h> 188 unsigned int num_points)
190 unsigned int number = 0;
/* The input is walked as raw bytes so each 16-byte vector load can step the
 * pointer by 16. */
191 const int8_t* complexVectorPtr = (int8_t*)complexVector;
192 int8_t* iBufferPtr = iBuffer;
/* _mm_set_epi8 lists bytes from most to least significant. iMoveMask1 moves
 * source bytes 0,1,4,5,8,9,12,13 (the first 16-bit component of each of the
 * four pairs) into the low 8 bytes; 0x80 entries zero the high 8 bytes. */
193 __m128i iMoveMask1 = _mm_set_epi8(
194 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
/* iMoveMask2 selects the same source bytes into the high 8 bytes and zeroes
 * the low 8 bytes, so the two shuffled vectors can be merged with OR. */
195 __m128i iMoveMask2 = _mm_set_epi8(
196 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
197 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Each vector iteration consumes 4 x 16 bytes = 16 complex points. */
199 unsigned int sixteenthPoints = num_points / 16;
201 for (number = 0; number < sixteenthPoints; number++) {
/* Load four consecutive 16-byte vectors of interleaved input. */
202 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
203 complexVectorPtr += 16;
204 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
205 complexVectorPtr += 16;
207 complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
208 complexVectorPtr += 16;
209 complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
210 complexVectorPtr += 16;
/* Gather the first components of vectors 1+2 into one vector of eight
 * 16-bit values. */
212 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
213 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
215 complexVal1 = _mm_or_si128(complexVal1, complexVal2);
/* Same for vectors 3+4. */
217 complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
218 complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
220 complexVal3 = _mm_or_si128(complexVal3, complexVal4);
/* Arithmetic shift right by 8 keeps the sign-extended high byte of every
 * 16-bit element, so each value now fits in int8_t. */
223 complexVal1 = _mm_srai_epi16(complexVal1, 8);
224 complexVal3 = _mm_srai_epi16(complexVal3, 8);
/* Narrow 2 x 8 16-bit values to 16 bytes with signed saturation (never
 * triggered after the shift above). */
226 iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
/* Store 16 output bytes. NOTE(review): the advance of iBufferPtr and the
 * loop's closing brace are not present in this listing (embedded numbering
 * jumps from 228 to 233) - confirm against the original file. */
228 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
/* Scalar tail: handle the num_points % 16 points left after the vector loop. */
233 number = sixteenthPoints * 16;
/* NOTE(review): this cast drops the const qualifier that complexVectorPtr
 * carries. */
234 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
235 for (; number < num_points; number++) {
/* Emit the high byte of the first 16-bit component, then skip the second
 * component of the pair. */
236 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
237 int16ComplexVectorPtr++;
/*
 * Portable scalar kernel, compiled when LV_HAVE_GENERIC is defined: one
 * output byte per point, taken from the high byte of a 16-bit input value.
 *
 * NOTE(review): the line carrying the function name and first parameters is
 * missing from this listing; the cross-reference at the end of the listing
 * places volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
 * const lv_16sc_t* complexVector, unsigned int num_points) at this position.
 */
242 #ifdef LV_HAVE_GENERIC 246 unsigned int num_points)
248 unsigned int number = 0;
/* View the complex input as a flat array of 16-bit values.
 * NOTE(review): the cast drops the const qualifier of the source pointer. */
249 int16_t* complexVectorPtr = (int16_t*)complexVector;
250 int8_t* iBufferPtr = iBuffer;
251 for (number = 0; number < num_points; number++) {
/* Keep the high byte of the current 16-bit value. The right shift of a
 * negative value relies on the compiler implementing an arithmetic shift.
 * NOTE(review): the statement that skips the second component of each pair
 * is not present in this listing (embedded numbering stops at 252) - confirm
 * against the original file. */
252 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
/*
 * ARM NEON kernel: processes 8 complex points per vector iteration and
 * writes the high byte of the first 16-bit component of each point.
 *
 * NOTE(review): the line carrying the function name and first parameters is
 * missing from this listing; the cross-reference at the end of the listing
 * places volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
 * const lv_16sc_t* complexVector, unsigned int num_points) at this position.
 * The declarations of `number` and `realOutput` are also not visible here.
 */
259 #include <arm_neon.h> 263 unsigned int num_points)
/* View the complex input as a flat array of 16-bit values. */
265 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
266 int8_t* iBufferPtr = iBuffer;
/* Each vector iteration consumes 16 int16 values = 8 complex points. */
267 unsigned int eighth_points = num_points / 8;
270 int16x8x2_t complexInput;
272 for (number = 0; number < eighth_points; number++) {
/* vld2q_s16 is a de-interleaving load: val[0] receives the even-indexed
 * 16-bit elements (the first component of each pair), val[1] the odd ones. */
273 complexInput = vld2q_s16(complexVectorPtr);
/* Shift each first component right by 8 and narrow to 8 bits. */
274 realOutput = vshrn_n_s16(complexInput.val[0], 8);
275 vst1_s8(iBufferPtr, realOutput);
/* NOTE(review): the matching advance of iBufferPtr and the loop's closing
 * brace are not present in this listing (embedded numbering jumps from 276
 * to 280) - confirm against the original file. */
276 complexVectorPtr += 16;
/* Scalar tail for the num_points % 8 remaining points. */
280 for (number = eighth_points * 8; number < num_points; number++) {
/* NOTE(review): the statement that skips the second component of each pair
 * is not present in this listing - confirm against the original file. */
281 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
/*
 * ORC-backed variant. The implementation is declared extern here and
 * defined elsewhere; the wrapper below forwards its three arguments to it
 * unchanged.
 *
 * NOTE(review): the middle parameter line of both signatures and the braces
 * are missing from this listing. The wrapper is named "_u_orc" while the
 * symbol it calls is "_a_orc_impl" - confirm that this naming is intended.
 */
289 extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
291 unsigned int num_points);
293 static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
295 unsigned int num_points)
/* Straight delegation; no work is done in the wrapper itself. */
297 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
/*
 * Unaligned AVX2 kernel: for every complex sample (a pair of 16-bit
 * integers) write the high byte of the first 16-bit component to iBuffer.
 *
 * iBuffer     - destination, one int8_t per complex input point.
 * num_points  - number of complex points to process.
 * The source pointer `complexVector` is used below; its parameter line is
 * not present in this listing.
 *
 * Uses _mm256_loadu_si256 / _mm256_storeu_si256, the unaligned load/store
 * intrinsics, so neither buffer has to be 32-byte aligned.
 *
 * NOTE(review): this text is a source listing with embedded line numbers and
 * missing lines (parameter line, braces, mask arguments, output pointer
 * advance). Confirm every gap flagged below against the original header.
 */
304 #ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H 305 #define INCLUDED_volk_16ic_deinterleave_real_8i_u_H 307 #include <inttypes.h> 312 #include <immintrin.h> 314 static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
316 unsigned int num_points)
318 unsigned int number = 0;
/* The input is walked as raw bytes so each 32-byte vector load can step the
 * pointer by 32. */
319 const int8_t* complexVectorPtr = (int8_t*)complexVector;
320 int8_t* iBufferPtr = iBuffer;
/* Byte-shuffle control vectors. Only the first argument of each is visible in
 * this listing. NOTE(review): presumably they select the first 16-bit
 * component of each pair into one 64-bit half of every 128-bit lane and zero
 * the other half - confirm. */
321 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
353 __m256i iMoveMask2 = _mm256_set_epi8(13,
385 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
/* Each vector iteration consumes 4 x 32 bytes = 32 complex points. */
387 unsigned int thirtysecondPoints = num_points / 32;
389 for (number = 0; number < thirtysecondPoints; number++) {
/* Load four consecutive 32-byte vectors of interleaved input (unaligned). */
390 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
391 complexVectorPtr += 32;
392 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
393 complexVectorPtr += 32;
395 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
396 complexVectorPtr += 32;
397 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
398 complexVectorPtr += 32;
/* Shuffle each pair of vectors with complementary masks and OR them into one
 * vector. _mm256_shuffle_epi8 works within each 128-bit lane, so the 0xd8
 * permute then reorders the 64-bit quarters as 0,2,1,3. */
400 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
401 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
403 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
404 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
406 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
407 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
409 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
410 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
/* Arithmetic shift right by 8 keeps the sign-extended high byte of every
 * 16-bit element, so each value now fits in int8_t. */
412 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
413 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
/* Narrow 16-bit -> 8-bit with signed saturation (never triggered after the
 * shift above). The pack is also per 128-bit lane, hence the second 0xd8
 * permute of the 64-bit quarters. */
415 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
416 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
/* Store 32 output bytes (unaligned). NOTE(review): the advance of iBufferPtr
 * and the loop's closing brace are not present in this listing (embedded
 * numbering jumps from 418 to 423) - confirm against the original file. */
418 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
/* Scalar tail: handle the num_points % 32 points left after the vector loop. */
423 number = thirtysecondPoints * 32;
/* NOTE(review): this cast drops the const qualifier that complexVectorPtr
 * carries. */
424 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
425 for (; number < num_points; number++) {
/* Emit the high byte of the first 16-bit component, then skip the second
 * component of the pair. */
426 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
427 int16ComplexVectorPtr++;
short complex lv_16sc_t
Definition: volk_complex.h:67
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:186
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:261
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:244