Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_64u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
53 #ifndef INCLUDED_volk_64u_byteswap_u_H
54 #define INCLUDED_volk_64u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_SSE2
60 #include <emmintrin.h>
61 
/*
 * Reverses the byte order of every 64-bit value in intsToSwap, in place,
 * using SSE2. Unaligned loads/stores are used, so intsToSwap needs no
 * particular alignment.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit values to process.
 */
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int halfPoints = num_points / 2;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Two 64-bit values fit in one 128-bit register.
    for (number = 0; number < halfPoints; number++) {
        // Load first, advance dataPtr only after the store: the swap is in-place.
        const __m128i input = _mm_loadu_si128((__m128i*)dataPtr);

        // Move each byte of every 32-bit word to its mirrored position.
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        // Exchange the two 32-bit words of each 64-bit value to finish the swap.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap any remaining point. The value is handled as a uint64_t (not as
    // two uint32_t halves through a casted pointer) to respect strict aliasing.
    for (number = halfPoints * 2; number < num_points; number++) {
        uint64_t value = *dataPtr;
        // Swap adjacent bytes, then adjacent 16-bit pairs, then the 32-bit halves.
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
110 #endif /* LV_HAVE_SSE2 */
111 
112 
113 #ifdef LV_HAVE_GENERIC
114 
/*
 * Reverses the byte order of every 64-bit value in intsToSwap, in place.
 * Portable C implementation.
 *
 * Each element is read and written as a uint64_t. Viewing the buffer through
 * a uint32_t pointer would give the same bytes, but accessing uint64_t objects
 * through a uint32_t lvalue violates C's strict-aliasing rule.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit values to process.
 */
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    for (point = 0; point < num_points; point++) {
        uint64_t value = intsToSwap[point];
        // Swap adjacent bytes, then adjacent 16-bit pairs, then the 32-bit halves.
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        intsToSwap[point] = (value << 32) | (value >> 32);
    }
}
134 #endif /* LV_HAVE_GENERIC */
135 
136 #if LV_HAVE_AVX2
137 #include <immintrin.h>
138 static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
139 {
140  unsigned int number = 0;
141 
142  const unsigned int nPerSet = 4;
143  const uint64_t nSets = num_points / nPerSet;
144 
145  uint32_t* inputPtr = (uint32_t*)intsToSwap;
146 
147  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
148  12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
149  17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
150 
151  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
152 
153  for (; number < nSets; number++) {
154 
155  // Load the 32t values, increment inputPtr later since we're doing it in-place.
156  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
157  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
158 
159  // Store the results
160  _mm256_store_si256((__m256i*)inputPtr, output);
161 
162  /* inputPtr is 32bit so increment twice */
163  inputPtr += 2 * nPerSet;
164  }
165 
166  // Byteswap any remaining points:
167  for (number = nSets * nPerSet; number < num_points; ++number) {
168  uint32_t output1 = *inputPtr;
169  uint32_t output2 = inputPtr[1];
170  uint32_t out1 =
171  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
172  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
173 
174  uint32_t out2 =
175  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
176  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
177  *inputPtr++ = out2;
178  *inputPtr++ = out1;
179  }
180 }
181 
182 #endif /* LV_HAVE_AVX2 */
183 
184 
185 #if LV_HAVE_SSSE3
186 #include <tmmintrin.h>
187 static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
188  unsigned int num_points)
189 {
190  unsigned int number = 0;
191 
192  const unsigned int nPerSet = 2;
193  const uint64_t nSets = num_points / nPerSet;
194 
195  uint32_t* inputPtr = (uint32_t*)intsToSwap;
196 
197  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
198 
199  const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
200 
201  for (; number < nSets; number++) {
202 
203  // Load the 32t values, increment inputPtr later since we're doing it in-place.
204  const __m128i input = _mm_load_si128((__m128i*)inputPtr);
205  const __m128i output = _mm_shuffle_epi8(input, myShuffle);
206 
207  // Store the results
208  _mm_store_si128((__m128i*)inputPtr, output);
209 
210  /* inputPtr is 32bit so increment twice */
211  inputPtr += 2 * nPerSet;
212  }
213 
214  // Byteswap any remaining points:
215  for (number = nSets * nPerSet; number < num_points; ++number) {
216  uint32_t output1 = *inputPtr;
217  uint32_t output2 = inputPtr[1];
218  uint32_t out1 =
219  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
220  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
221 
222  uint32_t out2 =
223  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
224  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
225  *inputPtr++ = out2;
226  *inputPtr++ = out1;
227  }
228 }
229 #endif /* LV_HAVE_SSSE3 */
230 #endif /* INCLUDED_volk_64u_byteswap_u_H */
231 
232 
233 #ifndef INCLUDED_volk_64u_byteswap_a_H
234 #define INCLUDED_volk_64u_byteswap_a_H
235 
236 #include <inttypes.h>
237 #include <stdio.h>
238 
239 #ifdef LV_HAVE_SSE2
240 #include <emmintrin.h>
241 
/*
 * Reverses the byte order of every 64-bit value in intsToSwap, in place,
 * using SSE2. Aligned 128-bit loads/stores are used, so intsToSwap must be
 * 16-byte aligned.
 *
 * intsToSwap: buffer of num_points 64-bit values; overwritten with the result.
 * num_points: number of 64-bit values to process.
 */
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
{
    const unsigned int halfPoints = num_points / 2;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    uint64_t* dataPtr = intsToSwap;
    unsigned int number;

    // Two 64-bit values fit in one 128-bit register.
    for (number = 0; number < halfPoints; number++) {
        // Load first, advance dataPtr only after the store: the swap is in-place.
        const __m128i input = _mm_load_si128((__m128i*)dataPtr);

        // Move each byte of every 32-bit word to its mirrored position.
        const __m128i byte1 = _mm_slli_epi32(input, 24);
        const __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
        const __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
        const __m128i byte4 = _mm_srli_epi32(input, 24);
        __m128i output =
            _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));

        // Exchange the two 32-bit words of each 64-bit value to finish the swap.
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)dataPtr, output);
        dataPtr += 2;
    }

    // Byteswap any remaining point. The value is handled as a uint64_t (not as
    // two uint32_t halves through a casted pointer) to respect strict aliasing.
    for (number = halfPoints * 2; number < num_points; number++) {
        uint64_t value = *dataPtr;
        // Swap adjacent bytes, then adjacent 16-bit pairs, then the 32-bit halves.
        value = ((value & 0x00ff00ff00ff00ffULL) << 8) |
                ((value >> 8) & 0x00ff00ff00ff00ffULL);
        value = ((value & 0x0000ffff0000ffffULL) << 16) |
                ((value >> 16) & 0x0000ffff0000ffffULL);
        *dataPtr++ = (value << 32) | (value >> 32);
    }
}
290 #endif /* LV_HAVE_SSE2 */
291 
292 #if LV_HAVE_AVX2
293 #include <immintrin.h>
294 static inline void volk_64u_byteswap_u_avx2(uint64_t* intsToSwap, unsigned int num_points)
295 {
296  unsigned int number = 0;
297 
298  const unsigned int nPerSet = 4;
299  const uint64_t nSets = num_points / nPerSet;
300 
301  uint32_t* inputPtr = (uint32_t*)intsToSwap;
302 
303  const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
304  12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
305  17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
306 
307  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
308 
309  for (; number < nSets; number++) {
310  // Load the 32t values, increment inputPtr later since we're doing it in-place.
311  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
312  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
313 
314  // Store the results
315  _mm256_storeu_si256((__m256i*)inputPtr, output);
316 
317  /* inputPtr is 32bit so increment twice */
318  inputPtr += 2 * nPerSet;
319  }
320 
321  // Byteswap any remaining points:
322  for (number = nSets * nPerSet; number < num_points; ++number) {
323  uint32_t output1 = *inputPtr;
324  uint32_t output2 = inputPtr[1];
325  uint32_t out1 =
326  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
327  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
328 
329  uint32_t out2 =
330  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
331  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
332  *inputPtr++ = out2;
333  *inputPtr++ = out1;
334  }
335 }
336 
337 #endif /* LV_HAVE_AVX2 */
338 
339 
340 #if LV_HAVE_SSSE3
341 #include <tmmintrin.h>
342 static inline void volk_64u_byteswap_u_ssse3(uint64_t* intsToSwap,
343  unsigned int num_points)
344 {
345  unsigned int number = 0;
346 
347  const unsigned int nPerSet = 2;
348  const uint64_t nSets = num_points / nPerSet;
349 
350  uint32_t* inputPtr = (uint32_t*)intsToSwap;
351 
352  uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
353 
354  const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
355 
356  for (; number < nSets; number++) {
357  // Load the 32t values, increment inputPtr later since we're doing it in-place.
358  const __m128i input = _mm_loadu_si128((__m128i*)inputPtr);
359  const __m128i output = _mm_shuffle_epi8(input, myShuffle);
360 
361  // Store the results
362  _mm_storeu_si128((__m128i*)inputPtr, output);
363 
364  /* inputPtr is 32bit so increment twice */
365  inputPtr += 2 * nPerSet;
366  }
367 
368  // Byteswap any remaining points:
369  for (number = nSets * nPerSet; number < num_points; ++number) {
370  uint32_t output1 = *inputPtr;
371  uint32_t output2 = inputPtr[1];
372  uint32_t out1 =
373  ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
374  (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
375 
376  uint32_t out2 =
377  ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
378  (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
379  *inputPtr++ = out2;
380  *inputPtr++ = out1;
381  }
382 }
383 #endif /* LV_HAVE_SSSE3 */
384 
385 
386 #endif /* INCLUDED_volk_64u_byteswap_a_H */
static void volk_64u_byteswap_a_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:187
static void volk_64u_byteswap_a_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:242
static void volk_64u_byteswap_u_ssse3(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:342
static void volk_64u_byteswap_u_sse2(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:62
static void volk_64u_byteswap_generic(uint64_t *intsToSwap, unsigned int num_points)
Definition: volk_64u_byteswap.h:115