Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_8u_x2_encodeframepolar_8u.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 /*
11  * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
12  */
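/*
 * Rough call contract, inferred from the implementations below (the header
 * referenced above carries the authoritative documentation): 'temp' holds
 * the input frame, one unsigned char per code bit, and doubles as scratch;
 * 'frame' receives the encoded output. The encoder runs log2(frame_size)
 * butterfly stages and copies the intermediate result back into 'temp'
 * after most stages, so callers should assume both buffers are clobbered.
 * frame_size is assumed to be a power of two.
 */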
13 
14 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
15 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
16 #include <string.h>
17 
18 static inline unsigned int log2_of_power_of_2(unsigned int val)
19 {
20  // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
21  static const unsigned int b[] = {
22  0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
23  };
24 
25  unsigned int res = (val & b[0]) != 0;
26  res |= ((val & b[4]) != 0) << 4;
27  res |= ((val & b[3]) != 0) << 3;
28  res |= ((val & b[2]) != 0) << 2;
29  res |= ((val & b[1]) != 0) << 1;
30  return res;
31 }
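/*
 * Worked example: for val = 32 = 0x20 only bit 5 is set. Bit 5 is covered
 * by b[0] (0xAAAAAAAA) and b[2] (0xF0F0F0F0) but not by b[1], b[3] or b[4],
 * so res = (1 << 0) | (1 << 2) = 5 = log2(32). As the name says, val must
 * be a power of two.
 */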
32 
33 static inline void encodepolar_single_stage(unsigned char* frame_ptr,
34  const unsigned char* temp_ptr,
35  const unsigned int num_branches,
36  const unsigned int frame_half)
37 {
38  unsigned int branch, bit;
39  for (branch = 0; branch < num_branches; ++branch) {
40  for (bit = 0; bit < frame_half; ++bit) {
41  *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
42  *(frame_ptr + frame_half) = *(temp_ptr + 1);
43  ++frame_ptr;
44  temp_ptr += 2;
45  }
46  frame_ptr += frame_half;
47  }
48 }
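/*
 * Each inner iteration is one polar encoder butterfly: for the input pair
 * (u0, u1) read from temp_ptr it writes u0 ^ u1 into the first half of the
 * current branch and u1 into the second half. For example, with
 * num_branches = 1 and frame_half = 1, temp = {a, b} yields
 * frame = {a ^ b, b}.
 */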
49 
50 #ifdef LV_HAVE_GENERIC
51 
52 static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
53  unsigned char* temp,
54  unsigned int frame_size)
55 {
56  unsigned int stage = log2_of_power_of_2(frame_size);
57  unsigned int frame_half = frame_size >> 1;
58  unsigned int num_branches = 1;
59 
60  while (stage) {
61  // encode stage
62  encodepolar_single_stage(frame, temp, num_branches, frame_half);
63  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
64 
65  // update all the parameters.
66  num_branches = num_branches << 1;
67  frame_half = frame_half >> 1;
68  --stage;
69  }
70 }
71 #endif /* LV_HAVE_GENERIC */
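/*
 * Minimal usage sketch (illustrative only, not part of the kernel set):
 * encode one 8-element frame with the generic protokernel. The helper name
 * and the bit values are made up for this example; real applications would
 * normally call the generated VOLK dispatcher instead of a specific
 * protokernel.
 */
#ifdef LV_HAVE_GENERIC
static inline void encodeframepolar_usage_example(void)
{
    unsigned char temp[8] = { 1, 0, 1, 1, 0, 0, 1, 0 }; /* input, one byte per bit */
    unsigned char frame[8] = { 0 };                     /* receives the encoded frame */

    /* frame_size must be a power of two; both buffers may be overwritten. */
    volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 8);
}
#endif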
72 
73 #ifdef LV_HAVE_SSSE3
74 #include <tmmintrin.h>
75 
76 static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
77  unsigned char* temp,
78  unsigned int frame_size)
79 {
80  if (frame_size < 16) {
81  volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
82  return;
83  }
84 
85  const unsigned int po2 = log2_of_power_of_2(frame_size);
86 
87  unsigned int stage = po2;
88  unsigned char* frame_ptr = frame;
89  unsigned char* temp_ptr = temp;
90 
91  unsigned int frame_half = frame_size >> 1;
92  unsigned int num_branches = 1;
93  unsigned int branch;
94  unsigned int bit;
95 
96  // prepare constants
97  const __m128i mask_stage1 = _mm_set_epi8(0x0,
98  0xFF,
99  0x0,
100  0xFF,
101  0x0,
102  0xFF,
103  0x0,
104  0xFF,
105  0x0,
106  0xFF,
107  0x0,
108  0xFF,
109  0x0,
110  0xFF,
111  0x0,
112  0xFF);
113 
114  // get some SIMD registers to play with.
115  __m128i r_frame0, r_temp0, shifted;
116 
117  {
118  __m128i r_frame1, r_temp1;
119  const __m128i shuffle_separate =
120  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
121 
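/*
 * How the wide stages separate the two output halves: the masked 1-byte
 * shift plus XOR leaves u0 ^ u1 in the even byte positions and u1 in the
 * odd positions, shuffle_separate then gathers the even bytes into the low
 * 8 bytes and the odd bytes into the high 8 bytes of each register, and
 * unpacklo/unpackhi_epi64 route those halves of two registers to frame_ptr
 * and frame_ptr + frame_half respectively.
 */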
122  while (stage > 4) {
123  frame_ptr = frame;
124  temp_ptr = temp;
125 
126  // At stage 5 a branch has 32 elements, so higher stages are even larger.
127  for (branch = 0; branch < num_branches; ++branch) {
128  for (bit = 0; bit < frame_half; bit += 16) {
129  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
130  temp_ptr += 16;
131  r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
132  temp_ptr += 16;
133 
134  shifted = _mm_srli_si128(r_temp0, 1);
135  shifted = _mm_and_si128(shifted, mask_stage1);
136  r_temp0 = _mm_xor_si128(shifted, r_temp0);
137  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
138 
139  shifted = _mm_srli_si128(r_temp1, 1);
140  shifted = _mm_and_si128(shifted, mask_stage1);
141  r_temp1 = _mm_xor_si128(shifted, r_temp1);
142  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
143 
144  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
145  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
146 
147  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
148  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
149  frame_ptr += 16;
150  }
151 
152  frame_ptr += frame_half;
153  }
154  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
155 
156  num_branches = num_branches << 1;
157  frame_half = frame_half >> 1;
158  stage--;
159  }
160  }
161 
162  // This last part requires frames of at least 16 elements (frame_size >= 16).
163  // Smaller frames gain nothing from SIMD anyway; just choose GENERIC!
164 
165  // reset pointers to correct positions.
166  frame_ptr = frame;
167  temp_ptr = temp;
168 
169  // prefetch first chunk
170  __VOLK_PREFETCH(temp_ptr);
171 
172  const __m128i shuffle_stage4 =
173  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
174  const __m128i mask_stage4 = _mm_set_epi8(0x0,
175  0x0,
176  0x0,
177  0x0,
178  0x0,
179  0x0,
180  0x0,
181  0x0,
182  0xFF,
183  0xFF,
184  0xFF,
185  0xFF,
186  0xFF,
187  0xFF,
188  0xFF,
189  0xFF);
190  const __m128i mask_stage3 = _mm_set_epi8(0x0,
191  0x0,
192  0x0,
193  0x0,
194  0xFF,
195  0xFF,
196  0xFF,
197  0xFF,
198  0x0,
199  0x0,
200  0x0,
201  0x0,
202  0xFF,
203  0xFF,
204  0xFF,
205  0xFF);
206  const __m128i mask_stage2 = _mm_set_epi8(0x0,
207  0x0,
208  0xFF,
209  0xFF,
210  0x0,
211  0x0,
212  0xFF,
213  0xFF,
214  0x0,
215  0x0,
216  0xFF,
217  0xFF,
218  0x0,
219  0x0,
220  0xFF,
221  0xFF);
222 
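/*
 * The remaining four stages fit into a single 16-byte register per branch:
 * after the bit-reversal shuffle, XOR-ing each register with a masked copy
 * of itself shifted right by 8, 4, 2 and 1 bytes performs stages 4, 3, 2
 * and 1 in place.
 */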
223  for (branch = 0; branch < num_branches; ++branch) {
224  r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
225 
226  // prefetch next chunk
227  temp_ptr += 16;
228  __VOLK_PREFETCH(temp_ptr);
229 
230  // shuffle once for bit-reversal.
231  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
232 
233  shifted = _mm_srli_si128(r_temp0, 8);
234  shifted = _mm_and_si128(shifted, mask_stage4);
235  r_frame0 = _mm_xor_si128(shifted, r_temp0);
236 
237  shifted = _mm_srli_si128(r_frame0, 4);
238  shifted = _mm_and_si128(shifted, mask_stage3);
239  r_frame0 = _mm_xor_si128(shifted, r_frame0);
240 
241  shifted = _mm_srli_si128(r_frame0, 2);
242  shifted = _mm_and_si128(shifted, mask_stage2);
243  r_frame0 = _mm_xor_si128(shifted, r_frame0);
244 
245  shifted = _mm_srli_si128(r_frame0, 1);
246  shifted = _mm_and_si128(shifted, mask_stage1);
247  r_frame0 = _mm_xor_si128(shifted, r_frame0);
248 
249  // store result of chunk.
250  _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
251  frame_ptr += 16;
252  }
253 }
254 
255 #endif /* LV_HAVE_SSSE3 */
256 
257 #ifdef LV_HAVE_AVX2
258 #include <immintrin.h>
259 
260 static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
261  unsigned char* temp,
262  unsigned int frame_size)
263 {
264  if (frame_size < 32) {
265  volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
266  return;
267  }
268 
269  const unsigned int po2 = log2_of_power_of_2(frame_size);
270 
271  unsigned int stage = po2;
272  unsigned char* frame_ptr = frame;
273  unsigned char* temp_ptr = temp;
274 
275  unsigned int frame_half = frame_size >> 1;
276  unsigned int num_branches = 1;
277  unsigned int branch;
278  unsigned int bit;
279 
280  // prepare constants
281  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
282  0xFF,
283  0x0,
284  0xFF,
285  0x0,
286  0xFF,
287  0x0,
288  0xFF,
289  0x0,
290  0xFF,
291  0x0,
292  0xFF,
293  0x0,
294  0xFF,
295  0x0,
296  0xFF,
297  0x0,
298  0xFF,
299  0x0,
300  0xFF,
301  0x0,
302  0xFF,
303  0x0,
304  0xFF,
305  0x0,
306  0xFF,
307  0x0,
308  0xFF,
309  0x0,
310  0xFF,
311  0x0,
312  0xFF);
313 
314  const __m128i mask_stage0 = _mm_set_epi8(0x0,
315  0xFF,
316  0x0,
317  0xFF,
318  0x0,
319  0xFF,
320  0x0,
321  0xFF,
322  0x0,
323  0xFF,
324  0x0,
325  0xFF,
326  0x0,
327  0xFF,
328  0x0,
329  0xFF);
330  // get some SIMD registers to play with.
331  __m256i r_frame0, r_temp0, shifted;
332  __m128i r_temp2, r_frame2, shifted2;
333  {
334  __m256i r_frame1, r_temp1;
335  __m128i r_frame3, r_temp3;
336  const __m256i shuffle_separate = _mm256_setr_epi8(0,
337  2,
338  4,
339  6,
340  8,
341  10,
342  12,
343  14,
344  1,
345  3,
346  5,
347  7,
348  9,
349  11,
350  13,
351  15,
352  0,
353  2,
354  4,
355  6,
356  8,
357  10,
358  12,
359  14,
360  1,
361  3,
362  5,
363  7,
364  9,
365  11,
366  13,
367  15);
368  const __m128i shuffle_separate128 =
369  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
370 
371  while (stage > 4) {
372  frame_ptr = frame;
373  temp_ptr = temp;
374 
375  // At stage 5 a branch has 32 elements, so higher stages are even larger.
376  for (branch = 0; branch < num_branches; ++branch) {
377  for (bit = 0; bit < frame_half; bit += 32) {
378  if ((frame_half - bit) <
379  32) // only 16 elements remain in this branch half, not 32
380  {
381  r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
382  temp_ptr += 16;
383  r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
384  temp_ptr += 16;
385 
386  shifted2 = _mm_srli_si128(r_temp2, 1);
387  shifted2 = _mm_and_si128(shifted2, mask_stage0);
388  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
389  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
390 
391  shifted2 = _mm_srli_si128(r_temp3, 1);
392  shifted2 = _mm_and_si128(shifted2, mask_stage0);
393  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
394  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
395 
396  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
397  _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
398 
399  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
400  _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
401  frame_ptr += 16;
402  break;
403  }
404  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
405  temp_ptr += 32;
406  r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
407  temp_ptr += 32;
408 
409  shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
410  shifted = _mm256_and_si256(shifted, mask_stage1);
411  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
412  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
413 
414  shifted = _mm256_srli_si256(r_temp1, 1);
415  shifted = _mm256_and_si256(shifted, mask_stage1);
416  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
417  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
418 
419  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
420  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
421  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
422  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
423 
424  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
425 
426  _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
427  frame_ptr += 32;
428  }
429 
430  frame_ptr += frame_half;
431  }
432  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
433 
434  num_branches = num_branches << 1;
435  frame_half = frame_half >> 1;
436  stage--;
437  }
438  }
439 
440  // This last part requires frames of at least 32 elements (frame_size >= 32).
441  // Smaller frames gain nothing from SIMD anyway; just choose GENERIC!
442 
443  // reset pointers to correct positions.
444  frame_ptr = frame;
445  temp_ptr = temp;
446 
447  // prefetch first chunk
448  __VOLK_PREFETCH(temp_ptr);
449 
450  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
451  8,
452  4,
453  12,
454  2,
455  10,
456  6,
457  14,
458  1,
459  9,
460  5,
461  13,
462  3,
463  11,
464  7,
465  15,
466  0,
467  8,
468  4,
469  12,
470  2,
471  10,
472  6,
473  14,
474  1,
475  9,
476  5,
477  13,
478  3,
479  11,
480  7,
481  15);
482  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
483  0x0,
484  0x0,
485  0x0,
486  0x0,
487  0x0,
488  0x0,
489  0x0,
490  0xFF,
491  0xFF,
492  0xFF,
493  0xFF,
494  0xFF,
495  0xFF,
496  0xFF,
497  0xFF,
498  0x0,
499  0x0,
500  0x0,
501  0x0,
502  0x0,
503  0x0,
504  0x0,
505  0x0,
506  0xFF,
507  0xFF,
508  0xFF,
509  0xFF,
510  0xFF,
511  0xFF,
512  0xFF,
513  0xFF);
514  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
515  0x0,
516  0x0,
517  0x0,
518  0xFF,
519  0xFF,
520  0xFF,
521  0xFF,
522  0x0,
523  0x0,
524  0x0,
525  0x0,
526  0xFF,
527  0xFF,
528  0xFF,
529  0xFF,
530  0x0,
531  0x0,
532  0x0,
533  0x0,
534  0xFF,
535  0xFF,
536  0xFF,
537  0xFF,
538  0x0,
539  0x0,
540  0x0,
541  0x0,
542  0xFF,
543  0xFF,
544  0xFF,
545  0xFF);
546  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
547  0x0,
548  0xFF,
549  0xFF,
550  0x0,
551  0x0,
552  0xFF,
553  0xFF,
554  0x0,
555  0x0,
556  0xFF,
557  0xFF,
558  0x0,
559  0x0,
560  0xFF,
561  0xFF,
562  0x0,
563  0x0,
564  0xFF,
565  0xFF,
566  0x0,
567  0x0,
568  0xFF,
569  0xFF,
570  0x0,
571  0x0,
572  0xFF,
573  0xFF,
574  0x0,
575  0x0,
576  0xFF,
577  0xFF);
578 
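/*
 * Final four stages, AVX2 flavour: each 32-byte load covers two consecutive
 * 16-element branches, hence the num_branches / 2 loop bound. Because
 * _mm256_srli_si256 shifts within each 128-bit lane, the two branches never
 * mix.
 */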
579  for (branch = 0; branch < num_branches / 2; ++branch) {
580  r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
581 
582  // prefetch next chunk
583  temp_ptr += 32;
584  __VOLK_PREFETCH(temp_ptr);
585 
586  // shuffle once for bit-reversal.
587  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
588 
589  shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
590  shifted = _mm256_and_si256(shifted, mask_stage4);
591  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
592 
593 
594  shifted = _mm256_srli_si256(r_frame0, 4);
595  shifted = _mm256_and_si256(shifted, mask_stage3);
596  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
597 
598  shifted = _mm256_srli_si256(r_frame0, 2);
599  shifted = _mm256_and_si256(shifted, mask_stage2);
600  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
601 
602  shifted = _mm256_srli_si256(r_frame0, 1);
603  shifted = _mm256_and_si256(shifted, mask_stage1);
604  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
605 
606  // store result of chunk.
607  _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
608  frame_ptr += 32;
609  }
610 }
611 #endif /* LV_HAVE_AVX2 */
612 
613 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
614 
615 #ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
616 #define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
617 
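/*
 * The _a_ (aligned) protokernels below mirror the _u_ (unaligned) ones
 * above; the only difference is the use of aligned load/store intrinsics
 * (_mm_load_si128/_mm_store_si128 and their _mm256 counterparts), so both
 * buffers must satisfy the corresponding 16- or 32-byte alignment.
 */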
618 #ifdef LV_HAVE_SSSE3
619 #include <tmmintrin.h>
620 
621 static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
622  unsigned char* temp,
623  unsigned int frame_size)
624 {
625  if (frame_size < 16) {
626  volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
627  return;
628  }
629 
630  const unsigned int po2 = log2_of_power_of_2(frame_size);
631 
632  unsigned int stage = po2;
633  unsigned char* frame_ptr = frame;
634  unsigned char* temp_ptr = temp;
635 
636  unsigned int frame_half = frame_size >> 1;
637  unsigned int num_branches = 1;
638  unsigned int branch;
639  unsigned int bit;
640 
641  // prepare constants
642  const __m128i mask_stage1 = _mm_set_epi8(0x0,
643  0xFF,
644  0x0,
645  0xFF,
646  0x0,
647  0xFF,
648  0x0,
649  0xFF,
650  0x0,
651  0xFF,
652  0x0,
653  0xFF,
654  0x0,
655  0xFF,
656  0x0,
657  0xFF);
658 
659  // get some SIMD registers to play with.
660  __m128i r_frame0, r_temp0, shifted;
661 
662  {
663  __m128i r_frame1, r_temp1;
664  const __m128i shuffle_separate =
665  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
666 
667  while (stage > 4) {
668  frame_ptr = frame;
669  temp_ptr = temp;
670 
671  // At stage 5 a branch has 32 elements, so higher stages are even larger.
672  for (branch = 0; branch < num_branches; ++branch) {
673  for (bit = 0; bit < frame_half; bit += 16) {
674  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
675  temp_ptr += 16;
676  r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
677  temp_ptr += 16;
678 
679  shifted = _mm_srli_si128(r_temp0, 1);
680  shifted = _mm_and_si128(shifted, mask_stage1);
681  r_temp0 = _mm_xor_si128(shifted, r_temp0);
682  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
683 
684  shifted = _mm_srli_si128(r_temp1, 1);
685  shifted = _mm_and_si128(shifted, mask_stage1);
686  r_temp1 = _mm_xor_si128(shifted, r_temp1);
687  r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
688 
689  r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
690  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
691 
692  r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
693  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
694  frame_ptr += 16;
695  }
696 
697  frame_ptr += frame_half;
698  }
699  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
700 
701  num_branches = num_branches << 1;
702  frame_half = frame_half >> 1;
703  stage--;
704  }
705  }
706 
707  // This last part requires frames of at least 16 elements (frame_size >= 16).
708  // Smaller frames gain nothing from SIMD anyway; just choose GENERIC!
709 
710  // reset pointers to correct positions.
711  frame_ptr = frame;
712  temp_ptr = temp;
713 
714  // prefetch first chunk
715  __VOLK_PREFETCH(temp_ptr);
716 
717  const __m128i shuffle_stage4 =
718  _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
719  const __m128i mask_stage4 = _mm_set_epi8(0x0,
720  0x0,
721  0x0,
722  0x0,
723  0x0,
724  0x0,
725  0x0,
726  0x0,
727  0xFF,
728  0xFF,
729  0xFF,
730  0xFF,
731  0xFF,
732  0xFF,
733  0xFF,
734  0xFF);
735  const __m128i mask_stage3 = _mm_set_epi8(0x0,
736  0x0,
737  0x0,
738  0x0,
739  0xFF,
740  0xFF,
741  0xFF,
742  0xFF,
743  0x0,
744  0x0,
745  0x0,
746  0x0,
747  0xFF,
748  0xFF,
749  0xFF,
750  0xFF);
751  const __m128i mask_stage2 = _mm_set_epi8(0x0,
752  0x0,
753  0xFF,
754  0xFF,
755  0x0,
756  0x0,
757  0xFF,
758  0xFF,
759  0x0,
760  0x0,
761  0xFF,
762  0xFF,
763  0x0,
764  0x0,
765  0xFF,
766  0xFF);
767 
768  for (branch = 0; branch < num_branches; ++branch) {
769  r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
770 
771  // prefetch next chunk
772  temp_ptr += 16;
773  __VOLK_PREFETCH(temp_ptr);
774 
775  // shuffle once for bit-reversal.
776  r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
777 
778  shifted = _mm_srli_si128(r_temp0, 8);
779  shifted = _mm_and_si128(shifted, mask_stage4);
780  r_frame0 = _mm_xor_si128(shifted, r_temp0);
781 
782  shifted = _mm_srli_si128(r_frame0, 4);
783  shifted = _mm_and_si128(shifted, mask_stage3);
784  r_frame0 = _mm_xor_si128(shifted, r_frame0);
785 
786  shifted = _mm_srli_si128(r_frame0, 2);
787  shifted = _mm_and_si128(shifted, mask_stage2);
788  r_frame0 = _mm_xor_si128(shifted, r_frame0);
789 
790  shifted = _mm_srli_si128(r_frame0, 1);
791  shifted = _mm_and_si128(shifted, mask_stage1);
792  r_frame0 = _mm_xor_si128(shifted, r_frame0);
793 
794  // store result of chunk.
795  _mm_store_si128((__m128i*)frame_ptr, r_frame0);
796  frame_ptr += 16;
797  }
798 }
799 #endif /* LV_HAVE_SSSE3 */
800 
801 #ifdef LV_HAVE_AVX2
802 #include <immintrin.h>
803 
804 static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
805  unsigned char* temp,
806  unsigned int frame_size)
807 {
808  if (frame_size < 32) {
809  volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
810  return;
811  }
812 
813  const unsigned int po2 = log2_of_power_of_2(frame_size);
814 
815  unsigned int stage = po2;
816  unsigned char* frame_ptr = frame;
817  unsigned char* temp_ptr = temp;
818 
819  unsigned int frame_half = frame_size >> 1;
820  unsigned int num_branches = 1;
821  unsigned int branch;
822  unsigned int bit;
823 
824  // prepare constants
825  const __m256i mask_stage1 = _mm256_set_epi8(0x0,
826  0xFF,
827  0x0,
828  0xFF,
829  0x0,
830  0xFF,
831  0x0,
832  0xFF,
833  0x0,
834  0xFF,
835  0x0,
836  0xFF,
837  0x0,
838  0xFF,
839  0x0,
840  0xFF,
841  0x0,
842  0xFF,
843  0x0,
844  0xFF,
845  0x0,
846  0xFF,
847  0x0,
848  0xFF,
849  0x0,
850  0xFF,
851  0x0,
852  0xFF,
853  0x0,
854  0xFF,
855  0x0,
856  0xFF);
857 
858  const __m128i mask_stage0 = _mm_set_epi8(0x0,
859  0xFF,
860  0x0,
861  0xFF,
862  0x0,
863  0xFF,
864  0x0,
865  0xFF,
866  0x0,
867  0xFF,
868  0x0,
869  0xFF,
870  0x0,
871  0xFF,
872  0x0,
873  0xFF);
874  // get some SIMD registers to play with.
875  __m256i r_frame0, r_temp0, shifted;
876  __m128i r_temp2, r_frame2, shifted2;
877  {
878  __m256i r_frame1, r_temp1;
879  __m128i r_frame3, r_temp3;
880  const __m256i shuffle_separate = _mm256_setr_epi8(0,
881  2,
882  4,
883  6,
884  8,
885  10,
886  12,
887  14,
888  1,
889  3,
890  5,
891  7,
892  9,
893  11,
894  13,
895  15,
896  0,
897  2,
898  4,
899  6,
900  8,
901  10,
902  12,
903  14,
904  1,
905  3,
906  5,
907  7,
908  9,
909  11,
910  13,
911  15);
912  const __m128i shuffle_separate128 =
913  _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
914 
915  while (stage > 4) {
916  frame_ptr = frame;
917  temp_ptr = temp;
918 
919  // At stage 5 a branch has 32 elements, so higher stages are even larger.
920  for (branch = 0; branch < num_branches; ++branch) {
921  for (bit = 0; bit < frame_half; bit += 32) {
922  if ((frame_half - bit) <
923  32) // only 16 elements remain in this branch half, not 32
924  {
925  r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
926  temp_ptr += 16;
927  r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
928  temp_ptr += 16;
929 
930  shifted2 = _mm_srli_si128(r_temp2, 1);
931  shifted2 = _mm_and_si128(shifted2, mask_stage0);
932  r_temp2 = _mm_xor_si128(shifted2, r_temp2);
933  r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
934 
935  shifted2 = _mm_srli_si128(r_temp3, 1);
936  shifted2 = _mm_and_si128(shifted2, mask_stage0);
937  r_temp3 = _mm_xor_si128(shifted2, r_temp3);
938  r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
939 
940  r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
941  _mm_store_si128((__m128i*)frame_ptr, r_frame2);
942 
943  r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
944  _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
945  frame_ptr += 16;
946  break;
947  }
948  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
949  temp_ptr += 32;
950  r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
951  temp_ptr += 32;
952 
953  shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
954  shifted = _mm256_and_si256(shifted, mask_stage1);
955  r_temp0 = _mm256_xor_si256(shifted, r_temp0);
956  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
957 
958  shifted = _mm256_srli_si256(r_temp1, 1);
959  shifted = _mm256_and_si256(shifted, mask_stage1);
960  r_temp1 = _mm256_xor_si256(shifted, r_temp1);
961  r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
962 
963  r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
964  r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
965  r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
966  r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
967 
968  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
969 
970  _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
971  frame_ptr += 32;
972  }
973 
974  frame_ptr += frame_half;
975  }
976  memcpy(temp, frame, sizeof(unsigned char) * frame_size);
977 
978  num_branches = num_branches << 1;
979  frame_half = frame_half >> 1;
980  stage--;
981  }
982  }
983 
984  // This last part requires frames of at least 32 elements (frame_size >= 32).
985  // Smaller frames gain nothing from SIMD anyway; just choose GENERIC!
986 
987  // reset pointers to correct positions.
988  frame_ptr = frame;
989  temp_ptr = temp;
990 
991  // prefetch first chunk.
992  __VOLK_PREFETCH(temp_ptr);
993 
994  const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
995  8,
996  4,
997  12,
998  2,
999  10,
1000  6,
1001  14,
1002  1,
1003  9,
1004  5,
1005  13,
1006  3,
1007  11,
1008  7,
1009  15,
1010  0,
1011  8,
1012  4,
1013  12,
1014  2,
1015  10,
1016  6,
1017  14,
1018  1,
1019  9,
1020  5,
1021  13,
1022  3,
1023  11,
1024  7,
1025  15);
1026  const __m256i mask_stage4 = _mm256_set_epi8(0x0,
1027  0x0,
1028  0x0,
1029  0x0,
1030  0x0,
1031  0x0,
1032  0x0,
1033  0x0,
1034  0xFF,
1035  0xFF,
1036  0xFF,
1037  0xFF,
1038  0xFF,
1039  0xFF,
1040  0xFF,
1041  0xFF,
1042  0x0,
1043  0x0,
1044  0x0,
1045  0x0,
1046  0x0,
1047  0x0,
1048  0x0,
1049  0x0,
1050  0xFF,
1051  0xFF,
1052  0xFF,
1053  0xFF,
1054  0xFF,
1055  0xFF,
1056  0xFF,
1057  0xFF);
1058  const __m256i mask_stage3 = _mm256_set_epi8(0x0,
1059  0x0,
1060  0x0,
1061  0x0,
1062  0xFF,
1063  0xFF,
1064  0xFF,
1065  0xFF,
1066  0x0,
1067  0x0,
1068  0x0,
1069  0x0,
1070  0xFF,
1071  0xFF,
1072  0xFF,
1073  0xFF,
1074  0x0,
1075  0x0,
1076  0x0,
1077  0x0,
1078  0xFF,
1079  0xFF,
1080  0xFF,
1081  0xFF,
1082  0x0,
1083  0x0,
1084  0x0,
1085  0x0,
1086  0xFF,
1087  0xFF,
1088  0xFF,
1089  0xFF);
1090  const __m256i mask_stage2 = _mm256_set_epi8(0x0,
1091  0x0,
1092  0xFF,
1093  0xFF,
1094  0x0,
1095  0x0,
1096  0xFF,
1097  0xFF,
1098  0x0,
1099  0x0,
1100  0xFF,
1101  0xFF,
1102  0x0,
1103  0x0,
1104  0xFF,
1105  0xFF,
1106  0x0,
1107  0x0,
1108  0xFF,
1109  0xFF,
1110  0x0,
1111  0x0,
1112  0xFF,
1113  0xFF,
1114  0x0,
1115  0x0,
1116  0xFF,
1117  0xFF,
1118  0x0,
1119  0x0,
1120  0xFF,
1121  0xFF);
1122 
1123  for (branch = 0; branch < num_branches / 2; ++branch) {
1124  r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
1125 
1126  // prefetch next chunk
1127  temp_ptr += 32;
1128  __VOLK_PREFETCH(temp_ptr);
1129 
1130  // shuffle once for bit-reversal.
1131  r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
1132 
1133  shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
1134  shifted = _mm256_and_si256(shifted, mask_stage4);
1135  r_frame0 = _mm256_xor_si256(shifted, r_temp0);
1136 
1137  shifted = _mm256_srli_si256(r_frame0, 4);
1138  shifted = _mm256_and_si256(shifted, mask_stage3);
1139  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1140 
1141  shifted = _mm256_srli_si256(r_frame0, 2);
1142  shifted = _mm256_and_si256(shifted, mask_stage2);
1143  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1144 
1145  shifted = _mm256_srli_si256(r_frame0, 1);
1146  shifted = _mm256_and_si256(shifted, mask_stage1);
1147  r_frame0 = _mm256_xor_si256(shifted, r_frame0);
1148 
1149  // store result of chunk.
1150  _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
1151  frame_ptr += 32;
1152  }
1153 }
1154 #endif /* LV_HAVE_AVX2 */
1155 
1156 
1157 #endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */