volk_8u_x4_conv_k7_r2_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif
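
/*
 * Illustrative sketch (not part of the kernel API): BFLY below packs two
 * survivor-decision bits per butterfly into decision_t, one decision_t per
 * trellis step. Using the same indexing as BFLY, the two bits recorded for
 * butterfly i at step s can be read back roughly like this:
 *
 *   decision_t* d = (decision_t*)dec;
 *   unsigned int word =
 *       d->w[i / (sizeof(unsigned int) * 8 / 2) +
 *            s * (sizeof(decision_t) / sizeof(unsigned int))];
 *   unsigned int bits =
 *       (word >> ((2 * i) & (sizeof(unsigned int) * 8 - 1))) & 3; // bit 0: decision0, bit 1: decision1
 *
 * The SIMD variants further down write their decisions through plain
 * unsigned int / unsigned short views of dec instead of this union.
 */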


static inline void renormalize(unsigned char* X)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++) {
        if (min > X[i]) {
            min = X[i];
        }
    }
    for (i = 0; i < NUMSTATES; i++) {
        X[i] -= min;
    }
}
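
/*
 * Worked example for renormalize(): if the 64 path metrics start as
 * X = {5, 3, 9, ...} with minimum 3, every entry is reduced by 3, giving
 * X = {2, 0, 6, ...}. Subtracting the common minimum does not change which
 * path wins, but it keeps the accumulated metrics small enough to fit in an
 * unsigned char; the SIMD variants below perform the same
 * min-reduce-and-subtract with vector instructions.
 */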


// helper BFLY for GENERIC version
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j;
    unsigned int decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;
    unsigned short metricsum;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 1;
    int PRECISIONSHIFT = 2;

    metricsum = 1;
    for (j = 0; j < RATE; j++) {
        metricsum += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]);
    }
    metric = (metricsum >> METRICSHIFT) >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    decision0 = (signed int)(m0 - m1) >= 0;
    decision1 = (signed int)(m2 - m3) >= 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
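
/*
 * Worked example for the branch metric above (values chosen only for
 * illustration): with RATE = 2, METRICSHIFT = 1 and PRECISIONSHIFT = 2, if
 * the two XORed symbol/branch bytes are 200 and 55, then
 * metricsum = 1 + 200 + 55 = 256 and metric = (256 >> 1) >> 2 = 32. The
 * complementary branch uses max - metric, where
 * max = (2 * (255 >> 1)) >> 2 = 63 -- the same constant 63 that appears in
 * the SIMD variants below.
 */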


#if LV_HAVE_AVX2

#include <immintrin.h>
#include <stdio.h>

static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        __m256i a76, a78, a79, a82, a84, a85, a86, a88, a89, a90, d10, d9, m23, m24, m25,
            m26, s18, s19, s22, s23, t14, t15;

        // Butterfly
        s18 = ((__m256i*)X)[0];
        s19 = ((__m256i*)X)[1];
        a76 = _mm256_set1_epi8(syms[2 * i]);
        a78 = ((__m256i*)Branchtab)[0];
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m256i*)Branchtab)[1];
        a85 = _mm256_xor_si256(a82, a84);
        a86 = _mm256_avg_epu8(a79, a85);
        a88 = _mm256_srli_epi16(a86, 2);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        dec_int[2 * i] = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        dec_int[2 * i + 1] =
            _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        ((__m256i*)Y)[0] = _mm256_permute2x128_si256(s22, s23, 0x20);
        ((__m256i*)Y)[1] = _mm256_permute2x128_si256(s22, s23, 0x31);

        // Renormalize
        __m256i m5, m6;
        m5 = ((__m256i*)Y)[0];
        m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
        m5 = ((__m256i)_mm256_min_epu8(_mm256_permute2x128_si256(m5, m5, 0x21), m5));
        __m256i m7;
        m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
                                       ((__m256i)m7)));
        m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
                                       ((__m256i)m7)));
        m7 = _mm256_unpacklo_epi8(m7, m7);
        m7 = _mm256_shufflelo_epi16(m7, 0);
        m6 = _mm256_unpacklo_epi64(m7, m7);
        m6 = _mm256_permute2x128_si256(
            m6, m6, 0); // copy lower half of m6 to upper half, since above ops
                        // operate on 128 bit lanes
        ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
        ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned short* dec_short = (unsigned short*)dec;
        __m128i a100, a101, a103, a104, a105, a107, a108, a109, a76, a78, a79, a82, a84,
            a85, a86, a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29,
            m30, s18, s19, s24, s25, t14, t15, t17, t18;

        // First half of butterfly
        s18 = ((__m128i*)X)[0];
        s19 = ((__m128i*)X)[2];
        a76 = _mm_set1_epi8(syms[2 * i]);
        a78 = ((__m128i*)Branchtab)[0];
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(syms[2 * i + 1]);
        a84 = ((__m128i*)Branchtab)[2];
        a85 = _mm_xor_si128(a82, a84);
        a86 = _mm_avg_epu8(a79, a85);
        a88 = _mm_srli_epi16(a86, 2);
        t14 = _mm_and_si128(a88, _mm_set1_epi8(63));
        t15 = _mm_subs_epu8(_mm_set1_epi8(63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        dec_short[4 * i] = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        dec_short[4 * i + 1] = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        ((__m128i*)Y)[0] = _mm_unpacklo_epi8(a89, a90);
        ((__m128i*)Y)[1] = _mm_unpackhi_epi8(a89, a90);

        // Second half of butterfly
        s24 = ((__m128i*)X)[1];
        s25 = ((__m128i*)X)[3];
        a100 = ((__m128i*)Branchtab)[1];
        a101 = _mm_xor_si128(a76, a100);
        a103 = ((__m128i*)Branchtab)[3];
        a104 = _mm_xor_si128(a82, a103);
        a105 = _mm_avg_epu8(a101, a104);
        a107 = _mm_srli_epi16(a105, 2);
        t17 = _mm_and_si128(a107, _mm_set1_epi8(63));
        t18 = _mm_subs_epu8(_mm_set1_epi8(63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        dec_short[4 * i + 2] = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        dec_short[4 * i + 3] = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        ((__m128i*)Y)[2] = _mm_unpacklo_epi8(a108, a109);
        ((__m128i*)Y)[3] = _mm_unpackhi_epi8(a108, a109);

        // Renormalize
        __m128i m5, m6;
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        __m128i m7;
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_SSE3*/

#if LV_HAVE_NEON

#include <arm_neon.h>

static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
                                                       unsigned char* X,
                                                       unsigned char* syms,
                                                       unsigned char* dec,
                                                       unsigned int framebits,
                                                       unsigned int excess,
                                                       unsigned char* Branchtab)
{
    unsigned int i;
    for (i = 0; i < framebits + excess; i++) {
        unsigned char* tmp;
        unsigned int* dec_int = (unsigned int*)dec;
        uint8x16_t a100, a101, a103, a104, a105, a108, a109, a76, a78, a79, a82, a84, a85,
            a86, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s24, s25, t14, t15, t17, t18;
        uint16x8_t high_bits;
        uint32x4_t paired16;
        uint8x16_t paired32;
        uint8x8_t left, right;
        uint8x8x2_t both;

        // First half of butterfly
        s18 = ((uint8x16_t*)X)[0];
        s19 = ((uint8x16_t*)X)[2];
        a76 = vdupq_n_u8(syms[2 * i]);
        a78 = ((uint8x16_t*)Branchtab)[0];
        a79 = veorq_u8(a76, a78);
        a82 = vdupq_n_u8(syms[2 * i + 1]);
        a84 = ((uint8x16_t*)Branchtab)[2];
        a85 = veorq_u8(a82, a84);
        a86 = vrhaddq_u8(a79, a85);
        t14 = vshrq_n_u8(a86, 2);
        t15 = vqsubq_u8(vdupq_n_u8(63), t14);
        m23 = vqaddq_u8(s18, t14);
        m24 = vqaddq_u8(s19, t15);
        m25 = vqaddq_u8(s18, t15);
        m26 = vqaddq_u8(s19, t14);
        a89 = vminq_u8(m24, m23);
        d9 = vceqq_u8(a89, m24);
        a90 = vminq_u8(m26, m25);
        d10 = vceqq_u8(a90, m26);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d9, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                         ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                         ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                         ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d10, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                          ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                          ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                          ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a89);
        right = vget_low_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[0] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a89);
        right = vget_high_u8(a90);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[1] = vcombine_u8(both.val[0], both.val[1]);

        // Second half of butterfly
        s24 = ((uint8x16_t*)X)[1];
        s25 = ((uint8x16_t*)X)[3];
        a100 = ((uint8x16_t*)Branchtab)[1];
        a101 = veorq_u8(a76, a100);
        a103 = ((uint8x16_t*)Branchtab)[3];
        a104 = veorq_u8(a82, a103);
        a105 = vrhaddq_u8(a101, a104);
        t17 = vshrq_n_u8(a105, 2);
        t18 = vqsubq_u8(vdupq_n_u8(63), t17);
        m27 = vqaddq_u8(s24, t17);
        m28 = vqaddq_u8(s25, t18);
        m29 = vqaddq_u8(s24, t18);
        m30 = vqaddq_u8(s25, t17);
        a108 = vminq_u8(m28, m27);
        d11 = vceqq_u8(a108, m28);
        a109 = vminq_u8(m30, m29);
        d12 = vceqq_u8(a109, m30);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d11, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] = ((unsigned int)vgetq_lane_u8(paired32, 0) << 0) |
                             ((unsigned int)vgetq_lane_u8(paired32, 4) << 8) |
                             ((unsigned int)vgetq_lane_u8(paired32, 8) << 16) |
                             ((unsigned int)vgetq_lane_u8(paired32, 12) << 24);
        high_bits = vreinterpretq_u16_u8(vshrq_n_u8(d12, 7));
        paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 6));
        paired32 = vreinterpretq_u8_u32(vsraq_n_u32(paired16, paired16, 12));
        dec_int[2 * i + 1] |= ((unsigned int)vgetq_lane_u8(paired32, 0) << 1) |
                              ((unsigned int)vgetq_lane_u8(paired32, 4) << 9) |
                              ((unsigned int)vgetq_lane_u8(paired32, 8) << 17) |
                              ((unsigned int)vgetq_lane_u8(paired32, 12) << 25);
        left = vget_low_u8(a108);
        right = vget_low_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[2] = vcombine_u8(both.val[0], both.val[1]);
        left = vget_high_u8(a108);
        right = vget_high_u8(a109);
        both = vzip_u8(left, right);
        ((uint8x16_t*)Y)[3] = vcombine_u8(both.val[0], both.val[1]);

        // Renormalize
        uint8x16_t m5, m6;
        m5 = ((uint8x16_t*)Y)[0];
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[1]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[2]);
        m5 = vminq_u8(m5, ((uint8x16_t*)Y)[3]);
        uint8x8_t m7;
        m7 = vpmin_u8(vget_low_u8(m5), vget_high_u8(m5));
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m7 = vpmin_u8(m7, m7);
        m6 = vcombine_u8(m7, m7);
        ((uint8x16_t*)Y)[0] = vqsubq_u8(((uint8x16_t*)Y)[0], m6);
        ((uint8x16_t*)Y)[1] = vqsubq_u8(((uint8x16_t*)Y)[1], m6);
        ((uint8x16_t*)Y)[2] = vqsubq_u8(((uint8x16_t*)Y)[2], m6);
        ((uint8x16_t*)Y)[3] = vqsubq_u8(((uint8x16_t*)Y)[3], m6);

        // Swap pointers to old and new metrics
        tmp = X;
        X = Y;
        Y = tmp;
    }
}

#endif /*LV_HAVE_NEON*/

#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y);

        // Swap pointers to old and new metrics
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */
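
/*
 * Minimal usage sketch (an assumption for illustration, not part of this
 * header): driving the generic kernel directly for a K=7, rate-1/2 code.
 * Buffer sizes follow from NUMSTATES = 64 above: 64-byte metric arrays, a
 * 64-entry branch table, 2 * (framebits + excess) soft symbols, and
 * sizeof(decision_t) bytes of decision storage per decoded step. Filling
 * Branchtab and syms and running the traceback are outside the scope of this
 * file; in normal use the kernel is reached through the VOLK dispatcher
 * rather than called directly.
 *
 *   enum { FRAMEBITS = 100, EXCESS = 6 };
 *   unsigned char X[64] = { 0 };                           // old path metrics
 *   unsigned char Y[64] = { 0 };                           // new path metrics
 *   unsigned char Branchtab[64] = { 0 };                   // derived from the code polynomials
 *   unsigned char syms[2 * (FRAMEBITS + EXCESS)] = { 0 };  // soft symbols, 0..255
 *   unsigned char dec[(FRAMEBITS + EXCESS) * sizeof(decision_t)] = { 0 };
 *   volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, FRAMEBITS, EXCESS, Branchtab);
 */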

#if LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_8u_x4_conv_k7_r2_8u_rvv(unsigned char* Y,
                                                unsigned char* X,
                                                unsigned char* syms,
                                                unsigned char* dec,
                                                unsigned int framebits,
                                                unsigned int excess,
                                                unsigned char* Branchtab)
{
    size_t vl = 256 / 8;

    size_t n = framebits + excess;

    if (__riscv_vlenb() == 128 / 8) {
        vuint8m2_t vX0 = __riscv_vle8_v_u8m2(X, vl),
                   vX1 = __riscv_vle8_v_u8m2(X + vl, vl);
        vuint8m2_t vY0 = __riscv_vle8_v_u8m2(Y, vl),
                   vY1 = __riscv_vle8_v_u8m2(Y + vl, vl);
        vuint8m2_t vB0 = __riscv_vle8_v_u8m2(Branchtab, vl);
        vuint8m2_t vB1 = __riscv_vle8_v_u8m2(Branchtab + vl, vl);
        vuint8m2_t v63 = __riscv_vmv_v_x_u8m2(63, vl);

        for (size_t i = 0; i < n; ++i) {
            // Butterfly
            vuint8m2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8m2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8m2_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8m2(
                __riscv_vsrl(__riscv_vreinterpret_u16m2(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8m2_t vb = __riscv_vssubu(v63, va, vl);
            vuint8m2_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8m2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8m2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8m2_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            vuint16m4_t vX1ba =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
            vX1b = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 0);
            vX1a = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vX1ba), 1);

            vuint16m4_t vm =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
            vY0 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 0);
            vY1 = __riscv_vget_u8m2(__riscv_vreinterpret_u8m4(vm), 1);

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
            __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);

            // Renormalize
            vuint8m2_t vmin = __riscv_vminu(vY0, vY1, vl);
            vmin = __riscv_vlmul_ext_u8m2(
                __riscv_vredminu(vmin, __riscv_vlmul_trunc_u8m1(vmin), vl));
            vmin = __riscv_vrgather(vmin, 0, vl);
            vY0 = __riscv_vsub(vY0, vmin, vl);
            vY1 = __riscv_vsub(vY1, vmin, vl);

            vuint8m2_t tmp; // Swap pointers to old and new metrics
            tmp = vX0;
            vX0 = vY0;
            vY0 = tmp;
            tmp = vX1;
            vX1 = vY1;
            vY1 = tmp;
        }
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    } else if (__riscv_vlenb() == 256 / 8) {
        vuint8m1_t vX0 = __riscv_vle8_v_u8m1(X, vl),
                   vX1 = __riscv_vle8_v_u8m1(X + vl, vl);
        vuint8m1_t vY0 = __riscv_vle8_v_u8m1(Y, vl),
                   vY1 = __riscv_vle8_v_u8m1(Y + vl, vl);
        vuint8m1_t vB0 = __riscv_vle8_v_u8m1(Branchtab, vl);
        vuint8m1_t vB1 = __riscv_vle8_v_u8m1(Branchtab + vl, vl);
        vuint8m1_t v63 = __riscv_vmv_v_x_u8m1(63, vl);

        for (size_t i = 0; i < n; ++i) {
            // Butterfly
            vuint8m1_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8m1_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8m1_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8m1(
                __riscv_vsrl(__riscv_vreinterpret_u16m1(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8m1_t vb = __riscv_vssubu(v63, va, vl);
            vuint8m1_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8m1_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8m1_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8m1_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            vuint16m2_t vX1ba =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl);
            vX1b = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 0);
            vX1a = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vX1ba), 1);

            vuint16m2_t vm =
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl);
            vY0 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 0);
            vY1 = __riscv_vget_u8m1(__riscv_vreinterpret_u8m2(vm), 1);

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY0, vX1b, vl), vl);
            __riscv_vsm(&dec[8 * i + 4], __riscv_vmseq(vY1, vX1a, vl), vl);

            // Renormalize
            vuint8m1_t vmin = __riscv_vminu(vY0, vY1, vl);
            vmin = __riscv_vrgather(__riscv_vredminu(vmin, vmin, vl), 0, vl);
            vY0 = __riscv_vsub(vY0, vmin, vl);
            vY1 = __riscv_vsub(vY1, vmin, vl);

            vuint8m1_t tmp; // Swap pointers to old and new metrics
            tmp = vX0;
            vX0 = vY0;
            vY0 = tmp;
            tmp = vX1;
            vX1 = vY1;
            vY1 = tmp;
        }
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    } else {
        vuint8mf2_t vX0 = __riscv_vle8_v_u8mf2(X, vl),
                    vX1 = __riscv_vle8_v_u8mf2(X + vl, vl);
        vuint8mf2_t vY0 = __riscv_vle8_v_u8mf2(Y, vl),
                    vY1 = __riscv_vle8_v_u8mf2(Y + vl, vl);
        vuint8mf2_t vB0 = __riscv_vle8_v_u8mf2(Branchtab, vl);
        vuint8mf2_t vB1 = __riscv_vle8_v_u8mf2(Branchtab + vl, vl);
        vuint8mf2_t v63 = __riscv_vmv_v_x_u8mf2(63, vl);

        for (size_t i = 0; i < n; ++i) {
            // Butterfly
            vuint8mf2_t va0 = __riscv_vxor(vB0, syms[2 * i + 0], vl);
            vuint8mf2_t va1 = __riscv_vxor(vB1, syms[2 * i + 1], vl);
            vuint8mf2_t va = __riscv_vaaddu(va0, va1, 0, vl);
            va = __riscv_vreinterpret_u8mf2(
                __riscv_vsrl(__riscv_vreinterpret_u16mf2(va), 2, vl / 2));
            va = __riscv_vand(va, v63, vl);
            vuint8mf2_t vb = __riscv_vssubu(v63, va, vl);
            vuint8mf2_t vX0a = __riscv_vsaddu(vX0, va, vl);
            vuint8mf2_t vX1b = __riscv_vsaddu(vX1, vb, vl);
            vuint8mf2_t vX0b = __riscv_vsaddu(vX0, vb, vl);
            vuint8mf2_t vX1a = __riscv_vsaddu(vX1, va, vl);
            vY0 = __riscv_vminu(vX1b, vX0a, vl);
            vY1 = __riscv_vminu(vX1a, vX0b, vl);

            vuint8m1_t vX1ba = __riscv_vreinterpret_u8m1(
                __riscv_vwmaccu(__riscv_vwaddu_vv(vX1b, vX1a, vl), 0xFF, vX1a, vl));
            vuint8m1_t vY01 = __riscv_vreinterpret_u8m1(
                __riscv_vwmaccu(__riscv_vwaddu_vv(vY0, vY1, vl), 0xFF, vY1, vl));

            __riscv_vsm(&dec[8 * i + 0], __riscv_vmseq(vY01, vX1ba, vl * 2), vl * 2);

            // Renormalize
            vuint8m1_t vmin =
                __riscv_vrgather(__riscv_vredminu(vY01, vY01, vl * 2), 0, vl * 2);
            vY01 = __riscv_vsub(vY01, vmin, vl * 2);

            vY0 = __riscv_vlmul_trunc_u8mf2(vY01);
            vY1 = __riscv_vlmul_trunc_u8mf2(__riscv_vslidedown(vY01, vl, vl));

            vuint8mf2_t tmp; // Swap pointers to old and new metrics
            tmp = vX0;
            vX0 = vY0;
            vY0 = tmp;
            tmp = vX1;
            vX1 = vY1;
            vY1 = tmp;
        }
        if (n & 1) {
            __riscv_vse8(X, vY0, vl);
            __riscv_vse8(X + vl, vY1, vl);
            __riscv_vse8(Y, vX0, vl);
            __riscv_vse8(Y + vl, vX1, vl);
        } else {
            __riscv_vse8(X, vX0, vl);
            __riscv_vse8(X + vl, vX1, vl);
            __riscv_vse8(Y, vY0, vl);
            __riscv_vse8(Y + vl, vY1, vl);
        }
    }
}
#endif /*LV_HAVE_RVV*/

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/