Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_8u_conv_k7_r2puppet_8u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
10 #ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
11 #define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
12 
13 #include <string.h>
14 #include <volk/volk.h>
16 
/*
 * Two typed views of one pointer into a decision record.
 * A decision record is a bit vector: `t` addresses it byte by byte,
 * `w` addresses the same storage as unsigned int words.
 */
typedef union {
    unsigned char* t; /* byte-wise view */
    unsigned int* w;  /* word-wise view */
} p_decision_t;
22 
/*
 * Return the parity of x: 1 when an odd number of bits are set, 0 otherwise.
 */
static inline int parity(int x)
{
    /* Fold the upper half onto the lower half, halving the width each pass
     * (16, 8, 4, 2, 1), so that bit 0 ends up as the XOR of all 32 bits. */
    for (int width = 16; width >= 1; width /= 2) {
        x ^= x >> width;
    }
    return x & 1;
}
32 
/*
 * Trace back through stored Viterbi decisions and write out the decoded bits.
 *
 * data      - output buffer; each decoded bit (0 or 1) is stored in its own
 *             byte at index (step + tailsize - 6) % nbits (unsigned arithmetic),
 *             so it must hold at least nbits bytes
 * nbits     - number of trellis steps to trace back
 * endstate  - state the traceback starts from; reduced modulo 64 before use
 * tailsize  - number of 8-byte decision records skipped at the start of
 *             `decisions` before step 0
 * decisions - decision records, 8 bytes (one bit per state, 64 states) per
 *             trellis step; must hold at least tailsize + nbits records
 *
 * Returns the state reached after the first K - 1 = 6 traceback steps.
 * When nbits < 6 no such state exists and the (modulo-reduced) starting
 * state is returned instead of an indeterminate value.
 */
static inline int chainback_viterbi(unsigned char* data,
                                    unsigned int nbits,
                                    unsigned int endstate,
                                    unsigned int tailsize,
                                    unsigned char* decisions)
{
    const int d_ADDSHIFT = 0;
    const int d_numstates = (1 << 6);
    const int d_decision_t_size = d_numstates / 8;
    const unsigned int d_k = 7;
    const unsigned int d_framebits = nbits;
    /* Offset applied to the output index; wraps like the unsigned arithmetic
     * it takes part in when tailsize < d_k - 1. */
    const unsigned int dif = tailsize - (d_k - 1);
    /* Look past the tail records. */
    const unsigned char* d = decisions + tailsize * d_decision_t_size;

    endstate = (endstate % d_numstates) << d_ADDSHIFT;

    /* Defined fallback for frames shorter than d_k - 1 steps. */
    int retval = (int)endstate;
    unsigned int steps_done = 0;

    while (nbits-- != 0) {
        const unsigned char* record = &d[nbits * d_decision_t_size];
        const unsigned int state = endstate >> d_ADDSHIFT;
        unsigned int word;

        /* Fetch the native unsigned int holding this state's decision bit.
         * memcpy keeps the word layout of a direct unsigned int load while
         * avoiding alignment and aliasing assumptions on the byte buffer. */
        memcpy(&word, record + (state / 32) * sizeof word, sizeof word);
        const int k = (int)((word >> (state % 32)) & 1u);

        /* Shift the decided bit in at the top of the 6-bit state. */
        endstate = (endstate >> 1) | ((unsigned int)k << (d_k - 2 + d_ADDSHIFT));
        data[(nbits + dif) % d_framebits] = (unsigned char)k;

        /* Remember the state once the first d_k - 1 steps are done. */
        if (++steps_done == d_k - 1) {
            retval = (int)endstate;
        }
    }

    return retval >> d_ADDSHIFT;
}
93 
94 
95 #if LV_HAVE_SSE3
96 
97 #include <emmintrin.h>
98 #include <mmintrin.h>
99 #include <pmmintrin.h>
100 #include <stdio.h>
101 #include <xmmintrin.h>
102 
/*
 * Puppet wrapper that drives volk_8u_x4_conv_k7_r2_8u_spiral on a whole frame
 * and traces the result back into decoded bits.
 *
 * dec       - output; chainback_viterbi stores one decoded bit per byte, for
 *             framebits / 2 - 6 bits
 * syms      - input symbols, handed unchanged to the kernel
 * framebits - frame length; frames shorter than 12 are ignored
 *
 * Scratch buffers (metrics, branch table, decisions) live in function-local
 * statics and are shared by every call, so concurrent calls are not safe.
 * The decision buffer grows whenever a call needs more room than any earlier
 * call did. If an allocation fails the function returns without touching dec.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* dec,
                                                      unsigned char* syms,
                                                      unsigned int framebits)
{
    if (framebits < 12) {
        return;
    }

    static int once = 1;
    int d_numstates = (1 << 6);
    int rate = 2;
    static unsigned char* D = NULL;
    static size_t D_size = 0;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;

    int d_polys[2] = { 79, 109 };

    /* Bytes of decision storage this frame needs (8 bytes per record). */
    const size_t decisions_size = (size_t)(d_numstates / 8) * ((size_t)framebits + 6);

    if (once) {
        unsigned char* metrics =
            (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        unsigned char* table =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (metrics == NULL || table == NULL) {
            /* Leave `once` set so a later call retries the setup. */
            if (metrics != NULL) {
                volk_free(metrics);
            }
            if (table != NULL) {
                volk_free(table);
            }
            return;
        }

        X = metrics;
        Y = X + d_numstates;
        Branchtab = table;

        int state, i;

        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i]) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* The decision buffer must fit the current frame, not just the first one. */
    if (decisions_size > D_size) {
        unsigned char* grown =
            (unsigned char*)volk_malloc(decisions_size, volk_get_alignment());
        if (grown == NULL) {
            return;
        }
        if (D != NULL) {
            volk_free(D);
        }
        D = grown;
        D_size = decisions_size;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, decisions_size);

    volk_8u_x4_conv_k7_r2_8u_spiral(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the traceback from the state with the smallest metric in X. */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
166 
167 #endif /*LV_HAVE_SSE3*/
168 
169 
170 #if LV_HAVE_NEON
171 
/*
 * Puppet wrapper that drives volk_8u_x4_conv_k7_r2_8u_neonspiral on a whole
 * frame and traces the result back into decoded bits.
 *
 * dec       - output; chainback_viterbi stores one decoded bit per byte, for
 *             framebits / 2 - 6 bits
 * syms      - input symbols, handed unchanged to the kernel
 * framebits - frame length; frames shorter than 12 are ignored
 *
 * Scratch buffers (metrics, branch table, decisions) live in function-local
 * statics and are shared by every call, so concurrent calls are not safe.
 * The decision buffer grows whenever a call needs more room than any earlier
 * call did. If an allocation fails the function returns without touching dec.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* dec,
                                                          unsigned char* syms,
                                                          unsigned int framebits)
{
    if (framebits < 12) {
        return;
    }

    static int once = 1;
    int d_numstates = (1 << 6);
    int rate = 2;
    static unsigned char* D = NULL;
    static size_t D_size = 0;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;

    int d_polys[2] = { 79, 109 };

    /* Bytes of decision storage this frame needs (8 bytes per record). */
    const size_t decisions_size = (size_t)(d_numstates / 8) * ((size_t)framebits + 6);

    if (once) {
        unsigned char* metrics =
            (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        unsigned char* table =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (metrics == NULL || table == NULL) {
            /* Leave `once` set so a later call retries the setup. */
            if (metrics != NULL) {
                volk_free(metrics);
            }
            if (table != NULL) {
                volk_free(table);
            }
            return;
        }

        X = metrics;
        Y = X + d_numstates;
        Branchtab = table;

        int state, i;

        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i]) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* The decision buffer must fit the current frame, not just the first one. */
    if (decisions_size > D_size) {
        unsigned char* grown =
            (unsigned char*)volk_malloc(decisions_size, volk_get_alignment());
        if (grown == NULL) {
            return;
        }
        if (D != NULL) {
            volk_free(D);
        }
        D = grown;
        D_size = decisions_size;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, decisions_size);

    volk_8u_x4_conv_k7_r2_8u_neonspiral(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the traceback from the state with the smallest metric in X. */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
235 
236 #endif /*LV_HAVE_NEON*/
237 
238 
239 #if LV_HAVE_AVX2
240 
241 #include <immintrin.h>
242 #include <stdio.h>
243 
/*
 * Puppet wrapper that drives volk_8u_x4_conv_k7_r2_8u_avx2 on a whole frame
 * and traces the result back into decoded bits.
 *
 * dec       - output; chainback_viterbi stores one decoded bit per byte, for
 *             framebits / 2 - 6 bits
 * syms      - input symbols, handed unchanged to the kernel
 * framebits - frame length; frames shorter than 12 are ignored
 *
 * Scratch buffers (metrics, branch table, decisions) live in function-local
 * statics and are shared by every call, so concurrent calls are not safe.
 * The decision buffer grows whenever a call needs more room than any earlier
 * call did. If an allocation fails the function returns without touching dec.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* dec,
                                                    unsigned char* syms,
                                                    unsigned int framebits)
{
    if (framebits < 12) {
        return;
    }

    static int once = 1;
    int d_numstates = (1 << 6);
    int rate = 2;
    static unsigned char* D = NULL;
    static size_t D_size = 0;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;

    int d_polys[2] = { 79, 109 };

    /* Bytes of decision storage this frame needs (8 bytes per record). */
    const size_t decisions_size = (size_t)(d_numstates / 8) * ((size_t)framebits + 6);

    if (once) {
        unsigned char* metrics =
            (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        unsigned char* table =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (metrics == NULL || table == NULL) {
            /* Leave `once` set so a later call retries the setup. */
            if (metrics != NULL) {
                volk_free(metrics);
            }
            if (table != NULL) {
                volk_free(table);
            }
            return;
        }

        X = metrics;
        Y = X + d_numstates;
        Branchtab = table;

        int state, i;

        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i]) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* The decision buffer must fit the current frame, not just the first one. */
    if (decisions_size > D_size) {
        unsigned char* grown =
            (unsigned char*)volk_malloc(decisions_size, volk_get_alignment());
        if (grown == NULL) {
            return;
        }
        if (D != NULL) {
            volk_free(D);
        }
        D = grown;
        D_size = decisions_size;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, decisions_size);

    volk_8u_x4_conv_k7_r2_8u_avx2(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the traceback from the state with the smallest metric in X. */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
307 
308 #endif /*LV_HAVE_AVX2*/
309 
310 
311 #if LV_HAVE_GENERIC
312 
313 
/*
 * Puppet wrapper that drives volk_8u_x4_conv_k7_r2_8u_generic on a whole frame
 * and traces the result back into decoded bits.
 *
 * dec       - output; chainback_viterbi stores one decoded bit per byte, for
 *             framebits / 2 - 6 bits
 * syms      - input symbols, handed unchanged to the kernel
 * framebits - frame length; frames shorter than 12 are ignored
 *
 * Scratch buffers (metrics, branch table, decisions) live in function-local
 * statics and are shared by every call, so concurrent calls are not safe.
 * The decision buffer grows whenever a call needs more room than any earlier
 * call did. If an allocation fails the function returns without touching dec.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* dec,
                                                       unsigned char* syms,
                                                       unsigned int framebits)
{
    if (framebits < 12) {
        return;
    }

    static int once = 1;
    int d_numstates = (1 << 6);
    int rate = 2;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned char* D = NULL;
    static size_t D_size = 0;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;

    int d_polys[2] = { 79, 109 };

    /* Bytes of decision storage this frame needs (8 bytes per record). */
    const size_t decisions_size = (size_t)(d_numstates / 8) * ((size_t)framebits + 6);

    if (once) {
        unsigned char* metrics =
            (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
        unsigned char* table =
            (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
        if (metrics == NULL || table == NULL) {
            /* Leave `once` set so a later call retries the setup. */
            if (metrics != NULL) {
                volk_free(metrics);
            }
            if (table != NULL) {
                volk_free(table);
            }
            return;
        }

        X = metrics;
        Y = X + d_numstates;
        Branchtab = table;

        int state, i;

        /* Initialize the branch table */
        for (state = 0; state < d_numstates / 2; state++) {
            for (i = 0; i < rate; i++) {
                Branchtab[i * d_numstates / 2 + state] =
                    parity((2 * state) & d_polys[i]) ? 255 : 0;
            }
        }

        once = 0;
    }

    /* The decision buffer must fit the current frame, not just the first one. */
    if (decisions_size > D_size) {
        unsigned char* grown =
            (unsigned char*)volk_malloc(decisions_size, volk_get_alignment());
        if (grown == NULL) {
            return;
        }
        if (D != NULL) {
            volk_free(D);
        }
        D = grown;
        D_size = decisions_size;
    }

    // unbias the old_metrics
    memset(X, 31, d_numstates);

    // initialize decisions
    memset(D, 0, decisions_size);

    volk_8u_x4_conv_k7_r2_8u_generic(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the traceback from the state with the smallest metric in X. */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < (d_numstates); ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
378 
379 #endif /* LV_HAVE_GENERIC */
380 
381 #if LV_HAVE_RVV
382 #include <riscv_vector.h>
383 
/*
 * Puppet wrapper that drives volk_8u_x4_conv_k7_r2_8u_rvv on a whole frame
 * and traces the result back into decoded bits.
 *
 * dec       - output; chainback_viterbi stores one decoded bit per byte, for
 *             framebits / 2 - 6 bits
 * syms      - input symbols, handed unchanged to the kernel
 * framebits - frame length; frames shorter than 12 are ignored
 *
 * The metrics (X, Y) and the branch table share one allocation; the decision
 * buffer is separate. All of them live in function-local statics shared by
 * every call, so concurrent calls are not safe. The decision buffer grows
 * whenever a call needs more room than any earlier call did. If an allocation
 * fails the function returns without touching dec.
 */
static inline void volk_8u_conv_k7_r2puppet_8u_rvv(unsigned char* dec,
                                                   unsigned char* syms,
                                                   unsigned int framebits)
{
    if (framebits < 12)
        return;

    int d_numstates = (1 << 6);
    static unsigned char* D = NULL;
    static size_t D_size = 0;
    static unsigned char* Y;
    static unsigned char* X;
    static unsigned int excess = 6;
    static unsigned char* Branchtab;

    /* Bytes of decision storage this frame needs (8 bytes per record). */
    const size_t decisions_size = (size_t)(d_numstates / 8) * ((size_t)framebits + 6);

    static int once = 1;
    if (once) {
        unsigned char* block =
            (unsigned char*)volk_malloc(3 * d_numstates, volk_get_alignment());
        if (block == NULL) {
            /* Leave `once` set so a later call retries the setup. */
            return;
        }

        X = block;
        Y = X + d_numstates;
        Branchtab = Y + d_numstates;

        /* Initialize the branch table */
        for (size_t state = 0; state < (size_t)(d_numstates / 2); state++) {
            Branchtab[state] = parity(state & 39) * 255;
            Branchtab[state + d_numstates / 2] = parity(state & 54) * 255;
        }

        once = 0;
    }

    /* The decision buffer must fit the current frame, not just the first one. */
    if (decisions_size > D_size) {
        unsigned char* grown =
            (unsigned char*)volk_malloc(decisions_size, volk_get_alignment());
        if (grown == NULL) {
            return;
        }
        if (D != NULL) {
            volk_free(D);
        }
        D = grown;
        D_size = decisions_size;
    }

    memset(X, 31, d_numstates);   // unbias the old_metrics
    memset(D, 0, decisions_size); // initialize decisions

    volk_8u_x4_conv_k7_r2_8u_rvv(
        Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);

    /* Start the traceback from the state with the smallest metric in X. */
    unsigned int min = X[0];
    int i = 0, state = 0;
    for (i = 0; i < d_numstates; ++i) {
        if (X[i] < min) {
            min = X[i];
            state = i;
        }
    }

    chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);

    return;
}
434 #endif /*LV_HAVE_RVV*/
435 
436 #endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/
data
Definition: plot_best_vs_generic.py:23
Definition: volk_8u_conv_k7_r2puppet_8u.h:17
unsigned int * w
Definition: volk_8u_conv_k7_r2puppet_8u.h:20
unsigned char * t
Definition: volk_8u_conv_k7_r2puppet_8u.h:19
size_t volk_get_alignment(void)
Get the machine alignment in bytes.
Definition: volk.tmpl.c:90
static int parity(int x)
Definition: volk_8u_conv_k7_r2puppet_8u.h:23
static void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char *dec, unsigned char *syms, unsigned int framebits)
Definition: volk_8u_conv_k7_r2puppet_8u.h:172
static int chainback_viterbi(unsigned char *data, unsigned int nbits, unsigned int endstate, unsigned int tailsize, unsigned char *decisions)
Definition: volk_8u_conv_k7_r2puppet_8u.h:33
static void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char *dec, unsigned char *syms, unsigned int framebits)
Definition: volk_8u_conv_k7_r2puppet_8u.h:103
static void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char *dec, unsigned char *syms, unsigned int framebits)
Definition: volk_8u_conv_k7_r2puppet_8u.h:314
static void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:212
static void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:310
static void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char *Y, unsigned char *X, unsigned char *syms, unsigned char *dec, unsigned int framebits, unsigned int excess, unsigned char *Branchtab)
Definition: volk_8u_x4_conv_k7_r2_8u.h:443
for i
Definition: volk_config_fixed.tmpl.h:13
__VOLK_DECL_BEGIN VOLK_API void * volk_malloc(size_t size, size_t alignment)
Allocate size bytes of data aligned to alignment.
Definition: volk_malloc.c:38