Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_16i_x4_quad_max_star_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
46 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
47 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
48 
49 #include <inttypes.h>
50 
51 #ifdef LV_HAVE_SSE2
52 
53 #include <emmintrin.h>
54 
/*
 * Aligned SSE2 kernel: for every index i, writes to target[i] the "max star"
 * of src0[i], src1[i], src2[i] and src3[i].
 *
 * The pairwise selection is pick(a, b) = ((short)(a - b) > 0) ? a : b, applied
 * as pick(pick(src0, src1), pick(src2, src3)).  The comparison is made on the
 * 16-bit wrapped difference, not on the operands themselves.
 *
 * target, src0..src3: buffers of at least num_points shorts.  The vector loop
 *                     uses aligned loads/stores (_mm_load_si128 /
 *                     _mm_store_si128), so they must be 16-byte aligned.
 * num_points:         number of 16-bit elements to process.
 */
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
                                                        short* src0,
                                                        short* src1,
                                                        short* src2,
                                                        short* src3,
                                                        unsigned int num_points)
{
    /* Derive the block count straight from num_points: going through a byte
     * count (num_points * 2) wraps for very large inputs. */
    const unsigned int eighth_points = num_points / 8;
    unsigned int i;

    const __m128i* p_src0 = (const __m128i*)src0;
    const __m128i* p_src1 = (const __m128i*)src1;
    const __m128i* p_src2 = (const __m128i*)src2;
    const __m128i* p_src3 = (const __m128i*)src3;
    __m128i* p_target = (__m128i*)target;

    const __m128i zeros = _mm_setzero_si128();

    for (i = 0; i < eighth_points; ++i) {
        const __m128i a = _mm_load_si128(p_src0++);
        const __m128i b = _mm_load_si128(p_src1++);
        const __m128i c = _mm_load_si128(p_src2++);
        const __m128i d = _mm_load_si128(p_src3++);

        /* Lane mask is all-ones where (first - second) > 0.  The operand order
         * matches the scalar tail below, so a wrapped difference of -32768
         * selects the same element in both code paths. */
        const __m128i ab_gt = _mm_cmpgt_epi16(_mm_sub_epi16(a, b), zeros);
        const __m128i cd_gt = _mm_cmpgt_epi16(_mm_sub_epi16(c, d), zeros);

        /* Blend: take the first operand where the mask is set, else the second. */
        const __m128i ab =
            _mm_or_si128(_mm_and_si128(ab_gt, a), _mm_andnot_si128(ab_gt, b));
        const __m128i cd =
            _mm_or_si128(_mm_and_si128(cd_gt, c), _mm_andnot_si128(cd_gt, d));

        const __m128i fin_gt = _mm_cmpgt_epi16(_mm_sub_epi16(ab, cd), zeros);
        const __m128i result =
            _mm_or_si128(_mm_and_si128(fin_gt, ab), _mm_andnot_si128(fin_gt, cd));

        _mm_store_si128(p_target++, result);
    }

    /* Scalar tail for the num_points % 8 elements the vector loop did not cover. */
    for (i = eighth_points * 8; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
135 
136 #endif /*LV_HAVE_SSE2*/
137 
138 #ifdef LV_HAVE_NEON
139 
140 #include <arm_neon.h>
141 
/*
 * NEON kernel: for every index i, writes to target[i] the "max star" of
 * src0[i], src1[i], src2[i] and src3[i].
 *
 * Vector lanes keep the first operand of a pair when the 16-bit wrapped
 * difference (first - second) is >= 0 and the second operand otherwise; the
 * two pair winners are then combined with the same rule.  Elements left over
 * after the last full group of eight are handled one at a time.
 *
 * target, src0..src3: buffers of at least num_points shorts.
 * num_points:         number of 16-bit elements to process.
 */
static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      unsigned int num_points)
{
    const unsigned int vector_iters = num_points / 8;
    const int16x8_t zero_vec = vdupq_n_s16(0);
    unsigned int n;

    for (n = 0; n < vector_iters; ++n) {
        const int16x8_t a = vld1q_s16(src0);
        const int16x8_t b = vld1q_s16(src1);
        const int16x8_t c = vld1q_s16(src2);
        const int16x8_t d = vld1q_s16(src3);

        /* All-ones lanes where (first - second) >= 0. */
        const uint16x8_t ab_ge = vcgeq_s16(vsubq_s16(a, b), zero_vec);
        const uint16x8_t cd_ge = vcgeq_s16(vsubq_s16(c, d), zero_vec);

        /* Bitwise select: first operand where the mask is set, else second. */
        const int16x8_t ab = vbslq_s16(ab_ge, a, b);
        const int16x8_t cd = vbslq_s16(cd_ge, c, d);

        const uint16x8_t fin_ge = vcgeq_s16(vsubq_s16(ab, cd), zero_vec);
        vst1q_s16(target, vbslq_s16(fin_ge, ab, cd));

        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        target += 8;
    }

    /* Scalar tail: the pointers already sit just past the last full vector. */
    for (n = vector_iters * 8; n < num_points; ++n) {
        const short first = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
        const short second = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
        *target++ = ((short)(first - second) > 0) ? first : second;
        src0++;
        src1++;
        src2++;
        src3++;
    }
}
203 #endif /* LV_HAVE_NEON */
204 
205 
206 #ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel: for every index i, writes to target[i] the
 * "max star" of src0[i], src1[i], src2[i] and src3[i].
 *
 * The pairwise selection is pick(a, b) = ((short)(a - b) > 0) ? a : b, applied
 * as pick(pick(src0, src1), pick(src2, src3)).  The comparison is made on the
 * difference narrowed back to short, not on the operands themselves.
 *
 * target, src0..src3: buffers of at least num_points shorts.
 * num_points:         number of 16-bit elements to process.
 */
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
                                                         short* src0,
                                                         short* src1,
                                                         short* src2,
                                                         short* src3,
                                                         unsigned int num_points)
{
    unsigned int i;

    /* Iterate over num_points directly with an unsigned index: deriving the
     * bound as (num_points * 2) >> 1 drops the top bit when the multiply
     * wraps, and a signed int bound cannot represent the full range. */
    for (i = 0; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
228 
229 #endif /*LV_HAVE_GENERIC*/
230 
231 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
static void volk_16i_x4_quad_max_star_16i_generic(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:207
static void volk_16i_x4_quad_max_star_16i_neon(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:142
static void volk_16i_x4_quad_max_star_16i_a_sse2(short *target, short *src0, short *src1, short *src2, short *src3, unsigned int num_points)
Definition: volk_16i_x4_quad_max_star_16i.h:55
for i
Definition: volk_config_fixed.tmpl.h:13