Vector Optimized Library of Kernels  2.4
Architecture-tuned implementations of math kernels
volk_8i_convert_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_8i_convert_16i_u_H
54 #define INCLUDED_volk_8i_convert_16i_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_AVX2
60 #include <immintrin.h>
61 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX2 variant using unaligned loads and stores).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process.
 *
 * Samples are handled sixteen at a time in the vector loop; whatever is
 * left over (num_points % 16) is converted by the scalar tail loop.
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m256i* outputVectorPtr = (__m256i*)outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* Sign-extend 16 bytes into 16 words, then shift left by 8 (x256). */
        const __m128i inputVal = _mm_loadu_si128(inputVectorPtr++);
        const __m256i ret = _mm256_slli_epi16(_mm256_cvtepi8_epi16(inputVal), 8);
        _mm256_storeu_si256(outputVectorPtr++, ret);
    }

    /* Scalar tail for the samples that did not fill a whole vector. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
89 #endif /* LV_HAVE_AVX2 */
90 
91 
92 #ifdef LV_HAVE_SSE4_1
93 #include <smmintrin.h>
94 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (SSE4.1 variant using unaligned loads and stores).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process.
 *
 * Each iteration loads sixteen input bytes and produces two 128-bit
 * output vectors (eight words each). The remaining num_points % 16
 * samples are converted by the scalar tail loop.
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m128i* outputVectorPtr = (__m128i*)outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m128i inputVal = _mm_loadu_si128(inputVectorPtr++);

        /* Low eight bytes: sign-extend to words, shift left by 8 (x256). */
        const __m128i lowWords = _mm_slli_epi16(_mm_cvtepi8_epi16(inputVal), 8);
        _mm_storeu_si128(outputVectorPtr++, lowWords);

        /* Move the high eight bytes down, then convert them the same way. */
        const __m128i highBytes = _mm_srli_si128(inputVal, 8);
        const __m128i highWords = _mm_slli_epi16(_mm_cvtepi8_epi16(highBytes), 8);
        _mm_storeu_si128(outputVectorPtr++, highWords);
    }

    /* Scalar tail for the samples that did not fill a whole vector. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
130 #endif /* LV_HAVE_SSE4_1 */
131 
132 
133 #ifdef LV_HAVE_GENERIC
134 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable scalar implementation).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process; 0 writes nothing.
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        /* int8 * 256 always lies in [-32768, 32512], so it fits in int16_t. */
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
147 #endif /* LV_HAVE_GENERIC */
148 
149 
150 #endif /* INCLUDED_volk_8i_convert_16i_u_H */
151 
152 
153 #ifndef INCLUDED_volk_8i_convert_16i_a_H
154 #define INCLUDED_volk_8i_convert_16i_a_H
155 
156 #include <inttypes.h>
157 #include <stdio.h>
158 
159 #ifdef LV_HAVE_AVX2
160 #include <immintrin.h>
161 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (AVX2 variant using aligned loads and stores).
 *
 * outputVector: receives num_points int16_t results; written with
 *               _mm256_store_si256, which requires 32-byte alignment.
 * inputVector:  num_points int8_t samples; read with _mm_load_si128,
 *               which requires 16-byte alignment.
 * num_points:   number of samples to process.
 *
 * Samples are handled sixteen at a time in the vector loop; whatever is
 * left over (num_points % 16) is converted by the scalar tail loop.
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m256i* outputVectorPtr = (__m256i*)outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        /* Sign-extend 16 bytes into 16 words, then shift left by 8 (x256). */
        const __m128i inputVal = _mm_load_si128(inputVectorPtr++);
        const __m256i ret = _mm256_slli_epi16(_mm256_cvtepi8_epi16(inputVal), 8);
        _mm256_store_si256(outputVectorPtr++, ret);
    }

    /* Scalar tail for the samples that did not fill a whole vector. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
189 #endif /* LV_HAVE_AVX2 */
190 
191 
192 #ifdef LV_HAVE_SSE4_1
193 #include <smmintrin.h>
194 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (SSE4.1 variant using aligned loads and stores).
 *
 * outputVector: receives num_points int16_t results; written with
 *               _mm_store_si128, which requires 16-byte alignment.
 * inputVector:  num_points int8_t samples; read with _mm_load_si128,
 *               which requires 16-byte alignment.
 * num_points:   number of samples to process.
 *
 * Each iteration loads sixteen input bytes and produces two 128-bit
 * output vectors (eight words each). The remaining num_points % 16
 * samples are converted by the scalar tail loop.
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m128i* outputVectorPtr = (__m128i*)outputVector;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        const __m128i inputVal = _mm_load_si128(inputVectorPtr++);

        /* Low eight bytes: sign-extend to words, shift left by 8 (x256). */
        const __m128i lowWords = _mm_slli_epi16(_mm_cvtepi8_epi16(inputVal), 8);
        _mm_store_si128(outputVectorPtr++, lowWords);

        /* Move the high eight bytes down, then convert them the same way. */
        const __m128i highBytes = _mm_srli_si128(inputVal, 8);
        const __m128i highWords = _mm_slli_epi16(_mm_cvtepi8_epi16(highBytes), 8);
        _mm_store_si128(outputVectorPtr++, highWords);
    }

    /* Scalar tail for the samples that did not fill a whole vector. */
    for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
230 #endif /* LV_HAVE_SSE4_1 */
231 
232 
233 #ifdef LV_HAVE_GENERIC
234 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (portable scalar implementation, "a" naming variant).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process; 0 writes nothing.
 *
 * The body uses only plain element accesses, so it places no alignment
 * requirement on either buffer beyond that of the element types.
 */
static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
                                                 const int8_t* inputVector,
                                                 unsigned int num_points)
{
    for (unsigned int number = 0; number < num_points; number++) {
        /* int8 * 256 always lies in [-32768, 32512], so it fits in int16_t. */
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
247 #endif /* LV_HAVE_GENERIC */
248 
249 
250 #ifdef LV_HAVE_NEON
251 #include <arm_neon.h>
252 
/*
 * Converts signed 8-bit samples to signed 16-bit samples scaled by 256
 * (ARM NEON variant).
 *
 * outputVector: receives num_points int16_t results.
 * inputVector:  num_points int8_t samples to convert.
 * num_points:   number of samples to process.
 *
 * Samples are handled eight at a time in the vector loop; the remaining
 * num_points % 8 samples are converted by the scalar tail loop.
 */
static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
                                            const int8_t* inputVector,
                                            unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    int16_t* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number;

    for (number = 0; number < eighth_points; ++number) {
        /* Widen eight int8 lanes to int16, then shift left by 8 (x256). */
        const int8x8_t input_vec = vld1_s8(inputVectorPtr);
        const int16x8_t converted_vec = vshlq_n_s16(vmovl_s8(input_vec), 8);
        vst1q_s16(outputVectorPtr, converted_vec);

        inputVectorPtr += 8;
        outputVectorPtr += 8;
    }

    /* Scalar tail; the pointers already sit just past the vector part. */
    for (number = eighth_points * 8; number < num_points; number++) {
        *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * 256);
    }
}
283 #endif /* LV_HAVE_NEON */
284 
285 
286 #ifdef LV_HAVE_ORC
/* ORC implementation of the conversion; defined outside this header. */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);

/*
 * ORC variant of the 8-bit to 16-bit conversion: forwards all three
 * arguments unchanged to volk_8i_convert_16i_a_orc_impl.
 *
 * outputVector: destination buffer for num_points int16_t results.
 * inputVector:  source buffer of num_points int8_t samples.
 * num_points:   number of samples to process.
 *
 * NOTE(review): this wrapper is named "_u_" but calls the "_a_" impl;
 * presumably the ORC code has no alignment requirement - confirm
 * against the ORC source.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
297 #endif /* LV_HAVE_ORC */
298 
299 
300 #endif /* INCLUDED_volk_8i_convert_16i_a_H */
static void volk_8i_convert_16i_a_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:235
static void volk_8i_convert_16i_neon(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:253
static void volk_8i_convert_16i_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:135