Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_16i_x5_add_quad_16i_x4.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
51 #ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
52 #define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
53 
54 #include <inttypes.h>
55 #include <stdio.h>
56 
57 #ifdef LV_HAVE_SSE2
58 #include <emmintrin.h>
59 #include <xmmintrin.h>
60 
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    /* Computes four element-wise sums that share a common addend:
     *   target0[i] = src0[i] + src1[i]
     *   target1[i] = src0[i] + src2[i]
     *   target2[i] = src0[i] + src3[i]
     *   target3[i] = src0[i] + src4[i]
     * "_a" kernel: aligned load/store intrinsics are used, so all nine
     * pointers must be 16-byte aligned.
     */
    const unsigned int num_bytes = num_points * 2;

    /* One 128-bit iteration consumes 16 bytes = 8 shorts. */
    const int chunks = (int)(num_bytes >> 4);
    const int tail = (int)((num_bytes >> 1) & 7);

    const __m128i* in0 = (const __m128i*)src0;
    const __m128i* in1 = (const __m128i*)src1;
    const __m128i* in2 = (const __m128i*)src2;
    const __m128i* in3 = (const __m128i*)src3;
    const __m128i* in4 = (const __m128i*)src4;
    __m128i* out0 = (__m128i*)target0;
    __m128i* out1 = (__m128i*)target1;
    __m128i* out2 = (__m128i*)target2;
    __m128i* out3 = (__m128i*)target3;

    int n;
    for (n = 0; n < chunks; ++n) {
        /* Load the shared operand once, reuse it for all four adds. */
        const __m128i common = _mm_load_si128(in0++);
        _mm_store_si128(out0++, _mm_add_epi16(common, _mm_load_si128(in1++)));
        _mm_store_si128(out1++, _mm_add_epi16(common, _mm_load_si128(in2++)));
        _mm_store_si128(out2++, _mm_add_epi16(common, _mm_load_si128(in3++)));
        _mm_store_si128(out3++, _mm_add_epi16(common, _mm_load_si128(in4++)));
    }

    /* Scalar cleanup for the final num_points % 8 elements. */
    for (n = chunks * 8; n < chunks * 8 + tail; ++n) {
        target0[n] = src0[n] + src1[n];
        target1[n] = src0[n] + src2[n];
        target2[n] = src0[n] + src3[n];
        target3[n] = src0[n] + src4[n];
    }
}
131 #endif /*LV_HAVE_SSE2*/
132 
133 #ifdef LV_HAVE_NEON
134 #include <arm_neon.h>
135 
static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    /* Computes four element-wise sums that share a common addend:
     *   target0[i] = src0[i] + src1[i]
     *   target1[i] = src0[i] + src2[i]
     *   target2[i] = src0[i] + src3[i]
     *   target3[i] = src0[i] + src4[i]
     * Vector loop handles 8 shorts per iteration; a scalar loop finishes
     * the remaining num_points % 8 elements.
     */
    const unsigned int eighth_points = num_points / 8;
    unsigned int n;

    for (n = 0; n < eighth_points; ++n) {
        /* Load the shared operand once per iteration, reuse for all four adds. */
        const int16x8_t common = vld1q_s16(src0);
        vst1q_s16(target0, vaddq_s16(common, vld1q_s16(src1)));
        vst1q_s16(target1, vaddq_s16(common, vld1q_s16(src2)));
        vst1q_s16(target2, vaddq_s16(common, vld1q_s16(src3)));
        vst1q_s16(target3, vaddq_s16(common, vld1q_s16(src4)));

        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        src4 += 8;
        target0 += 8;
        target1 += 8;
        target2 += 8;
        target3 += 8;
    }

    /* Scalar tail; src0 is advanced only after its last use in the group. */
    for (n = eighth_points * 8; n < num_points; ++n) {
        *target0++ = *src0 + *src1++;
        *target1++ = *src0 + *src2++;
        *target2++ = *src0 + *src3++;
        *target3++ = *src0++ + *src4++;
    }
}
186 
187 #endif /* LV_HAVE_NEON */
188 
189 #ifdef LV_HAVE_GENERIC
190 
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    /* Portable reference implementation. Computes four element-wise sums
     * that share a common addend:
     *   target0[i] = src0[i] + src1[i]
     *   target1[i] = src0[i] + src2[i]
     *   target2[i] = src0[i] + src3[i]
     *   target3[i] = src0[i] + src4[i]
     *
     * Fix: loop directly over num_points with an unsigned index instead of
     * the previous num_points * 2 -> >> 1 byte-count round-trip, which both
     * mixed a signed int index with the unsigned count and silently dropped
     * the top bit of num_points when the multiplication wrapped.
     */
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
215 
216 #endif /* LV_HAVE_GENERIC */
217 
218 #endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
static void volk_16i_x5_add_quad_16i_x4_a_sse2(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition: volk_16i_x5_add_quad_16i_x4.h:61
static void volk_16i_x5_add_quad_16i_x4_neon(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition: volk_16i_x5_add_quad_16i_x4.h:136
static void volk_16i_x5_add_quad_16i_x4_generic(short *target0, short *target1, short *target2, short *target3, short *src0, short *src1, short *src2, short *src3, short *src4, unsigned int num_points)
Definition: volk_16i_x5_add_quad_16i_x4.h:191
for i
Definition: volk_config_fixed.tmpl.h:13