Vector Optimized Library of Kernels  3.1.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_multiply_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
56 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
57 #define INCLUDED_volk_32f_s32f_multiply_32f_u_H
58 
59 #include <inttypes.h>
60 #include <stdio.h>
61 
62 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: stores aVector[i] * scalar into cVector[i] for
 * every i in [0, num_points), walking the buffers front to back.
 *
 * cVector    - destination buffer, written for num_points elements
 * aVector    - source buffer, read for num_points elements
 * scalar     - factor applied to every source element
 * num_points - number of elements to process; 0 touches neither buffer
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    unsigned int idx = 0;

    while (idx < num_points) {
        /* Each element is read before its slot is written. */
        const float sample = aVector[idx];
        cVector[idx] = sample * scalar;
        ++idx;
    }
}
72 #endif /* LV_HAVE_GENERIC */
73 
74 #ifdef LV_HAVE_SSE
75 #include <xmmintrin.h>
76 
/*
 * SSE kernel using unaligned loads/stores: cVector[i] = aVector[i] * scalar
 * for every i in [0, num_points).
 *
 * Elements are processed four at a time with _mm_loadu_ps / _mm_storeu_ps;
 * the remaining (num_points % 4) elements are handled by a scalar loop.
 *
 * cVector    - destination buffer, written for num_points elements
 * aVector    - source buffer, read for num_points elements
 * scalar     - factor applied to every source element
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of four not exceeding num_points. */
    const unsigned int vectorSpan = num_points & ~3u;
    /* The scalar replicated into all four lanes. */
    const __m128 factor = _mm_set_ps1(scalar);

    unsigned int idx = 0;

    for (; idx < vectorSpan; idx += 4) {
        const __m128 samples = _mm_loadu_ps(aVector + idx);
        _mm_storeu_ps(cVector + idx, _mm_mul_ps(samples, factor));
    }

    /* Scalar tail: idx continues from where the vector loop stopped. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
103 #endif /* LV_HAVE_SSE */
104 
105 #ifdef LV_HAVE_AVX
106 #include <immintrin.h>
107 
/*
 * AVX kernel using unaligned loads/stores: cVector[i] = aVector[i] * scalar
 * for every i in [0, num_points).
 *
 * Elements are processed eight at a time with _mm256_loadu_ps /
 * _mm256_storeu_ps; the remaining (num_points % 8) elements are handled by
 * a scalar loop.
 *
 * cVector    - destination buffer, written for num_points elements
 * aVector    - source buffer, read for num_points elements
 * scalar     - factor applied to every source element
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    const unsigned int tailCount = num_points % 8;
    /* The scalar replicated into all eight lanes. */
    const __m256 factor = _mm256_set1_ps(scalar);

    const float* src = aVector;
    float* dst = cVector;

    for (unsigned int left = blockCount; left > 0; --left) {
        const __m256 samples = _mm256_loadu_ps(src);
        _mm256_storeu_ps(dst, _mm256_mul_ps(samples, factor));
        src += 8;
        dst += 8;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    for (unsigned int left = tailCount; left > 0; --left) {
        *dst++ = (*src++) * scalar;
    }
}
134 #endif /* LV_HAVE_AVX */
135 
136 #ifdef LV_HAVE_RISCV64
/*
 * Prototype only: the definition is not part of this header.
 * NOTE(review): the name suggests an implementation tuned for the SiFive U74
 * core, provided by a separate source file -- confirm against the build.
 * The parameter list matches the other kernels declared in this file.
 */
extern void
volk_32f_s32f_multiply_32f_sifive_u74(float* cVector,
                                      const float* aVector,
                                      const float scalar,
                                      unsigned int num_points);
141 #endif /* LV_HAVE_RISCV64 */
142 
143 
144 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
145 
146 
147 #ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
148 #define INCLUDED_volk_32f_s32f_multiply_32f_a_H
149 
150 #include <inttypes.h>
151 #include <stdio.h>
152 
153 #ifdef LV_HAVE_SSE
154 #include <xmmintrin.h>
155 
/*
 * SSE kernel using aligned loads/stores: cVector[i] = aVector[i] * scalar
 * for every i in [0, num_points).
 *
 * Elements are processed four at a time with _mm_load_ps / _mm_store_ps,
 * which require 16-byte aligned addresses, so both buffers must start on a
 * 16-byte boundary. The remaining (num_points % 4) elements are handled by
 * a scalar loop.
 *
 * cVector    - destination buffer, written for num_points elements
 * aVector    - source buffer, read for num_points elements
 * scalar     - factor applied to every source element
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* The scalar replicated into all four lanes. */
    const __m128 factor = _mm_set_ps1(scalar);
    /* Number of leading elements covered by whole four-wide vectors. */
    const unsigned int simdCount = num_points - (num_points % 4);

    unsigned int pos = 0;

    while (pos < simdCount) {
        const __m128 samples = _mm_load_ps(&aVector[pos]);
        const __m128 scaled = _mm_mul_ps(samples, factor);
        _mm_store_ps(&cVector[pos], scaled);
        pos += 4;
    }

    /* Scalar tail: pos continues from where the vector loop stopped. */
    while (pos < num_points) {
        cVector[pos] = aVector[pos] * scalar;
        ++pos;
    }
}
182 #endif /* LV_HAVE_SSE */
183 
184 #ifdef LV_HAVE_AVX
185 #include <immintrin.h>
186 
/*
 * AVX kernel using aligned loads/stores: cVector[i] = aVector[i] * scalar
 * for every i in [0, num_points).
 *
 * Elements are processed eight at a time with _mm256_load_ps /
 * _mm256_store_ps, which require 32-byte aligned addresses, so both buffers
 * must start on a 32-byte boundary. The remaining (num_points % 8) elements
 * are handled by a scalar loop.
 *
 * cVector    - destination buffer, written for num_points elements
 * aVector    - source buffer, read for num_points elements
 * scalar     - factor applied to every source element
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Largest multiple of eight not exceeding num_points. */
    const unsigned int vectorSpan = num_points & ~7u;
    /* The scalar replicated into all eight lanes. */
    const __m256 factor = _mm256_set1_ps(scalar);

    unsigned int idx;

    for (idx = 0; idx < vectorSpan; idx += 8) {
        const __m256 samples = _mm256_load_ps(aVector + idx);
        _mm256_store_ps(cVector + idx, _mm256_mul_ps(samples, factor));
    }

    /* Scalar tail: idx continues from where the vector loop stopped. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
213 #endif /* LV_HAVE_AVX */
214 
215 #ifdef LV_HAVE_NEON
216 #include <arm_neon.h>
217 
/*
 * NEON kernel: cVector[i] = aVector[i] * scalar for every i in
 * [0, num_points).
 *
 * Elements are processed four at a time (vld1q_f32 -> vmulq_n_f32 ->
 * vst1q_f32); the remaining (num_points % 4) elements are handled by a
 * scalar loop.
 *
 * cVector    - destination buffer, written for num_points elements
 * aVector    - source buffer, read for num_points elements
 * scalar     - factor applied to every source element
 * num_points - number of elements to process
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    /* Number of leading elements covered by whole four-wide vectors. */
    const unsigned int simdCount = (num_points / 4) * 4;

    unsigned int idx = 0;

    for (; idx < simdCount; idx += 4) {
        const float32x4_t samples = vld1q_f32(aVector + idx);
        /* vmulq_n_f32 multiplies every lane by the same scalar. */
        vst1q_f32(cVector + idx, vmulq_n_f32(samples, scalar));
    }

    /* Scalar tail: idx continues from where the vector loop stopped. */
    for (; idx < num_points; ++idx) {
        cVector[idx] = aVector[idx] * scalar;
    }
}
240 #endif /* LV_HAVE_NEON */
241 
242 
243 #ifdef LV_HAVE_ORC
244 
/*
 * Prototype of the ORC-backed implementation; its definition is not part of
 * this header. Unlike the kernels in this file it takes the element count as
 * a signed int.
 */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  int num_points);

/*
 * Thin wrapper that forwards its arguments unchanged to
 * volk_32f_s32f_multiply_32f_a_orc_impl.
 *
 * cVector    - destination buffer, passed through as dst
 * aVector    - source buffer, passed through as src
 * scalar     - factor, passed through unchanged
 * num_points - element count, converted from unsigned int to int
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Same conversion the call would perform implicitly, written out so the
     * unsigned -> signed narrowing is visible to the reader. */
    const int count = (int)num_points;

    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, count);
}
257 
258 #endif /* LV_HAVE_ORC */
259 
260 #endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
static void volk_32f_s32f_multiply_32f_a_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:187
static void volk_32f_s32f_multiply_32f_u_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:77
static void volk_32f_s32f_multiply_32f_u_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:108
static void volk_32f_s32f_multiply_32f_a_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:156
static void volk_32f_s32f_multiply_32f_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:63
static void volk_32f_s32f_multiply_32f_u_neon(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:218