Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_sqrt_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
52 #ifndef INCLUDED_volk_32f_sqrt_32f_a_H
53 #define INCLUDED_volk_32f_sqrt_32f_a_H
54 
55 #include <inttypes.h>
56 #include <math.h>
57 #include <stdio.h>
58 
59 #ifdef LV_HAVE_SSE
60 #include <xmmintrin.h>
61 
62 static inline void
63 volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
64 {
65  unsigned int number = 0;
66  const unsigned int quarterPoints = num_points / 4;
67 
68  float* cPtr = cVector;
69  const float* aPtr = aVector;
70 
71  __m128 aVal, cVal;
72  for (; number < quarterPoints; number++) {
73  aVal = _mm_load_ps(aPtr);
74 
75  cVal = _mm_sqrt_ps(aVal);
76 
77  _mm_store_ps(cPtr, cVal); // Store the results back into the C container
78 
79  aPtr += 4;
80  cPtr += 4;
81  }
82 
83  number = quarterPoints * 4;
84  for (; number < num_points; number++) {
85  *cPtr++ = sqrtf(*aPtr++);
86  }
87 }
88 
89 #endif /* LV_HAVE_SSE */
90 
91 #ifdef LV_HAVE_AVX
92 #include <immintrin.h>
93 
/*!
 * \brief Element-wise square root of a float buffer, AVX variant using
 *        aligned loads and stores.
 *
 * Full groups of eight floats go through _mm256_sqrt_ps; the remaining
 * num_points % 8 values are finished with sqrtf().
 *
 * \param cVector    Output buffer; num_points results are written. Accessed
 *                   with _mm256_store_ps, so it must satisfy that intrinsic's
 *                   32-byte alignment requirement.
 * \param aVector    Input buffer; num_points values are read. Accessed with
 *                   _mm256_load_ps (same alignment requirement).
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vec_iters = num_points / 8;
    const unsigned int tail_len = num_points - vec_iters * 8;
    const float* src = aVector;
    float* dst = cVector;
    unsigned int i;

    /* Bulk part: one 256-bit register (eight floats) per pass. */
    for (i = vec_iters; i != 0; --i) {
        _mm256_store_ps(dst, _mm256_sqrt_ps(_mm256_load_ps(src)));
        src += 8;
        dst += 8;
    }

    /* Scalar clean-up for the elements that do not fill a register. */
    for (i = 0; i < tail_len; ++i) {
        dst[i] = sqrtf(src[i]);
    }
}
120 
121 #endif /* LV_HAVE_AVX */
122 
123 
124 #ifdef LV_HAVE_NEON
125 #include <arm_neon.h>
126 
/*!
 * \brief Element-wise square root of a float buffer, NEON variant.
 *
 * Full groups of four floats are processed with NEON; the remaining
 * num_points % 4 values are finished with sqrtf().
 *
 * On AArch64 the vector part uses vsqrtq_f32, so the vector lanes are
 * computed with a real square-root instruction rather than an estimate.
 * On 32-bit ARM that intrinsic is not available, so the original
 * fallback is kept: the reciprocal estimate of the reciprocal-square-root
 * estimate. No refinement steps are applied there, so the result is only a
 * coarse approximation of sqrt(x).
 *
 * \param cVector    Output buffer; num_points results are written.
 * \param aVector    Input buffer; num_points values are read.
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    for (number = 0; number < quarter_points; number++) {
        const float32x4_t in_vec = vld1q_f32(aPtr);
#if defined(__aarch64__)
        /* A64 provides a true vector square root. */
        const float32x4_t out_vec = vsqrtq_f32(in_vec);
#else
        /* ARMv7 NEON has no vector sqrt: 1 / (1 / sqrt(x)) from two
         * unrefined hardware estimates (low precision). */
        const float32x4_t out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
#endif
        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar clean-up for the elements that do not fill a register. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}
149 
150 #endif /* LV_HAVE_NEON */
151 
152 
153 #ifdef LV_HAVE_GENERIC
154 
155 static inline void
156 volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
157 {
158  float* cPtr = cVector;
159  const float* aPtr = aVector;
160  unsigned int number = 0;
161 
162  for (number = 0; number < num_points; number++) {
163  *cPtr++ = sqrtf(*aPtr++);
164  }
165 }
166 
167 #endif /* LV_HAVE_GENERIC */
168 
169 #endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
170 
171 #ifndef INCLUDED_volk_32f_sqrt_32f_u_H
172 #define INCLUDED_volk_32f_sqrt_32f_u_H
173 
174 #include <inttypes.h>
175 #include <math.h>
176 #include <stdio.h>
177 #ifdef LV_HAVE_AVX
178 #include <immintrin.h>
179 
/*!
 * \brief Element-wise square root of a float buffer, AVX variant using
 *        unaligned loads and stores.
 *
 * Full groups of eight floats go through _mm256_sqrt_ps; the remaining
 * num_points % 8 values are finished with sqrtf(). Memory is accessed with
 * _mm256_loadu_ps / _mm256_storeu_ps, which do not require 32-byte
 * alignment.
 *
 * \param cVector    Output buffer; num_points results are written.
 * \param aVector    Input buffer; num_points values are read.
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int vec_iters = num_points / 8;
    const unsigned int tail_len = num_points - vec_iters * 8;
    const float* src = aVector;
    float* dst = cVector;
    unsigned int i;

    /* Bulk part: one 256-bit register (eight floats) per pass. */
    for (i = vec_iters; i != 0; --i) {
        _mm256_storeu_ps(dst, _mm256_sqrt_ps(_mm256_loadu_ps(src)));
        src += 8;
        dst += 8;
    }

    /* Scalar clean-up for the elements that do not fill a register. */
    for (i = 0; i < tail_len; ++i) {
        dst[i] = sqrtf(src[i]);
    }
}
206 
207 #endif /* LV_HAVE_AVX */
208 
209 #ifdef LV_HAVE_RVV
210 #include <riscv_vector.h>
211 
/*!
 * \brief Element-wise square root of a float buffer, RISC-V Vector variant.
 *
 * Each pass asks __riscv_vsetvl_e32m8 how many elements (vl) to handle,
 * loads vl floats, applies __riscv_vfsqrt, stores vl results, and advances
 * both pointers by vl until nothing remains. There is no separate scalar
 * tail loop.
 *
 * \param cVector    Output buffer; num_points results are written.
 * \param aVector    Input buffer; num_points values are read.
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_rvv(float* cVector, const float* aVector, unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t input = __riscv_vle32_v_f32m8(aVector, vl);
        vfloat32m8_t root = __riscv_vfsqrt(input, vl);

        __riscv_vse32(cVector, root, vl);

        aVector += vl;
        cVector += vl;
        remaining -= vl;
    }
}
222 #endif /*LV_HAVE_RVV*/
223 
224 #endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:128
static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:95
static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:63
static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:181
static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sqrt_32f.h:156