Vector Optimized Library of Kernels  3.2.0
Architecture-tuned implementations of math kernels
volk_32f_reciprocal_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
4  *
5  * This file is part of VOLK
6  *
7  * SPDX-License-Identifier: LGPL-3.0-or-later
8  */
9 
53 #ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
54 #define INCLUDED_volk_32f_reciprocal_32f_a_H
55 
56 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: writes 1 / in[k] to out[k] for the first
 * num_points elements.
 *
 * out         destination buffer, at least num_points floats
 * in          source buffer, at least num_points floats
 * num_points  number of elements to process; 0 leaves out untouched
 *
 * Each element is read before the matching element is written.
 */
static inline void
volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
{
    const float* const stop = in + num_points;

    for (; in != stop; ++in, ++out) {
        *out = 1.f / *in;
    }
}
64 #endif /* LV_HAVE_GENERIC */
65 
66 #ifdef LV_HAVE_SSE
67 #include <xmmintrin.h>
/*
 * SSE kernel using aligned loads and stores: computes 1 / in[k] four floats
 * at a time with a full-precision vector divide, then hands the remaining
 * (num_points % 4) elements to the generic kernel.
 *
 * _mm_load_ps / _mm_store_ps are the aligned variants, so in and out are
 * expected to be 16-byte aligned.
 */
static inline void
volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points)
{
    /* Element count covered by whole 4-float vectors. */
    const unsigned int vectorized = (num_points / 4) * 4;
    const __m128 numerator = _mm_set_ps1(1.f);

    for (unsigned int offset = 0; offset < vectorized; offset += 4) {
        const __m128 denominator = _mm_load_ps(in + offset);
        _mm_store_ps(out + offset, _mm_div_ps(numerator, denominator));
    }

    /* Scalar tail for the leftover elements. */
    volk_32f_reciprocal_32f_generic(
        out + vectorized, in + vectorized, num_points - vectorized);
}
86 #endif /* LV_HAVE_SSE */
87 
88 #ifdef LV_HAVE_AVX
89 #include <immintrin.h>
/*
 * AVX kernel using aligned loads and stores: computes 1 / in[k] eight floats
 * at a time with a full-precision vector divide, then hands the remaining
 * (num_points % 8) elements to the generic kernel.
 *
 * _mm256_load_ps / _mm256_store_ps are the aligned variants, so in and out
 * are expected to be 32-byte aligned.
 */
static inline void
volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
{
    /* Element count covered by whole 8-float vectors. */
    const unsigned int vectorized = (num_points / 8) * 8;
    const __m256 numerator = _mm256_set1_ps(1.f);

    for (unsigned int offset = 0; offset < vectorized; offset += 8) {
        const __m256 denominator = _mm256_load_ps(in + offset);
        _mm256_store_ps(out + offset, _mm256_div_ps(numerator, denominator));
    }

    /* Scalar tail for the leftover elements. */
    volk_32f_reciprocal_32f_generic(
        out + vectorized, in + vectorized, num_points - vectorized);
}
108 #endif /* LV_HAVE_AVX */
109 
110 #ifdef LV_HAVE_AVX512F
111 #include <immintrin.h>
/*
 * AVX-512F kernel using aligned loads and stores: processes sixteen floats
 * per iteration, then hands the remaining (num_points % 16) elements to the
 * generic kernel.
 *
 * NOTE: the vector path uses _mm512_rcp14_ps, which is an approximate
 * reciprocal (Intel documents its relative error as below 2^-14), whereas
 * the scalar tail performs an exact division. Results from the two paths
 * can therefore differ slightly in precision.
 *
 * _mm512_load_ps / _mm512_store_ps are the aligned variants, so in and out
 * are expected to be 64-byte aligned.
 */
static inline void
volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points)
{
    /* Element count covered by whole 16-float vectors. */
    const unsigned int vectorized = (num_points / 16) * 16;

    for (unsigned int offset = 0; offset < vectorized; offset += 16) {
        const __m512 values = _mm512_load_ps(in + offset);
        _mm512_store_ps(out + offset, _mm512_rcp14_ps(values));
    }

    /* Scalar tail for the leftover elements. */
    volk_32f_reciprocal_32f_generic(
        out + vectorized, in + vectorized, num_points - vectorized);
}
129 #endif /* LV_HAVE_AVX512F */
130 
131 #endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */
132 
133 #ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
134 #define INCLUDED_volk_32f_reciprocal_32f_u_H
135 
136 #ifdef LV_HAVE_SSE
137 #include <xmmintrin.h>
/*
 * SSE kernel using unaligned loads and stores: computes 1 / in[k] four
 * floats at a time with a full-precision vector divide, then hands the
 * remaining (num_points % 4) elements to the generic kernel.
 *
 * Uses _mm_loadu_ps / _mm_storeu_ps, so no particular alignment of in or
 * out is required.
 */
static inline void
volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points)
{
    const __m128 unity = _mm_set_ps1(1.f);
    unsigned int blocks_left = num_points / 4;
    /* Leftover elements that do not fill a whole 4-float vector. */
    const unsigned int tail = num_points - blocks_left * 4;

    while (blocks_left != 0) {
        const __m128 chunk = _mm_loadu_ps(in);
        _mm_storeu_ps(out, _mm_div_ps(unity, chunk));
        in += 4;
        out += 4;
        --blocks_left;
    }

    /* in/out now point at the first unprocessed element. */
    volk_32f_reciprocal_32f_generic(out, in, tail);
}
156 #endif /* LV_HAVE_SSE */
157 
158 #ifdef LV_HAVE_AVX
159 #include <immintrin.h>
/*
 * AVX kernel using unaligned loads and stores: computes 1 / in[k] eight
 * floats at a time with a full-precision vector divide, then hands the
 * remaining (num_points % 8) elements to the generic kernel.
 *
 * Uses _mm256_loadu_ps / _mm256_storeu_ps, so no particular alignment of in
 * or out is required.
 */
static inline void
volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
{
    const __m256 unity = _mm256_set1_ps(1.f);
    unsigned int blocks_left = num_points / 8;
    /* Leftover elements that do not fill a whole 8-float vector. */
    const unsigned int tail = num_points - blocks_left * 8;

    while (blocks_left != 0) {
        const __m256 chunk = _mm256_loadu_ps(in);
        _mm256_storeu_ps(out, _mm256_div_ps(unity, chunk));
        in += 8;
        out += 8;
        --blocks_left;
    }

    /* in/out now point at the first unprocessed element. */
    volk_32f_reciprocal_32f_generic(out, in, tail);
}
178 #endif /* LV_HAVE_AVX */
179 
180 #ifdef LV_HAVE_AVX512F
181 #include <immintrin.h>
/*
 * AVX-512F kernel using unaligned loads and stores: processes sixteen
 * floats per iteration, then hands the remaining (num_points % 16) elements
 * to the generic kernel.
 *
 * NOTE: the vector path uses _mm512_rcp14_ps, which is an approximate
 * reciprocal (Intel documents its relative error as below 2^-14), whereas
 * the scalar tail performs an exact division. Results from the two paths
 * can therefore differ slightly in precision.
 *
 * Uses _mm512_loadu_ps / _mm512_storeu_ps, so no particular alignment of in
 * or out is required.
 */
static inline void
volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points)
{
    unsigned int blocks_left = num_points / 16;
    /* Leftover elements that do not fill a whole 16-float vector. */
    const unsigned int tail = num_points - blocks_left * 16;

    while (blocks_left != 0) {
        const __m512 chunk = _mm512_loadu_ps(in);
        _mm512_storeu_ps(out, _mm512_rcp14_ps(chunk));
        in += 16;
        out += 16;
        --blocks_left;
    }

    /* in/out now point at the first unprocessed element. */
    volk_32f_reciprocal_32f_generic(out, in, tail);
}
199 #endif /* LV_HAVE_AVX512F */
200 
201 #ifdef LV_HAVE_RVV
202 #include <riscv_vector.h>
203 
/*
 * RISC-V Vector kernel: computes 1 / in[k] for num_points floats.
 *
 * Each pass asks __riscv_vsetvl_e32m8 how many 32-bit elements (LMUL = 8)
 * to handle for the remaining count, processes exactly that many, and
 * advances. The loop runs until nothing remains, so there is no separate
 * scalar tail.
 */
static inline void
volk_32f_reciprocal_32f_rvv(float* out, const float* in, unsigned int num_points)
{
    size_t remaining = num_points;

    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m8(remaining);
        vfloat32m8_t denominators = __riscv_vle32_v_f32m8(in, vl);
        /* vfrdiv is the reversed divide: scalar / vector, i.e. 1.0f / x. */
        vfloat32m8_t reciprocals = __riscv_vfrdiv(denominators, 1.0f, vl);
        __riscv_vse32(out, reciprocals, vl);

        in += vl;
        out += vl;
        remaining -= vl;
    }
}
214 #endif /*LV_HAVE_RVV*/
215 
216 #endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
static void volk_32f_reciprocal_32f_a_sse(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_reciprocal_32f.h:69
static void volk_32f_reciprocal_32f_u_avx(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_reciprocal_32f.h:161
static void volk_32f_reciprocal_32f_a_avx(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_reciprocal_32f.h:91
static void volk_32f_reciprocal_32f_generic(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_reciprocal_32f.h:58
static void volk_32f_reciprocal_32f_u_sse(float *out, const float *in, unsigned int num_points)
Definition: volk_32f_reciprocal_32f.h:139
for i
Definition: volk_config_fixed.tmpl.h:13