libstdc++: simd_builtin.h
1 // Simd Abi specific implementations -*- C++ -*-
2 
3 // Copyright (C) 2020-2024 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
26 #define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
27 
28 #if __cplusplus >= 201703L
29 
30 #include <array>
31 #include <cmath>
32 #include <cstdlib>
33 
34 _GLIBCXX_SIMD_BEGIN_NAMESPACE
35 // _S_allbits{{{
36 template <typename _V>
37  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
38  = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());
39 
40 // }}}
41 // _S_signmask, _S_absmask{{{
42 template <typename _V, typename = _VectorTraits<_V>>
43  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
44  = __xor(_V() + 1, _V() - 1);
45 
46 template <typename _V, typename = _VectorTraits<_V>>
47  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
48  = __andnot(_S_signmask<_V>, _S_allbits<_V>);
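// E.g. for a float vector _V, _S_signmask<_V> has only the sign bit set in
// every element (1.f XOR -1.f) and _S_absmask<_V> is its complement, so
//   __and(__x, _S_absmask<_V>)   would clear the sign bits (elementwise |x|)
//   __xor(__x, _S_signmask<_V>)  would flip the sign of every element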
49 
50 //}}}
51 // __vector_permute<Indices...>{{{
52 // Index == -1 requests zeroing of the output element
53 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
54  typename = __detail::__odr_helper>
55  constexpr _Tp
56  __vector_permute(_Tp __x)
57  {
58  static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
59  return __make_vector<typename _TVT::value_type>(
60  (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
61  }
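// For example, assuming a 4-element int vector:
//   __vector_type_t<int, 4> __v = {1, 2, 3, 4};
//   __vector_permute<3, 2, 1, 0>(__v);   // would yield {4, 3, 2, 1}
//   __vector_permute<0, -1, 2, -1>(__v); // would yield {1, 0, 3, 0}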
62 
63 // }}}
64 // __vector_shuffle<Indices...>{{{
65 // Index == -1 requests zeroing of the output element
66 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
67  typename = __detail::__odr_helper>
68  constexpr _Tp
69  __vector_shuffle(_Tp __x, _Tp __y)
70  {
71  return _Tp{(_Indices == -1 ? 0
72  : _Indices < _TVT::_S_full_size
73  ? __x[_Indices]
74  : __y[_Indices - _TVT::_S_full_size])...};
75  }
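// For example, with two 4-element int vectors __x = {1, 2, 3, 4} and
// __y = {5, 6, 7, 8}, indices below _S_full_size select from __x, the rest
// from __y:
//   __vector_shuffle<0, 4, 1, 5>(__x, __y); // would yield {1, 5, 2, 6}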
76 
77 // }}}
78 // __make_wrapper{{{
79 template <typename _Tp, typename... _Args>
80  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
81  __make_wrapper(const _Args&... __args)
82  { return __make_vector<_Tp>(__args...); }
83 
84 // }}}
85 // __wrapper_bitcast{{{
86 template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
87  size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
88  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
89  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
90  {
91  static_assert(_Np > 1);
92  return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
93  }
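// For example, __wrapper_bitcast<char>(_SimdWrapper<short, 8>{...}) would
// reinterpret the 16 bytes as a _SimdWrapper<char, 16>; _ToN can be given
// explicitly when the element count cannot be derived from the byte size
// alone.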
94 
95 // }}}
96 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{
97 template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
98  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
99  conditional_t<_Np == _Total and _Combine == 1, _Tp, _SimdWrapper<_Tp, _Np / _Total * _Combine>>
100  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
101  {
102  if constexpr (_Np == _Total and _Combine == 1)
103  return __x[_Index];
104  else if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
105  return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
106  else
107  {
108  constexpr size_t __values_per_part = _Np / _Total;
109  constexpr size_t __values_to_skip = _Index * __values_per_part;
110  constexpr size_t __return_size = __values_per_part * _Combine;
111  using _R = __vector_type_t<_Tp, __return_size>;
112  static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
113  <= sizeof(__x),
114  "out of bounds __extract_part");
 115  // the following assertion would ensure that no "padding" is read
116  // static_assert(_Total >= _Index + _Combine, "_Total must be greater
117  // than _Index");
118 
119  // static_assert(__return_size * _Total == _Np, "_Np must be divisible
120  // by _Total");
121  if (__x._M_is_constprop())
122  return __generate_from_n_evaluations<__return_size, _R>(
123  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
124  return __x[__values_to_skip + __i];
125  });
126  if constexpr (_Index == 0 && _Total == 1)
127  return __x;
128  else if constexpr (_Index == 0)
129  return __intrin_bitcast<_R>(__as_vector(__x));
130  else
131  return __vec_shuffle(__as_vector(__x), make_index_sequence<__bit_ceil(__return_size)>(),
132  [](size_t __i) {
133  return __i + __values_to_skip;
134  });
135  }
136  }
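// For example, on a _SimdWrapper<int, 8>, __extract_part<1, 2>(__x) would
// return the upper four elements as a _SimdWrapper<int, 4>, and
// __extract_part<1, 4, 2>(__x) would return elements 2..5 (_Combine merges
// that many adjacent parts into one result).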
137 
138 // }}}
139 // __extract_part(_SimdWrapper<bool, _Np>) {{{
140 template <int _Index, int _Total, int _Combine = 1, size_t _Np>
141  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
142  __extract_part(const _SimdWrapper<bool, _Np> __x)
143  {
144  static_assert(_Combine == 1, "_Combine != 1 not implemented");
145  static_assert(__have_avx512f && _Total >= 2 && _Index + _Combine <= _Total && _Index >= 0);
146  return __x._M_data >> (_Index * _Np / _Total);
147  }
148 
149 // }}}
150 
151 // __vector_convert {{{
152 // implementation requires an index sequence
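// The overloads below accept 1 to 16 input vectors and concatenate the
// converted elements, e.g. two 4-int inputs and index_sequence<0, 1, 2, 3>
// could produce one 8-element vector of the destination value_type.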
153 template <typename _To, typename _From, size_t... _I>
154  _GLIBCXX_SIMD_INTRINSIC constexpr _To
155  __vector_convert(_From __a, index_sequence<_I...>)
156  {
157  using _Tp = typename _VectorTraits<_To>::value_type;
158  return _To{static_cast<_Tp>(__a[_I])...};
159  }
160 
161 template <typename _To, typename _From, size_t... _I>
162  _GLIBCXX_SIMD_INTRINSIC constexpr _To
163  __vector_convert(_From __a, _From __b, index_sequence<_I...>)
164  {
165  using _Tp = typename _VectorTraits<_To>::value_type;
166  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
167  }
168 
169 template <typename _To, typename _From, size_t... _I>
170  _GLIBCXX_SIMD_INTRINSIC constexpr _To
171  __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
172  {
173  using _Tp = typename _VectorTraits<_To>::value_type;
174  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
175  static_cast<_Tp>(__c[_I])...};
176  }
177 
178 template <typename _To, typename _From, size_t... _I>
179  _GLIBCXX_SIMD_INTRINSIC constexpr _To
180  __vector_convert(_From __a, _From __b, _From __c, _From __d,
181  index_sequence<_I...>)
182  {
183  using _Tp = typename _VectorTraits<_To>::value_type;
184  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
185  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
186  }
187 
188 template <typename _To, typename _From, size_t... _I>
189  _GLIBCXX_SIMD_INTRINSIC constexpr _To
190  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
191  index_sequence<_I...>)
192  {
193  using _Tp = typename _VectorTraits<_To>::value_type;
194  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
195  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
196  static_cast<_Tp>(__e[_I])...};
197  }
198 
199 template <typename _To, typename _From, size_t... _I>
200  _GLIBCXX_SIMD_INTRINSIC constexpr _To
201  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
202  _From __f, index_sequence<_I...>)
203  {
204  using _Tp = typename _VectorTraits<_To>::value_type;
205  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
206  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
207  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
208  }
209 
210 template <typename _To, typename _From, size_t... _I>
211  _GLIBCXX_SIMD_INTRINSIC constexpr _To
212  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
213  _From __f, _From __g, index_sequence<_I...>)
214  {
215  using _Tp = typename _VectorTraits<_To>::value_type;
216  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
217  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
218  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
219  static_cast<_Tp>(__g[_I])...};
220  }
221 
222 template <typename _To, typename _From, size_t... _I>
223  _GLIBCXX_SIMD_INTRINSIC constexpr _To
224  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
225  _From __f, _From __g, _From __h, index_sequence<_I...>)
226  {
227  using _Tp = typename _VectorTraits<_To>::value_type;
228  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
229  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
230  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
231  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
232  }
233 
234 template <typename _To, typename _From, size_t... _I>
235  _GLIBCXX_SIMD_INTRINSIC constexpr _To
236  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
237  _From __f, _From __g, _From __h, _From __i,
238  index_sequence<_I...>)
239  {
240  using _Tp = typename _VectorTraits<_To>::value_type;
241  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
242  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
243  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
244  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
245  static_cast<_Tp>(__i[_I])...};
246  }
247 
248 template <typename _To, typename _From, size_t... _I>
249  _GLIBCXX_SIMD_INTRINSIC constexpr _To
250  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
251  _From __f, _From __g, _From __h, _From __i, _From __j,
252  index_sequence<_I...>)
253  {
254  using _Tp = typename _VectorTraits<_To>::value_type;
255  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
256  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
257  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
258  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
259  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
260  }
261 
262 template <typename _To, typename _From, size_t... _I>
263  _GLIBCXX_SIMD_INTRINSIC constexpr _To
264  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
265  _From __f, _From __g, _From __h, _From __i, _From __j,
266  _From __k, index_sequence<_I...>)
267  {
268  using _Tp = typename _VectorTraits<_To>::value_type;
269  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
270  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
271  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
272  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
273  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
274  static_cast<_Tp>(__k[_I])...};
275  }
276 
277 template <typename _To, typename _From, size_t... _I>
278  _GLIBCXX_SIMD_INTRINSIC constexpr _To
279  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
280  _From __f, _From __g, _From __h, _From __i, _From __j,
281  _From __k, _From __l, index_sequence<_I...>)
282  {
283  using _Tp = typename _VectorTraits<_To>::value_type;
284  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
285  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
286  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
287  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
288  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
289  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
290  }
291 
292 template <typename _To, typename _From, size_t... _I>
293  _GLIBCXX_SIMD_INTRINSIC constexpr _To
294  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
295  _From __f, _From __g, _From __h, _From __i, _From __j,
296  _From __k, _From __l, _From __m, index_sequence<_I...>)
297  {
298  using _Tp = typename _VectorTraits<_To>::value_type;
299  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
300  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
301  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
302  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
303  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
304  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
305  static_cast<_Tp>(__m[_I])...};
306  }
307 
308 template <typename _To, typename _From, size_t... _I>
309  _GLIBCXX_SIMD_INTRINSIC constexpr _To
310  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
311  _From __f, _From __g, _From __h, _From __i, _From __j,
312  _From __k, _From __l, _From __m, _From __n,
313  index_sequence<_I...>)
314  {
315  using _Tp = typename _VectorTraits<_To>::value_type;
316  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
317  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
318  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
319  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
320  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
321  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
322  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
323  }
324 
325 template <typename _To, typename _From, size_t... _I>
326  _GLIBCXX_SIMD_INTRINSIC constexpr _To
327  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
328  _From __f, _From __g, _From __h, _From __i, _From __j,
329  _From __k, _From __l, _From __m, _From __n, _From __o,
330  index_sequence<_I...>)
331  {
332  using _Tp = typename _VectorTraits<_To>::value_type;
333  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
334  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
335  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
336  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
337  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
338  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
339  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
340  static_cast<_Tp>(__o[_I])...};
341  }
342 
343 template <typename _To, typename _From, size_t... _I>
344  _GLIBCXX_SIMD_INTRINSIC constexpr _To
345  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
346  _From __f, _From __g, _From __h, _From __i, _From __j,
347  _From __k, _From __l, _From __m, _From __n, _From __o,
348  _From __p, index_sequence<_I...>)
349  {
350  using _Tp = typename _VectorTraits<_To>::value_type;
351  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
352  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
353  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
354  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
355  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
356  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
357  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
358  static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
359  }
360 
361 // Defer actual conversion to the overload that takes an index sequence. Note
362 // that this function adds zeros or drops values off the end if you don't ensure
363 // matching width.
364 template <typename _To, typename... _From, size_t _FromSize>
365  _GLIBCXX_SIMD_INTRINSIC constexpr _To
366  __vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
367  {
368 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
369  using _From0 = __first_of_pack_t<_From...>;
370  using _FW = _SimdWrapper<_From0, _FromSize>;
371  if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
372  {
373  if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1))
374  == 0) // power-of-two number of arguments
375  return __convert_x86<_To>(__as_vector(__xs)...);
376  else // append zeros and recurse until the above branch is taken
377  return __vector_convert<_To>(__xs..., _FW{});
378  }
379  else
380 #endif
381  return __vector_convert<_To>(
382  __as_vector(__xs)...,
383  make_index_sequence<(sizeof...(__xs) == 1 ? std::min(
384  _VectorTraits<_To>::_S_full_size, int(_FromSize))
385  : _FromSize)>());
386  }
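// For example (single input): converting a _SimdWrapper<int, 2> to an
// 8-element short vector would yield {short(__x[0]), short(__x[1]), 0, ...},
// while converting a _SimdWrapper<int, 8> to a 4-element short vector would
// drop the upper four values, as noted above.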
387 
388 // }}}
389 // __convert function{{{
390 template <typename _To, typename _From, typename... _More>
391  _GLIBCXX_SIMD_INTRINSIC constexpr auto
392  __convert(_From __v0, _More... __vs)
393  {
394  static_assert((true && ... && is_same_v<_From, _More>) );
395  if constexpr (__is_vectorizable_v<_From>)
396  {
397  using _V = typename _VectorTraits<_To>::type;
398  using _Tp = typename _VectorTraits<_To>::value_type;
399  return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
400  }
401  else if constexpr (__is_vector_type_v<_From>)
402  return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
403  else // _SimdWrapper arguments
404  {
405  constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More));
406  if constexpr (__is_vectorizable_v<_To>)
407  return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
408  else if constexpr (!__is_vector_type_v<_To>)
409  return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
410  else
411  {
412  static_assert(
413  sizeof...(_More) == 0
414  || _VectorTraits<_To>::_S_full_size >= __input_size,
415  "__convert(...) requires the input to fit into the output");
416  return __vector_convert<_To>(__v0, __vs...);
417  }
418  }
419  }
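// For example, __convert<__vector_type_t<float, 4>>(_SimdWrapper<int, 4>(...))
// would go through the last branch and call __vector_convert, whereas passing
// four scalar ints with _To = __vector_type_t<float, 4> would hit the first
// branch and build the vector from the converted scalars.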
420 
421 // }}}
422 // __convert_all{{{
423 // Converts __v into array<_To, N>, where N is _NParts if non-zero, or
424 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
425 // Note: this function may return fewer than all converted elements
426 template <typename _To,
427  size_t _NParts = 0, // allows converting fewer or more than all (only the
428  // last _To may be partially filled)
429  size_t _Offset = 0, // where to start, # of elements (not Bytes or
430  // Parts)
431  typename _From, typename _FromVT = _VectorTraits<_From>>
432  _GLIBCXX_SIMD_INTRINSIC auto
433  __convert_all(_From __v)
434  {
435  if constexpr (is_arithmetic_v<_To> && _NParts != 1)
436  {
437  static_assert(_Offset < _FromVT::_S_full_size);
438  constexpr auto _Np
439  = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
440  return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
441  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
442  return static_cast<_To>(__v[__i + _Offset]);
443  });
444  }
445  else
446  {
447  static_assert(__is_vector_type_v<_To>);
448  using _ToVT = _VectorTraits<_To>;
449  if constexpr (__is_vector_type_v<_From>)
450  return __convert_all<_To, _NParts>(__as_wrapper(__v));
451  else if constexpr (_NParts == 1)
452  {
453  static_assert(_Offset % _ToVT::_S_full_size == 0);
454  return array<_To, 1>{__vector_convert<_To>(
455  __extract_part<_Offset / _ToVT::_S_full_size,
456  __div_roundup(_FromVT::_S_partial_width,
457  _ToVT::_S_full_size)>(__v))};
458  }
459 #if _GLIBCXX_SIMD_X86INTRIN // {{{
460  else if constexpr (!__have_sse4_1 && _Offset == 0
461  && is_integral_v<typename _FromVT::value_type>
462  && sizeof(typename _FromVT::value_type)
463  < sizeof(typename _ToVT::value_type)
464  && !(sizeof(typename _FromVT::value_type) == 4
465  && is_same_v<typename _ToVT::value_type, double>))
466  {
467  using _ToT = typename _ToVT::value_type;
468  using _FromT = typename _FromVT::value_type;
469  constexpr size_t _Np
470  = _NParts != 0
471  ? _NParts
472  : (_FromVT::_S_partial_width / _ToVT::_S_full_size);
473  using _R = array<_To, _Np>;
474  // __adjust modifies its input to have _Np entries (given as a
475  // _SizeConstant) so that no unnecessary intermediate conversions are
476  // requested and, more importantly, no intermediate conversions are
477  // missing
478  [[maybe_unused]] auto __adjust
479  = [](auto __n,
480  auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
481  return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
482  };
483  [[maybe_unused]] const auto __vi = __to_intrin(__v);
484  auto&& __make_array
485  = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
486  if constexpr (_Np == 1)
487  return _R{__intrin_bitcast<_To>(__x0)};
488  else
489  return _R{__intrin_bitcast<_To>(__x0),
490  __intrin_bitcast<_To>(__x1)};
491  };
492 
493  if constexpr (_Np == 0)
494  return _R{};
495  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2)
496  {
497  static_assert(is_integral_v<_FromT>);
498  static_assert(is_integral_v<_ToT>);
499  if constexpr (is_unsigned_v<_FromT>)
500  return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
501  _mm_unpackhi_epi8(__vi, __m128i()));
502  else
503  return __make_array(
504  _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8),
505  _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8));
506  }
507  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4)
508  {
509  static_assert(is_integral_v<_FromT>);
510  if constexpr (is_floating_point_v<_ToT>)
511  {
512  const auto __ints
513  = __convert_all<__vector_type16_t<int>, _Np>(
514  __adjust(_SizeConstant<_Np * 4>(), __v));
515  return __generate_from_n_evaluations<_Np, _R>(
516  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
517  return __vector_convert<_To>(__as_wrapper(__ints[__i]));
518  });
519  }
520  else if constexpr (is_unsigned_v<_FromT>)
521  return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
522  _mm_unpackhi_epi16(__vi, __m128i()));
523  else
524  return __make_array(
525  _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16),
526  _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16));
527  }
528  else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
529  && is_integral_v<_FromT> && is_integral_v<_ToT>)
530  {
531  if constexpr (is_unsigned_v<_FromT>)
532  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
533  _mm_unpackhi_epi32(__vi, __m128i()));
534  else
535  return __make_array(
536  _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
537  _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
538  }
550  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4
551  && is_signed_v<_FromT>)
552  {
553  const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi),
554  _mm_unpackhi_epi8(__vi, __vi)};
555  const __vector_type_t<int, 4> __vvvv[4] = {
556  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
557  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
558  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
559  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
560  if constexpr (sizeof(_ToT) == 4)
561  return __generate_from_n_evaluations<_Np, _R>(
562  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
563  return __vector_convert<_To>(
564  _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
565  });
566  else if constexpr (is_integral_v<_ToT>)
567  return __generate_from_n_evaluations<_Np, _R>(
568  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
569  const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
570  const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
571  return __vector_bitcast<_ToT>(
572  __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
573  : _mm_unpackhi_epi32(__sx32, __signbits));
574  });
575  else
576  return __generate_from_n_evaluations<_Np, _R>(
577  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
578  const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
579  return __vector_convert<_To>(
580  __i % 2 == 0 ? __int4
581  : _SimdWrapper<int, 4>(
582  _mm_unpackhi_epi64(__to_intrin(__int4),
583  __to_intrin(__int4))));
584  });
585  }
586  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
587  {
588  const auto __shorts = __convert_all<__vector_type16_t<
589  conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
590  __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
591  return __generate_from_n_evaluations<_Np, _R>(
592  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
593  return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
594  });
595  }
596  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
597  && is_signed_v<_FromT> && is_integral_v<_ToT>)
598  {
599  const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi),
600  _mm_unpackhi_epi16(__vi, __vi)};
601  const __vector_type16_t<int> __vvvv[4]
602  = {__vector_bitcast<int>(
603  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
604  _mm_srai_epi32(__vv[0], 31))),
605  __vector_bitcast<int>(
606  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
607  _mm_srai_epi32(__vv[0], 31))),
608  __vector_bitcast<int>(
609  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
610  _mm_srai_epi32(__vv[1], 31))),
611  __vector_bitcast<int>(
612  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
613  _mm_srai_epi32(__vv[1], 31)))};
614  return __generate_from_n_evaluations<_Np, _R>(
615  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
616  return __vector_bitcast<_ToT>(__vvvv[__i]);
617  });
618  }
619  else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
620  {
621  const auto __ints
622  = __convert_all<__vector_type16_t<conditional_t<
623  is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
624  unsigned int>>>(
625  __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
626  return __generate_from_n_evaluations<_Np, _R>(
627  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
628  return __convert_all<_To>(__ints[__i / 2])[__i % 2];
629  });
630  }
631  else
632  __assert_unreachable<_To>();
633  }
634 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
635  else if constexpr ((_FromVT::_S_partial_width - _Offset)
636  > _ToVT::_S_full_size)
637  {
638  /*
639  static_assert(
640  (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) ==
641  0,
642  "__convert_all only supports power-of-2 number of elements.
643  Otherwise " "the return type cannot be array<_To, N>.");
644  */
645  constexpr size_t _NTotal
646  = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
647  constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
648  static_assert(
649  _Np <= _NTotal
650  || (_Np == _NTotal + 1
651  && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
652  > 0));
653  using _R = array<_To, _Np>;
654  if constexpr (_Np == 1)
655  return _R{__vector_convert<_To>(
656  __extract_part<_Offset, _FromVT::_S_partial_width,
657  _ToVT::_S_full_size>(__v))};
658  else
659  return __generate_from_n_evaluations<_Np, _R>(
660  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
661  auto __part
662  = __extract_part<__i * _ToVT::_S_full_size + _Offset,
663  _FromVT::_S_partial_width,
664  _ToVT::_S_full_size>(__v);
665  return __vector_convert<_To>(__part);
666  });
667  }
668  else if constexpr (_Offset == 0)
669  return array<_To, 1>{__vector_convert<_To>(__v)};
670  else
671  return array<_To, 1>{__vector_convert<_To>(
672  __extract_part<_Offset, _FromVT::_S_partial_width,
673  _FromVT::_S_partial_width - _Offset>(__v))};
674  }
675  }
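// For example, with _To = __vector_type_t<int, 4> and __v a 16-element char
// vector, __convert_all<_To>(__v) would return an array<_To, 4> holding
// elements 0-3, 4-7, 8-11, and 12-15, each converted to int.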
676 
677 // }}}
678 
679 // _GnuTraits {{{
680 template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
681  struct _GnuTraits
682  {
683  using _IsValid = true_type;
684  using _SimdImpl = typename _Abi::_SimdImpl;
685  using _MaskImpl = typename _Abi::_MaskImpl;
686 
687  // simd and simd_mask member types {{{
688  using _SimdMember = _SimdWrapper<_Tp, _Np>;
689  using _MaskMember = _SimdWrapper<_Mp, _Np>;
690  static constexpr size_t _S_simd_align = alignof(_SimdMember);
691  static constexpr size_t _S_mask_align = alignof(_MaskMember);
692 
693  // }}}
694  // size metadata {{{
695  static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
696  static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;
697 
698  // }}}
699  // _SimdBase / base class for simd, providing extra conversions {{{
700  struct _SimdBase2
701  {
702  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
703  operator __intrinsic_type_t<_Tp, _Np>() const
704  { return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data); }
705 
706  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
707  operator __vector_type_t<_Tp, _Np>() const
708  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
709  };
710 
711  struct _SimdBase1
712  {
713  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
714  operator __intrinsic_type_t<_Tp, _Np>() const
715  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
716  };
717 
718  using _SimdBase = conditional_t<
719  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
720  _SimdBase1, _SimdBase2>;
721 
722  // }}}
723  // _MaskBase {{{
724  struct _MaskBase2
725  {
726  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
727  operator __intrinsic_type_t<_Tp, _Np>() const
728  { return static_cast<const simd_mask<_Tp, _Abi>*>(this) ->_M_data.__intrin(); }
729 
730  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
731  operator __vector_type_t<_Tp, _Np>() const
732  { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data; }
733  };
734 
735  struct _MaskBase1
736  {
737  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
738  operator __intrinsic_type_t<_Tp, _Np>() const
739  { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); }
740  };
741 
742  using _MaskBase = conditional_t<
743  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
744  _MaskBase1, _MaskBase2>;
745 
746  // }}}
747  // _MaskCastType {{{
748  // parameter type of one explicit simd_mask constructor
749  class _MaskCastType
750  {
751  using _Up = __intrinsic_type_t<_Tp, _Np>;
752  _Up _M_data;
753 
754  public:
755  _GLIBCXX_SIMD_ALWAYS_INLINE
756  _MaskCastType(_Up __x) : _M_data(__x) {}
757 
758  _GLIBCXX_SIMD_ALWAYS_INLINE
759  operator _MaskMember() const { return _M_data; }
760  };
761 
762  // }}}
763  // _SimdCastType {{{
764  // parameter type of one explicit simd constructor
765  class _SimdCastType1
766  {
767  using _Ap = __intrinsic_type_t<_Tp, _Np>;
768  _SimdMember _M_data;
769 
770  public:
771  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
772  _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
773 
774  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
775  operator _SimdMember() const { return _M_data; }
776  };
777 
778  class _SimdCastType2
779  {
780  using _Ap = __intrinsic_type_t<_Tp, _Np>;
781  using _Bp = __vector_type_t<_Tp, _Np>;
782  _SimdMember _M_data;
783 
784  public:
785  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
786  _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
787 
788  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
789  _SimdCastType2(_Bp __b) : _M_data(__b) {}
790 
791  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
792  operator _SimdMember() const { return _M_data; }
793  };
794 
795  using _SimdCastType = conditional_t<
796  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
797  _SimdCastType1, _SimdCastType2>;
798  //}}}
799  };
800 
801 // }}}
802 struct _CommonImplX86;
803 struct _CommonImplNeon;
804 struct _CommonImplBuiltin;
805 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplBuiltin;
806 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplBuiltin;
807 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplX86;
808 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplX86;
809 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplNeon;
810 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplNeon;
811 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplPpc;
812 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplPpc;
813 
814 // simd_abi::_VecBuiltin {{{
815 template <int _UsedBytes>
816  struct simd_abi::_VecBuiltin
817  {
818  template <typename _Tp>
819  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
820 
821  // validity traits {{{
822  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
823 
824  template <typename _Tp>
825  struct _IsValidSizeFor
826  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
827  && _UsedBytes % sizeof(_Tp) == 0
828  && _UsedBytes <= __vectorized_sizeof<_Tp>()
829  && (!__have_avx512f || _UsedBytes <= 32))> {};
830 
831  template <typename _Tp>
832  struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
833  _IsValidSizeFor<_Tp>> {};
834 
835  template <typename _Tp>
836  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
837 
838  // }}}
839  // _SimdImpl/_MaskImpl {{{
840 #if _GLIBCXX_SIMD_X86INTRIN
841  using _CommonImpl = _CommonImplX86;
842  using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
843  using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
844 #elif _GLIBCXX_SIMD_HAVE_NEON
845  using _CommonImpl = _CommonImplNeon;
846  using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
847  using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
848 #else
849  using _CommonImpl = _CommonImplBuiltin;
850 #ifdef __ALTIVEC__
851  using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
852  using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
853 #else
854  using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
855  using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
856 #endif
857 #endif
858 
859  // }}}
860  // __traits {{{
861  template <typename _Tp>
862  using _MaskValueType = __int_for_sizeof_t<_Tp>;
863 
864  template <typename _Tp>
865  using __traits
866  = conditional_t<_S_is_valid_v<_Tp>,
867  _GnuTraits<_Tp, _MaskValueType<_Tp>,
868  _VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
869  _InvalidTraits>;
870 
871  //}}}
872  // size metadata {{{
873  template <typename _Tp>
874  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
875 
876  template <typename _Tp>
877  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
878 
879  // }}}
880  // implicit masks {{{
881  template <typename _Tp>
882  using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;
883 
884  template <typename _Tp>
885  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
886  _S_implicit_mask()
887  {
888  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
889  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
890  return ~_UV();
891  else
892  {
893  constexpr auto __size = _S_size<_Tp>;
894  _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
895  = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
896  { return __i < __size ? -1 : 0; });
897  return __r;
898  }
899  }
900 
901  template <typename _Tp>
902  _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>>
903  _S_implicit_mask_intrin()
904  { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); }
905 
906  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
907  _GLIBCXX_SIMD_INTRINSIC static constexpr _TW
908  _S_masked(_TW __x)
909  {
910  using _Tp = typename _TVT::value_type;
911  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
912  return __x;
913  else
914  return __and(__as_vector(__x),
915  __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
916  }
917 
918  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
919  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
920  __make_padding_nonzero(_TW __x)
921  {
922  using _Tp = typename _TVT::value_type;
923  if constexpr (!_S_is_partial<_Tp>)
924  return __x;
925  else
926  {
927  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
928  = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
929  if constexpr (is_integral_v<_Tp>)
930  return __or(__x, ~__implicit_mask);
931  else
932  {
933  _GLIBCXX_SIMD_USE_CONSTEXPR auto __one
934  = __andnot(__implicit_mask,
935  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1)));
936  // it's not enough to return `x | 1_in_padding` because the
937  // padding in x might be inf or nan (independent of
938  // __FINITE_MATH_ONLY__, because it's about padding bits)
939  return __or(__and(__x, __implicit_mask), __one);
940  }
941  }
942  }
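  // E.g. for a partial register holding 3 floats in a 4-element vector, this
  // would return {x0, x1, x2, 1.f}; for integral elements the padding becomes
  // all-ones instead. Callers such as _S_divides rely on this to keep
  // division by the (zero-initialized) padding from being UB.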
943  // }}}
944  };
945 
946 // }}}
947 // simd_abi::_VecBltnBtmsk {{{
948 template <int _UsedBytes>
949  struct simd_abi::_VecBltnBtmsk
950  {
951  template <typename _Tp>
952  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
953 
954  // validity traits {{{
955  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
956 
957  template <typename _Tp>
958  struct _IsValidSizeFor
959  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
960  && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64
961  && (_UsedBytes > 32 || __have_avx512vl))> {};
962 
 963  // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is also
 964  // required.
965  template <typename _Tp>
966  struct _IsValid
967  : conjunction<
968  _IsValidAbiTag, __bool_constant<__have_avx512f>,
969  __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>,
970  __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
971  _IsValidSizeFor<_Tp>> {};
972 
973  template <typename _Tp>
974  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
975 
976  // }}}
977  // simd/_MaskImpl {{{
978  #if _GLIBCXX_SIMD_X86INTRIN
979  using _CommonImpl = _CommonImplX86;
980  using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
981  using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
982  #else
983  template <int>
984  struct _MissingImpl;
985 
986  using _CommonImpl = _MissingImpl<_UsedBytes>;
987  using _SimdImpl = _MissingImpl<_UsedBytes>;
988  using _MaskImpl = _MissingImpl<_UsedBytes>;
989  #endif
990 
991  // }}}
992  // __traits {{{
993  template <typename _Tp>
994  using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;
995 
996  template <typename _Tp>
997  using __traits = conditional_t<
998  _S_is_valid_v<_Tp>,
999  _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
1000  _InvalidTraits>;
1001 
1002  //}}}
1003  // size metadata {{{
1004  template <typename _Tp>
1005  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1006  template <typename _Tp>
1007  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1008 
1009  // }}}
1010  // implicit mask {{{
1011  private:
1012  template <typename _Tp>
1013  using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;
1014 
1015  public:
1016  template <size_t _Np>
1017  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
1018  __implicit_mask_n()
1019  {
1020  using _Tp = __bool_storage_member_type_t<_Np>;
1021  return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp();
1022  }
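  // E.g. __implicit_mask_n<3>() would yield a bitmask with the low three bits
  // set (0b111), whereas for _Np matching the full width of the storage type
  // all bits are set.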
1023 
1024  template <typename _Tp>
1025  _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
1026  _S_implicit_mask()
1027  { return __implicit_mask_n<_S_size<_Tp>>(); }
1028 
1029  template <typename _Tp>
1030  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>>
1031  _S_implicit_mask_intrin()
1032  { return __implicit_mask_n<_S_size<_Tp>>(); }
1033 
1034  template <typename _Tp, size_t _Np>
1035  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1036  _S_masked(_SimdWrapper<_Tp, _Np> __x)
1037  {
1038  if constexpr (is_same_v<_Tp, bool>)
1039  if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0)
1040  return _MaskImpl::_S_bit_and(
1041  __x, _SimdWrapper<_Tp, _Np>(
1042  __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1)));
1043  else
1044  return __x;
1045  else
1046  return _S_masked(__x._M_data);
1047  }
1048 
1049  template <typename _TV>
1050  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
1051  _S_masked(_TV __x)
1052  {
1053  using _Tp = typename _VectorTraits<_TV>::value_type;
1054  static_assert(
1055  !__is_bitmask_v<_TV>,
1056  "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
1057  "know the number of elements. Use _SimdWrapper<bool, N> instead.");
1058  if constexpr (_S_is_partial<_Tp>)
1059  {
1060  constexpr size_t _Np = _S_size<_Tp>;
1061  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1062  _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
1063  _SimdWrapper<_Tp, _Np>(__x));
1064  }
1065  else
1066  return __x;
1067  }
1068 
1069  template <typename _TV, typename _TVT = _VectorTraits<_TV>>
1070  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1071  __make_padding_nonzero(_TV __x)
1072  {
1073  using _Tp = typename _TVT::value_type;
1074  if constexpr (!_S_is_partial<_Tp>)
1075  return __x;
1076  else
1077  {
1078  constexpr size_t _Np = _S_size<_Tp>;
1079  if constexpr (is_integral_v<typename _TVT::value_type>)
1080  return __x
1081  | __generate_vector<_Tp, _S_full_size<_Tp>>(
1082  [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
1083  if (__i < _Np)
1084  return 0;
1085  else
1086  return 1;
1087  });
1088  else
1089  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1090  _S_implicit_mask<_Tp>(),
1091  _SimdWrapper<_Tp, _Np>(
1092  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))),
1093  _SimdWrapper<_Tp, _Np>(__x))
1094  ._M_data;
1095  }
1096  }
1097 
1098  // }}}
1099  };
1100 
1101 //}}}
1102 // _CommonImplBuiltin {{{
1103 struct _CommonImplBuiltin
1104 {
1105  // _S_converts_via_decomposition{{{
1106  // This lists all cases where a __vector_convert needs to fall back to
1107  // conversion of individual scalars (i.e. decompose the input vector into
1108  // scalars, convert, compose output vector). In those cases, _S_masked_load &
1109  // _S_masked_store prefer to use the _S_bit_iteration implementation.
1110  template <typename _From, typename _To, size_t _ToSize>
1111  static inline constexpr bool __converts_via_decomposition_v
1112  = sizeof(_From) != sizeof(_To);
1113 
1114  // }}}
1115  // _S_load{{{
1116  template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
1117  _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
1118  _S_load(const void* __p)
1119  {
1120  static_assert(_Np > 1);
1121  static_assert(_Bytes % sizeof(_Tp) == 0);
1122  using _Rp = __vector_type_t<_Tp, _Np>;
1123  if constexpr (sizeof(_Rp) == _Bytes)
1124  {
1125  _Rp __r;
1126  __builtin_memcpy(&__r, __p, _Bytes);
1127  return __r;
1128  }
1129  else
1130  {
1131 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
1132  using _Up = conditional_t<
1133  is_integral_v<_Tp>,
1134  conditional_t<_Bytes % 4 == 0,
1135  conditional_t<_Bytes % 8 == 0, long long, int>,
1136  conditional_t<_Bytes % 2 == 0, short, signed char>>,
1137  conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp,
1138  double>>;
1139  using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
1140  if constexpr (sizeof(_V) != sizeof(_Rp))
1141  { // on i386 with 4 < _Bytes <= 8
1142  _Rp __r{};
1143  __builtin_memcpy(&__r, __p, _Bytes);
1144  return __r;
1145  }
1146  else
1147 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424
1148  using _V = _Rp;
1149 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1150  {
1151  _V __r{};
1152  static_assert(_Bytes <= sizeof(_V));
1153  __builtin_memcpy(&__r, __p, _Bytes);
1154  return reinterpret_cast<_Rp>(__r);
1155  }
1156  }
1157  }
1158 
1159  // }}}
1160  // _S_store {{{
1161  template <size_t _Bytes>
1162  _GLIBCXX_SIMD_INTRINSIC static void
1163  _S_memcpy(char* __dst, const char* __src)
1164  {
1165  if constexpr (_Bytes > 0)
1166  {
1167  constexpr size_t _Ns = std::__bit_floor(_Bytes);
1168  __builtin_memcpy(__dst, __src, _Ns);
1169  _S_memcpy<_Bytes - _Ns>(__dst + _Ns, __src + _Ns);
1170  }
1171  }
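  // Decomposes the copy into power-of-two chunks, e.g. _Bytes == 7 would issue
  // __builtin_memcpy calls of 4, 2, and 1 bytes, so every copy has a
  // power-of-two size.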
1172 
1173  template <size_t _ReqBytes = 0, typename _TV>
1174  _GLIBCXX_SIMD_INTRINSIC static void
1175  _S_store(_TV __x, void* __addr)
1176  {
1177  constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes;
1178  static_assert(sizeof(__x) >= _Bytes);
1179 
1180 #if !defined __clang__ && _GLIBCXX_SIMD_WORKAROUND_PR90424
1181  if constexpr (__is_vector_type_v<_TV>)
1182  _S_memcpy<_Bytes>(reinterpret_cast<char*>(__addr), reinterpret_cast<const char*>(&__x));
1183  else
1184 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1185  __builtin_memcpy(__addr, &__x, _Bytes);
1186  }
1187 
1188  template <typename _Tp, size_t _Np>
1189  _GLIBCXX_SIMD_INTRINSIC static void
1190  _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
1191  { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }
1192 
1193  // }}}
1194  // _S_store_bool_array(_BitMask) {{{
1195  template <size_t _Np, bool _Sanitized>
1196  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1197  _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
1198  {
1199  if constexpr (_Np == 1)
1200  __mem[0] = __x[0];
1201  else if (__builtin_is_constant_evaluated())
1202  {
1203  for (size_t __i = 0; __i < _Np; ++__i)
1204  __mem[__i] = __x[__i];
1205  }
1206  else if constexpr (_Np == 2)
1207  {
1208  short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101;
1209  _S_store<_Np>(__bool2, __mem);
1210  }
1211  else if constexpr (_Np == 3)
1212  {
1213  int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101;
1214  _S_store<_Np>(__bool3, __mem);
1215  }
1216  else
1217  {
1218  __execute_n_times<__div_roundup(_Np, 4)>(
1219  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1220  constexpr int __offset = __i * 4;
1221  constexpr int __remaining = _Np - __offset;
1222  if constexpr (__remaining > 4 && __remaining <= 7)
1223  {
1224  const _ULLong __bool7
1225  = (__x.template _M_extract<__offset>()._M_to_bits()
1226  * 0x40810204081ULL)
1227  & 0x0101010101010101ULL;
1228  _S_store<__remaining>(__bool7, __mem + __offset);
1229  }
1230  else if constexpr (__remaining >= 4)
1231  {
1232  int __bits = __x.template _M_extract<__offset>()._M_to_bits();
1233  if constexpr (__remaining > 7)
1234  __bits &= 0xf;
1235  const int __bool4 = (__bits * 0x204081) & 0x01010101;
1236  _S_store<4>(__bool4, __mem + __offset);
1237  }
1238  });
1239  }
1240  }
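  // The multiply-and-mask trick above spreads each mask bit into its own byte:
  // e.g. for four bits, (__bits * 0x204081) places a copy of bit k at bit
  // position 8 * k, and & 0x01010101 leaves each byte as 0 or 1, ready to be
  // stored as bools.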
1241 
1242  // }}}
1243  // _S_blend{{{
1244  template <typename _Tp, size_t _Np>
1245  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1246  _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
1247  _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
1248  { return __k._M_data ? __at1._M_data : __at0._M_data; }
1249 
1250  // }}}
1251 };
1252 
1253 // }}}
1254 // _SimdImplBuiltin {{{1
1255 template <typename _Abi, typename>
1256  struct _SimdImplBuiltin
1257  {
1258  // member types {{{2
1259  template <typename _Tp>
1260  static constexpr size_t _S_max_store_size = 16;
1261 
1262  using abi_type = _Abi;
1263 
1264  template <typename _Tp>
1265  using _TypeTag = _Tp*;
1266 
1267  template <typename _Tp>
1268  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
1269 
1270  template <typename _Tp>
1271  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
1272 
1273  template <typename _Tp>
1274  static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
1275 
1276  template <typename _Tp>
1277  static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
1278 
1279  using _CommonImpl = typename _Abi::_CommonImpl;
1280  using _SuperImpl = typename _Abi::_SimdImpl;
1281  using _MaskImpl = typename _Abi::_MaskImpl;
1282 
1283  // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
1284  template <typename _Tp, size_t _Np>
1285  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1286  _M_make_simd(_SimdWrapper<_Tp, _Np> __x)
1287  { return {__private_init, __x}; }
1288 
1289  template <typename _Tp, size_t _Np>
1290  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1291  _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
1292  { return {__private_init, __vector_bitcast<_Tp>(__x)}; }
1293 
1294  // _S_broadcast {{{2
1295  template <typename _Tp>
1296  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1297  _S_broadcast(_Tp __x) noexcept
1298  { return __vector_broadcast<_S_full_size<_Tp>>(__x); }
1299 
1300  // _S_generator {{{2
1301  template <typename _Fp, typename _Tp>
1302  inline static constexpr _SimdMember<_Tp>
1303  _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
1304  {
1305  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1306  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1307  if constexpr (__i < _S_size<_Tp>)
1308  return __gen(__i);
1309  else
1310  return 0;
1311  });
1312  }
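  // For example, _S_generator([](auto __i) { return _Tp(__i); },
  // _TypeTag<_Tp>()) would produce {0, 1, 2, ...} in the first _S_size<_Tp>
  // elements and zeros in any remaining padding elements.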
1313 
1314  // _S_load {{{2
1315  template <typename _Tp, typename _Up>
1316  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1317  _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
1318  {
1319  constexpr size_t _Np = _S_size<_Tp>;
1320  constexpr size_t __max_load_size
1321  = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64
1322  : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32
1323  : 16;
1324  constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
1325  if (__builtin_is_constant_evaluated())
1326  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1327  [&](auto __i) constexpr {
1328  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1329  });
1330  else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1331  return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
1332  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1333  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1334  });
1335  else if constexpr (is_same_v<_Up, _Tp>)
1336  return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
1337  _Np * sizeof(_Tp)>(__mem);
1338  else if constexpr (__bytes_to_load <= __max_load_size)
1339  return __convert<_SimdMember<_Tp>>(
1340  _CommonImpl::template _S_load<_Up, _Np>(__mem));
1341  else if constexpr (__bytes_to_load % __max_load_size == 0)
1342  {
1343  constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
1344  constexpr size_t __elements_per_load = _Np / __n_loads;
1345  return __call_with_n_evaluations<__n_loads>(
1346  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1347  return __convert<_SimdMember<_Tp>>(__uncvted...);
1348  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1349  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1350  __mem + __i * __elements_per_load);
1351  });
1352  }
1353  else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
1354  && __max_load_size > 16)
1355  { // e.g. int[] -> <char, 12> with AVX2
1356  constexpr size_t __n_loads
1357  = __bytes_to_load / (__max_load_size / 2);
1358  constexpr size_t __elements_per_load = _Np / __n_loads;
1359  return __call_with_n_evaluations<__n_loads>(
1360  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1361  return __convert<_SimdMember<_Tp>>(__uncvted...);
1362  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1363  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1364  __mem + __i * __elements_per_load);
1365  });
1366  }
1367  else // e.g. int[] -> <char, 9>
1368  return __call_with_subscripts(
1369  __mem, make_index_sequence<_Np>(),
1370  [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1371  return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
1372  });
1373  }
1374 
1375  // _S_masked_load {{{2
1376  template <typename _Tp, size_t _Np, typename _Up>
1377  static constexpr inline _SimdWrapper<_Tp, _Np>
1378  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
1379  const _Up* __mem) noexcept
1380  {
1381  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1382  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1383  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1384  });
1385  return __merge;
1386  }
1387 
1388  // _S_store {{{2
1389  template <typename _Tp, typename _Up>
1390  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1391  _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
1392  {
1393  // TODO: converting int -> "smaller int" can be optimized with AVX512
1394  constexpr size_t _Np = _S_size<_Tp>;
1395  constexpr size_t __max_store_size
1396  = _SuperImpl::template _S_max_store_size<_Up>;
1397  if (__builtin_is_constant_evaluated())
1398  {
1399  for (size_t __i = 0; __i < _Np; ++__i)
1400  __mem[__i] = __v[__i];
1401  }
1402  else if constexpr (sizeof(_Up) > 8 or __vectorized_sizeof<_Up>() <= sizeof(_Up))
1403  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1404  __mem[__i] = __v[__i];
1405  });
1406  else if constexpr (is_same_v<_Up, _Tp>)
1407  _CommonImpl::_S_store(__v, __mem);
1408  else if constexpr (sizeof(_Up) * _Np <= __max_store_size)
1409  _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)),
1410  __mem);
1411  else
1412  {
1413  constexpr size_t __vsize = __max_store_size / sizeof(_Up);
1414  // round up to convert the last partial vector as well:
1415  constexpr size_t __stores = __div_roundup(_Np, __vsize);
1416  constexpr size_t __full_stores = _Np / __vsize;
1417  using _V = __vector_type_t<_Up, __vsize>;
1418  const array<_V, __stores> __converted
1419  = __convert_all<_V, __stores>(__v);
1420  __execute_n_times<__full_stores>(
1421  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1422  _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
1423  });
1424  if constexpr (__full_stores < __stores)
1425  _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
1426  * sizeof(_Up)>(
1427  __converted[__full_stores], __mem + __full_stores * __vsize);
1428  }
1429  }
1430 
1431  // _S_masked_store_nocvt {{{2
1432  template <typename _Tp, size_t _Np>
1433  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1434  _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k)
1435  {
1436  _BitOps::_S_bit_iteration(
1437  _MaskImpl::_S_to_bits(__k),
1438  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1439  __mem[__i] = __v[__i];
1440  });
1441  }
1442 
1443  // _S_masked_store {{{2
1444  template <typename _TW, typename _TVT = _VectorTraits<_TW>,
1445  typename _Tp = typename _TVT::value_type, typename _Up>
1446  static constexpr inline void
1447  _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept
1448  {
1449  constexpr size_t _TV_size = _S_size<_Tp>;
1450  [[maybe_unused]] const auto __vi = __to_intrin(__v);
1451  constexpr size_t __max_store_size
1452  = _SuperImpl::template _S_max_store_size<_Up>;
 1453  if constexpr (is_same_v<_Tp, _Up>
 1454  || (is_integral_v<_Tp>
 1455  && is_integral_v<_Up>
 1456  && sizeof(_Tp) == sizeof(_Up)))
1457  {
1458  // bitwise or no conversion, reinterpret:
1459  const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1460  if constexpr (__is_bitmask_v<decltype(__k)>)
1461  return _MaskMember<_Up>(__k._M_data);
1462  else
1463  return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k);
1464  }();
1465  _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v),
1466  __mem, __kk);
1467  }
1468  else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
1469  && !_CommonImpl::
1470  template __converts_via_decomposition_v<
1471  _Tp, _Up, __max_store_size>)
1472  { // conversion via decomposition is better handled via the
1473  // bit_iteration
1474  // fallback below
1475  constexpr size_t _UW_size
1476  = std::min(_TV_size, __max_store_size / sizeof(_Up));
1477  static_assert(_UW_size <= _TV_size);
1478  using _UW = _SimdWrapper<_Up, _UW_size>;
1479  using _UV = __vector_type_t<_Up, _UW_size>;
1480  using _UAbi = simd_abi::__no_sve_deduce_t<_Up, _UW_size>;
1481  if constexpr (_UW_size == _TV_size) // one convert+store
1482  {
1483  const _UW __converted = __convert<_UW>(__v);
1484  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1485  __converted, __mem,
1486  _UAbi::_MaskImpl::template _S_convert<
1487  __int_for_sizeof_t<_Up>>(__k));
1488  }
1489  else
1490  {
1491  static_assert(_UW_size * sizeof(_Up) == __max_store_size);
1492  constexpr size_t _NFullStores = _TV_size / _UW_size;
1493  constexpr size_t _NAllStores
1494  = __div_roundup(_TV_size, _UW_size);
1495  constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
1496  const array<_UV, _NAllStores> __converted
1497  = __convert_all<_UV, _NAllStores>(__v);
1498  __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1499  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1500  _UW(__converted[__i]), __mem + __i * _UW_size,
1501  _UAbi::_MaskImpl::template _S_convert<
1502  __int_for_sizeof_t<_Up>>(
1503  __extract_part<__i, _NParts>(__k.__as_full_vector())));
1504  });
1505  if constexpr (_NAllStores
1506  > _NFullStores) // one partial at the end
1507  _UAbi::_SimdImpl::_S_masked_store_nocvt(
1508  _UW(__converted[_NFullStores]),
1509  __mem + _NFullStores * _UW_size,
1510  _UAbi::_MaskImpl::template _S_convert<
1511  __int_for_sizeof_t<_Up>>(
1512  __extract_part<_NFullStores, _NParts>(
1513  __k.__as_full_vector())));
1514  }
1515  }
1516  else
1517  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1518  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1519  __mem[__i] = static_cast<_Up>(__v[__i]);
1520  });
1521  }
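  // Exposition only: _S_masked_store chooses between three strategies.
  // 1. _Tp and _Up have the same width (e.g. int -> unsigned int): the
  //    values and the mask are reinterpreted and passed on to
  //    _S_masked_store_nocvt.
  // 2. The _Tp -> _Up conversion vectorizes without decomposition (e.g. a
  //    hypothetical float -> double case): convert in chunks of _UW_size
  //    elements and masked-store each chunk with the matching part of __k.
  // 3. Otherwise fall back to one scalar converted store per set mask bit
  //    via _S_bit_iteration.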
1522 
1523  // _S_complement {{{2
1524  template <typename _Tp, size_t _Np>
1525  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1526  _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1527  {
1528  if constexpr (is_floating_point_v<_Tp>)
1529  return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x));
1530  else
1531  return ~__x._M_data;
1532  }
1533 
1534  // _S_unary_minus {{{2
1535  template <typename _Tp, size_t _Np>
1536  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1537  _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1538  {
1539  // GCC doesn't use the psign instructions, but pxor & psub seem to be
1540  // just as good a choice as pcmpeqd & psign. So meh.
1541  return -__x._M_data;
1542  }
1543 
1544  // arithmetic operators {{{2
1545  template <typename _Tp, size_t _Np>
1546  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1547  _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1548  { return __x._M_data + __y._M_data; }
1549 
1550  template <typename _Tp, size_t _Np>
1551  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1552  _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1553  { return __x._M_data - __y._M_data; }
1554 
1555  template <typename _Tp, size_t _Np>
1556  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1557  _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1558  { return __x._M_data * __y._M_data; }
1559 
1560  template <typename _Tp, size_t _Np>
1561  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1562  _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1563  {
1564  // Division by 0 is always UB, so we must ensure that the padding
1565  // elements of partial registers hold a nonzero divisor
1566  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1567  return __x._M_data / __y._M_data;
1568  else
1569  return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1570  }
1571 
1572  template <typename _Tp, size_t _Np>
1573  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1574  _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1575  {
1576  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1577  return __x._M_data % __y._M_data;
1578  else
1579  return __as_vector(__x)
1580  % _Abi::__make_padding_nonzero(__as_vector(__y));
1581  }
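  // Exposition only: for a partial ABI (e.g. 3 elements carried in a 4-wide
  // register) the padding lane of __y is unspecified and may be zero, so
  // dividing the full register would be UB. __make_padding_nonzero replaces
  // the padding lanes of the divisor with a nonzero value before the
  // element-wise / and % above; the padding lanes of the result are ignored.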
1582 
1583  template <typename _Tp, size_t _Np>
1584  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1585  _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1586  { return __and(__x, __y); }
1587 
1588  template <typename _Tp, size_t _Np>
1589  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1590  _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1591  { return __or(__x, __y); }
1592 
1593  template <typename _Tp, size_t _Np>
1594  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1595  _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1596  { return __xor(__x, __y); }
1597 
1598  template <typename _Tp, size_t _Np>
1599  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1600  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1601  { return __x._M_data << __y._M_data; }
1602 
1603  template <typename _Tp, size_t _Np>
1604  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1605  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1606  { return __x._M_data >> __y._M_data; }
1607 
1608  template <typename _Tp, size_t _Np>
1609  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1610  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
1611  { return __x._M_data << __y; }
1612 
1613  template <typename _Tp, size_t _Np>
1614  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1615  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1616  { return __x._M_data >> __y; }
1617 
1618  // compares {{{2
1619  // _S_equal_to {{{3
1620  template <typename _Tp, size_t _Np>
1621  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1622  _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1623  { return __x._M_data == __y._M_data; }
1624 
1625  // _S_not_equal_to {{{3
1626  template <typename _Tp, size_t _Np>
1627  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1628  _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1629  { return __x._M_data != __y._M_data; }
1630 
1631  // _S_less {{{3
1632  template <typename _Tp, size_t _Np>
1633  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1634  _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1635  { return __x._M_data < __y._M_data; }
1636 
1637  // _S_less_equal {{{3
1638  template <typename _Tp, size_t _Np>
1639  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1640  _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1641  { return __x._M_data <= __y._M_data; }
1642 
1643  // _S_negate {{{2
1644  template <typename _Tp, size_t _Np>
1645  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1646  _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1647  { return !__x._M_data; }
1648 
1649  // _S_min, _S_max, _S_minmax {{{2
1650  template <typename _Tp, size_t _Np>
1651  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1652  _SimdWrapper<_Tp, _Np>
1653  _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1654  { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1655 
1656  template <typename _Tp, size_t _Np>
1657  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1658  _SimdWrapper<_Tp, _Np>
1659  _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1660  { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1661 
1662  template <typename _Tp, size_t _Np>
1663  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1664  pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1665  _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1666  {
1667  return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1668  __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1669  }
1670 
1671  // reductions {{{2
1672  template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1673  typename _BinaryOperation>
1674  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1675  _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
1676  simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1677  {
1678  using _V = __vector_type_t<_Tp, _Np / 2>;
1679  static_assert(sizeof(_V) <= sizeof(__x));
1680  // _S_full_size is the size of the smallest native SIMD register that
1681  // can store _Np/2 elements:
1682  using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
1683  using _HalfSimd = __deduced_simd<_Tp, _Np / 2>;
1684  const auto __xx = __as_vector(__x);
1685  return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
1686  static_cast<_HalfSimd>(__as_vector(__binary_op(
1687  static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
1688  static_cast<_FullSimd>(__intrin_bitcast<_V>(
1689  __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>(
1690  __xx)))))),
1691  __binary_op);
1692  }
1693 
1694  template <typename _Tp, typename _BinaryOperation>
1695  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1696  _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1697  {
1698  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
1699  if constexpr (_Np == 1)
1700  return __x[0];
1701  else if constexpr (_Np == 2)
1702  return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1703  simd<_Tp, simd_abi::scalar>(__x[1]))[0];
1704  else if (__builtin_is_constant_evaluated())
1705  {
1706  simd<_Tp, simd_abi::scalar> __acc = __x[0];
1707  for (size_t __i = 1; __i < _Np; ++__i)
1708  __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i]));
1709  return __acc[0];
1710  }
1711  else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{
1712  {
1713  [[maybe_unused]] constexpr auto __full_size
1714  = _Abi::template _S_full_size<_Tp>;
1715  if constexpr (_Np == 3)
1716  return __binary_op(
1717  __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1718  simd<_Tp, simd_abi::scalar>(__x[1])),
1719  simd<_Tp, simd_abi::scalar>(__x[2]))[0];
1720  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1721  plus<>>)
1722  {
1723  using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>;
1724  return _Ap::_SimdImpl::_S_reduce(
1725  simd<_Tp, _Ap>(__private_init,
1726  _Abi::_S_masked(__as_vector(__x))),
1727  __binary_op);
1728  }
1729  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1730  multiplies<>>)
1731  {
1732  using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>;
1733  using _TW = _SimdWrapper<_Tp, __full_size>;
1734  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
1735  = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
1736  _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one
1737  = __vector_broadcast<__full_size>(_Tp(1));
1738  const _TW __x_full = __data(__x).__as_full_vector();
1739  const _TW __x_padded_with_ones
1740  = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one,
1741  __x_full);
1742  return _Ap::_SimdImpl::_S_reduce(
1743  simd<_Tp, _Ap>(__private_init, __x_padded_with_ones),
1744  __binary_op);
1745  }
1746  else if constexpr (_Np & 1)
1747  {
1748  using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np - 1>;
1749  return __binary_op(
1750  simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
1751  simd<_Tp, _Ap>(
1752  __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>(
1753  __as_vector(__x))),
1754  __binary_op)),
1755  simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0];
1756  }
1757  else
1758  return _S_reduce_partial<_Np>(
1759  make_index_sequence<_Np / 2>(),
1760  make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op);
1761  } //}}}
1762  else if constexpr (sizeof(__x) == 16) //{{{
1763  {
1764  if constexpr (_Np == 16)
1765  {
1766  const auto __y = __data(__x);
1767  __x = __binary_op(
1768  _M_make_simd<_Tp, _Np>(
1769  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
1770  7, 7>(__y)),
1771  _M_make_simd<_Tp, _Np>(
1772  __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
1773  14, 14, 15, 15>(__y)));
1774  }
1775  if constexpr (_Np >= 8)
1776  {
1777  const auto __y = __vector_bitcast<short>(__data(__x));
1778  __x = __binary_op(
1779  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1780  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1781  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1782  __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1783  }
1784  if constexpr (_Np >= 4)
1785  {
1786  using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1787  const auto __y = __vector_bitcast<_Up>(__data(__x));
1788  __x = __binary_op(__x,
1789  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1790  __vector_permute<3, 2, 1, 0>(__y))));
1791  }
1792  using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
1793  const auto __y = __vector_bitcast<_Up>(__data(__x));
1794  __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1795  __vector_permute<1, 1>(__y))));
1796  return __x[0];
1797  } //}}}
1798  else
1799  {
1800  static_assert(sizeof(__x) > __min_vector_size<_Tp>);
1801  static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
1802  using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np / 2>;
1803  using _V = simd<_Tp, _Ap>;
1804  return _Ap::_SimdImpl::_S_reduce(
1805  __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
1806  _V(__private_init,
1807  __extract<1, 2>(__as_vector(__x)))),
1808  static_cast<_BinaryOperation&&>(__binary_op));
1809  }
1810  }
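  // Exposition only: for vectors wider than 16 bytes the else-branch above
  // splits the input (hypothetically {a0..a7} with _Np == 8) into the halves
  // {a0..a3} and {a4..a7} via __extract<0, 2> / __extract<1, 2>, combines
  // them with one element-wise __binary_op, and recurses on the half-sized
  // ABI; the 16-byte case finishes with __vector_permute-based swaps, so the
  // whole reduction is a log2(_Np)-deep tree of vector operations.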
1811 
1812  // math {{{2
1813  // frexp, modf and copysign implemented in simd_math.h
1814 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \
1815  template <typename _Tp, typename... _More> \
1816  static _Tp \
1817  _S_##__name(const _Tp& __x, const _More&... __more) \
1818  { \
1819  return __generate_vector<_Tp>( \
1820  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1821  return __name(__x[__i], __more[__i]...); \
1822  }); \
1823  }
1824 
1825 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1826  template <typename _Tp, typename... _More> \
1827  static typename _Tp::mask_type \
1828  _S_##__name(const _Tp& __x, const _More&... __more) \
1829  { \
1830  return __generate_vector<_Tp>( \
1831  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1832  return __name(__x[__i], __more[__i]...); \
1833  }); \
1834  }
1835 
1836 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1837  template <typename _Tp, typename... _More> \
1838  static auto \
1839  _S_##__name(const _Tp& __x, const _More&... __more) \
1840  { \
1841  return __fixed_size_storage_t<_RetTp, \
1842  _VectorTraits<_Tp>::_S_partial_width>:: \
1843  _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1844  return __meta._S_generator( \
1845  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1846  return __name(__x[__meta._S_offset + __i], \
1847  __more[__meta._S_offset + __i]...); \
1848  }, \
1849  static_cast<_RetTp*>(nullptr)); \
1850  }); \
1851  }
1852 
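  // Exposition only: each fallback expands to an element-wise loop over the
  // scalar <cmath> function, roughly
  //   _S_acos(__x) == __generate_vector<_Tp>([&](auto __i) {
  //                     return acos(__x[__i]); });
  // The FIXEDRET variant collects results of a fixed type (e.g. int for
  // ilogb) into a __fixed_size_storage_t instead of _Tp.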
1853  _GLIBCXX_SIMD_MATH_FALLBACK(acos)
1854  _GLIBCXX_SIMD_MATH_FALLBACK(asin)
1855  _GLIBCXX_SIMD_MATH_FALLBACK(atan)
1856  _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
1857  _GLIBCXX_SIMD_MATH_FALLBACK(cos)
1858  _GLIBCXX_SIMD_MATH_FALLBACK(sin)
1859  _GLIBCXX_SIMD_MATH_FALLBACK(tan)
1860  _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
1861  _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
1862  _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
1863  _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
1864  _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
1865  _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
1866  _GLIBCXX_SIMD_MATH_FALLBACK(exp)
1867  _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
1868  _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
1869  _GLIBCXX_SIMD_MATH_FALLBACK(ldexp)
1870  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
1871  _GLIBCXX_SIMD_MATH_FALLBACK(log)
1872  _GLIBCXX_SIMD_MATH_FALLBACK(log10)
1873  _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
1874  _GLIBCXX_SIMD_MATH_FALLBACK(log2)
1875  _GLIBCXX_SIMD_MATH_FALLBACK(logb)
1876 
1877  // modf implemented in simd_math.h
1878  _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
1879  _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
1880  _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
1881  _GLIBCXX_SIMD_MATH_FALLBACK(fabs)
1882  _GLIBCXX_SIMD_MATH_FALLBACK(pow)
1883  _GLIBCXX_SIMD_MATH_FALLBACK(sqrt)
1884  _GLIBCXX_SIMD_MATH_FALLBACK(erf)
1885  _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
1886  _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
1887  _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
1888 
1889  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
1890  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
1891 
1892  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
1893  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
1894 
1895  _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
1896  _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
1897 
1898  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
1899  static _Tp
1900  _S_remquo(const _Tp __x, const _Tp __y,
1901  __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
1902  {
1903  return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1904  int __tmp;
1905  auto __r = remquo(__x[__i], __y[__i], &__tmp);
1906  __z->_M_set(__i, __tmp);
1907  return __r;
1908  });
1909  }
1910 
1911  // copysign in simd_math.h
1912  _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
1913  _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
1914  _GLIBCXX_SIMD_MATH_FALLBACK(fmax)
1915  _GLIBCXX_SIMD_MATH_FALLBACK(fmin)
1916  _GLIBCXX_SIMD_MATH_FALLBACK(fma)
1917 
1918  template <typename _Tp, size_t _Np>
1919  static constexpr _MaskMember<_Tp>
1920  _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
1921  _SimdWrapper<_Tp, _Np> __y) noexcept
1922  {
1923  using _Ip = __int_for_sizeof_t<_Tp>;
1924  const auto __xn = __vector_bitcast<_Ip>(__x);
1925  const auto __yn = __vector_bitcast<_Ip>(__y);
1926  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
1927  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
1928  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1929  __xp > __yp);
1930  }
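  // Exposition only: the integer trick above (shared by the next three
  // functions) relies on IEEE-754 values ordering like sign-magnitude
  // integers. Clearing the sign bit (__xn & __finite_max_v<_Ip>) and
  // negating maps negative inputs to two's-complement integers in the
  // correct order, so a plain integer compare reproduces the floating-point
  // ordering without a floating-point compare (which could raise FE_INVALID
  // on NaN); NaN lanes are removed afterwards via __andnot(_S_isunordered).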
1931 
1932  template <typename _Tp, size_t _Np>
1933  static constexpr _MaskMember<_Tp>
1934  _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x,
1935  _SimdWrapper<_Tp, _Np> __y) noexcept
1936  {
1937  using _Ip = __int_for_sizeof_t<_Tp>;
1938  const auto __xn = __vector_bitcast<_Ip>(__x);
1939  const auto __yn = __vector_bitcast<_Ip>(__y);
1940  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
1941  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
1942  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1943  __xp >= __yp);
1944  }
1945 
1946  template <typename _Tp, size_t _Np>
1947  static constexpr _MaskMember<_Tp>
1948  _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept
1949  {
1950  using _Ip = __int_for_sizeof_t<_Tp>;
1951  const auto __xn = __vector_bitcast<_Ip>(__x);
1952  const auto __yn = __vector_bitcast<_Ip>(__y);
1953  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
1954  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
1955  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1956  __xp < __yp);
1957  }
1958 
1959  template <typename _Tp, size_t _Np>
1960  static constexpr _MaskMember<_Tp>
1961  _S_islessequal(_SimdWrapper<_Tp, _Np> __x,
1962  _SimdWrapper<_Tp, _Np> __y) noexcept
1963  {
1964  using _Ip = __int_for_sizeof_t<_Tp>;
1965  const auto __xn = __vector_bitcast<_Ip>(__x);
1966  const auto __yn = __vector_bitcast<_Ip>(__y);
1967  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
1968  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
1969  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
1970  __xp <= __yp);
1971  }
1972 
1973  template <typename _Tp, size_t _Np>
1974  static constexpr _MaskMember<_Tp>
1975  _S_islessgreater(_SimdWrapper<_Tp, _Np> __x,
1976  _SimdWrapper<_Tp, _Np> __y) noexcept
1977  {
1978  return __andnot(_SuperImpl::_S_isunordered(__x, __y),
1979  _SuperImpl::_S_not_equal_to(__x, __y));
1980  }
1981 
1982 #undef _GLIBCXX_SIMD_MATH_FALLBACK
1983 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET
1984 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
1985  // _S_abs {{{3
1986  template <typename _Tp, size_t _Np>
1987  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1988  _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
1989  {
1990  // if (__builtin_is_constant_evaluated())
1991  // {
1992  // return __x._M_data < 0 ? -__x._M_data : __x._M_data;
1993  // }
1994  if constexpr (is_floating_point_v<_Tp>)
1995  // `v < 0 ? -v : v` cannot compile to the efficient implementation of
1996  // masking the signbit off because it must consider v == -0
1997 
1998  // ~(-0.) & v would be easy, but breaks with -fno-signed-zeros
1999  return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
2000  else
2001  return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2002  }
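  // Exposition only: for floating-point _Tp the __and above clears only the
  // sign bit, e.g. abs(-0.0f) == 0.0f and abs(-3.5f) == 3.5f, without a
  // compare or branch; _S_absmask is ~_S_signmask, i.e. all bits of each
  // element except its sign bit.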
2003 
2004  // }}}3
2005  // _S_plus_minus {{{
2006  // Returns __x + __y - __y without -fassociative-math optimizing to __x.
2007  // - _TV must be __vector_type_t<floating-point type, N>.
2008  // - _UV must be _TV or floating-point type.
2009  template <typename _TV, typename _UV>
2010  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2011  _S_plus_minus(_TV __x, _UV __y) noexcept
2012  {
2013 #if defined __i386__ && !defined __SSE_MATH__
2014  if constexpr (sizeof(__x) == 8)
2015  { // operations on __x would use the FPU
2016  static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2017  const auto __x4 = __vector_bitcast<float, 4>(__x);
2018  if constexpr (is_same_v<_TV, _UV>)
2019  return __vector_bitcast<float, 2>(
2020  _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2021  else
2022  return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2023  }
2024 #endif
2025 #if !defined __clang__ && __GCC_IEC_559 == 0
2026  if (__builtin_is_constant_evaluated()
2027  || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2028  return (__x + __y) - __y;
2029  else
2030  return [&] {
2031  __x += __y;
2032  if constexpr(__have_sse)
2033  {
2034  if constexpr (sizeof(__x) >= 16)
2035  asm("" : "+x"(__x));
2036  else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2037  asm("" : "+x"(__x[0]), "+x"(__x[1]));
2038  else
2039  __assert_unreachable<_TV>();
2040  }
2041  else if constexpr(__have_neon)
2042  asm("" : "+w"(__x));
2043  else if constexpr (__have_power_vmx)
2044  {
2045  if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2046  asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2047  else
2048  asm("" : "+v"(__x));
2049  }
2050  else
2051  asm("" : "+g"(__x));
2052  return __x - __y;
2053  }();
2054 #else
2055  return (__x + __y) - __y;
2056 #endif
2057  }
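  // Exposition only: the empty asm acts as an optimization barrier on the
  // intermediate __x + __y, so that with -fassociative-math (where
  // __GCC_IEC_559 == 0) the compiler cannot fold (__x + __y) - __y back to
  // __x. The rounding functions below depend on that addition actually
  // rounding.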
2058 
2059  // }}}
2060  // _S_nearbyint {{{3
2061  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2062  _GLIBCXX_SIMD_INTRINSIC static _Tp
2063  _S_nearbyint(_Tp __x_) noexcept
2064  {
2065  using value_type = typename _TVT::value_type;
2066  using _V = typename _TVT::type;
2067  const _V __x = __x_;
2068  const _V __absx = __and(__x, _S_absmask<_V>);
2069  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2070  _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2071  = _V() + (1ull << (__digits_v<value_type> - 1));
2072  const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2073  const _V __shifted = _S_plus_minus(__x, __shifter);
2074  return __absx < __shifter_abs ? __shifted : __x;
2075  }
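  // Exposition only, assuming float (__digits_v == 24): __shifter_abs is
  // 2^23. Adding 2^23 to |x| < 2^23 pushes all fraction bits out of the
  // mantissa, so the addition rounds to an integer in the current rounding
  // mode; subtracting 2^23 again yields nearbyint(x). Copying the sign of
  // __x into __shifter keeps this correct for negative inputs, and values
  // with |x| >= 2^23 are already integral and returned unchanged.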
2076 
2077  // _S_rint {{{3
2078  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2079  _GLIBCXX_SIMD_INTRINSIC static _Tp
2080  _S_rint(_Tp __x) noexcept
2081  { return _SuperImpl::_S_nearbyint(__x); }
2082 
2083  // _S_trunc {{{3
2084  template <typename _Tp, size_t _Np>
2085  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2086  _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2087  {
2088  using _V = __vector_type_t<_Tp, _Np>;
2089  const _V __absx = __and(__x._M_data, _S_absmask<_V>);
2090  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>);
2091  constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1);
2092  _V __truncated = _S_plus_minus(__absx, __shifter);
2093  __truncated -= __truncated > __absx ? _V() + 1 : _V();
2094  return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
2095  : __x._M_data;
2096  }
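  // Exposition only: _S_plus_minus(__absx, __shifter) rounds to nearest and
  // may therefore round up (e.g. |x| = 1.75 yields 2.0), so one unit is
  // subtracted whenever __truncated > __absx, giving trunc(|x|) = 1.0.
  // __or(__xor(__absx, __x), __truncated) then reattaches the original sign.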
2097 
2098  // _S_round {{{3
2099  template <typename _Tp, size_t _Np>
2100  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2101  _S_round(_SimdWrapper<_Tp, _Np> __x)
2102  {
2103  const auto __abs_x = _SuperImpl::_S_abs(__x);
2104  const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
2105  const auto __r_abs // round(abs(x)) =
2106  = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0);
2107  return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
2108  }
2109 
2110  // _S_floor {{{3
2111  template <typename _Tp, size_t _Np>
2112  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2113  _S_floor(_SimdWrapper<_Tp, _Np> __x)
2114  {
2115  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2116  const auto __negative_input
2117  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2118  const auto __mask
2119  = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2120  return __or(__andnot(__mask, __y),
2121  __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1)));
2122  }
2123 
2124  // _S_ceil {{{3
2125  template <typename _Tp, size_t _Np>
2126  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2127  _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2128  {
2129  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2130  const auto __negative_input
2131  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2132  const auto __inv_mask
2133  = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2134  return __or(__and(__inv_mask, __y),
2135  __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1)));
2136  }
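  // Exposition only: floor and ceil are derived from trunc. For floor, the
  // truncated value is decremented by 1 exactly where the input was negative
  // and not already integral (e.g. floor(-1.25) = trunc(-1.25) - 1 = -2);
  // for ceil, 1 is added where the input was positive and not integral
  // (ceil(1.25) = trunc(1.25) + 1 = 2).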
2137 
2138  // _S_isnan {{{3
2139  template <typename _Tp, size_t _Np>
2140  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2141  _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2142  {
2143 #if __FINITE_MATH_ONLY__
2144  return {}; // false
2145 #elif !defined __SUPPORT_SNAN__
2146  return ~(__x._M_data == __x._M_data);
2147 #elif defined __STDC_IEC_559__
2148  using _Ip = __int_for_sizeof_t<_Tp>;
2149  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2150  const auto __infn
2151  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
2152  return __infn < __absn;
2153 #else
2154 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2155 #endif
2156  }
2157 
2158  // _S_isfinite {{{3
2159  template <typename _Tp, size_t _Np>
2160  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2161  _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2162  {
2163 #if __FINITE_MATH_ONLY__
2164  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
2165  _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV();
2166  return __alltrue;
2167 #else
2168  // if all exponent bits are set, __x is either inf or NaN
2169  using _Ip = __int_for_sizeof_t<_Tp>;
2170  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2171  const auto __maxn
2172  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2173  return __absn <= __maxn;
2174 #endif
2175  }
2176 
2177  // _S_isunordered {{{3
2178  template <typename _Tp, size_t _Np>
2179  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2180  _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2181  { return __or(_S_isnan(__x), _S_isnan(__y)); }
2182 
2183  // _S_signbit {{{3
2184  template <typename _Tp, size_t _Np>
2185  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2186  _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2187  {
2188  using _Ip = __int_for_sizeof_t<_Tp>;
2189  return __vector_bitcast<_Ip>(__x) < 0;
2190  // Arithmetic right shift (SRA) would also work (instead of compare), but
2191  // 64-bit SRA isn't available on x86 before AVX512. And in general,
2192  // compares are more likely to be efficient than SRA.
2193  }
2194 
2195  // _S_isinf {{{3
2196  template <typename _Tp, size_t _Np>
2197  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2198  _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2199  {
2200 #if __FINITE_MATH_ONLY__
2201  return {}; // false
2202 #else
2203  return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x),
2204  __vector_broadcast<_Np>(
2205  __infinity_v<_Tp>));
2206  // alternative:
2207  // compare to inf using the corresponding integer type
2208  /*
2209  return
2210  __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>(
2211  _S_abs(__x)._M_data)
2212  ==
2213  __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>(
2214  __infinity_v<_Tp>)));
2215  */
2216 #endif
2217  }
2218 
2219  // _S_isnormal {{{3
2220  template <typename _Tp, size_t _Np>
2221  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2222  _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
2223  {
2224  using _Ip = __int_for_sizeof_t<_Tp>;
2225  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2226  const auto __minn
2227  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
2228 #if __FINITE_MATH_ONLY__
2229  return __absn >= __minn;
2230 #else
2231  const auto __maxn
2232  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2233  return __minn <= __absn && __absn <= __maxn;
2234 #endif
2235  }
2236 
2237  // _S_fpclassify {{{3
2238  template <typename _Tp, size_t _Np>
2239  _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2240  _S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
2241  {
2242  if constexpr(__have_sve)
2243  {
2244  __fixed_size_storage_t<int, _Np> __r{};
2245  __execute_n_times<_Np>(
2246  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2247  __r._M_set(__i, std::fpclassify(__x[__i]));
2248  });
2249  return __r;
2250  }
2251  else
2252  {
2253  using _I = __int_for_sizeof_t<_Tp>;
2254  const auto __xn
2255  = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
2256  constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
2257  _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
2258  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
2259 
2260  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
2261  = __vector_broadcast<_NI, _I>(FP_NORMAL);
2262 #if !__FINITE_MATH_ONLY__
2263  _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
2264  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
2265  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
2266  = __vector_broadcast<_NI, _I>(FP_NAN);
2267  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
2268  = __vector_broadcast<_NI, _I>(FP_INFINITE);
2269 #endif
2270 #ifndef __FAST_MATH__
2271  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
2272  = __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
2273 #endif
2274  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
2275  = __vector_broadcast<_NI, _I>(FP_ZERO);
2276 
2277  __vector_type_t<_I, _NI>
2278  __tmp = __xn < __minn
2279 #ifdef __FAST_MATH__
2280  ? __fp_zero
2281 #else
2282  ? (__xn == 0 ? __fp_zero : __fp_subnormal)
2283 #endif
2284 #if __FINITE_MATH_ONLY__
2285  : __fp_normal;
2286 #else
2287  : (__xn < __infn ? __fp_normal
2288  : (__xn == __infn ? __fp_infinite : __fp_nan));
2289 #endif
2290 
2291  if constexpr (sizeof(_I) == sizeof(int))
2292  {
2293  using _FixedInt = __fixed_size_storage_t<int, _Np>;
2294  const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
2295  if constexpr (_FixedInt::_S_tuple_size == 1)
2296  return {__as_int};
2297  else if constexpr (_FixedInt::_S_tuple_size == 2
2298  && is_same_v<
2299  typename _FixedInt::_SecondType::_FirstAbi,
2300  simd_abi::scalar>)
2301  return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
2302  else if constexpr (_FixedInt::_S_tuple_size == 2)
2303  return {__extract<0, 2>(__as_int),
2304  __auto_bitcast(__extract<1, 2>(__as_int))};
2305  else
2306  __assert_unreachable<_Tp>();
2307  }
2308  else if constexpr (_Np == 2 && sizeof(_I) == 8
2309  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
2310  {
2311  const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2312  return {int(__aslong[0]), {int(__aslong[1])}};
2313  }
2314 #if _GLIBCXX_SIMD_X86INTRIN
2315  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2316  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2317  return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2318  __to_intrin(__hi128(__tmp)))};
2319  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2320  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2321  return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2322 #endif // _GLIBCXX_SIMD_X86INTRIN
2323  else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2324  return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2325  [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2326  return __make_wrapper<int>(__l...);
2327  })};
2328  else
2329  __assert_unreachable<_Tp>();
2330  }
2331  }
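  // Exposition only: the non-SVE path classifies |x| reinterpreted as an
  // integer: below __norm_min it is FP_ZERO or FP_SUBNORMAL (depending on
  // whether all bits are zero), below infinity it is FP_NORMAL, equal to
  // infinity FP_INFINITE, and anything larger FP_NAN; with -ffast-math or
  // -ffinite-math-only some of these cases are compiled out. The remaining
  // branches only repackage the int results into __fixed_size_storage_t.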
2332 
2333  // _S_increment & _S_decrement{{{2
2334  template <typename _Tp, size_t _Np>
2335  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2336  _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2337  { __x = __x._M_data + 1; }
2338 
2339  template <typename _Tp, size_t _Np>
2340  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2341  _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2342  { __x = __x._M_data - 1; }
2343 
2344  // smart_reference access {{{2
2345  template <typename _Tp, size_t _Np, typename _Up>
2346  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2347  _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2348  { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2349 
2350  // _S_masked_assign{{{2
2351  template <typename _Tp, typename _K, size_t _Np>
2352  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2353  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2354  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2355  {
2356  if (__k._M_is_constprop_none_of())
2357  return;
2358  else if (__k._M_is_constprop_all_of())
2359  __lhs = __rhs;
2360  else
2361  __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2362  }
2363 
2364  template <typename _Tp, typename _K, size_t _Np>
2365  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2366  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2367  __type_identity_t<_Tp> __rhs)
2368  {
2369  if (__k._M_is_constprop_none_of())
2370  return;
2371  else if (__k._M_is_constprop_all_of())
2372  __lhs = __vector_broadcast<_Np>(__rhs);
2373  else if (__builtin_constant_p(__rhs) && __rhs == 0)
2374  {
2375  if constexpr (!is_same_v<bool, _K>)
2376  // the __andnot optimization only makes sense if __k._M_data is a
2377  // vector register
2378  __lhs._M_data
2379  = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data);
2380  else
2381  // for AVX512/__mmask, a _mm512_maskz_mov is best
2382  __lhs
2383  = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>());
2384  }
2385  else
2386  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2387  _SimdWrapper<_Tp, _Np>(
2388  __vector_broadcast<_Np>(__rhs)));
2389  }
2390 
2391  // _S_masked_cassign {{{2
2392  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2393  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2394  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2395  _SimdWrapper<_Tp, _Np>& __lhs,
2396  const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs,
2397  _Op __op)
2398  {
2399  if (__k._M_is_constprop_none_of())
2400  return;
2401  else if (__k._M_is_constprop_all_of())
2402  __lhs = __op(_SuperImpl{}, __lhs, __rhs);
2403  else
2404  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2405  __op(_SuperImpl{}, __lhs, __rhs));
2406  }
2407 
2408  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2409  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2410  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2411  _SimdWrapper<_Tp, _Np>& __lhs,
2412  const __type_identity_t<_Tp> __rhs, _Op __op)
2413  { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2414 
2415  // _S_masked_unary {{{2
2416  template <template <typename> class _Op, typename _Tp, typename _K,
2417  size_t _Np>
2418  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2419  _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2420  const _SimdWrapper<_Tp, _Np> __v)
2421  {
2422  if (__k._M_is_constprop_none_of())
2423  return __v;
2424  auto __vv = _M_make_simd(__v);
2425  _Op<decltype(__vv)> __op;
2426  if (__k._M_is_constprop_all_of())
2427  return __data(__op(__vv));
2428  else if constexpr (is_same_v<_Op<void>, __increment<void>>)
2429  {
2430  static_assert(not std::is_same_v<_K, bool>);
2431  if constexpr (is_integral_v<_Tp>)
2432  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2433  return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2434  else if constexpr (not __have_avx2)
2435  return __v._M_data
2436  + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2437  _K, _Tp(1)));
2438  // starting with AVX2 it is more efficient to blend after add
2439  }
2440  else if constexpr (is_same_v<_Op<void>, __decrement<void>>)
2441  {
2442  static_assert(not std::is_same_v<_K, bool>);
2443  if constexpr (is_integral_v<_Tp>)
2444  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2445  return __v._M_data + __vector_bitcast<_Tp>(__k._M_data);
2446  else if constexpr (not __have_avx2)
2447  return __v._M_data
2448  - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2449  _K, _Tp(1)));
2450  // starting with AVX2 it is more efficient to blend after sub
2451  }
2452  return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
2453  }
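  // Exposition only: for an integral _Tp the vector mask __k holds -1 in
  // selected lanes and 0 elsewhere, so __v - __k increments and __v + __k
  // decrements exactly the selected lanes without a separate blend. The
  // generic path applies __op to all lanes and blends the result with __k.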
2454 
2455  //}}}2
2456  };
2457 
2458 // _MaskImplBuiltinMixin {{{1
2459 struct _MaskImplBuiltinMixin
2460 {
2461  template <typename _Tp>
2462  using _TypeTag = _Tp*;
2463 
2464  // _S_to_maskvector {{{
2465  template <typename _Up, size_t _ToN = 1>
2466  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2467  _S_to_maskvector(bool __x)
2468  {
2469  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2470  return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2471  : __vector_type_t<_Up, _ToN>{};
2472  }
2473 
2474  template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2475  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2476  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2477  _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
2478  {
2479  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2480  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2481  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2482  if constexpr (__i < _Np)
2483  return __x[__i] ? ~_Up() : _Up();
2484  else
2485  return _Up();
2486  });
2487  }
2488 
2489  template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
2490  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2491  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2492  _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
2493  {
2494  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2495  using _TW = _SimdWrapper<_Tp, _Np>;
2496  using _UW = _SimdWrapper<_Up, _ToN>;
2497  if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW))
2498  return __wrapper_bitcast<_Up, _ToN>(__x);
2499  else if constexpr (is_same_v<_Tp, bool>) // bits -> vector
2500  return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data));
2501  else
2502  { // vector -> vector
2503  /*
2504  [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data);
2505  if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) == 16)
2506  return __vector_permute<1, 3, -1, -1>(__y);
2507  else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 2 && sizeof(__y) == 16)
2508  return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y);
2509  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
2510  && sizeof(__y) == 16)
2511  return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y);
2512  else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
2513  && sizeof(__y) == 16)
2514  return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
2515  -1, -1, -1, -1>(__y);
2516  else if constexpr (sizeof(_Tp) == 4 && sizeof(_Up) == 1
2517  && sizeof(__y) == 16)
2518  return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
2519  -1, -1, -1, -1, -1>(__y);
2520  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 1
2521  && sizeof(__y) == 16)
2522  return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2523  -1, -1, -1, -1, -1>(__y);
2524  else
2525  */
2526  {
2527  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2528  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2529  if constexpr (__i < _Np)
2530  return _Up(__x[__i.value]);
2531  else
2532  return _Up();
2533  });
2534  }
2535  }
2536  }
2537 
2538  // }}}
2539  // _S_to_bits {{{
2540  template <typename _Tp, size_t _Np>
2541  _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
2542  _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
2543  {
2544  static_assert(!is_same_v<_Tp, bool>);
2545  static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong));
2546  using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
2547  const auto __bools
2548  = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
2549  _ULLong __r = 0;
2550  __execute_n_times<_Np>(
2551  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2552  __r |= _ULLong(__bools[__i.value]) << __i;
2553  });
2554  return __r;
2555  }
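  // Exposition only: each mask lane is either all-zeros or all-ones, so the
  // unsigned shift by (element bits - 1) leaves exactly the value 0 or 1 per
  // lane; the loop then ORs lane __i into bit __i of the resulting
  // _SanitizedBitMask.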
2556 
2557  // }}}
2558 };
2559 
2560 // _MaskImplBuiltin {{{1
2561 template <typename _Abi, typename>
2562  struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2563  {
2564  using _MaskImplBuiltinMixin::_S_to_bits;
2565  using _MaskImplBuiltinMixin::_S_to_maskvector;
2566 
2567  // member types {{{
2568  template <typename _Tp>
2569  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2570 
2571  template <typename _Tp>
2572  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2573 
2574  using _SuperImpl = typename _Abi::_MaskImpl;
2575  using _CommonImpl = typename _Abi::_CommonImpl;
2576 
2577  template <typename _Tp>
2578  static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2579 
2580  // }}}
2581  // _S_broadcast {{{
2582  template <typename _Tp>
2583  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2584  _S_broadcast(bool __x)
2585  { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2586 
2587  // }}}
2588  // _S_load {{{
2589  template <typename _Tp>
2590  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2591  _S_load(const bool* __mem)
2592  {
2593  using _I = __int_for_sizeof_t<_Tp>;
2594  if (not __builtin_is_constant_evaluated())
2595  if constexpr (sizeof(_Tp) == sizeof(bool))
2596  {
2597  const auto __bools
2598  = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem);
2599  // bool is {0, 1}, everything else is UB
2600  return __bools > 0;
2601  }
2602  return __generate_vector<_I, _S_size<_Tp>>(
2603  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2604  return __mem[__i] ? ~_I() : _I();
2605  });
2606  }
2607 
2608  // }}}
2609  // _S_convert {{{
2610  template <typename _Tp, size_t _Np, bool _Sanitized>
2611  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2612  _S_convert(_BitMask<_Np, _Sanitized> __x)
2613  {
2614  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2615  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2616  else
2617  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2618  _S_size<_Tp>>(
2619  __x._M_sanitized());
2620  }
2621 
2622  template <typename _Tp, size_t _Np>
2623  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2624  _S_convert(_SimdWrapper<bool, _Np> __x)
2625  {
2626  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2627  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2628  else
2629  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2630  _S_size<_Tp>>(
2631  _BitMask<_Np>(__x._M_data)._M_sanitized());
2632  }
2633 
2634  template <typename _Tp, typename _Up, size_t _Np>
2635  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2636  _S_convert(_SimdWrapper<_Up, _Np> __x)
2637  {
2638  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2639  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2640  _SuperImpl::_S_to_bits(__x));
2641  else
2642  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2643  _S_size<_Tp>>(__x);
2644  }
2645 
2646  template <typename _Tp, typename _Up, typename _UAbi>
2647  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2648  _S_convert(simd_mask<_Up, _UAbi> __x)
2649  {
2650  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2651  {
2652  using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>;
2653  if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits
2654  return _R(__data(__x));
2655  else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits
2656  return _R(__data(__x));
2657  else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits
2658  return _R(__data(__x)._M_to_bits());
2659  else // vector -> bits
2660  return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
2661  }
2662  else
2663  {
2664  if constexpr(__is_sve_abi<_UAbi>())
2665  {
2666  simd_mask<_Tp> __r(false);
2667  constexpr size_t __min_size = std::min(__r.size(), __x.size());
2668  __execute_n_times<__min_size>(
2669  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2670  __r[__i] = __x[__i];
2671  });
2672  return __data(__r);
2673  }
2674  else
2675  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2676  _S_size<_Tp>>(
2677  __data(__x));
2678  }
2679  }
2680  // }}}
2681  // _S_masked_load {{{2
2682  template <typename _Tp, size_t _Np>
2683  static inline _SimdWrapper<_Tp, _Np>
2684  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
2685  _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
2686  {
2687  // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
2688  auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
2689  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
2690  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2691  __tmp._M_set(__i, -__mem[__i]);
2692  });
2693  __merge = __wrapper_bitcast<_Tp>(__tmp);
2694  return __merge;
2695  }
2696 
2697  // _S_store {{{2
2698  template <typename _Tp, size_t _Np>
2699  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2700  _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2701  {
2702  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2703  __mem[__i] = __v[__i];
2704  });
2705  }
2706 
2707  // _S_masked_store {{{2
2708  template <typename _Tp, size_t _Np>
2709  static inline void
2710  _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2711  const _SimdWrapper<_Tp, _Np> __k) noexcept
2712  {
2713  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2714  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2715  __mem[__i] = __v[__i];
2716  });
2717  }
2718 
2719  // _S_from_bitmask{{{2
2720  template <size_t _Np, typename _Tp>
2721  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2722  _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
2723  { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); }
2724 
2725  // logical and bitwise operators {{{2
2726  template <typename _Tp, size_t _Np>
2727  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2728  _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2729  { return __and(__x._M_data, __y._M_data); }
2730 
2731  template <typename _Tp, size_t _Np>
2732  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2733  _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2734  { return __or(__x._M_data, __y._M_data); }
2735 
2736  template <typename _Tp, size_t _Np>
2737  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2738  _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
2739  {
2740  if constexpr (_Abi::template _S_is_partial<_Tp>)
2741  return __andnot(__x, __wrapper_bitcast<_Tp>(
2742  _Abi::template _S_implicit_mask<_Tp>()));
2743  else
2744  return __not(__x._M_data);
2745  }
2746 
2747  template <typename _Tp, size_t _Np>
2748  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2749  _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2750  { return __and(__x._M_data, __y._M_data); }
2751 
2752  template <typename _Tp, size_t _Np>
2753  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2754  _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2755  { return __or(__x._M_data, __y._M_data); }
2756 
2757  template <typename _Tp, size_t _Np>
2758  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2759  _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2760  { return __xor(__x._M_data, __y._M_data); }
2761 
2762  // smart_reference access {{{2
2763  template <typename _Tp, size_t _Np>
2764  static constexpr void
2765  _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept
2766  {
2767  if constexpr (is_same_v<_Tp, bool>)
2768  __k._M_set(__i, __x);
2769  else
2770  {
2771  static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
2772  if (__builtin_is_constant_evaluated())
2773  {
2774  __k = __generate_from_n_evaluations<_Np,
2775  __vector_type_t<_Tp, _Np>>(
2776  [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2777  if (__i == static_cast<int>(__j))
2778  return _Tp(-__x);
2779  else
2780  return __k[+__j];
2781  });
2782  }
2783  else
2784  __k._M_data[__i] = -__x;
2785  }
2786  }
2787 
2788  // _S_masked_assign{{{2
2789  template <typename _Tp, size_t _Np>
2790  _GLIBCXX_SIMD_INTRINSIC static void
2791  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2792  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2793  { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
2794 
2795  template <typename _Tp, size_t _Np>
2796  _GLIBCXX_SIMD_INTRINSIC static void
2797  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs)
2798  {
2799  if (__builtin_constant_p(__rhs))
2800  {
2801  if (__rhs == false)
2802  __lhs = __andnot(__k, __lhs);
2803  else
2804  __lhs = __or(__k, __lhs);
2805  return;
2806  }
2807  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2808  __data(simd_mask<_Tp, _Abi>(__rhs)));
2809  }
2810 
2811  //}}}2
2812  // _S_all_of {{{
2813  template <typename _Tp>
2814  _GLIBCXX_SIMD_INTRINSIC static bool
2815  _S_all_of(simd_mask<_Tp, _Abi> __k)
2816  {
2817  return __call_with_subscripts(
2818  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2819  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2820  { return (... && !(__ent == 0)); });
2821  }
2822 
2823  // }}}
2824  // _S_any_of {{{
2825  template <typename _Tp>
2826  _GLIBCXX_SIMD_INTRINSIC static bool
2827  _S_any_of(simd_mask<_Tp, _Abi> __k)
2828  {
2829  return __call_with_subscripts(
2830  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2831  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2832  { return (... || !(__ent == 0)); });
2833  }
2834 
2835  // }}}
2836  // _S_none_of {{{
2837  template <typename _Tp>
2838  _GLIBCXX_SIMD_INTRINSIC static bool
2839  _S_none_of(simd_mask<_Tp, _Abi> __k)
2840  {
2841  return __call_with_subscripts(
2842  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2843  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2844  { return (... && (__ent == 0)); });
2845  }
2846 
2847  // }}}
2848  // _S_some_of {{{
2849  template <typename _Tp>
2850  _GLIBCXX_SIMD_INTRINSIC static bool
2851  _S_some_of(simd_mask<_Tp, _Abi> __k)
2852  {
2853  const int __n_true = _SuperImpl::_S_popcount(__k);
2854  return __n_true > 0 && __n_true < int(_S_size<_Tp>);
2855  }
2856 
2857  // }}}
2858  // _S_popcount {{{
2859  template <typename _Tp>
2860  _GLIBCXX_SIMD_INTRINSIC static int
2861  _S_popcount(simd_mask<_Tp, _Abi> __k)
2862  {
2863  using _I = __int_for_sizeof_t<_Tp>;
2864  if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
2865  return -reduce(
2866  simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
2867  else
2868  return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
2869  simd<_Tp, _Abi>(__private_init, __data(__k))));
2870  }
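  // Exposition only: with the vector-mask representation every true lane is
  // the integer -1, so reduce(...) sums to -(number of true lanes) and the
  // leading negation turns that sum into the popcount.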
2871 
2872  // }}}
2873  // _S_find_first_set {{{
2874  template <typename _Tp>
2875  _GLIBCXX_SIMD_INTRINSIC static int
2876  _S_find_first_set(simd_mask<_Tp, _Abi> __k)
2877  { return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); }
2878 
2879  // }}}
2880  // _S_find_last_set {{{
2881  template <typename _Tp>
2882  _GLIBCXX_SIMD_INTRINSIC static int
2883  _S_find_last_set(simd_mask<_Tp, _Abi> __k)
2884  { return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1; }
2885 
2886  // }}}
2887  };
2888 
2889 //}}}1
2890 _GLIBCXX_SIMD_END_NAMESPACE
2891 #endif // __cplusplus >= 201703L
2892 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
2893 
2894 // vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=100