25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H 26 #define _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H 28 #if __cplusplus >= 201703L 32 template <
typename _To,
typename _V,
typename _Traits>
33 _GLIBCXX_SIMD_INTRINSIC _To
36 static_assert(__is_vector_type_v<_V>);
37 using _Tp =
typename _Traits::value_type;
38 constexpr
size_t _Np = _Traits::_S_full_size;
39 [[maybe_unused]]
const auto __intrin = __to_intrin(__v);
40 using _Up =
typename _VectorTraits<_To>::value_type;
41 constexpr
size_t _M = _VectorTraits<_To>::_S_full_size;
44 [[maybe_unused]] constexpr
bool __x_to_x
45 =
sizeof(__v) <= 16 &&
sizeof(_To) <= 16;
46 [[maybe_unused]] constexpr
bool __x_to_y
47 =
sizeof(__v) <= 16 &&
sizeof(_To) == 32;
48 [[maybe_unused]] constexpr
bool __x_to_z
49 =
sizeof(__v) <= 16 &&
sizeof(_To) == 64;
50 [[maybe_unused]] constexpr
bool __y_to_x
51 =
sizeof(__v) == 32 &&
sizeof(_To) <= 16;
52 [[maybe_unused]] constexpr
bool __y_to_y
53 =
sizeof(__v) == 32 &&
sizeof(_To) == 32;
54 [[maybe_unused]] constexpr
bool __y_to_z
55 =
sizeof(__v) == 32 &&
sizeof(_To) == 64;
56 [[maybe_unused]] constexpr
bool __z_to_x
57 =
sizeof(__v) == 64 &&
sizeof(_To) <= 16;
58 [[maybe_unused]] constexpr
bool __z_to_y
59 =
sizeof(__v) == 64 &&
sizeof(_To) == 32;
60 [[maybe_unused]] constexpr
bool __z_to_z
61 =
sizeof(__v) == 64 &&
sizeof(_To) == 64;
64 [[maybe_unused]] constexpr
bool __i_to_i
65 = is_integral_v<_Up> && is_integral_v<_Tp>;
66 [[maybe_unused]] constexpr
bool __i8_to_i16
67 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 2;
68 [[maybe_unused]] constexpr
bool __i8_to_i32
69 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 4;
70 [[maybe_unused]] constexpr
bool __i8_to_i64
71 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 8;
72 [[maybe_unused]] constexpr
bool __i16_to_i8
73 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 1;
74 [[maybe_unused]] constexpr
bool __i16_to_i32
75 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 4;
76 [[maybe_unused]] constexpr
bool __i16_to_i64
77 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 8;
78 [[maybe_unused]] constexpr
bool __i32_to_i8
79 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 1;
80 [[maybe_unused]] constexpr
bool __i32_to_i16
81 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 2;
82 [[maybe_unused]] constexpr
bool __i32_to_i64
83 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 8;
84 [[maybe_unused]] constexpr
bool __i64_to_i8
85 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 1;
86 [[maybe_unused]] constexpr
bool __i64_to_i16
87 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 2;
88 [[maybe_unused]] constexpr
bool __i64_to_i32
89 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4;
93 [[maybe_unused]] constexpr
bool __s64_to_f32
94 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
95 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
96 [[maybe_unused]] constexpr
bool __s32_to_f32
97 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
98 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
99 [[maybe_unused]] constexpr
bool __s16_to_f32
100 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
101 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
102 [[maybe_unused]] constexpr
bool __s8_to_f32
103 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
104 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
105 [[maybe_unused]] constexpr
bool __u64_to_f32
106 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
107 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
108 [[maybe_unused]] constexpr
bool __u32_to_f32
109 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
110 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
111 [[maybe_unused]] constexpr
bool __u16_to_f32
112 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
113 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
114 [[maybe_unused]] constexpr
bool __u8_to_f32
115 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
116 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
117 [[maybe_unused]] constexpr
bool __s64_to_f64
118 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
119 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
120 [[maybe_unused]] constexpr
bool __s32_to_f64
121 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
122 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
123 [[maybe_unused]] constexpr
bool __u64_to_f64
124 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
125 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
126 [[maybe_unused]] constexpr
bool __u32_to_f64
127 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
128 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
129 [[maybe_unused]] constexpr
bool __f32_to_s64
130 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
131 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
132 [[maybe_unused]] constexpr
bool __f32_to_s32
133 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
134 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
135 [[maybe_unused]] constexpr
bool __f32_to_u64
136 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
137 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
138 [[maybe_unused]] constexpr
bool __f32_to_u32
139 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
140 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
141 [[maybe_unused]] constexpr
bool __f64_to_s64
142 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
143 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
144 [[maybe_unused]] constexpr
bool __f64_to_s32
145 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
146 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
147 [[maybe_unused]] constexpr
bool __f64_to_u64
148 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
149 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
150 [[maybe_unused]] constexpr
bool __f64_to_u32
151 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
152 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
153 [[maybe_unused]] constexpr
bool __ibw_to_f32
154 = is_integral_v<_Tp> &&
sizeof(_Tp) <= 2
155 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
156 [[maybe_unused]] constexpr
bool __ibw_to_f64
157 = is_integral_v<_Tp> &&
sizeof(_Tp) <= 2
158 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
159 [[maybe_unused]] constexpr
bool __f32_to_ibw
160 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
161 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
162 [[maybe_unused]] constexpr
bool __f64_to_ibw
163 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
164 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
165 [[maybe_unused]] constexpr
bool __f32_to_f64
166 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4
167 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
168 [[maybe_unused]] constexpr
bool __f64_to_f32
169 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8
170 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
172 if constexpr (__i_to_i && __y_to_x && !__have_avx2)
173 return __convert_x86<_To>(__lo128(__v), __hi128(__v));
174 else if constexpr (__i_to_i && __x_to_y && !__have_avx2)
175 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v),
176 __convert_x86<__vector_type_t<_Up, _M / 2>>(
177 __extract_part<1, _Np / _M * 2>(__v)));
178 else if constexpr (__i_to_i)
180 static_assert(__x_to_x || __have_avx2,
181 "integral conversions with ymm registers require AVX2");
182 static_assert(__have_avx512bw
183 || ((
sizeof(_Tp) >= 4 ||
sizeof(__v) < 64)
184 && (
sizeof(_Up) >= 4 ||
sizeof(_To) < 64)),
185 "8/16-bit integers in zmm registers require AVX512BW");
186 static_assert((
sizeof(__v) < 64 &&
sizeof(_To) < 64) || __have_avx512f,
187 "integral conversions with ymm registers require AVX2");
189 if constexpr (is_floating_point_v<_Tp> == is_floating_point_v<_Up> &&
190 sizeof(_Tp) ==
sizeof(_Up))
193 if constexpr (_Np >= _M)
194 return __intrin_bitcast<_To>(__v);
196 return __zero_extend(__vector_bitcast<_Up>(__v));
198 else if constexpr (_Np < _M &&
sizeof(_To) > 16)
200 return __zero_extend(
201 __convert_x86<__vector_type_t<
202 _Up, (16 / sizeof(_Up) > _Np) ? 16 / sizeof(_Up) : _Np>>(__v));
203 else if constexpr (_Np > _M && sizeof(__v) > 16)
205 return __convert_x86<_To>(__extract_part<0, _Np / _M>(__v));
206 else if constexpr (__i64_to_i32)
208 if constexpr (__x_to_x && __have_avx512vl)
209 return __intrin_bitcast<_To>(_mm_cvtepi64_epi32(__intrin));
210 else if constexpr (__x_to_x)
211 return __auto_bitcast(
212 _mm_shuffle_ps(__vector_bitcast<float>(__v), __m128(), 8));
213 else if constexpr (__y_to_x && __have_avx512vl)
214 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi32(__intrin));
215 else if constexpr (__y_to_x && __have_avx512f)
216 return __intrin_bitcast<_To>(
217 __lo128(_mm512_cvtepi64_epi32(__auto_bitcast(__v))));
218 else if constexpr (__y_to_x)
219 return __intrin_bitcast<_To>(
220 __lo128(_mm256_permute4x64_epi64(_mm256_shuffle_epi32(__intrin, 8),
222 else if constexpr (__z_to_y)
223 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi32(__intrin));
225 else if constexpr (__i64_to_i16)
227 if constexpr (__x_to_x && __have_avx512vl)
228 return __intrin_bitcast<_To>(_mm_cvtepi64_epi16(__intrin));
229 else if constexpr (__x_to_x && __have_avx512f)
230 return __intrin_bitcast<_To>(
231 __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
232 else if constexpr (__x_to_x && __have_ssse3)
234 return __intrin_bitcast<_To>(
235 _mm_shuffle_epi8(__intrin,
236 _mm_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80,
237 -0x80, -0x80, -0x80, -0x80, -0x80,
238 -0x80, -0x80, -0x80, -0x80)));
241 else if constexpr (__y_to_x && __have_avx512vl)
242 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi16(__intrin));
243 else if constexpr (__y_to_x && __have_avx512f)
244 return __intrin_bitcast<_To>(
245 __lo128(_mm512_cvtepi64_epi16(__auto_bitcast(__v))));
246 else if constexpr (__y_to_x)
248 const auto __a = _mm256_shuffle_epi8(
250 _mm256_setr_epi8(0, 1, 8, 9, -0x80, -0x80, -0x80, -0x80, -0x80,
251 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
252 -0x80, -0x80, -0x80, -0x80, 0, 1, 8, 9, -0x80,
253 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
255 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
257 else if constexpr (__z_to_x)
258 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi16(__intrin));
260 else if constexpr (__i64_to_i8)
262 if constexpr (__x_to_x && __have_avx512vl)
263 return __intrin_bitcast<_To>(_mm_cvtepi64_epi8(__intrin));
264 else if constexpr (__x_to_x && __have_avx512f)
265 return __intrin_bitcast<_To>(
266 __lo128(_mm512_cvtepi64_epi8(__zero_extend(__intrin))));
267 else if constexpr (__y_to_x && __have_avx512vl)
268 return __intrin_bitcast<_To>(_mm256_cvtepi64_epi8(__intrin));
269 else if constexpr (__y_to_x && __have_avx512f)
270 return __intrin_bitcast<_To>(
271 _mm512_cvtepi64_epi8(__zero_extend(__intrin)));
272 else if constexpr (__z_to_x)
273 return __intrin_bitcast<_To>(_mm512_cvtepi64_epi8(__intrin));
275 else if constexpr (__i32_to_i64)
277 if constexpr (__have_sse4_1 && __x_to_x)
278 return __intrin_bitcast<_To>(is_signed_v<_Tp>
279 ? _mm_cvtepi32_epi64(__intrin)
280 : _mm_cvtepu32_epi64(__intrin));
281 else if constexpr (__x_to_x)
283 return __intrin_bitcast<_To>(
284 _mm_unpacklo_epi32(__intrin, is_signed_v<_Tp>
285 ? _mm_srai_epi32(__intrin, 31)
288 else if constexpr (__x_to_y)
289 return __intrin_bitcast<_To>(is_signed_v<_Tp>
290 ? _mm256_cvtepi32_epi64(__intrin)
291 : _mm256_cvtepu32_epi64(__intrin));
292 else if constexpr (__y_to_z)
293 return __intrin_bitcast<_To>(is_signed_v<_Tp>
294 ? _mm512_cvtepi32_epi64(__intrin)
295 : _mm512_cvtepu32_epi64(__intrin));
297 else if constexpr (__i32_to_i16)
299 if constexpr (__x_to_x && __have_avx512vl)
300 return __intrin_bitcast<_To>(_mm_cvtepi32_epi16(__intrin));
301 else if constexpr (__x_to_x && __have_avx512f)
302 return __intrin_bitcast<_To>(
303 __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
304 else if constexpr (__x_to_x && __have_ssse3)
305 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
306 __intrin, _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
307 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
308 else if constexpr (__x_to_x)
310 auto __a = _mm_unpacklo_epi16(__intrin, __m128i());
311 auto __b = _mm_unpackhi_epi16(__intrin, __m128i());
312 auto __c = _mm_unpacklo_epi16(__a, __b);
313 auto __d = _mm_unpackhi_epi16(__a, __b);
314 return __intrin_bitcast<_To>(
315 _mm_unpacklo_epi16(__c, __d));
317 else if constexpr (__y_to_x && __have_avx512vl)
318 return __intrin_bitcast<_To>(_mm256_cvtepi32_epi16(__intrin));
319 else if constexpr (__y_to_x && __have_avx512f)
320 return __intrin_bitcast<_To>(
321 __lo128(_mm512_cvtepi32_epi16(__auto_bitcast(__v))));
322 else if constexpr (__y_to_x)
324 auto __a = _mm256_shuffle_epi8(
326 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80,
327 -0x80, -0x80, -0x80, -0x80, -0x80, 0, 1, 4, 5, 8,
328 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
329 -0x80, -0x80, -0x80));
330 return __intrin_bitcast<_To>(__lo128(
331 _mm256_permute4x64_epi64(__a,
334 else if constexpr (__z_to_y)
335 return __intrin_bitcast<_To>(_mm512_cvtepi32_epi16(__intrin));
337 else if constexpr (__i32_to_i8)
339 if constexpr (__x_to_x && __have_avx512vl)
340 return __intrin_bitcast<_To>(_mm_cvtepi32_epi8(__intrin));
341 else if constexpr (__x_to_x && __have_avx512f)
342 return __intrin_bitcast<_To>(
343 __lo128(_mm512_cvtepi32_epi8(__zero_extend(__intrin))));
344 else if constexpr (__x_to_x && __have_ssse3)
346 return __intrin_bitcast<_To>(
347 _mm_shuffle_epi8(__intrin,
348 _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80,
349 -0x80, -0x80, -0x80, -0x80, -0x80,
350 -0x80, -0x80, -0x80, -0x80)));
352 else if constexpr (__x_to_x)
355 = _mm_unpacklo_epi8(__intrin, __intrin);
357 = _mm_unpackhi_epi8(__intrin, __intrin);
358 const auto __c = _mm_unpacklo_epi8(__a, __b);
359 const auto __d = _mm_unpackhi_epi8(__a, __b);
360 const auto __e = _mm_unpacklo_epi8(__c, __d);
361 return __intrin_bitcast<_To>(__e & _mm_cvtsi32_si128(-1));
363 else if constexpr (__y_to_x && __have_avx512vl)
364 return __intrin_bitcast<_To>(_mm256_cvtepi32_epi8(__intrin));
365 else if constexpr (__y_to_x && __have_avx512f)
366 return __intrin_bitcast<_To>(
367 _mm512_cvtepi32_epi8(__zero_extend(__intrin)));
368 else if constexpr (__z_to_x)
369 return __intrin_bitcast<_To>(_mm512_cvtepi32_epi8(__intrin));
371 else if constexpr (__i16_to_i64)
373 if constexpr (__x_to_x && __have_sse4_1)
374 return __intrin_bitcast<_To>(is_signed_v<_Tp>
375 ? _mm_cvtepi16_epi64(__intrin)
376 : _mm_cvtepu16_epi64(__intrin));
377 else if constexpr (__x_to_x && is_signed_v<_Tp>)
379 auto __x = _mm_srai_epi16(__intrin, 15);
380 auto __y = _mm_unpacklo_epi16(__intrin, __x);
381 __x = _mm_unpacklo_epi16(__x, __x);
382 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__y, __x));
384 else if constexpr (__x_to_x)
385 return __intrin_bitcast<_To>(
386 _mm_unpacklo_epi32(_mm_unpacklo_epi16(__intrin, __m128i()),
388 else if constexpr (__x_to_y)
389 return __intrin_bitcast<_To>(is_signed_v<_Tp>
390 ? _mm256_cvtepi16_epi64(__intrin)
391 : _mm256_cvtepu16_epi64(__intrin));
392 else if constexpr (__x_to_z)
393 return __intrin_bitcast<_To>(is_signed_v<_Tp>
394 ? _mm512_cvtepi16_epi64(__intrin)
395 : _mm512_cvtepu16_epi64(__intrin));
397 else if constexpr (__i16_to_i32)
399 if constexpr (__x_to_x && __have_sse4_1)
400 return __intrin_bitcast<_To>(is_signed_v<_Tp>
401 ? _mm_cvtepi16_epi32(__intrin)
402 : _mm_cvtepu16_epi32(__intrin));
403 else if constexpr (__x_to_x && is_signed_v<_Tp>)
404 return __intrin_bitcast<_To>(
405 _mm_srai_epi32(_mm_unpacklo_epi16(__intrin, __intrin), 16));
406 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
407 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__intrin, __m128i()));
408 else if constexpr (__x_to_y)
409 return __intrin_bitcast<_To>(is_signed_v<_Tp>
410 ? _mm256_cvtepi16_epi32(__intrin)
411 : _mm256_cvtepu16_epi32(__intrin));
412 else if constexpr (__y_to_z)
413 return __intrin_bitcast<_To>(is_signed_v<_Tp>
414 ? _mm512_cvtepi16_epi32(__intrin)
415 : _mm512_cvtepu16_epi32(__intrin));
417 else if constexpr (__i16_to_i8)
419 if constexpr (__x_to_x && __have_avx512bw_vl)
420 return __intrin_bitcast<_To>(_mm_cvtepi16_epi8(__intrin));
421 else if constexpr (__x_to_x && __have_avx512bw)
422 return __intrin_bitcast<_To>(
423 __lo128(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
424 else if constexpr (__x_to_x && __have_ssse3)
425 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
426 __intrin, _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80,
427 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
428 else if constexpr (__x_to_x)
431 = _mm_unpacklo_epi8(__intrin, __intrin);
433 = _mm_unpackhi_epi8(__intrin, __intrin);
434 auto __c = _mm_unpacklo_epi8(__a, __b);
435 auto __d = _mm_unpackhi_epi8(__a, __b);
436 auto __e = _mm_unpacklo_epi8(__c, __d);
437 auto __f = _mm_unpackhi_epi8(__c, __d);
438 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
440 else if constexpr (__y_to_x && __have_avx512bw_vl)
441 return __intrin_bitcast<_To>(_mm256_cvtepi16_epi8(__intrin));
442 else if constexpr (__y_to_x && __have_avx512bw)
443 return __intrin_bitcast<_To>(
444 __lo256(_mm512_cvtepi16_epi8(__zero_extend(__intrin))));
445 else if constexpr (__y_to_x)
447 auto __a = _mm256_shuffle_epi8(
449 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -0x80, -0x80, -0x80,
450 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
451 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80, 0, 2,
452 4, 6, 8, 10, 12, 14));
453 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
455 else if constexpr (__z_to_y && __have_avx512bw)
456 return __intrin_bitcast<_To>(_mm512_cvtepi16_epi8(__intrin));
457 else if constexpr (__z_to_y)
458 __assert_unreachable<_Tp>();
460 else if constexpr (__i8_to_i64)
462 if constexpr (__x_to_x && __have_sse4_1)
463 return __intrin_bitcast<_To>(is_signed_v<_Tp>
464 ? _mm_cvtepi8_epi64(__intrin)
465 : _mm_cvtepu8_epi64(__intrin));
466 else if constexpr (__x_to_x && is_signed_v<_Tp>)
468 if constexpr (__have_ssse3)
470 auto __dup = _mm_unpacklo_epi8(__intrin, __intrin);
471 auto __epi16 = _mm_srai_epi16(__dup, 8);
472 _mm_shuffle_epi8(__epi16,
473 _mm_setr_epi8(0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3,
478 auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
479 __x = _mm_unpacklo_epi16(__x, __x);
480 return __intrin_bitcast<_To>(
481 _mm_unpacklo_epi32(_mm_srai_epi32(__x, 24),
482 _mm_srai_epi32(__x, 31)));
485 else if constexpr (__x_to_x)
487 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(
488 _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
492 else if constexpr (__x_to_y)
493 return __intrin_bitcast<_To>(is_signed_v<_Tp>
494 ? _mm256_cvtepi8_epi64(__intrin)
495 : _mm256_cvtepu8_epi64(__intrin));
496 else if constexpr (__x_to_z)
497 return __intrin_bitcast<_To>(is_signed_v<_Tp>
498 ? _mm512_cvtepi8_epi64(__intrin)
499 : _mm512_cvtepu8_epi64(__intrin));
501 else if constexpr (__i8_to_i32)
503 if constexpr (__x_to_x && __have_sse4_1)
504 return __intrin_bitcast<_To>(is_signed_v<_Tp>
505 ? _mm_cvtepi8_epi32(__intrin)
506 : _mm_cvtepu8_epi32(__intrin));
507 else if constexpr (__x_to_x && is_signed_v<_Tp>)
509 const auto __x = _mm_unpacklo_epi8(__intrin, __intrin);
510 return __intrin_bitcast<_To>(
511 _mm_srai_epi32(_mm_unpacklo_epi16(__x, __x), 24));
513 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
514 return __intrin_bitcast<_To>(
515 _mm_unpacklo_epi16(_mm_unpacklo_epi8(__intrin, __m128i()),
517 else if constexpr (__x_to_y)
518 return __intrin_bitcast<_To>(is_signed_v<_Tp>
519 ? _mm256_cvtepi8_epi32(__intrin)
520 : _mm256_cvtepu8_epi32(__intrin));
521 else if constexpr (__x_to_z)
522 return __intrin_bitcast<_To>(is_signed_v<_Tp>
523 ? _mm512_cvtepi8_epi32(__intrin)
524 : _mm512_cvtepu8_epi32(__intrin));
526 else if constexpr (__i8_to_i16)
528 if constexpr (__x_to_x && __have_sse4_1)
529 return __intrin_bitcast<_To>(is_signed_v<_Tp>
530 ? _mm_cvtepi8_epi16(__intrin)
531 : _mm_cvtepu8_epi16(__intrin));
532 else if constexpr (__x_to_x && is_signed_v<_Tp>)
533 return __intrin_bitcast<_To>(
534 _mm_srai_epi16(_mm_unpacklo_epi8(__intrin, __intrin), 8));
535 else if constexpr (__x_to_x && is_unsigned_v<_Tp>)
536 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__intrin, __m128i()));
537 else if constexpr (__x_to_y)
538 return __intrin_bitcast<_To>(is_signed_v<_Tp>
539 ? _mm256_cvtepi8_epi16(__intrin)
540 : _mm256_cvtepu8_epi16(__intrin));
541 else if constexpr (__y_to_z && __have_avx512bw)
542 return __intrin_bitcast<_To>(is_signed_v<_Tp>
543 ? _mm512_cvtepi8_epi16(__intrin)
544 : _mm512_cvtepu8_epi16(__intrin));
545 else if constexpr (__y_to_z)
546 __assert_unreachable<_Tp>();
548 else if constexpr (__f32_to_s64)
550 if constexpr (__have_avx512dq_vl && __x_to_x)
551 return __intrin_bitcast<_To>(_mm_cvttps_epi64(__intrin));
552 else if constexpr (__have_avx512dq_vl && __x_to_y)
553 return __intrin_bitcast<_To>(_mm256_cvttps_epi64(__intrin));
554 else if constexpr (__have_avx512dq && __y_to_z)
555 return __intrin_bitcast<_To>(_mm512_cvttps_epi64(__intrin));
558 else if constexpr (__f32_to_u64)
560 if constexpr (__have_avx512dq_vl && __x_to_x)
561 return __intrin_bitcast<_To>(_mm_cvttps_epu64(__intrin));
562 else if constexpr (__have_avx512dq_vl && __x_to_y)
563 return __intrin_bitcast<_To>(_mm256_cvttps_epu64(__intrin));
564 else if constexpr (__have_avx512dq && __y_to_z)
565 return __intrin_bitcast<_To>(_mm512_cvttps_epu64(__intrin));
568 else if constexpr (__f32_to_s32)
570 if constexpr (__x_to_x || __y_to_y || __z_to_z)
575 __assert_unreachable<_Tp>();
577 else if constexpr (__f32_to_u32)
579 if constexpr (__have_avx512vl && __x_to_x)
580 return __auto_bitcast(_mm_cvttps_epu32(__intrin));
581 else if constexpr (__have_avx512f && __x_to_x)
582 return __auto_bitcast(
583 __lo128(_mm512_cvttps_epu32(__auto_bitcast(__v))));
584 else if constexpr (__have_avx512vl && __y_to_y)
585 return __vector_bitcast<_Up>(_mm256_cvttps_epu32(__intrin));
586 else if constexpr (__have_avx512f && __y_to_y)
587 return __vector_bitcast<_Up>(
588 __lo256(_mm512_cvttps_epu32(__auto_bitcast(__v))));
589 else if constexpr (__x_to_x || __y_to_y || __z_to_z)
596 __assert_unreachable<_Tp>();
598 else if constexpr (__f32_to_ibw)
599 return __convert_x86<_To>(__convert_x86<__vector_type_t<int, _Np>>(__v));
600 else if constexpr (__f64_to_s64)
602 if constexpr (__have_avx512dq_vl && __x_to_x)
603 return __intrin_bitcast<_To>(_mm_cvttpd_epi64(__intrin));
604 else if constexpr (__have_avx512dq_vl && __y_to_y)
605 return __intrin_bitcast<_To>(_mm256_cvttpd_epi64(__intrin));
606 else if constexpr (__have_avx512dq && __z_to_z)
607 return __intrin_bitcast<_To>(_mm512_cvttpd_epi64(__intrin));
610 else if constexpr (__f64_to_u64)
612 if constexpr (__have_avx512dq_vl && __x_to_x)
613 return __intrin_bitcast<_To>(_mm_cvttpd_epu64(__intrin));
614 else if constexpr (__have_avx512dq_vl && __y_to_y)
615 return __intrin_bitcast<_To>(_mm256_cvttpd_epu64(__intrin));
616 else if constexpr (__have_avx512dq && __z_to_z)
617 return __intrin_bitcast<_To>(_mm512_cvttpd_epu64(__intrin));
620 else if constexpr (__f64_to_s32)
622 if constexpr (__x_to_x)
623 return __intrin_bitcast<_To>(_mm_cvttpd_epi32(__intrin));
624 else if constexpr (__y_to_x)
625 return __intrin_bitcast<_To>(_mm256_cvttpd_epi32(__intrin));
626 else if constexpr (__z_to_y)
627 return __intrin_bitcast<_To>(_mm512_cvttpd_epi32(__intrin));
629 else if constexpr (__f64_to_u32)
631 if constexpr (__have_avx512vl && __x_to_x)
632 return __intrin_bitcast<_To>(_mm_cvttpd_epu32(__intrin));
633 else if constexpr (__have_sse4_1 && __x_to_x)
634 return __vector_bitcast<_Up, _M>(
635 _mm_cvttpd_epi32(_mm_floor_pd(__intrin) - 0x8000
'0000u)) 637 else if constexpr (__x_to_x)
642 else if constexpr (__have_avx512vl && __y_to_x)
643 return __intrin_bitcast<_To>(_mm256_cvttpd_epu32(__intrin));
644 else if constexpr (__y_to_x)
646 return __intrin_bitcast<_To>(
647 __vector_bitcast<_Up>(
648 _mm256_cvttpd_epi32(_mm256_floor_pd(__intrin) - 0x8000
'0000u)) 651 else if constexpr (__z_to_y)
652 return __intrin_bitcast<_To>(_mm512_cvttpd_epu32(__intrin));
654 else if constexpr (__f64_to_ibw)
656 return __convert_x86<_To>(
657 __convert_x86<__vector_type_t<int, (_Np < 4 ? 4 : _Np)>>(__v));
659 else if constexpr (__s64_to_f32)
661 if constexpr (__x_to_x && __have_avx512dq_vl)
662 return __intrin_bitcast<_To>(_mm_cvtepi64_ps(__intrin));
663 else if constexpr (__y_to_x && __have_avx512dq_vl)
664 return __intrin_bitcast<_To>(_mm256_cvtepi64_ps(__intrin));
665 else if constexpr (__z_to_y && __have_avx512dq)
666 return __intrin_bitcast<_To>(_mm512_cvtepi64_ps(__intrin));
667 else if constexpr (__z_to_y)
668 return __intrin_bitcast<_To>(
669 _mm512_cvtpd_ps(__convert_x86<__vector_type_t<double, 8>>(__v)));
671 else if constexpr (__u64_to_f32)
673 if constexpr (__x_to_x && __have_avx512dq_vl)
674 return __intrin_bitcast<_To>(_mm_cvtepu64_ps(__intrin));
675 else if constexpr (__y_to_x && __have_avx512dq_vl)
676 return __intrin_bitcast<_To>(_mm256_cvtepu64_ps(__intrin));
677 else if constexpr (__z_to_y && __have_avx512dq)
678 return __intrin_bitcast<_To>(_mm512_cvtepu64_ps(__intrin));
679 else if constexpr (__z_to_y)
681 return __intrin_bitcast<_To>(
682 __lo256(_mm512_cvtepu32_ps(__auto_bitcast(
683 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__intrin, 32)))))
685 + __lo256(_mm512_cvtepu32_ps(
686 __auto_bitcast(_mm512_cvtepi64_epi32(__intrin)))));
689 else if constexpr (__s32_to_f32)
693 else if constexpr (__u32_to_f32)
695 if constexpr (__x_to_x && __have_avx512vl)
699 else if constexpr (__x_to_x && __have_avx512f)
700 return __intrin_bitcast<_To>(
701 __lo128(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
702 else if constexpr (__x_to_x && (__have_fma || __have_fma4))
704 return __auto_bitcast(0x10000
705 * _mm_cvtepi32_ps(__to_intrin(__v >> 16))
706 + _mm_cvtepi32_ps(__to_intrin(__v & 0xffff)));
707 else if constexpr (__y_to_y && __have_avx512vl)
711 else if constexpr (__y_to_y && __have_avx512f)
712 return __intrin_bitcast<_To>(
713 __lo256(_mm512_cvtepu32_ps(__auto_bitcast(__v))));
714 else if constexpr (__y_to_y)
716 return 0x10000 * _mm256_cvtepi32_ps(__to_intrin(__v >> 16))
717 + _mm256_cvtepi32_ps(__to_intrin(__v & 0xffff));
720 else if constexpr (__ibw_to_f32)
722 if constexpr (_M <= 4 || __have_avx2)
723 return __convert_x86<_To>(
724 __convert_x86<__vector_type_t<int, _M>>(__v));
727 static_assert(__x_to_y);
729 if constexpr (__have_sse4_1)
731 __a =
sizeof(_Tp) == 2
732 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__intrin)
733 : _mm_cvtepu16_epi32(__intrin))
734 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__intrin)
735 : _mm_cvtepu8_epi32(__intrin));
737 = _mm_shuffle_epi32(__intrin,
sizeof(_Tp) == 2 ? 0xee : 0xe9);
738 __b =
sizeof(_Tp) == 2
739 ? (is_signed_v<_Tp> ? _mm_cvtepi16_epi32(__w)
740 : _mm_cvtepu16_epi32(__w))
741 : (is_signed_v<_Tp> ? _mm_cvtepi8_epi32(__w)
742 : _mm_cvtepu8_epi32(__w));
747 if constexpr (
sizeof(_Tp) == 1)
749 __tmp = is_signed_v<_Tp>
750 ? _mm_srai_epi16(_mm_unpacklo_epi8(__intrin,
753 : _mm_unpacklo_epi8(__intrin, __m128i());
757 static_assert(
sizeof(_Tp) == 2);
760 __a = is_signed_v<_Tp>
761 ? _mm_srai_epi32(_mm_unpacklo_epi16(__tmp, __tmp), 16)
762 : _mm_unpacklo_epi16(__tmp, __m128i());
763 __b = is_signed_v<_Tp>
764 ? _mm_srai_epi32(_mm_unpackhi_epi16(__tmp, __tmp), 16)
765 : _mm_unpackhi_epi16(__tmp, __m128i());
767 return __convert_x86<_To>(__vector_bitcast<int>(__a),
768 __vector_bitcast<int>(__b));
771 else if constexpr (__s64_to_f64)
773 if constexpr (__x_to_x && __have_avx512dq_vl)
774 return __intrin_bitcast<_To>(_mm_cvtepi64_pd(__intrin));
775 else if constexpr (__y_to_y && __have_avx512dq_vl)
776 return __intrin_bitcast<_To>(_mm256_cvtepi64_pd(__intrin));
777 else if constexpr (__z_to_z && __have_avx512dq)
778 return __intrin_bitcast<_To>(_mm512_cvtepi64_pd(__intrin));
779 else if constexpr (__z_to_z)
781 return __intrin_bitcast<_To>(
782 _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
784 + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
787 else if constexpr (__u64_to_f64)
789 if constexpr (__x_to_x && __have_avx512dq_vl)
790 return __intrin_bitcast<_To>(_mm_cvtepu64_pd(__intrin));
791 else if constexpr (__y_to_y && __have_avx512dq_vl)
792 return __intrin_bitcast<_To>(_mm256_cvtepu64_pd(__intrin));
793 else if constexpr (__z_to_z && __have_avx512dq)
794 return __intrin_bitcast<_To>(_mm512_cvtepu64_pd(__intrin));
795 else if constexpr (__z_to_z)
797 return __intrin_bitcast<_To>(
798 _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__to_intrin(__v >> 32)))
800 + _mm512_cvtepu32_pd(_mm512_cvtepi64_epi32(__intrin)));
803 else if constexpr (__s32_to_f64)
805 if constexpr (__x_to_x)
806 return __intrin_bitcast<_To>(_mm_cvtepi32_pd(__intrin));
807 else if constexpr (__x_to_y)
808 return __intrin_bitcast<_To>(_mm256_cvtepi32_pd(__intrin));
809 else if constexpr (__y_to_z)
810 return __intrin_bitcast<_To>(_mm512_cvtepi32_pd(__intrin));
812 else if constexpr (__u32_to_f64)
814 if constexpr (__x_to_x && __have_avx512vl)
815 return __intrin_bitcast<_To>(_mm_cvtepu32_pd(__intrin));
816 else if constexpr (__x_to_x && __have_avx512f)
817 return __intrin_bitcast<_To>(
818 __lo128(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
819 else if constexpr (__x_to_x)
820 return __intrin_bitcast<_To>(
821 _mm_cvtepi32_pd(__to_intrin(__v ^ 0x8000
'0000u)) + 0x8000'0000u);
822 else if constexpr (__x_to_y && __have_avx512vl)
823 return __intrin_bitcast<_To>(_mm256_cvtepu32_pd(__intrin));
824 else if constexpr (__x_to_y && __have_avx512f)
825 return __intrin_bitcast<_To>(
826 __lo256(_mm512_cvtepu32_pd(__auto_bitcast(__v))));
827 else if constexpr (__x_to_y)
828 return __intrin_bitcast<_To>(
829 _mm256_cvtepi32_pd(__to_intrin(__v ^ 0x8000
'0000u)) + 0x8000'0000u);
830 else if constexpr (__y_to_z)
831 return __intrin_bitcast<_To>(_mm512_cvtepu32_pd(__intrin));
833 else if constexpr (__ibw_to_f64)
835 return __convert_x86<_To>(
836 __convert_x86<__vector_type_t<int, std::max(size_t(4), _M)>>(__v));
838 else if constexpr (__f32_to_f64)
840 if constexpr (__x_to_x)
841 return __intrin_bitcast<_To>(_mm_cvtps_pd(__intrin));
842 else if constexpr (__x_to_y)
843 return __intrin_bitcast<_To>(_mm256_cvtps_pd(__intrin));
844 else if constexpr (__y_to_z)
845 return __intrin_bitcast<_To>(_mm512_cvtps_pd(__intrin));
847 else if constexpr (__f64_to_f32)
849 if constexpr (__x_to_x)
850 return __intrin_bitcast<_To>(_mm_cvtpd_ps(__intrin));
851 else if constexpr (__y_to_x)
852 return __intrin_bitcast<_To>(_mm256_cvtpd_ps(__intrin));
853 else if constexpr (__z_to_y)
854 return __intrin_bitcast<_To>(_mm512_cvtpd_ps(__intrin));
857 __assert_unreachable<_Tp>();
860 return __vector_convert<_To>(__v, make_index_sequence<std::min(_M, _Np)>());
866 template <
typename _To,
typename _V,
typename _Traits>
867 _GLIBCXX_SIMD_INTRINSIC _To
868 __convert_x86(_V __v0, _V __v1)
870 static_assert(__is_vector_type_v<_V>);
871 using _Tp =
typename _Traits::value_type;
872 constexpr
size_t _Np = _Traits::_S_full_size;
873 [[maybe_unused]]
const auto __i0 = __to_intrin(__v0);
874 [[maybe_unused]]
const auto __i1 = __to_intrin(__v1);
875 using _Up =
typename _VectorTraits<_To>::value_type;
876 constexpr
size_t _M = _VectorTraits<_To>::_S_full_size;
878 static_assert(2 * _Np <= _M,
879 "__v1 would be discarded; use the one-argument " 880 "__convert_x86 overload instead");
883 [[maybe_unused]] constexpr
bool __x_to_x
884 =
sizeof(__v0) <= 16 &&
sizeof(_To) <= 16;
885 [[maybe_unused]] constexpr
bool __x_to_y
886 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 32;
887 [[maybe_unused]] constexpr
bool __x_to_z
888 =
sizeof(__v0) <= 16 &&
sizeof(_To) == 64;
889 [[maybe_unused]] constexpr
bool __y_to_x
890 =
sizeof(__v0) == 32 &&
sizeof(_To) <= 16;
891 [[maybe_unused]] constexpr
bool __y_to_y
892 =
sizeof(__v0) == 32 &&
sizeof(_To) == 32;
893 [[maybe_unused]] constexpr
bool __y_to_z
894 =
sizeof(__v0) == 32 &&
sizeof(_To) == 64;
895 [[maybe_unused]] constexpr
bool __z_to_x
896 =
sizeof(__v0) == 64 &&
sizeof(_To) <= 16;
897 [[maybe_unused]] constexpr
bool __z_to_y
898 =
sizeof(__v0) == 64 &&
sizeof(_To) == 32;
899 [[maybe_unused]] constexpr
bool __z_to_z
900 =
sizeof(__v0) == 64 &&
sizeof(_To) == 64;
903 [[maybe_unused]] constexpr
bool __i_to_i
904 = is_integral_v<_Up> && is_integral_v<_Tp>;
905 [[maybe_unused]] constexpr
bool __i8_to_i16
906 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 2;
907 [[maybe_unused]] constexpr
bool __i8_to_i32
908 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 4;
909 [[maybe_unused]] constexpr
bool __i8_to_i64
910 = __i_to_i &&
sizeof(_Tp) == 1 &&
sizeof(_Up) == 8;
911 [[maybe_unused]] constexpr
bool __i16_to_i8
912 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 1;
913 [[maybe_unused]] constexpr
bool __i16_to_i32
914 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 4;
915 [[maybe_unused]] constexpr
bool __i16_to_i64
916 = __i_to_i &&
sizeof(_Tp) == 2 &&
sizeof(_Up) == 8;
917 [[maybe_unused]] constexpr
bool __i32_to_i8
918 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 1;
919 [[maybe_unused]] constexpr
bool __i32_to_i16
920 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 2;
921 [[maybe_unused]] constexpr
bool __i32_to_i64
922 = __i_to_i &&
sizeof(_Tp) == 4 &&
sizeof(_Up) == 8;
923 [[maybe_unused]] constexpr
bool __i64_to_i8
924 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 1;
925 [[maybe_unused]] constexpr
bool __i64_to_i16
926 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 2;
927 [[maybe_unused]] constexpr
bool __i64_to_i32
928 = __i_to_i &&
sizeof(_Tp) == 8 &&
sizeof(_Up) == 4;
932 [[maybe_unused]] constexpr
bool __i64_to_f32
933 = is_integral_v<_Tp> &&
sizeof(_Tp) == 8
934 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
935 [[maybe_unused]] constexpr
bool __s32_to_f32
936 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
937 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
938 [[maybe_unused]] constexpr
bool __s16_to_f32
939 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
940 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
941 [[maybe_unused]] constexpr
bool __s8_to_f32
942 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
943 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
944 [[maybe_unused]] constexpr
bool __u32_to_f32
945 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
946 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
947 [[maybe_unused]] constexpr
bool __u16_to_f32
948 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
949 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
950 [[maybe_unused]] constexpr
bool __u8_to_f32
951 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
952 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
953 [[maybe_unused]] constexpr
bool __s64_to_f64
954 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 8
955 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
956 [[maybe_unused]] constexpr
bool __s32_to_f64
957 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 4
958 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
959 [[maybe_unused]] constexpr
bool __s16_to_f64
960 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 2
961 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
962 [[maybe_unused]] constexpr
bool __s8_to_f64
963 = is_integral_v<_Tp> && is_signed_v<_Tp> &&
sizeof(_Tp) == 1
964 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
965 [[maybe_unused]] constexpr
bool __u64_to_f64
966 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 8
967 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
968 [[maybe_unused]] constexpr
bool __u32_to_f64
969 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 4
970 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
971 [[maybe_unused]] constexpr
bool __u16_to_f64
972 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 2
973 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
974 [[maybe_unused]] constexpr
bool __u8_to_f64
975 = is_integral_v<_Tp> && is_unsigned_v<_Tp> &&
sizeof(_Tp) == 1
976 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
977 [[maybe_unused]] constexpr
bool __f32_to_s64
978 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
979 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
980 [[maybe_unused]] constexpr
bool __f32_to_s32
981 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
982 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
983 [[maybe_unused]] constexpr
bool __f32_to_u64
984 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
985 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
986 [[maybe_unused]] constexpr
bool __f32_to_u32
987 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
988 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
989 [[maybe_unused]] constexpr
bool __f64_to_s64
990 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 8
991 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
992 [[maybe_unused]] constexpr
bool __f64_to_s32
993 = is_integral_v<_Up> && is_signed_v<_Up> &&
sizeof(_Up) == 4
994 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
995 [[maybe_unused]] constexpr
bool __f64_to_u64
996 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 8
997 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
998 [[maybe_unused]] constexpr
bool __f64_to_u32
999 = is_integral_v<_Up> && is_unsigned_v<_Up> &&
sizeof(_Up) == 4
1000 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1001 [[maybe_unused]] constexpr
bool __f32_to_ibw
1002 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
1003 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4;
1004 [[maybe_unused]] constexpr
bool __f64_to_ibw
1005 = is_integral_v<_Up> &&
sizeof(_Up) <= 2
1006 && is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8;
1007 [[maybe_unused]] constexpr
bool __f32_to_f64
1008 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 4
1009 && is_floating_point_v<_Up> &&
sizeof(_Up) == 8;
1010 [[maybe_unused]] constexpr
bool __f64_to_f32
1011 = is_floating_point_v<_Tp> &&
sizeof(_Tp) == 8
1012 && is_floating_point_v<_Up> &&
sizeof(_Up) == 4;
1014 if constexpr (__i_to_i && __y_to_x && !__have_avx2)
1016 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1),
1018 else if constexpr (__i_to_i)
1020 static_assert(__x_to_x || __have_avx2,
1021 "integral conversions with ymm registers require AVX2");
1022 static_assert(__have_avx512bw
1023 || ((
sizeof(_Tp) >= 4 ||
sizeof(__v0) < 64)
1024 && (
sizeof(_Up) >= 4 ||
sizeof(_To) < 64)),
1025 "8/16-bit integers in zmm registers require AVX512BW");
1026 static_assert((
sizeof(__v0) < 64 &&
sizeof(_To) < 64) || __have_avx512f,
1027 "integral conversions with ymm registers require AVX2");
1030 if constexpr (
sizeof(__v0) < 16 || (
sizeof(__v0) == 16 && __have_avx2)
1031 || (
sizeof(__v0) == 16 && __have_avx
1032 && is_floating_point_v<_Tp>)
1033 || (
sizeof(__v0) == 32 && __have_avx512f
1034 && (
sizeof(_Tp) >= 4 || __have_avx512bw)))
1038 return __convert_x86<_To>(__concat(__v0, __v1));
1045 !(is_floating_point_v<
1046 _Tp> == is_floating_point_v<_Up> &&
sizeof(_Tp) ==
sizeof(_Up)));
1048 if constexpr (2 * _Np < _M &&
sizeof(_To) > 16)
1050 constexpr
size_t Min = 16 /
sizeof(_Up);
1051 return __zero_extend(
1053 __vector_type_t<_Up, (Min > 2 * _Np) ? Min : 2 * _Np>>(__v0,
1056 else if constexpr (__i64_to_i32)
1058 if constexpr (__x_to_x)
1059 return __auto_bitcast(_mm_shuffle_ps(__auto_bitcast(__v0),
1060 __auto_bitcast(__v1), 0x88));
1061 else if constexpr (__y_to_y)
1064 return __auto_bitcast(
1065 __xzyw(_mm256_shuffle_ps(__auto_bitcast(__v0),
1066 __auto_bitcast(__v1), 0x88)));
1074 else if constexpr (__z_to_z)
1075 return __intrin_bitcast<_To>(
1076 __concat(_mm512_cvtepi64_epi32(__i0),
1077 _mm512_cvtepi64_epi32(__i1)));
1079 else if constexpr (__i64_to_i16)
1081 if constexpr (__x_to_x)
1084 if constexpr (__have_sse4_1)
1086 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1087 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1088 _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, -0x80, -0x80,
1089 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80)));
1093 return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1094 _Up(__v1[0]), _Up(__v1[1])};
1097 else if constexpr (__y_to_x)
1100 = _mm256_unpacklo_epi16(__i0, __i1);
1102 = _mm256_unpackhi_epi16(__i0, __i1);
1104 = _mm256_unpacklo_epi16(__a, __b);
1105 return __intrin_bitcast<_To>(
1106 _mm_unpacklo_epi32(__lo128(__c), __hi128(__c)));
1108 else if constexpr (__z_to_y)
1109 return __intrin_bitcast<_To>(
1110 __concat(_mm512_cvtepi64_epi16(__i0),
1111 _mm512_cvtepi64_epi16(__i1)));
1113 else if constexpr (__i64_to_i8)
1115 if constexpr (__x_to_x && __have_sse4_1)
1117 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1118 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 4), 0x44),
1119 _mm_setr_epi8(0, 8, 4, 12, -0x80, -0x80, -0x80, -0x80, -0x80,
1120 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1123 else if constexpr (__x_to_x && __have_ssse3)
1125 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(
1127 __i0, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1128 -0x80, -0x80, -0x80, -0x80, -0x80,
1129 -0x80, -0x80, -0x80, -0x80)),
1131 __i1, _mm_setr_epi8(0, 8, -0x80, -0x80, -0x80, -0x80, -0x80,
1132 -0x80, -0x80, -0x80, -0x80, -0x80,
1133 -0x80, -0x80, -0x80, -0x80))));
1135 else if constexpr (__x_to_x)
1137 return __vector_type_t<_Up, _M>{_Up(__v0[0]), _Up(__v0[1]),
1138 _Up(__v1[0]), _Up(__v1[1])};
1140 else if constexpr (__y_to_x)
1142 const auto __a = _mm256_shuffle_epi8(
1143 _mm256_blend_epi32(__i0, _mm256_slli_epi64(__i1, 32), 0xAA),
1144 _mm256_setr_epi8(0, 8, -0x80, -0x80, 4, 12, -0x80, -0x80,
1145 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1146 -0x80, -0x80, -0x80, -0x80, 0, 8, -0x80,
1147 -0x80, 4, 12, -0x80, -0x80, -0x80, -0x80,
1148 -0x80, -0x80, -0x80, -0x80));
1149 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1152 else if constexpr (__i32_to_i16)
1154 if constexpr (__x_to_x)
1157 if constexpr (__have_sse4_1)
1159 return __intrin_bitcast<_To>(_mm_shuffle_epi8(
1160 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0xaa),
1161 _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10,
1164 else if constexpr (__have_ssse3)
1166 return __intrin_bitcast<_To>(
1167 _mm_hadd_epi16(__to_intrin(__v0 << 16),
1168 __to_intrin(__v1 << 16)));
1179 auto __a = _mm_unpacklo_epi16(__i0, __i1);
1180 auto __b = _mm_unpackhi_epi16(__i0, __i1);
1181 auto __c = _mm_unpacklo_epi16(__a, __b);
1182 auto __d = _mm_unpackhi_epi16(__a, __b);
1183 return __intrin_bitcast<_To>(
1184 _mm_unpacklo_epi16(__c, __d));
1187 else if constexpr (__y_to_y)
1190 = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1191 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1192 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
1193 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80);
1194 auto __a = _mm256_shuffle_epi8(__i0, __shuf);
1195 auto __b = _mm256_shuffle_epi8(__i1, __shuf);
1196 return __intrin_bitcast<_To>(
1197 __xzyw(_mm256_unpacklo_epi64(__a, __b)));
1200 else if constexpr (__i32_to_i8)
1202 if constexpr (__x_to_x && __have_ssse3)
1205 = _mm_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80,
1206 -0x80, -0x80, -0x80, -0x80, -0x80, -0x80,
1208 return __intrin_bitcast<_To>(
1209 _mm_unpacklo_epi32(_mm_shuffle_epi8(__i0, shufmask),
1210 _mm_shuffle_epi8(__i1, shufmask)));
1212 else if constexpr (__x_to_x)
1214 auto __a = _mm_unpacklo_epi8(__i0, __i1);
1215 auto __b = _mm_unpackhi_epi8(__i0, __i1);
1216 auto __c = _mm_unpacklo_epi8(__a, __b);
1217 auto __d = _mm_unpackhi_epi8(__a, __b);
1218 auto __e = _mm_unpacklo_epi8(__c, __d);
1219 return __intrin_bitcast<_To>(__e & __m128i{-1, 0});
1221 else if constexpr (__y_to_x)
1223 const auto __a = _mm256_shuffle_epi8(
1224 _mm256_blend_epi16(__i0, _mm256_slli_epi32(__i1, 16), 0xAA),
1225 _mm256_setr_epi8(0, 4, 8, 12, -0x80, -0x80, -0x80, -0x80, 2,
1226 6, 10, 14, -0x80, -0x80, -0x80, -0x80, -0x80,
1227 -0x80, -0x80, -0x80, 0, 4, 8, 12, -0x80,
1228 -0x80, -0x80, -0x80, 2, 6, 10, 14));
1229 return __intrin_bitcast<_To>(__lo128(__a) | __hi128(__a));
1232 else if constexpr (__i16_to_i8)
1234 if constexpr (__x_to_x && __have_ssse3)
1236 const auto __shuf =
reinterpret_cast<__m128i
>(
1237 __vector_type_t<_UChar, 16>{0, 2, 4, 6, 8, 10, 12, 14, 0x80,
1238 0x80, 0x80, 0x80, 0x80, 0x80,
1240 return __intrin_bitcast<_To>(
1241 _mm_unpacklo_epi64(_mm_shuffle_epi8(__i0, __shuf),
1242 _mm_shuffle_epi8(__i1, __shuf)));
1244 else if constexpr (__x_to_x)
1246 auto __a = _mm_unpacklo_epi8(__i0, __i1);
1247 auto __b = _mm_unpackhi_epi8(__i0, __i1);
1248 auto __c = _mm_unpacklo_epi8(__a, __b);
1249 auto __d = _mm_unpackhi_epi8(__a, __b);
1250 auto __e = _mm_unpacklo_epi8(__c, __d);
1251 auto __f = _mm_unpackhi_epi8(__c, __d);
1252 return __intrin_bitcast<_To>(_mm_unpacklo_epi8(__e, __f));
1254 else if constexpr (__y_to_y)
1256 return __intrin_bitcast<_To>(__xzyw(_mm256_shuffle_epi8(
1257 (__to_intrin(__v0) & _mm256_set1_epi32(0x00ff00ff))
1258 | _mm256_slli_epi16(__i1, 8),
1259 _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11,
1260 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5,
1261 7, 9, 11, 13, 15))));
1264 else if constexpr (__i64_to_f32)
1266 if constexpr (__x_to_x)
1267 return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1]);
1268 else if constexpr (__y_to_y)
1270 static_assert(__y_to_y && __have_avx2);
1271 const auto __a = _mm256_unpacklo_epi32(__i0, __i1);
1272 const auto __b = _mm256_unpackhi_epi32(__i0, __i1);
1274 = _mm256_unpacklo_epi32(__a, __b);
1275 const auto __hi32 = __vector_bitcast<
1276 conditional_t<is_signed_v<_Tp>, int, _UInt>>(
1277 _mm256_unpackhi_epi32(__a, __b));
1280 * __convert_x86<__vector_type_t<float, 8>>(__hi32);
1282 = 0x10000 * _mm256_cvtepi32_ps(_mm256_srli_epi32(__lo32, 16));
1284 = _mm256_cvtepi32_ps(_mm256_set1_epi32(0x0000ffffu) & __lo32);
1285 return __xzyw((__hi + __mid) + __lo);
1287 else if constexpr (__z_to_z && __have_avx512dq)
1289 return is_signed_v<_Tp> ? __concat(_mm512_cvtepi64_ps(__i0),
1290 _mm512_cvtepi64_ps(__i1))
1291 : __concat(_mm512_cvtepu64_ps(__i0),
1292 _mm512_cvtepu64_ps(__i1));
1294 else if constexpr (__z_to_z && is_signed_v<_Tp>)
1296 const __m512 __hi32 = _mm512_cvtepi32_ps(
1297 __concat(_mm512_cvtepi64_epi32(__to_intrin(__v0 >> 32)),
1298 _mm512_cvtepi64_epi32(__to_intrin(__v1 >> 32))));
1299 const __m512i __lo32 = __concat(_mm512_cvtepi64_epi32(__i0),
1300 _mm512_cvtepi64_epi32(__i1));
1306 = _mm512_cvtepu32_ps(_mm512_set1_epi32(0xffff0000u) & __lo32);
1308 = _mm512_cvtepi32_ps(_mm512_set1_epi32(0x0000ffffu) & __lo32);
1309 return (__hi32 * 0x100000000LL + __hi16) + __lo16;
1311 else if constexpr (__z_to_z && is_unsigned_v<_Tp>)
1313 return __intrin_bitcast<_To>(
1314 _mm512_cvtepu32_ps(__concat(
1315 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i0, 32)),
1316 _mm512_cvtepi64_epi32(_mm512_srai_epi64(__i1, 32))))
1318 + _mm512_cvtepu32_ps(__concat(_mm512_cvtepi64_epi32(__i0),
1319 _mm512_cvtepi64_epi32(__i1))));
1322 else if constexpr (__f64_to_s32)
1326 else if constexpr (__f64_to_u32)
1328 if constexpr (__x_to_x && __have_sse4_1)
1330 return __vector_bitcast<_Up, _M>(_mm_unpacklo_epi64(
1331 _mm_cvttpd_epi32(_mm_floor_pd(__i0) - 0x8000
'0000u), 1332 _mm_cvttpd_epi32(_mm_floor_pd(__i1) - 0x8000'0000u)))
1334 // without SSE4.1 just use the scalar fallback, it's only four
1337 else if constexpr (__y_to_y)
1339 return __vector_bitcast<_Up>(
1340 __concat(_mm256_cvttpd_epi32(_mm256_floor_pd(__i0)
1342 _mm256_cvttpd_epi32(_mm256_floor_pd(__i1) 1345 } // __z_to_z uses fallback 1347 else if constexpr (__f64_to_ibw) //{{{2 1349 // one-arg __f64_to_ibw goes via _SimdWrapper<int, ?>. The fallback 1350 // would go via two independet conversions to _SimdWrapper<_To> and 1351 // subsequent interleaving. This is better, because f64->__i32 1352 // allows to combine __v0 and __v1 into one register: if constexpr 1353 // (__z_to_x || __y_to_x) { 1354 return __convert_x86<_To>( 1355 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1)); 1358 else if constexpr (__f32_to_ibw) //{{{2 1360 return __convert_x86<_To>( 1361 __convert_x86<__vector_type_t<int, _Np>>(__v0), 1362 __convert_x86<__vector_type_t<int, _Np>>(__v1)); 1366 if constexpr (sizeof(_To) >= 32) 1367 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm 1368 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0), 1369 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v1)); 1370 else if constexpr (sizeof(_To) == 16) 1372 const auto __lo = __to_intrin(__convert_x86<_To>(__v0)); 1373 const auto __hi = __to_intrin(__convert_x86<_To>(__v1)); 1374 if constexpr (sizeof(_Up) * _Np == 8) 1376 if constexpr (is_floating_point_v<_Up>) 1377 return __auto_bitcast( 1378 _mm_unpacklo_pd(__vector_bitcast<double>(__lo), 1379 __vector_bitcast<double>(__hi))); 1381 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); 1383 else if constexpr (sizeof(_Up) * _Np == 4) 1385 if constexpr (is_floating_point_v<_Up>) 1386 return __auto_bitcast( 1387 _mm_unpacklo_ps(__vector_bitcast<float>(__lo), 1388 __vector_bitcast<float>(__hi))); 1390 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); 1392 else if constexpr (sizeof(_Up) * _Np == 2) 1393 return __intrin_bitcast<_To>(_mm_unpacklo_epi16(__lo, __hi)); 1395 __assert_unreachable<_Tp>(); 1398 return __vector_convert<_To>(__v0, __v1, make_index_sequence<_Np>()); 1404 // 4-arg __convert_x86 {{{1 1405 template <typename _To, typename _V, typename _Traits> 1406 _GLIBCXX_SIMD_INTRINSIC _To 1407 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3) 1409 static_assert(__is_vector_type_v<_V>); 1410 using _Tp = typename _Traits::value_type; 1411 constexpr size_t _Np = _Traits::_S_full_size; 1412 [[maybe_unused]] const auto __i0 = __to_intrin(__v0); 1413 [[maybe_unused]] const auto __i1 = __to_intrin(__v1); 1414 [[maybe_unused]] const auto __i2 = __to_intrin(__v2); 1415 [[maybe_unused]] const auto __i3 = __to_intrin(__v3); 1416 using _Up = typename _VectorTraits<_To>::value_type; 1417 constexpr size_t _M = _VectorTraits<_To>::_S_full_size; 1419 static_assert(4 * _Np <= _M, 1420 "__v2/__v3 would be discarded; use the two/one-argument " 1421 "__convert_x86 overload instead"); 1423 // [xyz]_to_[xyz] {{{2 1424 [[maybe_unused]] constexpr bool __x_to_x 1425 = sizeof(__v0) <= 16 && sizeof(_To) <= 16; 1426 [[maybe_unused]] constexpr bool __x_to_y 1427 = sizeof(__v0) <= 16 && sizeof(_To) == 32; 1428 [[maybe_unused]] constexpr bool __x_to_z 1429 = sizeof(__v0) <= 16 && sizeof(_To) == 64; 1430 [[maybe_unused]] constexpr bool __y_to_x 1431 = sizeof(__v0) == 32 && sizeof(_To) <= 16; 1432 [[maybe_unused]] constexpr bool __y_to_y 1433 = sizeof(__v0) == 32 && sizeof(_To) == 32; 1434 [[maybe_unused]] constexpr bool __y_to_z 1435 = sizeof(__v0) == 32 && sizeof(_To) == 64; 1436 [[maybe_unused]] constexpr bool __z_to_x 1437 = sizeof(__v0) == 64 && sizeof(_To) <= 16; 1438 [[maybe_unused]] constexpr bool __z_to_y 1439 = sizeof(__v0) == 64 && sizeof(_To) == 32; 1440 [[maybe_unused]] constexpr bool __z_to_z 1441 = sizeof(__v0) == 64 && sizeof(_To) == 64; 1444 [[maybe_unused]] constexpr bool __i_to_i 1445 = is_integral_v<_Up> && is_integral_v<_Tp>; 1446 [[maybe_unused]] constexpr bool __i8_to_i16 1447 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 2; 1448 [[maybe_unused]] constexpr bool __i8_to_i32 1449 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 4; 1450 [[maybe_unused]] constexpr bool __i8_to_i64 1451 = __i_to_i && sizeof(_Tp) == 1 && sizeof(_Up) == 8; 1452 [[maybe_unused]] constexpr bool __i16_to_i8 1453 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 1; 1454 [[maybe_unused]] constexpr bool __i16_to_i32 1455 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 4; 1456 [[maybe_unused]] constexpr bool __i16_to_i64 1457 = __i_to_i && sizeof(_Tp) == 2 && sizeof(_Up) == 8; 1458 [[maybe_unused]] constexpr bool __i32_to_i8 1459 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 1; 1460 [[maybe_unused]] constexpr bool __i32_to_i16 1461 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 2; 1462 [[maybe_unused]] constexpr bool __i32_to_i64 1463 = __i_to_i && sizeof(_Tp) == 4 && sizeof(_Up) == 8; 1464 [[maybe_unused]] constexpr bool __i64_to_i8 1465 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; 1466 [[maybe_unused]] constexpr bool __i64_to_i16 1467 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 2; 1468 [[maybe_unused]] constexpr bool __i64_to_i32 1469 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 4; 1471 // [fsu]X_to_[fsu]X {{{2 1472 // ibw = integral && byte or word, i.e. char and short with any signedness 1473 [[maybe_unused]] constexpr bool __i64_to_f32 1474 = is_integral_v<_Tp> && sizeof(_Tp) == 8 1475 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1476 [[maybe_unused]] constexpr bool __s32_to_f32 1477 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 1478 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1479 [[maybe_unused]] constexpr bool __s16_to_f32 1480 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 1481 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1482 [[maybe_unused]] constexpr bool __s8_to_f32 1483 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 1484 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1485 [[maybe_unused]] constexpr bool __u32_to_f32 1486 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 1487 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1488 [[maybe_unused]] constexpr bool __u16_to_f32 1489 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 1490 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1491 [[maybe_unused]] constexpr bool __u8_to_f32 1492 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 1493 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1494 [[maybe_unused]] constexpr bool __s64_to_f64 1495 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 8 1496 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1497 [[maybe_unused]] constexpr bool __s32_to_f64 1498 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 4 1499 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1500 [[maybe_unused]] constexpr bool __s16_to_f64 1501 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 2 1502 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1503 [[maybe_unused]] constexpr bool __s8_to_f64 1504 = is_integral_v<_Tp> && is_signed_v<_Tp> && sizeof(_Tp) == 1 1505 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1506 [[maybe_unused]] constexpr bool __u64_to_f64 1507 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 8 1508 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1509 [[maybe_unused]] constexpr bool __u32_to_f64 1510 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 4 1511 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1512 [[maybe_unused]] constexpr bool __u16_to_f64 1513 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 2 1514 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1515 [[maybe_unused]] constexpr bool __u8_to_f64 1516 = is_integral_v<_Tp> && is_unsigned_v<_Tp> && sizeof(_Tp) == 1 1517 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1518 [[maybe_unused]] constexpr bool __f32_to_s64 1519 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 1520 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1521 [[maybe_unused]] constexpr bool __f32_to_s32 1522 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 1523 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1524 [[maybe_unused]] constexpr bool __f32_to_u64 1525 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 1526 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1527 [[maybe_unused]] constexpr bool __f32_to_u32 1528 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 1529 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1530 [[maybe_unused]] constexpr bool __f64_to_s64 1531 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 8 1532 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1533 [[maybe_unused]] constexpr bool __f64_to_s32 1534 = is_integral_v<_Up> && is_signed_v<_Up> && sizeof(_Up) == 4 1535 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1536 [[maybe_unused]] constexpr bool __f64_to_u64 1537 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 8 1538 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1539 [[maybe_unused]] constexpr bool __f64_to_u32 1540 = is_integral_v<_Up> && is_unsigned_v<_Up> && sizeof(_Up) == 4 1541 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1542 [[maybe_unused]] constexpr bool __f32_to_ibw 1543 = is_integral_v<_Up> && sizeof(_Up) <= 2 1544 && is_floating_point_v<_Tp> && sizeof(_Tp) == 4; 1545 [[maybe_unused]] constexpr bool __f64_to_ibw 1546 = is_integral_v<_Up> && sizeof(_Up) <= 2 1547 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1548 [[maybe_unused]] constexpr bool __f32_to_f64 1549 = is_floating_point_v<_Tp> && sizeof(_Tp) == 4 1550 && is_floating_point_v<_Up> && sizeof(_Up) == 8; 1551 [[maybe_unused]] constexpr bool __f64_to_f32 1552 = is_floating_point_v<_Tp> && sizeof(_Tp) == 8 1553 && is_floating_point_v<_Up> && sizeof(_Up) == 4; 1555 if constexpr (__i_to_i && __y_to_x && !__have_avx2) //{{{2 1557 // <double, 4>, <double, 4>, <double, 4>, <double, 4> => <char, 16> 1558 return __convert_x86<_To>(__lo128(__v0), __hi128(__v0), __lo128(__v1), 1559 __hi128(__v1), __lo128(__v2), __hi128(__v2), 1560 __lo128(__v3), __hi128(__v3)); 1562 else if constexpr (__i_to_i) // assert ISA {{{2 1564 static_assert(__x_to_x || __have_avx2, 1565 "integral conversions with ymm registers require AVX2"); 1566 static_assert(__have_avx512bw 1567 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) 1568 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), 1569 "8/16-bit integers in zmm registers require AVX512BW"); 1570 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, 1571 "integral conversions with ymm registers require AVX2"); 1573 // concat => use 2-arg __convert_x86 {{{2 1574 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) 1575 || (sizeof(__v0) == 16 && __have_avx 1576 && is_floating_point_v<_Tp>) 1577 || (sizeof(__v0) == 32 && __have_avx512f)) 1579 // The ISA can handle wider input registers, so concat and use two-arg 1580 // implementation. This reduces code duplication considerably. 1581 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3)); 1585 // conversion using bit reinterpretation (or no conversion at all) 1586 // should all go through the concat branch above: 1588 !(is_floating_point_v< 1589 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); 1590 // handle all zero extension{{{2 1591 if constexpr (4 * _Np < _M && sizeof(_To) > 16) 1593 constexpr size_t Min = 16 / sizeof(_Up); 1594 return __zero_extend( 1596 __vector_type_t<_Up, (Min > 4 * _Np) ? Min : 4 * _Np>>( 1597 __v0, __v1, __v2, __v3)); 1599 else if constexpr (__i64_to_i16) //{{{2 1601 if constexpr (__x_to_x && __have_sse4_1) 1603 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 1605 _mm_blend_epi16(__i0, _mm_slli_si128(__i1, 2), 0x22), 1606 _mm_blend_epi16(_mm_slli_si128(__i2, 4), 1607 _mm_slli_si128(__i3, 6), 0x88), 1609 _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 1612 else if constexpr (__y_to_y && __have_avx2) 1614 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 1615 __xzyw(_mm256_blend_epi16( 1617 _mm256_shuffle_ps(__vector_bitcast<float>(__v0), 1618 __vector_bitcast<float>(__v2), 1619 0x88)), // 0.1. 8.9. 2.3. A.B. 1620 __to_intrin(__vector_bitcast<int>(_mm256_shuffle_ps( 1621 __vector_bitcast<float>(__v1), 1622 __vector_bitcast<float>(__v3), 0x88)) 1623 << 16), // .4.5 .C.D .6.7 .E.F 1624 0xaa) // 0415 8C9D 2637 AEBF 1625 ), // 0415 2637 8C9D AEBF 1626 _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 1627 14, 15, 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 1630 auto __a = _mm256_unpacklo_epi16(__v0, __v1); // 04.. .... 26.. 1631 .... auto __b = _mm256_unpackhi_epi16(__v0, __v1); // 15.. 1632 .... 37.. .... auto __c = _mm256_unpacklo_epi16(__v2, __v3); // 1633 8C.. .... AE.. .... auto __d = _mm256_unpackhi_epi16(__v2, 1635 // 9D.. .... BF.. .... auto __e = _mm256_unpacklo_epi16(__a, 1637 // 0145 .... 2367 .... auto __f = _mm256_unpacklo_epi16(__c, 1639 // 89CD .... ABEF .... auto __g = _mm256_unpacklo_epi64(__e, 1641 // 0145 89CD 2367 ABEF return __concat( 1642 _mm_unpacklo_epi32(__lo128(__g), __hi128(__g)), 1643 _mm_unpackhi_epi32(__lo128(__g), __hi128(__g))); // 0123 1646 } // else use fallback 1648 else if constexpr (__i64_to_i8) //{{{2 1650 if constexpr (__x_to_x) 1652 // TODO: use fallback for now 1654 else if constexpr (__y_to_x) 1657 = _mm256_srli_epi32(_mm256_slli_epi32(__i0, 24), 24) 1658 | _mm256_srli_epi32(_mm256_slli_epi32(__i1, 24), 16) 1659 | _mm256_srli_epi32(_mm256_slli_epi32(__i2, 24), 8) 1660 | _mm256_slli_epi32( 1661 __i3, 24); // 048C .... 159D .... 26AE .... 37BF .... 1662 /*return _mm_shuffle_epi8( 1663 _mm_blend_epi32(__lo128(__a) << 32, __hi128(__a), 0x5), 1664 _mm_setr_epi8(4, 12, 0, 8, 5, 13, 1, 9, 6, 14, 2, 10, 7, 15, 1666 auto __b = _mm256_unpackhi_epi64( 1667 __a, __a); // 159D .... 159D .... 37BF .... 37BF .... 1668 auto __c = _mm256_unpacklo_epi8( 1669 __a, __b); // 0145 89CD .... .... 2367 ABEF .... .... 1670 return __intrin_bitcast<_To>( 1671 _mm_unpacklo_epi16(__lo128(__c), 1672 __hi128(__c))); // 0123 4567 89AB CDEF 1675 else if constexpr (__i32_to_i8) //{{{2 1677 if constexpr (__x_to_x) 1679 if constexpr (__have_ssse3) 1681 const auto __x0 = __vector_bitcast<_UInt>(__v0) & 0xff; 1682 const auto __x1 = (__vector_bitcast<_UInt>(__v1) & 0xff) 1684 const auto __x2 = (__vector_bitcast<_UInt>(__v2) & 0xff) 1686 const auto __x3 = __vector_bitcast<_UInt>(__v3) << 24; 1687 return __intrin_bitcast<_To>( 1688 _mm_shuffle_epi8(__to_intrin(__x0 | __x1 | __x2 | __x3), 1689 _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 1690 2, 6, 10, 14, 3, 7, 11, 1696 = _mm_unpacklo_epi8(__i0, __i2); // 08.. .... 19.. .... 1698 = _mm_unpackhi_epi8(__i0, __i2); // 2A.. .... 3B.. .... 1700 = _mm_unpacklo_epi8(__i1, __i3); // 4C.. .... 5D.. .... 1702 = _mm_unpackhi_epi8(__i1, __i3); // 6E.. .... 7F.. .... 1704 = _mm_unpacklo_epi8(__a, __c); // 048C .... .... .... 1706 = _mm_unpackhi_epi8(__a, __c); // 159D .... .... .... 1708 = _mm_unpacklo_epi8(__b, __d); // 26AE .... .... .... 1710 = _mm_unpackhi_epi8(__b, __d); // 37BF .... .... .... 1711 return __intrin_bitcast<_To>(_mm_unpacklo_epi8( 1712 _mm_unpacklo_epi8(__e, __g), // 0246 8ACE .... .... 1713 _mm_unpacklo_epi8(__f, __h) // 1357 9BDF .... .... 1714 )); // 0123 4567 89AB CDEF 1717 else if constexpr (__y_to_y) 1719 const auto __a = _mm256_shuffle_epi8( 1720 __to_intrin((__vector_bitcast<_UShort>(_mm256_blend_epi16( 1721 __i0, _mm256_slli_epi32(__i1, 16), 0xAA)) 1723 | (__vector_bitcast<_UShort>(_mm256_blend_epi16( 1724 __i2, _mm256_slli_epi32(__i3, 16), 0xAA)) 1726 _mm256_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 1727 11, 15, 0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 1729 return __intrin_bitcast<_To>(_mm256_permutevar8x32_epi32( 1730 __a, _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7))); 1733 else if constexpr (__i64_to_f32) //{{{2 1735 // this branch is only relevant with AVX and w/o AVX2 (i.e. no ymm 1737 if constexpr (__x_to_y) 1739 return __make_wrapper<float>(__v0[0], __v0[1], __v1[0], __v1[1], 1740 __v2[0], __v2[1], __v3[0], 1743 const auto __a = _mm_unpacklo_epi32(__i0, __i1); // acAC 1744 const auto __b = _mm_unpackhi_epi32(__i0, __i1); // bdBD 1745 const auto __c = _mm_unpacklo_epi32(__i2, __i3); // egEG 1746 const auto __d = _mm_unpackhi_epi32(__i2, __i3); // fhFH 1747 const auto __lo32a = _mm_unpacklo_epi32(__a, __b); // abcd 1748 const auto __lo32b = _mm_unpacklo_epi32(__c, __d); // efgh 1749 const auto __hi32 = __vector_bitcast< 1750 conditional_t<is_signed_v<_Tp>, int, _UInt>>( 1751 __concat(_mm_unpackhi_epi32(__a, __b), 1752 _mm_unpackhi_epi32(__c, __d))); // ABCD EFGH 1755 * __convert_x86<__vector_type_t<float, 8>>(__hi32); 1758 * _mm256_cvtepi32_ps(__concat(_mm_srli_epi32(__lo32a, 16), 1759 _mm_srli_epi32(__lo32b, 16))); 1760 const auto __lo = _mm256_cvtepi32_ps( 1761 __concat(_mm_set1_epi32(0x0000ffffu) & __lo32a, 1762 _mm_set1_epi32(0x0000ffffu) & __lo32b)); 1763 return (__hi + __mid) + __lo; 1766 else if constexpr (__f64_to_ibw) //{{{2 1768 return __convert_x86<_To>( 1769 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), 1770 __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3)); 1772 else if constexpr (__f32_to_ibw) //{{{2 1774 return __convert_x86<_To>( 1775 __convert_x86<__vector_type_t<int, _Np>>(__v0), 1776 __convert_x86<__vector_type_t<int, _Np>>(__v1), 1777 __convert_x86<__vector_type_t<int, _Np>>(__v2), 1778 __convert_x86<__vector_type_t<int, _Np>>(__v3)); 1782 if constexpr (sizeof(_To) >= 32) 1783 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm 1784 return __concat(__convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, 1786 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v2, 1788 else if constexpr (sizeof(_To) == 16) 1790 const auto __lo = __to_intrin(__convert_x86<_To>(__v0, __v1)); 1791 const auto __hi = __to_intrin(__convert_x86<_To>(__v2, __v3)); 1792 if constexpr (sizeof(_Up) * _Np * 2 == 8) 1794 if constexpr (is_floating_point_v<_Up>) 1795 return __auto_bitcast(_mm_unpacklo_pd(__lo, __hi)); 1797 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); 1799 else if constexpr (sizeof(_Up) * _Np * 2 == 4) 1801 if constexpr (is_floating_point_v<_Up>) 1802 return __auto_bitcast(_mm_unpacklo_ps(__lo, __hi)); 1804 return __intrin_bitcast<_To>(_mm_unpacklo_epi32(__lo, __hi)); 1807 __assert_unreachable<_Tp>(); 1810 return __vector_convert<_To>(__v0, __v1, __v2, __v3, 1811 make_index_sequence<_Np>()); 1817 // 8-arg __convert_x86 {{{1 1818 template <typename _To, typename _V, typename _Traits> 1819 _GLIBCXX_SIMD_INTRINSIC _To 1820 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, 1823 static_assert(__is_vector_type_v<_V>); 1824 using _Tp = typename _Traits::value_type; 1825 constexpr size_t _Np = _Traits::_S_full_size; 1826 [[maybe_unused]] const auto __i0 = __to_intrin(__v0); 1827 [[maybe_unused]] const auto __i1 = __to_intrin(__v1); 1828 [[maybe_unused]] const auto __i2 = __to_intrin(__v2); 1829 [[maybe_unused]] const auto __i3 = __to_intrin(__v3); 1830 [[maybe_unused]] const auto __i4 = __to_intrin(__v4); 1831 [[maybe_unused]] const auto __i5 = __to_intrin(__v5); 1832 [[maybe_unused]] const auto __i6 = __to_intrin(__v6); 1833 [[maybe_unused]] const auto __i7 = __to_intrin(__v7); 1834 using _Up = typename _VectorTraits<_To>::value_type; 1835 constexpr size_t _M = _VectorTraits<_To>::_S_full_size; 1837 static_assert(8 * _Np <= _M, 1838 "__v4-__v7 would be discarded; use the four/two/one-argument " 1839 "__convert_x86 overload instead"); 1841 // [xyz]_to_[xyz] {{{2 1842 [[maybe_unused]] constexpr bool __x_to_x 1843 = sizeof(__v0) <= 16 && sizeof(_To) <= 16; 1844 [[maybe_unused]] constexpr bool __x_to_y 1845 = sizeof(__v0) <= 16 && sizeof(_To) == 32; 1846 [[maybe_unused]] constexpr bool __x_to_z 1847 = sizeof(__v0) <= 16 && sizeof(_To) == 64; 1848 [[maybe_unused]] constexpr bool __y_to_x 1849 = sizeof(__v0) == 32 && sizeof(_To) <= 16; 1850 [[maybe_unused]] constexpr bool __y_to_y 1851 = sizeof(__v0) == 32 && sizeof(_To) == 32; 1852 [[maybe_unused]] constexpr bool __y_to_z 1853 = sizeof(__v0) == 32 && sizeof(_To) == 64; 1854 [[maybe_unused]] constexpr bool __z_to_x 1855 = sizeof(__v0) == 64 && sizeof(_To) <= 16; 1856 [[maybe_unused]] constexpr bool __z_to_y 1857 = sizeof(__v0) == 64 && sizeof(_To) == 32; 1858 [[maybe_unused]] constexpr bool __z_to_z 1859 = sizeof(__v0) == 64 && sizeof(_To) == 64; 1862 [[maybe_unused]] constexpr bool __i_to_i 1863 = is_integral_v<_Up> && is_integral_v<_Tp>; 1864 [[maybe_unused]] constexpr bool __i64_to_i8 1865 = __i_to_i && sizeof(_Tp) == 8 && sizeof(_Up) == 1; 1866 [[maybe_unused]] constexpr bool __f64_to_i8 1867 = is_integral_v<_Up> && sizeof(_Up) == 1 1868 && is_floating_point_v<_Tp> && sizeof(_Tp) == 8; 1870 if constexpr (__i_to_i) // assert ISA {{{2 1872 static_assert(__x_to_x || __have_avx2, 1873 "integral conversions with ymm registers require AVX2"); 1874 static_assert(__have_avx512bw 1875 || ((sizeof(_Tp) >= 4 || sizeof(__v0) < 64) 1876 && (sizeof(_Up) >= 4 || sizeof(_To) < 64)), 1877 "8/16-bit integers in zmm registers require AVX512BW"); 1878 static_assert((sizeof(__v0) < 64 && sizeof(_To) < 64) || __have_avx512f, 1879 "integral conversions with ymm registers require AVX2"); 1881 // concat => use 4-arg __convert_x86 {{{2 1882 if constexpr (sizeof(__v0) < 16 || (sizeof(__v0) == 16 && __have_avx2) 1883 || (sizeof(__v0) == 16 && __have_avx 1884 && is_floating_point_v<_Tp>) 1885 || (sizeof(__v0) == 32 && __have_avx512f)) 1887 // The ISA can handle wider input registers, so concat and use two-arg 1888 // implementation. This reduces code duplication considerably. 1889 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), 1890 __concat(__v4, __v5), __concat(__v6, __v7)); 1894 // conversion using bit reinterpretation (or no conversion at all) 1895 // should all go through the concat branch above: 1897 !(is_floating_point_v< 1898 _Tp> == is_floating_point_v<_Up> && sizeof(_Tp) == sizeof(_Up))); 1899 static_assert(!(8 * _Np < _M && sizeof(_To) > 16), 1900 "zero extension should be impossible"); 1901 if constexpr (__i64_to_i8) //{{{2 1903 if constexpr (__x_to_x && __have_ssse3) 1905 // unsure whether this is better than the variant below 1906 return __intrin_bitcast<_To>(_mm_shuffle_epi8( 1908 (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) 1909 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) 1910 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) 1911 | (((__v6 & 0xff) << 48) | (__v7 << 56)))), 1912 _mm_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 1915 else if constexpr (__x_to_x) 1917 const auto __a = _mm_unpacklo_epi8(__i0, __i1); // ac 1918 const auto __b = _mm_unpackhi_epi8(__i0, __i1); // bd 1919 const auto __c = _mm_unpacklo_epi8(__i2, __i3); // eg 1920 const auto __d = _mm_unpackhi_epi8(__i2, __i3); // fh 1921 const auto __e = _mm_unpacklo_epi8(__i4, __i5); // ik 1922 const auto __f = _mm_unpackhi_epi8(__i4, __i5); // jl 1923 const auto __g = _mm_unpacklo_epi8(__i6, __i7); // mo 1924 const auto __h = _mm_unpackhi_epi8(__i6, __i7); // np 1925 return __intrin_bitcast<_To>(_mm_unpacklo_epi64( 1926 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__a, __b), // abcd 1927 _mm_unpacklo_epi8(__c, __d)), // efgh 1928 _mm_unpacklo_epi32(_mm_unpacklo_epi8(__e, __f), // ijkl 1929 _mm_unpacklo_epi8(__g, __h)) // mnop 1932 else if constexpr (__y_to_y) 1934 auto __a = // 048C GKOS 159D HLPT 26AE IMQU 37BF JNRV 1936 (((__v0 & 0xff) | ((__v1 & 0xff) << 8)) 1937 | (((__v2 & 0xff) << 16) | ((__v3 & 0xff) << 24))) 1938 | ((((__v4 & 0xff) << 32) | ((__v5 & 0xff) << 40)) 1939 | (((__v6 & 0xff) << 48) | ((__v7 << 56))))); 1941 auto __b = _mm256_unpackhi_epi64(__a, __a); // 159D HLPT 159D 1942 HLPT 37BF JNRV 37BF JNRV auto __c = _mm256_unpacklo_epi8(__a, 1943 __b); // 0145 89CD GHKL OPST 2367 ABEF IJMN QRUV auto __d = 1944 __xzyw(__c); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV return 1945 _mm256_shuffle_epi8( 1946 __d, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 1947 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 1950 auto __b = _mm256_shuffle_epi8( // 0145 89CD GHKL OPST 2367 ABEF 1952 __a, _mm256_setr_epi8(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 1953 6, 14, 7, 15, 0, 8, 1, 9, 2, 10, 3, 11, 1954 4, 12, 5, 13, 6, 14, 7, 15)); 1956 = __xzyw(__b); // 0145 89CD 2367 ABEF GHKL OPST IJMN QRUV 1957 return __intrin_bitcast<_To>(_mm256_shuffle_epi8( 1958 __c, _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 1959 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 1960 4, 5, 12, 13, 6, 7, 14, 15))); 1962 else if constexpr (__z_to_z) 1965 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, 1967 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, 1971 else if constexpr (__f64_to_i8) //{{{2 1973 return __convert_x86<_To>( 1974 __convert_x86<__vector_type_t<int, _Np * 2>>(__v0, __v1), 1975 __convert_x86<__vector_type_t<int, _Np * 2>>(__v2, __v3), 1976 __convert_x86<__vector_type_t<int, _Np * 2>>(__v4, __v5), 1977 __convert_x86<__vector_type_t<int, _Np * 2>>(__v6, __v7)); 1979 else // unreachable {{{2 1980 __assert_unreachable<_Tp>(); 1984 if constexpr (sizeof(_To) >= 32) 1985 // if _To is ymm or zmm, then _SimdWrapper<_Up, _M / 2> is xmm or ymm 1987 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v0, __v1, __v2, __v3), 1988 __convert_x86<__vector_type_t<_Up, _M / 2>>(__v4, __v5, __v6, 1990 else if constexpr (sizeof(_To) == 16) 1993 = __to_intrin(__convert_x86<_To>(__v0, __v1, __v2, __v3)); 1995 = __to_intrin(__convert_x86<_To>(__v4, __v5, __v6, __v7)); 1996 static_assert(sizeof(_Up) == 1 && _Np == 2); 1997 return __intrin_bitcast<_To>(_mm_unpacklo_epi64(__lo, __hi)); 2001 __assert_unreachable<_Tp>(); 2002 // return __vector_convert<_To>(__v0, __v1, __v2, __v3, __v4, __v5, 2004 // make_index_sequence<_Np>()); 2010 // 16-arg __convert_x86 {{{1 2011 template <typename _To, typename _V, typename _Traits> 2012 _GLIBCXX_SIMD_INTRINSIC _To 2013 __convert_x86(_V __v0, _V __v1, _V __v2, _V __v3, _V __v4, _V __v5, _V __v6, 2014 _V __v7, _V __v8, _V __v9, _V __v10, _V __v11, _V __v12, 2015 _V __v13, _V __v14, _V __v15) 2017 // concat => use 8-arg __convert_x86 2018 return __convert_x86<_To>(__concat(__v0, __v1), __concat(__v2, __v3), 2019 __concat(__v4, __v5), __concat(__v6, __v7), 2020 __concat(__v8, __v9), __concat(__v10, __v11), 2021 __concat(__v12, __v13), __concat(__v14, __v15)); 2026 #endif // __cplusplus >= 201703L 2027 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_X86_CONVERSIONS_H 2029 // vim: foldmethod=marker