// simdlib_512_emu.inl (33KB), from a clone of mesa.
  1. /****************************************************************************
  2. * Copyright (C) 2017 Intel Corporation. All Rights Reserved.
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice (including the next
  12. * paragraph) shall be included in all copies or substantial portions of the
  13. * Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  18. * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21. * IN THE SOFTWARE.
  22. ****************************************************************************/
  23. #if !defined(__SIMD_LIB_AVX_HPP__)
  24. #error Do not include this file directly, use "simdlib.hpp" instead.
  25. #endif
  26. //============================================================================
  27. // SIMD16 AVX (1) implementation
  28. //============================================================================
  29. static const int TARGET_SIMD_WIDTH = 8;
  30. using SIMD128T = SIMD128Impl::AVXImpl;
  31. #define SIMD_WRAPPER_1(op) \
  32. static SIMDINLINE Float SIMDCALL op(Float const& a) \
  33. { \
  34. return Float{ \
  35. SIMD256T::op(a.v8[0]), \
  36. SIMD256T::op(a.v8[1]), \
  37. }; \
  38. }
  39. #define SIMD_WRAPPER_2(op) \
  40. static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
  41. { \
  42. return Float{ \
  43. SIMD256T::op(a.v8[0], b.v8[0]), \
  44. SIMD256T::op(a.v8[1], b.v8[1]), \
  45. }; \
  46. }
  47. #define SIMD_WRAPPER_2I(op) \
  48. template <int ImmT> \
  49. static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
  50. { \
  51. return Float{ \
  52. SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
  53. SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
  54. }; \
  55. }
  56. #define SIMD_WRAPPER_2I_1(op) \
  57. template <int ImmT> \
  58. static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
  59. { \
  60. return Float{ \
  61. SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
  62. SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
  63. }; \
  64. }
  65. #define SIMD_WRAPPER_3(op) \
  66. static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
  67. { \
  68. return Float{ \
  69. SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
  70. SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
  71. }; \
  72. }
  73. #define SIMD_IWRAPPER_1(op) \
  74. static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
  75. { \
  76. return Integer{ \
  77. SIMD256T::op(a.v8[0]), \
  78. SIMD256T::op(a.v8[1]), \
  79. }; \
  80. }
  81. #define SIMD_IWRAPPER_2(op) \
  82. static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
  83. { \
  84. return Integer{ \
  85. SIMD256T::op(a.v8[0], b.v8[0]), \
  86. SIMD256T::op(a.v8[1], b.v8[1]), \
  87. }; \
  88. }
  89. #define SIMD_IWRAPPER_2I(op) \
  90. template <int ImmT> \
  91. static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
  92. { \
  93. return Integer{ \
  94. SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]), \
  95. SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
  96. }; \
  97. }
  98. #define SIMD_IWRAPPER_2I_1(op) \
  99. template <int ImmT> \
  100. static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
  101. { \
  102. return Integer{ \
  103. SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]), \
  104. SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]), \
  105. }; \
  106. }
  107. #define SIMD_IWRAPPER_2I_2(op) \
  108. template <int ImmT> \
  109. static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
  110. { \
  111. return Integer{ \
  112. SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]), \
  113. SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]), \
  114. }; \
  115. }
  116. #define SIMD_IWRAPPER_3(op) \
  117. static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
  118. { \
  119. return Integer{ \
  120. SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]), \
  121. SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]), \
  122. }; \
  123. }
  124. //-----------------------------------------------------------------------
  125. // Single precision floating point arithmetic operations
  126. //-----------------------------------------------------------------------
  127. SIMD_WRAPPER_2(add_ps); // return a + b
  128. SIMD_WRAPPER_2(div_ps); // return a / b
  129. SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
  130. SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
  131. SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
  132. SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
  133. SIMD_WRAPPER_2(mul_ps); // return a * b
  134. SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
  135. SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
  136. SIMD_WRAPPER_2(sub_ps); // return a - b
  137. template <RoundMode RMT>
  138. static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
  139. {
  140. return Float{
  141. SIMD256T::template round_ps<RMT>(a.v8[0]),
  142. SIMD256T::template round_ps<RMT>(a.v8[1]),
  143. };
  144. }
  145. static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
  146. {
  147. return round_ps<RoundMode::CEIL_NOEXC>(a);
  148. }
  149. static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
  150. {
  151. return round_ps<RoundMode::FLOOR_NOEXC>(a);
  152. }
  153. //-----------------------------------------------------------------------
  154. // Integer (various width) arithmetic operations
  155. //-----------------------------------------------------------------------
  156. SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
  157. SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
  158. SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
  159. SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
  160. SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
  161. SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
  162. SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
  163. SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
  164. SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
  165. // return (a * b) & 0xFFFFFFFF
  166. //
  167. // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
  168. // and store the low 32 bits of the intermediate integers in dst.
  169. SIMD_IWRAPPER_2(mullo_epi32);
  170. SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
  171. SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
  172. SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
  173. //-----------------------------------------------------------------------
  174. // Logical operations
  175. //-----------------------------------------------------------------------
  176. SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
  177. SIMD_IWRAPPER_2(and_si); // return a & b (int)
  178. SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
  179. SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
  180. SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
  181. SIMD_IWRAPPER_2(or_si); // return a | b (int)
  182. SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
  183. SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
  184. //-----------------------------------------------------------------------
  185. // Shift operations
  186. //-----------------------------------------------------------------------
  187. template <int ImmT>
  188. static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
  189. {
  190. return Integer{
  191. SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
  192. SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
  193. };
  194. }
  195. SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
  196. template <int ImmT>
  197. static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT (int32)
  198. {
  199. return Integer{
  200. SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
  201. SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
  202. };
  203. }
  204. template <int ImmT>
  205. static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT (uint32)
  206. {
  207. return Integer{
  208. SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
  209. SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
  210. };
  211. }
  212. template <int ImmT> // for each 128-bit lane:
  213. static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) // return a >> (ImmT*8) (uint)
  214. {
  215. return Integer{
  216. SIMD256T::template srli_si<ImmT>(a.v8[0]),
  217. SIMD256T::template srli_si<ImmT>(a.v8[1]),
  218. };
  219. }
  220. template <int ImmT>
  221. static SIMDINLINE Float SIMDCALL
  222. srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
  223. {
  224. return Float{
  225. SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
  226. SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
  227. };
  228. }
  229. SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
  230. //-----------------------------------------------------------------------
  231. // Conversion operations
  232. //-----------------------------------------------------------------------
  233. static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
  234. {
  235. return Float{
  236. SIMD256T::castpd_ps(a.v8[0]),
  237. SIMD256T::castpd_ps(a.v8[1]),
  238. };
  239. }
  240. static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
  241. {
  242. return Integer{
  243. SIMD256T::castps_si(a.v8[0]),
  244. SIMD256T::castps_si(a.v8[1]),
  245. };
  246. }
  247. static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
  248. {
  249. return Double{
  250. SIMD256T::castsi_pd(a.v8[0]),
  251. SIMD256T::castsi_pd(a.v8[1]),
  252. };
  253. }
  254. static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
  255. {
  256. return Double{
  257. SIMD256T::castps_pd(a.v8[0]),
  258. SIMD256T::castps_pd(a.v8[1]),
  259. };
  260. }
  261. static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
  262. {
  263. return Float{
  264. SIMD256T::castsi_ps(a.v8[0]),
  265. SIMD256T::castsi_ps(a.v8[1]),
  266. };
  267. }
  268. static SIMDINLINE Float SIMDCALL
  269. cvtepi32_ps(Integer const& a) // return (float)a (int32 --> float)
  270. {
  271. return Float{
  272. SIMD256T::cvtepi32_ps(a.v8[0]),
  273. SIMD256T::cvtepi32_ps(a.v8[1]),
  274. };
  275. }
  276. static SIMDINLINE Integer SIMDCALL
  277. cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a (uint8 --> int16)
  278. {
  279. return Integer{
  280. SIMD256T::cvtepu8_epi16(a.v4[0]),
  281. SIMD256T::cvtepu8_epi16(a.v4[1]),
  282. };
  283. }
  284. static SIMDINLINE Integer SIMDCALL
  285. cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint8 --> int32)
  286. {
  287. return Integer{
  288. SIMD256T::cvtepu8_epi32(a.v4[0]),
  289. SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
  290. };
  291. }
  292. static SIMDINLINE Integer SIMDCALL
  293. cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a (uint16 --> int32)
  294. {
  295. return Integer{
  296. SIMD256T::cvtepu16_epi32(a.v4[0]),
  297. SIMD256T::cvtepu16_epi32(a.v4[1]),
  298. };
  299. }
  300. static SIMDINLINE Integer SIMDCALL
  301. cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint16 --> int64)
  302. {
  303. return Integer{
  304. SIMD256T::cvtepu16_epi64(a.v4[0]),
  305. SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
  306. };
  307. }
  308. static SIMDINLINE Integer SIMDCALL
  309. cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a (uint32 --> int64)
  310. {
  311. return Integer{
  312. SIMD256T::cvtepu32_epi64(a.v4[0]),
  313. SIMD256T::cvtepu32_epi64(a.v4[1]),
  314. };
  315. }
  316. static SIMDINLINE Integer SIMDCALL
  317. cvtps_epi32(Float const& a) // return (int32)a (float --> int32)
  318. {
  319. return Integer{
  320. SIMD256T::cvtps_epi32(a.v8[0]),
  321. SIMD256T::cvtps_epi32(a.v8[1]),
  322. };
  323. }
  324. static SIMDINLINE Integer SIMDCALL
  325. cvttps_epi32(Float const& a) // return (int32)a (rnd_to_zero(float) --> int32)
  326. {
  327. return Integer{
  328. SIMD256T::cvtps_epi32(a.v8[0]),
  329. SIMD256T::cvtps_epi32(a.v8[1]),
  330. };
  331. }
  332. //-----------------------------------------------------------------------
  333. // Comparison operations
  334. //-----------------------------------------------------------------------
  335. template <CompareType CmpTypeT>
  336. static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
  337. {
  338. return Float{
  339. SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
  340. SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
  341. };
  342. }
  343. static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
  344. {
  345. return cmp_ps<CompareType::LT_OQ>(a, b);
  346. }
  347. static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
  348. {
  349. return cmp_ps<CompareType::GT_OQ>(a, b);
  350. }
  351. static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
  352. {
  353. return cmp_ps<CompareType::NEQ_OQ>(a, b);
  354. }
  355. static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
  356. {
  357. return cmp_ps<CompareType::EQ_OQ>(a, b);
  358. }
  359. static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
  360. {
  361. return cmp_ps<CompareType::GE_OQ>(a, b);
  362. }
  363. static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
  364. {
  365. return cmp_ps<CompareType::LE_OQ>(a, b);
  366. }
  367. template <CompareType CmpTypeT>
  368. static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
  369. {
  370. return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
  371. }
  372. SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
  373. SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
  374. SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
  375. SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
  376. SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
  377. SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
  378. SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
  379. SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
  380. SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
  381. static SIMDINLINE bool SIMDCALL
  382. testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
  383. {
  384. return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
  385. }
  386. static SIMDINLINE bool SIMDCALL
  387. testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
  388. {
  389. return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
  390. }
  391. //-----------------------------------------------------------------------
  392. // Blend / shuffle / permute operations
  393. //-----------------------------------------------------------------------
  394. SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
  395. SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
  396. SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
  397. static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
  398. Integer const& b,
  399. Float const& mask) // return mask ? b : a (int)
  400. {
  401. return Integer{
  402. SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
  403. SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
  404. };
  405. }
  406. static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
  407. Integer const& b,
  408. Integer const& mask) // return mask ? b : a (int)
  409. {
  410. return Integer{
  411. SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
  412. SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
  413. };
  414. }
  415. static SIMDINLINE Float SIMDCALL
  416. broadcast_ss(float const* p) // return *p (all elements in vector get same value)
  417. {
  418. float f = *p;
  419. return Float{
  420. SIMD256T::set1_ps(f),
  421. SIMD256T::set1_ps(f),
  422. };
  423. }
  424. template <int imm>
  425. static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
  426. {
  427. SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
  428. return a.v8[imm];
  429. }
  430. template <int imm>
  431. static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
  432. {
  433. SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
  434. return a.v8[imm];
  435. }
  436. template <int imm>
  437. static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
  438. {
  439. SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
  440. return a.v8[imm];
  441. }
  442. template <int imm>
  443. static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
  444. {
  445. SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
  446. Float r = a;
  447. r.v8[imm] = b;
  448. return r;
  449. }
  450. template <int imm>
  451. static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
  452. {
  453. SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
  454. Double r = a;
  455. r.v8[imm] = b;
  456. return r;
  457. }
  458. template <int imm>
  459. static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
  460. {
  461. SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
  462. Integer r = a;
  463. r.v8[imm] = b;
  464. return r;
  465. }
  466. SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
  467. SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
  468. SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
  469. SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
  470. template <int ImmT>
  471. static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
  472. {
  473. return Float{
  474. SIMD256T::template permute_ps<ImmT>(a.v8[0]),
  475. SIMD256T::template permute_ps<ImmT>(a.v8[1]),
  476. };
  477. }
  478. static SIMDINLINE Integer SIMDCALL permute_epi32(
  479. Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
  480. {
  481. return castps_si(permute_ps(castsi_ps(a), swiz));
  482. }
  483. static SIMDINLINE Float SIMDCALL
  484. permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
  485. {
  486. const auto mask = SIMD256T::set1_epi32(7);
  487. auto lolo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[0], mask));
  488. auto lohi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[0], mask));
  489. auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
  490. auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
  491. return Float{
  492. SIMD256T::blendv_ps(
  493. lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
  494. SIMD256T::blendv_ps(
  495. hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
  496. };
  497. }
// All of the 512-bit permute2f128_XX intrinsics do the following:
//
//      SELECT4(src, control) {
//          CASE(control[1:0])
//              0 : tmp[127:0] := src[127:0]
//              1 : tmp[127:0] := src[255:128]
//              2 : tmp[127:0] := src[383:256]
//              3 : tmp[127:0] := src[511:384]
//          ESAC
//          RETURN tmp[127:0]
//      }
//
//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
//      dst[MAX:512] := 0
//
// Since the 256-bit AVX instructions use a 4-bit control field (instead
// of 2-bit for AVX512), we need to expand the control bits sent to the
// AVX instructions for emulation.
//
  520. template <int shuf>
  521. static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
  522. {
  523. return Float{
  524. SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
  525. a.v8[1]),
  526. SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
  527. b.v8[1]),
  528. };
  529. }
  530. template <int shuf>
  531. static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
  532. {
  533. return Double{
  534. SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
  535. a.v8[1]),
  536. SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
  537. b.v8[1]),
  538. };
  539. }
  540. template <int shuf>
  541. static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
  542. {
  543. return Integer{
  544. SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
  545. a.v8[1]),
  546. SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
  547. b.v8[1]),
  548. };
  549. }
  550. SIMD_IWRAPPER_2I_1(shuffle_epi32);
  551. SIMD_IWRAPPER_2I_2(shuffle_epi64);
  552. SIMD_IWRAPPER_2(shuffle_epi8);
  553. SIMD_WRAPPER_2I_1(shuffle_pd);
  554. SIMD_WRAPPER_2I_1(shuffle_ps);
  555. SIMD_IWRAPPER_2(unpackhi_epi16);
  556. SIMD_IWRAPPER_2(unpackhi_epi32);
  557. SIMD_IWRAPPER_2(unpackhi_epi64);
  558. SIMD_IWRAPPER_2(unpackhi_epi8);
  559. SIMD_WRAPPER_2(unpackhi_pd);
  560. SIMD_WRAPPER_2(unpackhi_ps);
  561. SIMD_IWRAPPER_2(unpacklo_epi16);
  562. SIMD_IWRAPPER_2(unpacklo_epi32);
  563. SIMD_IWRAPPER_2(unpacklo_epi64);
  564. SIMD_IWRAPPER_2(unpacklo_epi8);
  565. SIMD_WRAPPER_2(unpacklo_pd);
  566. SIMD_WRAPPER_2(unpacklo_ps);
  567. //-----------------------------------------------------------------------
  568. // Load / store operations
  569. //-----------------------------------------------------------------------
  570. template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  571. static SIMDINLINE Float SIMDCALL
  572. i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  573. {
  574. return Float{
  575. SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
  576. SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
  577. };
  578. }
  579. template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  580. static SIMDINLINE Float SIMDCALL
  581. sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  582. {
  583. return Float{
  584. SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
  585. SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
  586. };
  587. }
  588. static SIMDINLINE Float SIMDCALL
  589. load1_ps(float const* p) // return *p (broadcast 1 value to all elements)
  590. {
  591. return broadcast_ss(p);
  592. }
  593. static SIMDINLINE Float SIMDCALL
  594. load_ps(float const* p) // return *p (loads SIMD width elements from memory)
  595. {
  596. return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
  597. }
  598. static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
  599. {
  600. return Integer{
  601. SIMD256T::load_si(&p->v8[0]),
  602. SIMD256T::load_si(&p->v8[1]),
  603. };
  604. }
  605. static SIMDINLINE Float SIMDCALL
  606. loadu_ps(float const* p) // return *p (same as load_ps but allows for unaligned mem)
  607. {
  608. return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
  609. }
  610. static SIMDINLINE Integer SIMDCALL
  611. loadu_si(Integer const* p) // return *p (same as load_si but allows for unaligned mem)
  612. {
  613. return Integer{
  614. SIMD256T::loadu_si(&p->v8[0]),
  615. SIMD256T::loadu_si(&p->v8[1]),
  616. };
  617. }
  618. // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
  619. template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  620. static SIMDINLINE Float SIMDCALL
  621. mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  622. {
  623. return Float{
  624. SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
  625. SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
  626. };
  627. }
  628. template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  629. static SIMDINLINE Float SIMDCALL
  630. sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  631. {
  632. return Float{
  633. SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
  634. SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
  635. };
  636. }
  637. static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
  638. {
  639. SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
  640. SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
  641. }
  642. static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
  643. {
  644. uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
  645. mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
  646. return mask;
  647. }
  648. static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
  649. {
  650. uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
  651. mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
  652. return mask;
  653. }
  654. static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
  655. {
  656. uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
  657. mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
  658. return mask;
  659. }
  660. static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
  661. {
  662. return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
  663. }
  664. static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
  665. {
  666. return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
  667. }
  668. static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
  669. {
  670. return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
  671. }
  672. static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
  673. {
  674. return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
  675. }
  676. static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
  677. {
  678. return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
  679. }
  680. static SIMDINLINE void SIMDCALL
  681. store_ps(float* p, Float const& a) // *p = a (stores all elements contiguously in memory)
  682. {
  683. SIMD256T::store_ps(p, a.v8[0]);
  684. SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
  685. }
  686. static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
  687. {
  688. SIMD256T::store_si(&p->v8[0], a.v8[0]);
  689. SIMD256T::store_si(&p->v8[1], a.v8[1]);
  690. }
  691. static SIMDINLINE void SIMDCALL
  692. stream_ps(float* p, Float const& a) // *p = a (same as store_ps, but doesn't keep memory in cache)
  693. {
  694. SIMD256T::stream_ps(p, a.v8[0]);
  695. SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
  696. }
  697. static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
  698. int i14,
  699. int i13,
  700. int i12,
  701. int i11,
  702. int i10,
  703. int i9,
  704. int i8,
  705. int i7,
  706. int i6,
  707. int i5,
  708. int i4,
  709. int i3,
  710. int i2,
  711. int i1,
  712. int i0)
  713. {
  714. return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
  715. SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
  716. }
  717. static SIMDINLINE Integer SIMDCALL
  718. set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
  719. {
  720. return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
  721. }
  722. static SIMDINLINE Float SIMDCALL set_ps(float i15,
  723. float i14,
  724. float i13,
  725. float i12,
  726. float i11,
  727. float i10,
  728. float i9,
  729. float i8,
  730. float i7,
  731. float i6,
  732. float i5,
  733. float i4,
  734. float i3,
  735. float i2,
  736. float i1,
  737. float i0)
  738. {
  739. return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
  740. SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
  741. }
  742. static SIMDINLINE Float SIMDCALL
  743. set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
  744. {
  745. return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
  746. }
  747. static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
  748. {
  749. return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
  750. }
  751. #undef SIMD_WRAPPER_1
  752. #undef SIMD_WRAPPER_2
  753. #undef SIMD_WRAPPER_2I
  754. #undef SIMD_WRAPPER_2I_1
  755. #undef SIMD_WRAPPER_3
  756. #undef SIMD_IWRAPPER_1
  757. #undef SIMD_IWRAPPER_2
  758. #undef SIMD_IWRAPPER_2I
  759. #undef SIMD_IWRAPPER_2I_1
  760. #undef SIMD_IWRAPPER_3