/* Copyright (c) 2017-2018 Evan Nemerson <[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#if !defined(SIMDE__MMX_H)
#if !defined(SIMDE__MMX_H)
#define SIMDE__MMX_H
#endif
#include "simde-common.h"

#if defined(SIMDE_MMX_FORCE_NATIVE)
#define SIMDE_MMX_NATIVE
#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \
    !defined(SIMDE_NO_NATIVE)
#define SIMDE_MMX_NATIVE
#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \
    !defined(SIMDE_NO_NEON)
#define SIMDE_MMX_NEON
#endif

#if defined(SIMDE_MMX_NATIVE)
#include <mmintrin.h>
#else
#if defined(SIMDE_MMX_NEON)
#include <arm_neon.h>
#endif
#endif

#include <stdint.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

SIMDE__BEGIN_DECLS
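
/* simde__m64 mirrors the MMX __m64 type: 64 bits viewable as signed or
 * unsigned 8-, 16-, 32- or 64-bit lanes or as two floats, plus the
 * native __m64 or NEON representations when those backends are active. */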
typedef union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
    int8_t i8 __attribute__((__vector_size__(8), __may_alias__));
    int16_t i16 __attribute__((__vector_size__(8), __may_alias__));
    int32_t i32 __attribute__((__vector_size__(8), __may_alias__));
    int64_t i64 __attribute__((__vector_size__(8), __may_alias__));
    uint8_t u8 __attribute__((__vector_size__(8), __may_alias__));
    uint16_t u16 __attribute__((__vector_size__(8), __may_alias__));
    uint32_t u32 __attribute__((__vector_size__(8), __may_alias__));
    uint64_t u64 __attribute__((__vector_size__(8), __may_alias__));
    simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__));
#else
    int8_t i8[8];
    int16_t i16[4];
    int32_t i32[2];
    int64_t i64[1];
    uint8_t u8[8];
    uint16_t u16[4];
    uint32_t u32[2];
    uint64_t u64[1];
    simde_float32 f32[2];
#endif

#if defined(SIMDE_MMX_NATIVE)
    __m64 n;
#elif defined(SIMDE_MMX_NEON)
    int8x8_t neon_i8;
    int16x4_t neon_i16;
    int32x2_t neon_i32;
    int64x1_t neon_i64;
    uint8x8_t neon_u8;
    uint16x4_t neon_u16;
    uint32x2_t neon_u32;
    uint64x1_t neon_u64;
    float32x2_t neon_f32;
#endif
} simde__m64;

#if defined(SIMDE_MMX_NATIVE)
HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64),
                     "__m64 size doesn't match simde__m64 size");
SIMDE__FUNCTION_ATTRIBUTES simde__m64 SIMDE__M64_C(__m64 v)
{
    simde__m64 r;
    r.n = v;
    return r;
}
#elif defined(SIMDE_MMX_NEON)
#define SIMDE__M64_NEON_C(T, expr) \
    (simde__m64) { .neon_##T = (expr) }
#endif
HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
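
/* Wrapping lane-wise addition (paddb/paddw/paddd): each 8-, 16- or
 * 32-bit lane of the result is the sum of the corresponding lanes of
 * a and b, with overflow wrapping around. */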
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_add_pi8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < 8; i++) {
        r.i8[i] = a.i8[i] + b.i8[i];
    }
    return r;
#endif
}
#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_add_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        r.i16[i] = a.i16[i] + b.i16[i];
    }
    return r;
#endif
}
#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_add_pi32(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
        r.i32[i] = a.i32[i] + b.i32[i];
    }
    return r;
#endif
}
#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
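
/* Saturating addition (paddsb/paddsw, paddusb/paddusw): sums that
 * would overflow are clamped to the lane type's minimum or maximum
 * instead of wrapping. */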
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 8; i++) {
        if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
            r.i8[i] = INT8_MAX;
        } else if ((((b.i8[i]) < 0) &&
                    ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
            r.i8[i] = INT8_MIN;
        } else {
            r.i8[i] = (a.i8[i]) + (b.i8[i]);
        }
    }
    return r;
#endif
}
#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < 8; i++) {
        const int32_t x = a.u8[i] + b.u8[i];
        if (x < 0)
            r.u8[i] = 0;
        else if (x > UINT8_MAX)
            r.u8[i] = UINT8_MAX;
        else
            r.u8[i] = (uint8_t)x;
    }
    return r;
#endif
}
#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 4; i++) {
        if ((((b.i16[i]) > 0) &&
             ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
            r.i16[i] = INT16_MAX;
        } else if ((((b.i16[i]) < 0) &&
                    ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
            r.i16[i] = INT16_MIN;
        } else {
            r.i16[i] = (a.i16[i]) + (b.i16[i]);
        }
    }
    return r;
#endif
}
#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        const uint32_t x = a.u16[i] + b.u16[i];
        if (x > UINT16_MAX)
            r.u16[i] = UINT16_MAX;
        else
            r.u16[i] = (uint16_t)x;
    }
    return r;
#endif
}
#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
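
/* Bitwise operations over the full 64 bits: pand computes a & b and
 * pandn computes ~a & b (note the first operand is the one inverted). */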
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_and_si64(a.n, b.n));
#else
    simde__m64 r;
    r.i64[0] = a.i64[0] & b.i64[0];
    return r;
#endif
}
#define simde_m_pand(a, b) simde_mm_and_si64(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_andnot_si64(a.n, b.n));
#else
    simde__m64 r;
    r.i64[0] = ~(a.i64[0]) & b.i64[0];
    return r;
#endif
}
#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
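
/* Lane-wise comparisons: each result lane becomes all ones
 * (0xff / 0xffff / 0xffffffff) where the predicate holds and all
 * zeros where it does not (pcmpeqb/w/d, pcmpgtb/w/d). */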
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 8; i++) {
        r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff;
    }
    return r;
#endif
}
#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 4; i++) {
        r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff;
    }
    return r;
#endif
}
#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 2; i++) {
        r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff;
    }
    return r;
#endif
}
#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 8; i++) {
        r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff;
    }
    return r;
#endif
}
#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 4; i++) {
        r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff;
    }
    return r;
#endif
}
#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 2; i++) {
        r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff;
    }
    return r;
#endif
}
#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
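
/* Scalar conversions: cvtm64_si64 / cvtsi64_m64 move all 64 bits to
 * and from an int64_t; cvtsi32_si64 places a 32-bit value in the low
 * lane (upper lane zeroed) and cvtsi64_si32 returns the low lane. */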
SIMDE__FUNCTION_ATTRIBUTES
int64_t simde_mm_cvtm64_si64(simde__m64 a)
{
#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
    return _mm_cvtm64_si64(a.n);
#else
    return a.i64[0];
#endif
}
#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtsi32_si64(int32_t a)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_cvtsi32_si64(a));
#else
    simde__m64 r;
    r.i32[0] = a;
    r.i32[1] = 0;
    return r;
#endif
}
#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtsi64_m64(int64_t a)
{
#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
    return SIMDE__M64_C(_mm_cvtsi64_m64(a));
#else
    simde__m64 r;
    r.i64[0] = a;
    return r;
#endif
}
#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)

SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_cvtsi64_si32(simde__m64 a)
{
#if defined(SIMDE_MMX_NATIVE)
    return _mm_cvtsi64_si32(a.n);
#else
    return a.i32[0];
#endif
}
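
/* On native MMX, _mm_empty (emms) must run after MMX code so the x87
 * registers become usable again; the portable fallback shares no such
 * state, so it is deliberately a no-op. */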
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_empty(void)
{
#if defined(SIMDE_MMX_NATIVE)
    _mm_empty();
#else
#endif
}
#define simde_m_empty() simde_mm_empty()
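
/* 16-bit multiplies: madd_pi16 (pmaddwd) multiplies corresponding
 * 16-bit lanes and adds adjacent pairs of 32-bit products; mulhi_pi16
 * (pmulhw) keeps the high 16 bits of each product and mullo_pi16
 * (pmullw) keeps the low 16 bits. */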
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 4; i += 2) {
        r.i32[i / 2] =
            (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
    }
    return r;
#endif
}
#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 4; i++) {
        r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16);
    }
    return r;
#endif
}
#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (int i = 0; i < 4; i++) {
        r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff);
    }
    return r;
#endif
}
#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_or_si64(a.n, b.n));
#else
    simde__m64 r;
    r.i64[0] = a.i64[0] | b.i64[0];
    return r;
#endif
}
#define simde_m_por(a, b) simde_mm_or_si64(a, b)
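
/* Packing with saturation: packs_pi16 (packsswb) narrows the four
 * 16-bit lanes of a into the low four 8-bit lanes and those of b into
 * the high four; packs_pi32 (packssdw) does the same for 32-to-16 bit,
 * and packs_pu16 (packuswb) narrows signed 16-bit to unsigned 8-bit. */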
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        if (a.i16[i] < INT8_MIN) {
            r.i8[i] = INT8_MIN;
        } else if (a.i16[i] > INT8_MAX) {
            r.i8[i] = INT8_MAX;
        } else {
            r.i8[i] = (int8_t)a.i16[i];
        }
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        if (b.i16[i] < INT8_MIN) {
            r.i8[i + 4] = INT8_MIN;
        } else if (b.i16[i] > INT8_MAX) {
            r.i8[i + 4] = INT8_MAX;
        } else {
            r.i8[i + 4] = (int8_t)b.i16[i];
        }
    }
    return r;
#endif
}
#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) {
        if (a.i32[i] < INT16_MIN) {
            r.i16[i] = INT16_MIN;
        } else if (a.i32[i] > INT16_MAX) {
            r.i16[i] = INT16_MAX;
        } else {
            r.i16[i] = (int16_t)a.i32[i];
        }
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) {
        if (b.i32[i] < INT16_MIN) {
            r.i16[i + 2] = INT16_MIN;
        } else if (b.i32[i] > INT16_MAX) {
            r.i16[i + 2] = INT16_MAX;
        } else {
            r.i16[i + 2] = (int16_t)b.i32[i];
        }
    }
    return r;
#endif
}
#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        if (a.i16[i] > UINT8_MAX) {
            r.u8[i] = UINT8_MAX;
        } else if (a.i16[i] < 0) {
            r.u8[i] = 0;
        } else {
            r.u8[i] = (uint8_t)a.i16[i];
        }
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        if (b.i16[i] > UINT8_MAX) {
            r.u8[i + 4] = UINT8_MAX;
        } else if (b.i16[i] < 0) {
            r.u8[i + 4] = 0;
        } else {
            r.u8[i + 4] = (uint8_t)b.i16[i];
        }
    }
    return r;
#endif
}
#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
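
/* Constructors: the set_* functions take lanes from most significant
 * to least significant, setr_* take them in reverse (memory) order,
 * and set1_* broadcast one value to every lane. The simde_x_mm_set_pu*
 * helpers are unsigned variants with no native MMX counterpart. */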
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
                            int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
#else
    simde__m64 r;
    r.i8[0] = e0;
    r.i8[1] = e1;
    r.i8[2] = e2;
    r.i8[3] = e3;
    r.i8[4] = e4;
    r.i8[5] = e5;
    r.i8[6] = e6;
    r.i8[7] = e7;
    return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
                              uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5,
                                    (int8_t)e4, (int8_t)e3, (int8_t)e2,
                                    (int8_t)e1, (int8_t)e0));
#else
    simde__m64 r;
    r.u8[0] = e0;
    r.u8[1] = e1;
    r.u8[2] = e2;
    r.u8[3] = e3;
    r.u8[4] = e4;
    r.u8[5] = e5;
    r.u8[6] = e6;
    r.u8[7] = e7;
    return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0));
#else
    simde__m64 r;
    r.i16[0] = e0;
    r.i16[1] = e1;
    r.i16[2] = e2;
    r.i16[3] = e3;
    return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
                               uint16_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1,
                                     (int16_t)e0));
#else
    simde__m64 r;
    r.u16[0] = e0;
    r.u16[1] = e1;
    r.u16[2] = e2;
    r.u16[3] = e3;
    return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0));
#else
    simde__m64 r;
    r.u32[0] = e0;
    r.u32[1] = e1;
    return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set_pi32(e1, e0));
#else
    simde__m64 r;
    r.i32[0] = e0;
    r.i32[1] = e1;
    return r;
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi8(int8_t a)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set1_pi8(a));
#else
    return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi16(int16_t a)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set1_pi16(a));
#else
    return simde_mm_set_pi16(a, a, a, a);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi32(int32_t a)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_set1_pi32(a));
#else
    return simde_mm_set_pi32(a, a);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
                             int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
#else
    return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0));
#else
    return simde_mm_set_pi16(e0, e1, e2, e3);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_setr_pi32(e1, e0));
#else
    return simde_mm_set_pi32(e0, e1);
#endif
}

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setzero_si64(void)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_setzero_si64());
#else
    return simde_mm_set_pi32(0, 0);
#endif
}
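
/* Logical shifts: the sll/srl forms read the shift count from the low
 * 64 bits of a second vector and return zero when it exceeds the lane
 * width; the slli/srli forms take an immediate count. */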
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n));
#else
    simde__m64 r;
    if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
        memset(&r, 0, sizeof(r));
        return r;
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
        r.u16[i] = a.u16[i] << count.u64[0];
    }
    return r;
#endif
}
#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n));
#else
    simde__m64 r;
    if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
        memset(&r, 0, sizeof(r));
        return r;
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
        r.u32[i] = a.u32[i] << count.u64[0];
    }
    return r;
#endif
}
#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_slli_pi16(a.n, count));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
        r.u16[i] = a.u16[i] << count;
    }
    return r;
#endif
}
#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_slli_pi32(a.n, count));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
        r.u32[i] = a.u32[i] << count;
    }
    return r;
#endif
}
#define simde_m_pslldi(a, count) simde_mm_slli_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_slli_si64(a.n, count));
#else
    simde__m64 r;
    r.u64[0] = a.u64[0] << count;
    return r;
#endif
}
#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sll_si64(a.n, count.n));
#else
    simde__m64 r;
    if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
        memset(&r, 0, sizeof(r));
        return r;
    }
    r.u64[0] = a.u64[0] << count.u64[0];
    return r;
#endif
}
#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n));
#else
    simde__m64 r;
    if (HEDLEY_UNLIKELY(count.u64[0] > 15)) {
        memset(&r, 0, sizeof(r));
        return r;
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
        r.u16[i] = a.u16[i] >> count.u64[0];
    }
    return r;
#endif
}
#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n));
#else
    simde__m64 r;
    if (HEDLEY_UNLIKELY(count.u64[0] > 31)) {
        memset(&r, 0, sizeof(r));
        return r;
    }
    SIMDE__VECTORIZE
    for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) {
        r.u32[i] = a.u32[i] >> count.u64[0];
    }
    return r;
#endif
}
#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_srli_pi16(a.n, count));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
        r.u16[i] = a.u16[i] >> count;
    }
    return r;
#endif
}
#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_srli_pi32(a.n, count));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
        r.u32[i] = a.u32[i] >> count;
    }
    return r;
#endif
}
#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_srli_si64(a.n, count));
#else
    simde__m64 r;
    r.u64[0] = a.u64[0] >> count;
    return r;
#endif
}
#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_srl_si64(a.n, count.n));
#else
    simde__m64 r;
    if (HEDLEY_UNLIKELY(count.u64[0] > 63)) {
        memset(&r, 0, sizeof(r));
        return r;
    }
    r.u64[0] = a.u64[0] >> count.u64[0];
    return r;
#endif
}
#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
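
/* Arithmetic right shifts (psraw/psrad and their immediate forms):
 * vacated bits are filled with copies of the sign bit, and for the
 * vector-count forms a count past the lane width floods each lane
 * with its sign. */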
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_srai_pi16(a.n, count));
#else
    simde__m64 r;
    const uint16_t m =
        (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count));
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        const uint16_t is_neg = ((uint16_t)(
            ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
        r.u16[i] = (a.u16[i] >> count) | (m * is_neg);
    }
    return r;
#endif
}
#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
    return SIMDE__M64_C(_mm_srai_pi32(a.n, count));
#else
    simde__m64 r;
    const uint32_t m =
        (uint32_t)((~0U) << ((sizeof(int32_t) * CHAR_BIT) - count));
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
        const uint32_t is_neg = ((uint32_t)(
            ((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
        r.u32[i] = (a.u32[i] >> count) | (m * is_neg);
    }
    return r;
#endif
}
#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n));
#else
    simde__m64 r;
    int cnt = (int)count.i64[0];
    if (cnt > 15 || cnt < 0) {
        for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
            r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
        }
    } else {
        const uint16_t m = (uint16_t)(
            (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
        for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
            const uint16_t is_neg = a.i16[i] < 0;
            r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
        }
    }
    return r;
#endif
}
#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n));
#else
    simde__m64 r;
    const uint64_t cnt = count.u64[0];
    if (cnt > 31) {
        for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
            r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
        }
    } else if (cnt == 0) {
        memcpy(&r, &a, sizeof(r));
    } else {
        const uint32_t m = (uint32_t)(
            (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
        for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
            const uint32_t is_neg = a.i32[i] < 0;
            r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
        }
    }
    return r;
#endif
}
#define simde_m_psrad(a, count) simde_mm_sra_pi32(a, count)
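
/* Lane-wise subtraction: psubb/psubw/psubd wrap on overflow, while the
 * psubs (signed) and psubus (unsigned) forms saturate at the lane
 * type's limits. */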
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < 8; i++) {
        r.i8[i] = a.i8[i] - b.i8[i];
    }
    return r;
#endif
}
#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        r.i16[i] = a.i16[i] - b.i16[i];
    }
    return r;
#endif
}
#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
        r.i32[i] = a.i32[i] - b.i32[i];
    }
    return r;
#endif
}
#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < 8; i++) {
        if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
            r.i8[i] = INT8_MIN;
        } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
            r.i8[i] = INT8_MAX;
        } else {
            r.i8[i] = (a.i8[i]) - (b.i8[i]);
        }
    }
    return r;
#endif
}
#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < 8; i++) {
        const int32_t x = a.u8[i] - b.u8[i];
        if (x < 0) {
            r.u8[i] = 0;
        } else if (x > UINT8_MAX) {
            r.u8[i] = UINT8_MAX;
        } else {
            r.u8[i] = (uint8_t)x;
        }
    }
    return r;
#endif
}
#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
        if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
            r.i16[i] = INT16_MIN;
        } else if ((b.i16[i]) < 0 &&
                   (a.i16[i]) > INT16_MAX + (b.i16[i])) {
            r.i16[i] = INT16_MAX;
        } else {
            r.i16[i] = (a.i16[i]) - (b.i16[i]);
        }
    }
    return r;
#endif
}
#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n));
#else
    simde__m64 r;
    SIMDE__VECTORIZE
    for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
        const int x = a.u16[i] - b.u16[i];
        if (x < 0) {
            r.u16[i] = 0;
        } else if (x > UINT16_MAX) {
            r.u16[i] = UINT16_MAX;
        } else {
            r.u16[i] = (uint16_t)x;
        }
    }
    return r;
#endif
}
#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
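
/* Interleaving: unpackhi_* merge the high halves of a and b, and
 * unpacklo_* the low halves, alternating one lane from a with one from
 * b (punpckhbw/punpckhwd/punpckhdq, punpcklbw/punpcklwd/punpckldq). */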
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n));
#else
    simde__m64 r;
    r.i8[0] = a.i8[4];
    r.i8[1] = b.i8[4];
    r.i8[2] = a.i8[5];
    r.i8[3] = b.i8[5];
    r.i8[4] = a.i8[6];
    r.i8[5] = b.i8[6];
    r.i8[6] = a.i8[7];
    r.i8[7] = b.i8[7];
    return r;
#endif
}
#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n));
#else
    simde__m64 r;
    r.i16[0] = a.i16[2];
    r.i16[1] = b.i16[2];
    r.i16[2] = a.i16[3];
    r.i16[3] = b.i16[3];
    return r;
#endif
}
#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n));
#else
    simde__m64 r;
    r.i32[0] = a.i32[1];
    r.i32[1] = b.i32[1];
    return r;
#endif
}
#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n));
#else
    simde__m64 r;
    r.i8[0] = a.i8[0];
    r.i8[1] = b.i8[0];
    r.i8[2] = a.i8[1];
    r.i8[3] = b.i8[1];
    r.i8[4] = a.i8[2];
    r.i8[5] = b.i8[2];
    r.i8[6] = a.i8[3];
    r.i8[7] = b.i8[3];
    return r;
#endif
}
#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n));
#else
    simde__m64 r;
    r.i16[0] = a.i16[0];
    r.i16[1] = b.i16[0];
    r.i16[2] = a.i16[1];
    r.i16[3] = b.i16[1];
    return r;
#endif
}
#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n));
#else
    simde__m64 r;
    r.i32[0] = a.i32[0];
    r.i32[1] = b.i32[0];
    return r;
#endif
}
#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)

SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_MMX_NATIVE)
    return SIMDE__M64_C(_mm_xor_si64(a.n, b.n));
#else
    simde__m64 r;
    r.i64[0] = a.i64[0] ^ b.i64[0];
    return r;
#endif
}
#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
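
/* simde_m_to_int returns the low 32 bits of a as a scalar, mirroring
 * the _m_to_int compatibility intrinsic. */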
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_m_to_int(simde__m64 a)
{
#if defined(SIMDE_MMX_NATIVE)
    return _m_to_int(a.n);
#else
    return a.i32[0];
#endif
}

SIMDE__END_DECLS

#endif /* !defined(SIMDE__MMX_H) */