/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2017-2020 Evan Nemerson <[email protected]>
 */
#if !defined(SIMDE_X86_MMX_H)
#define SIMDE_X86_MMX_H
#include "simde-common.h"
#if !defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
#endif
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
#if defined(SIMDE_X86_MMX_NATIVE)
#define SIMDE_X86_MMX_USE_NATIVE_TYPE
#elif defined(SIMDE_X86_SSE_NATIVE)
#define SIMDE_X86_MMX_USE_NATIVE_TYPE
#endif
#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
#include <mmintrin.h>
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#include <arm_neon.h>
#endif
#include <stdint.h>
#include <limits.h>
SIMDE_BEGIN_DECLS_
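/* simde__m64_private exposes the 64-bit MMX register under every element view
 * the emulated intrinsics need (signed/unsigned 8/16/32/64-bit lanes and two
 * 32-bit floats), plus the native __m64 or NEON types when those are available. */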
typedef union {
#if defined(SIMDE_VECTOR_SUBSCRIPT)
	SIMDE_ALIGN(8) int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
	SIMDE_ALIGN(8) uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
#else
	SIMDE_ALIGN(8) int8_t i8[8];
	SIMDE_ALIGN(8) int16_t i16[4];
	SIMDE_ALIGN(8) int32_t i32[2];
	SIMDE_ALIGN(8) int64_t i64[1];
	SIMDE_ALIGN(8) uint8_t u8[8];
	SIMDE_ALIGN(8) uint16_t u16[4];
	SIMDE_ALIGN(8) uint32_t u32[2];
	SIMDE_ALIGN(8) uint64_t u64[1];
	SIMDE_ALIGN(8) simde_float32 f32[2];
	SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)];
	SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
#endif
#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
	__m64 n;
#endif
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	int8x8_t neon_i8;
	int16x4_t neon_i16;
	int32x2_t neon_i32;
	int64x1_t neon_i64;
	uint8x8_t neon_u8;
	uint16x4_t neon_u16;
	uint32x2_t neon_u32;
	uint64x1_t neon_u64;
	float32x2_t neon_f32;
#endif
} simde__m64_private;
#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
typedef __m64 simde__m64;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
typedef int32x2_t simde__m64;
#elif defined(SIMDE_VECTOR_SUBSCRIPT)
typedef int32_t simde__m64 SIMDE_ALIGN(8) SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
#else
typedef simde__m64_private simde__m64;
#endif
#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && \
	defined(SIMDE_ENABLE_NATIVE_ALIASES)
#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
typedef simde__m64 __m64;
#endif
HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8,
		     "simde__m64 is not 8-byte aligned");
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8,
		     "simde__m64_private is not 8-byte aligned");
#endif
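/* Round-trip helpers between the public simde__m64 type and the private
 * union; copying through simde_memcpy keeps the conversion free of
 * strict-aliasing problems. */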
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde__m64_from_private(simde__m64_private v)
{
	simde__m64 r;
	simde_memcpy(&r, &v, sizeof(r));
	return r;
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m64_private simde__m64_to_private(simde__m64 v)
{
	simde__m64_private r;
	simde_memcpy(&r, &v, sizeof(r));
	return r;
}
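/* Generates paired from_/to_ conversion functions for one backend ("isax")
 * view of the private union, e.g. simde__m64_from_neon_i8() and
 * simde__m64_to_neon_i8(); instantiated below for the NEON views. */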
#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, \
						fragment) \
	SIMDE_FUNCTION_ATTRIBUTES \
	simde__##simde_type simde__##simde_type##_from_##isax##_##fragment( \
		source_type value) \
	{ \
		simde__##simde_type##_private r_; \
		r_.isax##_##fragment = value; \
		return simde__##simde_type##_from_private(r_); \
	} \
	\
	SIMDE_FUNCTION_ATTRIBUTES \
	source_type simde__##simde_type##_to_##isax##_##fragment( \
		simde__##simde_type value) \
	{ \
		simde__##simde_type##_private r_ = \
			simde__##simde_type##_to_private(value); \
		return r_.isax##_##fragment; \
	}
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
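/* Element-wise wrapping addition (paddb/paddw/paddd). */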
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_add_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i8 = a_.i8 + b_.i8;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		r_.i8[i] = a_.i8[i] + b_.i8[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
#define _m_paddb(a, b) simde_m_paddb(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_add_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i16 = a_.i16 + b_.i16;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] + b_.i16[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
#define _m_paddw(a, b) simde_mm_add_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_add_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i32 = a_.i32 + b_.i32;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] + b_.i32[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
#define _m_paddd(a, b) simde_mm_add_pi32(a, b)
#endif
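/* Saturating addition: results are clamped to the element type's range
 * instead of wrapping (paddsb/paddusb/paddsw/paddusw). */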
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_adds_pi8(a, b);
#else
	simde__m64_private r_, a_ = simde__m64_to_private(a),
			    b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		if ((((b_.i8[i]) > 0) &&
		     ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
			r_.i8[i] = INT8_MAX;
		} else if ((((b_.i8[i]) < 0) &&
			    ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
			r_.i8[i] = INT8_MIN;
		} else {
			r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
#define _m_paddsb(a, b) simde_mm_adds_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_adds_pu8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
		const uint_fast16_t x =
			HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) +
			HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
		if (x > UINT8_MAX)
			r_.u8[i] = UINT8_MAX;
		else
			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
#define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_adds_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if ((((b_.i16[i]) > 0) &&
		     ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
			r_.i16[i] = INT16_MAX;
		} else if ((((b_.i16[i]) < 0) &&
			    ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) {
			r_.i16[i] = SHRT_MIN;
		} else {
			r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
#define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_adds_pu16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		const uint32_t x = a_.u16[i] + b_.u16[i];
		if (x > UINT16_MAX)
			r_.u16[i] = UINT16_MAX;
		else
			r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
#define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
#endif
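/* 64-bit bitwise AND and AND-NOT (pand/pandn); pandn computes ~a & b. */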
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_and_si64(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i64 = a_.i64 & b_.i64;
#else
	r_.i64[0] = a_.i64[0] & b_.i64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pand(a, b) simde_mm_and_si64(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
#define _m_pand(a, b) simde_mm_and_si64(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_andnot_si64(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i32f = ~a_.i32f & b_.i32f;
#else
	r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
#define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
#endif
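/* Element-wise comparisons (pcmpeqb/w/d and pcmpgtb/w/d): each lane becomes
 * all ones when the predicate holds and all zeros otherwise. */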
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cmpeq_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vreinterpret_s8_u8(vceq_s8(a_.neon_i8, b_.neon_i8));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
#define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cmpeq_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vreinterpret_s16_u16(vceq_s16(a_.neon_i16, b_.neon_i16));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
#define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cmpeq_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vreinterpret_s32_u32(vceq_s32(a_.neon_i32, b_.neon_i32));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
#define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cmpgt_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a_.neon_i8, b_.neon_i8));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
#define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cmpgt_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a_.neon_i16, b_.neon_i16));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
#define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cmpgt_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a_.neon_i32, b_.neon_i32));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
#define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
#endif
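/* Scalar conversions: cvtm64_si64 and cvtsi64_si32 extract the low 64/32 bits,
 * cvtsi64_m64 stores a 64-bit value, and cvtsi32_si64 stores a 32-bit value
 * with the upper half zeroed. */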
SIMDE_FUNCTION_ATTRIBUTES
int64_t simde_mm_cvtm64_si64(simde__m64 a)
{
#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
	!defined(__PGI)
	return _mm_cvtm64_si64(a);
#else
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	return vget_lane_s64(a_.neon_i64, 0);
#else
	return a_.i64[0];
#endif
#endif
}
#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
#define _m_to_int64(a) simde_mm_cvtm64_si64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtsi32_si64(int32_t a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cvtsi32_si64(a);
#else
	simde__m64_private r_;
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = {a, 0};
	r_.neon_i32 = vld1_s32(av);
#else
	r_.i32[0] = a;
	r_.i32[1] = 0;
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
#define _m_from_int(a) simde_mm_cvtsi32_si64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtsi64_m64(int64_t a)
{
#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
	!defined(__PGI)
	return _mm_cvtsi64_m64(a);
#else
	simde__m64_private r_;
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i64 = vld1_s64(&a);
#else
	r_.i64[0] = a;
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
#define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int32_t simde_mm_cvtsi64_si32(simde__m64 a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_cvtsi64_si32(a);
#else
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	return vget_lane_s32(a_.neon_i32, 0);
#else
	return a_.i32[0];
#endif
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
#endif
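/* _mm_empty (emms) clears the shared x87/MMX state on real MMX hardware;
 * the emulated paths have no such state, so it is a no-op there. */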
SIMDE_FUNCTION_ATTRIBUTES
void simde_mm_empty(void)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	_mm_empty();
#else
#endif
}
#define simde_m_empty() simde_mm_empty()
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_empty() simde_mm_empty()
#define _m_empty() simde_mm_empty()
#endif
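/* 16-bit multiplies: pmaddwd multiplies lanes and adds adjacent pairs into
 * 32-bit results, pmulhw keeps the high 16 bits of each product and pmullw
 * keeps the low 16 bits. */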
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_madd_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
	r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
		r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
				(a_.i16[i + 1] * b_.i16[i + 1]);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
#define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_mulhi_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
	const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
	const uint16x4_t t3 = vmovn_u32(t2);
	r_.neon_i16 = vreinterpret_s16_u16(t3);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = HEDLEY_STATIC_CAST(int16_t,
					       ((a_.i16[i] * b_.i16[i]) >> 16));
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
#define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_mullo_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
	const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
	r_.neon_i16 = vreinterpret_s16_u16(t2);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = HEDLEY_STATIC_CAST(
			int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
#define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
#endif
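/* 64-bit bitwise OR (por). */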
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_or_si64(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i64 = a_.i64 | b_.i64;
#else
	r_.i64[0] = a_.i64[0] | b_.i64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_por(a, b) simde_mm_or_si64(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
#define _m_por(a, b) simde_mm_or_si64(a, b)
#endif
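/* Pack with saturation: packsswb and packssdw narrow signed 16/32-bit lanes
 * to signed 8/16-bit lanes, packuswb narrows signed 16-bit lanes to unsigned
 * 8-bit lanes; a's lanes fill the low half of the result, b's the high half. */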
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_packs_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if (a_.i16[i] < INT8_MIN) {
			r_.i8[i] = INT8_MIN;
		} else if (a_.i16[i] > INT8_MAX) {
			r_.i8[i] = INT8_MAX;
		} else {
			r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
		}
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if (b_.i16[i] < INT8_MIN) {
			r_.i8[i + 4] = INT8_MIN;
		} else if (b_.i16[i] > INT8_MAX) {
			r_.i8[i + 4] = INT8_MAX;
		} else {
			r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
#define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_packs_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
		if (a_.i32[i] < SHRT_MIN) {
			r_.i16[i] = SHRT_MIN;
		} else if (a_.i32[i] > INT16_MAX) {
			r_.i16[i] = INT16_MAX;
		} else {
			r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
		}
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (8 / sizeof(b_.i32[0])); i++) {
		if (b_.i32[i] < SHRT_MIN) {
			r_.i16[i + 2] = SHRT_MIN;
		} else if (b_.i32[i] > INT16_MAX) {
			r_.i16[i + 2] = INT16_MAX;
		} else {
			r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
#define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_packs_pu16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
	/* Set elements which are < 0 to 0 */
	const int16x8_t t2 =
		vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
	/* Vector with all s16 elements set to UINT8_MAX */
	const int16x8_t vmax = vmovq_n_s16((int16_t)UINT8_MAX);
	/* Elements which are within the acceptable range */
	const int16x8_t le_max =
		vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
	const int16x8_t gt_max =
		vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
	/* Final values as 16-bit integers */
	const int16x8_t values = vorrq_s16(le_max, gt_max);
	r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if (a_.i16[i] > UINT8_MAX) {
			r_.u8[i] = UINT8_MAX;
		} else if (a_.i16[i] < 0) {
			r_.u8[i] = 0;
		} else {
			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
		}
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if (b_.i16[i] > UINT8_MAX) {
			r_.u8[i + 4] = UINT8_MAX;
		} else if (b_.i16[i] < 0) {
			r_.u8[i + 4] = 0;
		} else {
			r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
#define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
#endif
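/* Constructors: the _mm_set_ functions take arguments from the highest element
 * down to e0 (the lowest lane), the _mm_setr_ functions take them in memory
 * order, _mm_set1_ broadcasts a single value, and the simde_x_ variants are
 * SIMDe-specific unsigned/float/64-bit helpers. */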
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
			    int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
#else
	simde__m64_private r_;
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = {e0, e1, e2, e3,
							    e4, e5, e6, e7};
	r_.neon_i8 = vld1_s8(v);
#else
	r_.i8[0] = e0;
	r_.i8[1] = e1;
	r_.i8[2] = e2;
	r_.i8[3] = e3;
	r_.i8[4] = e4;
	r_.i8[5] = e5;
	r_.i8[6] = e6;
	r_.i8[7] = e7;
#endif
	return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
	simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
			      uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
{
	simde__m64_private r_;
#if defined(SIMDE_X86_MMX_NATIVE)
	r_.n = _mm_set_pi8(
		HEDLEY_STATIC_CAST(int8_t, e7), HEDLEY_STATIC_CAST(int8_t, e6),
		HEDLEY_STATIC_CAST(int8_t, e5), HEDLEY_STATIC_CAST(int8_t, e4),
		HEDLEY_STATIC_CAST(int8_t, e3), HEDLEY_STATIC_CAST(int8_t, e2),
		HEDLEY_STATIC_CAST(int8_t, e1), HEDLEY_STATIC_CAST(int8_t, e0));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = {e0, e1, e2, e3,
							     e4, e5, e6, e7};
	r_.neon_u8 = vld1_u8(v);
#else
	r_.u8[0] = e0;
	r_.u8[1] = e1;
	r_.u8[2] = e2;
	r_.u8[3] = e3;
	r_.u8[4] = e4;
	r_.u8[5] = e5;
	r_.u8[6] = e6;
	r_.u8[7] = e7;
#endif
	return simde__m64_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_set_pi16(e3, e2, e1, e0);
#else
	simde__m64_private r_;
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = {e0, e1, e2, e3};
	r_.neon_i16 = vld1_s16(v);
#else
	r_.i16[0] = e0;
	r_.i16[1] = e1;
	r_.i16[2] = e2;
	r_.i16[3] = e3;
#endif
	return simde__m64_from_private(r_);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
			       uint16_t e0)
{
	simde__m64_private r_;
#if defined(SIMDE_X86_MMX_NATIVE)
	r_.n = _mm_set_pi16(HEDLEY_STATIC_CAST(int16_t, e3),
			    HEDLEY_STATIC_CAST(int16_t, e2),
			    HEDLEY_STATIC_CAST(int16_t, e1),
			    HEDLEY_STATIC_CAST(int16_t, e0));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = {e0, e1, e2, e3};
	r_.neon_u16 = vld1_u16(v);
#else
	r_.u16[0] = e0;
	r_.u16[1] = e1;
	r_.u16[2] = e2;
	r_.u16[3] = e3;
#endif
	return simde__m64_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
{
	simde__m64_private r_;
#if defined(SIMDE_X86_MMX_NATIVE)
	r_.n = _mm_set_pi32(HEDLEY_STATIC_CAST(int32_t, e1),
			    HEDLEY_STATIC_CAST(int32_t, e0));
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = {e0, e1};
	r_.neon_u32 = vld1_u32(v);
#else
	r_.u32[0] = e0;
	r_.u32[1] = e1;
#endif
	return simde__m64_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
{
	simde__m64_private r_;
#if defined(SIMDE_X86_MMX_NATIVE)
	r_.n = _mm_set_pi32(e1, e0);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = {e0, e1};
	r_.neon_i32 = vld1_s32(v);
#else
	r_.i32[0] = e0;
	r_.i32[1] = e1;
#endif
	return simde__m64_from_private(r_);
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pi64(int64_t e0)
{
	simde__m64_private r_;
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = {e0};
	r_.neon_i64 = vld1_s64(v);
#else
	r_.i64[0] = e0;
#endif
	return simde__m64_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_f32x2(simde_float32 e1, simde_float32 e0)
{
	simde__m64_private r_;
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = {e0, e1};
	r_.neon_f32 = vld1_f32(v);
#else
	r_.f32[0] = e0;
	r_.f32[1] = e1;
#endif
	return simde__m64_from_private(r_);
}
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi8(int8_t a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_set1_pi8(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	simde__m64_private r_;
	r_.neon_i8 = vmov_n_s8(a);
	return simde__m64_from_private(r_);
#else
	return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi16(int16_t a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_set1_pi16(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	simde__m64_private r_;
	r_.neon_i16 = vmov_n_s16(a);
	return simde__m64_from_private(r_);
#else
	return simde_mm_set_pi16(a, a, a, a);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set1_pi32(int32_t a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_set1_pi32(a);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	simde__m64_private r_;
	r_.neon_i32 = vmov_n_s32(a);
	return simde__m64_from_private(r_);
#else
	return simde_mm_set_pi32(a, a);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
			     int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
#else
	return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
	simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_setr_pi16(e3, e2, e1, e0);
#else
	return simde_mm_set_pi16(e0, e1, e2, e3);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_setr_pi32(e1, e0);
#else
	return simde_mm_set_pi32(e0, e1);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setzero_si64(void)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_setzero_si64();
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	simde__m64_private r_;
	r_.neon_u32 = vmov_n_u32(0);
	return simde__m64_from_private(r_);
#else
	return simde_mm_set_pi32(0, 0);
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_setzero_si64() simde_mm_setzero_si64()
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_setone_si64(void)
{
	return simde_mm_set1_pi32(~INT32_C(0));
}
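/* Shifts: the sll/srl forms shift by the count held in the low 64 bits of
 * another __m64 and the slli forms by an immediate int; shift counts of at
 * least the element width produce zero. */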
  1160. SIMDE_FUNCTION_ATTRIBUTES
  1161. simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
  1162. {
  1163. #if defined(SIMDE_X86_MMX_NATIVE)
  1164. return _mm_sll_pi16(a, count);
  1165. #else
  1166. simde__m64_private r_;
  1167. simde__m64_private a_ = simde__m64_to_private(a);
  1168. simde__m64_private count_ = simde__m64_to_private(count);
  1169. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1170. r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)vget_lane_u64(
  1171. count_.neon_u64, 0)));
  1172. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  1173. r_.i16 = a_.i16 << count_.u64[0];
  1174. #else
  1175. if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
  1176. simde_memset(&r_, 0, sizeof(r_));
  1177. return simde__m64_from_private(r_);
  1178. }
  1179. SIMDE_VECTORIZE
  1180. for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
  1181. r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
  1182. a_.u16[i] << count_.u64[0]);
  1183. }
  1184. #endif
  1185. return simde__m64_from_private(r_);
  1186. #endif
  1187. }
  1188. #define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
  1189. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1190. #define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
  1191. #define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
  1192. #endif
  1193. SIMDE_FUNCTION_ATTRIBUTES
  1194. simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
  1195. {
  1196. #if defined(SIMDE_X86_MMX_NATIVE)
  1197. return _mm_sll_pi32(a, count);
  1198. #else
  1199. simde__m64_private r_;
  1200. simde__m64_private a_ = simde__m64_to_private(a);
  1201. simde__m64_private count_ = simde__m64_to_private(count);
  1202. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1203. r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)vget_lane_u64(
  1204. count_.neon_u64, 0)));
  1205. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  1206. r_.i32 = a_.i32 << count_.u64[0];
  1207. #else
  1208. if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
  1209. simde_memset(&r_, 0, sizeof(r_));
  1210. return simde__m64_from_private(r_);
  1211. }
  1212. SIMDE_VECTORIZE
  1213. for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
  1214. r_.u32[i] = a_.u32[i] << count_.u64[0];
  1215. }
  1216. #endif
  1217. return simde__m64_from_private(r_);
  1218. #endif
  1219. }
  1220. #define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
  1221. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1222. #define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
  1223. #define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
  1224. #endif
  1225. SIMDE_FUNCTION_ATTRIBUTES
  1226. simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
  1227. {
  1228. #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
  1229. return _mm_slli_pi16(a, count);
  1230. #else
  1231. simde__m64_private r_;
  1232. simde__m64_private a_ = simde__m64_to_private(a);
  1233. #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  1234. r_.i16 = a_.i16 << count;
  1235. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1236. r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
  1237. #else
  1238. SIMDE_VECTORIZE
  1239. for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
  1240. r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count);
  1241. }
  1242. #endif
  1243. return simde__m64_from_private(r_);
  1244. #endif
  1245. }
  1246. #define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
  1247. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1248. #define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
  1249. #define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
  1250. #endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_slli_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 << count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
		r_.u32[i] = a_.u32[i] << count;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
#define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_slli_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i64 = a_.i64 << count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t)count));
#else
	r_.u64[0] = a_.u64[0] << count;
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
#define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sll_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i64 = a_.i64 << count_.i64;
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	r_.u64[0] = a_.u64[0] << count_.u64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
#define _m_psllq(a, count) simde_mm_sll_si64(a, count)
#endif
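
/* Note (added for clarity; not part of the upstream header): for the
 * register-count logical shifts above, the MMX semantics use the full low
 * 64 bits of `count`, so a count larger than the element width minus one
 * (63 for the 64-bit form) produces an all-zero result rather than a
 * C-level undefined shift; the scalar fallback paths check for this
 * explicitly before shifting. simde_mm_cvtsi64_m64 is assumed to be the
 * converter declared earlier in this header.
 *
 *     simde__m64 r = simde_mm_sll_si64(simde_mm_cvtsi64_m64(1),
 *                                      simde_mm_cvtsi64_m64(70)); // 0
 */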

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_srl_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u16 = a_.u16 >> count_.u64[0];
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vshl_u16(
		a_.neon_u16,
		vmov_n_s16(-((int16_t)vget_lane_u64(count_.neon_u64, 0))));
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < sizeof(r_.u16) / sizeof(r_.u16[0]); i++) {
		r_.u16[i] = a_.u16[i] >> count_.u64[0];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
#define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_srl_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u32 = a_.u32 >> count_.u64[0];
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u32 = vshl_u32(
		a_.neon_u32,
		vmov_n_s32(-((int32_t)vget_lane_u64(count_.neon_u64, 0))));
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < sizeof(r_.u32) / sizeof(r_.u32[0]); i++) {
		r_.u32[i] = a_.u32[i] >> count_.u64[0];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
#define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srli_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u16 = a_.u16 >> count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
		r_.u16[i] = a_.u16[i] >> count;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
#define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srli_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u32 = a_.u32 >> count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
		r_.u32[i] = a_.u32[i] >> count;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
#define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srli_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u64 = a_.u64 >> count;
#else
	r_.u64[0] = a_.u64[0] >> count;
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
#define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
#endif
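
/* Usage sketch (illustrative only): the *_srli_* forms shift unsigned lanes
 * right by an immediate count, inserting zero bits. simde_mm_set_pi16 is
 * assumed to be the setter declared earlier in this header.
 *
 *     simde__m64 r =
 *             simde_mm_srli_pi16(simde_mm_set_pi16(-16, 64, 32, 16), 4);
 *     // lanes from low to high: {1, 2, 4, 0x0FFF}; the -16 lane is read
 *     // as 0xFFF0, so the logical shift gives 0x0FFF rather than -1
 */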

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_srl_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.u64 = a_.u64 >> count_.u64;
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	r_.u64[0] = a_.u64[0] >> count_.u64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
#define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srai_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i16 = a_.i16 >> (count & 0xff);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vshl_s16(a_.neon_i16,
		vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] >> (count & 0xff);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
#define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srai_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 >> (count & 0xff);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vshl_s32(a_.neon_i32,
		vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] >> (count & 0xff);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
#define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
#endif
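
/* Usage sketch (illustrative only): the *_srai_* forms are arithmetic right
 * shifts, so the sign bit of each signed lane is replicated.
 *
 *     simde__m64 r = simde_mm_srai_pi32(simde_mm_set_pi32(-8, 8), 2);
 *     // lanes from low to high: {2, -2}
 */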

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sra_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
	const int cnt = HEDLEY_STATIC_CAST(
		int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i16 = a_.i16 >> cnt;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 =
		vshl_s16(a_.neon_i16,
			vmov_n_s16(-HEDLEY_STATIC_CAST(
				int16_t, vget_lane_u64(count_.neon_u64, 0))));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] >> cnt;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
#define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sra_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
	const int32_t cnt = (count_.u64[0] > 31)
		? 31
		: HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 >> cnt;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 =
		vshl_s32(a_.neon_i32,
			vmov_n_s32(-HEDLEY_STATIC_CAST(
				int32_t, vget_lane_u64(count_.neon_u64, 0))));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] >> cnt;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
#define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
#endif
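
/* Note (added for clarity; not part of the upstream header): unlike the
 * logical shifts, the register-count arithmetic shifts above clamp the
 * count to the lane width minus one (15 for simde_mm_sra_pi16, 31 for
 * simde_mm_sra_pi32), which matches the MMX behaviour of filling each lane
 * with copies of its sign bit when the count is oversized.
 *
 *     simde__m64 r = simde_mm_sra_pi32(simde_mm_set_pi32(-1, 1),
 *                                      simde_mm_cvtsi64_m64(99));
 *     // lanes from low to high: {0, -1}
 */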

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sub_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i8 = a_.i8 - b_.i8;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		r_.i8[i] = a_.i8[i] - b_.i8[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
#define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sub_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i16 = a_.i16 - b_.i16;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] - b_.i16[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
#define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sub_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i32 = a_.i32 - b_.i32;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] - b_.i32[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
#define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
			r_.i8[i] = INT8_MIN;
		} else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
			r_.i8[i] = INT8_MAX;
		} else {
			r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
#define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
#endif
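
/* Usage sketch (illustrative only): the *_subs_* forms subtract with
 * saturation instead of wrapping. simde_mm_set1_pi8 is assumed to be the
 * splat helper declared earlier in this header.
 *
 *     simde__m64 r = simde_mm_subs_pi8(simde_mm_set1_pi8(-100),
 *                                      simde_mm_set1_pi8(100));
 *     // every lane saturates to INT8_MIN (-128) instead of wrapping to 56
 */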

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pu8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
		const int32_t x = a_.u8[i] - b_.u8[i];
		if (x < 0) {
			r_.u8[i] = 0;
		} else if (x > UINT8_MAX) {
			r_.u8[i] = UINT8_MAX;
		} else {
			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
#define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
#endif
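
/* Usage sketch (illustrative only): the unsigned variant clamps at zero,
 * which is the usual building block for absolute-difference style work on
 * pixel data.
 *
 *     simde__m64 r = simde_mm_subs_pu8(simde_mm_set1_pi8(10),
 *                                      simde_mm_set1_pi8(25));
 *     // every lane is 0 rather than wrapping to 241
 */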

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
			r_.i16[i] = INT16_MIN;
		} else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
			r_.i16[i] = INT16_MAX;
		} else {
			r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
#define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pu16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
		const int32_t x = a_.u16[i] - b_.u16[i];
		if (x < 0) {
			r_.u16[i] = 0;
		} else if (x > UINT16_MAX) {
			r_.u16[i] = UINT16_MAX;
		} else {
			r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
#define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpackhi_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15);
#else
	r_.i8[0] = a_.i8[4];
	r_.i8[1] = b_.i8[4];
	r_.i8[2] = a_.i8[5];
	r_.i8[3] = b_.i8[5];
	r_.i8[4] = a_.i8[6];
	r_.i8[5] = b_.i8[6];
	r_.i8[6] = a_.i8[7];
	r_.i8[7] = b_.i8[7];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
#define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
#endif
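
/* Usage sketch (illustrative only): unpackhi interleaves the upper halves
 * of its operands, taking alternately from `a` and `b`. simde_mm_set_pi8 is
 * assumed to be the setter declared earlier in this header.
 *
 *     simde__m64 a = simde_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
 *     simde__m64 b = simde_mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10);
 *     simde__m64 r = simde_mm_unpackhi_pi8(a, b);
 *     // bytes from low to high: {4, 14, 5, 15, 6, 16, 7, 17}
 */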

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpackhi_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
#else
	r_.i16[0] = a_.i16[2];
	r_.i16[1] = b_.i16[2];
	r_.i16[2] = a_.i16[3];
	r_.i16[3] = b_.i16[3];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
#define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpackhi_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
#else
	r_.i32[0] = a_.i32[1];
	r_.i32[1] = b_.i32[1];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
#define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpacklo_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3, 11);
#else
	r_.i8[0] = a_.i8[0];
	r_.i8[1] = b_.i8[0];
	r_.i8[2] = a_.i8[1];
	r_.i8[3] = b_.i8[1];
	r_.i8[4] = a_.i8[2];
	r_.i8[5] = b_.i8[2];
	r_.i8[6] = a_.i8[3];
	r_.i8[7] = b_.i8[3];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
#define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
#endif
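
/* Usage sketch (illustrative only): unpacklo is the counterpart that
 * interleaves the lower halves of its operands.
 *
 *     simde__m64 r = simde_mm_unpacklo_pi8(
 *             simde_mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0),
 *             simde_mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10));
 *     // bytes from low to high: {0, 10, 1, 11, 2, 12, 3, 13}
 */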

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpacklo_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
#else
	r_.i16[0] = a_.i16[0];
	r_.i16[1] = b_.i16[0];
	r_.i16[2] = a_.i16[1];
	r_.i16[3] = b_.i16[1];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
#define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpacklo_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
#else
	r_.i32[0] = a_.i32[0];
	r_.i32[1] = b_.i32[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
#define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_xor_si64(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i32f = a_.i32f ^ b_.i32f;
#else
	r_.u64[0] = a_.u64[0] ^ b_.u64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
#define _m_pxor(a, b) simde_mm_xor_si64(a, b)
#endif
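
/* Usage sketch (illustrative only): XOR-ing a register with itself is the
 * usual idiom for zeroing an MMX register; here `x` stands for any
 * previously initialized simde__m64 value.
 *
 *     simde__m64 zero = simde_mm_xor_si64(x, x); // all 64 bits cleared
 */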

SIMDE_FUNCTION_ATTRIBUTES
int32_t simde_m_to_int(simde__m64 a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _m_to_int(a);
#else
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	return vget_lane_s32(a_.neon_i32, 0);
#else
	return a_.i32[0];
#endif
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _m_to_int(a) simde_m_to_int(a)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_MMX_H) */