mmx.h
  1. /* SPDX-License-Identifier: MIT
  2. *
  3. * Permission is hereby granted, free of charge, to any person
  4. * obtaining a copy of this software and associated documentation
  5. * files (the "Software"), to deal in the Software without
  6. * restriction, including without limitation the rights to use, copy,
  7. * modify, merge, publish, distribute, sublicense, and/or sell copies
  8. * of the Software, and to permit persons to whom the Software is
  9. * furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be
  12. * included in all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  18. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  19. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. * SOFTWARE.
  22. *
  23. * Copyright:
  24. * 2017-2020 Evan Nemerson <[email protected]>
  25. */
  26. #if !defined(SIMDE_X86_MMX_H)
  27. #define SIMDE_X86_MMX_H
  28. #include "../simde-common.h"
  29. HEDLEY_DIAGNOSTIC_PUSH
  30. SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
  31. #if defined(SIMDE_X86_MMX_NATIVE)
  32. #define SIMDE_X86_MMX_USE_NATIVE_TYPE
  33. #elif defined(SIMDE_X86_SSE_NATIVE)
  34. #define SIMDE_X86_MMX_USE_NATIVE_TYPE
  35. #endif
  36. #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
  37. #include <mmintrin.h>
  38. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  39. #include <arm_neon.h>
  40. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  41. #include <loongson-mmiintrin.h>
  42. #endif
  43. #include <stdint.h>
  44. #include <limits.h>
  45. SIMDE_BEGIN_DECLS_
  46. typedef union {
  47. #if defined(SIMDE_VECTOR_SUBSCRIPT)
  48. SIMDE_ALIGN_TO_8 int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  49. SIMDE_ALIGN_TO_8 int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  50. SIMDE_ALIGN_TO_8 int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  51. SIMDE_ALIGN_TO_8 int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  52. SIMDE_ALIGN_TO_8 uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  53. SIMDE_ALIGN_TO_8 uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  54. SIMDE_ALIGN_TO_8 uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  55. SIMDE_ALIGN_TO_8 uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  56. SIMDE_ALIGN_TO_8 simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  57. SIMDE_ALIGN_TO_8 int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  58. SIMDE_ALIGN_TO_8 uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  59. #else
  60. SIMDE_ALIGN_TO_8 int8_t i8[8];
  61. SIMDE_ALIGN_TO_8 int16_t i16[4];
  62. SIMDE_ALIGN_TO_8 int32_t i32[2];
  63. SIMDE_ALIGN_TO_8 int64_t i64[1];
  64. SIMDE_ALIGN_TO_8 uint8_t u8[8];
  65. SIMDE_ALIGN_TO_8 uint16_t u16[4];
  66. SIMDE_ALIGN_TO_8 uint32_t u32[2];
  67. SIMDE_ALIGN_TO_8 uint64_t u64[1];
  68. SIMDE_ALIGN_TO_8 simde_float32 f32[2];
  69. SIMDE_ALIGN_TO_8 int_fast32_t i32f[8 / sizeof(int_fast32_t)];
  70. SIMDE_ALIGN_TO_8 uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
  71. #endif
  72. #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
  73. __m64 n;
  74. #endif
  75. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  76. int8x8_t neon_i8;
  77. int16x4_t neon_i16;
  78. int32x2_t neon_i32;
  79. int64x1_t neon_i64;
  80. uint8x8_t neon_u8;
  81. uint16x4_t neon_u16;
  82. uint32x2_t neon_u32;
  83. uint64x1_t neon_u64;
  84. float32x2_t neon_f32;
  85. #endif
  86. #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  87. int8x8_t mmi_i8;
  88. int16x4_t mmi_i16;
  89. int32x2_t mmi_i32;
  90. int64_t mmi_i64;
  91. uint8x8_t mmi_u8;
  92. uint16x4_t mmi_u16;
  93. uint32x2_t mmi_u32;
  94. uint64_t mmi_u64;
  95. #endif
  96. } simde__m64_private;
  97. #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
  98. typedef __m64 simde__m64;
  99. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  100. typedef int32x2_t simde__m64;
  101. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  102. typedef int32x2_t simde__m64;
  103. #elif defined(SIMDE_VECTOR_SUBSCRIPT)
  104. typedef int32_t simde__m64 SIMDE_ALIGN_TO_8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
  105. #else
  106. typedef simde__m64_private simde__m64;
  107. #endif
  108. #if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && \
  109. defined(SIMDE_ENABLE_NATIVE_ALIASES)
  110. #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
  111. typedef simde__m64 __m64;
  112. #endif
  113. HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
  114. HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
  115. #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
  116. HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8,
  117. "simde__m64 is not 8-byte aligned");
  118. HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8,
  119. "simde__m64_private is not 8-byte aligned");
  120. #endif
  121. SIMDE_FUNCTION_ATTRIBUTES
  122. simde__m64 simde__m64_from_private(simde__m64_private v)
  123. {
  124. simde__m64 r;
  125. simde_memcpy(&r, &v, sizeof(r));
  126. return r;
  127. }
  128. SIMDE_FUNCTION_ATTRIBUTES
  129. simde__m64_private simde__m64_to_private(simde__m64 v)
  130. {
  131. simde__m64_private r;
  132. simde_memcpy(&r, &v, sizeof(r));
  133. return r;
  134. }
  135. #define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, \
  136. fragment) \
  137. SIMDE_FUNCTION_ATTRIBUTES \
  138. simde__##simde_type simde__##simde_type##_from_##isax##_##fragment( \
  139. source_type value) \
  140. { \
  141. simde__##simde_type##_private r_; \
  142. r_.isax##_##fragment = value; \
  143. return simde__##simde_type##_from_private(r_); \
  144. } \
  145. \
  146. SIMDE_FUNCTION_ATTRIBUTES \
  147. source_type simde__##simde_type##_to_##isax##_##fragment( \
  148. simde__##simde_type value) \
  149. { \
  150. simde__##simde_type##_private r_ = \
  151. simde__##simde_type##_to_private(value); \
  152. return r_.isax##_##fragment; \
  153. }
  154. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  155. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8)
  156. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16)
  157. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32)
  158. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64)
  159. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8)
  160. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16)
  161. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32)
  162. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
  163. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
  164. #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
  165. #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  166. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, mmi, i8)
  167. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, mmi, i16)
  168. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, mmi, i32)
  169. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64_t, mmi, i64)
  170. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, mmi, u8)
  171. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, mmi, u16)
  172. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, mmi, u32)
  173. SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64_t, mmi, u64)
  174. #endif /* defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) */
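/* Usage sketch for the generated conversion helpers: with the arguments
 * above, the macro expands to pairs such as simde__m64_from_neon_i8() and
 * simde__m64_to_neon_i8(), which round-trip a platform vector through
 * simde__m64_private. On an ARMv7 NEON build, for example:
 *
 *   int8x8_t v = vmov_n_s8(1);
 *   simde__m64 m = simde__m64_from_neon_i8(v);
 *   int8x8_t w = simde__m64_to_neon_i8(m);      (w now holds the same lanes as v)
 */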
  175. SIMDE_FUNCTION_ATTRIBUTES
  176. simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
  177. {
  178. #if defined(SIMDE_X86_MMX_NATIVE)
  179. return _mm_add_pi8(a, b);
  180. #else
  181. simde__m64_private r_;
  182. simde__m64_private a_ = simde__m64_to_private(a);
  183. simde__m64_private b_ = simde__m64_to_private(b);
  184. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  185. r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
  186. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  187. r_.mmi_i8 = paddb_s(a_.mmi_i8, b_.mmi_i8);
  188. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  189. r_.i8 = a_.i8 + b_.i8;
  190. #else
  191. SIMDE_VECTORIZE
  192. for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
  193. r_.i8[i] = a_.i8[i] + b_.i8[i];
  194. }
  195. #endif
  196. return simde__m64_from_private(r_);
  197. #endif
  198. }
  199. #define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
  200. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  201. #define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
  202. #define _m_paddb(a, b) simde_m_paddb(a, b)
  203. #endif
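/* Usage sketch: simde_mm_add_pi8() is a drop-in for _mm_add_pi8() and
 * performs an independent 8-bit add in each of the eight lanes, e.g.
 *
 *   simde__m64 a = simde_mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
 *   simde__m64 b = simde_mm_set1_pi8(1);
 *   simde__m64 r = simde_mm_add_pi8(a, b);      (each lane of a, plus 1)
 */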
  204. SIMDE_FUNCTION_ATTRIBUTES
  205. simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
  206. {
  207. #if defined(SIMDE_X86_MMX_NATIVE)
  208. return _mm_add_pi16(a, b);
  209. #else
  210. simde__m64_private r_;
  211. simde__m64_private a_ = simde__m64_to_private(a);
  212. simde__m64_private b_ = simde__m64_to_private(b);
  213. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  214. r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
  215. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  216. r_.mmi_i16 = paddh_s(a_.mmi_i16, b_.mmi_i16);
  217. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  218. r_.i16 = a_.i16 + b_.i16;
  219. #else
  220. SIMDE_VECTORIZE
  221. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  222. r_.i16[i] = a_.i16[i] + b_.i16[i];
  223. }
  224. #endif
  225. return simde__m64_from_private(r_);
  226. #endif
  227. }
  228. #define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
  229. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  230. #define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
  231. #define _m_paddw(a, b) simde_mm_add_pi16(a, b)
  232. #endif
  233. SIMDE_FUNCTION_ATTRIBUTES
  234. simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
  235. {
  236. #if defined(SIMDE_X86_MMX_NATIVE)
  237. return _mm_add_pi32(a, b);
  238. #else
  239. simde__m64_private r_;
  240. simde__m64_private a_ = simde__m64_to_private(a);
  241. simde__m64_private b_ = simde__m64_to_private(b);
  242. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  243. r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
  244. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  245. r_.mmi_i32 = paddw_s(a_.mmi_i32, b_.mmi_i32);
  246. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  247. r_.i32 = a_.i32 + b_.i32;
  248. #else
  249. SIMDE_VECTORIZE
  250. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  251. r_.i32[i] = a_.i32[i] + b_.i32[i];
  252. }
  253. #endif
  254. return simde__m64_from_private(r_);
  255. #endif
  256. }
  257. #define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
  258. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  259. #define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
  260. #define _m_paddd(a, b) simde_mm_add_pi32(a, b)
  261. #endif
  262. SIMDE_FUNCTION_ATTRIBUTES
  263. simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
  264. {
  265. #if defined(SIMDE_X86_MMX_NATIVE)
  266. return _mm_adds_pi8(a, b);
  267. #else
  268. simde__m64_private r_, a_ = simde__m64_to_private(a),
  269. b_ = simde__m64_to_private(b);
  270. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  271. r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
  272. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  273. r_.mmi_i8 = paddsb(a_.mmi_i8, b_.mmi_i8);
  274. #else
  275. SIMDE_VECTORIZE
  276. for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
  277. if ((((b_.i8[i]) > 0) &&
  278. ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
  279. r_.i8[i] = INT8_MAX;
  280. } else if ((((b_.i8[i]) < 0) &&
  281. ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
  282. r_.i8[i] = INT8_MIN;
  283. } else {
  284. r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
  285. }
  286. }
  287. #endif
  288. return simde__m64_from_private(r_);
  289. #endif
  290. }
  291. #define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
  292. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  293. #define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
  294. #define _m_paddsb(a, b) simde_mm_adds_pi8(a, b)
  295. #endif
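/* Saturation note: unlike simde_mm_add_pi8(), the saturating add family
 * clamps on overflow instead of wrapping, so in the portable loop above
 * 127 + 1 saturates to INT8_MAX (127) and -128 - 1 saturates to INT8_MIN. */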
  296. SIMDE_FUNCTION_ATTRIBUTES
  297. simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
  298. {
  299. #if defined(SIMDE_X86_MMX_NATIVE)
  300. return _mm_adds_pu8(a, b);
  301. #else
  302. simde__m64_private r_;
  303. simde__m64_private a_ = simde__m64_to_private(a);
  304. simde__m64_private b_ = simde__m64_to_private(b);
  305. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  306. r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
  307. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  308. r_.mmi_u8 = paddusb(a_.mmi_u8, b_.mmi_u8);
  309. #else
  310. SIMDE_VECTORIZE
  311. for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
  312. const uint_fast16_t x =
  313. HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) +
  314. HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
  315. if (x > UINT8_MAX)
  316. r_.u8[i] = UINT8_MAX;
  317. else
  318. r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
  319. }
  320. #endif
  321. return simde__m64_from_private(r_);
  322. #endif
  323. }
  324. #define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
  325. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  326. #define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
  327. #define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
  328. #endif
  329. SIMDE_FUNCTION_ATTRIBUTES
  330. simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
  331. {
  332. #if defined(SIMDE_X86_MMX_NATIVE)
  333. return _mm_adds_pi16(a, b);
  334. #else
  335. simde__m64_private r_;
  336. simde__m64_private a_ = simde__m64_to_private(a);
  337. simde__m64_private b_ = simde__m64_to_private(b);
  338. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  339. r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
  340. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  341. r_.mmi_i16 = paddsh(a_.mmi_i16, b_.mmi_i16);
  342. #else
  343. SIMDE_VECTORIZE
  344. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  345. if ((((b_.i16[i]) > 0) &&
  346. ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
  347. r_.i16[i] = INT16_MAX;
  348. } else if ((((b_.i16[i]) < 0) &&
((a_.i16[i]) < (INT16_MIN - (b_.i16[i]))))) {
r_.i16[i] = INT16_MIN;
  351. } else {
  352. r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
  353. }
  354. }
  355. #endif
  356. return simde__m64_from_private(r_);
  357. #endif
  358. }
  359. #define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
  360. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  361. #define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
  362. #define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
  363. #endif
  364. SIMDE_FUNCTION_ATTRIBUTES
  365. simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
  366. {
  367. #if defined(SIMDE_X86_MMX_NATIVE)
  368. return _mm_adds_pu16(a, b);
  369. #else
  370. simde__m64_private r_;
  371. simde__m64_private a_ = simde__m64_to_private(a);
  372. simde__m64_private b_ = simde__m64_to_private(b);
  373. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  374. r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
  375. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  376. r_.mmi_u16 = paddush(a_.mmi_u16, b_.mmi_u16);
  377. #else
  378. SIMDE_VECTORIZE
  379. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  380. const uint32_t x = a_.u16[i] + b_.u16[i];
  381. if (x > UINT16_MAX)
  382. r_.u16[i] = UINT16_MAX;
  383. else
  384. r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
  385. }
  386. #endif
  387. return simde__m64_from_private(r_);
  388. #endif
  389. }
  390. #define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
  391. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  392. #define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
  393. #define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
  394. #endif
  395. SIMDE_FUNCTION_ATTRIBUTES
  396. simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
  397. {
  398. #if defined(SIMDE_X86_MMX_NATIVE)
  399. return _mm_and_si64(a, b);
  400. #else
  401. simde__m64_private r_;
  402. simde__m64_private a_ = simde__m64_to_private(a);
  403. simde__m64_private b_ = simde__m64_to_private(b);
  404. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  405. r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
  406. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  407. r_.i64 = a_.i64 & b_.i64;
  408. #else
  409. r_.i64[0] = a_.i64[0] & b_.i64[0];
  410. #endif
  411. return simde__m64_from_private(r_);
  412. #endif
  413. }
  414. #define simde_m_pand(a, b) simde_mm_and_si64(a, b)
  415. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  416. #define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
  417. #define _m_pand(a, b) simde_mm_and_si64(a, b)
  418. #endif
  419. SIMDE_FUNCTION_ATTRIBUTES
  420. simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
  421. {
  422. #if defined(SIMDE_X86_MMX_NATIVE)
  423. return _mm_andnot_si64(a, b);
  424. #else
  425. simde__m64_private r_;
  426. simde__m64_private a_ = simde__m64_to_private(a);
  427. simde__m64_private b_ = simde__m64_to_private(b);
  428. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  429. r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
  430. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  431. r_.mmi_i32 = pandn_sw(a_.mmi_i32, b_.mmi_i32);
  432. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  433. r_.i32f = ~a_.i32f & b_.i32f;
  434. #else
  435. r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
  436. #endif
  437. return simde__m64_from_private(r_);
  438. #endif
  439. }
  440. #define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
  441. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  442. #define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
  443. #define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
  444. #endif
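/* Operand-order note: as with the native intrinsic, the result is (~a) & b,
 * i.e. the FIRST operand is the one that gets complemented. For some
 * simde__m64 value x:
 *
 *   simde__m64 mask = simde_mm_set1_pi8(0x0F);
 *   simde__m64 r = simde_mm_andnot_si64(mask, x);   (clears the low four
 *                                                     bits of every byte of x)
 */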
  445. SIMDE_FUNCTION_ATTRIBUTES
  446. simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
  447. {
  448. #if defined(SIMDE_X86_MMX_NATIVE)
  449. return _mm_cmpeq_pi8(a, b);
  450. #else
  451. simde__m64_private r_;
  452. simde__m64_private a_ = simde__m64_to_private(a);
  453. simde__m64_private b_ = simde__m64_to_private(b);
  454. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  455. r_.neon_u8 = vceq_s8(a_.neon_i8, b_.neon_i8);
  456. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  457. r_.mmi_i8 = pcmpeqb_s(a_.mmi_i8, b_.mmi_i8);
  458. #else
  459. SIMDE_VECTORIZE
  460. for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
  461. r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  462. }
  463. #endif
  464. return simde__m64_from_private(r_);
  465. #endif
  466. }
  467. #define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
  468. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  469. #define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
  470. #define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
  471. #endif
  472. SIMDE_FUNCTION_ATTRIBUTES
  473. simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
  474. {
  475. #if defined(SIMDE_X86_MMX_NATIVE)
  476. return _mm_cmpeq_pi16(a, b);
  477. #else
  478. simde__m64_private r_;
  479. simde__m64_private a_ = simde__m64_to_private(a);
  480. simde__m64_private b_ = simde__m64_to_private(b);
  481. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  482. r_.neon_u16 = vceq_s16(a_.neon_i16, b_.neon_i16);
  483. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  484. r_.mmi_i16 = pcmpeqh_s(a_.mmi_i16, b_.mmi_i16);
  485. #else
  486. SIMDE_VECTORIZE
  487. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  488. r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  489. }
  490. #endif
  491. return simde__m64_from_private(r_);
  492. #endif
  493. }
  494. #define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
  495. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  496. #define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
  497. #define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
  498. #endif
  499. SIMDE_FUNCTION_ATTRIBUTES
  500. simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
  501. {
  502. #if defined(SIMDE_X86_MMX_NATIVE)
  503. return _mm_cmpeq_pi32(a, b);
  504. #else
  505. simde__m64_private r_;
  506. simde__m64_private a_ = simde__m64_to_private(a);
  507. simde__m64_private b_ = simde__m64_to_private(b);
  508. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  509. r_.neon_u32 = vceq_s32(a_.neon_i32, b_.neon_i32);
  510. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  511. r_.mmi_i32 = pcmpeqw_s(a_.mmi_i32, b_.mmi_i32);
  512. #else
  513. SIMDE_VECTORIZE
  514. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  515. r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  516. }
  517. #endif
  518. return simde__m64_from_private(r_);
  519. #endif
  520. }
  521. #define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
  522. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  523. #define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
  524. #define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
  525. #endif
  526. SIMDE_FUNCTION_ATTRIBUTES
  527. simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
  528. {
  529. #if defined(SIMDE_X86_MMX_NATIVE)
  530. return _mm_cmpgt_pi8(a, b);
  531. #else
  532. simde__m64_private r_;
  533. simde__m64_private a_ = simde__m64_to_private(a);
  534. simde__m64_private b_ = simde__m64_to_private(b);
  535. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  536. r_.neon_u8 = vcgt_s8(a_.neon_i8, b_.neon_i8);
  537. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  538. r_.mmi_i8 = pcmpgtb_s(a_.mmi_i8, b_.mmi_i8);
  539. #else
  540. SIMDE_VECTORIZE
  541. for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
  542. r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
  543. }
  544. #endif
  545. return simde__m64_from_private(r_);
  546. #endif
  547. }
  548. #define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
  549. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  550. #define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
  551. #define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
  552. #endif
  553. SIMDE_FUNCTION_ATTRIBUTES
  554. simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
  555. {
  556. #if defined(SIMDE_X86_MMX_NATIVE)
  557. return _mm_cmpgt_pi16(a, b);
  558. #else
  559. simde__m64_private r_;
  560. simde__m64_private a_ = simde__m64_to_private(a);
  561. simde__m64_private b_ = simde__m64_to_private(b);
  562. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  563. r_.neon_u16 = vcgt_s16(a_.neon_i16, b_.neon_i16);
  564. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  565. r_.mmi_i16 = pcmpgth_s(a_.mmi_i16, b_.mmi_i16);
  566. #else
  567. SIMDE_VECTORIZE
  568. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  569. r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
  570. }
  571. #endif
  572. return simde__m64_from_private(r_);
  573. #endif
  574. }
  575. #define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
  576. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  577. #define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
  578. #define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
  579. #endif
  580. SIMDE_FUNCTION_ATTRIBUTES
  581. simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
  582. {
  583. #if defined(SIMDE_X86_MMX_NATIVE)
  584. return _mm_cmpgt_pi32(a, b);
  585. #else
  586. simde__m64_private r_;
  587. simde__m64_private a_ = simde__m64_to_private(a);
  588. simde__m64_private b_ = simde__m64_to_private(b);
  589. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  590. r_.neon_u32 = vcgt_s32(a_.neon_i32, b_.neon_i32);
  591. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  592. r_.mmi_i32 = pcmpgtw_s(a_.mmi_i32, b_.mmi_i32);
  593. #else
  594. SIMDE_VECTORIZE
  595. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  596. r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
  597. }
  598. #endif
  599. return simde__m64_from_private(r_);
  600. #endif
  601. }
  602. #define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
  603. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  604. #define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
  605. #define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
  606. #endif
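/* The comparison helpers return a per-lane mask of all ones (match) or all
 * zeros (no match), so they compose with the bitwise helpers in this header
 * to build branch-free selects, e.g. a lane-wise signed 16-bit maximum:
 *
 *   simde__m64 gt  = simde_mm_cmpgt_pi16(a, b);
 *   simde__m64 max = simde_mm_or_si64(simde_mm_and_si64(gt, a),
 *                                     simde_mm_andnot_si64(gt, b));
 */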
  607. SIMDE_FUNCTION_ATTRIBUTES
  608. int64_t simde_mm_cvtm64_si64(simde__m64 a)
  609. {
  610. #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
  611. !defined(__PGI)
  612. return _mm_cvtm64_si64(a);
  613. #else
  614. simde__m64_private a_ = simde__m64_to_private(a);
  615. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  616. HEDLEY_DIAGNOSTIC_PUSH
  617. #if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
  618. SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
  619. #pragma clang diagnostic ignored "-Wvector-conversion"
  620. #endif
  621. return vget_lane_s64(a_.neon_i64, 0);
  622. HEDLEY_DIAGNOSTIC_POP
  623. #else
  624. return a_.i64[0];
  625. #endif
  626. #endif
  627. }
  628. #define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
  629. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  630. #define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
  631. #define _m_to_int64(a) simde_mm_cvtm64_si64(a)
  632. #endif
  633. SIMDE_FUNCTION_ATTRIBUTES
  634. simde__m64 simde_mm_cvtsi32_si64(int32_t a)
  635. {
  636. #if defined(SIMDE_X86_MMX_NATIVE)
  637. return _mm_cvtsi32_si64(a);
  638. #else
  639. simde__m64_private r_;
  640. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  641. const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = {a, 0};
  642. r_.neon_i32 = vld1_s32(av);
  643. #else
  644. r_.i32[0] = a;
  645. r_.i32[1] = 0;
  646. #endif
  647. return simde__m64_from_private(r_);
  648. #endif
  649. }
  650. #define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
  651. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  652. #define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
  653. #define _m_from_int(a) simde_mm_cvtsi32_si64(a)
  654. #endif
  655. SIMDE_FUNCTION_ATTRIBUTES
  656. simde__m64 simde_mm_cvtsi64_m64(int64_t a)
  657. {
  658. #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
  659. !defined(__PGI)
  660. return _mm_cvtsi64_m64(a);
  661. #else
  662. simde__m64_private r_;
  663. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  664. r_.neon_i64 = vld1_s64(&a);
  665. #else
  666. r_.i64[0] = a;
  667. #endif
  668. return simde__m64_from_private(r_);
  669. #endif
  670. }
  671. #define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
  672. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  673. #define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
  674. #define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
  675. #endif
  676. SIMDE_FUNCTION_ATTRIBUTES
  677. int32_t simde_mm_cvtsi64_si32(simde__m64 a)
  678. {
  679. #if defined(SIMDE_X86_MMX_NATIVE)
  680. return _mm_cvtsi64_si32(a);
  681. #else
  682. simde__m64_private a_ = simde__m64_to_private(a);
  683. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  684. HEDLEY_DIAGNOSTIC_PUSH
  685. #if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
  686. SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
  687. #pragma clang diagnostic ignored "-Wvector-conversion"
  688. #endif
  689. return vget_lane_s32(a_.neon_i32, 0);
  690. HEDLEY_DIAGNOSTIC_POP
  691. #else
  692. return a_.i32[0];
  693. #endif
  694. #endif
  695. }
  696. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  697. #define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
  698. #endif
  699. SIMDE_FUNCTION_ATTRIBUTES
  700. void simde_mm_empty(void)
  701. {
  702. #if defined(SIMDE_X86_MMX_NATIVE)
  703. _mm_empty();
  704. #else
  705. /* noop */
  706. #endif
  707. }
  708. #define simde_m_empty() simde_mm_empty()
  709. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  710. #define _mm_empty() simde_mm_empty()
  711. #define _m_empty() simde_mm_empty()
  712. #endif
  713. SIMDE_FUNCTION_ATTRIBUTES
  714. simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
  715. {
  716. #if defined(SIMDE_X86_MMX_NATIVE)
  717. return _mm_madd_pi16(a, b);
  718. #else
  719. simde__m64_private r_;
  720. simde__m64_private a_ = simde__m64_to_private(a);
  721. simde__m64_private b_ = simde__m64_to_private(b);
  722. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  723. int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
  724. r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
  725. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  726. r_.mmi_i32 = pmaddhw(a_.mmi_i16, b_.mmi_i16);
  727. #else
  728. SIMDE_VECTORIZE
  729. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
  730. r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
  731. (a_.i16[i + 1] * b_.i16[i + 1]);
  732. }
  733. #endif
  734. return simde__m64_from_private(r_);
  735. #endif
  736. }
  737. #define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
  738. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  739. #define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
  740. #define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
  741. #endif
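/* Worked example: each pair of adjacent 16-bit products is accumulated into
 * one 32-bit lane, so for a = {1, 2, 3, 4} and b = {10, 20, 30, 40} (16-bit
 * lanes) the result is {1*10 + 2*20, 3*30 + 4*40} = {50, 250} (32-bit lanes). */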
  742. SIMDE_FUNCTION_ATTRIBUTES
  743. simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
  744. {
  745. #if defined(SIMDE_X86_MMX_NATIVE)
  746. return _mm_mulhi_pi16(a, b);
  747. #else
  748. simde__m64_private r_;
  749. simde__m64_private a_ = simde__m64_to_private(a);
  750. simde__m64_private b_ = simde__m64_to_private(b);
  751. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  752. const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
  753. const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
  754. const uint16x4_t t3 = vmovn_u32(t2);
  755. r_.neon_u16 = t3;
  756. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  757. r_.mmi_i16 = pmulhh(a_.mmi_i16, b_.mmi_i16);
  758. #else
  759. SIMDE_VECTORIZE
  760. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  761. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t,
  762. ((a_.i16[i] * b_.i16[i]) >> 16));
  763. }
  764. #endif
  765. return simde__m64_from_private(r_);
  766. #endif
  767. }
  768. #define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
  769. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  770. #define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
  771. #define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
  772. #endif
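/* simde_mm_mulhi_pi16() keeps the high 16 bits of each 32-bit product;
 * simde_mm_mullo_pi16() below keeps the low 16 bits. For example,
 * 0x4000 * 0x0004 = 0x00010000, so mulhi yields 0x0001 and mullo 0x0000. */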
  773. SIMDE_FUNCTION_ATTRIBUTES
  774. simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
  775. {
  776. #if defined(SIMDE_X86_MMX_NATIVE)
  777. return _mm_mullo_pi16(a, b);
  778. #else
  779. simde__m64_private r_;
  780. simde__m64_private a_ = simde__m64_to_private(a);
  781. simde__m64_private b_ = simde__m64_to_private(b);
  782. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  783. const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
  784. const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
  785. r_.neon_u16 = t2;
  786. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  787. r_.mmi_i16 = pmullh(a_.mmi_i16, b_.mmi_i16);
  788. #else
  789. SIMDE_VECTORIZE
  790. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  791. r_.i16[i] = HEDLEY_STATIC_CAST(
  792. int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
  793. }
  794. #endif
  795. return simde__m64_from_private(r_);
  796. #endif
  797. }
  798. #define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
  799. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  800. #define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
  801. #define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
  802. #endif
  803. SIMDE_FUNCTION_ATTRIBUTES
  804. simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
  805. {
  806. #if defined(SIMDE_X86_MMX_NATIVE)
  807. return _mm_or_si64(a, b);
  808. #else
  809. simde__m64_private r_;
  810. simde__m64_private a_ = simde__m64_to_private(a);
  811. simde__m64_private b_ = simde__m64_to_private(b);
  812. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  813. r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
  814. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  815. r_.i64 = a_.i64 | b_.i64;
  816. #else
  817. r_.i64[0] = a_.i64[0] | b_.i64[0];
  818. #endif
  819. return simde__m64_from_private(r_);
  820. #endif
  821. }
  822. #define simde_m_por(a, b) simde_mm_or_si64(a, b)
  823. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  824. #define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
  825. #define _m_por(a, b) simde_mm_or_si64(a, b)
  826. #endif
  827. SIMDE_FUNCTION_ATTRIBUTES
  828. simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
  829. {
  830. #if defined(SIMDE_X86_MMX_NATIVE)
  831. return _mm_packs_pi16(a, b);
  832. #else
  833. simde__m64_private r_;
  834. simde__m64_private a_ = simde__m64_to_private(a);
  835. simde__m64_private b_ = simde__m64_to_private(b);
  836. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  837. r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
  838. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  839. r_.mmi_i8 = packsshb(a_.mmi_i16, b_.mmi_i16);
  840. #else
  841. SIMDE_VECTORIZE
  842. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  843. if (a_.i16[i] < INT8_MIN) {
  844. r_.i8[i] = INT8_MIN;
  845. } else if (a_.i16[i] > INT8_MAX) {
  846. r_.i8[i] = INT8_MAX;
  847. } else {
  848. r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
  849. }
  850. }
  851. SIMDE_VECTORIZE
  852. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  853. if (b_.i16[i] < INT8_MIN) {
  854. r_.i8[i + 4] = INT8_MIN;
  855. } else if (b_.i16[i] > INT8_MAX) {
  856. r_.i8[i + 4] = INT8_MAX;
  857. } else {
  858. r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
  859. }
  860. }
  861. #endif
  862. return simde__m64_from_private(r_);
  863. #endif
  864. }
  865. #define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
  866. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  867. #define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
  868. #define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
  869. #endif
  870. SIMDE_FUNCTION_ATTRIBUTES
  871. simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
  872. {
  873. #if defined(SIMDE_X86_MMX_NATIVE)
  874. return _mm_packs_pi32(a, b);
  875. #else
  876. simde__m64_private r_;
  877. simde__m64_private a_ = simde__m64_to_private(a);
  878. simde__m64_private b_ = simde__m64_to_private(b);
  879. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  880. r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
  881. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  882. r_.mmi_i16 = packsswh(a_.mmi_i32, b_.mmi_i32);
  883. #else
  884. SIMDE_VECTORIZE
  885. for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
if (a_.i32[i] < INT16_MIN) {
r_.i16[i] = INT16_MIN;
  888. } else if (a_.i32[i] > INT16_MAX) {
  889. r_.i16[i] = INT16_MAX;
  890. } else {
  891. r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
  892. }
  893. }
  894. SIMDE_VECTORIZE
  895. for (size_t i = 0; i < (8 / sizeof(b_.i32[0])); i++) {
if (b_.i32[i] < INT16_MIN) {
r_.i16[i + 2] = INT16_MIN;
  898. } else if (b_.i32[i] > INT16_MAX) {
  899. r_.i16[i + 2] = INT16_MAX;
  900. } else {
  901. r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
  902. }
  903. }
  904. #endif
  905. return simde__m64_from_private(r_);
  906. #endif
  907. }
  908. #define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
  909. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  910. #define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
  911. #define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
  912. #endif
  913. SIMDE_FUNCTION_ATTRIBUTES
  914. simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
  915. {
  916. #if defined(SIMDE_X86_MMX_NATIVE)
  917. return _mm_packs_pu16(a, b);
  918. #else
  919. simde__m64_private r_;
  920. simde__m64_private a_ = simde__m64_to_private(a);
  921. simde__m64_private b_ = simde__m64_to_private(b);
  922. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  923. const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
  924. /* Set elements which are < 0 to 0 */
  925. const int16x8_t t2 =
  926. vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
  927. /* Vector with all s16 elements set to UINT8_MAX */
  928. const int16x8_t vmax =
  929. vmovq_n_s16(HEDLEY_STATIC_CAST(int16_t, UINT8_MAX));
  930. /* Elements which are within the acceptable range */
  931. const int16x8_t le_max =
  932. vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
  933. const int16x8_t gt_max =
  934. vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
  935. /* Final values as 16-bit integers */
  936. const int16x8_t values = vorrq_s16(le_max, gt_max);
  937. r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
  938. #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
  939. r_.mmi_u8 = packushb(a_.mmi_u16, b_.mmi_u16);
  940. #else
  941. SIMDE_VECTORIZE
  942. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  943. if (a_.i16[i] > UINT8_MAX) {
  944. r_.u8[i] = UINT8_MAX;
  945. } else if (a_.i16[i] < 0) {
  946. r_.u8[i] = 0;
  947. } else {
  948. r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
  949. }
  950. }
  951. SIMDE_VECTORIZE
  952. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  953. if (b_.i16[i] > UINT8_MAX) {
  954. r_.u8[i + 4] = UINT8_MAX;
  955. } else if (b_.i16[i] < 0) {
  956. r_.u8[i + 4] = 0;
  957. } else {
  958. r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
  959. }
  960. }
  961. #endif
  962. return simde__m64_from_private(r_);
  963. #endif
  964. }
  965. #define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
  966. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  967. #define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
  968. #define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
  969. #endif
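/* Pack sketch: the pack helpers narrow each lane with saturation, placing the
 * lanes of a in the low half of the result and the lanes of b in the high
 * half. For instance:
 *
 *   simde__m64 r = simde_mm_packs_pi16(simde_mm_set1_pi16(300),
 *                                      simde_mm_set1_pi16(-300));
 *   yields {127, 127, 127, 127, -128, -128, -128, -128} viewed as eight int8_t
 */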
  970. SIMDE_FUNCTION_ATTRIBUTES
  971. simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
  972. int8_t e3, int8_t e2, int8_t e1, int8_t e0)
  973. {
  974. #if defined(SIMDE_X86_MMX_NATIVE)
  975. return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
  976. #else
  977. simde__m64_private r_;
  978. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  979. const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = {e0, e1, e2, e3,
  980. e4, e5, e6, e7};
  981. r_.neon_i8 = vld1_s8(v);
  982. #else
  983. r_.i8[0] = e0;
  984. r_.i8[1] = e1;
  985. r_.i8[2] = e2;
  986. r_.i8[3] = e3;
  987. r_.i8[4] = e4;
  988. r_.i8[5] = e5;
  989. r_.i8[6] = e6;
  990. r_.i8[7] = e7;
  991. #endif
  992. return simde__m64_from_private(r_);
  993. #endif
  994. }
  995. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  996. #define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
  997. simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
  998. #endif
  999. SIMDE_FUNCTION_ATTRIBUTES
  1000. simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
  1001. uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
  1002. {
  1003. simde__m64_private r_;
  1004. #if defined(SIMDE_X86_MMX_NATIVE)
  1005. r_.n = _mm_set_pi8(
  1006. HEDLEY_STATIC_CAST(int8_t, e7), HEDLEY_STATIC_CAST(int8_t, e6),
  1007. HEDLEY_STATIC_CAST(int8_t, e5), HEDLEY_STATIC_CAST(int8_t, e4),
  1008. HEDLEY_STATIC_CAST(int8_t, e3), HEDLEY_STATIC_CAST(int8_t, e2),
  1009. HEDLEY_STATIC_CAST(int8_t, e1), HEDLEY_STATIC_CAST(int8_t, e0));
  1010. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1011. const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = {e0, e1, e2, e3,
  1012. e4, e5, e6, e7};
  1013. r_.neon_u8 = vld1_u8(v);
  1014. #else
  1015. r_.u8[0] = e0;
  1016. r_.u8[1] = e1;
  1017. r_.u8[2] = e2;
  1018. r_.u8[3] = e3;
  1019. r_.u8[4] = e4;
  1020. r_.u8[5] = e5;
  1021. r_.u8[6] = e6;
  1022. r_.u8[7] = e7;
  1023. #endif
  1024. return simde__m64_from_private(r_);
  1025. }
  1026. SIMDE_FUNCTION_ATTRIBUTES
  1027. simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
  1028. {
  1029. #if defined(SIMDE_X86_MMX_NATIVE)
  1030. return _mm_set_pi16(e3, e2, e1, e0);
  1031. #else
  1032. simde__m64_private r_;
  1033. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1034. const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = {e0, e1, e2, e3};
  1035. r_.neon_i16 = vld1_s16(v);
  1036. #else
  1037. r_.i16[0] = e0;
  1038. r_.i16[1] = e1;
  1039. r_.i16[2] = e2;
  1040. r_.i16[3] = e3;
  1041. #endif
  1042. return simde__m64_from_private(r_);
  1043. #endif
  1044. }
  1045. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1046. #define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
  1047. #endif
  1048. SIMDE_FUNCTION_ATTRIBUTES
  1049. simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
  1050. uint16_t e0)
  1051. {
  1052. simde__m64_private r_;
  1053. #if defined(SIMDE_X86_MMX_NATIVE)
  1054. r_.n = _mm_set_pi16(HEDLEY_STATIC_CAST(int16_t, e3),
  1055. HEDLEY_STATIC_CAST(int16_t, e2),
  1056. HEDLEY_STATIC_CAST(int16_t, e1),
  1057. HEDLEY_STATIC_CAST(int16_t, e0));
  1058. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1059. const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = {e0, e1, e2, e3};
  1060. r_.neon_u16 = vld1_u16(v);
  1061. #else
  1062. r_.u16[0] = e0;
  1063. r_.u16[1] = e1;
  1064. r_.u16[2] = e2;
  1065. r_.u16[3] = e3;
  1066. #endif
  1067. return simde__m64_from_private(r_);
  1068. }
  1069. SIMDE_FUNCTION_ATTRIBUTES
  1070. simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
  1071. {
  1072. simde__m64_private r_;
  1073. #if defined(SIMDE_X86_MMX_NATIVE)
  1074. r_.n = _mm_set_pi32(HEDLEY_STATIC_CAST(int32_t, e1),
  1075. HEDLEY_STATIC_CAST(int32_t, e0));
  1076. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1077. const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = {e0, e1};
  1078. r_.neon_u32 = vld1_u32(v);
  1079. #else
  1080. r_.u32[0] = e0;
  1081. r_.u32[1] = e1;
  1082. #endif
  1083. return simde__m64_from_private(r_);
  1084. }
  1085. SIMDE_FUNCTION_ATTRIBUTES
  1086. simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
  1087. {
  1088. simde__m64_private r_;
  1089. #if defined(SIMDE_X86_MMX_NATIVE)
  1090. r_.n = _mm_set_pi32(e1, e0);
  1091. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1092. const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = {e0, e1};
  1093. r_.neon_i32 = vld1_s32(v);
  1094. #else
  1095. r_.i32[0] = e0;
  1096. r_.i32[1] = e1;
  1097. #endif
  1098. return simde__m64_from_private(r_);
  1099. }
  1100. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1101. #define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
  1102. #endif
  1103. SIMDE_FUNCTION_ATTRIBUTES
  1104. simde__m64 simde_x_mm_set_pi64(int64_t e0)
  1105. {
  1106. simde__m64_private r_;
  1107. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1108. const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = {e0};
  1109. r_.neon_i64 = vld1_s64(v);
  1110. #else
  1111. r_.i64[0] = e0;
  1112. #endif
  1113. return simde__m64_from_private(r_);
  1114. }
  1115. SIMDE_FUNCTION_ATTRIBUTES
  1116. simde__m64 simde_x_mm_set_f32x2(simde_float32 e1, simde_float32 e0)
  1117. {
  1118. simde__m64_private r_;
  1119. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1120. const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = {e0, e1};
  1121. r_.neon_f32 = vld1_f32(v);
  1122. #else
  1123. r_.f32[0] = e0;
  1124. r_.f32[1] = e1;
  1125. #endif
  1126. return simde__m64_from_private(r_);
  1127. }
  1128. SIMDE_FUNCTION_ATTRIBUTES
  1129. simde__m64 simde_mm_set1_pi8(int8_t a)
  1130. {
  1131. #if defined(SIMDE_X86_MMX_NATIVE)
  1132. return _mm_set1_pi8(a);
  1133. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1134. simde__m64_private r_;
  1135. r_.neon_i8 = vmov_n_s8(a);
  1136. return simde__m64_from_private(r_);
  1137. #else
  1138. return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
  1139. #endif
  1140. }
  1141. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1142. #define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
  1143. #endif
  1144. SIMDE_FUNCTION_ATTRIBUTES
  1145. simde__m64 simde_mm_set1_pi16(int16_t a)
  1146. {
  1147. #if defined(SIMDE_X86_MMX_NATIVE)
  1148. return _mm_set1_pi16(a);
  1149. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1150. simde__m64_private r_;
  1151. r_.neon_i16 = vmov_n_s16(a);
  1152. return simde__m64_from_private(r_);
  1153. #else
  1154. return simde_mm_set_pi16(a, a, a, a);
  1155. #endif
  1156. }
  1157. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1158. #define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
  1159. #endif
  1160. SIMDE_FUNCTION_ATTRIBUTES
  1161. simde__m64 simde_mm_set1_pi32(int32_t a)
  1162. {
  1163. #if defined(SIMDE_X86_MMX_NATIVE)
  1164. return _mm_set1_pi32(a);
  1165. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1166. simde__m64_private r_;
  1167. r_.neon_i32 = vmov_n_s32(a);
  1168. return simde__m64_from_private(r_);
  1169. #else
  1170. return simde_mm_set_pi32(a, a);
  1171. #endif
  1172. }
  1173. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1174. #define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
  1175. #endif
  1176. SIMDE_FUNCTION_ATTRIBUTES
  1177. simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
  1178. int8_t e3, int8_t e2, int8_t e1, int8_t e0)
  1179. {
  1180. #if defined(SIMDE_X86_MMX_NATIVE)
  1181. return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
  1182. #else
  1183. return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
  1184. #endif
  1185. }
  1186. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1187. #define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
  1188. simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
  1189. #endif
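/* Element-order reminder: simde_mm_set_pi8(e7, ..., e0) places e0 in the
 * lowest lane (i8[0]) and e7 in the highest, while the setr variants take
 * their arguments in memory order, so the FIRST argument of
 * simde_mm_setr_pi8() ends up in the lowest lane. */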
  1190. SIMDE_FUNCTION_ATTRIBUTES
  1191. simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
  1192. {
  1193. #if defined(SIMDE_X86_MMX_NATIVE)
  1194. return _mm_setr_pi16(e3, e2, e1, e0);
  1195. #else
  1196. return simde_mm_set_pi16(e0, e1, e2, e3);
  1197. #endif
  1198. }
  1199. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1200. #define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
  1201. #endif
  1202. SIMDE_FUNCTION_ATTRIBUTES
  1203. simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
  1204. {
  1205. #if defined(SIMDE_X86_MMX_NATIVE)
  1206. return _mm_setr_pi32(e1, e0);
  1207. #else
  1208. return simde_mm_set_pi32(e0, e1);
  1209. #endif
  1210. }
  1211. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1212. #define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
  1213. #endif
  1214. SIMDE_FUNCTION_ATTRIBUTES
  1215. simde__m64 simde_mm_setzero_si64(void)
  1216. {
  1217. #if defined(SIMDE_X86_MMX_NATIVE)
  1218. return _mm_setzero_si64();
  1219. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1220. simde__m64_private r_;
  1221. r_.neon_u32 = vmov_n_u32(0);
  1222. return simde__m64_from_private(r_);
  1223. #else
  1224. return simde_mm_set_pi32(0, 0);
  1225. #endif
  1226. }
  1227. #if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
  1228. #define _mm_setzero_si64() simde_mm_setzero_si64()
  1229. #endif
  1230. SIMDE_FUNCTION_ATTRIBUTES
  1231. simde__m64 simde_x_mm_load_si64(const void *mem_addr)
  1232. {
  1233. simde__m64 r;
  1234. simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64),
  1235. sizeof(r));
  1236. return r;
  1237. }
  1238. SIMDE_FUNCTION_ATTRIBUTES
  1239. simde__m64 simde_x_mm_loadu_si64(const void *mem_addr)
  1240. {
  1241. simde__m64 r;
  1242. simde_memcpy(&r, mem_addr, sizeof(r));
  1243. return r;
  1244. }
  1245. SIMDE_FUNCTION_ATTRIBUTES
  1246. void simde_x_mm_store_si64(void *mem_addr, simde__m64 value)
  1247. {
  1248. simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), &value,
  1249. sizeof(value));
  1250. }
  1251. SIMDE_FUNCTION_ATTRIBUTES
  1252. void simde_x_mm_storeu_si64(void *mem_addr, simde__m64 value)
  1253. {
  1254. simde_memcpy(mem_addr, &value, sizeof(value));
  1255. }
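/* The simde_x_mm_load/store helpers above have no _mm_* alias; they are only
 * reachable through the simde_x_ names. The plain load/store versions assume
 * mem_addr is aligned like a simde__m64, the *u* variants accept unaligned
 * pointers. A minimal sketch:
 *
 *   int16_t buf[4] = {1, 2, 3, 4};
 *   simde__m64 v = simde_x_mm_loadu_si64(buf);
 *   simde_x_mm_storeu_si64(buf, simde_mm_add_pi16(v, v));   (buf becomes
 *                                                            {2, 4, 6, 8})
 */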
  1256. SIMDE_FUNCTION_ATTRIBUTES
  1257. simde__m64 simde_x_mm_setone_si64(void)
  1258. {
  1259. return simde_mm_set1_pi32(~INT32_C(0));
  1260. }
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sll_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	HEDLEY_DIAGNOSTIC_PUSH
#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
#pragma clang diagnostic ignored "-Wvector-conversion"
#endif
	r_.neon_i16 =
		vshl_s16(a_.neon_i16,
			 vmov_n_s16(HEDLEY_STATIC_CAST(
				 int16_t, vget_lane_u64(count_.neon_u64, 0))));
	HEDLEY_DIAGNOSTIC_POP
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
	defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
	if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
		return simde_mm_setzero_si64();
	r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i16 = a_.i16 << count_.u64[0];
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
		r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
					       a_.u16[i] << count_.u64[0]);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
#define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sll_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	HEDLEY_DIAGNOSTIC_PUSH
#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
#pragma clang diagnostic ignored "-Wvector-conversion"
#endif
	r_.neon_i32 =
		vshl_s32(a_.neon_i32,
			 vmov_n_s32(HEDLEY_STATIC_CAST(
				 int32_t, vget_lane_u64(count_.neon_u64, 0))));
	HEDLEY_DIAGNOSTIC_POP
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 << count_.u64[0];
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
		r_.u32[i] = a_.u32[i] << count_.u64[0];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
#define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
#endif
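/* Shift-left variants that take the count as an immediate (scalar int)
 * rather than as a simde__m64. */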
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_slli_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
	defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
	if (HEDLEY_UNLIKELY(count > 15))
		return simde_mm_setzero_si64();
	r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i16 = a_.i16 << count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	/* the shift amount is the scalar count; this function has no b_ */
	r_.mmi_i16 = psllh_s(a_.mmi_i16, count);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
		r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
#define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_slli_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 << count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	/* the shift amount is the scalar count; this function has no b_ */
	r_.mmi_i32 = psllw_s(a_.mmi_i32, count);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
		r_.u32[i] = a_.u32[i] << count;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
#define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_slli_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i64 = a_.i64 << count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t)count));
#else
	r_.u64[0] = a_.u64[0] << count;
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
#define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sll_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i64 = a_.i64 << count_.i64;
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	r_.u64[0] = a_.u64[0] << count_.u64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
#define _m_psllq(a, count) simde_mm_sll_si64(a, count)
#endif
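/* Logical (zero-filling) right shifts with the count in the low 64 bits of a
 * second simde__m64. */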
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_srl_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
	defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
	if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
		return simde_mm_setzero_si64();
	/* logical shift, so operate on the unsigned view of the elements */
	r_.u16 = a_.u16 >> HEDLEY_STATIC_CAST(uint16_t, count_.u64[0]);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u16 = a_.u16 >> count_.u64[0];
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vshl_u16(
		a_.neon_u16,
		vmov_n_s16(-((int16_t)vget_lane_u64(count_.neon_u64, 0))));
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < sizeof(r_.u16) / sizeof(r_.u16[0]); i++) {
		r_.u16[i] = a_.u16[i] >> count_.u64[0];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
#define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_srl_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u32 = a_.u32 >> count_.u64[0];
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u32 = vshl_u32(
		a_.neon_u32,
		vmov_n_s32(-((int32_t)vget_lane_u64(count_.neon_u64, 0))));
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	SIMDE_VECTORIZE
	for (size_t i = 0; i < sizeof(r_.u32) / sizeof(r_.u32[0]); i++) {
		r_.u32[i] = a_.u32[i] >> count_.u64[0];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
#define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
#endif
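/* Logical right shifts by an immediate (scalar int) count. */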
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srli_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u16 = a_.u16 >> count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	/* the shift amount is the scalar count; this function has no b_ */
	r_.mmi_i16 = psrlh_s(a_.mmi_i16, count);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
		r_.u16[i] = a_.u16[i] >> count;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
#define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srli_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u32 = a_.u32 >> count;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	/* the shift amount is the scalar count; this function has no b_ */
	r_.mmi_i32 = psrlw_s(a_.mmi_i32, count);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
		r_.u32[i] = a_.u32[i] >> count;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
#define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srli_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.u64 = a_.u64 >> count;
#else
	r_.u64[0] = a_.u64[0] >> count;
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
#define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_srl_si64(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.u64 = a_.u64 >> count_.u64;
#else
	if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
		simde_memset(&r_, 0, sizeof(r_));
		return simde__m64_from_private(r_);
	}
	r_.u64[0] = a_.u64[0] >> count_.u64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
#define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
#endif
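/* Arithmetic (sign-propagating) right shifts.  The register-count forms clamp
 * the count to the element width minus one, which matches the all-sign-bits
 * result MMX produces for oversized counts. */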
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srai_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i16 = a_.i16 >> (count & 0xff);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vshl_s16(a_.neon_i16,
			       vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i16 = psrah_s(a_.mmi_i16, count);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] >> (count & 0xff);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
#define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
{
#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
	return _mm_srai_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 >> (count & 0xff);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vshl_s32(a_.neon_i32,
			       vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i32 = psraw_s(a_.mmi_i32, count);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] >> (count & 0xff);
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
#define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sra_pi16(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
	const int cnt = HEDLEY_STATIC_CAST(
		int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i16 = a_.i16 >> cnt;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 =
		vshl_s16(a_.neon_i16,
			 vmov_n_s16(-HEDLEY_STATIC_CAST(
				 int16_t, vget_lane_u64(count_.neon_u64, 0))));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] >> cnt;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
#define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sra_pi32(a, count);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private count_ = simde__m64_to_private(count);
	const int32_t cnt =
		(count_.u64[0] > 31)
			? 31
			: HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
	r_.i32 = a_.i32 >> cnt;
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 =
		vshl_s32(a_.neon_i32,
			 vmov_n_s32(-HEDLEY_STATIC_CAST(
				 int32_t, vget_lane_u64(count_.neon_u64, 0))));
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] >> cnt;
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
#define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
#endif
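/* Element-wise subtraction with ordinary wrap-around on overflow. */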
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sub_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i8 = psubb_s(a_.mmi_i8, b_.mmi_i8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i8 = a_.i8 - b_.i8;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		r_.i8[i] = a_.i8[i] - b_.i8[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
#define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sub_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i16 = psubh_s(a_.mmi_i16, b_.mmi_i16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i16 = a_.i16 - b_.i16;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		r_.i16[i] = a_.i16[i] - b_.i16[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
#define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_sub_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i32 = psubw_s(a_.mmi_i32, b_.mmi_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i32 = a_.i32 - b_.i32;
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
		r_.i32[i] = a_.i32[i] - b_.i32[i];
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
#define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
#endif
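/* Element-wise subtraction that saturates to the type's limits instead of
 * wrapping. */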
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i8 = psubsb(a_.mmi_i8, b_.mmi_i8);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
		if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
			r_.i8[i] = INT8_MIN;
		} else if ((b_.i8[i]) < 0 &&
			   (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
			r_.i8[i] = INT8_MAX;
		} else {
			r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
#define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pu8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_u8 = psubusb(a_.mmi_u8, b_.mmi_u8);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
		const int32_t x = a_.u8[i] - b_.u8[i];
		if (x < 0) {
			r_.u8[i] = 0;
		} else if (x > UINT8_MAX) {
			r_.u8[i] = UINT8_MAX;
		} else {
			r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
#define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i16 = psubsh(a_.mmi_i16, b_.mmi_i16);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
		if (((b_.i16[i]) > 0 &&
		     (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
			r_.i16[i] = INT16_MIN;
		} else if ((b_.i16[i]) < 0 &&
			   (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
			r_.i16[i] = INT16_MAX;
		} else {
			r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
#define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_subs_pu16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_u16 = psubush(a_.mmi_u16, b_.mmi_u16);
#else
	SIMDE_VECTORIZE
	for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
		const int x = a_.u16[i] - b_.u16[i];
		if (x < 0) {
			r_.u16[i] = 0;
		} else if (x > UINT16_MAX) {
			r_.u16[i] = UINT16_MAX;
		} else {
			r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
		}
	}
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
#define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
#endif
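/* Interleave (unpack) the high or low halves of two vectors, alternating
 * elements from a and b. */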
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpackhi_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14,
				      7, 15);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8);
#else
	r_.i8[0] = a_.i8[4];
	r_.i8[1] = b_.i8[4];
	r_.i8[2] = a_.i8[5];
	r_.i8[3] = b_.i8[5];
	r_.i8[4] = a_.i8[6];
	r_.i8[5] = b_.i8[6];
	r_.i8[6] = a_.i8[7];
	r_.i8[7] = b_.i8[7];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
#define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpackhi_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i16 = punpckhhw_s(a_.mmi_i16, b_.mmi_i16);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
#else
	r_.i16[0] = a_.i16[2];
	r_.i16[1] = b_.i16[2];
	r_.i16[2] = a_.i16[3];
	r_.i16[3] = b_.i16[3];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
#define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpackhi_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i32 = punpckhwd_s(a_.mmi_i32, b_.mmi_i32);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
#else
	r_.i32[0] = a_.i32[1];
	r_.i32[1] = b_.i32[1];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
#define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpacklo_pi8(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i8 = punpcklbh_s(a_.mmi_i8, b_.mmi_i8);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3,
				      11);
#else
	r_.i8[0] = a_.i8[0];
	r_.i8[1] = b_.i8[0];
	r_.i8[2] = a_.i8[1];
	r_.i8[3] = b_.i8[1];
	r_.i8[4] = a_.i8[2];
	r_.i8[5] = b_.i8[2];
	r_.i8[6] = a_.i8[3];
	r_.i8[7] = b_.i8[3];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
#define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpacklo_pi16(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i16 = punpcklhw_s(a_.mmi_i16, b_.mmi_i16);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
#else
	r_.i16[0] = a_.i16[0];
	r_.i16[1] = b_.i16[0];
	r_.i16[2] = a_.i16[1];
	r_.i16[3] = b_.i16[1];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
#define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_unpacklo_pi32(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
	r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
	r_.mmi_i32 = punpcklwd_s(a_.mmi_i32, b_.mmi_i32);
#elif defined(SIMDE_SHUFFLE_VECTOR_)
	r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
#else
	r_.i32[0] = a_.i32[0];
	r_.i32[1] = b_.i32[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
#define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
#endif
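/* Bitwise XOR of the full 64-bit value, and _m_to_int, which returns the low
 * 32 bits as a scalar. */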
SIMDE_FUNCTION_ATTRIBUTES
simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _mm_xor_si64(a, b);
#else
	simde__m64_private r_;
	simde__m64_private a_ = simde__m64_to_private(a);
	simde__m64_private b_ = simde__m64_to_private(b);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
	r_.i32f = a_.i32f ^ b_.i32f;
#else
	r_.u64[0] = a_.u64[0] ^ b_.u64[0];
#endif
	return simde__m64_from_private(r_);
#endif
}
#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
#define _m_pxor(a, b) simde_mm_xor_si64(a, b)
#endif
SIMDE_FUNCTION_ATTRIBUTES
int32_t simde_m_to_int(simde__m64 a)
{
#if defined(SIMDE_X86_MMX_NATIVE)
	return _m_to_int(a);
#else
	simde__m64_private a_ = simde__m64_to_private(a);
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
	HEDLEY_DIAGNOSTIC_PUSH
#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
	SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
#pragma clang diagnostic ignored "-Wvector-conversion"
#endif
	return vget_lane_s32(a_.neon_i32, 0);
	HEDLEY_DIAGNOSTIC_POP
#else
	return a_.i32[0];
#endif
#endif
}
#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
#define _m_to_int(a) simde_m_to_int(a)
#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
#endif /* !defined(SIMDE_X86_MMX_H) */