sse.h 133 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479
  1. /* SPDX-License-Identifier: MIT
  2. *
  3. * Permission is hereby granted, free of charge, to any person
  4. * obtaining a copy of this software and associated documentation
  5. * files (the "Software"), to deal in the Software without
  6. * restriction, including without limitation the rights to use, copy,
  7. * modify, merge, publish, distribute, sublicense, and/or sell copies
  8. * of the Software, and to permit persons to whom the Software is
  9. * furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be
  12. * included in all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  18. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  19. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. * SOFTWARE.
  22. *
  23. * Copyright:
  24. * 2017-2020 Evan Nemerson <[email protected]>
  25. * 2015-2017 John W. Ratcliff <[email protected]>
  26. * 2015 Brandon Rowlett <[email protected]>
  27. * 2015 Ken Fast <[email protected]>
  28. */
  29. #if !defined(SIMDE_X86_SSE_H)
  30. #define SIMDE_X86_SSE_H
  31. #include "mmx.h"
  32. #if defined(_WIN32)
  33. #include <windows.h>
  34. #endif
  35. HEDLEY_DIAGNOSTIC_PUSH
  36. SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
  37. SIMDE_BEGIN_DECLS_
  38. typedef union {
  39. #if defined(SIMDE_VECTOR_SUBSCRIPT)
  40. SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  41. SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  42. SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  43. SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  44. SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  45. SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  46. SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  47. SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  48. #if defined(SIMDE_HAVE_INT128_)
  49. SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  50. SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  51. #endif
  52. SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  53. SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  54. SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  55. #else
  56. SIMDE_ALIGN_TO_16 int8_t i8[16];
  57. SIMDE_ALIGN_TO_16 int16_t i16[8];
  58. SIMDE_ALIGN_TO_16 int32_t i32[4];
  59. SIMDE_ALIGN_TO_16 int64_t i64[2];
  60. SIMDE_ALIGN_TO_16 uint8_t u8[16];
  61. SIMDE_ALIGN_TO_16 uint16_t u16[8];
  62. SIMDE_ALIGN_TO_16 uint32_t u32[4];
  63. SIMDE_ALIGN_TO_16 uint64_t u64[2];
  64. #if defined(SIMDE_HAVE_INT128_)
  65. SIMDE_ALIGN_TO_16 simde_int128 i128[1];
  66. SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
  67. #endif
  68. SIMDE_ALIGN_TO_16 simde_float32 f32[4];
  69. SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
  70. SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
  71. #endif
  72. SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
  73. SIMDE_ALIGN_TO_16 simde__m64 m64[2];
  74. #if defined(SIMDE_X86_SSE_NATIVE)
  75. SIMDE_ALIGN_TO_16 __m128 n;
  76. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  77. SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
  78. SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
  79. SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
  80. SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
  81. SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
  82. SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
  83. SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
  84. SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
  85. SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
  86. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  87. SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
  88. #endif
  89. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  90. SIMDE_ALIGN_TO_16 v128_t wasm_v128;
  91. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  92. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
  93. SIMDE_ALIGN_TO_16
  94. SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
  95. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
  96. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
  97. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
  98. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
  99. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
  100. #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  101. SIMDE_ALIGN_TO_16
  102. SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
  103. SIMDE_ALIGN_TO_16
  104. SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
  105. SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
  106. #endif
  107. #endif
  108. } simde__m128_private;
  109. #if defined(SIMDE_X86_SSE_NATIVE)
  110. typedef __m128 simde__m128;
  111. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  112. typedef float32x4_t simde__m128;
  113. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  114. typedef v128_t simde__m128;
  115. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  116. typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128;
  117. #elif defined(SIMDE_VECTOR_SUBSCRIPT)
  118. typedef simde_float32
  119. simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
  120. #else
  121. typedef simde__m128_private simde__m128;
  122. #endif
  123. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  124. typedef simde__m128 __m128;
  125. #endif
  126. HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
  127. HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private),
  128. "simde__m128_private size incorrect");
  129. #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
  130. HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16,
  131. "simde__m128 is not 16-byte aligned");
  132. HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16,
  133. "simde__m128_private is not 16-byte aligned");
  134. #endif
  135. SIMDE_FUNCTION_ATTRIBUTES
  136. simde__m128 simde__m128_from_private(simde__m128_private v)
  137. {
  138. simde__m128 r;
  139. simde_memcpy(&r, &v, sizeof(r));
  140. return r;
  141. }
  142. SIMDE_FUNCTION_ATTRIBUTES
  143. simde__m128_private simde__m128_to_private(simde__m128 v)
  144. {
  145. simde__m128_private r;
  146. simde_memcpy(&r, &v, sizeof(r));
  147. return r;
  148. }
  /* NEON backend: generate the conversion helpers between simde__m128 and
   * each NEON vector type, one generator invocation per element type. */
  #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t,   neon, i8)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t,   neon, i16)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t,   neon, i32)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t,   neon, i64)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t,  neon, u8)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t,  neon, u16)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t,  neon, u32)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t,  neon, u64)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32)
    /* The double-precision vector type is only used on the A64 target. */
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64)
    #endif
  #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
  /* AltiVec and WebAssembly backends: generate the conversion helpers
   * between simde__m128 and the backend's native vector types. */
  #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed char),    altivec, i8)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed short),   altivec, i16)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed int),     altivec, i32)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char),  altivec, u8)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),   altivec, u32)

    #if defined(SIMDE_BUG_GCC_95782)
      /* When SIMDE_BUG_GCC_95782 is defined the f32 helpers are written out
       * by hand and go through the private union instead of the generator. */
      SIMDE_FUNCTION_ATTRIBUTES
      SIMDE_POWER_ALTIVEC_VECTOR(float)
      simde__m128_to_altivec_f32(simde__m128 value)
      {
        simde__m128_private r_ = simde__m128_to_private(value);
        return r_.altivec_f32;
      }

      SIMDE_FUNCTION_ATTRIBUTES
      simde__m128 simde__m128_from_altivec_f32(SIMDE_POWER_ALTIVEC_VECTOR(float) value)
      {
        simde__m128_private r_;
        r_.altivec_f32 = value;
        return simde__m128_from_private(r_);
      }
    #else
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(float), altivec, f32)
    #endif

    #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed long long),   altivec, i64)
      SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
    #endif
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    /* No trailing ';' here, matching every other use of the generator in
     * this file.  The generator presumably expands to complete function
     * definitions (as the hand-written f32 fallback suggests), in which case
     * a semicolon would be an empty file-scope declaration that pedantic
     * compilers reject. */
    SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128)
  #endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */
  /* Rounding-mode selectors accepted by SIMDE_MM_SET_ROUNDING_MODE() and
   * returned by SIMDE_MM_GET_ROUNDING_MODE().
   *
   * Without native SSE the constants are spelled out numerically (they use
   * bits 13 and 14 only); with native SSE they are taken from the
   * corresponding _MM_ROUND_* definitions. */
  enum {
    #if !defined(SIMDE_X86_SSE_NATIVE)
      SIMDE_MM_ROUND_NEAREST     = 0x0000,
      SIMDE_MM_ROUND_DOWN        = 0x2000,
      SIMDE_MM_ROUND_UP          = 0x4000,
      SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000
    #else
      SIMDE_MM_ROUND_NEAREST     = _MM_ROUND_NEAREST,
      SIMDE_MM_ROUND_DOWN        = _MM_ROUND_DOWN,
      SIMDE_MM_ROUND_UP          = _MM_ROUND_UP,
      SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
    #endif
  };
  /* Rounding-control flags for the simde *_round_* operations.
   *
   * When the native _MM_FROUND_* macros are not visible the flags are
   * defined numerically; otherwise they simply forward to the native ones. */
  #if !defined(_MM_FROUND_TO_NEAREST_INT)
    #define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00
    #define SIMDE_MM_FROUND_TO_NEG_INF     0x01
    #define SIMDE_MM_FROUND_TO_POS_INF     0x02
    #define SIMDE_MM_FROUND_TO_ZERO        0x03
    #define SIMDE_MM_FROUND_CUR_DIRECTION  0x04
    #define SIMDE_MM_FROUND_RAISE_EXC      0x00
    #define SIMDE_MM_FROUND_NO_EXC         0x08
  #else
    #define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT
    #define SIMDE_MM_FROUND_TO_NEG_INF     _MM_FROUND_TO_NEG_INF
    #define SIMDE_MM_FROUND_TO_POS_INF     _MM_FROUND_TO_POS_INF
    #define SIMDE_MM_FROUND_TO_ZERO        _MM_FROUND_TO_ZERO
    #define SIMDE_MM_FROUND_CUR_DIRECTION  _MM_FROUND_CUR_DIRECTION
    #define SIMDE_MM_FROUND_RAISE_EXC      _MM_FROUND_RAISE_EXC
    #define SIMDE_MM_FROUND_NO_EXC         _MM_FROUND_NO_EXC
  #endif

  /* Convenience combinations: a rounding direction OR-ed with an exception
   * behaviour flag. */
  #define SIMDE_MM_FROUND_NINT      (SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC)
  #define SIMDE_MM_FROUND_FLOOR     (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC)
  #define SIMDE_MM_FROUND_CEIL      (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC)
  #define SIMDE_MM_FROUND_TRUNC     (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC)
  #define SIMDE_MM_FROUND_RINT      (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC)
  #define SIMDE_MM_FROUND_NEARBYINT (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC)
  /* Native-name aliases for the rounding-control flags.  They are provided
   * only when aliases were requested and the native macros are not already
   * visible (detected through _MM_FROUND_TO_NEAREST_INT). */
  #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) && \
      !defined(_MM_FROUND_TO_NEAREST_INT)
    #define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT
    #define _MM_FROUND_TO_NEG_INF     SIMDE_MM_FROUND_TO_NEG_INF
    #define _MM_FROUND_TO_POS_INF     SIMDE_MM_FROUND_TO_POS_INF
    #define _MM_FROUND_TO_ZERO        SIMDE_MM_FROUND_TO_ZERO
    #define _MM_FROUND_CUR_DIRECTION  SIMDE_MM_FROUND_CUR_DIRECTION
    #define _MM_FROUND_RAISE_EXC      SIMDE_MM_FROUND_RAISE_EXC
    /* Previously missing: without this alias, code that writes
     * (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) cannot compile even
     * though every other flag of the family is aliased. */
    #define _MM_FROUND_NO_EXC         SIMDE_MM_FROUND_NO_EXC
    #define _MM_FROUND_NINT           SIMDE_MM_FROUND_NINT
    #define _MM_FROUND_FLOOR          SIMDE_MM_FROUND_FLOOR
    #define _MM_FROUND_CEIL           SIMDE_MM_FROUND_CEIL
    #define _MM_FROUND_TRUNC          SIMDE_MM_FROUND_TRUNC
    #define _MM_FROUND_RINT           SIMDE_MM_FROUND_RINT
    #define _MM_FROUND_NEARBYINT      SIMDE_MM_FROUND_NEARBYINT
  #endif
  266. SIMDE_FUNCTION_ATTRIBUTES
  267. unsigned int SIMDE_MM_GET_ROUNDING_MODE(void)
  268. {
  269. #if defined(SIMDE_X86_SSE_NATIVE)
  270. return _MM_GET_ROUNDING_MODE();
  271. #elif defined(SIMDE_HAVE_FENV_H)
  272. unsigned int vfe_mode;
  273. switch (fegetround()) {
  274. #if defined(FE_TONEAREST)
  275. case FE_TONEAREST:
  276. vfe_mode = SIMDE_MM_ROUND_NEAREST;
  277. break;
  278. #endif
  279. #if defined(FE_TOWARDZERO)
  280. case FE_TOWARDZERO:
  281. vfe_mode = SIMDE_MM_ROUND_DOWN;
  282. break;
  283. #endif
  284. #if defined(FE_UPWARD)
  285. case FE_UPWARD:
  286. vfe_mode = SIMDE_MM_ROUND_UP;
  287. break;
  288. #endif
  289. #if defined(FE_DOWNWARD)
  290. case FE_DOWNWARD:
  291. vfe_mode = SIMDE_MM_ROUND_TOWARD_ZERO;
  292. break;
  293. #endif
  294. default:
  295. vfe_mode = SIMDE_MM_ROUND_NEAREST;
  296. break;
  297. }
  298. return vfe_mode;
  299. #else
  300. return SIMDE_MM_ROUND_NEAREST;
  301. #endif
  302. }
  303. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  304. #define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE()
  305. #endif
  306. SIMDE_FUNCTION_ATTRIBUTES
  307. void SIMDE_MM_SET_ROUNDING_MODE(unsigned int a)
  308. {
  309. #if defined(SIMDE_X86_SSE_NATIVE)
  310. _MM_SET_ROUNDING_MODE(a);
  311. #elif defined(SIMDE_HAVE_FENV_H)
  312. int fe_mode = FE_TONEAREST;
  313. switch (a) {
  314. #if defined(FE_TONEAREST)
  315. case SIMDE_MM_ROUND_NEAREST:
  316. fe_mode = FE_TONEAREST;
  317. break;
  318. #endif
  319. #if defined(FE_TOWARDZERO)
  320. case SIMDE_MM_ROUND_TOWARD_ZERO:
  321. fe_mode = FE_TOWARDZERO;
  322. break;
  323. #endif
  324. #if defined(FE_DOWNWARD)
  325. case SIMDE_MM_ROUND_DOWN:
  326. fe_mode = FE_DOWNWARD;
  327. break;
  328. #endif
  329. #if defined(FE_UPWARD)
  330. case SIMDE_MM_ROUND_UP:
  331. fe_mode = FE_UPWARD;
  332. break;
  333. #endif
  334. default:
  335. return;
  336. }
  337. fesetround(fe_mode);
  338. #else
  339. (void)a;
  340. #endif
  341. }
  342. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  343. #define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
  344. #endif
  345. SIMDE_FUNCTION_ATTRIBUTES
  346. uint32_t simde_mm_getcsr(void)
  347. {
  348. #if defined(SIMDE_X86_SSE_NATIVE)
  349. return _mm_getcsr();
  350. #else
  351. return SIMDE_MM_GET_ROUNDING_MODE();
  352. #endif
  353. }
  354. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  355. #define _mm_getcsr() simde_mm_getcsr()
  356. #endif
  357. SIMDE_FUNCTION_ATTRIBUTES
  358. void simde_mm_setcsr(uint32_t a)
  359. {
  360. #if defined(SIMDE_X86_SSE_NATIVE)
  361. _mm_setcsr(a);
  362. #else
  363. SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(unsigned int, a));
  364. #endif
  365. }
  366. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  367. #define _mm_setcsr(a) simde_mm_setcsr(a)
  368. #endif
  369. SIMDE_FUNCTION_ATTRIBUTES
  370. simde__m128 simde_x_mm_round_ps(simde__m128 a, int rounding, int lax_rounding)
  371. SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15)
  372. SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1)
  373. {
  374. simde__m128_private r_, a_ = simde__m128_to_private(a);
  375. (void)lax_rounding;
  376. /* For architectures which lack a current direction SIMD instruction.
  377. *
  378. * Note that NEON actually has a current rounding mode instruction,
  379. * but in ARMv8+ the rounding mode is ignored and nearest is always
  380. * used, so we treat ARMv7 as having a rounding mode but ARMv8 as
  381. * not. */
  382. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ARM_NEON_A32V8)
  383. if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
  384. rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE())
  385. << 13;
  386. #endif
  387. switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
  388. case SIMDE_MM_FROUND_CUR_DIRECTION:
  389. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  390. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  391. SIMDE_POWER_ALTIVEC_VECTOR(float),
  392. vec_round(a_.altivec_f32));
  393. #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
  394. r_.neon_f32 = vrndiq_f32(a_.neon_f32);
  395. #elif defined(simde_math_nearbyintf)
  396. SIMDE_VECTORIZE
  397. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
  398. i++) {
  399. r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
  400. }
  401. #else
  402. HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  403. #endif
  404. break;
  405. case SIMDE_MM_FROUND_TO_NEAREST_INT:
  406. #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  407. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  408. SIMDE_POWER_ALTIVEC_VECTOR(float),
  409. vec_rint(a_.altivec_f32));
  410. #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
  411. r_.neon_f32 = vrndnq_f32(a_.neon_f32);
  412. #elif defined(simde_math_roundevenf)
  413. SIMDE_VECTORIZE
  414. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
  415. i++) {
  416. r_.f32[i] = simde_math_roundevenf(a_.f32[i]);
  417. }
  418. #else
  419. HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  420. #endif
  421. break;
  422. case SIMDE_MM_FROUND_TO_NEG_INF:
  423. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  424. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  425. SIMDE_POWER_ALTIVEC_VECTOR(float),
  426. vec_floor(a_.altivec_f32));
  427. #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
  428. r_.neon_f32 = vrndmq_f32(a_.neon_f32);
  429. #elif defined(simde_math_floorf)
  430. SIMDE_VECTORIZE
  431. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
  432. i++) {
  433. r_.f32[i] = simde_math_floorf(a_.f32[i]);
  434. }
  435. #else
  436. HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  437. #endif
  438. break;
  439. case SIMDE_MM_FROUND_TO_POS_INF:
  440. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  441. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  442. SIMDE_POWER_ALTIVEC_VECTOR(float),
  443. vec_ceil(a_.altivec_f32));
  444. #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
  445. r_.neon_f32 = vrndpq_f32(a_.neon_f32);
  446. #elif defined(simde_math_ceilf)
  447. SIMDE_VECTORIZE
  448. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
  449. i++) {
  450. r_.f32[i] = simde_math_ceilf(a_.f32[i]);
  451. }
  452. #else
  453. HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  454. #endif
  455. break;
  456. case SIMDE_MM_FROUND_TO_ZERO:
  457. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  458. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  459. SIMDE_POWER_ALTIVEC_VECTOR(float),
  460. vec_trunc(a_.altivec_f32));
  461. #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
  462. r_.neon_f32 = vrndq_f32(a_.neon_f32);
  463. #elif defined(simde_math_truncf)
  464. SIMDE_VECTORIZE
  465. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
  466. i++) {
  467. r_.f32[i] = simde_math_truncf(a_.f32[i]);
  468. }
  469. #else
  470. HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  471. #endif
  472. break;
  473. default:
  474. HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
  475. }
  476. return simde__m128_from_private(r_);
  477. }
  478. #if defined(SIMDE_X86_SSE4_1_NATIVE)
  479. #define simde_mm_round_ps(a, rounding) _mm_round_ps((a), (rounding))
  480. #else
  481. #define simde_mm_round_ps(a, rounding) simde_x_mm_round_ps((a), (rounding), 0)
  482. #endif
  483. #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
  484. #define _mm_round_ps(a, rounding) simde_mm_round_ps((a), (rounding))
  485. #endif
  486. SIMDE_FUNCTION_ATTRIBUTES
  487. simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2,
  488. simde_float32 e1, simde_float32 e0)
  489. {
  490. #if defined(SIMDE_X86_SSE_NATIVE)
  491. return _mm_set_ps(e3, e2, e1, e0);
  492. #else
  493. simde__m128_private r_;
  494. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  495. SIMDE_ALIGN_TO_16 simde_float32 data[4] = {e0, e1, e2, e3};
  496. r_.neon_f32 = vld1q_f32(data);
  497. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  498. r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
  499. #else
  500. r_.f32[0] = e0;
  501. r_.f32[1] = e1;
  502. r_.f32[2] = e2;
  503. r_.f32[3] = e3;
  504. #endif
  505. return simde__m128_from_private(r_);
  506. #endif
  507. }
  508. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  509. #define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
  510. #endif
  511. SIMDE_FUNCTION_ATTRIBUTES
  512. simde__m128 simde_mm_set_ps1(simde_float32 a)
  513. {
  514. #if defined(SIMDE_X86_SSE_NATIVE)
  515. return _mm_set_ps1(a);
  516. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  517. return vdupq_n_f32(a);
  518. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  519. (void)a;
  520. return vec_splats(a);
  521. #else
  522. return simde_mm_set_ps(a, a, a, a);
  523. #endif
  524. }
  525. #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
  526. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  527. #define _mm_set_ps1(a) simde_mm_set_ps1(a)
  528. #define _mm_set1_ps(a) simde_mm_set1_ps(a)
  529. #endif
  530. SIMDE_FUNCTION_ATTRIBUTES
  531. simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
  532. {
  533. #if defined(SIMDE_X86_SSE_NATIVE)
  534. return _mm_move_ss(a, b);
  535. #else
  536. simde__m128_private r_, a_ = simde__m128_to_private(a),
  537. b_ = simde__m128_to_private(b);
  538. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  539. r_.neon_f32 =
  540. vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
  541. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  542. SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
  543. m = {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  544. r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
  545. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  546. r_.wasm_v128 = wasm_v8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2,
  547. 3, 20, 21, 22, 23, 24, 25, 26, 27, 28,
  548. 29, 30, 31);
  549. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  550. r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
  551. #else
  552. r_.f32[0] = b_.f32[0];
  553. r_.f32[1] = a_.f32[1];
  554. r_.f32[2] = a_.f32[2];
  555. r_.f32[3] = a_.f32[3];
  556. #endif
  557. return simde__m128_from_private(r_);
  558. #endif
  559. }
  560. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  561. #define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
  562. #endif
  563. SIMDE_FUNCTION_ATTRIBUTES
  564. simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
  565. {
  566. #if defined(SIMDE_X86_SSE_NATIVE)
  567. return _mm_add_ps(a, b);
  568. #else
  569. simde__m128_private r_, a_ = simde__m128_to_private(a),
  570. b_ = simde__m128_to_private(b);
  571. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  572. r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
  573. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  574. r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
  575. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  576. r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
  577. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  578. r_.f32 = a_.f32 + b_.f32;
  579. #else
  580. SIMDE_VECTORIZE
  581. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  582. r_.f32[i] = a_.f32[i] + b_.f32[i];
  583. }
  584. #endif
  585. return simde__m128_from_private(r_);
  586. #endif
  587. }
  588. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  589. #define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
  590. #endif
  591. SIMDE_FUNCTION_ATTRIBUTES
  592. simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
  593. {
  594. #if defined(SIMDE_X86_SSE_NATIVE)
  595. return _mm_add_ss(a, b);
  596. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  597. return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
  598. #else
  599. simde__m128_private r_, a_ = simde__m128_to_private(a),
  600. b_ = simde__m128_to_private(b);
  601. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  602. float32_t b0 = vgetq_lane_f32(b_.neon_f32, 0);
  603. float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
  604. // the upper values in the result must be the remnants of <a>.
  605. r_.neon_f32 = vaddq_f32(a_.neon_f32, value);
  606. #else
  607. r_.f32[0] = a_.f32[0] + b_.f32[0];
  608. r_.f32[1] = a_.f32[1];
  609. r_.f32[2] = a_.f32[2];
  610. r_.f32[3] = a_.f32[3];
  611. #endif
  612. return simde__m128_from_private(r_);
  613. #endif
  614. }
  615. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  616. #define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
  617. #endif
  618. SIMDE_FUNCTION_ATTRIBUTES
  619. simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
  620. {
  621. #if defined(SIMDE_X86_SSE_NATIVE)
  622. return _mm_and_ps(a, b);
  623. #else
  624. simde__m128_private r_, a_ = simde__m128_to_private(a),
  625. b_ = simde__m128_to_private(b);
  626. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  627. r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
  628. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  629. r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
  630. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  631. r_.i32 = a_.i32 & b_.i32;
  632. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  633. r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
  634. #else
  635. SIMDE_VECTORIZE
  636. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  637. r_.i32[i] = a_.i32[i] & b_.i32[i];
  638. }
  639. #endif
  640. return simde__m128_from_private(r_);
  641. #endif
  642. }
  643. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  644. #define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
  645. #endif
  646. SIMDE_FUNCTION_ATTRIBUTES
  647. simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
  648. {
  649. #if defined(SIMDE_X86_SSE_NATIVE)
  650. return _mm_andnot_ps(a, b);
  651. #else
  652. simde__m128_private r_, a_ = simde__m128_to_private(a),
  653. b_ = simde__m128_to_private(b);
  654. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  655. r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
  656. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  657. r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
  658. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  659. r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32);
  660. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  661. r_.i32 = ~a_.i32 & b_.i32;
  662. #else
  663. SIMDE_VECTORIZE
  664. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  665. r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
  666. }
  667. #endif
  668. return simde__m128_from_private(r_);
  669. #endif
  670. }
  671. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  672. #define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
  673. #endif
  674. SIMDE_FUNCTION_ATTRIBUTES
  675. simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
  676. {
  677. #if defined(SIMDE_X86_SSE_NATIVE)
  678. return _mm_xor_ps(a, b);
  679. #else
  680. simde__m128_private r_, a_ = simde__m128_to_private(a),
  681. b_ = simde__m128_to_private(b);
  682. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  683. r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
  684. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  685. r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
  686. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  687. r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
  688. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  689. r_.i32f = a_.i32f ^ b_.i32f;
  690. #else
  691. SIMDE_VECTORIZE
  692. for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
  693. r_.u32[i] = a_.u32[i] ^ b_.u32[i];
  694. }
  695. #endif
  696. return simde__m128_from_private(r_);
  697. #endif
  698. }
  699. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  700. #define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
  701. #endif
  702. SIMDE_FUNCTION_ATTRIBUTES
  703. simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
  704. {
  705. #if defined(SIMDE_X86_SSE_NATIVE)
  706. return _mm_or_ps(a, b);
  707. #else
  708. simde__m128_private r_, a_ = simde__m128_to_private(a),
  709. b_ = simde__m128_to_private(b);
  710. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  711. r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
  712. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  713. r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
  714. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  715. r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
  716. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  717. r_.i32f = a_.i32f | b_.i32f;
  718. #else
  719. SIMDE_VECTORIZE
  720. for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
  721. r_.u32[i] = a_.u32[i] | b_.u32[i];
  722. }
  723. #endif
  724. return simde__m128_from_private(r_);
  725. #endif
  726. }
  727. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  728. #define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
  729. #endif
  730. SIMDE_FUNCTION_ATTRIBUTES
  731. simde__m128 simde_x_mm_not_ps(simde__m128 a)
  732. {
  733. #if defined(SIMDE_X86_AVX512VL_NATIVE)
  734. __m128i ai = _mm_castps_si128(a);
  735. return _mm_castsi128_ps(_mm_ternarylogic_epi32(ai, ai, ai, 0x55));
  736. #elif defined(SIMDE_X86_SSE2_NATIVE)
  737. /* Note: we use ints instead of floats because we don't want cmpeq
  738. * to return false for (NaN, NaN) */
  739. __m128i ai = _mm_castps_si128(a);
  740. return _mm_castsi128_ps(_mm_andnot_si128(ai, _mm_cmpeq_epi32(ai, ai)));
  741. #else
  742. simde__m128_private r_, a_ = simde__m128_to_private(a);
  743. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  744. r_.neon_i32 = vmvnq_s32(a_.neon_i32);
  745. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  746. r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
  747. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  748. r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
  749. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  750. r_.i32 = ~a_.i32;
  751. #else
  752. SIMDE_VECTORIZE
  753. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  754. r_.i32[i] = ~(a_.i32[i]);
  755. }
  756. #endif
  757. return simde__m128_from_private(r_);
  758. #endif
  759. }
  760. SIMDE_FUNCTION_ATTRIBUTES
  761. simde__m128 simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask)
  762. {
  763. /* This function is for when you want to blend two elements together
  764. * according to a mask. It is similar to _mm_blendv_ps, except that
  765. * it is undefined whether the blend is based on the highest bit in
  766. * each lane (like blendv) or just bitwise operations. This allows
  767. * us to implement the function efficiently everywhere.
  768. *
  769. * Basically, you promise that all the lanes in mask are either 0 or
  770. * ~0. */
  771. #if defined(SIMDE_X86_SSE4_1_NATIVE)
  772. return _mm_blendv_ps(a, b, mask);
  773. #else
  774. simde__m128_private r_, a_ = simde__m128_to_private(a),
  775. b_ = simde__m128_to_private(b),
  776. mask_ = simde__m128_to_private(mask);
  777. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  778. r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
  779. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  780. r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128,
  781. mask_.wasm_v128);
  782. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  783. r_.altivec_i32 =
  784. vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32);
  785. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  786. r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
  787. #else
  788. SIMDE_VECTORIZE
  789. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  790. r_.i32[i] = a_.i32[i] ^
  791. ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
  792. }
  793. #endif
  794. return simde__m128_from_private(r_);
  795. #endif
  796. }
  797. SIMDE_FUNCTION_ATTRIBUTES
  798. simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
  799. {
  800. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  801. return _mm_avg_pu16(a, b);
  802. #else
  803. simde__m64_private r_, a_ = simde__m64_to_private(a),
  804. b_ = simde__m64_to_private(b);
  805. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  806. r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
  807. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \
  808. defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
  809. defined(SIMDE_CONVERT_VECTOR_)
  810. uint32_t wa SIMDE_VECTOR(16);
  811. uint32_t wb SIMDE_VECTOR(16);
  812. uint32_t wr SIMDE_VECTOR(16);
  813. SIMDE_CONVERT_VECTOR_(wa, a_.u16);
  814. SIMDE_CONVERT_VECTOR_(wb, b_.u16);
  815. wr = (wa + wb + 1) >> 1;
  816. SIMDE_CONVERT_VECTOR_(r_.u16, wr);
  817. #else
  818. SIMDE_VECTORIZE
  819. for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
  820. r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
  821. }
  822. #endif
  823. return simde__m64_from_private(r_);
  824. #endif
  825. }
  826. #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
  827. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  828. #define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
  829. #define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
  830. #endif
  831. SIMDE_FUNCTION_ATTRIBUTES
  832. simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
  833. {
  834. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  835. return _mm_avg_pu8(a, b);
  836. #else
  837. simde__m64_private r_, a_ = simde__m64_to_private(a),
  838. b_ = simde__m64_to_private(b);
  839. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  840. r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
  841. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && \
  842. defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
  843. defined(SIMDE_CONVERT_VECTOR_)
  844. uint16_t wa SIMDE_VECTOR(16);
  845. uint16_t wb SIMDE_VECTOR(16);
  846. uint16_t wr SIMDE_VECTOR(16);
  847. SIMDE_CONVERT_VECTOR_(wa, a_.u8);
  848. SIMDE_CONVERT_VECTOR_(wb, b_.u8);
  849. wr = (wa + wb + 1) >> 1;
  850. SIMDE_CONVERT_VECTOR_(r_.u8, wr);
  851. #else
  852. SIMDE_VECTORIZE
  853. for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
  854. r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
  855. }
  856. #endif
  857. return simde__m64_from_private(r_);
  858. #endif
  859. }
  860. #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
  861. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  862. #define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
  863. #define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
  864. #endif
  865. SIMDE_FUNCTION_ATTRIBUTES
  866. simde__m128 simde_x_mm_abs_ps(simde__m128 a)
  867. {
  868. #if defined(SIMDE_X86_AVX512F_NATIVE) && \
  869. (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7, 1, 0))
  870. return _mm512_castps512_ps128(_mm512_abs_ps(_mm512_castps128_ps512(a)));
  871. #else
  872. simde__m128_private r_, a_ = simde__m128_to_private(a);
  873. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  874. r_.neon_f32 = vabsq_f32(a_.neon_f32);
  875. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  876. r_.altivec_f32 = vec_abs(a_.altivec_f32);
  877. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  878. r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128);
  879. #else
  880. SIMDE_VECTORIZE
  881. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  882. r_.f32[i] = simde_math_fabsf(a_.f32[i]);
  883. }
  884. #endif
  885. return simde__m128_from_private(r_);
  886. #endif
  887. }
  888. SIMDE_FUNCTION_ATTRIBUTES
  889. simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
  890. {
  891. #if defined(SIMDE_X86_SSE_NATIVE)
  892. return _mm_cmpeq_ps(a, b);
  893. #else
  894. simde__m128_private r_, a_ = simde__m128_to_private(a),
  895. b_ = simde__m128_to_private(b);
  896. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  897. r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32);
  898. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  899. r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
  900. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  901. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  902. SIMDE_POWER_ALTIVEC_VECTOR(float),
  903. vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
  904. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  905. r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32);
  906. #else
  907. SIMDE_VECTORIZE
  908. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  909. r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0)
  910. : UINT32_C(0);
  911. }
  912. #endif
  913. return simde__m128_from_private(r_);
  914. #endif
  915. }
  916. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  917. #define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
  918. #endif
  919. SIMDE_FUNCTION_ATTRIBUTES
  920. simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
  921. {
  922. #if defined(SIMDE_X86_SSE_NATIVE)
  923. return _mm_cmpeq_ss(a, b);
  924. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  925. return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
  926. #else
  927. simde__m128_private r_, a_ = simde__m128_to_private(a),
  928. b_ = simde__m128_to_private(b);
  929. r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
  930. SIMDE_VECTORIZE
  931. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  932. r_.u32[i] = a_.u32[i];
  933. }
  934. return simde__m128_from_private(r_);
  935. #endif
  936. }
  937. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  938. #define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
  939. #endif
  940. SIMDE_FUNCTION_ATTRIBUTES
  941. simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
  942. {
  943. #if defined(SIMDE_X86_SSE_NATIVE)
  944. return _mm_cmpge_ps(a, b);
  945. #else
  946. simde__m128_private r_, a_ = simde__m128_to_private(a),
  947. b_ = simde__m128_to_private(b);
  948. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  949. r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32);
  950. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  951. r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
  952. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  953. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  954. SIMDE_POWER_ALTIVEC_VECTOR(float),
  955. vec_cmpge(a_.altivec_f32, b_.altivec_f32));
  956. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  957. r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
  958. #else
  959. SIMDE_VECTORIZE
  960. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  961. r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0)
  962. : UINT32_C(0);
  963. }
  964. #endif
  965. return simde__m128_from_private(r_);
  966. #endif
  967. }
  968. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  969. #define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
  970. #endif
  971. SIMDE_FUNCTION_ATTRIBUTES
  972. simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
  973. {
  974. #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
  975. return _mm_cmpge_ss(a, b);
  976. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  977. return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
  978. #else
  979. simde__m128_private r_, a_ = simde__m128_to_private(a),
  980. b_ = simde__m128_to_private(b);
  981. r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
  982. SIMDE_VECTORIZE
  983. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  984. r_.u32[i] = a_.u32[i];
  985. }
  986. return simde__m128_from_private(r_);
  987. #endif
  988. }
  989. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  990. #define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
  991. #endif
  992. SIMDE_FUNCTION_ATTRIBUTES
  993. simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
  994. {
  995. #if defined(SIMDE_X86_SSE_NATIVE)
  996. return _mm_cmpgt_ps(a, b);
  997. #else
  998. simde__m128_private r_, a_ = simde__m128_to_private(a),
  999. b_ = simde__m128_to_private(b);
  1000. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1001. r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32);
  1002. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1003. r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
  1004. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  1005. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1006. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1007. vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
  1008. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1009. r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
  1010. #else
  1011. SIMDE_VECTORIZE
  1012. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1013. r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0)
  1014. : UINT32_C(0);
  1015. }
  1016. #endif
  1017. return simde__m128_from_private(r_);
  1018. #endif
  1019. }
  1020. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1021. #define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
  1022. #endif
  1023. SIMDE_FUNCTION_ATTRIBUTES
  1024. simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
  1025. {
  1026. #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
  1027. return _mm_cmpgt_ss(a, b);
  1028. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  1029. return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
  1030. #else
  1031. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1032. b_ = simde__m128_to_private(b);
  1033. r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
  1034. SIMDE_VECTORIZE
  1035. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1036. r_.u32[i] = a_.u32[i];
  1037. }
  1038. return simde__m128_from_private(r_);
  1039. #endif
  1040. }
  1041. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1042. #define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
  1043. #endif
  1044. SIMDE_FUNCTION_ATTRIBUTES
  1045. simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
  1046. {
  1047. #if defined(SIMDE_X86_SSE_NATIVE)
  1048. return _mm_cmple_ps(a, b);
  1049. #else
  1050. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1051. b_ = simde__m128_to_private(b);
  1052. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1053. r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32);
  1054. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1055. r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
  1056. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  1057. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1058. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1059. vec_cmple(a_.altivec_f32, b_.altivec_f32));
  1060. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1061. r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
  1062. #else
  1063. SIMDE_VECTORIZE
  1064. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1065. r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0)
  1066. : UINT32_C(0);
  1067. }
  1068. #endif
  1069. return simde__m128_from_private(r_);
  1070. #endif
  1071. }
  1072. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1073. #define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
  1074. #endif
  1075. SIMDE_FUNCTION_ATTRIBUTES
  1076. simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
  1077. {
  1078. #if defined(SIMDE_X86_SSE_NATIVE)
  1079. return _mm_cmple_ss(a, b);
  1080. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  1081. return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
  1082. #else
  1083. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1084. b_ = simde__m128_to_private(b);
  1085. r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
  1086. SIMDE_VECTORIZE
  1087. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1088. r_.u32[i] = a_.u32[i];
  1089. }
  1090. return simde__m128_from_private(r_);
  1091. #endif
  1092. }
  1093. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1094. #define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
  1095. #endif
  1096. SIMDE_FUNCTION_ATTRIBUTES
  1097. simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
  1098. {
  1099. #if defined(SIMDE_X86_SSE_NATIVE)
  1100. return _mm_cmplt_ps(a, b);
  1101. #else
  1102. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1103. b_ = simde__m128_to_private(b);
  1104. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1105. r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
  1106. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1107. r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
  1108. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  1109. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1110. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1111. vec_cmplt(a_.altivec_f32, b_.altivec_f32));
  1112. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1113. r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
  1114. #else
  1115. SIMDE_VECTORIZE
  1116. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1117. r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0)
  1118. : UINT32_C(0);
  1119. }
  1120. #endif
  1121. return simde__m128_from_private(r_);
  1122. #endif
  1123. }
  1124. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1125. #define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
  1126. #endif
  1127. SIMDE_FUNCTION_ATTRIBUTES
  1128. simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
  1129. {
  1130. #if defined(SIMDE_X86_SSE_NATIVE)
  1131. return _mm_cmplt_ss(a, b);
  1132. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  1133. return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
  1134. #else
  1135. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1136. b_ = simde__m128_to_private(b);
  1137. r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
  1138. SIMDE_VECTORIZE
  1139. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1140. r_.u32[i] = a_.u32[i];
  1141. }
  1142. return simde__m128_from_private(r_);
  1143. #endif
  1144. }
  1145. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1146. #define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
  1147. #endif
  1148. SIMDE_FUNCTION_ATTRIBUTES
  1149. simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
  1150. {
  1151. #if defined(SIMDE_X86_SSE_NATIVE)
  1152. return _mm_cmpneq_ps(a, b);
  1153. #else
  1154. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1155. b_ = simde__m128_to_private(b);
  1156. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1157. r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
  1158. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1159. r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
  1160. #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && SIMDE_ARCH_POWER_CHECK(900) && \
  1161. !defined(HEDLEY_IBM_VERSION)
  1162. /* vec_cmpne(SIMDE_POWER_ALTIVEC_VECTOR(float), SIMDE_POWER_ALTIVEC_VECTOR(float))
  1163. is missing from XL C/C++ v16.1.1,
  1164. though the documentation (table 89 on page 432 of the IBM XL C/C++ for
  1165. Linux Compiler Reference, Version 16.1.1) shows that it should be
  1166. present. Both GCC and clang support it. */
  1167. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1168. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1169. vec_cmpne(a_.altivec_f32, b_.altivec_f32));
  1170. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  1171. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1172. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1173. vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
  1174. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1175. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1176. vec_nor(r_.altivec_f32, r_.altivec_f32));
  1177. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  1178. r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
  1179. #else
  1180. SIMDE_VECTORIZE
  1181. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1182. r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0)
  1183. : UINT32_C(0);
  1184. }
  1185. #endif
  1186. return simde__m128_from_private(r_);
  1187. #endif
  1188. }
  1189. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1190. #define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
  1191. #endif
  1192. SIMDE_FUNCTION_ATTRIBUTES
  1193. simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
  1194. {
  1195. #if defined(SIMDE_X86_SSE_NATIVE)
  1196. return _mm_cmpneq_ss(a, b);
  1197. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  1198. return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
  1199. #else
  1200. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1201. b_ = simde__m128_to_private(b);
  1202. r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
  1203. SIMDE_VECTORIZE
  1204. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1205. r_.u32[i] = a_.u32[i];
  1206. }
  1207. return simde__m128_from_private(r_);
  1208. #endif
  1209. }
  1210. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1211. #define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
  1212. #endif
  1213. SIMDE_FUNCTION_ATTRIBUTES
  1214. simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
  1215. {
  1216. return simde_mm_cmplt_ps(a, b);
  1217. }
  1218. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1219. #define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
  1220. #endif
  1221. SIMDE_FUNCTION_ATTRIBUTES
  1222. simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
  1223. {
  1224. return simde_mm_cmplt_ss(a, b);
  1225. }
  1226. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1227. #define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
  1228. #endif
  1229. SIMDE_FUNCTION_ATTRIBUTES
  1230. simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
  1231. {
  1232. return simde_mm_cmple_ps(a, b);
  1233. }
  1234. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1235. #define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
  1236. #endif
  1237. SIMDE_FUNCTION_ATTRIBUTES
  1238. simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
  1239. {
  1240. return simde_mm_cmple_ss(a, b);
  1241. }
  1242. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1243. #define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
  1244. #endif
  1245. SIMDE_FUNCTION_ATTRIBUTES
  1246. simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
  1247. {
  1248. return simde_mm_cmpgt_ps(a, b);
  1249. }
  1250. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1251. #define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
  1252. #endif
  1253. SIMDE_FUNCTION_ATTRIBUTES
  1254. simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
  1255. {
  1256. return simde_mm_cmpgt_ss(a, b);
  1257. }
  1258. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1259. #define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
  1260. #endif
  1261. SIMDE_FUNCTION_ATTRIBUTES
  1262. simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
  1263. {
  1264. return simde_mm_cmpge_ps(a, b);
  1265. }
  1266. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1267. #define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
  1268. #endif
  1269. SIMDE_FUNCTION_ATTRIBUTES
  1270. simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
  1271. {
  1272. return simde_mm_cmpge_ss(a, b);
  1273. }
  1274. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1275. #define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
  1276. #endif
  1277. SIMDE_FUNCTION_ATTRIBUTES
  1278. simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
  1279. {
  1280. #if defined(SIMDE_X86_SSE_NATIVE)
  1281. return _mm_cmpord_ps(a, b);
  1282. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1283. return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b));
  1284. #else
  1285. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1286. b_ = simde__m128_to_private(b);
  1287. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1288. /* Note: NEON does not have ordered compare builtin
  1289. Need to compare a eq a and b eq b to check for NaN
  1290. Do AND of results to get final */
  1291. uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1292. uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1293. r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
  1294. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1295. r_.wasm_v128 = wasm_v128_and(wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128),
  1296. wasm_f32x4_eq(b_.wasm_v128, b_.wasm_v128));
  1297. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  1298. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1299. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1300. vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
  1301. vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
  1302. #elif defined(simde_math_isnanf)
  1303. SIMDE_VECTORIZE
  1304. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1305. r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
  1306. simde_math_isnanf(b_.f32[i]))
  1307. ? UINT32_C(0)
  1308. : ~UINT32_C(0);
  1309. }
  1310. #else
  1311. HEDLEY_UNREACHABLE();
  1312. #endif
  1313. return simde__m128_from_private(r_);
  1314. #endif
  1315. }
  1316. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1317. #define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
  1318. #endif
  1319. SIMDE_FUNCTION_ATTRIBUTES
  1320. simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
  1321. {
  1322. #if defined(SIMDE_X86_SSE_NATIVE)
  1323. return _mm_cmpunord_ps(a, b);
  1324. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1325. return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b));
  1326. #else
  1327. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1328. b_ = simde__m128_to_private(b);
  1329. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1330. uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1331. uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1332. r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb));
  1333. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1334. r_.wasm_v128 = wasm_v128_or(wasm_f32x4_ne(a_.wasm_v128, a_.wasm_v128),
  1335. wasm_f32x4_ne(b_.wasm_v128, b_.wasm_v128));
  1336. #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  1337. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1338. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1339. vec_nand(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
  1340. vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
  1341. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  1342. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  1343. SIMDE_POWER_ALTIVEC_VECTOR(float),
  1344. vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
  1345. vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
  1346. r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32);
  1347. #elif defined(simde_math_isnanf)
  1348. SIMDE_VECTORIZE
  1349. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1350. r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
  1351. simde_math_isnanf(b_.f32[i]))
  1352. ? ~UINT32_C(0)
  1353. : UINT32_C(0);
  1354. }
  1355. #else
  1356. HEDLEY_UNREACHABLE();
  1357. #endif
  1358. return simde__m128_from_private(r_);
  1359. #endif
  1360. }
  1361. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1362. #define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
  1363. #endif
  1364. SIMDE_FUNCTION_ATTRIBUTES
  1365. simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
  1366. {
  1367. #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
  1368. return _mm_cmpunord_ss(a, b);
  1369. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  1370. return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
  1371. #else
  1372. simde__m128_private r_, a_ = simde__m128_to_private(a),
  1373. b_ = simde__m128_to_private(b);
  1374. #if defined(simde_math_isnanf)
  1375. r_.u32[0] =
  1376. (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0]))
  1377. ? ~UINT32_C(0)
  1378. : UINT32_C(0);
  1379. SIMDE_VECTORIZE
  1380. for (size_t i = 1; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
  1381. r_.u32[i] = a_.u32[i];
  1382. }
  1383. #else
  1384. HEDLEY_UNREACHABLE();
  1385. #endif
  1386. return simde__m128_from_private(r_);
  1387. #endif
  1388. }
  1389. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1390. #define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
  1391. #endif
  1392. SIMDE_FUNCTION_ATTRIBUTES
  1393. int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
  1394. {
  1395. #if defined(SIMDE_X86_SSE_NATIVE)
  1396. return _mm_comieq_ss(a, b);
  1397. #else
  1398. simde__m128_private a_ = simde__m128_to_private(a),
  1399. b_ = simde__m128_to_private(b);
  1400. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1401. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1402. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1403. uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
  1404. uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
  1405. return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
  1406. #else
  1407. return a_.f32[0] == b_.f32[0];
  1408. #endif
  1409. #endif
  1410. }
  1411. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1412. #define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
  1413. #endif
  1414. SIMDE_FUNCTION_ATTRIBUTES
  1415. int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
  1416. {
  1417. #if defined(SIMDE_X86_SSE_NATIVE)
  1418. return _mm_comige_ss(a, b);
  1419. #else
  1420. simde__m128_private a_ = simde__m128_to_private(a),
  1421. b_ = simde__m128_to_private(b);
  1422. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1423. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1424. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1425. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  1426. uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
  1427. return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
  1428. #else
  1429. return a_.f32[0] >= b_.f32[0];
  1430. #endif
  1431. #endif
  1432. }
  1433. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1434. #define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
  1435. #endif
  1436. SIMDE_FUNCTION_ATTRIBUTES
  1437. int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
  1438. {
  1439. #if defined(SIMDE_X86_SSE_NATIVE)
  1440. return _mm_comigt_ss(a, b);
  1441. #else
  1442. simde__m128_private a_ = simde__m128_to_private(a),
  1443. b_ = simde__m128_to_private(b);
  1444. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1445. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1446. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1447. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  1448. uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
  1449. return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
  1450. #else
  1451. return a_.f32[0] > b_.f32[0];
  1452. #endif
  1453. #endif
  1454. }
  1455. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1456. #define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
  1457. #endif
  1458. SIMDE_FUNCTION_ATTRIBUTES
  1459. int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
  1460. {
  1461. #if defined(SIMDE_X86_SSE_NATIVE)
  1462. return _mm_comile_ss(a, b);
  1463. #else
  1464. simde__m128_private a_ = simde__m128_to_private(a),
  1465. b_ = simde__m128_to_private(b);
  1466. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1467. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1468. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1469. uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
  1470. uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
  1471. return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
  1472. #else
  1473. return a_.f32[0] <= b_.f32[0];
  1474. #endif
  1475. #endif
  1476. }
  1477. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1478. #define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
  1479. #endif
  1480. SIMDE_FUNCTION_ATTRIBUTES
  1481. int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
  1482. {
  1483. #if defined(SIMDE_X86_SSE_NATIVE)
  1484. return _mm_comilt_ss(a, b);
  1485. #else
  1486. simde__m128_private a_ = simde__m128_to_private(a),
  1487. b_ = simde__m128_to_private(b);
  1488. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1489. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1490. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1491. uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
  1492. uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
  1493. return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
  1494. #else
  1495. return a_.f32[0] < b_.f32[0];
  1496. #endif
  1497. #endif
  1498. }
  1499. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1500. #define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
  1501. #endif
  1502. SIMDE_FUNCTION_ATTRIBUTES
  1503. int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
  1504. {
  1505. #if defined(SIMDE_X86_SSE_NATIVE)
  1506. return _mm_comineq_ss(a, b);
  1507. #else
  1508. simde__m128_private a_ = simde__m128_to_private(a),
  1509. b_ = simde__m128_to_private(b);
  1510. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1511. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  1512. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  1513. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  1514. uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
  1515. return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
  1516. #else
  1517. return a_.f32[0] != b_.f32[0];
  1518. #endif
  1519. #endif
  1520. }
  1521. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1522. #define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
  1523. #endif
  1524. SIMDE_FUNCTION_ATTRIBUTES
  1525. simde__m128 simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src)
  1526. {
  1527. simde__m128_private r_, dest_ = simde__m128_to_private(dest),
  1528. src_ = simde__m128_to_private(src);
  1529. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1530. const uint32x4_t sign_pos =
  1531. vreinterpretq_u32_f32(vdupq_n_f32(-SIMDE_FLOAT32_C(0.0)));
  1532. r_.neon_u32 = vbslq_u32(sign_pos, src_.neon_u32, dest_.neon_u32);
  1533. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  1534. const v128_t sign_pos = wasm_f32x4_splat(-0.0f);
  1535. r_.wasm_v128 =
  1536. wasm_v128_bitselect(src_.wasm_v128, dest_.wasm_v128, sign_pos);
  1537. #elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
  1538. #if !defined(HEDLEY_IBM_VERSION)
  1539. r_.altivec_f32 = vec_cpsgn(dest_.altivec_f32, src_.altivec_f32);
  1540. #else
  1541. r_.altivec_f32 = vec_cpsgn(src_.altivec_f32, dest_.altivec_f32);
  1542. #endif
  1543. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  1544. const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)
  1545. sign_pos = HEDLEY_REINTERPRET_CAST(
  1546. SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
  1547. vec_splats(-0.0f));
  1548. r_.altivec_f32 = vec_sel(dest_.altivec_f32, src_.altivec_f32, sign_pos);
  1549. #elif defined(SIMDE_IEEE754_STORAGE)
  1550. (void)src_;
  1551. (void)dest_;
  1552. simde__m128 sign_pos = simde_mm_set1_ps(-0.0f);
  1553. r_ = simde__m128_to_private(simde_mm_xor_ps(
  1554. dest, simde_mm_and_ps(simde_mm_xor_ps(dest, src), sign_pos)));
  1555. #else
  1556. SIMDE_VECTORIZE
  1557. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1558. r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]);
  1559. }
  1560. #endif
  1561. return simde__m128_from_private(r_);
  1562. }
  1563. SIMDE_FUNCTION_ATTRIBUTES
  1564. simde__m128 simde_x_mm_xorsign_ps(simde__m128 dest, simde__m128 src)
  1565. {
  1566. return simde_mm_xor_ps(simde_mm_and_ps(simde_mm_set1_ps(-0.0f), src),
  1567. dest);
  1568. }
  1569. SIMDE_FUNCTION_ATTRIBUTES
  1570. simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
  1571. {
  1572. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1573. return _mm_cvt_pi2ps(a, b);
  1574. #else
  1575. simde__m128_private r_, a_ = simde__m128_to_private(a);
  1576. simde__m64_private b_ = simde__m64_to_private(b);
  1577. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1578. r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
  1579. vget_high_f32(a_.neon_f32));
  1580. #elif defined(SIMDE_CONVERT_VECTOR_)
  1581. SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
  1582. r_.m64_private[1] = a_.m64_private[1];
  1583. #else
  1584. r_.f32[0] = (simde_float32)b_.i32[0];
  1585. r_.f32[1] = (simde_float32)b_.i32[1];
  1586. r_.i32[2] = a_.i32[2];
  1587. r_.i32[3] = a_.i32[3];
  1588. #endif
  1589. return simde__m128_from_private(r_);
  1590. #endif
  1591. }
  1592. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1593. #define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), (b))
  1594. #endif
  1595. SIMDE_FUNCTION_ATTRIBUTES
  1596. simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
  1597. {
  1598. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1599. return _mm_cvt_ps2pi(a);
  1600. #else
  1601. simde__m64_private r_;
  1602. simde__m128_private a_;
  1603. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1604. a_ = simde__m128_to_private(
  1605. simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
  1606. r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
  1607. #elif defined(SIMDE_CONVERT_VECTOR_) && SIMDE_NATURAL_VECTOR_SIZE_GE(128)
  1608. a_ = simde__m128_to_private(
  1609. simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
  1610. SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
  1611. #else
  1612. a_ = simde__m128_to_private(a);
  1613. SIMDE_VECTORIZE
  1614. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  1615. r_.i32[i] = HEDLEY_STATIC_CAST(
  1616. int32_t, simde_math_nearbyintf(a_.f32[i]));
  1617. }
  1618. #endif
  1619. return simde__m64_from_private(r_);
  1620. #endif
  1621. }
  1622. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1623. #define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
  1624. #endif
  1625. SIMDE_FUNCTION_ATTRIBUTES
  1626. simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
  1627. {
  1628. #if defined(SIMDE_X86_SSE_NATIVE)
  1629. return _mm_cvt_si2ss(a, b);
  1630. #else
  1631. simde__m128_private r_, a_ = simde__m128_to_private(a);
  1632. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1633. r_.neon_f32 =
  1634. vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0);
  1635. #else
  1636. r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
  1637. r_.i32[1] = a_.i32[1];
  1638. r_.i32[2] = a_.i32[2];
  1639. r_.i32[3] = a_.i32[3];
  1640. #endif
  1641. return simde__m128_from_private(r_);
  1642. #endif
  1643. }
  1644. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1645. #define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
  1646. #endif
  1647. SIMDE_FUNCTION_ATTRIBUTES
  1648. int32_t simde_mm_cvt_ss2si(simde__m128 a)
  1649. {
  1650. #if defined(SIMDE_X86_SSE_NATIVE)
  1651. return _mm_cvt_ss2si(a);
  1652. #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
  1653. return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0);
  1654. #else
  1655. simde__m128_private a_ = simde__m128_to_private(
  1656. simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
  1657. return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
  1658. #endif
  1659. }
  1660. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1661. #define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
  1662. #endif
  1663. SIMDE_FUNCTION_ATTRIBUTES
  1664. simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
  1665. {
  1666. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1667. return _mm_cvtpi16_ps(a);
  1668. #else
  1669. simde__m128_private r_;
  1670. simde__m64_private a_ = simde__m64_to_private(a);
  1671. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1672. r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16));
  1673. #elif defined(SIMDE_CONVERT_VECTOR_)
  1674. SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16);
  1675. #else
  1676. SIMDE_VECTORIZE
  1677. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1678. simde_float32 v = a_.i16[i];
  1679. r_.f32[i] = v;
  1680. }
  1681. #endif
  1682. return simde__m128_from_private(r_);
  1683. #endif
  1684. }
  1685. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1686. #define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
  1687. #endif
  1688. SIMDE_FUNCTION_ATTRIBUTES
  1689. simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
  1690. {
  1691. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1692. return _mm_cvtpi32_ps(a, b);
  1693. #else
  1694. simde__m128_private r_, a_ = simde__m128_to_private(a);
  1695. simde__m64_private b_ = simde__m64_to_private(b);
  1696. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1697. r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
  1698. vget_high_f32(a_.neon_f32));
  1699. #elif defined(SIMDE_CONVERT_VECTOR_)
  1700. SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
  1701. r_.m64_private[1] = a_.m64_private[1];
  1702. #else
  1703. r_.f32[0] = (simde_float32)b_.i32[0];
  1704. r_.f32[1] = (simde_float32)b_.i32[1];
  1705. r_.i32[2] = a_.i32[2];
  1706. r_.i32[3] = a_.i32[3];
  1707. #endif
  1708. return simde__m128_from_private(r_);
  1709. #endif
  1710. }
  1711. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1712. #define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
  1713. #endif
  1714. SIMDE_FUNCTION_ATTRIBUTES
  1715. simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
  1716. {
  1717. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1718. return _mm_cvtpi32x2_ps(a, b);
  1719. #else
  1720. simde__m128_private r_;
  1721. simde__m64_private a_ = simde__m64_to_private(a),
  1722. b_ = simde__m64_to_private(b);
  1723. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1724. r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
  1725. #elif defined(SIMDE_CONVERT_VECTOR_)
  1726. SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32);
  1727. SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32);
  1728. #else
  1729. r_.f32[0] = (simde_float32)a_.i32[0];
  1730. r_.f32[1] = (simde_float32)a_.i32[1];
  1731. r_.f32[2] = (simde_float32)b_.i32[0];
  1732. r_.f32[3] = (simde_float32)b_.i32[1];
  1733. #endif
  1734. return simde__m128_from_private(r_);
  1735. #endif
  1736. }
  1737. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1738. #define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
  1739. #endif
  1740. SIMDE_FUNCTION_ATTRIBUTES
  1741. simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
  1742. {
  1743. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1744. return _mm_cvtpi8_ps(a);
  1745. #else
  1746. simde__m128_private r_;
  1747. simde__m64_private a_ = simde__m64_to_private(a);
  1748. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1749. r_.neon_f32 =
  1750. vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
  1751. #else
  1752. r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]);
  1753. r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]);
  1754. r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]);
  1755. r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]);
  1756. #endif
  1757. return simde__m128_from_private(r_);
  1758. #endif
  1759. }
  1760. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1761. #define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
  1762. #endif
  1763. SIMDE_FUNCTION_ATTRIBUTES
  1764. simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
  1765. {
  1766. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1767. return _mm_cvtps_pi16(a);
  1768. #else
  1769. simde__m64_private r_;
  1770. simde__m128_private a_ = simde__m128_to_private(a);
  1771. #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
  1772. r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(vrndiq_f32(a_.neon_f32)));
  1773. #else
  1774. SIMDE_VECTORIZE
  1775. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  1776. r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t,
  1777. simde_math_roundf(a_.f32[i]));
  1778. }
  1779. #endif
  1780. return simde__m64_from_private(r_);
  1781. #endif
  1782. }
  1783. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1784. #define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
  1785. #endif
  1786. SIMDE_FUNCTION_ATTRIBUTES
  1787. simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
  1788. {
  1789. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1790. return _mm_cvtps_pi32(a);
  1791. #else
  1792. simde__m64_private r_;
  1793. simde__m128_private a_ = simde__m128_to_private(a);
  1794. #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
  1795. defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399)
  1796. r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32)));
  1797. #else
  1798. SIMDE_VECTORIZE
  1799. for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
  1800. simde_float32 v = simde_math_roundf(a_.f32[i]);
  1801. #if !defined(SIMDE_FAST_CONVERSION_RANGE)
  1802. r_.i32[i] =
  1803. ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
  1804. (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
  1805. ? SIMDE_CONVERT_FTOI(int32_t, v)
  1806. : INT32_MIN;
  1807. #else
  1808. r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
  1809. #endif
  1810. }
  1811. #endif
  1812. return simde__m64_from_private(r_);
  1813. #endif
  1814. }
  1815. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1816. #define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
  1817. #endif
  1818. SIMDE_FUNCTION_ATTRIBUTES
  1819. simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
  1820. {
  1821. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1822. return _mm_cvtps_pi8(a);
  1823. #else
  1824. simde__m64_private r_;
  1825. simde__m128_private a_ = simde__m128_to_private(a);
  1826. #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95471)
  1827. /* Clamp the input to [INT8_MIN, INT8_MAX], round, convert to i32, narrow to
  1828. * i16, combine with an all-zero vector of i16 (which will become the upper
  1829. * half), narrow to i8. */
  1830. float32x4_t max =
  1831. vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MAX));
  1832. float32x4_t min =
  1833. vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MIN));
  1834. float32x4_t values =
  1835. vrndnq_f32(vmaxq_f32(vminq_f32(max, a_.neon_f32), min));
  1836. r_.neon_i8 = vmovn_s16(
  1837. vcombine_s16(vmovn_s32(vcvtq_s32_f32(values)), vdup_n_s16(0)));
  1838. #else
  1839. SIMDE_VECTORIZE
  1840. for (size_t i = 0; i < (sizeof(a_.f32) / sizeof(a_.f32[0])); i++) {
  1841. if (a_.f32[i] > HEDLEY_STATIC_CAST(simde_float32, INT8_MAX))
  1842. r_.i8[i] = INT8_MAX;
  1843. else if (a_.f32[i] <
  1844. HEDLEY_STATIC_CAST(simde_float32, INT8_MIN))
  1845. r_.i8[i] = INT8_MIN;
  1846. else
  1847. r_.i8[i] = SIMDE_CONVERT_FTOI(
  1848. int8_t, simde_math_roundf(a_.f32[i]));
  1849. }
  1850. /* Note: the upper half is undefined */
  1851. #endif
  1852. return simde__m64_from_private(r_);
  1853. #endif
  1854. }
  1855. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1856. #define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
  1857. #endif
  1858. SIMDE_FUNCTION_ATTRIBUTES
  1859. simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
  1860. {
  1861. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1862. return _mm_cvtpu16_ps(a);
  1863. #else
  1864. simde__m128_private r_;
  1865. simde__m64_private a_ = simde__m64_to_private(a);
  1866. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1867. r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
  1868. #elif defined(SIMDE_CONVERT_VECTOR_)
  1869. SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16);
  1870. #else
  1871. SIMDE_VECTORIZE
  1872. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1873. r_.f32[i] = (simde_float32)a_.u16[i];
  1874. }
  1875. #endif
  1876. return simde__m128_from_private(r_);
  1877. #endif
  1878. }
  1879. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1880. #define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
  1881. #endif
  1882. SIMDE_FUNCTION_ATTRIBUTES
  1883. simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
  1884. {
  1885. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  1886. return _mm_cvtpu8_ps(a);
  1887. #else
  1888. simde__m128_private r_;
  1889. simde__m64_private a_ = simde__m64_to_private(a);
  1890. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1891. r_.neon_f32 =
  1892. vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
  1893. #else
  1894. SIMDE_VECTORIZE
  1895. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  1896. r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]);
  1897. }
  1898. #endif
  1899. return simde__m128_from_private(r_);
  1900. #endif
  1901. }
  1902. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1903. #define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
  1904. #endif
  1905. SIMDE_FUNCTION_ATTRIBUTES
  1906. simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
  1907. {
  1908. #if defined(SIMDE_X86_SSE_NATIVE)
  1909. return _mm_cvtsi32_ss(a, b);
  1910. #else
  1911. simde__m128_private r_;
  1912. simde__m128_private a_ = simde__m128_to_private(a);
  1913. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1914. r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b),
  1915. a_.neon_f32, 0);
  1916. #else
  1917. r_ = a_;
  1918. r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
  1919. #endif
  1920. return simde__m128_from_private(r_);
  1921. #endif
  1922. }
  1923. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1924. #define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
  1925. #endif
  1926. SIMDE_FUNCTION_ATTRIBUTES
  1927. simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
  1928. {
  1929. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
  1930. #if !defined(__PGI)
  1931. return _mm_cvtsi64_ss(a, b);
  1932. #else
  1933. return _mm_cvtsi64x_ss(a, b);
  1934. #endif
  1935. #else
  1936. simde__m128_private r_;
  1937. simde__m128_private a_ = simde__m128_to_private(a);
  1938. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1939. r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b),
  1940. a_.neon_f32, 0);
  1941. #else
  1942. r_ = a_;
  1943. r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
  1944. #endif
  1945. return simde__m128_from_private(r_);
  1946. #endif
  1947. }
  1948. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1949. #define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
  1950. #endif
  1951. SIMDE_FUNCTION_ATTRIBUTES
  1952. simde_float32 simde_mm_cvtss_f32(simde__m128 a)
  1953. {
  1954. #if defined(SIMDE_X86_SSE_NATIVE)
  1955. return _mm_cvtss_f32(a);
  1956. #else
  1957. simde__m128_private a_ = simde__m128_to_private(a);
  1958. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1959. return vgetq_lane_f32(a_.neon_f32, 0);
  1960. #else
  1961. return a_.f32[0];
  1962. #endif
  1963. #endif
  1964. }
  1965. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1966. #define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
  1967. #endif
  1968. SIMDE_FUNCTION_ATTRIBUTES
  1969. int32_t simde_mm_cvtss_si32(simde__m128 a)
  1970. {
  1971. return simde_mm_cvt_ss2si(a);
  1972. }
  1973. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1974. #define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
  1975. #endif
  1976. SIMDE_FUNCTION_ATTRIBUTES
  1977. int64_t simde_mm_cvtss_si64(simde__m128 a)
  1978. {
  1979. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
  1980. #if !defined(__PGI)
  1981. return _mm_cvtss_si64(a);
  1982. #else
  1983. return _mm_cvtss_si64x(a);
  1984. #endif
  1985. #else
  1986. simde__m128_private a_ = simde__m128_to_private(a);
  1987. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  1988. return SIMDE_CONVERT_FTOI(
  1989. int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0)));
  1990. #else
  1991. return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0]));
  1992. #endif
  1993. #endif
  1994. }
  1995. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  1996. #define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
  1997. #endif
  1998. SIMDE_FUNCTION_ATTRIBUTES
  1999. simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
  2000. {
  2001. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2002. return _mm_cvtt_ps2pi(a);
  2003. #else
  2004. simde__m64_private r_;
  2005. simde__m128_private a_ = simde__m128_to_private(a);
  2006. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
  2007. r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
  2008. #else
  2009. SIMDE_VECTORIZE
  2010. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2011. simde_float32 v = a_.f32[i];
  2012. #if !defined(SIMDE_FAST_CONVERSION_RANGE)
  2013. r_.i32[i] =
  2014. ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
  2015. (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
  2016. ? SIMDE_CONVERT_FTOI(int32_t, v)
  2017. : INT32_MIN;
  2018. #else
  2019. r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
  2020. #endif
  2021. }
  2022. #endif
  2023. return simde__m64_from_private(r_);
  2024. #endif
  2025. }
  2026. #define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
  2027. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2028. #define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
  2029. #define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
  2030. #endif
  2031. SIMDE_FUNCTION_ATTRIBUTES
  2032. int32_t simde_mm_cvtt_ss2si(simde__m128 a)
  2033. {
  2034. #if defined(SIMDE_X86_SSE_NATIVE)
  2035. return _mm_cvtt_ss2si(a);
  2036. #else
  2037. simde__m128_private a_ = simde__m128_to_private(a);
  2038. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
  2039. return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
  2040. #else
  2041. simde_float32 v = a_.f32[0];
  2042. #if !defined(SIMDE_FAST_CONVERSION_RANGE)
  2043. return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
  2044. (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
  2045. ? SIMDE_CONVERT_FTOI(int32_t, v)
  2046. : INT32_MIN;
  2047. #else
  2048. return SIMDE_CONVERT_FTOI(int32_t, v);
  2049. #endif
  2050. #endif
  2051. #endif
  2052. }
  2053. #define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
  2054. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2055. #define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
  2056. #define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
  2057. #endif
  2058. SIMDE_FUNCTION_ATTRIBUTES
  2059. int64_t simde_mm_cvttss_si64(simde__m128 a)
  2060. {
  2061. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
  2062. !defined(_MSC_VER)
  2063. #if defined(__PGI)
  2064. return _mm_cvttss_si64x(a);
  2065. #else
  2066. return _mm_cvttss_si64(a);
  2067. #endif
  2068. #else
  2069. simde__m128_private a_ = simde__m128_to_private(a);
  2070. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2071. return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
  2072. #else
  2073. return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
  2074. #endif
  2075. #endif
  2076. }
  2077. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2078. #define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
  2079. #endif
  2080. SIMDE_FUNCTION_ATTRIBUTES
  2081. simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
  2082. {
  2083. #if defined(SIMDE_X86_SSE_NATIVE)
  2084. return _mm_cmpord_ss(a, b);
  2085. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  2086. return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
  2087. #else
  2088. simde__m128_private r_, a_ = simde__m128_to_private(a);
  2089. #if defined(simde_math_isnanf)
  2090. r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) ||
  2091. simde_math_isnanf(simde_mm_cvtss_f32(b)))
  2092. ? UINT32_C(0)
  2093. : ~UINT32_C(0);
  2094. SIMDE_VECTORIZE
  2095. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2096. r_.u32[i] = a_.u32[i];
  2097. }
  2098. #else
  2099. HEDLEY_UNREACHABLE();
  2100. #endif
  2101. return simde__m128_from_private(r_);
  2102. #endif
  2103. }
  2104. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2105. #define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
  2106. #endif
  2107. SIMDE_FUNCTION_ATTRIBUTES
  2108. simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
  2109. {
  2110. #if defined(SIMDE_X86_SSE_NATIVE)
  2111. return _mm_div_ps(a, b);
  2112. #else
  2113. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2114. b_ = simde__m128_to_private(b);
  2115. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  2116. r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
  2117. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2118. float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
  2119. float32x4_t recip1 =
  2120. vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
  2121. r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
  2122. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2123. r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
  2124. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  2125. r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32);
  2126. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  2127. r_.f32 = a_.f32 / b_.f32;
  2128. #else
  2129. SIMDE_VECTORIZE
  2130. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2131. r_.f32[i] = a_.f32[i] / b_.f32[i];
  2132. }
  2133. #endif
  2134. return simde__m128_from_private(r_);
  2135. #endif
  2136. }
  2137. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2138. #define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
  2139. #endif
  2140. SIMDE_FUNCTION_ATTRIBUTES
  2141. simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
  2142. {
  2143. #if defined(SIMDE_X86_SSE_NATIVE)
  2144. return _mm_div_ss(a, b);
  2145. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  2146. return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
  2147. #else
  2148. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2149. b_ = simde__m128_to_private(b);
  2150. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2151. float32_t value = vgetq_lane_f32(
  2152. simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0);
  2153. r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
  2154. #else
  2155. r_.f32[0] = a_.f32[0] / b_.f32[0];
  2156. SIMDE_VECTORIZE
  2157. for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2158. r_.f32[i] = a_.f32[i];
  2159. }
  2160. #endif
  2161. return simde__m128_from_private(r_);
  2162. #endif
  2163. }
  2164. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2165. #define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
  2166. #endif
  2167. SIMDE_FUNCTION_ATTRIBUTES
  2168. int16_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
  2169. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
  2170. {
  2171. simde__m64_private a_ = simde__m64_to_private(a);
  2172. return a_.i16[imm8];
  2173. }
  2174. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
  2175. !defined(HEDLEY_PGI_VERSION)
  2176. #if defined(SIMDE_BUG_CLANG_44589)
  2177. #define simde_mm_extract_pi16(a, imm8) \
  2178. (HEDLEY_DIAGNOSTIC_PUSH _Pragma( \
  2179. "clang diagnostic ignored \"-Wvector-conversion\"") \
  2180. HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
  2181. HEDLEY_DIAGNOSTIC_POP)
  2182. #else
  2183. #define simde_mm_extract_pi16(a, imm8) \
  2184. HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8))
  2185. #endif
  2186. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2187. #define simde_mm_extract_pi16(a, imm8) \
  2188. vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)
  2189. #endif
  2190. #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
  2191. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2192. #define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8))
  2193. #define _m_pextrw(a, imm8) simde_mm_extract_pi16((a), (imm8))
  2194. #endif
  2195. SIMDE_FUNCTION_ATTRIBUTES
  2196. simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
  2197. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
  2198. {
  2199. simde__m64_private r_, a_ = simde__m64_to_private(a);
  2200. r_.i64[0] = a_.i64[0];
  2201. r_.i16[imm8] = i;
  2202. return simde__m64_from_private(r_);
  2203. }
  2204. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
  2205. !defined(__PGI)
  2206. #if defined(SIMDE_BUG_CLANG_44589)
  2207. #define ssimde_mm_insert_pi16(a, i, imm8) \
  2208. (HEDLEY_DIAGNOSTIC_PUSH _Pragma( \
  2209. "clang diagnostic ignored \"-Wvector-conversion\"")( \
  2210. _mm_insert_pi16((a), (i), (imm8))) HEDLEY_DIAGNOSTIC_POP)
  2211. #else
  2212. #define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
  2213. #endif
  2214. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2215. #define simde_mm_insert_pi16(a, i, imm8) \
  2216. simde__m64_from_neon_i16( \
  2217. vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8)))
  2218. #endif
  2219. #define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
  2220. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2221. #define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
  2222. #define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
  2223. #endif
  2224. SIMDE_FUNCTION_ATTRIBUTES
  2225. simde__m128
  2226. simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
  2227. {
  2228. #if defined(SIMDE_X86_SSE_NATIVE)
  2229. return _mm_load_ps(mem_addr);
  2230. #else
  2231. simde__m128_private r_;
  2232. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2233. r_.neon_f32 = vld1q_f32(mem_addr);
  2234. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  2235. r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
  2236. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  2237. r_.altivec_f32 = vec_ld(0, mem_addr);
  2238. #else
  2239. simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128),
  2240. sizeof(r_));
  2241. #endif
  2242. return simde__m128_from_private(r_);
  2243. #endif
  2244. }
  2245. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2246. #define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
  2247. #endif
  2248. SIMDE_FUNCTION_ATTRIBUTES
  2249. simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
  2250. {
  2251. #if defined(SIMDE_X86_SSE_NATIVE)
  2252. return _mm_load_ps1(mem_addr);
  2253. #else
  2254. simde__m128_private r_;
  2255. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2256. r_.neon_f32 = vld1q_dup_f32(mem_addr);
  2257. #else
  2258. r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
  2259. #endif
  2260. return simde__m128_from_private(r_);
  2261. #endif
  2262. }
  2263. #define simde_mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
  2264. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2265. #define _mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
  2266. #define _mm_load1_ps(mem_addr) simde_mm_load1_ps(mem_addr)
  2267. #endif
  2268. SIMDE_FUNCTION_ATTRIBUTES
  2269. simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
  2270. {
  2271. #if defined(SIMDE_X86_SSE_NATIVE)
  2272. return _mm_load_ss(mem_addr);
  2273. #else
  2274. simde__m128_private r_;
  2275. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2276. r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
  2277. #else
  2278. r_.f32[0] = *mem_addr;
  2279. r_.i32[1] = 0;
  2280. r_.i32[2] = 0;
  2281. r_.i32[3] = 0;
  2282. #endif
  2283. return simde__m128_from_private(r_);
  2284. #endif
  2285. }
  2286. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2287. #define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
  2288. #endif
  2289. SIMDE_FUNCTION_ATTRIBUTES
  2290. simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
  2291. {
  2292. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2293. return _mm_loadh_pi(a,
  2294. HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
  2295. #else
  2296. simde__m128_private r_, a_ = simde__m128_to_private(a);
  2297. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2298. r_.neon_f32 = vcombine_f32(
  2299. vget_low_f32(a_.neon_f32),
  2300. vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)));
  2301. #else
  2302. simde__m64_private b_ =
  2303. *HEDLEY_REINTERPRET_CAST(simde__m64_private const *, mem_addr);
  2304. r_.f32[0] = a_.f32[0];
  2305. r_.f32[1] = a_.f32[1];
  2306. r_.f32[2] = b_.f32[0];
  2307. r_.f32[3] = b_.f32[1];
  2308. #endif
  2309. return simde__m128_from_private(r_);
  2310. #endif
  2311. }
  2312. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2313. #if HEDLEY_HAS_WARNING("-Wold-style-cast")
  2314. #define _mm_loadh_pi(a, mem_addr) \
  2315. simde_mm_loadh_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const *, \
  2316. (mem_addr)))
  2317. #else
  2318. #define _mm_loadh_pi(a, mem_addr) \
  2319. simde_mm_loadh_pi((a), (simde__m64 const *)(mem_addr))
  2320. #endif
  2321. #endif
  2322. /* The SSE documentation says that there are no alignment requirements
  2323. for mem_addr. Unfortunately they used the __m64 type for the argument
  2324. which is supposed to be 8-byte aligned, so some compilers (like clang
  2325. with -Wcast-align) will generate a warning if you try to cast, say,
  2326. a simde_float32* to a simde__m64* for this function.
  2327. I think the choice of argument type is unfortunate, but I do think we
  2328. need to stick to it here. If there is demand I can always add something
  2329. like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
  2330. SIMDE_FUNCTION_ATTRIBUTES
  2331. simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
  2332. {
  2333. #if defined(SIMDE_X86_SSE_NATIVE)
  2334. return _mm_loadl_pi(a,
  2335. HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
  2336. #else
  2337. simde__m128_private r_, a_ = simde__m128_to_private(a);
  2338. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2339. r_.neon_f32 = vcombine_f32(
  2340. vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)),
  2341. vget_high_f32(a_.neon_f32));
  2342. #else
  2343. simde__m64_private b_;
  2344. simde_memcpy(&b_, mem_addr, sizeof(b_));
  2345. r_.i32[0] = b_.i32[0];
  2346. r_.i32[1] = b_.i32[1];
  2347. r_.i32[2] = a_.i32[2];
  2348. r_.i32[3] = a_.i32[3];
  2349. #endif
  2350. return simde__m128_from_private(r_);
  2351. #endif
  2352. }
  2353. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2354. #if HEDLEY_HAS_WARNING("-Wold-style-cast")
  2355. #define _mm_loadl_pi(a, mem_addr) \
  2356. simde_mm_loadl_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const *, \
  2357. (mem_addr)))
  2358. #else
  2359. #define _mm_loadl_pi(a, mem_addr) \
  2360. simde_mm_loadl_pi((a), (simde__m64 const *)(mem_addr))
  2361. #endif
  2362. #endif
  2363. SIMDE_FUNCTION_ATTRIBUTES
  2364. simde__m128
  2365. simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
  2366. {
  2367. #if defined(SIMDE_X86_SSE_NATIVE)
  2368. return _mm_loadr_ps(mem_addr);
  2369. #else
  2370. simde__m128_private r_,
  2371. v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
  2372. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2373. r_.neon_f32 = vrev64q_f32(v_.neon_f32);
  2374. r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
  2375. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
  2376. r_.altivec_f32 = vec_reve(v_.altivec_f32);
  2377. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  2378. r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
  2379. #else
  2380. r_.f32[0] = v_.f32[3];
  2381. r_.f32[1] = v_.f32[2];
  2382. r_.f32[2] = v_.f32[1];
  2383. r_.f32[3] = v_.f32[0];
  2384. #endif
  2385. return simde__m128_from_private(r_);
  2386. #endif
  2387. }
  2388. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2389. #define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
  2390. #endif
  2391. SIMDE_FUNCTION_ATTRIBUTES
  2392. simde__m128
  2393. simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
  2394. {
  2395. #if defined(SIMDE_X86_SSE_NATIVE)
  2396. return _mm_loadu_ps(mem_addr);
  2397. #else
  2398. simde__m128_private r_;
  2399. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2400. r_.neon_f32 =
  2401. vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr));
  2402. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2403. r_.wasm_v128 = wasm_v128_load(mem_addr);
  2404. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
  2405. r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
  2406. #else
  2407. simde_memcpy(&r_, mem_addr, sizeof(r_));
  2408. #endif
  2409. return simde__m128_from_private(r_);
  2410. #endif
  2411. }
  2412. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2413. #define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
  2414. #endif
  2415. SIMDE_FUNCTION_ATTRIBUTES
  2416. void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, int8_t *mem_addr)
  2417. {
  2418. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2419. _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr));
  2420. #else
  2421. simde__m64_private a_ = simde__m64_to_private(a),
  2422. mask_ = simde__m64_to_private(mask);
  2423. SIMDE_VECTORIZE
  2424. for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++)
  2425. if (mask_.i8[i] < 0)
  2426. mem_addr[i] = a_.i8[i];
  2427. #endif
  2428. }
  2429. #define simde_m_maskmovq(a, mask, mem_addr) \
  2430. simde_mm_maskmove_si64(a, mask, mem_addr)
  2431. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2432. #define _mm_maskmove_si64(a, mask, mem_addr) \
  2433. simde_mm_maskmove_si64( \
  2434. (a), (mask), \
  2435. SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
  2436. #define _m_maskmovq(a, mask, mem_addr) \
  2437. simde_mm_maskmove_si64( \
  2438. (a), (mask), \
  2439. SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
  2440. #endif
  2441. SIMDE_FUNCTION_ATTRIBUTES
  2442. simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
  2443. {
  2444. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2445. return _mm_max_pi16(a, b);
  2446. #else
  2447. simde__m64_private r_, a_ = simde__m64_to_private(a),
  2448. b_ = simde__m64_to_private(b);
  2449. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2450. r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
  2451. #else
  2452. SIMDE_VECTORIZE
  2453. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  2454. r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  2455. }
  2456. #endif
  2457. return simde__m64_from_private(r_);
  2458. #endif
  2459. }
  2460. #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
  2461. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2462. #define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
  2463. #define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
  2464. #endif
  2465. SIMDE_FUNCTION_ATTRIBUTES
  2466. simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
  2467. {
  2468. #if defined(SIMDE_X86_SSE_NATIVE)
  2469. return _mm_max_ps(a, b);
  2470. #else
  2471. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2472. b_ = simde__m128_to_private(b);
  2473. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS)
  2474. r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
  2475. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2476. r_.neon_f32 = vbslq_f32(vcgtq_f32(a_.neon_f32, b_.neon_f32),
  2477. a_.neon_f32, b_.neon_f32);
  2478. #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS)
  2479. r_.wasm_v128 = wasm_f32x4_max(a_.wasm_v128, b_.wasm_v128);
  2480. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2481. r_.wasm_v128 =
  2482. wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128,
  2483. wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128));
  2484. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_NANS)
  2485. r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
  2486. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  2487. r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32,
  2488. vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
  2489. #else
  2490. SIMDE_VECTORIZE
  2491. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2492. r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
  2493. }
  2494. #endif
  2495. return simde__m128_from_private(r_);
  2496. #endif
  2497. }
  2498. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2499. #define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
  2500. #endif
  2501. SIMDE_FUNCTION_ATTRIBUTES
  2502. simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
  2503. {
  2504. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2505. return _mm_max_pu8(a, b);
  2506. #else
  2507. simde__m64_private r_, a_ = simde__m64_to_private(a),
  2508. b_ = simde__m64_to_private(b);
  2509. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2510. r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
  2511. #else
  2512. SIMDE_VECTORIZE
  2513. for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
  2514. r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  2515. }
  2516. #endif
  2517. return simde__m64_from_private(r_);
  2518. #endif
  2519. }
  2520. #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
  2521. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2522. #define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
  2523. #define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
  2524. #endif
  2525. SIMDE_FUNCTION_ATTRIBUTES
  2526. simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
  2527. {
  2528. #if defined(SIMDE_X86_SSE_NATIVE)
  2529. return _mm_max_ss(a, b);
  2530. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  2531. return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
  2532. #else
  2533. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2534. b_ = simde__m128_to_private(b);
  2535. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2536. float32_t value = vgetq_lane_f32(maxq_f32(a_.neon_f32, b_.neon_f32), 0);
  2537. r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
  2538. #else
  2539. r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
  2540. r_.f32[1] = a_.f32[1];
  2541. r_.f32[2] = a_.f32[2];
  2542. r_.f32[3] = a_.f32[3];
  2543. #endif
  2544. return simde__m128_from_private(r_);
  2545. #endif
  2546. }
  2547. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2548. #define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
  2549. #endif
  2550. SIMDE_FUNCTION_ATTRIBUTES
  2551. simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
  2552. {
  2553. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2554. return _mm_min_pi16(a, b);
  2555. #else
  2556. simde__m64_private r_, a_ = simde__m64_to_private(a),
  2557. b_ = simde__m64_to_private(b);
  2558. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2559. r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
  2560. #else
  2561. SIMDE_VECTORIZE
  2562. for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
  2563. r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
  2564. }
  2565. #endif
  2566. return simde__m64_from_private(r_);
  2567. #endif
  2568. }
  2569. #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
  2570. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2571. #define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
  2572. #define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
  2573. #endif
  2574. SIMDE_FUNCTION_ATTRIBUTES
  2575. simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
  2576. {
  2577. #if defined(SIMDE_X86_SSE_NATIVE)
  2578. return _mm_min_ps(a, b);
  2579. #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2580. return simde__m128_from_neon_f32(vminq_f32(simde__m128_to_neon_f32(a),
  2581. simde__m128_to_neon_f32(b)));
  2582. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2583. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2584. b_ = simde__m128_to_private(b);
  2585. #if defined(SIMDE_FAST_NANS)
  2586. r_.wasm_v128 = wasm_f32x4_min(a_.wasm_v128, b_.wasm_v128);
  2587. #else
  2588. r_.wasm_v128 =
  2589. wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128,
  2590. wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128));
  2591. #endif
  2592. return simde__m128_from_private(r_);
  2593. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  2594. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2595. b_ = simde__m128_to_private(b);
  2596. #if defined(SIMDE_FAST_NANS)
  2597. r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
  2598. #else
  2599. r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32,
  2600. vec_cmpgt(b_.altivec_f32, a_.altivec_f32));
  2601. #endif
  2602. return simde__m128_from_private(r_);
  2603. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  2604. simde__m128 mask = simde_mm_cmplt_ps(a, b);
  2605. return simde_mm_or_ps(simde_mm_and_ps(mask, a),
  2606. simde_mm_andnot_ps(mask, b));
  2607. #else
  2608. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2609. b_ = simde__m128_to_private(b);
  2610. SIMDE_VECTORIZE
  2611. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2612. r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
  2613. }
  2614. return simde__m128_from_private(r_);
  2615. #endif
  2616. }
  2617. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2618. #define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
  2619. #endif
  2620. SIMDE_FUNCTION_ATTRIBUTES
  2621. simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
  2622. {
  2623. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2624. return _mm_min_pu8(a, b);
  2625. #else
  2626. simde__m64_private r_, a_ = simde__m64_to_private(a),
  2627. b_ = simde__m64_to_private(b);
  2628. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2629. r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
  2630. #else
  2631. SIMDE_VECTORIZE
  2632. for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
  2633. r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
  2634. }
  2635. #endif
  2636. return simde__m64_from_private(r_);
  2637. #endif
  2638. }
  2639. #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
  2640. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2641. #define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
  2642. #define _m_pminub(a, b) simde_mm_min_pu8(a, b)
  2643. #endif
  2644. SIMDE_FUNCTION_ATTRIBUTES
  2645. simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
  2646. {
  2647. #if defined(SIMDE_X86_SSE_NATIVE)
  2648. return _mm_min_ss(a, b);
  2649. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  2650. return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
  2651. #else
  2652. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2653. b_ = simde__m128_to_private(b);
  2654. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2655. float32_t value =
  2656. vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0);
  2657. r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
  2658. #else
  2659. r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
  2660. r_.f32[1] = a_.f32[1];
  2661. r_.f32[2] = a_.f32[2];
  2662. r_.f32[3] = a_.f32[3];
  2663. #endif
  2664. return simde__m128_from_private(r_);
  2665. #endif
  2666. }
  2667. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2668. #define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
  2669. #endif
  2670. SIMDE_FUNCTION_ATTRIBUTES
  2671. simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
  2672. {
  2673. #if defined(SIMDE_X86_SSE_NATIVE)
  2674. return _mm_movehl_ps(a, b);
  2675. #else
  2676. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2677. b_ = simde__m128_to_private(b);
  2678. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2679. float32x2_t a32 = vget_high_f32(a_.neon_f32);
  2680. float32x2_t b32 = vget_high_f32(b_.neon_f32);
  2681. r_.neon_f32 = vcombine_f32(b32, a32);
  2682. #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  2683. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  2684. SIMDE_POWER_ALTIVEC_VECTOR(float),
  2685. vec_mergel(b_.altivec_i64, a_.altivec_i64));
  2686. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  2687. r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
  2688. #else
  2689. r_.f32[0] = b_.f32[2];
  2690. r_.f32[1] = b_.f32[3];
  2691. r_.f32[2] = a_.f32[2];
  2692. r_.f32[3] = a_.f32[3];
  2693. #endif
  2694. return simde__m128_from_private(r_);
  2695. #endif
  2696. }
  2697. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2698. #define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
  2699. #endif
  2700. SIMDE_FUNCTION_ATTRIBUTES
  2701. simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
  2702. {
  2703. #if defined(SIMDE_X86_SSE_NATIVE)
  2704. return _mm_movelh_ps(a, b);
  2705. #else
  2706. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2707. b_ = simde__m128_to_private(b);
  2708. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2709. float32x2_t a10 = vget_low_f32(a_.neon_f32);
  2710. float32x2_t b10 = vget_low_f32(b_.neon_f32);
  2711. r_.neon_f32 = vcombine_f32(a10, b10);
  2712. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  2713. r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
  2714. #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  2715. r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
  2716. SIMDE_POWER_ALTIVEC_VECTOR(float),
  2717. vec_mergeh(a_.altivec_i64, b_.altivec_i64));
  2718. #else
  2719. r_.f32[0] = a_.f32[0];
  2720. r_.f32[1] = a_.f32[1];
  2721. r_.f32[2] = b_.f32[0];
  2722. r_.f32[3] = b_.f32[1];
  2723. #endif
  2724. return simde__m128_from_private(r_);
  2725. #endif
  2726. }
  2727. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2728. #define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
  2729. #endif
  2730. SIMDE_FUNCTION_ATTRIBUTES
  2731. int simde_mm_movemask_pi8(simde__m64 a)
  2732. {
  2733. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2734. return _mm_movemask_pi8(a);
  2735. #else
  2736. simde__m64_private a_ = simde__m64_to_private(a);
  2737. int r = 0;
  2738. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  2739. uint8x8_t input = a_.neon_u8;
  2740. const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
  2741. const uint8x8_t mask_and = vdup_n_u8(0x80);
  2742. const int8x8_t mask_shift = vld1_s8(xr);
  2743. const uint8x8_t mask_result =
  2744. vshl_u8(vand_u8(input, mask_and), mask_shift);
  2745. uint8x8_t lo = mask_result;
  2746. r = vaddv_u8(lo);
  2747. #else
  2748. const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
  2749. SIMDE_VECTORIZE_REDUCTION(| : r)
  2750. for (size_t i = 0; i < nmemb; i++) {
  2751. r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
  2752. }
  2753. #endif
  2754. return r;
  2755. #endif
  2756. }
  2757. #define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
  2758. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2759. #define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
  2760. #define _m_pmovmskb(a) simde_mm_movemask_pi8(a)
  2761. #endif
  2762. SIMDE_FUNCTION_ATTRIBUTES
  2763. int simde_mm_movemask_ps(simde__m128 a)
  2764. {
  2765. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2766. return _mm_movemask_ps(a);
  2767. #else
  2768. int r = 0;
  2769. simde__m128_private a_ = simde__m128_to_private(a);
  2770. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  2771. static const int32_t shift_amount[] = {0, 1, 2, 3};
  2772. const int32x4_t shift = vld1q_s32(shift_amount);
  2773. uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31);
  2774. return HEDLEY_STATIC_CAST(int, vaddvq_u32(vshlq_u32(tmp, shift)));
  2775. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2776. // Shift out everything but the sign bits with a 32-bit unsigned shift right.
  2777. uint64x2_t high_bits =
  2778. vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31));
  2779. // Merge the two pairs together with a 64-bit unsigned shift right + add.
  2780. uint8x16_t paired =
  2781. vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
  2782. // Extract the result.
  2783. return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
  2784. #else
  2785. SIMDE_VECTORIZE_REDUCTION(| : r)
  2786. for (size_t i = 0; i < sizeof(a_.u32) / sizeof(a_.u32[0]); i++) {
  2787. r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
  2788. }
  2789. #endif
  2790. return r;
  2791. #endif
  2792. }
  2793. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2794. #define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
  2795. #endif
  2796. SIMDE_FUNCTION_ATTRIBUTES
  2797. simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
  2798. {
  2799. #if defined(SIMDE_X86_SSE_NATIVE)
  2800. return _mm_mul_ps(a, b);
  2801. #else
  2802. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2803. b_ = simde__m128_to_private(b);
  2804. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2805. r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
  2806. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2807. r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
  2808. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  2809. r_.f32 = a_.f32 * b_.f32;
  2810. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  2811. r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32);
  2812. #else
  2813. SIMDE_VECTORIZE
  2814. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2815. r_.f32[i] = a_.f32[i] * b_.f32[i];
  2816. }
  2817. #endif
  2818. return simde__m128_from_private(r_);
  2819. #endif
  2820. }
  2821. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2822. #define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
  2823. #endif
  2824. SIMDE_FUNCTION_ATTRIBUTES
  2825. simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
  2826. {
  2827. #if defined(SIMDE_X86_SSE_NATIVE)
  2828. return _mm_mul_ss(a, b);
  2829. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  2830. return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
  2831. #else
  2832. simde__m128_private r_, a_ = simde__m128_to_private(a),
  2833. b_ = simde__m128_to_private(b);
  2834. r_.f32[0] = a_.f32[0] * b_.f32[0];
  2835. r_.f32[1] = a_.f32[1];
  2836. r_.f32[2] = a_.f32[2];
  2837. r_.f32[3] = a_.f32[3];
  2838. return simde__m128_from_private(r_);
  2839. #endif
  2840. }
  2841. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2842. #define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
  2843. #endif
  2844. SIMDE_FUNCTION_ATTRIBUTES
  2845. simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
  2846. {
  2847. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  2848. return _mm_mulhi_pu16(a, b);
  2849. #else
  2850. simde__m64_private r_, a_ = simde__m64_to_private(a),
  2851. b_ = simde__m64_to_private(b);
  2852. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2853. const uint32x4_t t1 = vmull_u16(a_.neon_u16, b_.neon_u16);
  2854. const uint32x4_t t2 = vshrq_n_u32(t1, 16);
  2855. const uint16x4_t t3 = vmovn_u32(t2);
  2856. r_.neon_u16 = t3;
  2857. #else
  2858. SIMDE_VECTORIZE
  2859. for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
  2860. r_.u16[i] = HEDLEY_STATIC_CAST(
  2861. uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
  2862. HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >>
  2863. UINT32_C(16)));
  2864. }
  2865. #endif
  2866. return simde__m64_from_private(r_);
  2867. #endif
  2868. }
  2869. #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
  2870. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2871. #define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
  2872. #define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
  2873. #endif
  2874. #if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION)
  2875. #define SIMDE_MM_HINT_NTA HEDLEY_STATIC_CAST(enum _mm_hint, 0)
  2876. #define SIMDE_MM_HINT_T0 HEDLEY_STATIC_CAST(enum _mm_hint, 1)
  2877. #define SIMDE_MM_HINT_T1 HEDLEY_STATIC_CAST(enum _mm_hint, 2)
  2878. #define SIMDE_MM_HINT_T2 HEDLEY_STATIC_CAST(enum _mm_hint, 3)
  2879. #define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4)
  2880. #define SIMDE_MM_HINT_ET0 HEDLEY_STATIC_CAST(enum _mm_hint, 5)
  2881. #define SIMDE_MM_HINT_ET1 HEDLEY_STATIC_CAST(enum _mm_hint, 6)
  2882. #define SIMDE_MM_HINT_ET2 HEDLEY_STATIC_CAST(enum _mm_hint, 7)
  2883. #else
  2884. #define SIMDE_MM_HINT_NTA 0
  2885. #define SIMDE_MM_HINT_T0 1
  2886. #define SIMDE_MM_HINT_T1 2
  2887. #define SIMDE_MM_HINT_T2 3
  2888. #define SIMDE_MM_HINT_ENTA 4
  2889. #define SIMDE_MM_HINT_ET0 5
  2890. #define SIMDE_MM_HINT_ET1 6
  2891. #define SIMDE_MM_HINT_ET2 7
  2892. #endif
  2893. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2894. HEDLEY_DIAGNOSTIC_PUSH
  2895. #if HEDLEY_HAS_WARNING("-Wreserved-id-macro")
  2896. _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
  2897. #endif
  2898. #undef _MM_HINT_NTA
  2899. #define _MM_HINT_NTA SIMDE_MM_HINT_NTA
  2900. #undef _MM_HINT_T0
  2901. #define _MM_HINT_T0 SIMDE_MM_HINT_T0
  2902. #undef _MM_HINT_T1
  2903. #define _MM_HINT_T1 SIMDE_MM_HINT_T1
  2904. #undef _MM_HINT_T2
  2905. #define _MM_HINT_T2 SIMDE_MM_HINT_T2
  2906. #undef _MM_HINT_ETNA
  2907. #define _MM_HINT_ETNA SIMDE_MM_HINT_ETNA
  2908. #undef _MM_HINT_ET0
  2909. #define _MM_HINT_ET0 SIMDE_MM_HINT_ET0
  2910. #undef _MM_HINT_ET1
  2911. #define _MM_HINT_ET1 SIMDE_MM_HINT_ET1
  2912. #undef _MM_HINT_ET1
  2913. #define _MM_HINT_ET2 SIMDE_MM_HINT_ET2
  2914. HEDLEY_DIAGNOSTIC_POP
  2915. #endif
  2916. SIMDE_FUNCTION_ATTRIBUTES void simde_mm_prefetch(char const *p, int i)
  2917. {
  2918. #if defined(HEDLEY_GCC_VERSION)
  2919. __builtin_prefetch(p);
  2920. #else
  2921. (void)p;
  2922. #endif
  2923. (void)i;
  2924. }
  2925. #if defined(SIMDE_X86_SSE_NATIVE)
  2926. #if defined(__clang__) && \
  2927. !SIMDE_DETECT_CLANG_VERSION_CHECK( \
  2928. 10, 0, 0) /* https://reviews.llvm.org/D71718 */
  2929. #define simde_mm_prefetch(p, i) \
  2930. (__extension__({ \
  2931. HEDLEY_DIAGNOSTIC_PUSH \
  2932. HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
  2933. _mm_prefetch((p), (i)); \
  2934. HEDLEY_DIAGNOSTIC_POP \
  2935. }))
  2936. #else
  2937. #define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
  2938. #endif
  2939. #endif
  2940. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  2941. #define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
  2942. #endif
  2943. SIMDE_FUNCTION_ATTRIBUTES
  2944. simde__m128 simde_x_mm_negate_ps(simde__m128 a)
  2945. {
  2946. #if defined(SIMDE_X86_SSE_NATIVE)
  2947. return simde_mm_xor_ps(a, _mm_set1_ps(SIMDE_FLOAT32_C(-0.0)));
  2948. #else
  2949. simde__m128_private r_, a_ = simde__m128_to_private(a);
  2950. #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
  2951. (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8, 1, 0))
  2952. r_.altivec_f32 = vec_neg(a_.altivec_f32);
  2953. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2954. r_.neon_f32 = vnegq_f32(a_.neon_f32);
  2955. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2956. r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128);
  2957. #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
  2958. r_.altivec_f32 = vec_neg(a_.altivec_f32);
  2959. #elif defined(SIMDE_VECTOR_NEGATE)
  2960. r_.f32 = -a_.f32;
  2961. #else
  2962. SIMDE_VECTORIZE
  2963. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2964. r_.f32[i] = -a_.f32[i];
  2965. }
  2966. #endif
  2967. return simde__m128_from_private(r_);
  2968. #endif
  2969. }
  2970. SIMDE_FUNCTION_ATTRIBUTES
  2971. simde__m128 simde_mm_rcp_ps(simde__m128 a)
  2972. {
  2973. #if defined(SIMDE_X86_SSE_NATIVE)
  2974. return _mm_rcp_ps(a);
  2975. #else
  2976. simde__m128_private r_, a_ = simde__m128_to_private(a);
  2977. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  2978. float32x4_t recip = vrecpeq_f32(a_.neon_f32);
  2979. #if SIMDE_ACCURACY_PREFERENCE > 0
  2980. for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE; ++i) {
  2981. recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
  2982. }
  2983. #endif
  2984. r_.neon_f32 = recip;
  2985. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  2986. r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128);
  2987. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  2988. r_.altivec_f32 = vec_re(a_.altivec_f32);
  2989. #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
  2990. r_.f32 = 1.0f / a_.f32;
  2991. #elif defined(SIMDE_IEEE754_STORAGE)
  2992. /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */
  2993. SIMDE_VECTORIZE
  2994. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  2995. int32_t ix;
  2996. simde_float32 fx = a_.f32[i];
  2997. simde_memcpy(&ix, &fx, sizeof(ix));
  2998. int32_t x = INT32_C(0x7EF311C3) - ix;
  2999. simde_float32 temp;
  3000. simde_memcpy(&temp, &x, sizeof(temp));
  3001. r_.f32[i] = temp * (SIMDE_FLOAT32_C(2.0) - temp * fx);
  3002. }
  3003. #else
  3004. SIMDE_VECTORIZE
  3005. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  3006. r_.f32[i] = 1.0f / a_.f32[i];
  3007. }
  3008. #endif
  3009. return simde__m128_from_private(r_);
  3010. #endif
  3011. }
  3012. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3013. #define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
  3014. #endif
  3015. SIMDE_FUNCTION_ATTRIBUTES
  3016. simde__m128 simde_mm_rcp_ss(simde__m128 a)
  3017. {
  3018. #if defined(SIMDE_X86_SSE_NATIVE)
  3019. return _mm_rcp_ss(a);
  3020. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  3021. return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
  3022. #else
  3023. simde__m128_private r_, a_ = simde__m128_to_private(a);
  3024. r_.f32[0] = 1.0f / a_.f32[0];
  3025. r_.f32[1] = a_.f32[1];
  3026. r_.f32[2] = a_.f32[2];
  3027. r_.f32[3] = a_.f32[3];
  3028. return simde__m128_from_private(r_);
  3029. #endif
  3030. }
  3031. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3032. #define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
  3033. #endif
  3034. SIMDE_FUNCTION_ATTRIBUTES
  3035. simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
  3036. {
  3037. #if defined(SIMDE_X86_SSE_NATIVE)
  3038. return _mm_rsqrt_ps(a);
  3039. #else
  3040. simde__m128_private r_, a_ = simde__m128_to_private(a);
  3041. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3042. r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
  3043. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3044. r_.altivec_f32 = vec_rsqrte(a_.altivec_f32);
  3045. #elif defined(SIMDE_IEEE754_STORAGE)
  3046. /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
  3047. Pages 100 - 103 */
  3048. SIMDE_VECTORIZE
  3049. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  3050. #if SIMDE_ACCURACY_PREFERENCE <= 0
  3051. r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1);
  3052. #else
  3053. simde_float32 x = a_.f32[i];
  3054. simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
  3055. int32_t ix;
  3056. simde_memcpy(&ix, &x, sizeof(ix));
  3057. #if SIMDE_ACCURACY_PREFERENCE == 1
  3058. ix = INT32_C(0x5F375A82) - (ix >> 1);
  3059. #else
  3060. ix = INT32_C(0x5F37599E) - (ix >> 1);
  3061. #endif
  3062. simde_memcpy(&x, &ix, sizeof(x));
  3063. #if SIMDE_ACCURACY_PREFERENCE >= 2
  3064. x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
  3065. #endif
  3066. x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
  3067. r_.f32[i] = x;
  3068. #endif
  3069. }
  3070. #elif defined(simde_math_sqrtf)
  3071. SIMDE_VECTORIZE
  3072. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  3073. r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
  3074. }
  3075. #else
  3076. HEDLEY_UNREACHABLE();
  3077. #endif
  3078. return simde__m128_from_private(r_);
  3079. #endif
  3080. }
  3081. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3082. #define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
  3083. #endif
  3084. SIMDE_FUNCTION_ATTRIBUTES
  3085. simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
  3086. {
  3087. #if defined(SIMDE_X86_SSE_NATIVE)
  3088. return _mm_rsqrt_ss(a);
  3089. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  3090. return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
  3091. #else
  3092. simde__m128_private r_, a_ = simde__m128_to_private(a);
  3093. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3094. r_.neon_f32 =
  3095. vsetq_lane_f32(vgetq_lane_f32(simde_mm_rsqrt_ps(a).neon_f32, 0),
  3096. a_.neon_f32, 0);
  3097. #elif defined(SIMDE_IEEE754_STORAGE)
  3098. {
  3099. #if SIMDE_ACCURACY_PREFERENCE <= 0
  3100. r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1);
  3101. #else
  3102. simde_float32 x = a_.f32[0];
  3103. simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
  3104. int32_t ix;
  3105. simde_memcpy(&ix, &x, sizeof(ix));
  3106. #if SIMDE_ACCURACY_PREFERENCE == 1
  3107. ix = INT32_C(0x5F375A82) - (ix >> 1);
  3108. #else
  3109. ix = INT32_C(0x5F37599E) - (ix >> 1);
  3110. #endif
  3111. simde_memcpy(&x, &ix, sizeof(x));
  3112. #if SIMDE_ACCURACY_PREFERENCE >= 2
  3113. x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
  3114. #endif
  3115. x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
  3116. r_.f32[0] = x;
  3117. #endif
  3118. }
  3119. r_.f32[1] = a_.f32[1];
  3120. r_.f32[2] = a_.f32[2];
  3121. r_.f32[3] = a_.f32[3];
  3122. #elif defined(simde_math_sqrtf)
  3123. r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]);
  3124. r_.f32[1] = a_.f32[1];
  3125. r_.f32[2] = a_.f32[2];
  3126. r_.f32[3] = a_.f32[3];
  3127. #else
  3128. HEDLEY_UNREACHABLE();
  3129. #endif
  3130. return simde__m128_from_private(r_);
  3131. #endif
  3132. }
  3133. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3134. #define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
  3135. #endif
  3136. SIMDE_FUNCTION_ATTRIBUTES
  3137. simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
  3138. {
  3139. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  3140. return _mm_sad_pu8(a, b);
  3141. #else
  3142. simde__m64_private r_, a_ = simde__m64_to_private(a),
  3143. b_ = simde__m64_to_private(b);
  3144. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3145. uint16x4_t t = vpaddl_u8(vabd_u8(a_.neon_u8, b_.neon_u8));
  3146. uint16_t r0 = t[0] + t[1] + t[2] + t[3];
  3147. r_.neon_u16 = vset_lane_u16(r0, vdup_n_u16(0), 0);
  3148. #else
  3149. uint16_t sum = 0;
  3150. #if defined(SIMDE_HAVE_STDLIB_H)
  3151. SIMDE_VECTORIZE_REDUCTION(+ : sum)
  3152. for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
  3153. sum += HEDLEY_STATIC_CAST(uint8_t, abs(a_.u8[i] - b_.u8[i]));
  3154. }
  3155. r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum);
  3156. r_.i16[1] = 0;
  3157. r_.i16[2] = 0;
  3158. r_.i16[3] = 0;
  3159. #else
  3160. HEDLEY_UNREACHABLE();
  3161. #endif
  3162. #endif
  3163. return simde__m64_from_private(r_);
  3164. #endif
  3165. }
  3166. #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
  3167. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3168. #define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
  3169. #define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
  3170. #endif
  3171. SIMDE_FUNCTION_ATTRIBUTES
  3172. simde__m128 simde_mm_set_ss(simde_float32 a)
  3173. {
  3174. #if defined(SIMDE_X86_SSE_NATIVE)
  3175. return _mm_set_ss(a);
  3176. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3177. return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
  3178. #else
  3179. return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0),
  3180. SIMDE_FLOAT32_C(0.0), a);
  3181. #endif
  3182. }
  3183. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3184. #define _mm_set_ss(a) simde_mm_set_ss(a)
  3185. #endif
  3186. SIMDE_FUNCTION_ATTRIBUTES
  3187. simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2,
  3188. simde_float32 e1, simde_float32 e0)
  3189. {
  3190. #if defined(SIMDE_X86_SSE_NATIVE)
  3191. return _mm_setr_ps(e3, e2, e1, e0);
  3192. #else
  3193. return simde_mm_set_ps(e0, e1, e2, e3);
  3194. #endif
  3195. }
  3196. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3197. #define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
  3198. #endif
  3199. SIMDE_FUNCTION_ATTRIBUTES
  3200. simde__m128 simde_mm_setzero_ps(void)
  3201. {
  3202. #if defined(SIMDE_X86_SSE_NATIVE)
  3203. return _mm_setzero_ps();
  3204. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3205. return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
  3206. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3207. return vec_splats(SIMDE_FLOAT32_C(0.0));
  3208. #else
  3209. simde__m128 r;
  3210. simde_memset(&r, 0, sizeof(r));
  3211. return r;
  3212. #endif
  3213. }
  3214. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3215. #define _mm_setzero_ps() simde_mm_setzero_ps()
  3216. #endif
  3217. #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  3218. HEDLEY_DIAGNOSTIC_PUSH
  3219. SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
  3220. #endif
  3221. SIMDE_FUNCTION_ATTRIBUTES
  3222. simde__m128 simde_mm_undefined_ps(void)
  3223. {
  3224. simde__m128_private r_;
  3225. #if defined(SIMDE_HAVE_UNDEFINED128)
  3226. r_.n = _mm_undefined_ps();
  3227. #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  3228. r_ = simde__m128_to_private(simde_mm_setzero_ps());
  3229. #endif
  3230. return simde__m128_from_private(r_);
  3231. }
  3232. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3233. #define _mm_undefined_ps() simde_mm_undefined_ps()
  3234. #endif
  3235. #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
  3236. HEDLEY_DIAGNOSTIC_POP
  3237. #endif
  3238. SIMDE_FUNCTION_ATTRIBUTES
  3239. simde__m128 simde_x_mm_setone_ps(void)
  3240. {
  3241. simde__m128 t = simde_mm_setzero_ps();
  3242. return simde_mm_cmpeq_ps(t, t);
  3243. }
  3244. SIMDE_FUNCTION_ATTRIBUTES
  3245. void simde_mm_sfence(void)
  3246. {
  3247. /* TODO: Use Hedley. */
  3248. #if defined(SIMDE_X86_SSE_NATIVE)
  3249. _mm_sfence();
  3250. #elif defined(__GNUC__) && \
  3251. ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
  3252. __atomic_thread_fence(__ATOMIC_SEQ_CST);
  3253. #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
  3254. (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
  3255. #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
  3256. __atomic_thread_fence(__ATOMIC_SEQ_CST);
  3257. #else
  3258. atomic_thread_fence(memory_order_seq_cst);
  3259. #endif
  3260. #elif defined(_MSC_VER)
  3261. MemoryBarrier();
  3262. #elif HEDLEY_HAS_EXTENSION(c_atomic)
  3263. __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
  3264. #elif defined(__GNUC__) && \
  3265. ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
  3266. __sync_synchronize();
  3267. #elif defined(_OPENMP)
  3268. #pragma omp critical(simde_mm_sfence_)
  3269. {
  3270. }
  3271. #endif
  3272. }
  3273. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3274. #define _mm_sfence() simde_mm_sfence()
  3275. #endif
  3276. #define SIMDE_MM_SHUFFLE(z, y, x, w) \
  3277. (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
  3278. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3279. #define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
  3280. #endif
  3281. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
  3282. !defined(__PGI)
  3283. #define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
  3284. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3285. #define simde_mm_shuffle_pi16(a, imm8) \
  3286. (__extension__({ \
  3287. const simde__m64_private simde__tmp_a_ = \
  3288. simde__m64_to_private(a); \
  3289. simde__m64_from_private((simde__m64_private){ \
  3290. .i16 = SIMDE_SHUFFLE_VECTOR_( \
  3291. 16, 8, (simde__tmp_a_).i16, \
  3292. (simde__tmp_a_).i16, (((imm8)) & 3), \
  3293. (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
  3294. (((imm8) >> 6) & 3))}); \
  3295. }))
  3296. #else
  3297. SIMDE_FUNCTION_ATTRIBUTES
  3298. simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
  3299. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
  3300. {
  3301. simde__m64_private r_;
  3302. simde__m64_private a_ = simde__m64_to_private(a);
  3303. for (size_t i = 0; i < sizeof(r_.i16) / sizeof(r_.i16[0]); i++) {
  3304. r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
  3305. }
  3306. HEDLEY_DIAGNOSTIC_PUSH
  3307. #if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
  3308. #pragma clang diagnostic ignored "-Wconditional-uninitialized"
  3309. #endif
  3310. return simde__m64_from_private(r_);
  3311. HEDLEY_DIAGNOSTIC_POP
  3312. }
  3313. #endif
  3314. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
  3315. !defined(__PGI)
  3316. #define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
  3317. #else
  3318. #define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
  3319. #endif
  3320. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3321. #define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
  3322. #define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
  3323. #endif
  3324. #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
  3325. #define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
  3326. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3327. #define simde_mm_shuffle_ps(a, b, imm8) \
  3328. __extension__({ \
  3329. float32x4_t ret; \
  3330. ret = vmovq_n_f32(vgetq_lane_f32(a, (imm8) & (0x3))); \
  3331. ret = vsetq_lane_f32(vgetq_lane_f32(a, ((imm8) >> 2) & 0x3), \
  3332. ret, 1); \
  3333. ret = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 4) & 0x3), \
  3334. ret, 2); \
  3335. ret = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 6) & 0x3), \
  3336. ret, 3); \
  3337. })
  3338. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3339. #define simde_mm_shuffle_ps(a, b, imm8) \
  3340. (__extension__({ \
  3341. simde__m128_from_private((simde__m128_private){ \
  3342. .f32 = SIMDE_SHUFFLE_VECTOR_( \
  3343. 32, 16, simde__m128_to_private(a).f32, \
  3344. simde__m128_to_private(b).f32, (((imm8)) & 3), \
  3345. (((imm8) >> 2) & 3), (((imm8) >> 4) & 3) + 4, \
  3346. (((imm8) >> 6) & 3) + 4)}); \
  3347. }))
  3348. #else
  3349. SIMDE_FUNCTION_ATTRIBUTES
  3350. simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
  3351. SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
  3352. {
  3353. simde__m128_private r_, a_ = simde__m128_to_private(a),
  3354. b_ = simde__m128_to_private(b);
  3355. r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
  3356. r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
  3357. r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
  3358. r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
  3359. return simde__m128_from_private(r_);
  3360. }
  3361. #endif
  3362. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3363. #define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
  3364. #endif
  3365. SIMDE_FUNCTION_ATTRIBUTES
  3366. simde__m128 simde_mm_sqrt_ps(simde__m128 a)
  3367. {
  3368. #if defined(SIMDE_X86_SSE_NATIVE)
  3369. return _mm_sqrt_ps(a);
  3370. #else
  3371. simde__m128_private r_, a_ = simde__m128_to_private(a);
  3372. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  3373. r_.neon_f32 = vsqrtq_f32(a_.neon_f32);
  3374. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3375. float32x4_t est = vrsqrteq_f32(a_.neon_f32);
  3376. for (int i = 0; i <= SIMDE_ACCURACY_PREFERENCE; i++) {
  3377. est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a_.neon_f32, est), est),
  3378. est);
  3379. }
  3380. r_.neon_f32 = vmulq_f32(a_.neon_f32, est);
  3381. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  3382. r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128);
  3383. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  3384. r_.altivec_f32 = vec_sqrt(a_.altivec_f32);
  3385. #elif defined(simde_math_sqrt)
  3386. SIMDE_VECTORIZE
  3387. for (size_t i = 0; i < sizeof(r_.f32) / sizeof(r_.f32[0]); i++) {
  3388. r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
  3389. }
  3390. #else
  3391. HEDLEY_UNREACHABLE();
  3392. #endif
  3393. return simde__m128_from_private(r_);
  3394. #endif
  3395. }
  3396. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3397. #define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
  3398. #endif
  3399. SIMDE_FUNCTION_ATTRIBUTES
  3400. simde__m128 simde_mm_sqrt_ss(simde__m128 a)
  3401. {
  3402. #if defined(SIMDE_X86_SSE_NATIVE)
  3403. return _mm_sqrt_ss(a);
  3404. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  3405. return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
  3406. #else
  3407. simde__m128_private r_, a_ = simde__m128_to_private(a);
  3408. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3409. float32_t value = vgetq_lane_f32(
  3410. simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0);
  3411. r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
  3412. #elif defined(simde_math_sqrtf)
  3413. r_.f32[0] = simde_math_sqrtf(a_.f32[0]);
  3414. r_.f32[1] = a_.f32[1];
  3415. r_.f32[2] = a_.f32[2];
  3416. r_.f32[3] = a_.f32[3];
  3417. #else
  3418. HEDLEY_UNREACHABLE();
  3419. #endif
  3420. return simde__m128_from_private(r_);
  3421. #endif
  3422. }
  3423. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3424. #define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
  3425. #endif
  3426. SIMDE_FUNCTION_ATTRIBUTES
  3427. void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
  3428. {
  3429. #if defined(SIMDE_X86_SSE_NATIVE)
  3430. _mm_store_ps(mem_addr, a);
  3431. #else
  3432. simde__m128_private a_ = simde__m128_to_private(a);
  3433. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3434. vst1q_f32(mem_addr, a_.neon_f32);
  3435. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3436. vec_st(a_.altivec_f32, 0, mem_addr);
  3437. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  3438. wasm_v128_store(mem_addr, a_.wasm_v128);
  3439. #else
  3440. simde_memcpy(mem_addr, &a_, sizeof(a));
  3441. #endif
  3442. #endif
  3443. }
  3444. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3445. #define _mm_store_ps(mem_addr, a) \
  3446. simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
  3447. float *, simde_float32 *, mem_addr), \
  3448. (a))
  3449. #endif
  3450. SIMDE_FUNCTION_ATTRIBUTES
  3451. void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
  3452. {
  3453. simde_float32 *mem_addr_ =
  3454. SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128);
  3455. #if defined(SIMDE_X86_SSE_NATIVE)
  3456. _mm_store_ps1(mem_addr_, a);
  3457. #else
  3458. simde__m128_private a_ = simde__m128_to_private(a);
  3459. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3460. vst1q_f32(mem_addr_, vdupq_lane_f32(vget_low_f32(a_.neon_f32), 0));
  3461. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  3462. wasm_v128_store(mem_addr_,
  3463. wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0,
  3464. 0));
  3465. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3466. vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_);
  3467. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3468. simde__m128_private tmp_;
  3469. tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0);
  3470. simde_mm_store_ps(mem_addr_, tmp_.f32);
  3471. #else
  3472. SIMDE_VECTORIZE_ALIGNED(mem_addr_ : 16)
  3473. for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
  3474. mem_addr_[i] = a_.f32[0];
  3475. }
  3476. #endif
  3477. #endif
  3478. }
  3479. #define simde_mm_store_ps1(mem_addr, a) simde_mm_store1_ps(mem_addr, a)
  3480. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3481. #define _mm_store_ps1(mem_addr, a) \
  3482. simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
  3483. float *, simde_float32 *, mem_addr), \
  3484. (a))
  3485. #define _mm_store1_ps(mem_addr, a) \
  3486. simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
  3487. float *, simde_float32 *, mem_addr), \
  3488. (a))
  3489. #endif
  3490. SIMDE_FUNCTION_ATTRIBUTES
  3491. void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
  3492. {
  3493. #if defined(SIMDE_X86_SSE_NATIVE)
  3494. _mm_store_ss(mem_addr, a);
  3495. #else
  3496. simde__m128_private a_ = simde__m128_to_private(a);
  3497. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3498. vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
  3499. #else
  3500. *mem_addr = a_.f32[0];
  3501. #endif
  3502. #endif
  3503. }
  3504. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3505. #define _mm_store_ss(mem_addr, a) \
  3506. simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST( \
  3507. float *, simde_float32 *, mem_addr), \
  3508. (a))
  3509. #endif
  3510. SIMDE_FUNCTION_ATTRIBUTES
  3511. void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
  3512. {
  3513. #if defined(SIMDE_X86_SSE_NATIVE)
  3514. _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
  3515. #else
  3516. simde__m128_private a_ = simde__m128_to_private(a);
  3517. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3518. vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t *, mem_addr),
  3519. vget_high_f32(a_.neon_f32));
  3520. #else
  3521. simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1]));
  3522. #endif
  3523. #endif
  3524. }
  3525. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3526. #define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
  3527. #endif
  3528. SIMDE_FUNCTION_ATTRIBUTES
  3529. void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
  3530. {
  3531. #if defined(SIMDE_X86_SSE_NATIVE)
  3532. _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
  3533. #else
  3534. simde__m64_private *dest_ =
  3535. HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr);
  3536. simde__m128_private a_ = simde__m128_to_private(a);
  3537. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3538. dest_->neon_f32 = vget_low_f32(a_.neon_f32);
  3539. #else
  3540. dest_->f32[0] = a_.f32[0];
  3541. dest_->f32[1] = a_.f32[1];
  3542. #endif
  3543. #endif
  3544. }
  3545. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3546. #define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
  3547. #endif
  3548. SIMDE_FUNCTION_ATTRIBUTES
  3549. void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
  3550. {
  3551. #if defined(SIMDE_X86_SSE_NATIVE)
  3552. _mm_storer_ps(mem_addr, a);
  3553. #else
  3554. simde__m128_private a_ = simde__m128_to_private(a);
  3555. #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3556. vec_st(vec_reve(a_.altivec_f32), 0, mem_addr);
  3557. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3558. float32x4_t tmp = vrev64q_f32(a_.neon_f32);
  3559. vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2));
  3560. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3561. a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
  3562. simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
  3563. #else
  3564. SIMDE_VECTORIZE_ALIGNED(mem_addr : 16)
  3565. for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
  3566. mem_addr[i] =
  3567. a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
  3568. }
  3569. #endif
  3570. #endif
  3571. }
  3572. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3573. #define _mm_storer_ps(mem_addr, a) \
  3574. simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
  3575. float *, simde_float32 *, mem_addr), \
  3576. (a))
  3577. #endif
  3578. SIMDE_FUNCTION_ATTRIBUTES
  3579. void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
  3580. {
  3581. #if defined(SIMDE_X86_SSE_NATIVE)
  3582. _mm_storeu_ps(mem_addr, a);
  3583. #else
  3584. simde__m128_private a_ = simde__m128_to_private(a);
  3585. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3586. vst1q_f32(mem_addr, a_.neon_f32);
  3587. #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
  3588. vec_vsx_st(a_.altivec_f32, 0, mem_addr);
  3589. #else
  3590. simde_memcpy(mem_addr, &a_, sizeof(a_));
  3591. #endif
  3592. #endif
  3593. }
  3594. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3595. #define _mm_storeu_ps(mem_addr, a) \
  3596. simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
  3597. float *, simde_float32 *, mem_addr), \
  3598. (a))
  3599. #endif
  3600. SIMDE_FUNCTION_ATTRIBUTES
  3601. simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
  3602. {
  3603. #if defined(SIMDE_X86_SSE_NATIVE)
  3604. return _mm_sub_ps(a, b);
  3605. #else
  3606. simde__m128_private r_, a_ = simde__m128_to_private(a),
  3607. b_ = simde__m128_to_private(b);
  3608. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3609. r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
  3610. #elif defined(SIMDE_WASM_SIMD128_NATIVE)
  3611. r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
  3612. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3613. r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32);
  3614. #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  3615. r_.f32 = a_.f32 - b_.f32;
  3616. #else
  3617. SIMDE_VECTORIZE
  3618. for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
  3619. r_.f32[i] = a_.f32[i] - b_.f32[i];
  3620. }
  3621. #endif
  3622. return simde__m128_from_private(r_);
  3623. #endif
  3624. }
  3625. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3626. #define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
  3627. #endif
  3628. SIMDE_FUNCTION_ATTRIBUTES
  3629. simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
  3630. {
  3631. #if defined(SIMDE_X86_SSE_NATIVE)
  3632. return _mm_sub_ss(a, b);
  3633. #elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
  3634. return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
  3635. #else
  3636. simde__m128_private r_, a_ = simde__m128_to_private(a),
  3637. b_ = simde__m128_to_private(b);
  3638. r_.f32[0] = a_.f32[0] - b_.f32[0];
  3639. r_.f32[1] = a_.f32[1];
  3640. r_.f32[2] = a_.f32[2];
  3641. r_.f32[3] = a_.f32[3];
  3642. return simde__m128_from_private(r_);
  3643. #endif
  3644. }
  3645. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3646. #define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
  3647. #endif
  3648. SIMDE_FUNCTION_ATTRIBUTES
  3649. int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
  3650. {
  3651. #if defined(SIMDE_X86_SSE_NATIVE)
  3652. return _mm_ucomieq_ss(a, b);
  3653. #else
  3654. simde__m128_private a_ = simde__m128_to_private(a),
  3655. b_ = simde__m128_to_private(b);
  3656. int r;
  3657. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3658. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  3659. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  3660. uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
  3661. uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
  3662. r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
  3663. #elif defined(SIMDE_HAVE_FENV_H)
  3664. fenv_t envp;
  3665. int x = feholdexcept(&envp);
  3666. r = a_.f32[0] == b_.f32[0];
  3667. if (HEDLEY_LIKELY(x == 0))
  3668. fesetenv(&envp);
  3669. #else
  3670. r = a_.f32[0] == b_.f32[0];
  3671. #endif
  3672. return r;
  3673. #endif
  3674. }
  3675. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3676. #define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
  3677. #endif
  3678. SIMDE_FUNCTION_ATTRIBUTES
  3679. int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
  3680. {
  3681. #if defined(SIMDE_X86_SSE_NATIVE)
  3682. return _mm_ucomige_ss(a, b);
  3683. #else
  3684. simde__m128_private a_ = simde__m128_to_private(a),
  3685. b_ = simde__m128_to_private(b);
  3686. int r;
  3687. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3688. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  3689. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  3690. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  3691. uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
  3692. r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
  3693. #elif defined(SIMDE_HAVE_FENV_H)
  3694. fenv_t envp;
  3695. int x = feholdexcept(&envp);
  3696. r = a_.f32[0] >= b_.f32[0];
  3697. if (HEDLEY_LIKELY(x == 0))
  3698. fesetenv(&envp);
  3699. #else
  3700. r = a_.f32[0] >= b_.f32[0];
  3701. #endif
  3702. return r;
  3703. #endif
  3704. }
  3705. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3706. #define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
  3707. #endif
  3708. SIMDE_FUNCTION_ATTRIBUTES
  3709. int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
  3710. {
  3711. #if defined(SIMDE_X86_SSE_NATIVE)
  3712. return _mm_ucomigt_ss(a, b);
  3713. #else
  3714. simde__m128_private a_ = simde__m128_to_private(a),
  3715. b_ = simde__m128_to_private(b);
  3716. int r;
  3717. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3718. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  3719. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  3720. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  3721. uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
  3722. r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
  3723. #elif defined(SIMDE_HAVE_FENV_H)
  3724. fenv_t envp;
  3725. int x = feholdexcept(&envp);
  3726. r = a_.f32[0] > b_.f32[0];
  3727. if (HEDLEY_LIKELY(x == 0))
  3728. fesetenv(&envp);
  3729. #else
  3730. r = a_.f32[0] > b_.f32[0];
  3731. #endif
  3732. return r;
  3733. #endif
  3734. }
  3735. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3736. #define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
  3737. #endif
  3738. SIMDE_FUNCTION_ATTRIBUTES
  3739. int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
  3740. {
  3741. #if defined(SIMDE_X86_SSE_NATIVE)
  3742. return _mm_ucomile_ss(a, b);
  3743. #else
  3744. simde__m128_private a_ = simde__m128_to_private(a),
  3745. b_ = simde__m128_to_private(b);
  3746. int r;
  3747. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3748. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  3749. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  3750. uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
  3751. uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
  3752. r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
  3753. #elif defined(SIMDE_HAVE_FENV_H)
  3754. fenv_t envp;
  3755. int x = feholdexcept(&envp);
  3756. r = a_.f32[0] <= b_.f32[0];
  3757. if (HEDLEY_LIKELY(x == 0))
  3758. fesetenv(&envp);
  3759. #else
  3760. r = a_.f32[0] <= b_.f32[0];
  3761. #endif
  3762. return r;
  3763. #endif
  3764. }
  3765. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3766. #define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
  3767. #endif
  3768. SIMDE_FUNCTION_ATTRIBUTES
  3769. int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
  3770. {
  3771. #if defined(SIMDE_X86_SSE_NATIVE)
  3772. return _mm_ucomilt_ss(a, b);
  3773. #else
  3774. simde__m128_private a_ = simde__m128_to_private(a),
  3775. b_ = simde__m128_to_private(b);
  3776. int r;
  3777. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3778. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  3779. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  3780. uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
  3781. uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
  3782. r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
  3783. #elif defined(SIMDE_HAVE_FENV_H)
  3784. fenv_t envp;
  3785. int x = feholdexcept(&envp);
  3786. r = a_.f32[0] < b_.f32[0];
  3787. if (HEDLEY_LIKELY(x == 0))
  3788. fesetenv(&envp);
  3789. #else
  3790. r = a_.f32[0] < b_.f32[0];
  3791. #endif
  3792. return r;
  3793. #endif
  3794. }
  3795. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3796. #define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
  3797. #endif
  3798. SIMDE_FUNCTION_ATTRIBUTES
  3799. int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
  3800. {
  3801. #if defined(SIMDE_X86_SSE_NATIVE)
  3802. return _mm_ucomineq_ss(a, b);
  3803. #else
  3804. simde__m128_private a_ = simde__m128_to_private(a),
  3805. b_ = simde__m128_to_private(b);
  3806. int r;
  3807. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3808. uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
  3809. uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
  3810. uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
  3811. uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
  3812. r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
  3813. #elif defined(SIMDE_HAVE_FENV_H)
  3814. fenv_t envp;
  3815. int x = feholdexcept(&envp);
  3816. r = a_.f32[0] != b_.f32[0];
  3817. if (HEDLEY_LIKELY(x == 0))
  3818. fesetenv(&envp);
  3819. #else
  3820. r = a_.f32[0] != b_.f32[0];
  3821. #endif
  3822. return r;
  3823. #endif
  3824. }
  3825. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3826. #define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
  3827. #endif
/* Define SIMDE_HAVE_UNDEFINED128 on native SSE builds when either
 * __has_builtin reports __builtin_ia32_undef128, or __has_builtin is not
 * available and the compiler is not PGI, not MSVC, and not flagged with
 * SIMDE_BUG_GCC_REV_208793. */
#if defined(SIMDE_X86_SSE_NATIVE)
#if defined(__has_builtin)
#if __has_builtin(__builtin_ia32_undef128)
#define SIMDE_HAVE_UNDEFINED128
#endif
#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && \
!defined(_MSC_VER)
#define SIMDE_HAVE_UNDEFINED128
#endif
#endif
/* Push the diagnostic state and silence "uninitialized" warnings when the
 * toolchain provides a way to do so.
 * NOTE(review): the only HEDLEY_DIAGNOSTIC_POP after this point is the one
 * just before the include guard closes; confirm whether this push is meant
 * to stay open for the rest of the header or has lost its matching pop. */
#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
#endif
  3842. SIMDE_FUNCTION_ATTRIBUTES
  3843. simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
  3844. {
  3845. #if defined(SIMDE_X86_SSE_NATIVE)
  3846. return _mm_unpackhi_ps(a, b);
  3847. #else
  3848. simde__m128_private r_, a_ = simde__m128_to_private(a),
  3849. b_ = simde__m128_to_private(b);
  3850. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  3851. r_.neon_f32 = vzip2q_f32(a_.neon_f32, b_.neon_f32);
  3852. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3853. float32x2_t a1 = vget_high_f32(a_.neon_f32);
  3854. float32x2_t b1 = vget_high_f32(b_.neon_f32);
  3855. float32x2x2_t result = vzip_f32(a1, b1);
  3856. r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
  3857. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3858. r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
  3859. #else
  3860. r_.f32[0] = a_.f32[2];
  3861. r_.f32[1] = b_.f32[2];
  3862. r_.f32[2] = a_.f32[3];
  3863. r_.f32[3] = b_.f32[3];
  3864. #endif
  3865. return simde__m128_from_private(r_);
  3866. #endif
  3867. }
  3868. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3869. #define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
  3870. #endif
  3871. SIMDE_FUNCTION_ATTRIBUTES
  3872. simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
  3873. {
  3874. #if defined(SIMDE_X86_SSE_NATIVE)
  3875. return _mm_unpacklo_ps(a, b);
  3876. #else
  3877. simde__m128_private r_, a_ = simde__m128_to_private(a),
  3878. b_ = simde__m128_to_private(b);
  3879. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  3880. r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32);
  3881. #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
  3882. r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32);
  3883. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  3884. r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
  3885. #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3886. float32x2_t a1 = vget_low_f32(a_.neon_f32);
  3887. float32x2_t b1 = vget_low_f32(b_.neon_f32);
  3888. float32x2x2_t result = vzip_f32(a1, b1);
  3889. r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
  3890. #else
  3891. r_.f32[0] = a_.f32[0];
  3892. r_.f32[1] = b_.f32[0];
  3893. r_.f32[2] = a_.f32[1];
  3894. r_.f32[3] = b_.f32[1];
  3895. #endif
  3896. return simde__m128_from_private(r_);
  3897. #endif
  3898. }
  3899. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3900. #define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
  3901. #endif
  3902. SIMDE_FUNCTION_ATTRIBUTES
  3903. void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
  3904. {
  3905. #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
  3906. _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
  3907. #else
  3908. simde__m64_private *dest = HEDLEY_REINTERPRET_CAST(simde__m64_private *,
  3909. mem_addr),
  3910. a_ = simde__m64_to_private(a);
  3911. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3912. dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
  3913. #else
  3914. dest->i64[0] = a_.i64[0];
  3915. #endif
  3916. #endif
  3917. }
  3918. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3919. #define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
  3920. #endif
  3921. SIMDE_FUNCTION_ATTRIBUTES
  3922. void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
  3923. {
  3924. #if defined(SIMDE_X86_SSE_NATIVE)
  3925. _mm_stream_ps(mem_addr, a);
  3926. #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && \
  3927. defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
  3928. simde__m128_private a_ = simde__m128_to_private(a);
  3929. __builtin_nontemporal_store(
  3930. a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32) *, mem_addr));
  3931. #else
  3932. simde_mm_store_ps(mem_addr, a);
  3933. #endif
  3934. }
  3935. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3936. #define _mm_stream_ps(mem_addr, a) \
  3937. simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST( \
  3938. float *, simde_float32 *, mem_addr), \
  3939. (a))
  3940. #endif
  3941. #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  3942. #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  3943. do { \
  3944. float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
  3945. float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
  3946. row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
  3947. vget_low_f32(ROW23.val[0])); \
  3948. row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
  3949. vget_low_f32(ROW23.val[1])); \
  3950. row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
  3951. vget_high_f32(ROW23.val[0])); \
  3952. row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
  3953. vget_high_f32(ROW23.val[1])); \
  3954. } while (0)
  3955. #else
  3956. #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  3957. do { \
  3958. simde__m128 tmp3, tmp2, tmp1, tmp0; \
  3959. tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
  3960. tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
  3961. tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
  3962. tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
  3963. row0 = simde_mm_movelh_ps(tmp0, tmp2); \
  3964. row1 = simde_mm_movehl_ps(tmp2, tmp0); \
  3965. row2 = simde_mm_movelh_ps(tmp1, tmp3); \
  3966. row3 = simde_mm_movehl_ps(tmp3, tmp1); \
  3967. } while (0)
  3968. #endif
  3969. #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
  3970. #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  3971. SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
  3972. #endif
/* Control/status constants. Each SIMDE_MM_* name resolves to the matching
 * native _MM_* macro when that macro is already defined, and to the literal
 * fallback value given here otherwise. The fallback literals follow the x86
 * MXCSR bit layout. */

/* Exception flag bits (bits 0..5). */
#if defined(_MM_EXCEPT_INVALID)
#define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
#else
#define SIMDE_MM_EXCEPT_INVALID (0x0001)
#endif
#if defined(_MM_EXCEPT_DENORM)
#define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
#else
#define SIMDE_MM_EXCEPT_DENORM (0x0002)
#endif
#if defined(_MM_EXCEPT_DIV_ZERO)
#define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
#else
#define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
#endif
#if defined(_MM_EXCEPT_OVERFLOW)
#define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
#else
#define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
#endif
#if defined(_MM_EXCEPT_UNDERFLOW)
#define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
#else
#define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
#endif
#if defined(_MM_EXCEPT_INEXACT)
#define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
#else
#define SIMDE_MM_EXCEPT_INEXACT (0x0020)
#endif
/* Union of all six exception flag bits (0x003f with the fallback values). */
#if defined(_MM_EXCEPT_MASK)
#define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
#else
#define SIMDE_MM_EXCEPT_MASK \
(SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \
SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
#endif
/* Exception mask bits (bits 7..12), one per exception flag above. */
#if defined(_MM_MASK_INVALID)
#define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
#else
#define SIMDE_MM_MASK_INVALID (0x0080)
#endif
#if defined(_MM_MASK_DENORM)
#define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
#else
#define SIMDE_MM_MASK_DENORM (0x0100)
#endif
#if defined(_MM_MASK_DIV_ZERO)
#define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
#else
#define SIMDE_MM_MASK_DIV_ZERO (0x0200)
#endif
#if defined(_MM_MASK_OVERFLOW)
#define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
#else
#define SIMDE_MM_MASK_OVERFLOW (0x0400)
#endif
#if defined(_MM_MASK_UNDERFLOW)
#define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
#else
#define SIMDE_MM_MASK_UNDERFLOW (0x0800)
#endif
#if defined(_MM_MASK_INEXACT)
#define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
#else
#define SIMDE_MM_MASK_INEXACT (0x1000)
#endif
/* Union of all six exception mask bits (0x1f80 with the fallback values). */
#if defined(_MM_MASK_MASK)
#define SIMDE_MM_MASK_MASK _MM_MASK_MASK
#else
#define SIMDE_MM_MASK_MASK \
(SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \
SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT)
#endif
/* Flush-to-zero control: a single bit (bit 15) with its ON and OFF values. */
#if defined(_MM_FLUSH_ZERO_MASK)
#define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
#else
#define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
#endif
#if defined(_MM_FLUSH_ZERO_ON)
#define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
#else
#define SIMDE_MM_FLUSH_ZERO_ON (0x8000)
#endif
#if defined(_MM_FLUSH_ZERO_OFF)
#define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
#else
#define SIMDE_MM_FLUSH_ZERO_OFF (0x0000)
#endif
  4064. SIMDE_END_DECLS_
  4065. HEDLEY_DIAGNOSTIC_POP
  4066. #endif /* !defined(SIMDE_X86_SSE_H) */