aes-gcm-avx512.pl

  1. # Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
  2. # Copyright (c) 2021, Intel Corporation. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. #
  10. # This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
  11. # from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
  12. # (https://github.com/intel/intel-ipsec-mb).
  13. # Original author is Tomasz Kantecki <[email protected]>.
  14. #
  15. # References:
  16. # [1] Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on
  17. # Intel Architecture Processors. August, 2010.
  18. # [2] Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on
  19. # Intel Architecture Processors. October, 2012.
  20. # [3] Shay Gueron et al. Intel Carry-Less Multiplication Instruction and its
  21. # Usage for Computing the GCM Mode. May, 2010.
  22. #
  23. #
  24. # December 2021
  25. #
  26. # Initial release.
  27. #
  28. # The GCM128_CONTEXT structure has storage for only 16 hkeys, but this
  29. # implementation can use up to 48. To avoid extending the context size,
  30. # only the first 16 hkeys are precomputed and stored in the context; the
  31. # rest are computed on demand and kept in the local stack frame.
  32. #
  33. #======================================================================
  34. # $output is the last argument if it looks like a file (it has an extension)
  35. # $flavour is the first argument if it doesn't look like a file
  36. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  37. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  38. $win64 = 0;
  39. $win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  40. $avx512vaes = 0;
  41. $0 =~ m/(.*[\/\\])[^\/\\]+$/;
  42. $dir = $1;
  43. ($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
  44. or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
  45. or die "can't locate x86_64-xlate.pl";
  46. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  47. $avx512vaes = ($1 >= 2.30);
  48. }
  49. if (!$avx512vaes
  50. && $win64
  51. && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
  52. && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
  53. {
  54. $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
  55. }
  56. if (!$avx512vaes && `$ENV{CC} -v 2>&1`
  57. =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
  58. my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
  59. if ($1) {
  60. # Apple conditions, they use a different version series, see
  61. # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
  62. # clang 7.0.0 is Apple clang 10.0.1
  63. $avx512vaes = ($ver>=10.0001)
  64. } else {
  65. $avx512vaes = ($ver>=7.0);
  66. }
  67. }
  68. open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
  69. or die "can't call $xlate: $!";
  70. *STDOUT = *OUT;
  71. #======================================================================
  72. if ($avx512vaes>0) { #<<<
  73. $code .= <<___;
  74. .extern OPENSSL_ia32cap_P
  75. .globl ossl_vaes_vpclmulqdq_capable
  76. .type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
  77. .align 32
  78. ossl_vaes_vpclmulqdq_capable:
  79. mov OPENSSL_ia32cap_P+8(%rip), %rcx
  80. # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
  81. mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
  82. xor %eax,%eax
  83. and %rdx,%rcx
  84. cmp %rdx,%rcx
  85. cmove %rcx,%rax
  86. ret
  87. .size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
  88. ___
  89. # ; Mapping key length -> AES rounds count
  90. my %aes_rounds = (
  91. 128 => 9,
  92. 192 => 11,
  93. 256 => 13);
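# ; Illustrative note (an assumption about the convention, not from the original
# ; comments): the value is the number of "middle" vaesenc rounds, i.e. it
# ; excludes round 0 (initial key xor) and the final vaesenclast round, e.g.:
# ;   my $NROUNDS = $aes_rounds{128};  # 9 -> rounds 0 .. ($NROUNDS + 1), 11 round keys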
  94. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  95. # ;;; Code generation control switches
  96. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  97. # ; ABI-aware zeroing of volatile registers in EPILOG().
  98. # ; Disabled due to performance reasons.
  99. my $CLEAR_SCRATCH_REGISTERS = 0;
  100. # ; Zero HKeys storage from the stack if they are stored there
  101. my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;
  102. # ; Enable / disable check of function arguments for null pointer
  103. # ; Currently disabled, as this check is handled outside.
  104. my $CHECK_FUNCTION_ARGUMENTS = 0;
  105. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  106. # ;;; Global constants
  107. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  108. # AES block size in bytes
  109. my $AES_BLOCK_SIZE = 16;
  110. # Storage capacity in elements
  111. my $HKEYS_STORAGE_CAPACITY = 48;
  112. my $LOCAL_STORAGE_CAPACITY = 48;
  113. my $HKEYS_CONTEXT_CAPACITY = 16;
  114. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  115. # ;;; Stack frame definition
  116. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  117. # (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
  118. # (2) -> +8-byte space for 16-byte alignment of XMM storage
  119. # (3) -> Frame pointer (%RBP)
  120. # (4) -> +160-byte XMM storage (Windows only, zero on Linux)
  121. # (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
  122. # (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
  123. # (7) -> +768-byte HKEYS storage
  124. # (8) -> Stack pointer (%RSP) aligned on 64-byte boundary
  125. my $GP_STORAGE = $win64 ? 8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack)
  126. my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers
  127. my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48
  128. my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks
  129. my $STACK_HKEYS_OFFSET = 0;
  130. my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
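# ; Worked example (illustrative only, derived from the values above): on
# ; Windows the frame reserves $GP_STORAGE = 64 bytes for pushed GPRs and
# ; $XMM_STORAGE = 160 bytes for xmm6-xmm15 (48 and 0 bytes on Linux). The
# ; dynamically allocated area holds $HKEYS_STORAGE = 48 * 16 = 768 bytes at
# ; $STACK_HKEYS_OFFSET = 0 and, when requested, $LOCAL_STORAGE = 48 * 16 = 768
# ; bytes at $STACK_LOCAL_OFFSET = 768, both addressed from the 64-byte
# ; aligned %rsp.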
  131. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  132. # ;;; Function arguments abstraction
  133. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  134. my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);
  135. # ; This implementation follows the convention that non-leaf functions (which
  136. # ; must call PROLOG) use %rbp as a frame pointer with a fixed offset from
  137. # ; the function entry: $GP_STORAGE + [8 bytes of alignment (Windows only)].
  138. # ; This makes writing SEH handlers easier.
  139. #
  140. # ; Leaf functions here do not use more than 4 input arguments.
  141. if ($win64) {
  142. $arg1 = "%rcx";
  143. $arg2 = "%rdx";
  144. $arg3 = "%r8";
  145. $arg4 = "%r9";
  146. $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes
  147. $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
  148. $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
  149. $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
  150. $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
  151. $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
  152. $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
  153. } else {
  154. $arg1 = "%rdi";
  155. $arg2 = "%rsi";
  156. $arg3 = "%rdx";
  157. $arg4 = "%rcx";
  158. $arg5 = "%r8";
  159. $arg6 = "%r9";
  160. $arg7 = "`$GP_STORAGE + 8*1`(%rbp)";
  161. $arg8 = "`$GP_STORAGE + 8*2`(%rbp)";
  162. $arg9 = "`$GP_STORAGE + 8*3`(%rbp)";
  163. $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
  164. $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
  165. }
  166. # ; Offsets in gcm128_context structure (see include/crypto/modes.h)
  167. my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key
  168. my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer
  169. my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
  170. my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input
  171. my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted
  172. my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash
  173. my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values)
  174. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  175. # ;;; Helper functions
  176. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  177. # ; Generates "random" local labels
  178. sub random_string() {
  179. my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
  180. my $length = 15;
  181. my $str;
  182. map { $str .= $chars[rand(33)] } 1 .. $length;
  183. return $str;
  184. }
  185. sub BYTE {
  186. my ($reg) = @_;
  187. if ($reg =~ /%r[abcd]x/i) {
  188. $reg =~ s/%r([abcd])x/%${1}l/i;
  189. } elsif ($reg =~ /%r[sdb][ip]/i) {
  190. $reg =~ s/%r([sdb][ip])/%${1}l/i;
  191. } elsif ($reg =~ /%r[0-9]{1,2}/i) {
  192. $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
  193. } else {
  194. die "BYTE: unknown register: $reg\n";
  195. }
  196. return $reg;
  197. }
  198. sub WORD {
  199. my ($reg) = @_;
  200. if ($reg =~ /%r[abcdsdb][xip]/i) {
  201. $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
  202. } elsif ($reg =~ /%r[0-9]{1,2}/) {
  203. $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
  204. } else {
  205. die "WORD: unknown register: $reg\n";
  206. }
  207. return $reg;
  208. }
  209. sub DWORD {
  210. my ($reg) = @_;
  211. if ($reg =~ /%r[abcdsdb][xip]/i) {
  212. $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
  213. } elsif ($reg =~ /%r[0-9]{1,2}/i) {
  214. $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
  215. } else {
  216. die "DWORD: unknown register: $reg\n";
  217. }
  218. return $reg;
  219. }
  220. sub XWORD {
  221. my ($reg) = @_;
  222. if ($reg =~ /%[xyz]mm/i) {
  223. $reg =~ s/%[xyz]mm/%xmm/i;
  224. } else {
  225. die "XWORD: unknown register: $reg\n";
  226. }
  227. return $reg;
  228. }
  229. sub YWORD {
  230. my ($reg) = @_;
  231. if ($reg =~ /%[xyz]mm/i) {
  232. $reg =~ s/%[xyz]mm/%ymm/i;
  233. } else {
  234. die "YWORD: unknown register: $reg\n";
  235. }
  236. return $reg;
  237. }
  238. sub ZWORD {
  239. my ($reg) = @_;
  240. if ($reg =~ /%[xyz]mm/i) {
  241. $reg =~ s/%[xyz]mm/%zmm/i;
  242. } else {
  243. die "ZWORD: unknown register: $reg\n";
  244. }
  245. return $reg;
  246. }
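# ; Illustrative examples of the register-width helpers (not part of the
# ; original code):
# ;   BYTE("%rax")   -> "%al"     WORD("%rax")   -> "%ax"
# ;   DWORD("%r12")  -> "%r12d"   XWORD("%zmm3") -> "%xmm3"
# ;   YWORD("%xmm9") -> "%ymm9"   ZWORD("%ymm20") -> "%zmm20"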
  247. # ; Helper function to construct an effective address based on two kinds of
  248. # ; offsets: numerical, or held in a register
  249. sub EffectiveAddress {
  250. my ($base, $offset, $displacement) = @_;
  251. $displacement = 0 if (!$displacement);
  252. if ($offset =~ /^\d+\z/) { # numerical offset
  253. return "`$offset + $displacement`($base)";
  254. } else { # offset resides in register
  255. return "$displacement($base,$offset,1)";
  256. }
  257. }
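# ; Illustrative examples (not part of the original code; the backticked
# ; arithmetic is folded when the generated code is post-processed):
# ;   EffectiveAddress("%rsi", 128, 64)    -> "`128 + 64`(%rsi)", i.e. 192(%rsi)
# ;   EffectiveAddress("%rsi", "%r11", 64) -> "64(%rsi,%r11,1)"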
  258. # ; Provides memory location of corresponding HashKey power
  259. sub HashKeyByIdx {
  260. my ($idx, $base) = @_;
  261. my $base_str = ($base eq "%rsp") ? "frame" : "context";
  262. my $offset = &HashKeyOffsetByIdx($idx, $base_str);
  263. return "$offset($base)";
  264. }
  265. # ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
  266. sub HashKeyOffsetByIdx {
  267. my ($idx, $base) = @_;
  268. die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
  269. if (($base ne "frame") && ($base ne "context"));
  270. my $offset_base;
  271. my $offset_idx;
  272. if ($base eq "frame") { # frame storage
  273. die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
  274. $offset_base = $STACK_HKEYS_OFFSET;
  275. $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
  276. } else { # context storage
  277. die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
  278. $offset_base = $CTX_OFFSET_HTable;
  279. $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
  280. }
  281. return $offset_base + $offset_idx;
  282. }
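# ; Worked example (illustrative only): HashKeyByIdx(1, "%rsp") resolves to
# ; "752(%rsp)", since HashKeyOffsetByIdx(1, "frame") = 0 + 16 * (48 - 1) = 752;
# ; with a context base register, HashKeyByIdx(16, ...) resolves to offset
# ; $CTX_OFFSET_HTable + 16 * (16 - 16) = 96 off that register.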
  283. # ; Creates the local frame and backs up non-volatile registers.
  284. # ; Holds stack unwinding directives.
  285. sub PROLOG {
  286. my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;
  287. my $DYNAMIC_STACK_ALLOC_SIZE = 0;
  288. my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;
  289. if ($need_hkeys_stack_storage) {
  290. $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
  291. }
  292. if ($need_aes_stack_storage) {
  293. if (!$need_hkeys_stack_storage) {
  294. die "PROLOG: unsupported case - aes storage without hkeys one";
  295. }
  296. $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
  297. }
  298. $code .= <<___;
  299. push %rbx
  300. .cfi_push %rbx
  301. .L${func_name}_seh_push_rbx:
  302. push %rbp
  303. .cfi_push %rbp
  304. .L${func_name}_seh_push_rbp:
  305. push %r12
  306. .cfi_push %r12
  307. .L${func_name}_seh_push_r12:
  308. push %r13
  309. .cfi_push %r13
  310. .L${func_name}_seh_push_r13:
  311. push %r14
  312. .cfi_push %r14
  313. .L${func_name}_seh_push_r14:
  314. push %r15
  315. .cfi_push %r15
  316. .L${func_name}_seh_push_r15:
  317. ___
  318. if ($win64) {
  319. $code .= <<___;
  320. push %rdi
  321. .L${func_name}_seh_push_rdi:
  322. push %rsi
  323. .L${func_name}_seh_push_rsi:
  324. sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment
  325. .L${func_name}_seh_allocstack_xmm:
  326. ___
  327. }
  328. $code .= <<___;
  329. # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
  330. # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
  331. # ; handlers. The requirement for a frame pointer is that its offset from
  332. # ; RSP must be a multiple of 16 and must not exceed 240 bytes. Using the
  333. # ; frame pointer is reasonable here because the 64-byte stack alignment
  334. # ; performed later makes RSP-relative offsets non-deterministic and would
  335. # ; complicate writing SEH handlers.
  336. #
  337. # ; It also serves as an anchor for retrieving stack arguments on both Linux
  338. # ; and Windows.
  339. lea `$XMM_STORAGE`(%rsp),%rbp
  340. .cfi_def_cfa_register %rbp
  341. .L${func_name}_seh_setfp:
  342. ___
  343. if ($win64) {
  344. # ; xmm6:xmm15 need to be preserved on Windows
  345. foreach my $reg_idx (6 .. 15) {
  346. my $xmm_reg_offset = ($reg_idx - 6) * 16;
  347. $code .= <<___;
  348. vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
  349. .L${func_name}_seh_save_xmm${reg_idx}:
  350. ___
  351. }
  352. }
  353. $code .= <<___;
  354. # Prolog ends here. Next stack allocation is treated as "dynamic".
  355. .L${func_name}_seh_prolog_end:
  356. ___
  357. if ($DYNAMIC_STACK_ALLOC_SIZE) {
  358. $code .= <<___;
  359. sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
  360. and \$(-64),%rsp
  361. ___
  362. }
  363. }
  364. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  365. # ;;; Restore register content for the caller.
  366. # ;;; And cleanup stack.
  367. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  368. sub EPILOG {
  369. my ($hkeys_storage_on_stack, $payload_len) = @_;
  370. my $rndsuffix = &random_string();
  371. if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {
  372. # ; There is no need for hkeys cleanup if the payload length was small, i.e.
  373. # ; no hkeys were stored in the local frame storage
  374. $code .= <<___;
  375. cmpq \$`16*16`,$payload_len
  376. jbe .Lskip_hkeys_cleanup_${rndsuffix}
  377. vpxor %xmm0,%xmm0,%xmm0
  378. ___
  379. for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
  380. $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
  381. }
  382. $code .= ".Lskip_hkeys_cleanup_${rndsuffix}:\n";
  383. }
  384. if ($CLEAR_SCRATCH_REGISTERS) {
  385. &clear_scratch_gps_asm();
  386. &clear_scratch_zmms_asm();
  387. } else {
  388. $code .= "vzeroupper\n";
  389. }
  390. if ($win64) {
  391. # ; restore xmm15:xmm6
  392. for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
  393. my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
  394. $code .= <<___;
  395. vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx}
  396. ___
  397. }
  398. }
  399. if ($win64) {
  400. # Forming valid epilog for SEH with use of frame pointer.
  401. # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
  402. $code .= "lea 8(%rbp),%rsp\n";
  403. } else {
  404. $code .= "lea (%rbp),%rsp\n";
  405. $code .= ".cfi_def_cfa_register %rsp\n";
  406. }
  407. if ($win64) {
  408. $code .= <<___;
  409. pop %rsi
  410. .cfi_pop %rsi
  411. pop %rdi
  412. .cfi_pop %rdi
  413. ___
  414. }
  415. $code .= <<___;
  416. pop %r15
  417. .cfi_pop %r15
  418. pop %r14
  419. .cfi_pop %r14
  420. pop %r13
  421. .cfi_pop %r13
  422. pop %r12
  423. .cfi_pop %r12
  424. pop %rbp
  425. .cfi_pop %rbp
  426. pop %rbx
  427. .cfi_pop %rbx
  428. ___
  429. }
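# ; Illustrative pairing of PROLOG/EPILOG (hypothetical call; the function and
# ; variable names are placeholders): a non-leaf GCM routine creates the frame
# ; with both HKeys and local AES-block stack storage and tears it down again:
# ;   &PROLOG(1, 1, "function_name");
# ;   ...                            # function body
# ;   &EPILOG(1, $PLAIN_CIPH_LEN);   # hkeys were on stack; cleared only if payload > 16*16 bytes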
  430. # ; Clears all scratch ZMM registers
  431. # ;
  432. # ; It should be called before restoring the XMM registers
  433. # ; for Windows (XMM6-XMM15).
  434. # ;
  435. sub clear_scratch_zmms_asm {
  436. # ; On Linux, all ZMM registers are scratch registers
  437. if (!$win64) {
  438. $code .= "vzeroall\n";
  439. } else {
  440. foreach my $i (0 .. 5) {
  441. $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  442. }
  443. }
  444. foreach my $i (16 .. 31) {
  445. $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
  446. }
  447. }
  448. # Clears all scratch GP registers
  449. sub clear_scratch_gps_asm {
  450. foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
  451. $code .= "xor $reg,$reg\n";
  452. }
  453. if (!$win64) {
  454. foreach my $reg ("%rsi", "%rdi") {
  455. $code .= "xor $reg,$reg\n";
  456. }
  457. }
  458. }
  459. sub precompute_hkeys_on_stack {
  460. my $GCM128_CTX = $_[0];
  461. my $HKEYS_READY = $_[1];
  462. my $ZTMP0 = $_[2];
  463. my $ZTMP1 = $_[3];
  464. my $ZTMP2 = $_[4];
  465. my $ZTMP3 = $_[5];
  466. my $ZTMP4 = $_[6];
  467. my $ZTMP5 = $_[7];
  468. my $ZTMP6 = $_[8];
  469. my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32"
  470. die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
  471. if ($HKEYS_RANGE ne "first16"
  472. && $HKEYS_RANGE ne "mid16"
  473. && $HKEYS_RANGE ne "all"
  474. && $HKEYS_RANGE ne "first32"
  475. && $HKEYS_RANGE ne "last32");
  476. my $rndsuffix = &random_string();
  477. $code .= <<___;
  478. test $HKEYS_READY,$HKEYS_READY
  479. jnz .L_skip_hkeys_precomputation_${rndsuffix}
  480. ___
  481. if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {
  482. # ; Fill the stack with the first 16 hkeys from the context
  483. $code .= <<___;
  484. # ; Move 16 hkeys from the context to stack
  485. vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
  486. vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}
  487. vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
  488. vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}
  489. # ; broadcast HashKey^8
  490. vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
  491. vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
  492. vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}
  493. vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
  494. vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
  495. ___
  496. }
  497. if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
  498. $code .= <<___;
  499. vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1
  500. # ; broadcast HashKey^8
  501. vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
  502. vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
  503. vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
  504. ___
  505. }
  506. if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
  507. # ; Precompute hkeys^i, i=17..32
  508. my $i = 20;
  509. foreach (1 .. int((32 - 16) / 8)) {
  510. # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
  511. &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  512. $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  513. $i += 4;
  514. # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
  515. &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  516. $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  517. $i += 4;
  518. }
  519. }
  520. if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
  521. # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
  522. my $i = 36;
  523. foreach (1 .. int((48 - 32) / 8)) {
  524. # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
  525. &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  526. $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  527. $i += 4;
  528. # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
  529. &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
  530. $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
  531. $i += 4;
  532. }
  533. }
  534. $code .= ".L_skip_hkeys_precomputation_${rndsuffix}:\n";
  535. }
  536. # ;; =============================================================================
  537. # ;; Generic macro producing code that executes the $OPCODE instruction
  538. # ;; on a selected number of 16-byte AES blocks, between 0 and 16.
  539. # ;; All three operands of the instruction come from registers.
  540. # ;; Note: if 3 blocks are left at the end, an instruction is produced that
  541. # ;; operates on all 4 blocks (full width of a ZMM register).
  542. sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
  543. my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
  544. my $OPCODE = $_[1]; # [in] instruction name
  545. my @DST;
  546. $DST[0] = $_[2]; # [out] destination ZMM register
  547. $DST[1] = $_[3]; # [out] destination ZMM register
  548. $DST[2] = $_[4]; # [out] destination ZMM register
  549. $DST[3] = $_[5]; # [out] destination ZMM register
  550. my @SRC1;
  551. $SRC1[0] = $_[6]; # [in] source 1 ZMM register
  552. $SRC1[1] = $_[7]; # [in] source 1 ZMM register
  553. $SRC1[2] = $_[8]; # [in] source 1 ZMM register
  554. $SRC1[3] = $_[9]; # [in] source 1 ZMM register
  555. my @SRC2;
  556. $SRC2[0] = $_[10]; # [in] source 2 ZMM register
  557. $SRC2[1] = $_[11]; # [in] source 2 ZMM register
  558. $SRC2[2] = $_[12]; # [in] source 2 ZMM register
  559. $SRC2[3] = $_[13]; # [in] source 2 ZMM register
  560. die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
  561. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  562. my $reg_idx = 0;
  563. my $blocks_left = $NUM_BLOCKS;
  564. foreach (1 .. ($NUM_BLOCKS / 4)) {
  565. $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
  566. $reg_idx++;
  567. $blocks_left -= 4;
  568. }
  569. my $DSTREG = $DST[$reg_idx];
  570. my $SRC1REG = $SRC1[$reg_idx];
  571. my $SRC2REG = $SRC2[$reg_idx];
  572. if ($blocks_left == 1) {
  573. $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
  574. } elsif ($blocks_left == 2) {
  575. $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
  576. } elsif ($blocks_left == 3) {
  577. $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
  578. }
  579. }
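# ; Worked example (illustrative only; %zmm0-%zmm4 stand in for the caller's
# ; registers): with $NUM_BLOCKS = 6, the macro emits one full-width instruction
# ; for blocks 0-3 and one YMM-width instruction for the remaining 2 blocks:
# ;   &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(6, "vaesenc",
# ;       "%zmm0", "%zmm1", "%zmm2", "%zmm3",
# ;       "%zmm0", "%zmm1", "%zmm2", "%zmm3",
# ;       "%zmm4", "%zmm4", "%zmm4", "%zmm4");
# ; generates:
# ;   vaesenc %zmm4,%zmm0,%zmm0
# ;   vaesenc %ymm4,%ymm1,%ymm1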
  580. # ;; =============================================================================
  581. # ;; Loads specified number of AES blocks into ZMM registers using mask register
  582. # ;; for the last loaded register (xmm, ymm or zmm).
  583. # ;; Loads take place at 1 byte granularity.
  584. sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
  585. my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
  586. my $INP = $_[1]; # [in] input data pointer to read from
  587. my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
  588. my @DST;
  589. $DST[0] = $_[3]; # [out] ZMM register with loaded data
  590. $DST[1] = $_[4]; # [out] ZMM register with loaded data
  591. $DST[2] = $_[5]; # [out] ZMM register with loaded data
  592. $DST[3] = $_[6]; # [out] ZMM register with loaded data
  593. my $MASK = $_[7]; # [in] mask register
  594. die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
  595. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  596. my $src_offset = 0;
  597. my $dst_idx = 0;
  598. my $blocks_left = $NUM_BLOCKS;
  599. if ($NUM_BLOCKS > 0) {
  600. foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
  601. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
  602. $src_offset += 64;
  603. $dst_idx++;
  604. $blocks_left -= 4;
  605. }
  606. }
  607. my $DSTREG = $DST[$dst_idx];
  608. if ($blocks_left == 1) {
  609. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
  610. } elsif ($blocks_left == 2) {
  611. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
  612. } elsif (($blocks_left == 3 || $blocks_left == 4)) {
  613. $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
  614. }
  615. }
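# ; Illustrative usage (an assumption about typical call sites; registers and
# ; the fixed mask are placeholders - in practice the mask is computed from the
# ; remaining length): loading 5 blocks where the last one holds only 6 bytes:
# ;   mov     \$0x3f,%r11d
# ;   kmovq   %r11,%k1
# ;   &ZMM_LOAD_MASKED_BLOCKS_0_16(5, "%rsi", 0, "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%k1");
# ; generates:
# ;   vmovdqu8 0(%rsi),%zmm0
# ;   vmovdqu8 64(%rsi),%xmm1{%k1}{z}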
  616. # ;; =============================================================================
  617. # ;; Stores specified number of AES blocks from ZMM registers with mask register
  618. # ;; for the last stored register (xmm, ymm or zmm).
  619. # ;; Stores take place at 1 byte granularity.
  620. sub ZMM_STORE_MASKED_BLOCKS_0_16 {
  621. my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
  622. my $OUTP = $_[1]; # [in] output data pointer to write to
  623. my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
  624. my @SRC;
  625. $SRC[0] = $_[3]; # [in] ZMM register with data to store
  626. $SRC[1] = $_[4]; # [in] ZMM register with data to store
  627. $SRC[2] = $_[5]; # [in] ZMM register with data to store
  628. $SRC[3] = $_[6]; # [in] ZMM register with data to store
  629. my $MASK = $_[7]; # [in] mask register
  630. die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
  631. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  632. my $dst_offset = 0;
  633. my $src_idx = 0;
  634. my $blocks_left = $NUM_BLOCKS;
  635. if ($NUM_BLOCKS > 0) {
  636. foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
  637. $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
  638. $dst_offset += 64;
  639. $src_idx++;
  640. $blocks_left -= 4;
  641. }
  642. }
  643. my $SRCREG = $SRC[$src_idx];
  644. if ($blocks_left == 1) {
  645. $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  646. } elsif ($blocks_left == 2) {
  647. $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  648. } elsif ($blocks_left == 3 || $blocks_left == 4) {
  649. $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
  650. }
  651. }
  652. # ;;; ===========================================================================
  653. # ;;; Handles AES encryption rounds
  654. # ;;; It handles special cases: the last and first rounds
  655. # ;;; Optionally, it performs XOR with data after the last AES round.
  656. # ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
  657. # ;;; If 3 blocks are trailing, the operation is performed on a whole ZMM (4 blocks).
  658. sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
  659. my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3
  660. my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7
  661. my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11
  662. my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15
  663. my $KEY = $_[4]; # [in] zmm containing round key
  664. my $ROUND = $_[5]; # [in] round number
  665. my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3
  666. my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7
  667. my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11
  668. my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15
  669. my $NUMBL = $_[10]; # [in] number of blocks; numerical value
  670. my $NROUNDS = $_[11]; # [in] number of rounds; numerical value
  671. # ;;; === first AES round
  672. if ($ROUND < 1) {
  673. # ;; round 0
  674. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  675. $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  676. $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  677. }
  678. # ;;; === middle AES rounds
  679. if ($ROUND >= 1 && $ROUND <= $NROUNDS) {
  680. # ;; rounds 1 to 9/11/13
  681. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  682. $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  683. $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  684. }
  685. # ;;; === last AES round
  686. if ($ROUND > $NROUNDS) {
  687. # ;; the last round - mix enclast with text xor's
  688. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  689. $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  690. $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
  691. # ;;; === XOR with data
  692. if ( ($D0_3 ne "no_data")
  693. && ($D4_7 ne "no_data")
  694. && ($D8_11 ne "no_data")
  695. && ($D12_15 ne "no_data"))
  696. {
  697. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  698. $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
  699. $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
  700. }
  701. }
  702. }
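# ; Illustrative driver loop (an assumption about typical usage; $AES_KEYS,
# ; $KEY, $B0..$B3 and $D0..$D3 are placeholder variables): each 128-bit round
# ; key is broadcast to a full ZMM and applied to all 16 blocks in turn:
# ;   foreach my $j (0 .. ($NROUNDS + 1)) {
# ;     $code .= "vbroadcastf64x2 `16*$j`($AES_KEYS),$KEY\n";
# ;     &ZMM_AESENC_ROUND_BLOCKS_0_16($B0, $B1, $B2, $B3, $KEY, $j,
# ;                                   $D0, $D1, $D2, $D3, 16, $NROUNDS);
# ;   }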
  703. # ;;; Horizontal XOR - 4 x 128bits xored together
  704. sub VHPXORI4x128 {
  705. my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output
  706. my $TMP = $_[1]; # [clobbered] ZMM temporary register
  707. $code .= <<___;
  708. vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
  709. vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
  710. vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
  711. vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
  712. ___
  713. }
  714. # ;;; AVX512 reduction macro
  715. sub VCLMUL_REDUCE {
  716. my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
  717. my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial
  718. my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce
  719. my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce
  720. my $TMP0 = $_[4]; # [in] zmm/ymm/xmm: temporary register
  721. my $TMP1 = $_[5]; # [in] zmm/ymm/xmm: temporary register
  722. $code .= <<___;
  723. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  724. # ;; first phase of the reduction
  725. vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
  726. vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs
  727. vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete
  728. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  729. # ;; second phase of the reduction
  730. vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
  731. vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R
  732. vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
  733. vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts
  734. vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128
  735. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  736. ___
  737. }
  738. # ;; ===========================================================================
  739. # ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
  740. # ;; - data read from $INPTR is assumed to be already shuffled and the
  741. # ;; $INPTR address is assumed to be 64-byte aligned
  742. # ;; - ready blocks can optionally be passed in through ZMM registers instead;
  743. # ;; in that case 4 extra parameters are passed and the 21st argument ($ZTMP9) can be empty
  744. sub GHASH_16 {
  745. my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
  746. # end_reduce (end with reduction), start_reduce
  747. my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits
  748. my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits
  749. my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits
  750. my $INPTR = $_[4]; # [in] data input pointer
  751. my $INOFF = $_[5]; # [in] data input offset
  752. my $INDIS = $_[6]; # [in] data input displacement
  753. my $HKPTR = $_[7]; # [in] hash key pointer
  754. my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset)
  755. my $HKDIS = $_[9]; # [in] hash key displacement
  756. my $HASH = $_[10]; # [in/out] ZMM hash value in/out
  757. my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM
  758. my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM
  759. my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM
  760. my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM
  761. my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM
  762. my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM
  763. my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM
  764. my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM
  765. my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM
  766. my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
  767. my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  768. my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  769. my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  770. my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
  771. my $start_ghash = 0;
  772. my $do_reduction = 0;
  773. if ($TYPE eq "start") {
  774. $start_ghash = 1;
  775. }
  776. if ($TYPE eq "start_reduce") {
  777. $start_ghash = 1;
  778. $do_reduction = 1;
  779. }
  780. if ($TYPE eq "end_reduce") {
  781. $do_reduction = 1;
  782. }
  783. # ;; ghash blocks 0-3
  784. if (scalar(@_) == 21) {
  785. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
  786. } else {
  787. $ZTMP9 = $DAT0;
  788. }
  789. if ($start_ghash != 0) {
  790. $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
  791. }
  792. $code .= <<___;
  793. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
  794. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
  795. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
  796. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
  797. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
  798. ___
  799. # ;; ghash blocks 4-7
  800. if (scalar(@_) == 21) {
  801. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
  802. } else {
  803. $ZTMP9 = $DAT1;
  804. }
  805. $code .= <<___;
  806. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
  807. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
  808. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
  809. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
  810. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
  811. ___
  812. # ;; update sums
  813. if ($start_ghash != 0) {
  814. $code .= <<___;
  815. vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1
  816. vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H
  817. vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L
  818. vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M2
  819. ___
  820. } else { # ;; mid, end, end_reduce
  821. $code .= <<___;
  822. vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
  823. vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
  824. vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
  825. vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M2
  826. ___
  827. }
  828. # ;; ghash blocks 8-11
  829. if (scalar(@_) == 21) {
  830. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
  831. } else {
  832. $ZTMP9 = $DAT2;
  833. }
  834. $code .= <<___;
  835. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
  836. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
  837. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
  838. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
  839. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
  840. ___
  841. # ;; ghash blocks 12-15
  842. if (scalar(@_) == 21) {
  843. $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
  844. } else {
  845. $ZTMP9 = $DAT3;
  846. }
  847. $code .= <<___;
  848. vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
  849. vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
  850. vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
  851. vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
  852. vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
  853. # ;; update sums
  854. vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
  855. vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
  856. vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
  857. vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M2
  858. ___
  859. if ($do_reduction != 0) {
  860. $code .= <<___;
  861. # ;; integrate GM into GH and GL
  862. vpsrldq \$8,$GM,$ZTMP0
  863. vpslldq \$8,$GM,$ZTMP1
  864. vpxorq $ZTMP0,$GH,$GH
  865. vpxorq $ZTMP1,$GL,$GL
  866. ___
  867. # ;; add GH and GL 128-bit words horizontally
  868. &VHPXORI4x128($GH, $ZTMP0);
  869. &VHPXORI4x128($GL, $ZTMP1);
  870. # ;; reduction
  871. $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
  872. &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
  873. }
  874. }
  875. # ;; ===========================================================================
  876. # ;; GHASH 1 to 16 blocks of cipher text
  877. # ;; - performs reduction at the end
# ;; - it does not load the data; the data is assumed to be already loaded and shuffled
  879. sub GHASH_1_TO_16 {
  880. my $GCM128_CTX = $_[0]; # [in] pointer to expanded keys
  881. my $GHASH = $_[1]; # [out] ghash output
  882. my $T0H = $_[2]; # [clobbered] temporary ZMM
  883. my $T0L = $_[3]; # [clobbered] temporary ZMM
  884. my $T0M1 = $_[4]; # [clobbered] temporary ZMM
  885. my $T0M2 = $_[5]; # [clobbered] temporary ZMM
  886. my $T1H = $_[6]; # [clobbered] temporary ZMM
  887. my $T1L = $_[7]; # [clobbered] temporary ZMM
  888. my $T1M1 = $_[8]; # [clobbered] temporary ZMM
  889. my $T1M2 = $_[9]; # [clobbered] temporary ZMM
  890. my $HK = $_[10]; # [clobbered] temporary ZMM
  891. my $AAD_HASH_IN = $_[11]; # [in] input hash value
  892. my @CIPHER_IN;
  893. $CIPHER_IN[0] = $_[12]; # [in] ZMM with cipher text blocks 0-3
  894. $CIPHER_IN[1] = $_[13]; # [in] ZMM with cipher text blocks 4-7
  895. $CIPHER_IN[2] = $_[14]; # [in] ZMM with cipher text blocks 8-11
  896. $CIPHER_IN[3] = $_[15]; # [in] ZMM with cipher text blocks 12-15
  897. my $NUM_BLOCKS = $_[16]; # [in] numerical value, number of blocks
  898. my $GH = $_[17]; # [in] ZMM with hi product part
  899. my $GM = $_[18]; # [in] ZMM with mid product part
  900. my $GL = $_[19]; # [in] ZMM with lo product part
  901. die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  902. if (scalar(@_) == 17) {
  903. $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
  904. }
  905. if ($NUM_BLOCKS == 16) {
  906. $code .= <<___;
  907. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  908. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
  909. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
  910. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
  911. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
  912. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
  913. vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
  914. vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
  915. vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
  916. vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
  917. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
  918. vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
  919. vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
  920. vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H
  921. vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L
  922. vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
  923. vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
  924. vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
  925. vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
  926. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
  927. vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H # ; H = a1*b1
  928. vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L # ; L = a0*b0
  929. vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1 # ; M1 = a1*b0
  930. vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2 # ; M2 = a0*b1
  931. vpxorq $T1H,$T0H,$T1H
  932. vpxorq $T1L,$T0L,$T1L
  933. vpxorq $T1M1,$T0M1,$T1M1
  934. vpxorq $T1M2,$T0M2,$T1M2
  935. ___
  936. } elsif ($NUM_BLOCKS >= 12) {
  937. $code .= <<___;
  938. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  939. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
  940. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
  941. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
  942. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
  943. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
  944. vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
  945. vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
  946. vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
  947. vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
  948. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
  949. vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
  950. vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
  951. vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H
  952. vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L
  953. vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
  954. vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
  955. vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
  956. vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
  957. ___
  958. } elsif ($NUM_BLOCKS >= 8) {
  959. $code .= <<___;
  960. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  961. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
  962. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
  963. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
  964. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
  965. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
  966. vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
  967. vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
  968. vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
  969. vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
  970. vpxorq $T1H,$T0H,$T1H
  971. vpxorq $T1L,$T0L,$T1L
  972. vpxorq $T1M1,$T0M1,$T1M1
  973. vpxorq $T1M2,$T0M2,$T1M2
  974. ___
  975. } elsif ($NUM_BLOCKS >= 4) {
  976. $code .= <<___;
  977. vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
  978. vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H # ; H = a1*b1
  979. vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L # ; L = a0*b0
  980. vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1 # ; M1 = a1*b0
  981. vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2 # ; M2 = a0*b1
  982. ___
  983. }
  984. # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
  985. my $blocks_left = ($NUM_BLOCKS % 4);
  986. if ($blocks_left > 0) {
  987. # ;; =====================================================
  988. # ;; There are 1, 2 or 3 blocks left to process.
  989. # ;; It may also be that they are the only blocks to process.
  990. # ;; Set hash key and register index position for the remaining 1 to 3 blocks
  991. my $reg_idx = ($NUM_BLOCKS / 4);
  992. my $REG_IN = $CIPHER_IN[$reg_idx];
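# ;; e.g. for $NUM_BLOCKS = 7: blocks 0-3 were already multiplied out of
# ;; $CIPHER_IN[0] above, $blocks_left = 3 and $reg_idx = 1, so the three
# ;; trailing blocks come from $CIPHER_IN[1] (Perl truncates the fractional
# ;; array index, 7/4 -> 1)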
  993. if ($blocks_left == 1) {
  994. $code .= <<___;
  995. vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
  996. vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]} # ; M1 = a1*b0
  997. vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]} # ; M2 = a0*b1
  998. vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]} # ; H = a1*b1
  999. vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]} # ; L = a0*b0
  1000. ___
  1001. } elsif ($blocks_left == 2) {
  1002. $code .= <<___;
  1003. vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
  1004. vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]} # ; M1 = a1*b0
  1005. vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]} # ; M2 = a0*b1
  1006. vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]} # ; H = a1*b1
  1007. vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]} # ; L = a0*b0
  1008. ___
  1009. } else { # ; blocks_left == 3
  1010. $code .= <<___;
  1011. vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
  1012. vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
  1013. vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1 # ; M1 = a1*b0
  1014. vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2 # ; M2 = a0*b1
  1015. vpclmulqdq \$0x11,$HK,$REG_IN,$T0H # ; H = a1*b1
  1016. vpclmulqdq \$0x00,$HK,$REG_IN,$T0L # ; L = a0*b0
  1017. ___
  1018. }
  1019. if (scalar(@_) == 20) {
  1020. # ;; *** GH/GM/GL passed as arguments
  1021. if ($NUM_BLOCKS >= 4) {
  1022. $code .= <<___;
  1023. # ;; add ghash product sums from the first 4, 8 or 12 blocks
  1024. vpxorq $T1M1,$T0M1,$T0M1
  1025. vpternlogq \$0x96,$T1M2,$GM,$T0M2
  1026. vpternlogq \$0x96,$T1H,$GH,$T0H
  1027. vpternlogq \$0x96,$T1L,$GL,$T0L
  1028. ___
  1029. } else {
  1030. $code .= <<___;
  1031. vpxorq $GM,$T0M1,$T0M1
  1032. vpxorq $GH,$T0H,$T0H
  1033. vpxorq $GL,$T0L,$T0L
  1034. ___
  1035. }
  1036. } else {
  1037. # ;; *** GH/GM/GL NOT passed as arguments
  1038. if ($NUM_BLOCKS >= 4) {
  1039. $code .= <<___;
  1040. # ;; add ghash product sums from the first 4, 8 or 12 blocks
  1041. vpxorq $T1M1,$T0M1,$T0M1
  1042. vpxorq $T1M2,$T0M2,$T0M2
  1043. vpxorq $T1H,$T0H,$T0H
  1044. vpxorq $T1L,$T0L,$T0L
  1045. ___
  1046. }
  1047. }
  1048. $code .= <<___;
  1049. # ;; integrate TM into TH and TL
  1050. vpxorq $T0M2,$T0M1,$T0M1
  1051. vpsrldq \$8,$T0M1,$T1M1
  1052. vpslldq \$8,$T0M1,$T1M2
  1053. vpxorq $T1M1,$T0H,$T0H
  1054. vpxorq $T1M2,$T0L,$T0L
  1055. ___
  1056. } else {
  1057. # ;; =====================================================
  1058. # ;; number of blocks is 4, 8, 12 or 16
# ;; T1H/L/M1/M2 hold the product sums (not T0H/L/M1/M2)
  1060. if (scalar(@_) == 20) {
  1061. $code .= <<___;
  1062. # ;; *** GH/GM/GL passed as arguments
  1063. vpxorq $GM,$T1M1,$T1M1
  1064. vpxorq $GH,$T1H,$T1H
  1065. vpxorq $GL,$T1L,$T1L
  1066. ___
  1067. }
  1068. $code .= <<___;
  1069. # ;; integrate TM into TH and TL
  1070. vpxorq $T1M2,$T1M1,$T1M1
  1071. vpsrldq \$8,$T1M1,$T0M1
  1072. vpslldq \$8,$T1M1,$T0M2
  1073. vpxorq $T0M1,$T1H,$T0H
  1074. vpxorq $T0M2,$T1L,$T0L
  1075. ___
  1076. }
  1077. # ;; add TH and TL 128-bit words horizontally
  1078. &VHPXORI4x128($T0H, $T1M1);
  1079. &VHPXORI4x128($T0L, $T1M2);
  1080. # ;; reduction
  1081. $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n";
  1082. &VCLMUL_REDUCE(
  1083. @{[XWORD($GHASH)]},
  1084. @{[XWORD($HK)]},
  1085. @{[XWORD($T0H)]},
  1086. @{[XWORD($T0L)]},
  1087. @{[XWORD($T0M1)]},
  1088. @{[XWORD($T0M2)]});
  1089. }
  1090. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 + x^121 + 1)
# ;; Input: A and B (128-bits each, bit-reflected)
# ;; Output: C = A*B*x mod poly, (i.e. >>1)
# ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input.
# ;; Then GH = GH * HK * x mod poly, which is equivalent to GH*HashKey mod poly.
# ;;
# ;; Refer to [3] for more details.
  1098. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1099. sub GHASH_MUL {
  1100. my $GH = $_[0]; #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
  1101. my $HK = $_[1]; #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
  1102. my $T1 = $_[2]; #; [clobbered] xmm/ymm/zmm
  1103. my $T2 = $_[3]; #; [clobbered] xmm/ymm/zmm
  1104. my $T3 = $_[4]; #; [clobbered] xmm/ymm/zmm
  1105. $code .= <<___;
  1106. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1107. vpclmulqdq \$0x11,$HK,$GH,$T1 # ; $T1 = a1*b1
  1108. vpclmulqdq \$0x00,$HK,$GH,$T2 # ; $T2 = a0*b0
  1109. vpclmulqdq \$0x01,$HK,$GH,$T3 # ; $T3 = a1*b0
  1110. vpclmulqdq \$0x10,$HK,$GH,$GH # ; $GH = a0*b1
  1111. vpxorq $T3,$GH,$GH
  1112. vpsrldq \$8,$GH,$T3 # ; shift-R $GH 2 DWs
  1113. vpslldq \$8,$GH,$GH # ; shift-L $GH 2 DWs
  1114. vpxorq $T3,$T1,$T1
  1115. vpxorq $T2,$GH,$GH
  1116. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1117. # ;first phase of the reduction
  1118. vmovdqu64 POLY2(%rip),$T3
  1119. vpclmulqdq \$0x01,$GH,$T3,$T2
  1120. vpslldq \$8,$T2,$T2 # ; shift-L $T2 2 DWs
  1121. vpxorq $T2,$GH,$GH # ; first phase of the reduction complete
  1122. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1123. # ;second phase of the reduction
  1124. vpclmulqdq \$0x00,$GH,$T3,$T2
  1125. vpsrldq \$4,$T2,$T2 # ; shift-R only 1-DW to obtain 2-DWs shift-R
  1126. vpclmulqdq \$0x10,$GH,$T3,$GH
  1127. vpslldq \$4,$GH,$GH # ; Shift-L 1-DW to obtain result with no shifts
  1128. # ; second phase of the reduction complete, the result is in $GH
  1129. vpternlogq \$0x96,$T2,$T1,$GH # ; GH = GH xor T1 xor T2
  1130. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1131. ___
  1132. }
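# ;; ---------------------------------------------------------------------------
# ;; Reference sketch (added for illustration; the sub name is ours and nothing
# ;; in the generator calls it): GF(2^128) multiplication in the plain polynomial
# ;; representation, useful for cross-checking the carry-less multiply and
# ;; reduction algebra used by GHASH_MUL above. Note the assembly operates on
# ;; bit-reflected operands with the POLY2 constant and folds in the extra *x
# ;; factor, so byte/bit ordering differs from this model; only the underlying
# ;; field arithmetic is the same.
sub GF128_MUL_REFERENCE {
  use Math::BigInt;
  my ($a, $b) = @_;                      # Math::BigInt inputs; bit i = coefficient of x^i
  my $r = Math::BigInt->new(0);
  # carry-less multiply: r = a * b over GF(2)[x], degree up to 254
  for my $i (0 .. 127) {
    $r->bxor($a->copy()->blsft($i)) if $b->copy()->brsft($i)->is_odd();
  }
  # reduce modulo x^128 + x^7 + x^2 + x + 1 (low part 0x87)
  for my $i (reverse(128 .. 254)) {
    next unless $r->copy()->brsft($i)->is_odd();
    $r->bxor(Math::BigInt->new(1)->blsft($i));            # clear the x^i term
    $r->bxor(Math::BigInt->new(0x87)->blsft($i - 128));   # fold it back below x^128
  }
  return $r;                             # result fits in bits 0..127
}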
  1133. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; PRECOMPUTE computes HashKey^i (i = 2 to 16) and stores them in the context
  1135. sub PRECOMPUTE {
  1136. my $GCM128_CTX = $_[0]; #; [in/out] context pointer, hkeys content updated
  1137. my $HK = $_[1]; #; [in] xmm, hash key
  1138. my $T1 = $_[2]; #; [clobbered] xmm
  1139. my $T2 = $_[3]; #; [clobbered] xmm
  1140. my $T3 = $_[4]; #; [clobbered] xmm
  1141. my $T4 = $_[5]; #; [clobbered] xmm
  1142. my $T5 = $_[6]; #; [clobbered] xmm
  1143. my $T6 = $_[7]; #; [clobbered] xmm
  1144. my $ZT1 = &ZWORD($T1);
  1145. my $ZT2 = &ZWORD($T2);
  1146. my $ZT3 = &ZWORD($T3);
  1147. my $ZT4 = &ZWORD($T4);
  1148. my $ZT5 = &ZWORD($T5);
  1149. my $ZT6 = &ZWORD($T6);
  1150. my $YT1 = &YWORD($T1);
  1151. my $YT2 = &YWORD($T2);
  1152. my $YT3 = &YWORD($T3);
  1153. my $YT4 = &YWORD($T4);
  1154. my $YT5 = &YWORD($T5);
  1155. my $YT6 = &YWORD($T6);
  1156. $code .= <<___;
  1157. vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
  1158. vmovdqa $YT5,$YT4
  1159. ___
  1160. # ;; calculate HashKey^2<<1 mod poly
  1161. &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);
  1162. $code .= <<___;
  1163. vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
  1164. vinserti64x2 \$1,$HK,$YT4,$YT5
  1165. vmovdqa64 $YT5,$YT6 # ;; YT6 = HashKey | HashKey^2
  1166. ___
  1167. # ;; use 2x128-bit computation
  1168. # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
  1169. &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3); # ;; YT5 = HashKey^3 | HashKey^4
  1170. $code .= <<___;
  1171. vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}
  1172. vinserti64x4 \$1,$YT6,$ZT5,$ZT5 # ;; ZT5 = YT6 | YT5
  1173. # ;; switch to 4x128-bit computations now
  1174. vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^4 across all ZT4
  1175. vmovdqa64 $ZT5,$ZT6 # ;; save HashKey^4 to HashKey^1 in ZT6
  1176. ___
  1177. # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
  1178. &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  1179. $code .= <<___;
  1180. vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]} # ;; HashKey^8 to HashKey^5 in ZT5 now
  1181. vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^8 across all ZT4
  1182. ___
  1183. # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
  1184. # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution
  1185. # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
  1186. &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
  1187. $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";
  1188. # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
  1189. &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
  1190. $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";
# ; Hkeys 17..48 will be precomputed elsewhere, as the context can hold only 16 hkeys
  1192. }
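# ;; Note on the schedule above: each GHASH_MUL call multiplies previously
# ;; computed powers by a broadcast of the highest power available:
# ;; H*H gives H^2; {H^2, H} * H^2 gives H^4 and H^3; {H^4..H^1} * H^4 gives
# ;; H^8..H^5; then {H^4..H^1} and {H^8..H^5} are each multiplied by the
# ;; broadcast H^8 to give H^12..H^9 and H^16..H^13. The last two multiplies
# ;; have no dependency on each other, which is what allows the deeper
# ;; out-of-order execution mentioned above.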
  1193. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1194. # ;; READ_SMALL_DATA_INPUT
# ;; Packs an xmm register with data when the data input is less than or equal to 16 bytes
  1196. # ;; Returns 0 if data has length 0
  1197. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1198. sub READ_SMALL_DATA_INPUT {
  1199. my $OUTPUT = $_[0]; # [out] xmm register
  1200. my $INPUT = $_[1]; # [in] buffer pointer to read from
  1201. my $LENGTH = $_[2]; # [in] number of bytes to read
  1202. my $TMP1 = $_[3]; # [clobbered]
  1203. my $TMP2 = $_[4]; # [clobbered]
  1204. my $MASK = $_[5]; # [out] k1 to k7 register to store the partial block mask
  1205. $code .= <<___;
  1206. mov \$16,@{[DWORD($TMP2)]}
  1207. lea byte_len_to_mask_table(%rip),$TMP1
  1208. cmp $TMP2,$LENGTH
  1209. cmovc $LENGTH,$TMP2
  1210. ___
  1211. if ($win64) {
  1212. $code .= <<___;
  1213. add $TMP2,$TMP1
  1214. add $TMP2,$TMP1
  1215. kmovw ($TMP1),$MASK
  1216. ___
  1217. } else {
  1218. $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n";
  1219. }
  1220. $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n";
  1221. }
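# ;; Illustrative sketch (the sub below is ours and is never called): the kmovw
# ;; lookup above assumes byte_len_to_mask_table is an array of 16-bit words
# ;; where entry n equals ((1 << n) - 1), i.e. a load mask selecting the first
# ;; n bytes, for n = 0..16. With $TMP2 capped at 16, "table + 2*len" addresses
# ;; that entry directly; the Win64 variant forms the same address with two adds
# ;; instead of a scaled-index operand.
sub BYTE_LEN_TO_MASK_TABLE_REFERENCE {
  return map { (1 << $_) - 1 } 0 .. 16;    # 0x0000, 0x0001, 0x0003, ..., 0xffff
}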
  1222. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1223. # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# Input: The input data (A_IN), that data's length (A_LEN), and the context holding the hash keys (GCM128_CTX).
  1225. # Output: The hash of the data (AAD_HASH).
  1226. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1227. sub CALC_AAD_HASH {
  1228. my $A_IN = $_[0]; # [in] AAD text pointer
  1229. my $A_LEN = $_[1]; # [in] AAD length
  1230. my $AAD_HASH = $_[2]; # [in/out] xmm ghash value
  1231. my $GCM128_CTX = $_[3]; # [in] pointer to context
  1232. my $ZT0 = $_[4]; # [clobbered] ZMM register
  1233. my $ZT1 = $_[5]; # [clobbered] ZMM register
  1234. my $ZT2 = $_[6]; # [clobbered] ZMM register
  1235. my $ZT3 = $_[7]; # [clobbered] ZMM register
  1236. my $ZT4 = $_[8]; # [clobbered] ZMM register
  1237. my $ZT5 = $_[9]; # [clobbered] ZMM register
  1238. my $ZT6 = $_[10]; # [clobbered] ZMM register
  1239. my $ZT7 = $_[11]; # [clobbered] ZMM register
  1240. my $ZT8 = $_[12]; # [clobbered] ZMM register
  1241. my $ZT9 = $_[13]; # [clobbered] ZMM register
  1242. my $ZT10 = $_[14]; # [clobbered] ZMM register
  1243. my $ZT11 = $_[15]; # [clobbered] ZMM register
  1244. my $ZT12 = $_[16]; # [clobbered] ZMM register
  1245. my $ZT13 = $_[17]; # [clobbered] ZMM register
  1246. my $ZT14 = $_[18]; # [clobbered] ZMM register
  1247. my $ZT15 = $_[19]; # [clobbered] ZMM register
  1248. my $ZT16 = $_[20]; # [clobbered] ZMM register
  1249. my $T1 = $_[21]; # [clobbered] GP register
  1250. my $T2 = $_[22]; # [clobbered] GP register
  1251. my $T3 = $_[23]; # [clobbered] GP register
  1252. my $MASKREG = $_[24]; # [clobbered] mask register
  1253. my $HKEYS_READY = "%rbx";
  1254. my $SHFMSK = $ZT13;
  1255. my $rndsuffix = &random_string();
  1256. $code .= <<___;
  1257. mov $A_IN,$T1 # ; T1 = AAD
  1258. mov $A_LEN,$T2 # ; T2 = aadLen
  1259. or $T2,$T2
  1260. jz .L_CALC_AAD_done_${rndsuffix}
  1261. xor $HKEYS_READY,$HKEYS_READY
  1262. vmovdqa64 SHUF_MASK(%rip),$SHFMSK
  1263. .L_get_AAD_loop48x16_${rndsuffix}:
  1264. cmp \$`(48*16)`,$T2
  1265. jl .L_exit_AAD_loop48x16_${rndsuffix}
  1266. ___
  1267. $code .= <<___;
  1268. vmovdqu64 `64*0`($T1),$ZT1 # ; Blocks 0-3
  1269. vmovdqu64 `64*1`($T1),$ZT2 # ; Blocks 4-7
  1270. vmovdqu64 `64*2`($T1),$ZT3 # ; Blocks 8-11
  1271. vmovdqu64 `64*3`($T1),$ZT4 # ; Blocks 12-15
  1272. vpshufb $SHFMSK,$ZT1,$ZT1
  1273. vpshufb $SHFMSK,$ZT2,$ZT2
  1274. vpshufb $SHFMSK,$ZT3,$ZT3
  1275. vpshufb $SHFMSK,$ZT4,$ZT4
  1276. ___
  1277. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
  1278. $code .= "mov \$1,$HKEYS_READY\n";
  1279. &GHASH_16(
  1280. "start", $ZT5, $ZT6, $ZT7,
  1281. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1282. &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
  1283. $ZT8, $ZT9, $ZT10, $ZT11,
  1284. $ZT12, $ZT14, $ZT15, $ZT16,
  1285. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1286. $ZT4);
  1287. $code .= <<___;
  1288. vmovdqu64 `16*16 + 64*0`($T1),$ZT1 # ; Blocks 16-19
  1289. vmovdqu64 `16*16 + 64*1`($T1),$ZT2 # ; Blocks 20-23
  1290. vmovdqu64 `16*16 + 64*2`($T1),$ZT3 # ; Blocks 24-27
  1291. vmovdqu64 `16*16 + 64*3`($T1),$ZT4 # ; Blocks 28-31
  1292. vpshufb $SHFMSK,$ZT1,$ZT1
  1293. vpshufb $SHFMSK,$ZT2,$ZT2
  1294. vpshufb $SHFMSK,$ZT3,$ZT3
  1295. vpshufb $SHFMSK,$ZT4,$ZT4
  1296. ___
  1297. &GHASH_16(
  1298. "mid", $ZT5, $ZT6, $ZT7,
  1299. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1300. &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
  1301. $ZT8, $ZT9, $ZT10, $ZT11,
  1302. $ZT12, $ZT14, $ZT15, $ZT16,
  1303. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1304. $ZT4);
  1305. $code .= <<___;
  1306. vmovdqu64 `32*16 + 64*0`($T1),$ZT1 # ; Blocks 32-35
  1307. vmovdqu64 `32*16 + 64*1`($T1),$ZT2 # ; Blocks 36-39
  1308. vmovdqu64 `32*16 + 64*2`($T1),$ZT3 # ; Blocks 40-43
  1309. vmovdqu64 `32*16 + 64*3`($T1),$ZT4 # ; Blocks 44-47
  1310. vpshufb $SHFMSK,$ZT1,$ZT1
  1311. vpshufb $SHFMSK,$ZT2,$ZT2
  1312. vpshufb $SHFMSK,$ZT3,$ZT3
  1313. vpshufb $SHFMSK,$ZT4,$ZT4
  1314. ___
  1315. &GHASH_16(
  1316. "end_reduce", $ZT5, $ZT6, $ZT7,
  1317. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1318. &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
  1319. $ZT8, $ZT9, $ZT10, $ZT11,
  1320. $ZT12, $ZT14, $ZT15, $ZT16,
  1321. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1322. $ZT4);
  1323. $code .= <<___;
  1324. sub \$`(48*16)`,$T2
  1325. je .L_CALC_AAD_done_${rndsuffix}
  1326. add \$`(48*16)`,$T1
  1327. jmp .L_get_AAD_loop48x16_${rndsuffix}
  1328. .L_exit_AAD_loop48x16_${rndsuffix}:
  1329. # ; Less than 48x16 bytes remaining
  1330. cmp \$`(32*16)`,$T2
  1331. jl .L_less_than_32x16_${rndsuffix}
  1332. ___
  1333. $code .= <<___;
  1334. # ; Get next 16 blocks
  1335. vmovdqu64 `64*0`($T1),$ZT1
  1336. vmovdqu64 `64*1`($T1),$ZT2
  1337. vmovdqu64 `64*2`($T1),$ZT3
  1338. vmovdqu64 `64*3`($T1),$ZT4
  1339. vpshufb $SHFMSK,$ZT1,$ZT1
  1340. vpshufb $SHFMSK,$ZT2,$ZT2
  1341. vpshufb $SHFMSK,$ZT3,$ZT3
  1342. vpshufb $SHFMSK,$ZT4,$ZT4
  1343. ___
  1344. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
  1345. $code .= "mov \$1,$HKEYS_READY\n";
  1346. &GHASH_16(
  1347. "start", $ZT5, $ZT6, $ZT7,
  1348. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1349. &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
  1350. $ZT8, $ZT9, $ZT10, $ZT11,
  1351. $ZT12, $ZT14, $ZT15, $ZT16,
  1352. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1353. $ZT4);
  1354. $code .= <<___;
  1355. vmovdqu64 `16*16 + 64*0`($T1),$ZT1
  1356. vmovdqu64 `16*16 + 64*1`($T1),$ZT2
  1357. vmovdqu64 `16*16 + 64*2`($T1),$ZT3
  1358. vmovdqu64 `16*16 + 64*3`($T1),$ZT4
  1359. vpshufb $SHFMSK,$ZT1,$ZT1
  1360. vpshufb $SHFMSK,$ZT2,$ZT2
  1361. vpshufb $SHFMSK,$ZT3,$ZT3
  1362. vpshufb $SHFMSK,$ZT4,$ZT4
  1363. ___
  1364. &GHASH_16(
  1365. "end_reduce", $ZT5, $ZT6, $ZT7,
  1366. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
  1367. &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
  1368. $ZT8, $ZT9, $ZT10, $ZT11,
  1369. $ZT12, $ZT14, $ZT15, $ZT16,
  1370. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1371. $ZT4);
  1372. $code .= <<___;
  1373. sub \$`(32*16)`,$T2
  1374. je .L_CALC_AAD_done_${rndsuffix}
  1375. add \$`(32*16)`,$T1
  1376. jmp .L_less_than_16x16_${rndsuffix}
  1377. .L_less_than_32x16_${rndsuffix}:
  1378. cmp \$`(16*16)`,$T2
  1379. jl .L_less_than_16x16_${rndsuffix}
  1380. # ; Get next 16 blocks
  1381. vmovdqu64 `64*0`($T1),$ZT1
  1382. vmovdqu64 `64*1`($T1),$ZT2
  1383. vmovdqu64 `64*2`($T1),$ZT3
  1384. vmovdqu64 `64*3`($T1),$ZT4
  1385. vpshufb $SHFMSK,$ZT1,$ZT1
  1386. vpshufb $SHFMSK,$ZT2,$ZT2
  1387. vpshufb $SHFMSK,$ZT3,$ZT3
  1388. vpshufb $SHFMSK,$ZT4,$ZT4
  1389. ___
  1390. # ; This code path does not use more than 16 hkeys, so they can be taken from the context
  1391. # ; (not from the stack storage)
  1392. &GHASH_16(
  1393. "start_reduce", $ZT5, $ZT6, $ZT7,
  1394. "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
  1395. &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
  1396. $ZT8, $ZT9, $ZT10, $ZT11,
  1397. $ZT12, $ZT14, $ZT15, $ZT16,
  1398. "NO_ZMM", $ZT1, $ZT2, $ZT3,
  1399. $ZT4);
  1400. $code .= <<___;
  1401. sub \$`(16*16)`,$T2
  1402. je .L_CALC_AAD_done_${rndsuffix}
  1403. add \$`(16*16)`,$T1
  1404. # ; Less than 16x16 bytes remaining
  1405. .L_less_than_16x16_${rndsuffix}:
  1406. # ;; prep mask source address
  1407. lea byte64_len_to_mask_table(%rip),$T3
  1408. lea ($T3,$T2,8),$T3
  1409. # ;; calculate number of blocks to ghash (including partial bytes)
  1410. add \$15,@{[DWORD($T2)]}
  1411. shr \$4,@{[DWORD($T2)]}
  1412. cmp \$2,@{[DWORD($T2)]}
  1413. jb .L_AAD_blocks_1_${rndsuffix}
  1414. je .L_AAD_blocks_2_${rndsuffix}
  1415. cmp \$4,@{[DWORD($T2)]}
  1416. jb .L_AAD_blocks_3_${rndsuffix}
  1417. je .L_AAD_blocks_4_${rndsuffix}
  1418. cmp \$6,@{[DWORD($T2)]}
  1419. jb .L_AAD_blocks_5_${rndsuffix}
  1420. je .L_AAD_blocks_6_${rndsuffix}
  1421. cmp \$8,@{[DWORD($T2)]}
  1422. jb .L_AAD_blocks_7_${rndsuffix}
  1423. je .L_AAD_blocks_8_${rndsuffix}
  1424. cmp \$10,@{[DWORD($T2)]}
  1425. jb .L_AAD_blocks_9_${rndsuffix}
  1426. je .L_AAD_blocks_10_${rndsuffix}
  1427. cmp \$12,@{[DWORD($T2)]}
  1428. jb .L_AAD_blocks_11_${rndsuffix}
  1429. je .L_AAD_blocks_12_${rndsuffix}
  1430. cmp \$14,@{[DWORD($T2)]}
  1431. jb .L_AAD_blocks_13_${rndsuffix}
  1432. je .L_AAD_blocks_14_${rndsuffix}
  1433. cmp \$15,@{[DWORD($T2)]}
  1434. je .L_AAD_blocks_15_${rndsuffix}
  1435. ___
  1436. # ;; fall through for 16 blocks
  1437. # ;; The flow of each of these cases is identical:
# ;; - load plain text blocks
# ;; - shuffle loaded blocks
# ;; - xor the current hash value into block 0
# ;; - perform multiplications with the ghash keys
  1442. # ;; - jump to reduction code
  1443. for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
  1444. $code .= ".L_AAD_blocks_${aad_blocks}_${rndsuffix}:\n";
  1445. if ($aad_blocks > 12) {
  1446. $code .= "sub \$`12*16*8`, $T3\n";
  1447. } elsif ($aad_blocks > 8) {
  1448. $code .= "sub \$`8*16*8`, $T3\n";
  1449. } elsif ($aad_blocks > 4) {
  1450. $code .= "sub \$`4*16*8`, $T3\n";
  1451. }
  1452. $code .= "kmovq ($T3),$MASKREG\n";
  1453. &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);
  1454. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
  1455. $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  1456. &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
  1457. $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);
  1458. if ($aad_blocks > 1) {
  1459. # ;; fall through to CALC_AAD_done in 1 block case
  1460. $code .= "jmp .L_CALC_AAD_done_${rndsuffix}\n";
  1461. }
  1462. }
  1463. $code .= ".L_CALC_AAD_done_${rndsuffix}:\n";
  1464. # ;; result in AAD_HASH
  1465. }
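# ;; Flow summary for CALC_AAD_HASH above: AAD is consumed in 48-block (768-byte)
# ;; chunks first, using hash keys prepared on the stack frame, then at most one
# ;; more 32-block or 16-block chunk (the latter using keys straight from the
# ;; context), and finally a 1 to 16 block tail dispatched through the jump
# ;; table. The tail block count comes from "add \$15; shr \$4", i.e.
# ;; ceil(len/16); e.g. 70 remaining bytes give (70 + 15) >> 4 = 5 blocks, the
# ;; fifth being partial.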
  1466. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1467. # ;; PARTIAL_BLOCK
  1468. # ;; Handles encryption/decryption and the tag partial blocks between
  1469. # ;; update calls.
  1470. # ;; Requires the input data be at least 1 byte long.
  1471. # ;; Output:
  1472. # ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
  1473. # ;; AAD_HASH and updated GCM128_CTX
  1474. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1475. sub PARTIAL_BLOCK {
  1476. my $GCM128_CTX = $_[0]; # [in] key pointer
  1477. my $PBLOCK_LEN = $_[1]; # [in] partial block length
  1478. my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
  1479. my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
  1480. my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
  1481. my $DATA_OFFSET = $_[5]; # [out] data offset (gets set)
  1482. my $AAD_HASH = $_[6]; # [out] updated GHASH value
  1483. my $ENC_DEC = $_[7]; # [in] cipher direction
  1484. my $GPTMP0 = $_[8]; # [clobbered] GP temporary register
  1485. my $GPTMP1 = $_[9]; # [clobbered] GP temporary register
  1486. my $GPTMP2 = $_[10]; # [clobbered] GP temporary register
  1487. my $ZTMP0 = $_[11]; # [clobbered] ZMM temporary register
  1488. my $ZTMP1 = $_[12]; # [clobbered] ZMM temporary register
  1489. my $ZTMP2 = $_[13]; # [clobbered] ZMM temporary register
  1490. my $ZTMP3 = $_[14]; # [clobbered] ZMM temporary register
  1491. my $ZTMP4 = $_[15]; # [clobbered] ZMM temporary register
  1492. my $ZTMP5 = $_[16]; # [clobbered] ZMM temporary register
  1493. my $ZTMP6 = $_[17]; # [clobbered] ZMM temporary register
  1494. my $ZTMP7 = $_[18]; # [clobbered] ZMM temporary register
  1495. my $MASKREG = $_[19]; # [clobbered] mask temporary register
  1496. my $XTMP0 = &XWORD($ZTMP0);
  1497. my $XTMP1 = &XWORD($ZTMP1);
  1498. my $XTMP2 = &XWORD($ZTMP2);
  1499. my $XTMP3 = &XWORD($ZTMP3);
  1500. my $XTMP4 = &XWORD($ZTMP4);
  1501. my $XTMP5 = &XWORD($ZTMP5);
  1502. my $XTMP6 = &XWORD($ZTMP6);
  1503. my $XTMP7 = &XWORD($ZTMP7);
  1504. my $LENGTH = $DATA_OFFSET;
  1505. my $IA0 = $GPTMP1;
  1506. my $IA1 = $GPTMP2;
  1507. my $IA2 = $GPTMP0;
  1508. my $rndsuffix = &random_string();
  1509. $code .= <<___;
  1510. # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero
  1511. mov ($PBLOCK_LEN),$LENGTH
  1512. or $LENGTH,$LENGTH
  1513. je .L_partial_block_done_${rndsuffix} # ;Leave Macro if no partial blocks
  1514. ___
  1515. &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);
  1516. $code .= <<___;
  1517. # ;; XTMP1 = my_ctx_data.partial_block_enc_key
  1518. vmovdqu64 $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1
  1519. vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2
  1520. # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes
# ;; ((16 - $LENGTH) is the number of bytes in plaintext mod 16)
  1522. lea SHIFT_MASK(%rip),$IA0
  1523. add $LENGTH,$IA0
  1524. vmovdqu64 ($IA0),$XTMP3 # ; shift right shuffle mask
  1525. vpshufb $XTMP3,$XTMP1,$XTMP1
  1526. ___
  1527. if ($ENC_DEC eq "DEC") {
  1528. $code .= <<___;
  1529. # ;; keep copy of cipher text in $XTMP4
  1530. vmovdqa64 $XTMP0,$XTMP4
  1531. ___
  1532. }
  1533. $code .= <<___;
  1534. vpxorq $XTMP0,$XTMP1,$XTMP1 # ; Ciphertext XOR E(K, Yn)
  1535. # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block
  1536. # ;; Determine if partial block is not being filled and shift mask accordingly
  1537. ___
  1538. if ($win64) {
  1539. $code .= <<___;
  1540. mov $PLAIN_CIPH_LEN,$IA1
  1541. add $LENGTH,$IA1
  1542. ___
  1543. } else {
  1544. $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n";
  1545. }
  1546. $code .= <<___;
  1547. sub \$16,$IA1
  1548. jge .L_no_extra_mask_${rndsuffix}
  1549. sub $IA1,$IA0
  1550. .L_no_extra_mask_${rndsuffix}:
  1551. # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1
  1552. # ;; - mask out bottom $LENGTH bytes of $XTMP1
  1553. # ;; sizeof(SHIFT_MASK) == 16 bytes
  1554. vmovdqu64 16($IA0),$XTMP0
  1555. vpand $XTMP0,$XTMP1,$XTMP1
  1556. ___
  1557. if ($ENC_DEC eq "DEC") {
  1558. $code .= <<___;
  1559. vpand $XTMP0,$XTMP4,$XTMP4
  1560. vpshufb SHUF_MASK(%rip),$XTMP4,$XTMP4
  1561. vpshufb $XTMP3,$XTMP4,$XTMP4
  1562. vpxorq $XTMP4,$AAD_HASH,$AAD_HASH
  1563. ___
  1564. } else {
  1565. $code .= <<___;
  1566. vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1
  1567. vpshufb $XTMP3,$XTMP1,$XTMP1
  1568. vpxorq $XTMP1,$AAD_HASH,$AAD_HASH
  1569. ___
  1570. }
  1571. $code .= <<___;
  1572. cmp \$0,$IA1
  1573. jl .L_partial_incomplete_${rndsuffix}
  1574. ___
  1575. # ;; GHASH computation for the last <16 Byte block
  1576. &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);
  1577. $code .= <<___;
  1578. movq \$0, ($PBLOCK_LEN)
  1579. # ;; Set $LENGTH to be the number of bytes to write out
  1580. mov $LENGTH,$IA0
  1581. mov \$16,$LENGTH
  1582. sub $IA0,$LENGTH
  1583. jmp .L_enc_dec_done_${rndsuffix}
  1584. .L_partial_incomplete_${rndsuffix}:
  1585. ___
  1586. if ($win64) {
  1587. $code .= <<___;
  1588. mov $PLAIN_CIPH_LEN,$IA0
  1589. add $IA0,($PBLOCK_LEN)
  1590. ___
  1591. } else {
  1592. $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n";
  1593. }
  1594. $code .= <<___;
  1595. mov $PLAIN_CIPH_LEN,$LENGTH
  1596. .L_enc_dec_done_${rndsuffix}:
  1597. # ;; output encrypted Bytes
  1598. lea byte_len_to_mask_table(%rip),$IA0
  1599. kmovw ($IA0,$LENGTH,2),$MASKREG
  1600. vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)
  1601. ___
  1602. if ($ENC_DEC eq "ENC") {
  1603. $code .= <<___;
  1604. # ;; shuffle XTMP1 back to output as ciphertext
  1605. vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1
  1606. vpshufb $XTMP3,$XTMP1,$XTMP1
  1607. ___
  1608. }
  1609. $code .= <<___;
  1610. mov $CIPH_PLAIN_OUT,$IA0
  1611. vmovdqu8 $XTMP1,($IA0){$MASKREG}
  1612. .L_partial_block_done_${rndsuffix}:
  1613. ___
  1614. }
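# ;; Worked example for PARTIAL_BLOCK above (values assumed for illustration):
# ;; a previous update left 5 buffered bytes (PBlockLen = 5) and the current
# ;; call supplies 20 bytes. READ_SMALL_DATA_INPUT reads 16 of them,
# ;; $IA1 = 20 + 5 - 16 = 9 >= 0 so no extra masking is applied, the now-full
# ;; block is ghashed, PBlockLen is reset to 0 and 16 - 5 = 11 bytes of output
# ;; are written. With only 7 input bytes instead, $IA1 = 7 + 5 - 16 = -4 < 0,
# ;; the mask is adjusted, PBlockLen grows to 12 and all 7 bytes are written.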
  1615. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1616. # ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation
  1617. sub INITIAL_BLOCKS_PARTIAL_CIPHER {
  1618. my $AES_KEYS = $_[0]; # [in] key pointer
  1619. my $GCM128_CTX = $_[1]; # [in] context pointer
  1620. my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer
  1621. my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer
  1622. my $LENGTH = $_[4]; # [in/clobbered] length in bytes
  1623. my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated)
  1624. my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  1625. my $CTR = $_[7]; # [in/out] current counter value
  1626. my $ENC_DEC = $_[8]; # [in] cipher direction (ENC/DEC)
  1627. my $DAT0 = $_[9]; # [out] ZMM with cipher text shuffled for GHASH
  1628. my $DAT1 = $_[10]; # [out] ZMM with cipher text shuffled for GHASH
  1629. my $DAT2 = $_[11]; # [out] ZMM with cipher text shuffled for GHASH
  1630. my $DAT3 = $_[12]; # [out] ZMM with cipher text shuffled for GHASH
  1631. my $LAST_CIPHER_BLK = $_[13]; # [out] XMM to put ciphered counter block partially xor'ed with text
  1632. my $LAST_GHASH_BLK = $_[14]; # [out] XMM to put last cipher text block shuffled for GHASH
  1633. my $CTR0 = $_[15]; # [clobbered] ZMM temporary
  1634. my $CTR1 = $_[16]; # [clobbered] ZMM temporary
  1635. my $CTR2 = $_[17]; # [clobbered] ZMM temporary
  1636. my $CTR3 = $_[18]; # [clobbered] ZMM temporary
  1637. my $ZT1 = $_[19]; # [clobbered] ZMM temporary
  1638. my $IA0 = $_[20]; # [clobbered] GP temporary
  1639. my $IA1 = $_[21]; # [clobbered] GP temporary
  1640. my $MASKREG = $_[22]; # [clobbered] mask register
  1641. my $SHUFMASK = $_[23]; # [out] ZMM loaded with BE/LE shuffle mask
  1642. if ($NUM_BLOCKS == 1) {
  1643. $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n";
  1644. } elsif ($NUM_BLOCKS == 2) {
  1645. $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n";
  1646. } else {
  1647. $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n";
  1648. }
  1649. # ;; prepare AES counter blocks
  1650. if ($NUM_BLOCKS == 1) {
  1651. $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n";
  1652. } elsif ($NUM_BLOCKS == 2) {
  1653. $code .= <<___;
  1654. vshufi64x2 \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]}
  1655. vpaddd ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]}
  1656. ___
  1657. } else {
  1658. $code .= <<___;
  1659. vshufi64x2 \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]}
  1660. vpaddd ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0
  1661. ___
  1662. if ($NUM_BLOCKS > 4) {
  1663. $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n";
  1664. }
  1665. if ($NUM_BLOCKS > 8) {
  1666. $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n";
  1667. }
  1668. if ($NUM_BLOCKS > 12) {
  1669. $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n";
  1670. }
  1671. }
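# ;; Lane layout after the preparation above (assuming ddq_add_1234, ddq_add_5678
# ;; and ddq_add_8888 hold {1,2,3,4}, {5,6,7,8} and {8,8,8,8} per 128-bit lane):
# ;; $CTR0 = counter + 1..4, $CTR1 = counter + 5..8, $CTR2 = counter + 9..12 and
# ;; $CTR3 = counter + 13..16; only the registers needed for $NUM_BLOCKS are set.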
  1672. # ;; get load/store mask
  1673. $code .= <<___;
  1674. lea byte64_len_to_mask_table(%rip),$IA0
  1675. mov $LENGTH,$IA1
  1676. ___
  1677. if ($NUM_BLOCKS > 12) {
  1678. $code .= "sub \$`3*64`,$IA1\n";
  1679. } elsif ($NUM_BLOCKS > 8) {
  1680. $code .= "sub \$`2*64`,$IA1\n";
  1681. } elsif ($NUM_BLOCKS > 4) {
  1682. $code .= "sub \$`1*64`,$IA1\n";
  1683. }
  1684. $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
  1685. # ;; extract new counter value
  1686. # ;; shuffle the counters for AES rounds
  1687. if ($NUM_BLOCKS <= 4) {
  1688. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n";
  1689. } elsif ($NUM_BLOCKS <= 8) {
  1690. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n";
  1691. } elsif ($NUM_BLOCKS <= 12) {
  1692. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n";
  1693. } else {
  1694. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n";
  1695. }
  1696. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  1697. $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0,
  1698. $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  1699. # ;; load plain/cipher text
  1700. &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG);
  1701. # ;; AES rounds and XOR with plain/cipher text
  1702. foreach my $j (0 .. ($NROUNDS + 1)) {
  1703. $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n";
  1704. &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j,
  1705. $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS);
  1706. }
  1707. # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  1708. # ;; - this is needed for partial block cases
  1709. if ($NUM_BLOCKS <= 4) {
  1710. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n";
  1711. } elsif ($NUM_BLOCKS <= 8) {
  1712. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n";
  1713. } elsif ($NUM_BLOCKS <= 12) {
  1714. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n";
  1715. } else {
  1716. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n";
  1717. }
# ;; write cipher/plain text back to output
  1719. $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
  1720. &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG);
  1721. # ;; zero bytes outside the mask before hashing
  1722. if ($NUM_BLOCKS <= 4) {
  1723. $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n";
  1724. } elsif ($NUM_BLOCKS <= 8) {
  1725. $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n";
  1726. } elsif ($NUM_BLOCKS <= 12) {
  1727. $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n";
  1728. } else {
  1729. $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n";
  1730. }
# ;; Shuffle the cipher text blocks for the hashing part
# ;; DAT0-DAT3 are the expected outputs with blocks for hashing
if ($ENC_DEC eq "DEC") {
# ;; Decrypt case
# ;; - cipher blocks are already in DAT0-DAT3 (the loaded input)
  1736. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  1737. $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0,
  1738. $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  1739. } else {
  1740. # ;; Encrypt case
  1741. # ;; - cipher blocks are in CTR0-CTR3
  1742. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  1743. $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0,
  1744. $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
  1745. }
  1746. # ;; Extract the last block for partials and multi_call cases
  1747. if ($NUM_BLOCKS <= 4) {
  1748. $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n";
  1749. } elsif ($NUM_BLOCKS <= 8) {
  1750. $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n";
  1751. } elsif ($NUM_BLOCKS <= 12) {
  1752. $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n";
  1753. } else {
  1754. $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n";
  1755. }
  1756. }
  1757. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1758. # ;; Computes GHASH on 1 to 16 blocks
  1759. sub INITIAL_BLOCKS_PARTIAL_GHASH {
  1760. my $AES_KEYS = $_[0]; # [in] key pointer
  1761. my $GCM128_CTX = $_[1]; # [in] context pointer
  1762. my $LENGTH = $_[2]; # [in/clobbered] length in bytes
  1763. my $NUM_BLOCKS = $_[3]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  1764. my $HASH_IN_OUT = $_[4]; # [in/out] XMM ghash in/out value
  1765. my $ENC_DEC = $_[5]; # [in] cipher direction (ENC/DEC)
  1766. my $DAT0 = $_[6]; # [in] ZMM with cipher text shuffled for GHASH
  1767. my $DAT1 = $_[7]; # [in] ZMM with cipher text shuffled for GHASH
  1768. my $DAT2 = $_[8]; # [in] ZMM with cipher text shuffled for GHASH
  1769. my $DAT3 = $_[9]; # [in] ZMM with cipher text shuffled for GHASH
  1770. my $LAST_CIPHER_BLK = $_[10]; # [in] XMM with ciphered counter block partially xor'ed with text
  1771. my $LAST_GHASH_BLK = $_[11]; # [in] XMM with last cipher text block shuffled for GHASH
  1772. my $ZT0 = $_[12]; # [clobbered] ZMM temporary
  1773. my $ZT1 = $_[13]; # [clobbered] ZMM temporary
  1774. my $ZT2 = $_[14]; # [clobbered] ZMM temporary
  1775. my $ZT3 = $_[15]; # [clobbered] ZMM temporary
  1776. my $ZT4 = $_[16]; # [clobbered] ZMM temporary
  1777. my $ZT5 = $_[17]; # [clobbered] ZMM temporary
  1778. my $ZT6 = $_[18]; # [clobbered] ZMM temporary
  1779. my $ZT7 = $_[19]; # [clobbered] ZMM temporary
  1780. my $ZT8 = $_[20]; # [clobbered] ZMM temporary
  1781. my $PBLOCK_LEN = $_[21]; # [in] partial block length
  1782. my $GH = $_[22]; # [in] ZMM with hi product part
my $GM = $_[23]; # [in] ZMM with mid product part
  1784. my $GL = $_[24]; # [in] ZMM with lo product part
  1785. my $rndsuffix = &random_string();
  1786. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1787. # ;;; - Hash all but the last partial block of data
  1788. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1789. # ;; update data offset
  1790. if ($NUM_BLOCKS > 1) {
  1791. # ;; The final block of data may be <16B
  1792. $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n";
  1793. }
  1794. if ($NUM_BLOCKS < 16) {
  1795. $code .= <<___;
  1796. # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
  1797. # ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
  1798. cmp \$16,$LENGTH
  1799. jl .L_small_initial_partial_block_${rndsuffix}
  1800. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1801. # ;;; Handle a full length final block - encrypt and hash all blocks
  1802. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1803. sub \$16,$LENGTH
  1804. movq \$0,($PBLOCK_LEN)
  1805. ___
  1806. # ;; Hash all of the data
  1807. if (scalar(@_) == 22) {
  1808. # ;; start GHASH compute
  1809. &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
  1810. $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS);
  1811. } elsif (scalar(@_) == 25) {
  1812. # ;; continue GHASH compute
  1813. &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
  1814. $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL);
  1815. }
  1816. $code .= "jmp .L_small_initial_compute_done_${rndsuffix}\n";
  1817. }
  1818. $code .= <<___;
  1819. .L_small_initial_partial_block_${rndsuffix}:
  1820. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1821. # ;;; Handle ghash for a <16B final block
  1822. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1823. # ;; As it's an init / update / finalize series we need to leave the
  1824. # ;; last block if it's less than a full block of data.
  1825. mov $LENGTH,($PBLOCK_LEN)
  1826. vmovdqu64 $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)
  1827. ___
  1828. my $k = ($NUM_BLOCKS - 1);
  1829. my $last_block_to_hash = 1;
  1830. if (($NUM_BLOCKS > $last_block_to_hash)) {
# ;; $ZT0-$ZT8 - temporary registers
  1832. if (scalar(@_) == 22) {
  1833. # ;; start GHASH compute
  1834. &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
  1835. $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k);
  1836. } elsif (scalar(@_) == 25) {
  1837. # ;; continue GHASH compute
  1838. &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
  1839. $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL);
  1840. }
  1841. # ;; just fall through no jmp needed
  1842. } else {
  1843. if (scalar(@_) == 25) {
  1844. $code .= <<___;
  1845. # ;; Reduction is required in this case.
  1846. # ;; Integrate GM into GH and GL.
  1847. vpsrldq \$8,$GM,$ZT0
  1848. vpslldq \$8,$GM,$ZT1
  1849. vpxorq $ZT0,$GH,$GH
  1850. vpxorq $ZT1,$GL,$GL
  1851. ___
  1852. # ;; Add GH and GL 128-bit words horizontally
  1853. &VHPXORI4x128($GH, $ZT0);
  1854. &VHPXORI4x128($GL, $ZT1);
  1855. # ;; 256-bit to 128-bit reduction
  1856. $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n";
  1857. &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
  1858. }
  1859. $code .= <<___;
  1860. # ;; Record that a reduction is not needed -
  1861. # ;; In this case no hashes are computed because there
  1862. # ;; is only one initial block and it is < 16B in length.
  1863. # ;; We only need to check if a reduction is needed if
  1864. # ;; initial_blocks == 1 and init/update/final is being used.
  1865. # ;; In this case we may just have a partial block, and that
  1866. # ;; gets hashed in finalize.
  1867. # ;; The hash should end up in HASH_IN_OUT.
  1868. # ;; The only way we should get here is if there is
  1869. # ;; a partial block of data, so xor that into the hash.
  1870. vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT
  1871. # ;; The result is in $HASH_IN_OUT
  1872. jmp .L_after_reduction_${rndsuffix}
  1873. ___
  1874. }
  1875. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1876. # ;;; After GHASH reduction
  1877. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1878. $code .= ".L_small_initial_compute_done_${rndsuffix}:\n";
  1879. # ;; If using init/update/finalize, we need to xor any partial block data
  1880. # ;; into the hash.
  1881. if ($NUM_BLOCKS > 1) {
  1882. # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place
  1883. if ($NUM_BLOCKS != 16) {
  1884. $code .= <<___;
  1885. # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen] is never zero
  1886. or $LENGTH,$LENGTH
  1887. je .L_after_reduction_${rndsuffix}
  1888. ___
  1889. }
  1890. $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n";
  1891. }
  1892. $code .= ".L_after_reduction_${rndsuffix}:\n";
  1893. # ;; Final hash is now in HASH_IN_OUT
  1894. }
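# ;; Decision summary for INITIAL_BLOCKS_PARTIAL_GHASH above
# ;; (N = $NUM_BLOCKS, L = bytes in the final block):
# ;;   L == 16           -> hash all N blocks now and clear PBlockLen
# ;;   L < 16 and N > 1  -> hash the first N-1 blocks, store L in PBlockLen and
# ;;                        xor the zero-padded partial block into the hash
# ;;   L < 16 and N == 1 -> no multiply yet; reduce carried GH/GM/GL sums if any
# ;;                        and xor the partial block into the hash
# ;; For N == 16 the partial-block path is always taken.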
  1895. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  1896. # ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
  1897. # ;; It may look similar to INITIAL_BLOCKS but its usage is different:
# ;; - first encrypts/decrypts the required number of blocks and then
  1899. # ;; ghashes these blocks
  1900. # ;; - Small packets or left over data chunks (<256 bytes)
  1901. # ;; - Remaining data chunks below 256 bytes (multi buffer code)
  1902. # ;;
  1903. # ;; num_initial_blocks is expected to include the partial final block
  1904. # ;; in the count.
  1905. sub INITIAL_BLOCKS_PARTIAL {
  1906. my $AES_KEYS = $_[0]; # [in] key pointer
  1907. my $GCM128_CTX = $_[1]; # [in] context pointer
  1908. my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer
  1909. my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer
  1910. my $LENGTH = $_[4]; # [in/clobbered] length in bytes
  1911. my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated)
  1912. my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
  1913. my $CTR = $_[7]; # [in/out] current counter value
  1914. my $HASH_IN_OUT = $_[8]; # [in/out] XMM ghash in/out value
  1915. my $ENC_DEC = $_[9]; # [in] cipher direction (ENC/DEC)
  1916. my $CTR0 = $_[10]; # [clobbered] ZMM temporary
  1917. my $CTR1 = $_[11]; # [clobbered] ZMM temporary
  1918. my $CTR2 = $_[12]; # [clobbered] ZMM temporary
  1919. my $CTR3 = $_[13]; # [clobbered] ZMM temporary
  1920. my $DAT0 = $_[14]; # [clobbered] ZMM temporary
  1921. my $DAT1 = $_[15]; # [clobbered] ZMM temporary
  1922. my $DAT2 = $_[16]; # [clobbered] ZMM temporary
  1923. my $DAT3 = $_[17]; # [clobbered] ZMM temporary
  1924. my $LAST_CIPHER_BLK = $_[18]; # [clobbered] ZMM temporary
  1925. my $LAST_GHASH_BLK = $_[19]; # [clobbered] ZMM temporary
  1926. my $ZT0 = $_[20]; # [clobbered] ZMM temporary
  1927. my $ZT1 = $_[21]; # [clobbered] ZMM temporary
  1928. my $ZT2 = $_[22]; # [clobbered] ZMM temporary
  1929. my $ZT3 = $_[23]; # [clobbered] ZMM temporary
  1930. my $ZT4 = $_[24]; # [clobbered] ZMM temporary
  1931. my $IA0 = $_[25]; # [clobbered] GP temporary
  1932. my $IA1 = $_[26]; # [clobbered] GP temporary
  1933. my $MASKREG = $_[27]; # [clobbered] mask register
  1934. my $SHUFMASK = $_[28]; # [clobbered] ZMM for BE/LE shuffle mask
  1935. my $PBLOCK_LEN = $_[29]; # [in] partial block length
  1936. &INITIAL_BLOCKS_PARTIAL_CIPHER(
  1937. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
  1938. $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR,
  1939. $ENC_DEC, $DAT0, $DAT1, $DAT2,
  1940. $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0,
  1941. $CTR1, $CTR2, $CTR3, $ZT0,
  1942. $IA0, $IA1, $MASKREG, $SHUFMASK);
  1943. &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0,
  1944. $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK),
  1945. &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN);
  1946. }
  1947. # ;; ===========================================================================
  1948. # ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
# ;; followed by GHASH of the N blocks.
  1950. sub GHASH_16_ENCRYPT_N_GHASH_N {
  1951. my $AES_KEYS = $_[0]; # [in] key pointer
  1952. my $GCM128_CTX = $_[1]; # [in] context pointer
  1953. my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
  1954. my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
  1955. my $DATA_OFFSET = $_[4]; # [in] data offset
  1956. my $LENGTH = $_[5]; # [in] data length
  1957. my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
  1958. my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
  1959. my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
  1960. # (can be in form of register or numerical value)
my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks on the stack frame
  1962. my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
  1963. my $B00_03 = $_[11]; # [clobbered] temporary ZMM
  1964. my $B04_07 = $_[12]; # [clobbered] temporary ZMM
  1965. my $B08_11 = $_[13]; # [clobbered] temporary ZMM
  1966. my $B12_15 = $_[14]; # [clobbered] temporary ZMM
  1967. my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM
  1968. my $GH1L = $_[16]; # [clobbered] temporary ZMM
  1969. my $GH1M = $_[17]; # [clobbered] temporary ZMM
  1970. my $GH1T = $_[18]; # [clobbered] temporary ZMM
  1971. my $GH2H = $_[19]; # [clobbered] temporary ZMM
  1972. my $GH2L = $_[20]; # [clobbered] temporary ZMM
  1973. my $GH2M = $_[21]; # [clobbered] temporary ZMM
  1974. my $GH2T = $_[22]; # [clobbered] temporary ZMM
  1975. my $GH3H = $_[23]; # [clobbered] temporary ZMM
  1976. my $GH3L = $_[24]; # [clobbered] temporary ZMM
  1977. my $GH3M = $_[25]; # [clobbered] temporary ZMM
  1978. my $GH3T = $_[26]; # [clobbered] temporary ZMM
  1979. my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM
  1980. my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM
  1981. my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM
  1982. my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM
  1983. my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM
  1984. my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM
  1985. my $ZT01 = $_[33]; # [clobbered] temporary ZMM
my $ADDBE_4x4 = $_[34]; # [in] ZMM with the value 4 in each 128-bit lane, big-endian
my $ADDBE_1234 = $_[35]; # [in] ZMM with 1, 2, 3 and 4 in its 128-bit lanes, big-endian
  1988. my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
  1989. my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
  1990. my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
  1991. my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
  1992. my $ENC_DEC = $_[40]; # [in] cipher direction
  1993. my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
  1994. my $IA0 = $_[42]; # [clobbered] GP temporary
  1995. my $IA1 = $_[43]; # [clobbered] GP temporary
  1996. my $MASKREG = $_[44]; # [clobbered] mask register
  1997. my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
  1998. my $PBLOCK_LEN = $_[46]; # [in] partial block length
  1999. die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
  2000. if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
  2001. my $rndsuffix = &random_string();
  2002. my $GH1H = $HASH_IN_OUT;
  2003. # ; this is to avoid additional move in do_reduction case
  2004. my $LAST_GHASH_BLK = $GH1L;
  2005. my $LAST_CIPHER_BLK = $GH1T;
  2006. my $RED_POLY = $GH2T;
  2007. my $RED_P1 = $GH2L;
  2008. my $RED_T1 = $GH2H;
  2009. my $RED_T2 = $GH2M;
  2010. my $DATA1 = $GH3H;
  2011. my $DATA2 = $GH3L;
  2012. my $DATA3 = $GH3M;
  2013. my $DATA4 = $GH3T;
  2014. # ;; do reduction after the 16 blocks ?
  2015. my $do_reduction = 0;
  2016. # ;; is 16 block chunk a start?
  2017. my $is_start = 0;
  2018. if ($GHASH_TYPE eq "start_reduce") {
  2019. $is_start = 1;
  2020. $do_reduction = 1;
  2021. }
  2022. if ($GHASH_TYPE eq "start") {
  2023. $is_start = 1;
  2024. }
  2025. if ($GHASH_TYPE eq "end_reduce") {
  2026. $do_reduction = 1;
  2027. }
  2028. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2029. # ;; - get load/store mask
  2030. # ;; - load plain/cipher text
  2031. # ;; get load/store mask
  2032. $code .= <<___;
  2033. lea byte64_len_to_mask_table(%rip),$IA0
  2034. mov $LENGTH,$IA1
  2035. ___
  2036. if ($NUM_BLOCKS > 12) {
  2037. $code .= "sub \$`3*64`,$IA1\n";
  2038. } elsif ($NUM_BLOCKS > 8) {
  2039. $code .= "sub \$`2*64`,$IA1\n";
  2040. } elsif ($NUM_BLOCKS > 4) {
  2041. $code .= "sub \$`1*64`,$IA1\n";
  2042. }
  2043. $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
  2044. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2045. # ;; prepare counter blocks
  2046. $code .= <<___;
  2047. cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
  2048. jae .L_16_blocks_overflow_${rndsuffix}
  2049. ___
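# ;; Counter handling note: $CTR_BE and $B00_03..$B12_15 are kept byte-reflected
# ;; (big-endian, i.e. AES input order) so no byte swap is needed before the AES
# ;; rounds. The common path below adds the big-endian constants $ADDBE_1234 and
# ;; $ADDBE_4x4, which is only valid while the low counter byte does not wrap
# ;; within the next $NUM_BLOCKS increments; $CTR_CHECK tracks that byte and the
# ;; jae above diverts to the overflow path, which byte-swaps to little-endian,
# ;; increments with the ddq_add_* constants and swaps back.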
  2050. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2051. $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
  2052. $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
  2053. $code .= <<___;
  2054. jmp .L_16_blocks_ok_${rndsuffix}
  2055. .L_16_blocks_overflow_${rndsuffix}:
  2056. vpshufb $SHFMSK,$CTR_BE,$CTR_BE
  2057. vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
  2058. ___
  2059. if ($NUM_BLOCKS > 4) {
  2060. $code .= <<___;
  2061. vmovdqa64 ddq_add_4444(%rip),$B12_15
  2062. vpaddd $B12_15,$B00_03,$B04_07
  2063. ___
  2064. }
  2065. if ($NUM_BLOCKS > 8) {
  2066. $code .= "vpaddd $B12_15,$B04_07,$B08_11\n";
  2067. }
  2068. if ($NUM_BLOCKS > 12) {
  2069. $code .= "vpaddd $B12_15,$B08_11,$B12_15\n";
  2070. }
  2071. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2072. $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2073. $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  2074. $code .= <<___;
  2075. .L_16_blocks_ok_${rndsuffix}:
  2076. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2077. # ;; - pre-load constants
  2078. # ;; - add current hash into the 1st block
  2079. vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1
  2080. ___
  2081. if ($is_start != 0) {
  2082. $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
  2083. } else {
  2084. $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  2085. }
  2086. $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";
  2087. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2088. # ;; save counter for the next round
  2089. # ;; increment counter overflow check register
  2090. if ($NUM_BLOCKS <= 4) {
  2091. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
  2092. } elsif ($NUM_BLOCKS <= 8) {
  2093. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
  2094. } elsif ($NUM_BLOCKS <= 12) {
  2095. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
  2096. } else {
  2097. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
  2098. }
  2099. $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";
  2100. $code .= <<___;
  2101. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2102. # ;; pre-load constants
  2103. vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
  2104. vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
  2105. vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
  2106. ___
  2107. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2108. # ;; stitch AES rounds with GHASH
  2109. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2110. # ;; AES round 0 - ARK
  2111. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2112. $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2113. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2114. $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
  2115. $code .= <<___;
  2116. # ;;==================================================
  2117. # ;; GHASH 4 blocks (15 to 12)
  2118. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
  2119. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
  2120. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
  2121. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
  2122. vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
  2123. vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
  2124. ___
  2125. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2126. # ;; AES round 1
  2127. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2128. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2129. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2130. $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
  2131. $code .= <<___;
  2132. # ;; =================================================
  2133. # ;; GHASH 4 blocks (11 to 8)
  2134. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2135. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2136. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2137. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2138. vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
  2139. vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
  2140. ___
  2141. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2142. # ;; AES round 2
  2143. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2144. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2145. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2146. $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n";
  2147. $code .= <<___;
  2148. # ;; =================================================
  2149. # ;; GHASH 4 blocks (7 to 4)
  2150. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
  2151. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
  2152. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
  2153. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
  2154. ___
  2155. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 3
  2157. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2158. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2159. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2160. $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n";
  2161. $code .= <<___;
  2162. # ;; =================================================
  2163. # ;; Gather (XOR) GHASH for 12 blocks
  2164. vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
  2165. vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
  2166. vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
  2167. vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
  2168. ___
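# ;; =================================================
# ;; Reference note (illustration only, not used by the generated code):
# ;; vpternlogq with imm8 0x96 is a three-way XOR (dst = dst ^ src2 ^ src3),
# ;; used above to fold the partial GHASH products of blocks 15..4 into the
# ;; GH1* accumulators. Plain-Perl equivalent of the truth table:
sub _ref_ternlog_0x96 {
  my ($p, $q, $r) = @_;        # bitwise operands of any integer width
  return $p ^ $q ^ $r;         # imm8 0x96 selects "odd number of inputs set"
}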
  2169. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 4
  2171. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2172. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2173. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2174. $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n";
  2175. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2176. # ;; load plain/cipher text
  2177. &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);
  2178. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 5
  2180. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2181. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2182. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2183. $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n";
  2184. $code .= <<___;
  2185. # ;; =================================================
  2186. # ;; GHASH 4 blocks (3 to 0)
  2187. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2188. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2189. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2190. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2191. ___
  2192. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2193. # ;; AES round 6
  2194. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2195. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2196. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2197. $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n";
  2198. # ;; =================================================
  2199. # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
  2200. # ;; - add GH2[MTLH] to GH1[MTLH]
  2201. $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n";
  2202. if ($do_reduction != 0) {
  2203. if ($is_start != 0) {
  2204. $code .= "vpxorq $GH2M,$GH1M,$GH1M\n";
  2205. } else {
  2206. $code .= <<___;
  2207. vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
  2208. vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
  2209. vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
  2210. ___
  2211. }
  2212. } else {
  2213. # ;; Update H/M/L hash sums if not carrying reduction
  2214. if ($is_start != 0) {
  2215. $code .= <<___;
  2216. vpxorq $GH2H,$GH1H,$TO_REDUCE_H
  2217. vpxorq $GH2L,$GH1L,$TO_REDUCE_L
  2218. vpxorq $GH2M,$GH1M,$TO_REDUCE_M
  2219. ___
  2220. } else {
  2221. $code .= <<___;
  2222. vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
  2223. vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
  2224. vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
  2225. ___
  2226. }
  2227. }
  2228. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2229. # ;; AES round 7
  2230. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2231. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2232. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2233. $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n";
  2234. # ;; =================================================
  2235. # ;; prepare mid sum for adding to high & low
  2236. # ;; load polynomial constant for reduction
  2237. if ($do_reduction != 0) {
  2238. $code .= <<___;
  2239. vpsrldq \$8,$GH1M,$GH2M
  2240. vpslldq \$8,$GH1M,$GH1M
  2241. vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
  2242. ___
  2243. }
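# ;; =================================================
# ;; Reference note (illustration only, assumes the core Math::BigInt module):
# ;; the vpsrldq/vpslldq pair above splits each 128-bit middle Karatsuba
# ;; product M so that, per lane, TH accumulates M >> 64 and TL accumulates
# ;; (M << 64) mod 2^128 (the actual folding happens at AES round 8 below):
use Math::BigInt;
sub _ref_fold_mid_product {
  my ($th, $tl, $m) = @_;      # Math::BigInt values, one 128-bit lane each
  my $mask128 = Math::BigInt->new(2)->bpow(128)->bsub(1);
  $th = $th->copy()->bxor($m->copy()->brsft(64));
  $tl = $tl->copy()->bxor($m->copy()->blsft(64)->band($mask128));
  return ($th, $tl);
}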
  2244. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2245. # ;; AES round 8
  2246. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2247. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2248. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2249. $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n";
  2250. # ;; =================================================
  2251. # ;; Add mid product to high and low
  2252. if ($do_reduction != 0) {
  2253. if ($is_start != 0) {
  2254. $code .= <<___;
  2255. vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
  2256. vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
  2257. ___
  2258. } else {
  2259. $code .= <<___;
  2260. vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64
  2261. vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64
  2262. ___
  2263. }
  2264. }
  2265. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2266. # ;; AES round 9
  2267. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2268. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2269. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2270. # ;; =================================================
  2271. # ;; horizontal xor of low and high 4x128
  2272. if ($do_reduction != 0) {
  2273. &VHPXORI4x128($GH1H, $GH2H);
  2274. &VHPXORI4x128($GH1L, $GH2L);
  2275. }
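# ;; =================================================
# ;; Reference note (illustration only, not used by the generated code):
# ;; VHPXORI4x128 folds the four 128-bit lanes of a ZMM accumulator into
# ;; lane 0 by XOR; on plain integers the operation is simply:
sub _ref_hxor_4x128 {
  my (@lanes) = @_;                        # the four per-lane values
  my $acc = $lanes[0];
  $acc ^= $lanes[$_] for (1 .. 3);         # result corresponds to lane 0
  return $acc;
}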
  2276. if (($NROUNDS >= 11)) {
  2277. $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  2278. }
  2279. # ;; =================================================
  2280. # ;; first phase of reduction
  2281. if ($do_reduction != 0) {
  2282. $code .= <<___;
  2283. vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
  2284. vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
  2286. ___
  2287. }
  2288. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2289. # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  2290. # ;; AES128 is done
  2291. if (($NROUNDS >= 11)) {
  2292. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2293. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2294. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2295. $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n";
  2296. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2297. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2298. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2299. if (($NROUNDS == 13)) {
  2300. $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n";
  2301. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2302. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2303. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2304. $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n";
  2305. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2306. $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2307. $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  2308. }
  2309. }
  2310. # ;; =================================================
  2311. # ;; second phase of the reduction
  2312. if ($do_reduction != 0) {
  2313. $code .= <<___;
  2314. vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
  2315. vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
  2316. vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
  2317. vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
  2318. # ;; GH1H = GH1H + RED_T1 + RED_T2
  2319. vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
  2320. ___
  2321. }
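# ;; =================================================
# ;; Reference note (illustration only, assumes the core Math::BigInt module):
# ;; the vpclmulqdq products plus the two reduction phases above implement a
# ;; multiply in GF(2^128) modulo the GHASH polynomial. A plain-Perl model of
# ;; that multiply (NIST SP 800-38D bit ordering, blocks viewed as big-endian
# ;; 128-bit integers) can be used to cross-check test vectors:
use Math::BigInt;
sub _ref_gf128_mul {
  my ($x, $y) = @_;            # Math::BigInt 128-bit block values
  my $r = Math::BigInt->new("0xe1000000000000000000000000000000");
  my $z = Math::BigInt->bzero();
  my $v = $y->copy();
  for my $i (0 .. 127) {
    $z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
    my $lsb = $v->copy()->band(1)->is_one();
    $v->brsft(1);
    $v->bxor($r) if $lsb;
  }
  return $z;
}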
  2322. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2323. # ;; the last AES round
  2324. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2325. $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2326. $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  2327. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2328. # ;; XOR against plain/cipher text
  2329. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2330. $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
  2331. $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4);
  2332. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2333. # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  2334. # ;; - this is needed for partial block cases
  2335. if ($NUM_BLOCKS <= 4) {
  2336. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2337. } elsif ($NUM_BLOCKS <= 8) {
  2338. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2339. } elsif ($NUM_BLOCKS <= 12) {
  2340. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2341. } else {
  2342. $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  2343. }
  2344. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2345. # ;; store cipher/plain text
  2346. $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
  2347. &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);
  2348. # ;; =================================================
  2349. # ;; shuffle cipher text blocks for GHASH computation
  2350. if ($ENC_DEC eq "ENC") {
  2351. # ;; zero bytes outside the mask before hashing
  2352. if ($NUM_BLOCKS <= 4) {
  2353. $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n";
  2354. } elsif ($NUM_BLOCKS <= 8) {
  2355. $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n";
  2356. } elsif ($NUM_BLOCKS <= 12) {
  2357. $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n";
  2358. } else {
  2359. $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n";
  2360. }
  2361. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2362. $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03,
  2363. $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  2364. } else {
  2365. # ;; zero bytes outside the mask before hashing
  2366. if ($NUM_BLOCKS <= 4) {
  2367. $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n";
  2368. } elsif ($NUM_BLOCKS <= 8) {
  2369. $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n";
  2370. } elsif ($NUM_BLOCKS <= 12) {
  2371. $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n";
  2372. } else {
  2373. $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n";
  2374. }
  2375. &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
  2376. $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1,
  2377. $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  2378. }
  2379. # ;; =================================================
  2380. # ;; Extract the last block for partial / multi_call cases
  2381. if ($NUM_BLOCKS <= 4) {
  2382. $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2383. } elsif ($NUM_BLOCKS <= 8) {
  2384. $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2385. } elsif ($NUM_BLOCKS <= 12) {
  2386. $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2387. } else {
  2388. $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
  2389. }
  2390. if ($do_reduction != 0) {
  2391. # ;; GH1H holds reduced hash value
  2392. # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
  2393. # ;; - register rename trick obsoletes the above move
  2394. }
  2395. # ;; =================================================
  2396. # ;; GHASH last N blocks
  2397. # ;; - current hash value in HASH_IN_OUT or
  2398. # ;; product parts in TO_REDUCE_H/M/L
  2399. # ;; - DATA1-DATA4 include blocks for GHASH
  2400. if ($do_reduction == 0) {
  2401. &INITIAL_BLOCKS_PARTIAL_GHASH(
  2402. $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
  2403. &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
  2404. $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
  2405. $B00_03, $B04_07, $B08_11, $B12_15,
  2406. $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
  2407. $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M,
  2408. $TO_REDUCE_L);
  2409. } else {
  2410. &INITIAL_BLOCKS_PARTIAL_GHASH(
  2411. $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
  2412. &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
  2413. $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
  2414. $B00_03, $B04_07, $B08_11, $B12_15,
  2415. $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
  2416. $GHKEY1, $PBLOCK_LEN);
  2417. }
  2418. }
  2419. # ;; ===========================================================================
  2420. # ;; ===========================================================================
  2421. # ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
  2422. # ;; followed with GHASH of the N blocks.
  2423. sub GCM_ENC_DEC_LAST {
  2424. my $AES_KEYS = $_[0]; # [in] key pointer
  2425. my $GCM128_CTX = $_[1]; # [in] context pointer
  2426. my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
  2427. my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
  2428. my $DATA_OFFSET = $_[4]; # [in] data offset
  2429. my $LENGTH = $_[5]; # [in/clobbered] data length
  2430. my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
  2431. my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
my $HASHKEY_OFFSET = $_[8]; # [in] offset to the highest hash key
# (can be a register or a numerical offset)
  2434. my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in
  2435. my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
  2436. my $ZT00 = $_[11]; # [clobbered] temporary ZMM
  2437. my $ZT01 = $_[12]; # [clobbered] temporary ZMM
  2438. my $ZT02 = $_[13]; # [clobbered] temporary ZMM
  2439. my $ZT03 = $_[14]; # [clobbered] temporary ZMM
  2440. my $ZT04 = $_[15]; # [clobbered] temporary ZMM
  2441. my $ZT05 = $_[16]; # [clobbered] temporary ZMM
  2442. my $ZT06 = $_[17]; # [clobbered] temporary ZMM
  2443. my $ZT07 = $_[18]; # [clobbered] temporary ZMM
  2444. my $ZT08 = $_[19]; # [clobbered] temporary ZMM
  2445. my $ZT09 = $_[20]; # [clobbered] temporary ZMM
  2446. my $ZT10 = $_[21]; # [clobbered] temporary ZMM
  2447. my $ZT11 = $_[22]; # [clobbered] temporary ZMM
  2448. my $ZT12 = $_[23]; # [clobbered] temporary ZMM
  2449. my $ZT13 = $_[24]; # [clobbered] temporary ZMM
  2450. my $ZT14 = $_[25]; # [clobbered] temporary ZMM
  2451. my $ZT15 = $_[26]; # [clobbered] temporary ZMM
  2452. my $ZT16 = $_[27]; # [clobbered] temporary ZMM
  2453. my $ZT17 = $_[28]; # [clobbered] temporary ZMM
  2454. my $ZT18 = $_[29]; # [clobbered] temporary ZMM
  2455. my $ZT19 = $_[30]; # [clobbered] temporary ZMM
  2456. my $ZT20 = $_[31]; # [clobbered] temporary ZMM
  2457. my $ZT21 = $_[32]; # [clobbered] temporary ZMM
  2458. my $ZT22 = $_[33]; # [clobbered] temporary ZMM
  2459. my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian
  2460. my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  2461. my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
  2462. my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
  2463. my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
  2464. my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
  2465. my $ENC_DEC = $_[40]; # [in] cipher direction
  2466. my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
  2467. my $IA0 = $_[42]; # [clobbered] GP temporary
  2468. my $IA1 = $_[43]; # [clobbered] GP temporary
  2469. my $MASKREG = $_[44]; # [clobbered] mask register
  2470. my $PBLOCK_LEN = $_[45]; # [in] partial block length
  2471. my $rndsuffix = &random_string();
  2472. $code .= <<___;
  2473. mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
  2474. add \$15,@{[DWORD($IA0)]}
  2475. shr \$4,@{[DWORD($IA0)]}
  2476. je .L_last_num_blocks_is_0_${rndsuffix}
  2477. cmp \$8,@{[DWORD($IA0)]}
  2478. je .L_last_num_blocks_is_8_${rndsuffix}
  2479. jb .L_last_num_blocks_is_7_1_${rndsuffix}
  2480. cmp \$12,@{[DWORD($IA0)]}
  2481. je .L_last_num_blocks_is_12_${rndsuffix}
  2482. jb .L_last_num_blocks_is_11_9_${rndsuffix}
  2483. # ;; 16, 15, 14 or 13
  2484. cmp \$15,@{[DWORD($IA0)]}
  2485. je .L_last_num_blocks_is_15_${rndsuffix}
  2486. ja .L_last_num_blocks_is_16_${rndsuffix}
  2487. cmp \$14,@{[DWORD($IA0)]}
  2488. je .L_last_num_blocks_is_14_${rndsuffix}
  2489. jmp .L_last_num_blocks_is_13_${rndsuffix}
  2490. .L_last_num_blocks_is_11_9_${rndsuffix}:
  2491. # ;; 11, 10 or 9
  2492. cmp \$10,@{[DWORD($IA0)]}
  2493. je .L_last_num_blocks_is_10_${rndsuffix}
  2494. ja .L_last_num_blocks_is_11_${rndsuffix}
  2495. jmp .L_last_num_blocks_is_9_${rndsuffix}
  2496. .L_last_num_blocks_is_7_1_${rndsuffix}:
  2497. cmp \$4,@{[DWORD($IA0)]}
  2498. je .L_last_num_blocks_is_4_${rndsuffix}
  2499. jb .L_last_num_blocks_is_3_1_${rndsuffix}
  2500. # ;; 7, 6 or 5
  2501. cmp \$6,@{[DWORD($IA0)]}
  2502. ja .L_last_num_blocks_is_7_${rndsuffix}
  2503. je .L_last_num_blocks_is_6_${rndsuffix}
  2504. jmp .L_last_num_blocks_is_5_${rndsuffix}
  2505. .L_last_num_blocks_is_3_1_${rndsuffix}:
  2506. # ;; 3, 2 or 1
  2507. cmp \$2,@{[DWORD($IA0)]}
  2508. ja .L_last_num_blocks_is_3_${rndsuffix}
  2509. je .L_last_num_blocks_is_2_${rndsuffix}
  2510. ___
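# ;; =================================================
# ;; Reference note (illustration only, not used by the generated code):
# ;; the "add $15; shr $4" plus compare/branch tree above dispatches on the
# ;; number of remaining blocks, including a trailing partial block:
sub _ref_last_num_blocks {
  my ($length) = @_;                 # remaining bytes (0 .. 255)
  return int(($length + 15) / 16);   # ceil(LENGTH / 16)
}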
# ;; fall through for the single block case (no 'jmp .L_last_num_blocks_is_1' needed)
# ;; Use a loop to generate the different block size variants
# ;; - the one block variant has to come first (it is the fall-through target)
  2514. for my $num_blocks (1 .. 16) {
  2515. $code .= ".L_last_num_blocks_is_${num_blocks}_${rndsuffix}:\n";
  2516. &GHASH_16_ENCRYPT_N_GHASH_N(
  2517. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET,
  2518. $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET,
  2519. $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03,
  2520. $ZT04, $ZT05, $ZT06, $ZT07, $ZT08,
  2521. $ZT09, $ZT10, $ZT11, $ZT12, $ZT13,
  2522. $ZT14, $ZT15, $ZT16, $ZT17, $ZT18,
  2523. $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4,
  2524. $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M,
  2525. $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG,
  2526. $num_blocks, $PBLOCK_LEN);
  2527. $code .= "jmp .L_last_blocks_done_${rndsuffix}\n";
  2528. }
  2529. $code .= ".L_last_num_blocks_is_0_${rndsuffix}:\n";
# ;; if there are 0 blocks to cipher then only 16 blocks remain for GHASH and reduction
  2531. # ;; - convert mid into end_reduce
  2532. # ;; - convert start into start_reduce
  2533. if ($GHASH_TYPE eq "mid") {
  2534. $GHASH_TYPE = "end_reduce";
  2535. }
  2536. if ($GHASH_TYPE eq "start") {
  2537. $GHASH_TYPE = "start_reduce";
  2538. }
  2539. &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp",
  2540. $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01,
  2541. $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09);
  2542. $code .= ".L_last_blocks_done_${rndsuffix}:\n";
  2543. }
  2544. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2545. # ;; Main GCM macro stitching cipher with GHASH
  2546. # ;; - operates on single stream
  2547. # ;; - encrypts 16 blocks at a time
  2548. # ;; - ghash the 16 previously encrypted ciphertext blocks
  2549. # ;; - no partial block or multi_call handling here
  2550. sub GHASH_16_ENCRYPT_16_PARALLEL {
  2551. my $AES_KEYS = $_[0]; # [in] key pointer
  2552. my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer
  2553. my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer
  2554. my $DATA_OFFSET = $_[3]; # [in] data offset
  2555. my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian
  2556. my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check
  2557. my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value)
  2558. my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out
  2559. my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in
  2560. my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb
  2561. my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher)
  2562. my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher)
  2563. my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher)
  2564. my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher)
  2565. my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
  2566. my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher)
  2567. my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher)
  2568. my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher)
  2569. my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher)
  2570. my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash)
  2571. my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash)
  2572. my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash)
  2573. my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash)
  2574. my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash)
  2575. my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash)
  2576. my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash)
  2577. my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash)
  2578. my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash)
  2579. my $ZT19 = $_[28]; # [clobbered] temporary ZMM
  2580. my $ZT20 = $_[29]; # [clobbered] temporary ZMM
  2581. my $ZT21 = $_[30]; # [clobbered] temporary ZMM
  2582. my $ZT22 = $_[31]; # [clobbered] temporary ZMM
  2583. my $ZT23 = $_[32]; # [clobbered] temporary ZMM
  2584. my $ADDBE_4x4 = $_[33]; # [in] ZMM with 4x128bits 4 in big-endian
  2585. my $ADDBE_1234 = $_[34]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  2586. my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum
  2587. my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum
  2588. my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum
  2589. my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time"
  2590. my $ENC_DEC = $_[39]; # [in] cipher direction
  2591. my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
  2592. my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
  2593. my $IA0 = $_[42]; # [clobbered] temporary GPR
  2594. my $B00_03 = $ZT1;
  2595. my $B04_07 = $ZT2;
  2596. my $B08_11 = $ZT3;
  2597. my $B12_15 = $ZT4;
  2598. my $GH1H = $ZT5;
  2599. # ; @note: do not change this mapping
  2600. my $GH1L = $ZT6;
  2601. my $GH1M = $ZT7;
  2602. my $GH1T = $ZT8;
  2603. my $GH2H = $ZT9;
  2604. my $GH2L = $ZT10;
  2605. my $GH2M = $ZT11;
  2606. my $GH2T = $ZT12;
  2607. my $RED_POLY = $GH2T;
  2608. my $RED_P1 = $GH2L;
  2609. my $RED_T1 = $GH2H;
  2610. my $RED_T2 = $GH2M;
  2611. my $GH3H = $ZT13;
  2612. my $GH3L = $ZT14;
  2613. my $GH3M = $ZT15;
  2614. my $GH3T = $ZT16;
  2615. my $DATA1 = $ZT13;
  2616. my $DATA2 = $ZT14;
  2617. my $DATA3 = $ZT15;
  2618. my $DATA4 = $ZT16;
  2619. my $AESKEY1 = $ZT17;
  2620. my $AESKEY2 = $ZT18;
  2621. my $GHKEY1 = $ZT19;
  2622. my $GHKEY2 = $ZT20;
  2623. my $GHDAT1 = $ZT21;
  2624. my $GHDAT2 = $ZT22;
  2625. my $rndsuffix = &random_string();
  2626. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2627. # ;; prepare counter blocks
  2628. $code .= <<___;
  2629. cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
  2630. jae .L_16_blocks_overflow_${rndsuffix}
  2631. vpaddd $ADDBE_1234,$CTR_BE,$B00_03
  2632. vpaddd $ADDBE_4x4,$B00_03,$B04_07
  2633. vpaddd $ADDBE_4x4,$B04_07,$B08_11
  2634. vpaddd $ADDBE_4x4,$B08_11,$B12_15
  2635. jmp .L_16_blocks_ok_${rndsuffix}
  2636. .L_16_blocks_overflow_${rndsuffix}:
  2637. vpshufb $SHFMSK,$CTR_BE,$CTR_BE
  2638. vmovdqa64 ddq_add_4444(%rip),$B12_15
  2639. vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
  2640. vpaddd $B12_15,$B00_03,$B04_07
  2641. vpaddd $B12_15,$B04_07,$B08_11
  2642. vpaddd $B12_15,$B08_11,$B12_15
  2643. vpshufb $SHFMSK,$B00_03,$B00_03
  2644. vpshufb $SHFMSK,$B04_07,$B04_07
  2645. vpshufb $SHFMSK,$B08_11,$B08_11
  2646. vpshufb $SHFMSK,$B12_15,$B12_15
  2647. .L_16_blocks_ok_${rndsuffix}:
  2648. ___
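# ;; =================================================
# ;; Reference note (illustration only, not used by the generated code):
# ;; CTR_CHECK mirrors the low byte of the 32-bit counter. The fast path
# ;; above adds 1..16 directly to the big-endian counter blocks, which is
# ;; only safe while the carry stays within that low byte; otherwise the
# ;; overflow path byte-swaps, adds in little-endian order and swaps back:
sub _ref_ctr_low_byte_may_wrap {
  my ($ctr_check, $nblocks) = @_;          # $nblocks is 16 for this macro
  return (($ctr_check & 0xff) >= 256 - $nblocks) ? 1 : 0;
}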
  2649. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2650. # ;; pre-load constants
  2651. $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
  2652. if ($GHASH_IN ne "no_ghash_in") {
  2653. $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
  2654. } else {
  2655. $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  2656. }
  2657. $code .= <<___;
  2658. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1
  2659. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2660. # ;; save counter for the next round
  2661. # ;; increment counter overflow check register
  2662. vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
  2663. addb \$16,@{[BYTE($CTR_CHECK)]}
  2664. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2665. # ;; pre-load constants
  2666. vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
  2667. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
  2668. vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
  2669. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2670. # ;; stitch AES rounds with GHASH
  2671. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2672. # ;; AES round 0 - ARK
  2673. vpxorq $AESKEY1,$B00_03,$B00_03
  2674. vpxorq $AESKEY1,$B04_07,$B04_07
  2675. vpxorq $AESKEY1,$B08_11,$B08_11
  2676. vpxorq $AESKEY1,$B12_15,$B12_15
  2677. vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1
  2678. # ;;==================================================
  2679. # ;; GHASH 4 blocks (15 to 12)
  2680. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
  2681. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
  2682. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
  2683. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
  2684. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
  2685. vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
  2686. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2687. # ;; AES round 1
  2688. vaesenc $AESKEY2,$B00_03,$B00_03
  2689. vaesenc $AESKEY2,$B04_07,$B04_07
  2690. vaesenc $AESKEY2,$B08_11,$B08_11
  2691. vaesenc $AESKEY2,$B12_15,$B12_15
  2692. vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2
  2693. # ;; =================================================
  2694. # ;; GHASH 4 blocks (11 to 8)
  2695. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2696. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2697. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2698. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2699. vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
  2700. vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
  2701. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2702. # ;; AES round 2
  2703. vaesenc $AESKEY1,$B00_03,$B00_03
  2704. vaesenc $AESKEY1,$B04_07,$B04_07
  2705. vaesenc $AESKEY1,$B08_11,$B08_11
  2706. vaesenc $AESKEY1,$B12_15,$B12_15
  2707. vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1
  2708. # ;; =================================================
  2709. # ;; GHASH 4 blocks (7 to 4)
  2710. vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
  2711. vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
  2712. vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
  2713. vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
  2714. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 3
  2716. vaesenc $AESKEY2,$B00_03,$B00_03
  2717. vaesenc $AESKEY2,$B04_07,$B04_07
  2718. vaesenc $AESKEY2,$B08_11,$B08_11
  2719. vaesenc $AESKEY2,$B12_15,$B12_15
  2720. vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2
  2721. # ;; =================================================
  2722. # ;; Gather (XOR) GHASH for 12 blocks
  2723. vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
  2724. vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
  2725. vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
  2726. vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
  2727. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 4
  2729. vaesenc $AESKEY1,$B00_03,$B00_03
  2730. vaesenc $AESKEY1,$B04_07,$B04_07
  2731. vaesenc $AESKEY1,$B08_11,$B08_11
  2732. vaesenc $AESKEY1,$B12_15,$B12_15
  2733. vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1
  2734. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2735. # ;; load plain/cipher text (recycle GH3xx registers)
  2736. vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
  2737. vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
  2738. vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
  2739. vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4
  2740. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 5
  2742. vaesenc $AESKEY2,$B00_03,$B00_03
  2743. vaesenc $AESKEY2,$B04_07,$B04_07
  2744. vaesenc $AESKEY2,$B08_11,$B08_11
  2745. vaesenc $AESKEY2,$B12_15,$B12_15
  2746. vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2
  2747. # ;; =================================================
  2748. # ;; GHASH 4 blocks (3 to 0)
  2749. vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
  2750. vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
  2751. vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
  2752. vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
  2753. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2754. # ;; AES round 6
  2755. vaesenc $AESKEY1,$B00_03,$B00_03
  2756. vaesenc $AESKEY1,$B04_07,$B04_07
  2757. vaesenc $AESKEY1,$B08_11,$B08_11
  2758. vaesenc $AESKEY1,$B12_15,$B12_15
  2759. vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1
  2760. ___
  2761. # ;; =================================================
  2762. # ;; gather GHASH in GH1L (low) and GH1H (high)
  2763. if ($DO_REDUCTION eq "first_time") {
  2764. $code .= <<___;
  2765. vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
  2766. vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM
  2767. vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH
  2768. vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL
  2769. ___
  2770. }
  2771. if ($DO_REDUCTION eq "no_reduction") {
  2772. $code .= <<___;
  2773. vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
  2774. vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM
  2775. vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH
  2776. vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL
  2777. ___
  2778. }
  2779. if ($DO_REDUCTION eq "final_reduction") {
  2780. $code .= <<___;
  2781. # ;; phase 1: add mid products together
  2782. # ;; also load polynomial constant for reduction
  2783. vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
  2784. vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
  2785. vpsrldq \$8,$GH1M,$GH2M
  2786. vpslldq \$8,$GH1M,$GH1M
  2787. vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
  2788. ___
  2789. }
  2790. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2791. # ;; AES round 7
  2792. $code .= <<___;
  2793. vaesenc $AESKEY2,$B00_03,$B00_03
  2794. vaesenc $AESKEY2,$B04_07,$B04_07
  2795. vaesenc $AESKEY2,$B08_11,$B08_11
  2796. vaesenc $AESKEY2,$B12_15,$B12_15
  2797. vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2
  2798. ___
  2799. # ;; =================================================
  2800. # ;; Add mid product to high and low
  2801. if ($DO_REDUCTION eq "final_reduction") {
  2802. $code .= <<___;
  2803. vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
  2804. vpxorq $TO_REDUCE_H,$GH1H,$GH1H
  2805. vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
  2806. vpxorq $TO_REDUCE_L,$GH1L,$GH1L
  2807. ___
  2808. }
  2809. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2810. # ;; AES round 8
  2811. $code .= <<___;
  2812. vaesenc $AESKEY1,$B00_03,$B00_03
  2813. vaesenc $AESKEY1,$B04_07,$B04_07
  2814. vaesenc $AESKEY1,$B08_11,$B08_11
  2815. vaesenc $AESKEY1,$B12_15,$B12_15
  2816. vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1
  2817. ___
  2818. # ;; =================================================
  2819. # ;; horizontal xor of low and high 4x128
  2820. if ($DO_REDUCTION eq "final_reduction") {
  2821. &VHPXORI4x128($GH1H, $GH2H);
  2822. &VHPXORI4x128($GH1L, $GH2L);
  2823. }
  2824. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2825. # ;; AES round 9
  2826. $code .= <<___;
  2827. vaesenc $AESKEY2,$B00_03,$B00_03
  2828. vaesenc $AESKEY2,$B04_07,$B04_07
  2829. vaesenc $AESKEY2,$B08_11,$B08_11
  2830. vaesenc $AESKEY2,$B12_15,$B12_15
  2831. ___
  2832. if (($NROUNDS >= 11)) {
  2833. $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  2834. }
  2835. # ;; =================================================
  2836. # ;; first phase of reduction
  2837. if ($DO_REDUCTION eq "final_reduction") {
  2838. $code .= <<___;
  2839. vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
  2840. vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
  2842. ___
  2843. }
  2844. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2845. # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  2846. # ;; AES128 is done
  2847. if (($NROUNDS >= 11)) {
  2848. $code .= <<___;
  2849. vaesenc $AESKEY1,$B00_03,$B00_03
  2850. vaesenc $AESKEY1,$B04_07,$B04_07
  2851. vaesenc $AESKEY1,$B08_11,$B08_11
  2852. vaesenc $AESKEY1,$B12_15,$B12_15
  2853. vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1
  2854. vaesenc $AESKEY2,$B00_03,$B00_03
  2855. vaesenc $AESKEY2,$B04_07,$B04_07
  2856. vaesenc $AESKEY2,$B08_11,$B08_11
  2857. vaesenc $AESKEY2,$B12_15,$B12_15
  2858. ___
  2859. if (($NROUNDS == 13)) {
  2860. $code .= <<___;
  2861. vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2
  2862. vaesenc $AESKEY1,$B00_03,$B00_03
  2863. vaesenc $AESKEY1,$B04_07,$B04_07
  2864. vaesenc $AESKEY1,$B08_11,$B08_11
  2865. vaesenc $AESKEY1,$B12_15,$B12_15
  2866. vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1
  2867. vaesenc $AESKEY2,$B00_03,$B00_03
  2868. vaesenc $AESKEY2,$B04_07,$B04_07
  2869. vaesenc $AESKEY2,$B08_11,$B08_11
  2870. vaesenc $AESKEY2,$B12_15,$B12_15
  2871. ___
  2872. }
  2873. }
  2874. # ;; =================================================
  2875. # ;; second phase of the reduction
  2876. if ($DO_REDUCTION eq "final_reduction") {
  2877. $code .= <<___;
  2878. vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
  2879. vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
  2880. vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
  2881. vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
# ;; GH1H = GH1H + RED_T1 + RED_T2
  2883. vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
  2884. ___
  2885. }
  2886. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2887. # ;; the last AES round
  2888. $code .= <<___;
  2889. vaesenclast $AESKEY1,$B00_03,$B00_03
  2890. vaesenclast $AESKEY1,$B04_07,$B04_07
  2891. vaesenclast $AESKEY1,$B08_11,$B08_11
  2892. vaesenclast $AESKEY1,$B12_15,$B12_15
  2893. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2894. # ;; XOR against plain/cipher text
  2895. vpxorq $DATA1,$B00_03,$B00_03
  2896. vpxorq $DATA2,$B04_07,$B04_07
  2897. vpxorq $DATA3,$B08_11,$B08_11
  2898. vpxorq $DATA4,$B12_15,$B12_15
  2899. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2900. # ;; store cipher/plain text
  2901. mov $CIPH_PLAIN_OUT,$IA0
  2902. vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
  2903. vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
  2904. vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
  2905. vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
  2906. ___
  2907. # ;; =================================================
  2908. # ;; shuffle cipher text blocks for GHASH computation
  2909. if ($ENC_DEC eq "ENC") {
  2910. $code .= <<___;
  2911. vpshufb $SHFMSK,$B00_03,$B00_03
  2912. vpshufb $SHFMSK,$B04_07,$B04_07
  2913. vpshufb $SHFMSK,$B08_11,$B08_11
  2914. vpshufb $SHFMSK,$B12_15,$B12_15
  2915. ___
  2916. } else {
  2917. $code .= <<___;
  2918. vpshufb $SHFMSK,$DATA1,$B00_03
  2919. vpshufb $SHFMSK,$DATA2,$B04_07
  2920. vpshufb $SHFMSK,$DATA3,$B08_11
  2921. vpshufb $SHFMSK,$DATA4,$B12_15
  2922. ___
  2923. }
  2924. # ;; =================================================
  2925. # ;; store shuffled cipher text for ghashing
  2926. $code .= <<___;
  2927. vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
  2928. vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
  2929. vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
  2930. vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
  2931. ___
  2932. }
  2933. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  2934. # ;;; Encryption of a single block
  2935. sub ENCRYPT_SINGLE_BLOCK {
  2936. my $AES_KEY = $_[0]; # ; [in]
  2937. my $XMM0 = $_[1]; # ; [in/out]
  2938. my $GPR1 = $_[2]; # ; [clobbered]
  2939. my $rndsuffix = &random_string();
  2940. $code .= <<___;
  2941. # ; load number of rounds from AES_KEY structure (offset in bytes is
  2942. # ; size of the |rd_key| buffer)
  2943. mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]}
  2944. cmp \$9,@{[DWORD($GPR1)]}
  2945. je .Laes_128_${rndsuffix}
  2946. cmp \$11,@{[DWORD($GPR1)]}
  2947. je .Laes_192_${rndsuffix}
  2948. cmp \$13,@{[DWORD($GPR1)]}
  2949. je .Laes_256_${rndsuffix}
  2950. jmp .Lexit_aes_${rndsuffix}
  2951. ___
  2952. for my $keylen (sort keys %aes_rounds) {
  2953. my $nr = $aes_rounds{$keylen};
  2954. $code .= <<___;
  2955. .align 32
  2956. .Laes_${keylen}_${rndsuffix}:
  2957. ___
  2958. $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n";
  2959. for (my $i = 1; $i <= $nr; $i++) {
  2960. $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n";
  2961. }
  2962. $code .= <<___;
  2963. vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0
  2964. jmp .Lexit_aes_${rndsuffix}
  2965. ___
  2966. }
  2967. $code .= ".Lexit_aes_${rndsuffix}:\n\n";
  2968. }
  2969. sub CALC_J0 {
  2970. my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context
  2971. my $IV = $_[1]; #; [in] Pointer to IV
  2972. my $IV_LEN = $_[2]; #; [in] IV length
  2973. my $J0 = $_[3]; #; [out] XMM reg to contain J0
  2974. my $ZT0 = $_[4]; #; [clobbered] ZMM register
  2975. my $ZT1 = $_[5]; #; [clobbered] ZMM register
  2976. my $ZT2 = $_[6]; #; [clobbered] ZMM register
  2977. my $ZT3 = $_[7]; #; [clobbered] ZMM register
  2978. my $ZT4 = $_[8]; #; [clobbered] ZMM register
  2979. my $ZT5 = $_[9]; #; [clobbered] ZMM register
  2980. my $ZT6 = $_[10]; #; [clobbered] ZMM register
  2981. my $ZT7 = $_[11]; #; [clobbered] ZMM register
  2982. my $ZT8 = $_[12]; #; [clobbered] ZMM register
  2983. my $ZT9 = $_[13]; #; [clobbered] ZMM register
  2984. my $ZT10 = $_[14]; #; [clobbered] ZMM register
  2985. my $ZT11 = $_[15]; #; [clobbered] ZMM register
  2986. my $ZT12 = $_[16]; #; [clobbered] ZMM register
  2987. my $ZT13 = $_[17]; #; [clobbered] ZMM register
  2988. my $ZT14 = $_[18]; #; [clobbered] ZMM register
  2989. my $ZT15 = $_[19]; #; [clobbered] ZMM register
  2990. my $ZT16 = $_[20]; #; [clobbered] ZMM register
  2991. my $T1 = $_[21]; #; [clobbered] GP register
  2992. my $T2 = $_[22]; #; [clobbered] GP register
  2993. my $T3 = $_[23]; #; [clobbered] GP register
  2994. my $MASKREG = $_[24]; #; [clobbered] mask register
# ;; J0 = GHASH(IV || 0^(s+64) || [len(IV)]_64)
# ;; s = 16 * RoundUp(len(IV)/16) - len(IV)
  2997. # ;; Calculate GHASH of (IV || 0s)
  2998. $code .= "vpxor $J0,$J0,$J0\n";
  2999. &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
  3000. $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG);
  3001. # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
  3002. $code .= <<___;
  3003. mov $IV_LEN,$T1
  3004. shl \$3,$T1 # ; IV length in bits
  3005. vmovq $T1,@{[XWORD($ZT2)]}
  3006. # ;; Might need shuffle of ZT2
  3007. vpxorq $J0,@{[XWORD($ZT2)]},$J0
  3008. vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]}
  3009. ___
  3010. &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]});
  3011. $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n";
  3012. }
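# ;; =================================================
# ;; Reference note (illustration only, not used by the generated code):
# ;; the final GHASH input block used by CALC_J0 is 64 zero bits followed by
# ;; len(IV) in bits as a 64-bit big-endian value (the register-level byte
# ;; order in the code above differs because GHASH data is kept byte-reflected).
# ;; A hypothetical helper building that 16-byte block (64-bit Perl assumed):
sub _ref_j0_length_block {
  my ($iv_len_bytes) = @_;
  return pack("Q>Q>", 0, $iv_len_bytes * 8);   # 0^64 || [len(IV)]_64
}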
  3013. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3014. # ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for
  3015. # ;;; encoding/decoding.
  3016. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3017. sub GCM_INIT_IV {
  3018. my $AES_KEYS = $_[0]; # [in] AES key schedule
  3019. my $GCM128_CTX = $_[1]; # [in/out] GCM context
  3020. my $IV = $_[2]; # [in] IV pointer
  3021. my $IV_LEN = $_[3]; # [in] IV length
  3022. my $GPR1 = $_[4]; # [clobbered] GP register
  3023. my $GPR2 = $_[5]; # [clobbered] GP register
  3024. my $GPR3 = $_[6]; # [clobbered] GP register
  3025. my $MASKREG = $_[7]; # [clobbered] mask register
  3026. my $CUR_COUNT = $_[8]; # [out] XMM with current counter
  3027. my $ZT0 = $_[9]; # [clobbered] ZMM register
  3028. my $ZT1 = $_[10]; # [clobbered] ZMM register
  3029. my $ZT2 = $_[11]; # [clobbered] ZMM register
  3030. my $ZT3 = $_[12]; # [clobbered] ZMM register
  3031. my $ZT4 = $_[13]; # [clobbered] ZMM register
  3032. my $ZT5 = $_[14]; # [clobbered] ZMM register
  3033. my $ZT6 = $_[15]; # [clobbered] ZMM register
  3034. my $ZT7 = $_[16]; # [clobbered] ZMM register
  3035. my $ZT8 = $_[17]; # [clobbered] ZMM register
  3036. my $ZT9 = $_[18]; # [clobbered] ZMM register
  3037. my $ZT10 = $_[19]; # [clobbered] ZMM register
  3038. my $ZT11 = $_[20]; # [clobbered] ZMM register
  3039. my $ZT12 = $_[21]; # [clobbered] ZMM register
  3040. my $ZT13 = $_[22]; # [clobbered] ZMM register
  3041. my $ZT14 = $_[23]; # [clobbered] ZMM register
  3042. my $ZT15 = $_[24]; # [clobbered] ZMM register
  3043. my $ZT16 = $_[25]; # [clobbered] ZMM register
  3044. my $ZT0x = $ZT0;
  3045. $ZT0x =~ s/zmm/xmm/;
  3046. $code .= <<___;
  3047. cmp \$12,$IV_LEN
  3048. je iv_len_12_init_IV
  3049. ___
  3050. # ;; IV is different than 12 bytes
  3051. &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7,
  3052. $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
  3053. $code .= <<___;
  3054. jmp skip_iv_len_12_init_IV
  3055. iv_len_12_init_IV: # ;; IV is 12 bytes
  3056. # ;; read 12 IV bytes and pad with 0x00000001
  3057. vmovdqu8 ONEf(%rip),$CUR_COUNT
  3058. mov $IV,$GPR2
  3059. mov \$0x0000000000000fff,@{[DWORD($GPR1)]}
  3060. kmovq $GPR1,$MASKREG
  3061. vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1
  3062. skip_iv_len_12_init_IV:
  3063. vmovdqu $CUR_COUNT,$ZT0x
  3064. ___
  3065. &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0)
  3066. $code .= <<___;
  3067. vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage
  3068. # ;; store IV as counter in LE format
  3069. vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT
  3070. vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi
  3071. ___
  3072. }
  3073. sub GCM_UPDATE_AAD {
  3074. my $GCM128_CTX = $_[0]; # [in] GCM context pointer
  3075. my $A_IN = $_[1]; # [in] AAD pointer
  3076. my $A_LEN = $_[2]; # [in] AAD length in bytes
  3077. my $GPR1 = $_[3]; # [clobbered] GP register
  3078. my $GPR2 = $_[4]; # [clobbered] GP register
  3079. my $GPR3 = $_[5]; # [clobbered] GP register
  3080. my $MASKREG = $_[6]; # [clobbered] mask register
  3081. my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value
  3082. my $ZT0 = $_[8]; # [clobbered] ZMM register
  3083. my $ZT1 = $_[9]; # [clobbered] ZMM register
  3084. my $ZT2 = $_[10]; # [clobbered] ZMM register
  3085. my $ZT3 = $_[11]; # [clobbered] ZMM register
  3086. my $ZT4 = $_[12]; # [clobbered] ZMM register
  3087. my $ZT5 = $_[13]; # [clobbered] ZMM register
  3088. my $ZT6 = $_[14]; # [clobbered] ZMM register
  3089. my $ZT7 = $_[15]; # [clobbered] ZMM register
  3090. my $ZT8 = $_[16]; # [clobbered] ZMM register
  3091. my $ZT9 = $_[17]; # [clobbered] ZMM register
  3092. my $ZT10 = $_[18]; # [clobbered] ZMM register
  3093. my $ZT11 = $_[19]; # [clobbered] ZMM register
  3094. my $ZT12 = $_[20]; # [clobbered] ZMM register
  3095. my $ZT13 = $_[21]; # [clobbered] ZMM register
  3096. my $ZT14 = $_[22]; # [clobbered] ZMM register
  3097. my $ZT15 = $_[23]; # [clobbered] ZMM register
  3098. my $ZT16 = $_[24]; # [clobbered] ZMM register
  3099. # ; load current hash
  3100. $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n";
  3101. &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2,
  3102. $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13,
  3103. $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
# ; store updated hash
  3105. $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n";
  3106. }
  3107. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3108. # ;;; Cipher and ghash of payloads shorter than 256 bytes
  3109. # ;;; - number of blocks in the message comes as argument
  3110. # ;;; - depending on the number of blocks an optimized variant of
  3111. # ;;; INITIAL_BLOCKS_PARTIAL is invoked
  3112. sub GCM_ENC_DEC_SMALL {
  3113. my $AES_KEYS = $_[0]; # [in] key pointer
  3114. my $GCM128_CTX = $_[1]; # [in] context pointer
  3115. my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
  3116. my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
  3117. my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
  3118. my $ENC_DEC = $_[5]; # [in] cipher direction
  3119. my $DATA_OFFSET = $_[6]; # [in] data offset
  3120. my $LENGTH = $_[7]; # [in] data length
  3121. my $NUM_BLOCKS = $_[8]; # [in] number of blocks to process 1 to 16
  3122. my $CTR = $_[9]; # [in/out] XMM counter block
  3123. my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value
  3124. my $ZTMP0 = $_[11]; # [clobbered] ZMM register
  3125. my $ZTMP1 = $_[12]; # [clobbered] ZMM register
  3126. my $ZTMP2 = $_[13]; # [clobbered] ZMM register
  3127. my $ZTMP3 = $_[14]; # [clobbered] ZMM register
  3128. my $ZTMP4 = $_[15]; # [clobbered] ZMM register
  3129. my $ZTMP5 = $_[16]; # [clobbered] ZMM register
  3130. my $ZTMP6 = $_[17]; # [clobbered] ZMM register
  3131. my $ZTMP7 = $_[18]; # [clobbered] ZMM register
  3132. my $ZTMP8 = $_[19]; # [clobbered] ZMM register
  3133. my $ZTMP9 = $_[20]; # [clobbered] ZMM register
  3134. my $ZTMP10 = $_[21]; # [clobbered] ZMM register
  3135. my $ZTMP11 = $_[22]; # [clobbered] ZMM register
  3136. my $ZTMP12 = $_[23]; # [clobbered] ZMM register
  3137. my $ZTMP13 = $_[24]; # [clobbered] ZMM register
  3138. my $ZTMP14 = $_[25]; # [clobbered] ZMM register
  3139. my $IA0 = $_[26]; # [clobbered] GP register
  3140. my $IA1 = $_[27]; # [clobbered] GP register
  3141. my $MASKREG = $_[28]; # [clobbered] mask register
  3142. my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask
  3143. my $PBLOCK_LEN = $_[30]; # [in] partial block length
  3144. my $rndsuffix = &random_string();
  3145. $code .= <<___;
  3146. cmp \$8,$NUM_BLOCKS
  3147. je .L_small_initial_num_blocks_is_8_${rndsuffix}
  3148. jl .L_small_initial_num_blocks_is_7_1_${rndsuffix}
  3149. cmp \$12,$NUM_BLOCKS
  3150. je .L_small_initial_num_blocks_is_12_${rndsuffix}
  3151. jl .L_small_initial_num_blocks_is_11_9_${rndsuffix}
  3152. # ;; 16, 15, 14 or 13
  3153. cmp \$16,$NUM_BLOCKS
  3154. je .L_small_initial_num_blocks_is_16_${rndsuffix}
  3155. cmp \$15,$NUM_BLOCKS
  3156. je .L_small_initial_num_blocks_is_15_${rndsuffix}
  3157. cmp \$14,$NUM_BLOCKS
  3158. je .L_small_initial_num_blocks_is_14_${rndsuffix}
  3159. jmp .L_small_initial_num_blocks_is_13_${rndsuffix}
  3160. .L_small_initial_num_blocks_is_11_9_${rndsuffix}:
  3161. # ;; 11, 10 or 9
  3162. cmp \$11,$NUM_BLOCKS
  3163. je .L_small_initial_num_blocks_is_11_${rndsuffix}
  3164. cmp \$10,$NUM_BLOCKS
  3165. je .L_small_initial_num_blocks_is_10_${rndsuffix}
  3166. jmp .L_small_initial_num_blocks_is_9_${rndsuffix}
  3167. .L_small_initial_num_blocks_is_7_1_${rndsuffix}:
  3168. cmp \$4,$NUM_BLOCKS
  3169. je .L_small_initial_num_blocks_is_4_${rndsuffix}
  3170. jl .L_small_initial_num_blocks_is_3_1_${rndsuffix}
  3171. # ;; 7, 6 or 5
  3172. cmp \$7,$NUM_BLOCKS
  3173. je .L_small_initial_num_blocks_is_7_${rndsuffix}
  3174. cmp \$6,$NUM_BLOCKS
  3175. je .L_small_initial_num_blocks_is_6_${rndsuffix}
  3176. jmp .L_small_initial_num_blocks_is_5_${rndsuffix}
  3177. .L_small_initial_num_blocks_is_3_1_${rndsuffix}:
  3178. # ;; 3, 2 or 1
  3179. cmp \$3,$NUM_BLOCKS
  3180. je .L_small_initial_num_blocks_is_3_${rndsuffix}
  3181. cmp \$2,$NUM_BLOCKS
  3182. je .L_small_initial_num_blocks_is_2_${rndsuffix}
  3183. # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed
  3184. # ;; Generation of different block size variants
  3185. # ;; - one block size has to be the first one
  3186. ___
  3187. for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) {
  3188. $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${rndsuffix}:\n";
  3189. &INITIAL_BLOCKS_PARTIAL(
  3190. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET,
  3191. $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1,
  3192. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3193. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3194. $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN);
  3195. if ($num_blocks != 16) {
  3196. $code .= "jmp .L_small_initial_blocks_encrypted_${rndsuffix}\n";
  3197. }
  3198. }
  3199. $code .= ".L_small_initial_blocks_encrypted_${rndsuffix}:\n";
  3200. }
  3201. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3202. # ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context
  3203. # ; struct has been initialized by GCM_INIT_IV
# ; Requires the input data to be at least 1 byte long because of READ_SMALL_INPUT_DATA.
  3205. # ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
  3206. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3207. sub GCM_ENC_DEC {
  3208. my $AES_KEYS = $_[0]; # [in] AES Key schedule
  3209. my $GCM128_CTX = $_[1]; # [in] context pointer
  3210. my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update
  3211. my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer
  3212. my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
  3213. my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer
  3214. my $ENC_DEC = $_[6]; # [in] cipher direction
  3215. my $IA0 = "%r10";
  3216. my $IA1 = "%r12";
  3217. my $IA2 = "%r13";
  3218. my $IA3 = "%r15";
  3219. my $IA4 = "%r11";
  3220. my $IA5 = "%rax";
  3221. my $IA6 = "%rbx";
  3222. my $IA7 = "%r14";
  3223. my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;
  3224. my $CTR_CHECK = $IA3;
  3225. my $DATA_OFFSET = $IA4;
  3226. my $HASHK_PTR = $IA6;
  3227. my $HKEYS_READY = $IA7;
  3228. my $CTR_BLOCKz = "%zmm2";
  3229. my $CTR_BLOCKx = "%xmm2";
  3230. # ; hardcoded in GCM_INIT
  3231. my $AAD_HASHz = "%zmm14";
  3232. my $AAD_HASHx = "%xmm14";
  3233. # ; hardcoded in GCM_COMPLETE
  3234. my $ZTMP0 = "%zmm0";
  3235. my $ZTMP1 = "%zmm3";
  3236. my $ZTMP2 = "%zmm4";
  3237. my $ZTMP3 = "%zmm5";
  3238. my $ZTMP4 = "%zmm6";
  3239. my $ZTMP5 = "%zmm7";
  3240. my $ZTMP6 = "%zmm10";
  3241. my $ZTMP7 = "%zmm11";
  3242. my $ZTMP8 = "%zmm12";
  3243. my $ZTMP9 = "%zmm13";
  3244. my $ZTMP10 = "%zmm15";
  3245. my $ZTMP11 = "%zmm16";
  3246. my $ZTMP12 = "%zmm17";
  3247. my $ZTMP13 = "%zmm19";
  3248. my $ZTMP14 = "%zmm20";
  3249. my $ZTMP15 = "%zmm21";
  3250. my $ZTMP16 = "%zmm30";
  3251. my $ZTMP17 = "%zmm31";
  3252. my $ZTMP18 = "%zmm1";
  3253. my $ZTMP19 = "%zmm18";
  3254. my $ZTMP20 = "%zmm8";
  3255. my $ZTMP21 = "%zmm22";
  3256. my $ZTMP22 = "%zmm23";
  3257. my $GH = "%zmm24";
  3258. my $GL = "%zmm25";
  3259. my $GM = "%zmm26";
  3260. my $SHUF_MASK = "%zmm29";
  3261. # ; Unused in the small packet path
  3262. my $ADDBE_4x4 = "%zmm27";
  3263. my $ADDBE_1234 = "%zmm28";
  3264. my $MASKREG = "%k1";
  3265. my $rndsuffix = &random_string();
  3266. # ;; reduction every 48 blocks, depth 32 blocks
  3267. # ;; @note 48 blocks is the maximum capacity of the stack frame
  3268. my $big_loop_nblocks = 48;
  3269. my $big_loop_depth = 32;
  3270. # ;;; Macro flow depending on packet size
  3271. # ;;; - LENGTH <= 16 blocks
  3272. # ;;; - cipher followed by hashing (reduction)
  3273. # ;;; - 16 blocks < LENGTH < 32 blocks
  3274. # ;;; - cipher 16 blocks
  3275. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3276. # ;;; - 32 blocks < LENGTH < 48 blocks
  3277. # ;;; - cipher 2 x 16 blocks
  3278. # ;;; - hash 16 blocks
  3279. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3280. # ;;; - LENGTH >= 48 blocks
  3281. # ;;; - cipher 2 x 16 blocks
  3282. # ;;; - while (data_to_cipher >= 48 blocks):
  3283. # ;;; - cipher 16 blocks & hash 16 blocks
  3284. # ;;; - cipher 16 blocks & hash 16 blocks
  3285. # ;;; - cipher 16 blocks & hash 16 blocks (reduction)
  3286. # ;;; - if (data_to_cipher >= 32 blocks):
  3287. # ;;; - cipher 16 blocks & hash 16 blocks
  3288. # ;;; - cipher 16 blocks & hash 16 blocks
  3289. # ;;; - hash 16 blocks (reduction)
  3290. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3291. # ;;; - elif (data_to_cipher >= 16 blocks):
  3292. # ;;; - cipher 16 blocks & hash 16 blocks
  3293. # ;;; - hash 16 blocks
  3294. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  3295. # ;;; - else:
  3296. # ;;; - hash 16 blocks
  3297. # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
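# ;;; Illustrative walk-through (comment only, not generated code): for a
# ;;; 100-block message the path is: cipher blocks 0-31 up front, one pass of
# ;;; the 48-block loop (cipher blocks 32-79 while hashing 0-47), then the
# ;;; ">= 16 blocks" tail (cipher 80-95 while hashing 48-63, hash 64-79), and
# ;;; finally cipher the last 4 blocks and hash them together with blocks 80-95.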
  3298. if ($win64) {
  3299. $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n";
  3300. } else {
  3301. $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
  3302. }
  3303. $code .= "je .L_enc_dec_done_${rndsuffix}\n";
3304. # Length value from context $CTX_OFFSET_InLen($GCM128_CTX) is updated in
  3305. # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc'
  3306. $code .= "xor $HKEYS_READY, $HKEYS_READY\n";
  3307. $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";
  3308. # ;; Used for the update flow - if there was a previous partial
  3309. # ;; block fill the remaining bytes here.
  3310. &PARTIAL_BLOCK(
  3311. $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
  3312. $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1,
  3313. $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3,
  3314. $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG);
  3315. $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";
  3316. # ;; Save the amount of data left to process in $LENGTH
3317. # ;; NOTE: on Linux PLAIN_CIPH_LEN is a register and $LENGTH aliases it;
  3318. if ($win64) {
  3319. $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n";
  3320. }
  3321. # ;; There may be no more data if it was consumed in the partial block.
  3322. $code .= <<___;
  3323. sub $DATA_OFFSET,$LENGTH
  3324. je .L_enc_dec_done_${rndsuffix}
  3325. ___
  3326. $code .= <<___;
  3327. cmp \$`(16 * 16)`,$LENGTH
  3328. jbe .L_message_below_equal_16_blocks_${rndsuffix}
  3329. vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK
  3330. vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
  3331. vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
  3332. # ;; start the pipeline
  3333. # ;; - 32 blocks aes-ctr
  3334. # ;; - 16 blocks ghash + aes-ctr
  3335. # ;; set up CTR_CHECK
  3336. vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
  3337. and \$255,@{[DWORD($CTR_CHECK)]}
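# ;; CTR_CHECK caches the low byte of the 32-bit counter; later code only
# ;; falls back to the slow little-endian counter update when adding 16
# ;; would carry out of that byte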
  3338. # ;; in LE format after init, convert to BE
  3339. vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
  3340. vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
  3341. ___
  3342. # ;; ==== AES-CTR - first 16 blocks
  3343. my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3344. my $data_in_out_offset = 0;
  3345. &INITIAL_BLOCKS_16(
  3346. $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
  3347. $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
  3348. $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
  3349. $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
  3350. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3351. "first16");
  3352. $code .= <<___;
  3353. cmp \$`(32 * 16)`,$LENGTH
  3354. jb .L_message_below_32_blocks_${rndsuffix}
  3355. ___
  3356. # ;; ==== AES-CTR - next 16 blocks
  3357. $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3358. $data_in_out_offset = (16 * 16);
  3359. &INITIAL_BLOCKS_16(
  3360. $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
  3361. $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
  3362. $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
  3363. $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
  3364. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3365. "last32");
  3366. $code .= "mov \$1,$HKEYS_READY\n";
  3367. $code .= <<___;
  3368. add \$`(32 * 16)`,$DATA_OFFSET
  3369. sub \$`(32 * 16)`,$LENGTH
  3370. cmp \$`($big_loop_nblocks * 16)`,$LENGTH
  3371. jb .L_no_more_big_nblocks_${rndsuffix}
  3372. ___
  3373. # ;; ====
  3374. # ;; ==== AES-CTR + GHASH - 48 blocks loop
  3375. # ;; ====
  3376. $code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n";
  3377. # ;; ==== AES-CTR + GHASH - 16 blocks, start
  3378. $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3379. $data_in_out_offset = (0 * 16);
  3380. my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3381. &GHASH_16_ENCRYPT_16_PARALLEL(
  3382. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3383. 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3384. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3385. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3386. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3387. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3388. $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
  3389. $IA0);
  3390. # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  3391. $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3392. $data_in_out_offset = (16 * 16);
  3393. $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3394. &GHASH_16_ENCRYPT_16_PARALLEL(
  3395. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3396. 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3397. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3398. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3399. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3400. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3401. $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
  3402. $IA0);
  3403. # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
  3404. $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3405. $data_in_out_offset = (32 * 16);
  3406. $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3407. &GHASH_16_ENCRYPT_16_PARALLEL(
  3408. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3409. 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3410. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3411. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3412. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3413. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3414. $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
  3415. $IA0);
  3416. # ;; === xor cipher block 0 with GHASH (ZT4)
  3417. $code .= <<___;
  3418. vmovdqa64 $ZTMP4,$AAD_HASHz
  3419. add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
  3420. sub \$`($big_loop_nblocks * 16)`,$LENGTH
  3421. cmp \$`($big_loop_nblocks * 16)`,$LENGTH
  3422. jae .L_encrypt_big_nblocks_${rndsuffix}
  3423. .L_no_more_big_nblocks_${rndsuffix}:
  3424. cmp \$`(32 * 16)`,$LENGTH
  3425. jae .L_encrypt_32_blocks_${rndsuffix}
  3426. cmp \$`(16 * 16)`,$LENGTH
  3427. jae .L_encrypt_16_blocks_${rndsuffix}
  3428. ___
  3429. # ;; =====================================================
  3430. # ;; =====================================================
  3431. # ;; ==== GHASH 1 x 16 blocks
  3432. # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  3433. # ;; ==== then GHASH N blocks
  3434. $code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n";
  3435. # ;; calculate offset to the right hash key
  3436. $code .= <<___;
  3437. mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
  3438. and \$~15,@{[DWORD($IA0)]}
  3439. mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  3440. sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
  3441. ___
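# ;; At this point HASHK_PTR accounts for the 32 stashed blocks plus the
# ;; remaining whole blocks of data, so that GHASH walks the key powers down
# ;; and the very last block of the message is multiplied by H^1.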
3442. # ;; ==== GHASH 32 blocks, followed by reduction
  3443. &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
  3444. "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3445. # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  3446. $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3447. $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
  3448. &GCM_ENC_DEC_LAST(
  3449. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
  3450. $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3451. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3452. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3453. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
  3454. $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3455. "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
  3456. $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
  3457. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3458. $code .= "jmp .L_ghash_done_${rndsuffix}\n";
  3459. # ;; =====================================================
  3460. # ;; =====================================================
  3461. # ;; ==== GHASH & encrypt 1 x 16 blocks
  3462. # ;; ==== GHASH & encrypt 1 x 16 blocks
  3463. # ;; ==== GHASH 1 x 16 blocks (reduction)
  3464. # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  3465. # ;; ==== then GHASH N blocks
  3466. $code .= ".L_encrypt_32_blocks_${rndsuffix}:\n";
  3467. # ;; ==== AES-CTR + GHASH - 16 blocks, start
  3468. $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3469. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3470. $data_in_out_offset = (0 * 16);
  3471. &GHASH_16_ENCRYPT_16_PARALLEL(
  3472. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3473. 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3474. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3475. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3476. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3477. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3478. $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
  3479. $IA0);
  3480. # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  3481. $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3482. $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  3483. $data_in_out_offset = (16 * 16);
  3484. &GHASH_16_ENCRYPT_16_PARALLEL(
  3485. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3486. 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3487. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3488. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3489. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3490. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3491. $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
  3492. $IA0);
  3493. # ;; ==== GHASH 16 blocks with reduction
  3494. &GHASH_16(
  3495. "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
  3496. "%rsp", &HashKeyOffsetByIdx(16, "frame"),
  3497. 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3498. # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  3499. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3500. $code .= <<___;
  3501. sub \$`(32 * 16)`,$LENGTH
  3502. add \$`(32 * 16)`,$DATA_OFFSET
  3503. ___
  3504. # ;; calculate offset to the right hash key
  3505. $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  3506. $code .= <<___;
  3507. and \$~15,@{[DWORD($IA0)]}
  3508. mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  3509. sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
  3510. ___
  3511. &GCM_ENC_DEC_LAST(
  3512. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
  3513. $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3514. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3515. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3516. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
  3517. $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3518. "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
  3519. $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
  3520. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3521. $code .= "jmp .L_ghash_done_${rndsuffix}\n";
  3522. # ;; =====================================================
  3523. # ;; =====================================================
  3524. # ;; ==== GHASH & encrypt 16 blocks (done before)
  3525. # ;; ==== GHASH 1 x 16 blocks
  3526. # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  3527. # ;; ==== then GHASH N blocks
  3528. $code .= ".L_encrypt_16_blocks_${rndsuffix}:\n";
  3529. # ;; ==== AES-CTR + GHASH - 16 blocks, start
  3530. $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3531. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3532. $data_in_out_offset = (0 * 16);
  3533. &GHASH_16_ENCRYPT_16_PARALLEL(
  3534. $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
  3535. 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
  3536. $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
  3537. $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
  3538. $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
  3539. $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
  3540. $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
  3541. $IA0);
  3542. # ;; ==== GHASH 1 x 16 blocks
  3543. &GHASH_16(
  3544. "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
  3545. "%rsp", &HashKeyOffsetByIdx(32, "frame"),
  3546. 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3547. # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  3548. $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  3549. $code .= <<___;
  3550. sub \$`(16 * 16)`,$LENGTH
  3551. add \$`(16 * 16)`,$DATA_OFFSET
  3552. ___
  3553. &GCM_ENC_DEC_LAST(
  3554. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
  3555. $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
  3556. &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3557. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
  3558. $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
  3559. $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3560. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
  3561. $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
  3562. $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3563. "end_reduce", $GL, $GH, $GM,
  3564. $ENC_DEC, $AAD_HASHz, $IA0, $IA5,
  3565. $MASKREG, $PBLOCK_LEN);
  3566. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3567. $code .= <<___;
  3568. jmp .L_ghash_done_${rndsuffix}
  3569. .L_message_below_32_blocks_${rndsuffix}:
  3570. # ;; 32 > number of blocks > 16
  3571. sub \$`(16 * 16)`,$LENGTH
  3572. add \$`(16 * 16)`,$DATA_OFFSET
  3573. ___
  3574. $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  3575. # ;; calculate offset to the right hash key
  3576. $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  3577. &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3578. "mid16");
  3579. $code .= "mov \$1,$HKEYS_READY\n";
  3580. $code .= <<___;
  3581. and \$~15,@{[DWORD($IA0)]}
  3582. mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
  3583. sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
  3584. ___
  3585. &GCM_ENC_DEC_LAST(
  3586. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
  3587. $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
  3588. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3589. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3590. $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
  3591. $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
  3592. "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
  3593. $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
  3594. $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  3595. $code .= <<___;
  3596. jmp .L_ghash_done_${rndsuffix}
  3597. .L_message_below_equal_16_blocks_${rndsuffix}:
  3598. # ;; Determine how many blocks to process
  3599. # ;; - process one additional block if there is a partial block
  3600. mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
  3601. add \$15,@{[DWORD($IA1)]}
3602. shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 1 to 16
  3603. ___
  3604. &GCM_ENC_DEC_SMALL(
  3605. $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
  3606. $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
  3607. $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
  3608. $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
  3609. $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
  3610. $PBLOCK_LEN);
  3611. # ;; fall through to exit
  3612. $code .= ".L_ghash_done_${rndsuffix}:\n";
  3613. # ;; save the last counter block
  3614. $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
  3615. $code .= <<___;
  3616. vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
  3617. .L_enc_dec_done_${rndsuffix}:
  3618. ___
  3619. }
  3620. # ;;; ===========================================================================
  3621. # ;;; Encrypt/decrypt the initial 16 blocks
  3622. sub INITIAL_BLOCKS_16 {
  3623. my $IN = $_[0]; # [in] input buffer
  3624. my $OUT = $_[1]; # [in] output buffer
  3625. my $AES_KEYS = $_[2]; # [in] pointer to expanded keys
  3626. my $DATA_OFFSET = $_[3]; # [in] data offset
  3627. my $GHASH = $_[4]; # [in] ZMM with AAD (low 128 bits)
  3628. my $CTR = $_[5]; # [in] ZMM with CTR BE blocks 4x128 bits
  3629. my $CTR_CHECK = $_[6]; # [in/out] GPR with counter overflow check
  3630. my $ADDBE_4x4 = $_[7]; # [in] ZMM 4x128bits with value 4 (big endian)
  3631. my $ADDBE_1234 = $_[8]; # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
3632. my $T0 = $_[9]; # [clobbered] temporary ZMM register
3633. my $T1 = $_[10]; # [clobbered] temporary ZMM register
3634. my $T2 = $_[11]; # [clobbered] temporary ZMM register
3635. my $T3 = $_[12]; # [clobbered] temporary ZMM register
3636. my $T4 = $_[13]; # [clobbered] temporary ZMM register
3637. my $T5 = $_[14]; # [clobbered] temporary ZMM register
3638. my $T6 = $_[15]; # [clobbered] temporary ZMM register
3639. my $T7 = $_[16]; # [clobbered] temporary ZMM register
3640. my $T8 = $_[17]; # [clobbered] temporary ZMM register
  3641. my $SHUF_MASK = $_[18]; # [in] ZMM with BE/LE shuffle mask
  3642. my $ENC_DEC = $_[19]; # [in] ENC (encrypt) or DEC (decrypt) selector
  3643. my $BLK_OFFSET = $_[20]; # [in] stack frame offset to ciphered blocks
  3644. my $DATA_DISPL = $_[21]; # [in] fixed numerical data displacement/offset
3645. my $IA0 = $_[22]; # [clobbered] temporary GP register
  3646. my $B00_03 = $T5;
  3647. my $B04_07 = $T6;
  3648. my $B08_11 = $T7;
  3649. my $B12_15 = $T8;
  3650. my $rndsuffix = &random_string();
  3651. my $stack_offset = $BLK_OFFSET;
  3652. $code .= <<___;
  3653. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3654. # ;; prepare counter blocks
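# ;; counter blocks are kept byte-reflected (big endian); the vpaddd fast
# ;; path below is only valid while the increments stay within the low
# ;; counter byte, hence the (256 - 16) check on CTR_CHECK; on overflow the
# ;; counter is shuffled to little endian, incremented and shuffled back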
  3655. cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
  3656. jae .L_next_16_overflow_${rndsuffix}
  3657. vpaddd $ADDBE_1234,$CTR,$B00_03
  3658. vpaddd $ADDBE_4x4,$B00_03,$B04_07
  3659. vpaddd $ADDBE_4x4,$B04_07,$B08_11
  3660. vpaddd $ADDBE_4x4,$B08_11,$B12_15
  3661. jmp .L_next_16_ok_${rndsuffix}
  3662. .L_next_16_overflow_${rndsuffix}:
  3663. vpshufb $SHUF_MASK,$CTR,$CTR
  3664. vmovdqa64 ddq_add_4444(%rip),$B12_15
  3665. vpaddd ddq_add_1234(%rip),$CTR,$B00_03
  3666. vpaddd $B12_15,$B00_03,$B04_07
  3667. vpaddd $B12_15,$B04_07,$B08_11
  3668. vpaddd $B12_15,$B08_11,$B12_15
  3669. vpshufb $SHUF_MASK,$B00_03,$B00_03
  3670. vpshufb $SHUF_MASK,$B04_07,$B04_07
  3671. vpshufb $SHUF_MASK,$B08_11,$B08_11
  3672. vpshufb $SHUF_MASK,$B12_15,$B12_15
  3673. .L_next_16_ok_${rndsuffix}:
  3674. vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR
  3675. addb \$16,@{[BYTE($CTR_CHECK)]}
  3676. # ;; === load 16 blocks of data
  3677. vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0
  3678. vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1
  3679. vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2
  3680. vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3
  3681. # ;; move to AES encryption rounds
  3682. vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4
  3683. vpxorq $T4,$B00_03,$B00_03
  3684. vpxorq $T4,$B04_07,$B04_07
  3685. vpxorq $T4,$B08_11,$B08_11
  3686. vpxorq $T4,$B12_15,$B12_15
  3687. ___
  3688. foreach (1 .. ($NROUNDS)) {
  3689. $code .= <<___;
  3690. vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4
  3691. vaesenc $T4,$B00_03,$B00_03
  3692. vaesenc $T4,$B04_07,$B04_07
  3693. vaesenc $T4,$B08_11,$B08_11
  3694. vaesenc $T4,$B12_15,$B12_15
  3695. ___
  3696. }
  3697. $code .= <<___;
  3698. vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4
  3699. vaesenclast $T4,$B00_03,$B00_03
  3700. vaesenclast $T4,$B04_07,$B04_07
  3701. vaesenclast $T4,$B08_11,$B08_11
  3702. vaesenclast $T4,$B12_15,$B12_15
  3703. # ;; xor against text
  3704. vpxorq $T0,$B00_03,$B00_03
  3705. vpxorq $T1,$B04_07,$B04_07
  3706. vpxorq $T2,$B08_11,$B08_11
  3707. vpxorq $T3,$B12_15,$B12_15
  3708. # ;; store
  3709. mov $OUT, $IA0
  3710. vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1)
  3711. vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1)
  3712. vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1)
  3713. vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1)
  3714. ___
  3715. if ($ENC_DEC eq "DEC") {
  3716. $code .= <<___;
  3717. # ;; decryption - cipher text needs to go to GHASH phase
  3718. vpshufb $SHUF_MASK,$T0,$B00_03
  3719. vpshufb $SHUF_MASK,$T1,$B04_07
  3720. vpshufb $SHUF_MASK,$T2,$B08_11
  3721. vpshufb $SHUF_MASK,$T3,$B12_15
  3722. ___
  3723. } else {
  3724. $code .= <<___;
  3725. # ;; encryption
  3726. vpshufb $SHUF_MASK,$B00_03,$B00_03
  3727. vpshufb $SHUF_MASK,$B04_07,$B04_07
  3728. vpshufb $SHUF_MASK,$B08_11,$B08_11
  3729. vpshufb $SHUF_MASK,$B12_15,$B12_15
  3730. ___
  3731. }
  3732. if ($GHASH ne "no_ghash") {
  3733. $code .= <<___;
  3734. # ;; === xor cipher block 0 with GHASH for the next GHASH round
  3735. vpxorq $GHASH,$B00_03,$B00_03
  3736. ___
  3737. }
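# ;; Stash the byte-reflected blocks in the stack frame; the GHASH passes
# ;; later read them from there instead of re-reading the data buffer.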
  3738. $code .= <<___;
  3739. vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp)
  3740. vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp)
  3741. vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp)
  3742. vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp)
  3743. ___
  3744. }
  3745. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3746. # ; GCM_COMPLETE finishes the GHASH calculation and computes the tag E(K,Y0) XOR GHASH, storing it in the context AadHash field
  3747. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3748. sub GCM_COMPLETE {
  3749. my $GCM128_CTX = $_[0];
  3750. my $PBLOCK_LEN = $_[1];
  3751. my $rndsuffix = &random_string();
  3752. $code .= <<___;
  3753. vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2
  3754. vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0)
  3755. ___
  3756. $code .= <<___;
  3757. vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4
  3758. # ;; Process the final partial block.
  3759. cmp \$0,$PBLOCK_LEN
  3760. je .L_partial_done_${rndsuffix}
  3761. ___
3762. # ; GHASH computation for the final partial (<16-byte) block
  3763. &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
  3764. $code .= <<___;
  3765. .L_partial_done_${rndsuffix}:
  3766. vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5
  3767. vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C)
  3768. vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits
  3769. vpxor %xmm5,%xmm4,%xmm4
  3770. ___
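# ;; xmm4 now holds AadHash XOR (len(A)||len(C)), both lengths in bits - the
# ;; final 128-bit block of the GHASH input.  One more GHASH multiply yields
# ;; S = GHASH(H, A, C); the tag below is E(K,Y0) XOR S (kept in xmm3).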
  3771. &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
  3772. $code .= <<___;
  3773. vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16Byte swap
  3774. vpxor %xmm4,%xmm3,%xmm3
  3775. .L_return_T_${rndsuffix}:
  3776. vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX)
  3777. ___
  3778. }
  3779. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3780. # ;;; Functions definitions
  3781. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3782. $code .= ".text\n";
  3783. {
  3784. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3785. # ;void ossl_aes_gcm_init_avx512 /
  3786. # ; (const void *aes_keys,
  3787. # ; void *gcm128ctx)
  3788. # ;
  3789. # ; Precomputes hashkey table for GHASH optimization.
  3790. # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
  3791. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3792. $code .= <<___;
  3793. .globl ossl_aes_gcm_init_avx512
  3794. .type ossl_aes_gcm_init_avx512,\@abi-omnipotent
  3795. .align 32
  3796. ossl_aes_gcm_init_avx512:
  3797. .cfi_startproc
  3798. endbranch
  3799. ___
  3800. if ($CHECK_FUNCTION_ARGUMENTS) {
  3801. $code .= <<___;
  3802. # ;; Check aes_keys != NULL
  3803. test $arg1,$arg1
  3804. jz .Labort_init
  3805. # ;; Check gcm128ctx != NULL
  3806. test $arg2,$arg2
  3807. jz .Labort_init
  3808. ___
  3809. }
  3810. $code .= "vpxorq %xmm16,%xmm16,%xmm16\n";
  3811. &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey
  3812. $code .= <<___;
  3813. vpshufb SHUF_MASK(%rip),%xmm16,%xmm16
  3814. # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
  3815. vmovdqa64 %xmm16,%xmm2
  3816. vpsllq \$1,%xmm16,%xmm16
  3817. vpsrlq \$63,%xmm2,%xmm2
  3818. vmovdqa %xmm2,%xmm1
  3819. vpslldq \$8,%xmm2,%xmm2
  3820. vpsrldq \$8,%xmm1,%xmm1
  3821. vporq %xmm2,%xmm16,%xmm16
  3822. # ;reduction
  3823. vpshufd \$0b00100100,%xmm1,%xmm2
  3824. vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
  3825. vpand POLY(%rip),%xmm2,%xmm2
  3826. vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly
  3827. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3828. vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
  3829. ___
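# ;; The sequence above computes HashKey<<1 mod poly: a 128-bit left shift
# ;; assembled from the two 64-bit halves, followed by a conditional XOR with
# ;; POLY when the shifted-out top bit was set (the TWOONE compare builds that
# ;; mask).  PRECOMPUTE below derives the remaining hash key powers from it.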
  3830. &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
  3831. if ($CLEAR_SCRATCH_REGISTERS) {
  3832. &clear_scratch_gps_asm();
  3833. &clear_scratch_zmms_asm();
  3834. } else {
  3835. $code .= "vzeroupper\n";
  3836. }
  3837. $code .= <<___;
  3838. .Labort_init:
  3839. ret
  3840. .cfi_endproc
  3841. .size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
  3842. ___
  3843. }
  3844. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3845. # ;void ossl_aes_gcm_setiv_avx512
  3846. # ; (const void *aes_keys,
  3847. # ; void *gcm128ctx,
  3848. # ; const unsigned char *iv,
  3849. # ; size_t ivlen)
  3850. # ;
  3851. # ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
  3852. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3853. $code .= <<___;
  3854. .globl ossl_aes_gcm_setiv_avx512
  3855. .type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
  3856. .align 32
  3857. ossl_aes_gcm_setiv_avx512:
  3858. .cfi_startproc
  3859. .Lsetiv_seh_begin:
  3860. endbranch
  3861. ___
  3862. if ($CHECK_FUNCTION_ARGUMENTS) {
  3863. $code .= <<___;
  3864. # ;; Check aes_keys != NULL
  3865. test $arg1,$arg1
  3866. jz .Labort_setiv
  3867. # ;; Check gcm128ctx != NULL
  3868. test $arg2,$arg2
  3869. jz .Labort_setiv
  3870. # ;; Check iv != NULL
  3871. test $arg3,$arg3
  3872. jz .Labort_setiv
  3873. # ;; Check ivlen != 0
  3874. test $arg4,$arg4
  3875. jz .Labort_setiv
  3876. ___
  3877. }
  3878. # ; NOTE: code before PROLOG() must not modify any registers
  3879. &PROLOG(
  3880. 1, # allocate stack space for hkeys
  3881. 0, # do not allocate stack space for AES blocks
  3882. "setiv");
  3883. &GCM_INIT_IV(
  3884. "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1",
  3885. "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12",
  3886. "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
  3887. &EPILOG(
  3888. 1, # hkeys were allocated
  3889. $arg4);
  3890. $code .= <<___;
  3891. .Labort_setiv:
  3892. ret
  3893. .Lsetiv_seh_end:
  3894. .cfi_endproc
  3895. .size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
  3896. ___
  3897. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3898. # ;void ossl_aes_gcm_update_aad_avx512
  3899. # ; (unsigned char *gcm128ctx,
  3900. # ; const unsigned char *aad,
  3901. # ; size_t aadlen)
  3902. # ;
  3903. # ; Updates AAD hash in gcm128_context structure.
  3904. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3905. $code .= <<___;
  3906. .globl ossl_aes_gcm_update_aad_avx512
  3907. .type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
  3908. .align 32
  3909. ossl_aes_gcm_update_aad_avx512:
  3910. .cfi_startproc
  3911. .Lghash_seh_begin:
  3912. endbranch
  3913. ___
  3914. if ($CHECK_FUNCTION_ARGUMENTS) {
  3915. $code .= <<___;
  3916. # ;; Check gcm128ctx != NULL
  3917. test $arg1,$arg1
  3918. jz .Lexit_update_aad
  3919. # ;; Check aad != NULL
  3920. test $arg2,$arg2
  3921. jz .Lexit_update_aad
  3922. # ;; Check aadlen != 0
  3923. test $arg3,$arg3
  3924. jz .Lexit_update_aad
  3925. ___
  3926. }
  3927. # ; NOTE: code before PROLOG() must not modify any registers
  3928. &PROLOG(
  3929. 1, # allocate stack space for hkeys,
  3930. 0, # do not allocate stack space for AES blocks
  3931. "ghash");
  3932. &GCM_UPDATE_AAD(
  3933. "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11",
  3934. "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13",
  3935. "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
  3936. &EPILOG(
  3937. 1, # hkeys were allocated
  3938. $arg3);
  3939. $code .= <<___;
  3940. .Lexit_update_aad:
  3941. ret
  3942. .Lghash_seh_end:
  3943. .cfi_endproc
  3944. .size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
  3945. ___
  3946. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3947. # ;void ossl_aes_gcm_encrypt_avx512
  3948. # ; (const void* aes_keys,
  3949. # ; void *gcm128ctx,
  3950. # ; unsigned int *pblocklen,
  3951. # ; const unsigned char *in,
  3952. # ; size_t len,
  3953. # ; unsigned char *out);
  3954. # ;
3955. # ; Performs encryption of data |in| of length |len| and stores the output in |out|.
  3956. # ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
  3957. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3958. $code .= <<___;
  3959. .globl ossl_aes_gcm_encrypt_avx512
  3960. .type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
  3961. .align 32
  3962. ossl_aes_gcm_encrypt_avx512:
  3963. .cfi_startproc
  3964. .Lencrypt_seh_begin:
  3965. endbranch
  3966. ___
  3967. # ; NOTE: code before PROLOG() must not modify any registers
  3968. &PROLOG(
  3969. 1, # allocate stack space for hkeys
  3970. 1, # allocate stack space for AES blocks
  3971. "encrypt");
  3972. if ($CHECK_FUNCTION_ARGUMENTS) {
  3973. $code .= <<___;
  3974. # ;; Check aes_keys != NULL
  3975. test $arg1,$arg1
  3976. jz .Lexit_gcm_encrypt
  3977. # ;; Check gcm128ctx != NULL
  3978. test $arg2,$arg2
  3979. jz .Lexit_gcm_encrypt
  3980. # ;; Check pblocklen != NULL
  3981. test $arg3,$arg3
  3982. jz .Lexit_gcm_encrypt
  3983. # ;; Check in != NULL
  3984. test $arg4,$arg4
  3985. jz .Lexit_gcm_encrypt
  3986. # ;; Check if len != 0
  3987. cmp \$0,$arg5
  3988. jz .Lexit_gcm_encrypt
  3989. # ;; Check out != NULL
  3990. cmp \$0,$arg6
  3991. jz .Lexit_gcm_encrypt
  3992. ___
  3993. }
  3994. $code .= <<___;
  3995. # ; load number of rounds from AES_KEY structure (offset in bytes is
  3996. # ; size of the |rd_key| buffer)
  3997. mov `4*15*4`($arg1),%eax
  3998. cmp \$9,%eax
  3999. je .Laes_gcm_encrypt_128_avx512
  4000. cmp \$11,%eax
  4001. je .Laes_gcm_encrypt_192_avx512
  4002. cmp \$13,%eax
  4003. je .Laes_gcm_encrypt_256_avx512
  4004. xor %eax,%eax
  4005. jmp .Lexit_gcm_encrypt
  4006. ___
  4007. for my $keylen (sort keys %aes_rounds) {
  4008. $NROUNDS = $aes_rounds{$keylen};
  4009. $code .= <<___;
  4010. .align 32
  4011. .Laes_gcm_encrypt_${keylen}_avx512:
  4012. ___
  4013. &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
  4014. $code .= "jmp .Lexit_gcm_encrypt\n";
  4015. }
  4016. $code .= ".Lexit_gcm_encrypt:\n";
  4017. &EPILOG(1, $arg5);
  4018. $code .= <<___;
  4019. ret
  4020. .Lencrypt_seh_end:
  4021. .cfi_endproc
  4022. .size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
  4023. ___
  4024. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  4025. # ;void ossl_aes_gcm_decrypt_avx512
  4026. # ; (const void* keys,
  4027. # ; void *gcm128ctx,
  4028. # ; unsigned int *pblocklen,
  4029. # ; const unsigned char *in,
  4030. # ; size_t len,
  4031. # ; unsigned char *out);
  4032. # ;
4033. # ; Performs decryption of data |in| of length |len| and stores the output in |out|.
  4034. # ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
  4035. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  4036. $code .= <<___;
  4037. .globl ossl_aes_gcm_decrypt_avx512
  4038. .type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
  4039. .align 32
  4040. ossl_aes_gcm_decrypt_avx512:
  4041. .cfi_startproc
  4042. .Ldecrypt_seh_begin:
  4043. endbranch
  4044. ___
  4045. # ; NOTE: code before PROLOG() must not modify any registers
  4046. &PROLOG(
  4047. 1, # allocate stack space for hkeys
  4048. 1, # allocate stack space for AES blocks
  4049. "decrypt");
  4050. if ($CHECK_FUNCTION_ARGUMENTS) {
  4051. $code .= <<___;
  4052. # ;; Check keys != NULL
  4053. test $arg1,$arg1
  4054. jz .Lexit_gcm_decrypt
  4055. # ;; Check gcm128ctx != NULL
  4056. test $arg2,$arg2
  4057. jz .Lexit_gcm_decrypt
  4058. # ;; Check pblocklen != NULL
  4059. test $arg3,$arg3
  4060. jz .Lexit_gcm_decrypt
  4061. # ;; Check in != NULL
  4062. test $arg4,$arg4
  4063. jz .Lexit_gcm_decrypt
  4064. # ;; Check if len != 0
  4065. cmp \$0,$arg5
  4066. jz .Lexit_gcm_decrypt
  4067. # ;; Check out != NULL
  4068. cmp \$0,$arg6
  4069. jz .Lexit_gcm_decrypt
  4070. ___
  4071. }
  4072. $code .= <<___;
  4073. # ; load number of rounds from AES_KEY structure (offset in bytes is
  4074. # ; size of the |rd_key| buffer)
  4075. mov `4*15*4`($arg1),%eax
  4076. cmp \$9,%eax
  4077. je .Laes_gcm_decrypt_128_avx512
  4078. cmp \$11,%eax
  4079. je .Laes_gcm_decrypt_192_avx512
  4080. cmp \$13,%eax
  4081. je .Laes_gcm_decrypt_256_avx512
  4082. xor %eax,%eax
  4083. jmp .Lexit_gcm_decrypt
  4084. ___
  4085. for my $keylen (sort keys %aes_rounds) {
  4086. $NROUNDS = $aes_rounds{$keylen};
  4087. $code .= <<___;
  4088. .align 32
  4089. .Laes_gcm_decrypt_${keylen}_avx512:
  4090. ___
  4091. &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
  4092. $code .= "jmp .Lexit_gcm_decrypt\n";
  4093. }
  4094. $code .= ".Lexit_gcm_decrypt:\n";
  4095. &EPILOG(1, $arg5);
  4096. $code .= <<___;
  4097. ret
  4098. .Ldecrypt_seh_end:
  4099. .cfi_endproc
  4100. .size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
  4101. ___
  4102. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4103. # ;void ossl_aes_gcm_finalize_avx512
  4104. # ; (void *gcm128ctx,
  4105. # ; unsigned int pblocklen);
  4106. # ;
  4107. # ; Finalizes encryption / decryption
  4108. # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
  4109. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  4110. $code .= <<___;
  4111. .globl ossl_aes_gcm_finalize_avx512
  4112. .type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
  4113. .align 32
  4114. ossl_aes_gcm_finalize_avx512:
  4115. .cfi_startproc
  4116. endbranch
  4117. ___
  4118. if ($CHECK_FUNCTION_ARGUMENTS) {
  4119. $code .= <<___;
  4120. # ;; Check gcm128ctx != NULL
  4121. test $arg1,$arg1
  4122. jz .Labort_finalize
  4123. ___
  4124. }
  4125. &GCM_COMPLETE("$arg1", "$arg2");
  4126. $code .= <<___;
  4127. .Labort_finalize:
  4128. ret
  4129. .cfi_endproc
  4130. .size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
  4131. ___
  4132. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  4133. # ;void ossl_gcm_gmult_avx512(u64 Xi[2],
  4134. # ; const void* gcm128ctx)
  4135. # ;
  4136. # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
  4137. # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  4138. $code .= <<___;
  4139. .globl ossl_gcm_gmult_avx512
  4140. .hidden ossl_gcm_gmult_avx512
  4141. .type ossl_gcm_gmult_avx512,\@abi-omnipotent
  4142. .align 32
  4143. ossl_gcm_gmult_avx512:
  4144. .cfi_startproc
  4145. endbranch
  4146. ___
  4147. if ($CHECK_FUNCTION_ARGUMENTS) {
  4148. $code .= <<___;
  4149. # ;; Check Xi != NULL
  4150. test $arg1,$arg1
  4151. jz .Labort_gmult
  4152. # ;; Check gcm128ctx != NULL
  4153. test $arg2,$arg2
  4154. jz .Labort_gmult
  4155. ___
  4156. }
  4157. $code .= "vmovdqu64 ($arg1),%xmm1\n";
  4158. $code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";
  4159. &GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
  4160. $code .= "vmovdqu64 %xmm1,($arg1)\n";
  4161. if ($CLEAR_SCRATCH_REGISTERS) {
  4162. &clear_scratch_gps_asm();
  4163. &clear_scratch_zmms_asm();
  4164. } else {
  4165. $code .= "vzeroupper\n";
  4166. }
  4167. $code .= <<___;
  4168. .Labort_gmult:
  4169. ret
  4170. .cfi_endproc
  4171. .size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
  4172. ___
  4173. if ($win64) {
  4174. # Add unwind metadata for SEH.
  4175. # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
  4176. my $UWOP_PUSH_NONVOL = 0;
  4177. my $UWOP_ALLOC_LARGE = 1;
  4178. my $UWOP_SET_FPREG = 3;
  4179. my $UWOP_SAVE_XMM128 = 8;
  4180. my %UWOP_REG_NUMBER = (
  4181. rax => 0,
  4182. rcx => 1,
  4183. rdx => 2,
  4184. rbx => 3,
  4185. rsp => 4,
  4186. rbp => 5,
  4187. rsi => 6,
  4188. rdi => 7,
  4189. map(("r$_" => $_), (8 .. 15)));
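# The .xdata entries emitted below follow the UNWIND_INFO layout: a
# version/flags byte, the prolog size, the unwind-code count, the frame
# register byte, then unwind codes in reverse order of the prolog operations.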
  4190. $code .= <<___;
  4191. .section .pdata
  4192. .align 4
  4193. .rva .Lsetiv_seh_begin
  4194. .rva .Lsetiv_seh_end
  4195. .rva .Lsetiv_seh_info
  4196. .rva .Lghash_seh_begin
  4197. .rva .Lghash_seh_end
  4198. .rva .Lghash_seh_info
  4199. .rva .Lencrypt_seh_begin
  4200. .rva .Lencrypt_seh_end
  4201. .rva .Lencrypt_seh_info
  4202. .rva .Ldecrypt_seh_begin
  4203. .rva .Ldecrypt_seh_end
  4204. .rva .Ldecrypt_seh_info
  4205. .section .xdata
  4206. ___
  4207. foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") {
  4208. $code .= <<___;
  4209. .align 8
  4210. .L${func_name}_seh_info:
  4211. .byte 1 # version 1, no flags
  4212. .byte .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin
  4213. .byte 31 # num_slots = 1*8 + 2 + 1 + 2*10
4214. # FR = rbp; Offset from RSP = $XMM_STORAGE scaled by 16
  4215. .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]}
  4216. ___
  4217. # Metadata for %xmm15-%xmm6
  4218. # Occupy 2 slots each
  4219. for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
  4220. # Scaled-by-16 stack offset
  4221. my $xmm_reg_offset = ($reg_idx - 6);
  4222. $code .= <<___;
  4223. .byte .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin
  4224. .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]}
  4225. .value $xmm_reg_offset
  4226. ___
  4227. }
  4228. $code .= <<___;
  4229. # Frame pointer (occupy 1 slot)
  4230. .byte .L${func_name}_seh_setfp-.L${func_name}_seh_begin
  4231. .byte $UWOP_SET_FPREG
  4232. # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
  4233. .byte .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin
  4234. .byte $UWOP_ALLOC_LARGE
  4235. .value `($XMM_STORAGE + 8) / 8`
  4236. ___
  4237. # Metadata for GPR regs
  4238. # Occupy 1 slot each
  4239. foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") {
  4240. $code .= <<___;
  4241. .byte .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin
  4242. .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]}
  4243. ___
  4244. }
  4245. }
  4246. }
  4247. $code .= <<___;
  4248. .data
  4249. .align 16
  4250. POLY: .quad 0x0000000000000001, 0xC200000000000000
  4251. .align 64
  4252. POLY2:
  4253. .quad 0x00000001C2000000, 0xC200000000000000
  4254. .quad 0x00000001C2000000, 0xC200000000000000
  4255. .quad 0x00000001C2000000, 0xC200000000000000
  4256. .quad 0x00000001C2000000, 0xC200000000000000
  4257. .align 16
  4258. TWOONE: .quad 0x0000000000000001, 0x0000000100000000
  4259. # ;;; Order of these constants should not change.
  4260. # ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
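# ;;; SHUF_MASK is the vpshufb control that byte-reverses each 128-bit lane
# ;;; (little endian <-> big endian), replicated across the 512-bit register.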
  4261. .align 64
  4262. SHUF_MASK:
  4263. .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
  4264. .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
  4265. .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
  4266. .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
  4267. .align 16
  4268. SHIFT_MASK:
  4269. .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
  4270. ALL_F:
  4271. .quad 0xffffffffffffffff, 0xffffffffffffffff
  4272. ZERO:
  4273. .quad 0x0000000000000000, 0x0000000000000000
  4274. .align 16
  4275. ONE:
  4276. .quad 0x0000000000000001, 0x0000000000000000
  4277. .align 16
  4278. ONEf:
  4279. .quad 0x0000000000000000, 0x0100000000000000
  4280. .align 64
  4281. ddq_add_1234:
  4282. .quad 0x0000000000000001, 0x0000000000000000
  4283. .quad 0x0000000000000002, 0x0000000000000000
  4284. .quad 0x0000000000000003, 0x0000000000000000
  4285. .quad 0x0000000000000004, 0x0000000000000000
  4286. .align 64
  4287. ddq_add_5678:
  4288. .quad 0x0000000000000005, 0x0000000000000000
  4289. .quad 0x0000000000000006, 0x0000000000000000
  4290. .quad 0x0000000000000007, 0x0000000000000000
  4291. .quad 0x0000000000000008, 0x0000000000000000
  4292. .align 64
  4293. ddq_add_4444:
  4294. .quad 0x0000000000000004, 0x0000000000000000
  4295. .quad 0x0000000000000004, 0x0000000000000000
  4296. .quad 0x0000000000000004, 0x0000000000000000
  4297. .quad 0x0000000000000004, 0x0000000000000000
  4298. .align 64
  4299. ddq_add_8888:
  4300. .quad 0x0000000000000008, 0x0000000000000000
  4301. .quad 0x0000000000000008, 0x0000000000000000
  4302. .quad 0x0000000000000008, 0x0000000000000000
  4303. .quad 0x0000000000000008, 0x0000000000000000
  4304. .align 64
  4305. ddq_addbe_1234:
  4306. .quad 0x0000000000000000, 0x0100000000000000
  4307. .quad 0x0000000000000000, 0x0200000000000000
  4308. .quad 0x0000000000000000, 0x0300000000000000
  4309. .quad 0x0000000000000000, 0x0400000000000000
  4310. .align 64
  4311. ddq_addbe_4444:
  4312. .quad 0x0000000000000000, 0x0400000000000000
  4313. .quad 0x0000000000000000, 0x0400000000000000
  4314. .quad 0x0000000000000000, 0x0400000000000000
  4315. .quad 0x0000000000000000, 0x0400000000000000
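# ;;; The two tables below map a byte count n to a mask with the n least
# ;;; significant bits set (i.e. (1 << n) - 1); the masks are loaded into k
# ;;; registers for masked vmovdqu8 loads/stores of partial blocks.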
  4316. .align 64
  4317. byte_len_to_mask_table:
  4318. .value 0x0000, 0x0001, 0x0003, 0x0007
  4319. .value 0x000f, 0x001f, 0x003f, 0x007f
  4320. .value 0x00ff, 0x01ff, 0x03ff, 0x07ff
  4321. .value 0x0fff, 0x1fff, 0x3fff, 0x7fff
  4322. .value 0xffff
  4323. .align 64
  4324. byte64_len_to_mask_table:
  4325. .quad 0x0000000000000000, 0x0000000000000001
  4326. .quad 0x0000000000000003, 0x0000000000000007
  4327. .quad 0x000000000000000f, 0x000000000000001f
  4328. .quad 0x000000000000003f, 0x000000000000007f
  4329. .quad 0x00000000000000ff, 0x00000000000001ff
  4330. .quad 0x00000000000003ff, 0x00000000000007ff
  4331. .quad 0x0000000000000fff, 0x0000000000001fff
  4332. .quad 0x0000000000003fff, 0x0000000000007fff
  4333. .quad 0x000000000000ffff, 0x000000000001ffff
  4334. .quad 0x000000000003ffff, 0x000000000007ffff
  4335. .quad 0x00000000000fffff, 0x00000000001fffff
  4336. .quad 0x00000000003fffff, 0x00000000007fffff
  4337. .quad 0x0000000000ffffff, 0x0000000001ffffff
  4338. .quad 0x0000000003ffffff, 0x0000000007ffffff
  4339. .quad 0x000000000fffffff, 0x000000001fffffff
  4340. .quad 0x000000003fffffff, 0x000000007fffffff
  4341. .quad 0x00000000ffffffff, 0x00000001ffffffff
  4342. .quad 0x00000003ffffffff, 0x00000007ffffffff
  4343. .quad 0x0000000fffffffff, 0x0000001fffffffff
  4344. .quad 0x0000003fffffffff, 0x0000007fffffffff
  4345. .quad 0x000000ffffffffff, 0x000001ffffffffff
  4346. .quad 0x000003ffffffffff, 0x000007ffffffffff
  4347. .quad 0x00000fffffffffff, 0x00001fffffffffff
  4348. .quad 0x00003fffffffffff, 0x00007fffffffffff
  4349. .quad 0x0000ffffffffffff, 0x0001ffffffffffff
  4350. .quad 0x0003ffffffffffff, 0x0007ffffffffffff
  4351. .quad 0x000fffffffffffff, 0x001fffffffffffff
  4352. .quad 0x003fffffffffffff, 0x007fffffffffffff
  4353. .quad 0x00ffffffffffffff, 0x01ffffffffffffff
  4354. .quad 0x03ffffffffffffff, 0x07ffffffffffffff
  4355. .quad 0x0fffffffffffffff, 0x1fffffffffffffff
  4356. .quad 0x3fffffffffffffff, 0x7fffffffffffffff
  4357. .quad 0xffffffffffffffff
  4358. ___
  4359. } else {
  4360. # Fallback for old assembler
  4361. $code .= <<___;
  4362. .text
  4363. .globl ossl_vaes_vpclmulqdq_capable
  4364. .type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
  4365. ossl_vaes_vpclmulqdq_capable:
  4366. xor %eax,%eax
  4367. ret
  4368. .size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
  4369. .globl ossl_aes_gcm_init_avx512
  4370. .globl ossl_aes_gcm_setiv_avx512
  4371. .globl ossl_aes_gcm_update_aad_avx512
  4372. .globl ossl_aes_gcm_encrypt_avx512
  4373. .globl ossl_aes_gcm_decrypt_avx512
  4374. .globl ossl_aes_gcm_finalize_avx512
  4375. .globl ossl_gcm_gmult_avx512
  4376. .type ossl_aes_gcm_init_avx512,\@abi-omnipotent
  4377. ossl_aes_gcm_init_avx512:
  4378. ossl_aes_gcm_setiv_avx512:
  4379. ossl_aes_gcm_update_aad_avx512:
  4380. ossl_aes_gcm_encrypt_avx512:
  4381. ossl_aes_gcm_decrypt_avx512:
  4382. ossl_aes_gcm_finalize_avx512:
  4383. ossl_gcm_gmult_avx512:
  4384. .byte 0x0f,0x0b # ud2
  4385. ret
  4386. .size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
  4387. ___
  4388. }
  4389. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  4390. print $code;
  4391. close STDOUT or die "error closing STDOUT: $!";