| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532 |
- #! /usr/bin/env perl
- # Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # ====================================================================
- # Written by Danny Tsen <[email protected]> # for the OpenSSL project.
- #
- # Copyright 2025- IBM Corp.
- # ====================================================================
- #
- # p384 lower-level primitives for PPC64.
- #
- use strict;
- use warnings;
- # Command line: <flavour> [ignored args...] <output>. The flavour is handed
- # to ppc-xlate.pl unchanged.
- my $flavour = shift;
- # Skip arguments until one looks like a file name (name.ext); that one is the
- # output file. Fall back to "-" when no such argument is present.
- my $output = "";
- while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
- if (!$output) {
- $output = "-";
- }
- my ($xlate, $dir);
- # Directory part of this script's path, including the trailing separator.
- # Use "" when the script was invoked without a directory component, so the
- # interpolations below never see an undefined value.
- $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
- ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
- die "can't locate ppc-xlate.pl";
- # Everything printed to STDOUT is piped through the translator. The output
- # name is quoted for the shell, and a failure to start the pipe is fatal
- # instead of being silently ignored.
- open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
- *STDOUT=*OUT;
- my $code = "";
- $code.=<<___;
- .machine "any"
- .text
- .globl p384_felem_mul # r3 = out (13 x 128-bit limbs), r4 = in1, r5 = in2 (7 x 64-bit limbs each)
- .type p384_felem_mul,\@function
- .align 4
- p384_felem_mul:
- stdu 1, -176(1) # allocate a 176-byte stack frame
- mflr 0 # keep the return address in r0 across the call below
- std 14, 56(1) # save r14-r22, which _p384_felem_mul_core overwrites
- std 15, 64(1)
- std 16, 72(1)
- std 17, 80(1)
- std 18, 88(1)
- std 19, 96(1)
- std 20, 104(1)
- std 21, 112(1)
- std 22, 120(1)
- bl _p384_felem_mul_core # r3, r4, r5 are passed through untouched
- mtlr 0 # restore the return address
- ld 14, 56(1) # restore r14-r22
- ld 15, 64(1)
- ld 16, 72(1)
- ld 17, 80(1)
- ld 18, 88(1)
- ld 19, 96(1)
- ld 20, 104(1)
- ld 21, 112(1)
- ld 22, 120(1)
- addi 1, 1, 176 # release the stack frame
- blr
- .size p384_felem_mul,.-p384_felem_mul
- .globl p384_felem_square # r3 = out (13 x 128-bit limbs), r4 = in (7 x 64-bit limbs)
- .type p384_felem_square,\@function
- .align 4
- p384_felem_square:
- stdu 1, -176(1) # allocate a 176-byte stack frame
- mflr 0 # keep the return address in r0 across the call below
- std 14, 56(1) # save r14-r17, which _p384_felem_square_core overwrites
- std 15, 64(1)
- std 16, 72(1)
- std 17, 80(1)
- bl _p384_felem_square_core # r3 and r4 are passed through untouched
- mtlr 0 # restore the return address
- ld 14, 56(1) # restore r14-r17
- ld 15, 64(1)
- ld 16, 72(1)
- ld 17, 80(1)
- addi 1, 1, 176 # release the stack frame
- blr
- .size p384_felem_square,.-p384_felem_square
- #
- # Felem mul core function: out[k] = sum of a[i]*b[k-i] for k = 0..12, with a = in1 and b = in2.
- # r3 (out), r4 (in1) and r5 (in2) need to be pre-loaded. Clobbers r6-r12, r14-r22, v0-v11, v13-v19.
- #
- .type _p384_felem_mul_core,\@function
- .align 4
- _p384_felem_mul_core:
- ld 6,0(4) # r6-r12 = a0..a6 (limbs of in1)
- ld 14,0(5) # r14-r20 = b0..b6 (limbs of in2)
- ld 7,8(4)
- ld 15,8(5)
- ld 8,16(4)
- ld 16,16(5)
- ld 9,24(4)
- ld 17,24(5)
- ld 10,32(4)
- ld 18,32(5)
- ld 11,40(4)
- ld 19,40(5)
- ld 12,48(4)
- ld 20,48(5)
- # out0 = a0*b0
- mulld 21, 14, 6 # low 64 bits of the product
- mulhdu 22, 14, 6 # high 64 bits of the product
- std 21, 0(3)
- std 22, 8(3)
- vxor 0, 0, 0 # v0 = 0, the zero addend for the first vmsumudm of each sum
- # out1 = a0*b1 + a1*b0
- mtvsrdd 32+13, 14, 6 # v13 = (b0, a0)
- mtvsrdd 32+14, 7, 15 # v14 = (a1, b1)
- vmsumudm 1, 13, 14, 0 # v1 = b0*a1 + a0*b1 (128-bit sum of two 64x64 products)
- # out2 = a0*b2 + a1*b1 + a2*b0
- mtvsrdd 32+15, 15, 6 # v15 = (b1, a0)
- mtvsrdd 32+16, 7, 16 # v16 = (a1, b2)
- mtvsrdd 32+17, 0, 8 # v17 = (0, a2): an operand of 0 in this slot supplies the value zero
- mtvsrdd 32+18, 0, 14 # v18 = (0, b0)
- vmsumudm 19, 15, 16, 0 # v19 = a1*b1 + a0*b2
- vmsumudm 2, 17, 18, 19 # v2 = a2*b0 + v19
- # out3 = a0*b3 + a1*b2 + a2*b1 + a3*b0
- mtvsrdd 32+13, 16, 6
- mtvsrdd 32+14, 7, 17
- mtvsrdd 32+15, 14, 8
- mtvsrdd 32+16, 9, 15
- vmsumudm 19, 13, 14, 0 # a1*b2 + a0*b3
- vmsumudm 3, 15, 16, 19 # + a3*b0 + a2*b1
- # out4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0
- mtvsrdd 32+13, 17, 6
- mtvsrdd 32+14, 7, 18
- mtvsrdd 32+15, 15, 8
- mtvsrdd 32+16, 9, 16
- mtvsrdd 32+17, 0, 10
- mtvsrdd 32+18, 0, 14
- vmsumudm 19, 13, 14, 0 # a1*b3 + a0*b4
- vmsumudm 4, 15, 16, 19 # + a3*b1 + a2*b2
- vmsumudm 4, 17, 18, 4 # + a4*b0
- # out5 = a0*b5 + a1*b4 + a2*b3 + a3*b2 + a4*b1 + a5*b0
- mtvsrdd 32+13, 18, 6
- mtvsrdd 32+14, 7, 19
- mtvsrdd 32+15, 16, 8
- mtvsrdd 32+16, 9, 17
- mtvsrdd 32+17, 14, 10
- mtvsrdd 32+18, 11, 15
- vmsumudm 19, 13, 14, 0 # a1*b4 + a0*b5
- vmsumudm 5, 15, 16, 19 # + a3*b2 + a2*b3
- vmsumudm 5, 17, 18, 5 # + a5*b0 + a4*b1
- stxv 32+1, 16(3) # store out1..out5, 16 bytes each
- stxv 32+2, 32(3)
- stxv 32+3, 48(3)
- stxv 32+4, 64(3)
- stxv 32+5, 80(3)
- # out6 = a0*b6 + a1*b5 + a2*b4 + a3*b3 + a4*b2 + a5*b1 + a6*b0
- mtvsrdd 32+13, 19, 6
- mtvsrdd 32+14, 7, 20
- mtvsrdd 32+15, 17, 8
- mtvsrdd 32+16, 9, 18
- mtvsrdd 32+17, 15, 10
- mtvsrdd 32+18, 11, 16
- vmsumudm 19, 13, 14, 0 # a1*b5 + a0*b6
- vmsumudm 6, 15, 16, 19 # + a3*b3 + a2*b4
- mtvsrdd 32+13, 0, 12 # v13 = (0, a6), reused for the seventh term
- mtvsrdd 32+14, 0, 14 # v14 = (0, b0)
- vmsumudm 19, 17, 18, 6 # + a5*b1 + a4*b2
- vmsumudm 6, 13, 14, 19 # + a6*b0
- # out7 = a1*b6 + a2*b5 + a3*b4 + a4*b3 + a5*b2 + a6*b1
- mtvsrdd 32+13, 19, 7
- mtvsrdd 32+14, 8, 20
- mtvsrdd 32+15, 17, 9
- mtvsrdd 32+16, 10, 18
- mtvsrdd 32+17, 15, 11
- mtvsrdd 32+18, 12, 16
- vmsumudm 19, 13, 14, 0 # a2*b5 + a1*b6
- vmsumudm 7, 15, 16, 19 # + a4*b3 + a3*b4
- vmsumudm 7, 17, 18, 7 # + a6*b1 + a5*b2
- # out8 = a2*b6 + a3*b5 + a4*b4 + a5*b3 + a6*b2
- mtvsrdd 32+13, 19, 8
- mtvsrdd 32+14, 9, 20
- mtvsrdd 32+15, 17, 10
- mtvsrdd 32+16, 11, 18
- mtvsrdd 32+17, 0, 12
- mtvsrdd 32+18, 0, 16
- vmsumudm 19, 13, 14, 0 # a3*b5 + a2*b6
- vmsumudm 8, 15, 16, 19 # + a5*b3 + a4*b4
- vmsumudm 8, 17, 18, 8 # + a6*b2
- # out9 = a3*b6 + a4*b5 + a5*b4 + a6*b3
- mtvsrdd 32+13, 19, 9
- mtvsrdd 32+14, 10, 20
- mtvsrdd 32+15, 17, 11
- mtvsrdd 32+16, 12, 18
- vmsumudm 19, 13, 14, 0 # a4*b5 + a3*b6
- vmsumudm 9, 15, 16, 19 # + a6*b3 + a5*b4
- # out10 = a4*b6 + a5*b5 + a6*b4
- mtvsrdd 32+13, 19, 10
- mtvsrdd 32+14, 11, 20
- mtvsrdd 32+15, 0, 12
- mtvsrdd 32+16, 0, 18
- vmsumudm 19, 13, 14, 0 # a5*b5 + a4*b6
- vmsumudm 10, 15, 16, 19 # + a6*b4
- # out11 = a5*b6 + a6*b5
- mtvsrdd 32+17, 19, 11
- mtvsrdd 32+18, 12, 20
- vmsumudm 11, 17, 18, 0
- stxv 32+6, 96(3) # store out6..out11, 16 bytes each
- stxv 32+7, 112(3)
- stxv 32+8, 128(3)
- stxv 32+9, 144(3)
- stxv 32+10, 160(3)
- stxv 32+11, 176(3)
- # out12 = a6*b6
- mulld 21, 20, 12 # low 64 bits
- mulhdu 22, 20, 12 # out12, high 64 bits
- std 21, 192(3)
- std 22, 200(3)
- blr
- .size _p384_felem_mul_core,.-_p384_felem_mul_core
- #
- # Felem square core function: out[k] = sum of a[i]*a[k-i] for k = 0..12, cross terms taken once with a doubled limb.
- # r3 (out) and r4 (in) need to be pre-loaded. Clobbers r6-r12, r14-r17, v0-v10, v13-v16, v19.
- #
- .type _p384_felem_square_core,\@function
- .align 4
- _p384_felem_square_core:
- ld 6, 0(4) # r6-r12 = a0..a6 (limbs of in)
- ld 7, 8(4)
- ld 8, 16(4)
- ld 9, 24(4)
- ld 10, 32(4)
- ld 11, 40(4)
- ld 12, 48(4)
- vxor 0, 0, 0 # v0 = 0, the zero addend for vmsumudm
- # out0 = a0*a0
- mulld 14, 6, 6 # low 64 bits
- mulhdu 15, 6, 6 # high 64 bits
- std 14, 0(3)
- std 15, 8(3)
- # out1 = 2*a0*a1
- add 14, 6, 6 # r14 = 2*a0 in 64 bits - NOTE(review): assumes the limb has its top bit clear, confirm
- mtvsrdd 32+13, 0, 14 # v13 = (0, 2*a0)
- mtvsrdd 32+14, 0, 7 # v14 = (0, a1)
- vmsumudm 1, 13, 14, 0
- # out2 = a1*a1 + 2*a0*a2
- mtvsrdd 32+15, 7, 14 # v15 = (a1, 2*a0)
- mtvsrdd 32+16, 7, 8 # v16 = (a1, a2)
- vmsumudm 2, 15, 16, 0
- # out3 = 2*a1*a2 + 2*a0*a3
- add 15, 7, 7 # r15 = 2*a1
- mtvsrdd 32+13, 8, 14
- mtvsrdd 32+14, 15, 9
- vmsumudm 3, 13, 14, 0
- # out4 = 2*a1*a3 + 2*a0*a4 + a2*a2
- mtvsrdd 32+13, 9, 14
- mtvsrdd 32+14, 15, 10
- mtvsrdd 32+15, 0, 8 # v15 = (0, a2)
- vmsumudm 4, 13, 14, 0
- vmsumudm 4, 15, 15, 4 # + a2*a2 (v15 multiplied by itself)
- # out5 = 2*a1*a4 + 2*a0*a5 + 2*a2*a3
- mtvsrdd 32+13, 10, 14
- mtvsrdd 32+14, 15, 11
- add 16, 8, 8 # r16 = 2*a2
- mtvsrdd 32+15, 0, 16
- mtvsrdd 32+16, 0, 9
- vmsumudm 5, 13, 14, 0
- vmsumudm 5, 15, 16, 5 # + 2*a2*a3
- stxv 32+1, 16(3) # store out1..out4, 16 bytes each
- stxv 32+2, 32(3)
- stxv 32+3, 48(3)
- stxv 32+4, 64(3)
- # out6 = 2*a1*a5 + 2*a0*a6 + a3*a3 + 2*a2*a4
- mtvsrdd 32+13, 11, 14
- mtvsrdd 32+14, 15, 12
- mtvsrdd 32+15, 9, 16
- mtvsrdd 32+16, 9, 10
- stxv 32+5, 80(3) # store out5
- vmsumudm 19, 13, 14, 0
- vmsumudm 6, 15, 16, 19
- # out7 = 2*a2*a5 + 2*a1*a6 + 2*a3*a4
- add 17, 9, 9 # r17 = 2*a3
- mtvsrdd 32+13, 11, 15
- mtvsrdd 32+14, 16, 12
- mtvsrdd 32+15, 0, 17
- mtvsrdd 32+16, 0, 10
- vmsumudm 19, 13, 14, 0
- vmsumudm 7, 15, 16, 19
- # out8 = 2*a3*a5 + 2*a2*a6 + a4*a4
- mtvsrdd 32+13, 11, 16
- mtvsrdd 32+14, 17, 12
- mtvsrdd 32+15, 0, 10 # v15 = (0, a4)
- vmsumudm 19, 13, 14, 0
- vmsumudm 8, 15, 15, 19 # + a4*a4
- # out9 = 2*a4*a5 + 2*a3*a6
- add 14, 10, 10 # r14 is reused: now 2*a4
- mtvsrdd 32+13, 11, 17
- mtvsrdd 32+14, 14, 12
- vmsumudm 9, 13, 14, 0
- # out10 = a5*a5 + 2*a4*a6
- mtvsrdd 32+13, 11, 14
- mtvsrdd 32+14, 11, 12
- vmsumudm 10, 13, 14, 0
- stxv 32+6, 96(3) # store out6 and out7
- stxv 32+7, 112(3)
- # out11 = 2*a5*a6, computed with scalar multiplies; the vector variant is left commented out
- #add 14, 11, 11
- #mtvsrdd 32+13, 0, 14
- #mtvsrdd 32+14, 0, 12
- #vmsumudm 11, 13, 14, 0
- mulld 6, 12, 11 # (r7:r6) = a6*a5; a0 and a1 are no longer needed
- mulhdu 7, 12, 11
- addc 8, 6, 6 # (r9:r8) = 2*(r7:r6), carry propagated from low to high
- adde 9, 7, 7
- stxv 32+8, 128(3) # store out8..out10
- stxv 32+9, 144(3)
- stxv 32+10, 160(3)
- #stxv 32+11, 176(3)
- # out12 = a6*a6
- mulld 14, 12, 12
- mulhdu 15, 12, 12
- std 8, 176(3) # out11 low, high
- std 9, 184(3)
- std 14, 192(3) # out12 low, high
- std 15, 200(3)
- blr
- .size _p384_felem_square_core,.-_p384_felem_square_core
- #
- # widefelem (128 bits) * 8: each of the 13 limbs at r3 is shifted left by 3 in place, modulo 2^128
- #
- .macro F128_X_8 _off1 _off2
- ld 9,\\_off1(3) # r9 = low doubleword of the limb
- ld 8,\\_off2(3) # r8 = high doubleword of the limb
- srdi 10,9,61 # r10 = top 3 bits of the low doubleword
- rldimi 10,8,3,0 # r10 = (high << 3) | (low >> 61)
- sldi 9,9,3 # r9 = low << 3
- std 9,\\_off1(3)
- std 10,\\_off2(3)
- .endm
- .globl p384_felem128_mul_by_8 # r3 = limbs to scale in place; clobbers r8, r9, r10
- .type p384_felem128_mul_by_8, \@function
- .align 4
- p384_felem128_mul_by_8:
- F128_X_8 0, 8 # limb 0
- F128_X_8 16, 24
- F128_X_8 32, 40
- F128_X_8 48, 56
- F128_X_8 64, 72
- F128_X_8 80, 88
- F128_X_8 96, 104
- F128_X_8 112, 120
- F128_X_8 128, 136
- F128_X_8 144, 152
- F128_X_8 160, 168
- F128_X_8 176, 184
- F128_X_8 192, 200 # limb 12
- blr
- .size p384_felem128_mul_by_8,.-p384_felem128_mul_by_8
- #
- # widefelem (128 bits) * 2: each of the 13 limbs at r3 is shifted left by 1 in place, modulo 2^128
- #
- .macro F128_X_2 _off1 _off2
- ld 9,\\_off1(3) # r9 = low doubleword of the limb
- ld 8,\\_off2(3) # r8 = high doubleword of the limb
- srdi 10,9,63 # r10 = top bit of the low doubleword
- rldimi 10,8,1,0 # r10 = (high << 1) | (low >> 63)
- sldi 9,9,1 # r9 = low << 1
- std 9,\\_off1(3)
- std 10,\\_off2(3)
- .endm
- .globl p384_felem128_mul_by_2 # r3 = limbs to scale in place; clobbers r8, r9, r10
- .type p384_felem128_mul_by_2, \@function
- .align 4
- p384_felem128_mul_by_2:
- F128_X_2 0, 8 # limb 0
- F128_X_2 16, 24
- F128_X_2 32, 40
- F128_X_2 48, 56
- F128_X_2 64, 72
- F128_X_2 80, 88
- F128_X_2 96, 104
- F128_X_2 112, 120
- F128_X_2 128, 136
- F128_X_2 144, 152
- F128_X_2 160, 168
- F128_X_2 176, 184
- F128_X_2 192, 200 # limb 12
- blr
- .size p384_felem128_mul_by_2,.-p384_felem128_mul_by_2
- .globl p384_felem_diff128 # out[i] = out[i] + constant[i] - in[i] for 13 x 128-bit limbs; r3 = out, r4 = in
- .type p384_felem_diff128, \@function
- .align 4
- p384_felem_diff128:
- addis 5, 2, .LConst_two127\@toc\@ha # r5 = address of the constant table (TOC-relative)
- addi 5, 5, .LConst_two127\@toc\@l
- ld 10, 0(3) # out[0] low
- ld 8, 8(3) # out[0] high
- li 9, 0 # r9 = 0, the low doubleword of every constant added below
- addc 10, 10, 9
- li 7, -1
- rldicr 7, 7, 0, 0 # two127: keep only the most significant bit, high doubleword = 2^63
- adde 8, 8, 7
- ld 11, 0(4) # in[0] low, high
- ld 12, 8(4)
- subfc 11, 11, 10 # r11 = r10 - r11, records the borrow
- subfe 12, 12, 8 # r12 = r8 - r12 - borrow
- std 11, 0(3) # out0
- std 12, 8(3)
- # two127m71 = (r10, r9): high doubleword is loaded into r10 below, low doubleword is the zero in r9
- ld 8, 16(3)
- ld 7, 24(3)
- ld 10, 24(5) # two127m71, kept in r10 for limbs 1-5 and 9-12
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 16(4)
- ld 12, 24(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 16(3) # out1
- std 12, 24(3)
- ld 8, 32(3)
- ld 7, 40(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 32(4)
- ld 12, 40(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 32(3) # out2
- std 12, 40(3)
- ld 8, 48(3)
- ld 7, 56(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 48(4)
- ld 12, 56(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 48(3) # out3
- std 12, 56(3)
- ld 8, 64(3)
- ld 7, 72(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 64(4)
- ld 12, 72(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 64(3) # out4
- std 12, 72(3)
- ld 8, 80(3)
- ld 7, 88(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 80(4)
- ld 12, 88(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 80(3) # out5
- std 12, 88(3)
- ld 8, 96(3)
- ld 7, 104(3)
- ld 6, 40(5) # two127p111m79m71, used for limb 6 only
- addc 8, 8, 9
- adde 7, 7, 6
- ld 11, 96(4)
- ld 12, 104(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 96(3) # out6
- std 12, 104(3)
- ld 8, 112(3)
- ld 7, 120(3)
- ld 6, 56(5) # two127m119m71, used for limb 7 only
- addc 8, 8, 9
- adde 7, 7, 6
- ld 11, 112(4)
- ld 12, 120(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 112(3) # out7
- std 12, 120(3)
- ld 8, 128(3)
- ld 7, 136(3)
- ld 6, 72(5) # two127m95m71, used for limb 8 only
- addc 8, 8, 9
- adde 7, 7, 6
- ld 11, 128(4)
- ld 12, 136(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 128(3) # out8
- std 12, 136(3)
- ld 8, 144(3)
- ld 7, 152(3)
- addc 8, 8, 9 # limbs 9-12 use two127m71 (r10) again
- adde 7, 7, 10
- ld 11, 144(4)
- ld 12, 152(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 144(3) # out9
- std 12, 152(3)
- ld 8, 160(3)
- ld 7, 168(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 160(4)
- ld 12, 168(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 160(3) # out10
- std 12, 168(3)
- ld 8, 176(3)
- ld 7, 184(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 176(4)
- ld 12, 184(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 176(3) # out11
- std 12, 184(3)
- ld 8, 192(3)
- ld 7, 200(3)
- addc 8, 8, 9
- adde 7, 7, 10
- ld 11, 192(4)
- ld 12, 200(4)
- subfc 11, 11, 8
- subfe 12, 12, 7
- std 11, 192(3) # out12
- std 12, 200(3)
- blr
- .size p384_felem_diff128,.-p384_felem_diff128
- .data
- .align 4
- .LConst_two127:
- #two127 = 2^127 (this entry is not loaded by p384_felem_diff128, which builds the value in a register)
- .long 0x00000000, 0x00000000, 0x00000000, 0x80000000
- #two127m71 = 2^127 - 2^71; the code reads only the doubleword at offset 8 of each 16-byte entry
- .long 0x00000000, 0x00000000, 0xffffff80, 0x7fffffff
- #two127p111m79m71 = 2^127 + 2^111 - 2^79 - 2^71
- .long 0x00000000, 0x00000000, 0xffff7f80, 0x80007fff
- #two127m119m71 = 2^127 - 2^119 - 2^71
- .long 0x00000000, 0x00000000, 0xffffff80, 0x7f7fffff
- #two127m95m71 = 2^127 - 2^95 - 2^71 - NOTE(review): values match the names only with little-endian word order, confirm
- .long 0x00000000, 0x00000000, 0x7fffff80, 0x7fffffff
- .text
- .globl p384_felem_diff_128_64 # out[i] = out[i] + constant[i] - in[i]; r3 = out (7 x 128-bit), r4 = in (7 x 64-bit)
- .type p384_felem_diff_128_64, \@function
- .align 4
- p384_felem_diff_128_64:
- addis 5, 2, .LConst_128_two64\@toc\@ha # r5 = address of the constant table (TOC-relative)
- addi 5, 5, .LConst_128_two64\@toc\@l
- ld 9, 0(3) # out[0] low, high
- ld 10, 8(3)
- ld 8, 48(5) # two64p48m16, low doubleword
- li 7, 0 # r7 = 0: the high doubleword of every 64-bit in[i]
- addc 9, 9, 8
- li 6, 1 # high doubleword of two64p48m16 (the 2^64 part)
- adde 10, 10, 6
- ld 11, 0(4) # in[0]
- subfc 8, 11, 9 # r8 = r9 - in[0], records the borrow
- subfe 12, 7, 10 # r12 = r10 - 0 - borrow
- std 8, 0(3) # out0
- std 12, 8(3)
- ld 9, 16(3)
- ld 10, 24(3)
- ld 8, 0(5) # two64m56m8
- addc 9, 9, 8
- addze 10, 10 # propagate the carry into the high doubleword
- ld 11, 8(4)
- subfc 11, 11, 9
- subfe 12, 7, 10
- std 11, 16(3) # out1
- std 12, 24(3)
- ld 9, 32(3)
- ld 10, 40(3)
- ld 8, 16(5) # two64m32m8
- addc 9, 9, 8
- addze 10, 10
- ld 11, 16(4)
- subfc 11, 11, 9
- subfe 12, 7, 10
- std 11, 32(3) # out2
- std 12, 40(3)
- ld 10, 48(3) # from here on r10 holds the low and r8 the high doubleword
- ld 8, 56(3)
- #ld 9, 32(5) # two64m8
- li 9, -256 # two64m8 as an immediate, kept in r9 for limbs 3-6
- addc 10, 10, 9
- addze 8, 8
- ld 11, 24(4)
- subfc 11, 11, 10
- subfe 12, 7, 8
- std 11, 48(3) # out3
- std 12, 56(3)
- ld 10, 64(3)
- ld 8, 72(3)
- addc 10, 10, 9
- addze 8, 8
- ld 11, 32(4)
- subfc 11, 11, 10
- subfe 12, 7, 8
- std 11, 64(3) # out4
- std 12, 72(3)
- ld 10, 80(3)
- ld 8, 88(3)
- addc 10, 10, 9
- addze 8, 8
- ld 11, 40(4)
- subfc 11, 11, 10
- subfe 12, 7, 8
- std 11, 80(3) # out5
- std 12, 88(3)
- ld 10, 96(3)
- ld 8, 104(3)
- addc 10, 10, 9
- addze 9, 8 # last limb: the high doubleword plus carry goes into r9, the constant is no longer needed
- ld 11, 48(4)
- subfc 11, 11, 10
- subfe 12, 7, 9
- std 11, 96(3) # out6
- std 12, 104(3)
- blr
- .size p384_felem_diff_128_64,.-p384_felem_diff_128_64
- .data
- .align 4
- .LConst_128_two64:
- #two64m56m8 = 2^64 - 2^56 - 2^8; p384_felem_diff_128_64 reads the doubleword at offset 0 of each 16-byte entry
- .long 0xffffff00, 0xfeffffff, 0x00000000, 0x00000000
- #two64m32m8 = 2^64 - 2^32 - 2^8
- .long 0xffffff00, 0xfffffffe, 0x00000000, 0x00000000
- #two64m8 = 2^64 - 2^8 (the load of this entry is commented out in p384_felem_diff_128_64)
- .long 0xffffff00, 0xffffffff, 0x00000000, 0x00000000
- #two64p48m16 = 2^64 + 2^48 - 2^16 - NOTE(review): values match the names only with little-endian word order, confirm
- .long 0xffff0000, 0x0000ffff, 0x00000001, 0x00000000
- .LConst_two60:
- #two60m52m4 = 2^60 - 2^52 - 2^4; p384_felem_diff64 reads the doubleword at offset 0 of each 16-byte entry
- .long 0xfffffff0, 0x0fefffff, 0x0, 0x0
- #two60p44m12 = 2^60 + 2^44 - 2^12
- .long 0xfffff000, 0x10000fff, 0x0, 0x0
- #two60m28m4 = 2^60 - 2^28 - 2^4
- .long 0xeffffff0, 0x0fffffff, 0x0, 0x0
- #two60m4 = 2^60 - 2^4
- .long 0xfffffff0, 0x0fffffff, 0x0, 0x0
- .text
- #
- # static void felem_diff64(felem out, const felem in): out[i] = out[i] + constant[i] - in[i], 7 x 64-bit limbs
- #
- .globl p384_felem_diff64 # r3 = out, r4 = in; plain 64-bit add and subtract, no carry between limbs
- .type p384_felem_diff64, \@function
- .align 4
- p384_felem_diff64:
- addis 5, 2, .LConst_two60\@toc\@ha # r5 = address of the constant table (TOC-relative)
- addi 5, 5, .LConst_two60\@toc\@l
- ld 9, 0(3)
- ld 8, 16(5) # two60p44m12
- li 7, 0 # r7 is not read again in this function
- add 9, 9, 8
- ld 11, 0(4)
- subf 8, 11, 9 # r8 = r9 - in[0]
- std 8, 0(3) # out0
- ld 9, 8(3)
- ld 8, 0(5) # two60m52m4
- add 9, 9, 8
- ld 11, 8(4)
- subf 11, 11, 9
- std 11, 8(3) # out1
- ld 9, 16(3)
- ld 8, 32(5) # two60m28m4
- add 9, 9, 8
- ld 11, 16(4)
- subf 11, 11, 9
- std 11, 16(3) # out2
- ld 10, 24(3)
- ld 9, 48(5) # two60m4, kept in r9 for limbs 3-6
- add 10, 10, 9
- ld 12, 24(4)
- subf 12, 12, 10
- std 12, 24(3) # out3
- ld 10, 32(3)
- add 10, 10, 9
- ld 11, 32(4)
- subf 11, 11, 10
- std 11, 32(3) # out4
- ld 10, 40(3)
- add 10, 10, 9
- ld 12, 40(4)
- subf 12, 12, 10
- std 12, 40(3) # out5
- ld 10, 48(3)
- add 10, 10, 9
- ld 11, 48(4)
- subf 11, 11, 10
- std 11, 48(3) # out6
- blr
- .size p384_felem_diff64,.-p384_felem_diff64
- .text
- #
- # Shift 128 bits right <nbits>: (o_h:o_l) = (in_h:in_l) >> nbits. o_l is written before in_h is read, so they must differ.
- #
- .macro SHR o_h o_l in_h in_l nbits
- srdi \\o_l, \\in_l, \\nbits # shift lower right <nbits>
- rldimi \\o_l, \\in_h, 64-\\nbits, 0 # insert <64-nbits> from hi
- srdi \\o_h, \\in_h, \\nbits # shift higher right <nbits>
- .endm
- #
- # static void felem_reduce(felem out, const widefelem in): exported wrapper that saves r14-r31 around the core
- #
- .global p384_felem_reduce
- .type p384_felem_reduce,\@function
- .align 4
- p384_felem_reduce:
- stdu 1, -208(1) # allocate a 208-byte stack frame
- mflr 0 # keep the return address in r0 across the call below
- std 14, 56(1) # save r14-r31, all of which the core overwrites
- std 15, 64(1)
- std 16, 72(1)
- std 17, 80(1)
- std 18, 88(1)
- std 19, 96(1)
- std 20, 104(1)
- std 21, 112(1)
- std 22, 120(1)
- std 23, 128(1)
- std 24, 136(1)
- std 25, 144(1)
- std 26, 152(1)
- std 27, 160(1)
- std 28, 168(1)
- std 29, 176(1)
- std 30, 184(1)
- std 31, 192(1)
- bl _p384_felem_reduce_core # r3 (out) and r4 (in) are passed through untouched
- mtlr 0 # restore the return address
- ld 14, 56(1) # restore r14-r31
- ld 15, 64(1)
- ld 16, 72(1)
- ld 17, 80(1)
- ld 18, 88(1)
- ld 19, 96(1)
- ld 20, 104(1)
- ld 21, 112(1)
- ld 22, 120(1)
- ld 23, 128(1)
- ld 24, 136(1)
- ld 25, 144(1)
- ld 26, 152(1)
- ld 27, 160(1)
- ld 28, 168(1)
- ld 29, 176(1)
- ld 30, 184(1)
- ld 31, 192(1)
- addi 1, 1, 208 # release the stack frame
- blr
- .size p384_felem_reduce,.-p384_felem_reduce
- #
- # Felem reduction core function: folds the 13 x 128-bit limbs at r4 into 7 x 64-bit limbs stored at r3.
- # r3 and r4 need to be pre-loaded. acc[k] is held as (high:low) in (r15:r14) for k=0 up to (r31:r30) for k=8.
- #
- .type _p384_felem_reduce_core,\@function
- .align 4
- _p384_felem_reduce_core:
- addis 12, 2, .LConst\@toc\@ha # r12 = address of the constant table (TOC-relative)
- addi 12, 12, .LConst\@toc\@l
- # load constant p; only the high doubleword of each constant is added, the low doubleword is zero
- ld 11, 8(12) # hi - two124m68
- # acc[6] = in[6] + two124m68;
- ld 26, 96(4) # in[6].l
- ld 27, 96+8(4) # in[6].h
- add 27, 27, 11
- # acc[5] = in[5] + two124m68;
- ld 24, 80(4) # in[5].l
- ld 25, 80+8(4) # in[5].h
- add 25, 25, 11
- # acc[4] = in[4] + two124m68;
- ld 22, 64(4) # in[4].l
- ld 23, 64+8(4) # in[4].h
- add 23, 23, 11
- # acc[3] = in[3] + two124m68;
- ld 20, 48(4) # in[3].l
- ld 21, 48+8(4) # in[3].h
- add 21, 21, 11
- ld 11, 48+8(12) # hi - two124m92m68
- # acc[2] = in[2] + two124m92m68;
- ld 18, 32(4) # in[2].l
- ld 19, 32+8(4) # in[2].h
- add 19, 19, 11
- ld 11, 16+8(12) # high - two124m116m68
- # acc[1] = in[1] + two124m116m68;
- ld 16, 16(4) # in[1].l
- ld 17, 16+8(4) # in[1].h
- add 17, 17, 11
- ld 11, 32+8(12) # high - two124p108m76
- # acc[0] = in[0] + two124p108m76;
- ld 14, 0(4) # in[0].l
- ld 15, 0+8(4) # in[0].h
- add 15, 15, 11
- # compute mask: r7 = all ones, every bit mask below is derived from it with srdi
- li 7, -1
- # Eliminate in[12]
- # acc[8] += in[12] >> 32;
- ld 5, 192(4) # in[12].l
- ld 6, 192+8(4) # in[12].h
- SHR 9, 10, 6, 5, 32 # (r9:r10) = (r6:r5) >> 32
- ld 30, 128(4) # in[8].l
- ld 31, 136(4) # in[8].h
- addc 30, 30, 10 # 128-bit add: low doublewords, then high with carry
- adde 31, 31, 9
- # acc[7] += (in[12] & 0xffffffff) << 24;
- srdi 11, 7, 32 # 0xffffffff
- and 11, 11, 5
- sldi 11, 11, 24 # << 24
- ld 28, 112(4) # in[7].l
- ld 29, 120(4) # in[7].h
- addc 28, 28, 11
- addze 29, 29 # the addend fits in 64 bits, so only the carry reaches the high doubleword
- # acc[7] += in[12] >> 8;
- SHR 9, 10, 6, 5, 8
- addc 28, 28, 10
- adde 29, 29, 9
- # acc[6] += (in[12] & 0xff) << 48;
- andi. 11, 5, 0xff
- sldi 11, 11, 48
- addc 26, 26, 11
- addze 27, 27
- # acc[6] -= in[12] >> 16;
- SHR 9, 10, 6, 5, 16
- subfc 26, 10, 26 # r26 = r26 - r10, records the borrow
- subfe 27, 9, 27 # r27 = r27 - r9 - borrow
- # acc[5] -= (in[12] & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 11, 11, 5
- sldi 11, 11, 40 # << 40
- li 9, 0 # high doubleword of the subtrahend is zero
- subfc 24, 11, 24
- subfe 25, 9, 25
- # acc[6] += in[12] >> 48;
- SHR 9, 10, 6, 5, 48
- addc 26, 26, 10
- adde 27, 27, 9
- # acc[5] += (in[12] & 0xffffffffffff) << 8;
- srdi 11, 7, 16 # 0xffffffffffff
- and 11, 11, 5
- sldi 11, 11, 8 # << 8
- addc 24, 24, 11
- addze 25, 25
- # Eliminate in[11]
- # acc[7] += in[11] >> 32;
- ld 5, 176(4) # in[11].l
- ld 6, 176+8(4) # in[11].h
- SHR 9, 10, 6, 5, 32
- addc 28, 28, 10
- adde 29, 29, 9
- # acc[6] += (in[11] & 0xffffffff) << 24;
- srdi 11, 7, 32 # 0xffffffff
- and 11, 11, 5
- sldi 11, 11, 24 # << 24
- addc 26, 26, 11
- addze 27, 27
- # acc[6] += in[11] >> 8;
- SHR 9, 10, 6, 5, 8
- addc 26, 26, 10
- adde 27, 27, 9
- # acc[5] += (in[11] & 0xff) << 48;
- andi. 11, 5, 0xff
- sldi 11, 11, 48
- addc 24, 24, 11
- addze 25, 25
- # acc[5] -= in[11] >> 16;
- SHR 9, 10, 6, 5, 16
- subfc 24, 10, 24
- subfe 25, 9, 25
- # acc[4] -= (in[11] & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 11, 11, 5
- sldi 11, 11, 40 # << 40
- li 9, 0
- subfc 22, 11, 22
- subfe 23, 9, 23
- # acc[5] += in[11] >> 48;
- SHR 9, 10, 6, 5, 48
- addc 24, 24, 10
- adde 25, 25, 9
- # acc[4] += (in[11] & 0xffffffffffff) << 8;
- srdi 11, 7, 16 # 0xffffffffffff
- and 11, 11, 5
- sldi 11, 11, 8 # << 8
- addc 22, 22, 11
- addze 23, 23
- # Eliminate in[10]
- # acc[6] += in[10] >> 32;
- ld 5, 160(4) # in[10].l
- ld 6, 160+8(4) # in[10].h
- SHR 9, 10, 6, 5, 32
- addc 26, 26, 10
- adde 27, 27, 9
- # acc[5] += (in[10] & 0xffffffff) << 24;
- srdi 11, 7, 32 # 0xffffffff
- and 11, 11, 5
- sldi 11, 11, 24 # << 24
- addc 24, 24, 11
- addze 25, 25
- # acc[5] += in[10] >> 8;
- SHR 9, 10, 6, 5, 8
- addc 24, 24, 10
- adde 25, 25, 9
- # acc[4] += (in[10] & 0xff) << 48;
- andi. 11, 5, 0xff
- sldi 11, 11, 48
- addc 22, 22, 11
- addze 23, 23
- # acc[4] -= in[10] >> 16;
- SHR 9, 10, 6, 5, 16
- subfc 22, 10, 22
- subfe 23, 9, 23
- # acc[3] -= (in[10] & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 11, 11, 5
- sldi 11, 11, 40 # << 40
- li 9, 0
- subfc 20, 11, 20
- subfe 21, 9, 21
- # acc[4] += in[10] >> 48;
- SHR 9, 10, 6, 5, 48
- addc 22, 22, 10
- adde 23, 23, 9
- # acc[3] += (in[10] & 0xffffffffffff) << 8;
- srdi 11, 7, 16 # 0xffffffffffff
- and 11, 11, 5
- sldi 11, 11, 8 # << 8
- addc 20, 20, 11
- addze 21, 21
- # Eliminate in[9]
- # acc[5] += in[9] >> 32;
- ld 5, 144(4) # in[9].l
- ld 6, 144+8(4) # in[9].h
- SHR 9, 10, 6, 5, 32
- addc 24, 24, 10
- adde 25, 25, 9
- # acc[4] += (in[9] & 0xffffffff) << 24;
- srdi 11, 7, 32 # 0xffffffff
- and 11, 11, 5
- sldi 11, 11, 24 # << 24
- addc 22, 22, 11
- addze 23, 23
- # acc[4] += in[9] >> 8;
- SHR 9, 10, 6, 5, 8
- addc 22, 22, 10
- adde 23, 23, 9
- # acc[3] += (in[9] & 0xff) << 48;
- andi. 11, 5, 0xff
- sldi 11, 11, 48
- addc 20, 20, 11
- addze 21, 21
- # acc[3] -= in[9] >> 16;
- SHR 9, 10, 6, 5, 16
- subfc 20, 10, 20
- subfe 21, 9, 21
- # acc[2] -= (in[9] & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 11, 11, 5
- sldi 11, 11, 40 # << 40
- li 9, 0
- subfc 18, 11, 18
- subfe 19, 9, 19
- # acc[3] += in[9] >> 48;
- SHR 9, 10, 6, 5, 48
- addc 20, 20, 10
- adde 21, 21, 9
- # acc[2] += (in[9] & 0xffffffffffff) << 8;
- srdi 11, 7, 16 # 0xffffffffffff
- and 11, 11, 5
- sldi 11, 11, 8 # << 8
- addc 18, 18, 11
- addze 19, 19
- # Eliminate acc[8]
- # acc[4] += acc[8] >> 32;
- mr 5, 30 # acc[8].l
- mr 6, 31 # acc[8].h
- SHR 9, 10, 6, 5, 32
- addc 22, 22, 10
- adde 23, 23, 9
- # acc[3] += (acc[8] & 0xffffffff) << 24;
- srdi 11, 7, 32 # 0xffffffff
- and 11, 11, 5
- sldi 11, 11, 24 # << 24
- addc 20, 20, 11
- addze 21, 21
- # acc[3] += acc[8] >> 8;
- SHR 9, 10, 6, 5, 8
- addc 20, 20, 10
- adde 21, 21, 9
- # acc[2] += (acc[8] & 0xff) << 48;
- andi. 11, 5, 0xff
- sldi 11, 11, 48
- addc 18, 18, 11
- addze 19, 19
- # acc[2] -= acc[8] >> 16;
- SHR 9, 10, 6, 5, 16
- subfc 18, 10, 18
- subfe 19, 9, 19
- # acc[1] -= (acc[8] & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 11, 11, 5
- sldi 11, 11, 40 # << 40
- li 9, 0
- subfc 16, 11, 16
- subfe 17, 9, 17
- # acc[2] += acc[8] >> 48;
- SHR 9, 10, 6, 5, 48
- addc 18, 18, 10
- adde 19, 19, 9
- # acc[1] += (acc[8] & 0xffffffffffff) << 8;
- srdi 11, 7, 16 # 0xffffffffffff
- and 11, 11, 5
- sldi 11, 11, 8 # << 8
- addc 16, 16, 11
- addze 17, 17
- # Eliminate acc[7]
- # acc[3] += acc[7] >> 32;
- mr 5, 28 # acc[7].l
- mr 6, 29 # acc[7].h
- SHR 9, 10, 6, 5, 32
- addc 20, 20, 10
- adde 21, 21, 9
- # acc[2] += (acc[7] & 0xffffffff) << 24;
- srdi 11, 7, 32 # 0xffffffff
- and 11, 11, 5
- sldi 11, 11, 24 # << 24
- addc 18, 18, 11
- addze 19, 19
- # acc[2] += acc[7] >> 8;
- SHR 9, 10, 6, 5, 8
- addc 18, 18, 10
- adde 19, 19, 9
- # acc[1] += (acc[7] & 0xff) << 48;
- andi. 11, 5, 0xff
- sldi 11, 11, 48
- addc 16, 16, 11
- addze 17, 17
- # acc[1] -= acc[7] >> 16;
- SHR 9, 10, 6, 5, 16
- subfc 16, 10, 16
- subfe 17, 9, 17
- # acc[0] -= (acc[7] & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 11, 11, 5
- sldi 11, 11, 40 # << 40
- li 9, 0
- subfc 14, 11, 14
- subfe 15, 9, 15
- # acc[1] += acc[7] >> 48;
- SHR 9, 10, 6, 5, 48
- addc 16, 16, 10
- adde 17, 17, 9
- # acc[0] += (acc[7] & 0xffffffffffff) << 8;
- srdi 11, 7, 16 # 0xffffffffffff
- and 11, 11, 5
- sldi 11, 11, 8 # << 8
- addc 14, 14, 11
- addze 15, 15
- #
- # Carry 4 -> 5 -> 6
- #
- # acc[5] += acc[4] >> 56;
- # acc[4] &= 0x00ffffffffffffff;
- SHR 9, 10, 23, 22, 56
- addc 24, 24, 10
- adde 25, 25, 9
- srdi 11, 7, 8 # 0x00ffffffffffffff
- and 22, 22, 11
- li 23, 0 # the 56-bit mask clears the whole high doubleword
- # acc[6] += acc[5] >> 56;
- # acc[5] &= 0x00ffffffffffffff;
- SHR 9, 10, 25, 24, 56
- addc 26, 26, 10
- adde 27, 27, 9
- and 24, 24, 11
- li 25, 0
- # [3]: Eliminate high bits of acc[6]
- # temp = acc[6] >> 48;
- # acc[6] &= 0x0000ffffffffffff;
- SHR 31, 30, 27, 26, 48 # temp = acc[6] >> 48
- srdi 11, 7, 16 # 0x0000ffffffffffff
- and 26, 26, 11
- li 27, 0
- # temp < 2^80
- # acc[3] += temp >> 40;
- SHR 9, 10, 31, 30, 40
- addc 20, 20, 10
- adde 21, 21, 9
- # acc[2] += (temp & 0xffffffffff) << 16;
- srdi 11, 7, 24 # 0xffffffffff
- and 10, 30, 11
- sldi 10, 10, 16
- addc 18, 18, 10
- addze 19, 19
- # acc[2] += temp >> 16;
- SHR 9, 10, 31, 30, 16
- addc 18, 18, 10
- adde 19, 19, 9
- # acc[1] += (temp & 0xffff) << 40;
- srdi 11, 7, 48 # 0xffff
- and 10, 30, 11
- sldi 10, 10, 40
- addc 16, 16, 10
- addze 17, 17
- # acc[1] -= temp >> 24;
- SHR 9, 10, 31, 30, 24
- subfc 16, 10, 16
- subfe 17, 9, 17
- # acc[0] -= (temp & 0xffffff) << 32;
- srdi 11, 7, 40 # 0xffffff
- and 10, 30, 11
- sldi 10, 10, 32
- li 9, 0
- subfc 14, 10, 14
- subfe 15, 9, 15
- # acc[0] += temp;
- addc 14, 14, 30
- adde 15, 15, 31
- # Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6
- #
- # acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
- SHR 9, 10, 15, 14, 56
- addc 16, 16, 10
- adde 17, 17, 9
- # acc[0] &= 0x00ffffffffffffff;
- srdi 11, 7, 8 # 0x00ffffffffffffff
- and 14, 14, 11
- li 15, 0
- # acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
- SHR 9, 10, 17, 16, 56
- addc 18, 18, 10
- adde 19, 19, 9
- # acc[1] &= 0x00ffffffffffffff;
- and 16, 16, 11
- li 17, 0
- # acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
- SHR 9, 10, 19, 18, 56
- addc 20, 20, 10
- adde 21, 21, 9
- # acc[2] &= 0x00ffffffffffffff;
- and 18, 18, 11
- li 19, 0
- # acc[4] += acc[3] >> 56;
- SHR 9, 10, 21, 20, 56
- addc 22, 22, 10
- adde 23, 23, 9
- # acc[3] &= 0x00ffffffffffffff;
- and 20, 20, 11
- li 21, 0
- # acc[5] += acc[4] >> 56;
- SHR 9, 10, 23, 22, 56
- addc 24, 24, 10
- adde 25, 25, 9
- # acc[4] &= 0x00ffffffffffffff;
- and 22, 22, 11
- # acc[6] += acc[5] >> 56;
- SHR 9, 10, 25, 24, 56
- addc 26, 26, 10
- adde 27, 27, 9
- # acc[5] &= 0x00ffffffffffffff;
- and 24, 24, 11
- std 14, 0(3) # store the low doubleword of acc[0]..acc[6] as the 7 output limbs
- std 16, 8(3)
- std 18, 16(3)
- std 20, 24(3)
- std 22, 32(3)
- std 24, 40(3)
- std 26, 48(3)
- blr
- .size _p384_felem_reduce_core,.-_p384_felem_reduce_core
- .data
- .align 4
- .LConst:
- # two124m68 = 2^124 - 2^68; the core reads only the doubleword at offset 8 of each 16-byte entry
- .long 0x0, 0x0, 0xfffffff0, 0xfffffff
- # two124m116m68 = 2^124 - 2^116 - 2^68
- .long 0x0, 0x0, 0xfffffff0, 0xfefffff
- #two124p108m76 = 2^124 + 2^108 - 2^76
- .long 0x0, 0x0, 0xfffff000, 0x10000fff
- #two124m92m68 = 2^124 - 2^92 - 2^68 - NOTE(review): values match the names only with little-endian word order, confirm
- .long 0x0, 0x0, 0xeffffff0, 0xfffffff
- .text
- #
- # void p384_felem_square_reduce(felem out, const felem in): square into a stack scratch area, then reduce into out
- #
- .global p384_felem_square_reduce
- .type p384_felem_square_reduce,\@function
- .align 4
- p384_felem_square_reduce:
- stdu 1, -512(1) # allocate a 512-byte stack frame
- mflr 0 # keep the return address in r0 across both calls below
- std 14, 56(1) # save r14-r31
- std 15, 64(1)
- std 16, 72(1)
- std 17, 80(1)
- std 18, 88(1)
- std 19, 96(1)
- std 20, 104(1)
- std 21, 112(1)
- std 22, 120(1)
- std 23, 128(1)
- std 24, 136(1)
- std 25, 144(1)
- std 26, 152(1)
- std 27, 160(1)
- std 28, 168(1)
- std 29, 176(1)
- std 30, 184(1)
- std 31, 192(1)
- std 3, 496(1) # stash the out pointer
- addi 3, 1, 208 # r3 = scratch area inside this frame, starting at sp+208
- bl _p384_felem_square_core # scratch = square of in (r4 still points at in)
- mr 4, 3 # the scratch area becomes the input of the reduction
- ld 3, 496(1) # out pointer back in r3
- bl _p384_felem_reduce_core
- ld 14, 56(1) # restore r14-r31
- ld 15, 64(1)
- ld 16, 72(1)
- ld 17, 80(1)
- ld 18, 88(1)
- ld 19, 96(1)
- ld 20, 104(1)
- ld 21, 112(1)
- ld 22, 120(1)
- ld 23, 128(1)
- ld 24, 136(1)
- ld 25, 144(1)
- ld 26, 152(1)
- ld 27, 160(1)
- ld 28, 168(1)
- ld 29, 176(1)
- ld 30, 184(1)
- ld 31, 192(1)
- addi 1, 1, 512 # release the stack frame
- mtlr 0 # restore the return address
- blr
- .size p384_felem_square_reduce,.-p384_felem_square_reduce
- #
- # void p384_felem_mul_reduce(felem out, const felem in1, const felem in2): multiply into a stack scratch area, then reduce into out
- #
- .global p384_felem_mul_reduce
- .type p384_felem_mul_reduce,\@function
- .align 5
- p384_felem_mul_reduce:
- stdu 1, -512(1) # allocate a 512-byte stack frame
- mflr 0 # keep the return address in r0 across both calls below
- std 14, 56(1) # save r14-r31
- std 15, 64(1)
- std 16, 72(1)
- std 17, 80(1)
- std 18, 88(1)
- std 19, 96(1)
- std 20, 104(1)
- std 21, 112(1)
- std 22, 120(1)
- std 23, 128(1)
- std 24, 136(1)
- std 25, 144(1)
- std 26, 152(1)
- std 27, 160(1)
- std 28, 168(1)
- std 29, 176(1)
- std 30, 184(1)
- std 31, 192(1)
- std 3, 496(1) # stash the out pointer
- addi 3, 1, 208 # r3 = scratch area inside this frame, starting at sp+208
- bl _p384_felem_mul_core # scratch = in1 * in2 (r4 and r5 still point at the inputs)
- mr 4, 3 # the scratch area becomes the input of the reduction
- ld 3, 496(1) # out pointer back in r3
- bl _p384_felem_reduce_core
- ld 14, 56(1) # restore r14-r31
- ld 15, 64(1)
- ld 16, 72(1)
- ld 17, 80(1)
- ld 18, 88(1)
- ld 19, 96(1)
- ld 20, 104(1)
- ld 21, 112(1)
- ld 22, 120(1)
- ld 23, 128(1)
- ld 24, 136(1)
- ld 25, 144(1)
- ld 26, 152(1)
- ld 27, 160(1)
- ld 28, 168(1)
- ld 29, 176(1)
- ld 30, 184(1)
- ld 31, 192(1)
- addi 1, 1, 512 # release the stack frame
- mtlr 0 # restore the return address
- blr
- .size p384_felem_mul_reduce,.-p384_felem_mul_reduce
- ___
- # Replace every backtick-delimited span in the generated text with the
- # result of evaluating its contents as Perl.
- $code =~ s{\`([^\`]*)\`}{eval $1}gem;
- # Emit the assembly. STDOUT is the pipe opened earlier in this script, so an
- # error surfacing at close time is treated as fatal.
- print STDOUT $code;
- close(STDOUT) or die "error closing STDOUT: $!";
|