ecp_nistp384-ppc64.pl 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532
  1. #! /usr/bin/env perl
  2. # Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Danny Tsen <[email protected]> # for the OpenSSL project.
  11. #
  12. # Copyright 2025- IBM Corp.
  13. # ====================================================================
  14. #
  15. # p384 lower-level primitives for PPC64.
  16. #
  17. use strict;
  18. use warnings;
  19. my $flavour = shift;    # first argument: perlasm flavour, handed on to ppc-xlate.pl
  20. my $output = "";
  21. while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}    # skip arguments until one looks like "name.ext"
  22. if (!$output) {
  23. $output = "-";    # no file-like argument was found
  24. }
  25. my ($xlate, $dir);
  26. $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";    # directory part of this script's path; empty (not undef) when $0 has none
  27. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or    # translator next to this script ...
  28. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or    # ... or in ../../perlasm
  29. die "can't locate ppc-xlate.pl";
  30. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";    # quote paths for the shell; fail loudly if the pipe cannot be opened
  31. *STDOUT=*OUT;    # anything printed to STDOUT now goes through the translator
  32. my $code = "";
  33. $code.=<<___;
  34. .machine "any"
  35. .text
  36. .globl p384_felem_mul # exported wrapper around _p384_felem_mul_core
  37. .type p384_felem_mul,\@function
  38. .align 4
  39. p384_felem_mul: # r3, r4 and r5 are passed through untouched to the core
  40. stdu 1, -176(1) # open a 176-byte stack frame
  41. mflr 0 # keep the return address in r0 (the core does not use r0)
  42. std 14, 56(1) # save r14-r22, which the core uses as scratch
  43. std 15, 64(1)
  44. std 16, 72(1)
  45. std 17, 80(1)
  46. std 18, 88(1)
  47. std 19, 96(1)
  48. std 20, 104(1)
  49. std 21, 112(1)
  50. std 22, 120(1)
  51. bl _p384_felem_mul_core # overwrites LR
  52. mtlr 0 # put the return address back
  53. ld 14, 56(1) # restore r14-r22
  54. ld 15, 64(1)
  55. ld 16, 72(1)
  56. ld 17, 80(1)
  57. ld 18, 88(1)
  58. ld 19, 96(1)
  59. ld 20, 104(1)
  60. ld 21, 112(1)
  61. ld 22, 120(1)
  62. addi 1, 1, 176 # release the frame
  63. blr
  64. .size p384_felem_mul,.-p384_felem_mul
  65. .globl p384_felem_square # exported wrapper around _p384_felem_square_core
  66. .type p384_felem_square,\@function
  67. .align 4
  68. p384_felem_square: # r3 and r4 are passed through untouched to the core
  69. stdu 1, -176(1) # open a 176-byte stack frame
  70. mflr 0 # keep the return address in r0 (the core does not use r0)
  71. std 14, 56(1) # save r14-r17, the only non-volatile GPRs the core writes
  72. std 15, 64(1)
  73. std 16, 72(1)
  74. std 17, 80(1)
  75. bl _p384_felem_square_core # overwrites LR
  76. mtlr 0 # put the return address back
  77. ld 14, 56(1) # restore r14-r17
  78. ld 15, 64(1)
  79. ld 16, 72(1)
  80. ld 17, 80(1)
  81. addi 1, 1, 176 # release the frame
  82. blr
  83. .size p384_felem_square,.-p384_felem_square
  84. # Felem mul core function - schoolbook product of two 7-limb operands.
  85. # r3, r4 and r5 need to be pre-loaded by the caller:
  86. #   r3 = out (13 x 128-bit column sums), r4 = a[0..6], r5 = b[0..6] (64-bit limbs).
  87. # Scratch: r6-r12, r14-r22, v0-v11, v13-v19. r14-r22 are NOT saved here.
  88. .type _p384_felem_mul_core,\@function
  89. .align 4
  90. _p384_felem_mul_core:
  91. ld 6,0(4) # r6..r12 = a0..a6
  92. ld 14,0(5) # r14..r20 = b0..b6
  93. ld 7,8(4)
  94. ld 15,8(5)
  95. ld 8,16(4)
  96. ld 16,16(5)
  97. ld 9,24(4)
  98. ld 17,24(5)
  99. ld 10,32(4)
  100. ld 18,32(5)
  101. ld 11,40(4)
  102. ld 19,40(5)
  103. ld 12,48(4)
  104. ld 20,48(5)
  105. # out0 = a0*b0 (low/high halves via mulld/mulhdu)
  106. mulld 21, 14, 6
  107. mulhdu 22, 14, 6
  108. std 21, 0(3) # low doubleword
  109. std 22, 8(3) # high doubleword
  110. vxor 0, 0, 0 # v0 = 0, used as the zero addend below
  111. # out1 = a1*b0 + a0*b1
  112. mtvsrdd 32+13, 14, 6 # v13 = (b0, a0); VSR 32+n is vector register n
  113. mtvsrdd 32+14, 7, 15 # v14 = (a1, b1)
  114. vmsumudm 1, 13, 14, 0 # v1 = sum of the two doubleword products + v0
  115. # out2 = a1*b1 + a0*b2 + a2*b0
  116. mtvsrdd 32+15, 15, 6
  117. mtvsrdd 32+16, 7, 16
  118. mtvsrdd 32+17, 0, 8 # a 0 in the RA slot is the literal zero, not r0
  119. mtvsrdd 32+18, 0, 14
  120. vmsumudm 19, 15, 16, 0 # v19 is the running partial sum
  121. vmsumudm 2, 17, 18, 19
  122. # out3 = a1*b2 + a0*b3 + a3*b0 + a2*b1
  123. mtvsrdd 32+13, 16, 6
  124. mtvsrdd 32+14, 7, 17
  125. mtvsrdd 32+15, 14, 8
  126. mtvsrdd 32+16, 9, 15
  127. vmsumudm 19, 13, 14, 0
  128. vmsumudm 3, 15, 16, 19
  129. # out4 = a1*b3 + a0*b4 + a3*b1 + a2*b2 + a4*b0
  130. mtvsrdd 32+13, 17, 6
  131. mtvsrdd 32+14, 7, 18
  132. mtvsrdd 32+15, 15, 8
  133. mtvsrdd 32+16, 9, 16
  134. mtvsrdd 32+17, 0, 10
  135. mtvsrdd 32+18, 0, 14
  136. vmsumudm 19, 13, 14, 0
  137. vmsumudm 4, 15, 16, 19
  138. vmsumudm 4, 17, 18, 4
  139. # out5 = a1*b4 + a0*b5 + a3*b2 + a2*b3 + a5*b0 + a4*b1
  140. mtvsrdd 32+13, 18, 6
  141. mtvsrdd 32+14, 7, 19
  142. mtvsrdd 32+15, 16, 8
  143. mtvsrdd 32+16, 9, 17
  144. mtvsrdd 32+17, 14, 10
  145. mtvsrdd 32+18, 11, 15
  146. vmsumudm 19, 13, 14, 0
  147. vmsumudm 5, 15, 16, 19
  148. vmsumudm 5, 17, 18, 5
  149. stxv 32+1, 16(3) # store out1..out5 as 16-byte vectors
  150. stxv 32+2, 32(3)
  151. stxv 32+3, 48(3)
  152. stxv 32+4, 64(3)
  153. stxv 32+5, 80(3)
  154. # out6 = a1*b5 + a0*b6 + a3*b3 + a2*b4 + a5*b1 + a4*b2 + a6*b0
  155. mtvsrdd 32+13, 19, 6
  156. mtvsrdd 32+14, 7, 20
  157. mtvsrdd 32+15, 17, 8
  158. mtvsrdd 32+16, 9, 18
  159. mtvsrdd 32+17, 15, 10
  160. mtvsrdd 32+18, 11, 16
  161. vmsumudm 19, 13, 14, 0
  162. vmsumudm 6, 15, 16, 19
  163. mtvsrdd 32+13, 0, 12 # v13/v14 are reused for the a6*b0 term
  164. mtvsrdd 32+14, 0, 14
  165. vmsumudm 19, 17, 18, 6
  166. vmsumudm 6, 13, 14, 19
  167. # out7 = a2*b5 + a1*b6 + a4*b3 + a3*b4 + a6*b1 + a5*b2
  168. mtvsrdd 32+13, 19, 7
  169. mtvsrdd 32+14, 8, 20
  170. mtvsrdd 32+15, 17, 9
  171. mtvsrdd 32+16, 10, 18
  172. mtvsrdd 32+17, 15, 11
  173. mtvsrdd 32+18, 12, 16
  174. vmsumudm 19, 13, 14, 0
  175. vmsumudm 7, 15, 16, 19
  176. vmsumudm 7, 17, 18, 7
  177. # out8 = a3*b5 + a2*b6 + a5*b3 + a4*b4 + a6*b2
  178. mtvsrdd 32+13, 19, 8
  179. mtvsrdd 32+14, 9, 20
  180. mtvsrdd 32+15, 17, 10
  181. mtvsrdd 32+16, 11, 18
  182. mtvsrdd 32+17, 0, 12
  183. mtvsrdd 32+18, 0, 16
  184. vmsumudm 19, 13, 14, 0
  185. vmsumudm 8, 15, 16, 19
  186. vmsumudm 8, 17, 18, 8
  187. # out9 = a4*b5 + a3*b6 + a6*b3 + a5*b4
  188. mtvsrdd 32+13, 19, 9
  189. mtvsrdd 32+14, 10, 20
  190. mtvsrdd 32+15, 17, 11
  191. mtvsrdd 32+16, 12, 18
  192. vmsumudm 19, 13, 14, 0
  193. vmsumudm 9, 15, 16, 19
  194. # out10 = a5*b5 + a4*b6 + a6*b4
  195. mtvsrdd 32+13, 19, 10
  196. mtvsrdd 32+14, 11, 20
  197. mtvsrdd 32+15, 0, 12
  198. mtvsrdd 32+16, 0, 18
  199. vmsumudm 19, 13, 14, 0
  200. vmsumudm 10, 15, 16, 19
  201. # out11 = a6*b5 + a5*b6
  202. mtvsrdd 32+17, 19, 11
  203. mtvsrdd 32+18, 12, 20
  204. vmsumudm 11, 17, 18, 0
  205. stxv 32+6, 96(3) # store out6..out11 as 16-byte vectors
  206. stxv 32+7, 112(3)
  207. stxv 32+8, 128(3)
  208. stxv 32+9, 144(3)
  209. stxv 32+10, 160(3)
  210. stxv 32+11, 176(3)
  211. # out12 = a6*b6
  212. mulld 21, 20, 12 # low half
  213. mulhdu 22, 20, 12 # high half
  214. std 21, 192(3)
  215. std 22, 200(3)
  216. blr
  217. .size _p384_felem_mul_core,.-_p384_felem_mul_core
  218. # Felem square core function - squares one 7-limb operand.
  219. # r3 and r4 need to be pre-loaded by the caller:
  220. #   r3 = out (13 x 128-bit column sums), r4 = a[0..6] (64-bit limbs).
  221. # Scratch: r6-r12, r14-r17, v0-v10, v13-v16, v19. r14-r17 are NOT saved here.
  222. .type _p384_felem_square_core,\@function
  223. .align 4
  224. _p384_felem_square_core:
  225. ld 6, 0(4) # r6..r12 = a0..a6
  226. ld 7, 8(4)
  227. ld 8, 16(4)
  228. ld 9, 24(4)
  229. ld 10, 32(4)
  230. ld 11, 40(4)
  231. ld 12, 48(4)
  232. vxor 0, 0, 0 # v0 = 0, used as the zero addend below
  233. # out0 = a0^2
  234. mulld 14, 6, 6
  235. mulhdu 15, 6, 6
  236. std 14, 0(3) # low doubleword
  237. std 15, 8(3) # high doubleword
  238. # out1 = 2*a0*a1
  239. add 14, 6, 6 # r14 = 2*a0 (64-bit add; assumes the limb leaves a spare top bit - TODO confirm)
  240. mtvsrdd 32+13, 0, 14 # v13 = (0, 2*a0); a 0 in the RA slot is the literal zero
  241. mtvsrdd 32+14, 0, 7
  242. vmsumudm 1, 13, 14, 0
  243. # out2 = a1^2 + 2*a0*a2
  244. mtvsrdd 32+15, 7, 14
  245. mtvsrdd 32+16, 7, 8
  246. vmsumudm 2, 15, 16, 0
  247. # out3 = 2*a1*a2 + 2*a0*a3
  248. add 15, 7, 7 # r15 = 2*a1
  249. mtvsrdd 32+13, 8, 14
  250. mtvsrdd 32+14, 15, 9
  251. vmsumudm 3, 13, 14, 0
  252. # out4 = 2*a1*a3 + 2*a0*a4 + a2^2
  253. mtvsrdd 32+13, 9, 14
  254. mtvsrdd 32+14, 15, 10
  255. mtvsrdd 32+15, 0, 8
  256. vmsumudm 4, 13, 14, 0
  257. vmsumudm 4, 15, 15, 4 # v15 times itself adds a2^2
  258. # out5 = 2*a1*a4 + 2*a0*a5 + 2*a2*a3
  259. mtvsrdd 32+13, 10, 14
  260. mtvsrdd 32+14, 15, 11
  261. add 16, 8, 8 # r16 = 2*a2
  262. mtvsrdd 32+15, 0, 16
  263. mtvsrdd 32+16, 0, 9
  264. vmsumudm 5, 13, 14, 0
  265. vmsumudm 5, 15, 16, 5
  266. stxv 32+1, 16(3) # store out1..out4 as 16-byte vectors
  267. stxv 32+2, 32(3)
  268. stxv 32+3, 48(3)
  269. stxv 32+4, 64(3)
  270. # out6 = 2*a1*a5 + 2*a0*a6 + a3^2 + 2*a2*a4
  271. mtvsrdd 32+13, 11, 14
  272. mtvsrdd 32+14, 15, 12
  273. mtvsrdd 32+15, 9, 16
  274. mtvsrdd 32+16, 9, 10
  275. stxv 32+5, 80(3) # out5 store, interleaved with the out6 setup
  276. vmsumudm 19, 13, 14, 0 # v19 is the running partial sum
  277. vmsumudm 6, 15, 16, 19
  278. # out7 = 2*a2*a5 + 2*a1*a6 + 2*a3*a4
  279. add 17, 9, 9 # r17 = 2*a3
  280. mtvsrdd 32+13, 11, 15
  281. mtvsrdd 32+14, 16, 12
  282. mtvsrdd 32+15, 0, 17
  283. mtvsrdd 32+16, 0, 10
  284. vmsumudm 19, 13, 14, 0
  285. vmsumudm 7, 15, 16, 19
  286. # out8 = 2*a3*a5 + 2*a2*a6 + a4^2
  287. mtvsrdd 32+13, 11, 16
  288. mtvsrdd 32+14, 17, 12
  289. mtvsrdd 32+15, 0, 10
  290. vmsumudm 19, 13, 14, 0
  291. vmsumudm 8, 15, 15, 19 # v15 times itself adds a4^2
  292. # out9 = 2*a4*a5 + 2*a3*a6
  293. add 14, 10, 10 # r14 is reused: now 2*a4
  294. mtvsrdd 32+13, 11, 17
  295. mtvsrdd 32+14, 14, 12
  296. vmsumudm 9, 13, 14, 0
  297. # out10 = a5^2 + 2*a4*a6
  298. mtvsrdd 32+13, 11, 14
  299. mtvsrdd 32+14, 11, 12
  300. vmsumudm 10, 13, 14, 0
  301. stxv 32+6, 96(3)
  302. stxv 32+7, 112(3)
  303. # out11 = 2*a5*a6, computed in GPRs; the vector variant is left disabled below
  304. #add 14, 11, 11
  305. #mtvsrdd 32+13, 0, 14
  306. #mtvsrdd 32+14, 0, 12
  307. #vmsumudm 11, 13, 14, 0
  308. mulld 6, 12, 11 # r6 = low half of a6*a5 (a0 is no longer needed)
  309. mulhdu 7, 12, 11 # r7 = high half of a6*a5
  310. addc 8, 6, 6 # double the 128-bit product: low half, carry out ...
  311. adde 9, 7, 7 # ... high half plus carry
  312. stxv 32+8, 128(3)
  313. stxv 32+9, 144(3)
  314. stxv 32+10, 160(3)
  315. #stxv 32+11, 176(3)
  316. # out12 = a6^2
  317. mulld 14, 12, 12
  318. mulhdu 15, 12, 12
  319. std 8, 176(3) # out11 low doubleword
  320. std 9, 184(3) # out11 high doubleword
  321. std 14, 192(3) # out12 low doubleword
  322. std 15, 200(3) # out12 high doubleword
  323. blr
  324. .size _p384_felem_square_core,.-_p384_felem_square_core
  325. # widefelem (128 bits) * 8
  326. # F128_X_8 shifts one 128-bit limb left by 3 bits in place; _off1 is the byte
  327. # offset of its low doubleword and _off2 of its high doubleword, both from r3.
  328. .macro F128_X_8 _off1 _off2
  329. ld 9,\\_off1(3) # r9 = low doubleword
  330. ld 8,\\_off2(3) # r8 = high doubleword
  331. srdi 10,9,61 # r10 = top 3 bits of the low doubleword
  332. rldimi 10,8,3,0 # r10 = (high << 3) | (low >> 61); bits shifted out of high are dropped
  333. sldi 9,9,3 # r9 = low << 3
  334. std 9,\\_off1(3)
  335. std 10,\\_off2(3)
  336. .endm
  337. .globl p384_felem128_mul_by_8 # multiplies each of the 13 limbs at r3 by 8, in place
  338. .type p384_felem128_mul_by_8, \@function
  339. .align 4
  340. p384_felem128_mul_by_8: # leaf routine: no stack frame, clobbers r8-r10 only
  341. F128_X_8 0, 8
  342. F128_X_8 16, 24
  343. F128_X_8 32, 40
  344. F128_X_8 48, 56
  345. F128_X_8 64, 72
  346. F128_X_8 80, 88
  347. F128_X_8 96, 104
  348. F128_X_8 112, 120
  349. F128_X_8 128, 136
  350. F128_X_8 144, 152
  351. F128_X_8 160, 168
  352. F128_X_8 176, 184
  353. F128_X_8 192, 200
  354. blr
  355. .size p384_felem128_mul_by_8,.-p384_felem128_mul_by_8
  356. # widefelem (128 bits) * 2
  357. # F128_X_2 shifts one 128-bit limb left by 1 bit in place; _off1 is the byte
  358. # offset of its low doubleword and _off2 of its high doubleword, both from r3.
  359. .macro F128_X_2 _off1 _off2
  360. ld 9,\\_off1(3) # r9 = low doubleword
  361. ld 8,\\_off2(3) # r8 = high doubleword
  362. srdi 10,9,63 # r10 = top bit of the low doubleword
  363. rldimi 10,8,1,0 # r10 = (high << 1) | (low >> 63); the bit shifted out of high is dropped
  364. sldi 9,9,1 # r9 = low << 1
  365. std 9,\\_off1(3)
  366. std 10,\\_off2(3)
  367. .endm
  368. .globl p384_felem128_mul_by_2 # multiplies each of the 13 limbs at r3 by 2, in place
  369. .type p384_felem128_mul_by_2, \@function
  370. .align 4
  371. p384_felem128_mul_by_2: # leaf routine: no stack frame, clobbers r8-r10 only
  372. F128_X_2 0, 8
  373. F128_X_2 16, 24
  374. F128_X_2 32, 40
  375. F128_X_2 48, 56
  376. F128_X_2 64, 72
  377. F128_X_2 80, 88
  378. F128_X_2 96, 104
  379. F128_X_2 112, 120
  380. F128_X_2 128, 136
  381. F128_X_2 144, 152
  382. F128_X_2 160, 168
  383. F128_X_2 176, 184
  384. F128_X_2 192, 200
  385. blr
  386. .size p384_felem128_mul_by_2,.-p384_felem128_mul_by_2
  387. .globl p384_felem_diff128 # out[i] = out[i] + K[i] - in[i] for 13 x 128-bit limbs; r3 = out, r4 = in
  388. .type p384_felem_diff128, \@function
  389. .align 4
  390. p384_felem_diff128: # leaf routine; clobbers r5-r12. K[i] presumably keeps limbs non-negative - TODO confirm against the C code
  391. addis 5, 2, .LConst_two127\@toc\@ha # r5 = address of the constant table, formed from the TOC pointer in r2
  392. addi 5, 5, .LConst_two127\@toc\@l
  393. ld 10, 0(3) # out[0] low
  394. ld 8, 8(3) # out[0] high
  395. li 9, 0 # r9 = 0: low doubleword of every constant added below
  396. addc 10, 10, 9
  397. li 7, -1
  398. rldicr 7, 7, 0, 0 # two127: keep only the top bit, r7 = 0x8000000000000000
  399. adde 8, 8, 7
  400. ld 11, 0(4) # in[0] low
  401. ld 12, 8(4) # in[0] high
  402. subfc 11, 11, 10 # low: (out + K) - in, sets the carry used as borrow
  403. subfe 12, 12, 8 # high: same, consuming the borrow
  404. std 11, 0(3) # out0
  405. std 12, 8(3)
  406. # two127m71 = (r10, r9): high doubleword in r10, low doubleword r9 = 0
  407. ld 8, 16(3) # from here on r8 = out low, r7 = out high
  408. ld 7, 24(3)
  409. ld 10, 24(5) # two127m71 high doubleword; r10 stays live for out1-out5 and out9-out12
  410. addc 8, 8, 9
  411. adde 7, 7, 10
  412. ld 11, 16(4)
  413. ld 12, 24(4)
  414. subfc 11, 11, 8
  415. subfe 12, 12, 7
  416. std 11, 16(3) # out1
  417. std 12, 24(3)
  418. ld 8, 32(3)
  419. ld 7, 40(3)
  420. addc 8, 8, 9
  421. adde 7, 7, 10
  422. ld 11, 32(4)
  423. ld 12, 40(4)
  424. subfc 11, 11, 8
  425. subfe 12, 12, 7
  426. std 11, 32(3) # out2
  427. std 12, 40(3)
  428. ld 8, 48(3)
  429. ld 7, 56(3)
  430. addc 8, 8, 9
  431. adde 7, 7, 10
  432. ld 11, 48(4)
  433. ld 12, 56(4)
  434. subfc 11, 11, 8
  435. subfe 12, 12, 7
  436. std 11, 48(3) # out3
  437. std 12, 56(3)
  438. ld 8, 64(3)
  439. ld 7, 72(3)
  440. addc 8, 8, 9
  441. adde 7, 7, 10
  442. ld 11, 64(4)
  443. ld 12, 72(4)
  444. subfc 11, 11, 8
  445. subfe 12, 12, 7
  446. std 11, 64(3) # out4
  447. std 12, 72(3)
  448. ld 8, 80(3)
  449. ld 7, 88(3)
  450. addc 8, 8, 9
  451. adde 7, 7, 10
  452. ld 11, 80(4)
  453. ld 12, 88(4)
  454. subfc 11, 11, 8
  455. subfe 12, 12, 7
  456. std 11, 80(3) # out5
  457. std 12, 88(3)
  458. ld 8, 96(3)
  459. ld 7, 104(3)
  460. ld 6, 40(5) # two127p111m79m71 high doubleword (r6 holds the per-limb constant for out6-out8)
  461. addc 8, 8, 9
  462. adde 7, 7, 6
  463. ld 11, 96(4)
  464. ld 12, 104(4)
  465. subfc 11, 11, 8
  466. subfe 12, 12, 7
  467. std 11, 96(3) # out6
  468. std 12, 104(3)
  469. ld 8, 112(3)
  470. ld 7, 120(3)
  471. ld 6, 56(5) # two127m119m71 high doubleword
  472. addc 8, 8, 9
  473. adde 7, 7, 6
  474. ld 11, 112(4)
  475. ld 12, 120(4)
  476. subfc 11, 11, 8
  477. subfe 12, 12, 7
  478. std 11, 112(3) # out7
  479. std 12, 120(3)
  480. ld 8, 128(3)
  481. ld 7, 136(3)
  482. ld 6, 72(5) # two127m95m71 high doubleword
  483. addc 8, 8, 9
  484. adde 7, 7, 6
  485. ld 11, 128(4)
  486. ld 12, 136(4)
  487. subfc 11, 11, 8
  488. subfe 12, 12, 7
  489. std 11, 128(3) # out8
  490. std 12, 136(3)
  491. ld 8, 144(3)
  492. ld 7, 152(3)
  493. addc 8, 8, 9
  494. adde 7, 7, 10 # back to two127m71 in r10
  495. ld 11, 144(4)
  496. ld 12, 152(4)
  497. subfc 11, 11, 8
  498. subfe 12, 12, 7
  499. std 11, 144(3) # out9
  500. std 12, 152(3)
  501. ld 8, 160(3)
  502. ld 7, 168(3)
  503. addc 8, 8, 9
  504. adde 7, 7, 10
  505. ld 11, 160(4)
  506. ld 12, 168(4)
  507. subfc 11, 11, 8
  508. subfe 12, 12, 7
  509. std 11, 160(3) # out10
  510. std 12, 168(3)
  511. ld 8, 176(3)
  512. ld 7, 184(3)
  513. addc 8, 8, 9
  514. adde 7, 7, 10
  515. ld 11, 176(4)
  516. ld 12, 184(4)
  517. subfc 11, 11, 8
  518. subfe 12, 12, 7
  519. std 11, 176(3) # out11
  520. std 12, 184(3)
  521. ld 8, 192(3)
  522. ld 7, 200(3)
  523. addc 8, 8, 9
  524. adde 7, 7, 10
  525. ld 11, 192(4)
  526. ld 12, 200(4)
  527. subfc 11, 11, 8
  528. subfe 12, 12, 7
  529. std 11, 192(3) # out12
  530. std 12, 200(3)
  531. blr
  532. .size p384_felem_diff128,.-p384_felem_diff128
  533. .data
  534. .align 4
  535. .LConst_two127: # 16 bytes per entry as four 32-bit words, least significant first; the code loads the upper 8 bytes with one ld, which assumes little-endian byte order
  536. #two127 = 2^127 (offset 0; the code builds this value with li/rldicr instead of loading it)
  537. .long 0x00000000, 0x00000000, 0x00000000, 0x80000000
  538. #two127m71 = 2^127 - 2^71 (offset 16, high doubleword at 24)
  539. .long 0x00000000, 0x00000000, 0xffffff80, 0x7fffffff
  540. #two127p111m79m71 = 2^127 + 2^111 - 2^79 - 2^71 (offset 32, high doubleword at 40)
  541. .long 0x00000000, 0x00000000, 0xffff7f80, 0x80007fff
  542. #two127m119m71 = 2^127 - 2^119 - 2^71 (offset 48, high doubleword at 56)
  543. .long 0x00000000, 0x00000000, 0xffffff80, 0x7f7fffff
  544. #two127m95m71 = 2^127 - 2^95 - 2^71 (offset 64, high doubleword at 72)
  545. .long 0x00000000, 0x00000000, 0x7fffff80, 0x7fffffff
  546. .text
  547. .globl p384_felem_diff_128_64 # out[i] = out[i] + K[i] - in[i]; out = 7 x 128-bit limbs at r3, in = 7 x 64-bit limbs at r4
  548. .type p384_felem_diff_128_64, \@function
  549. .align 4
  550. p384_felem_diff_128_64: # leaf routine; clobbers r5-r12
  551. addis 5, 2, .LConst_128_two64\@toc\@ha # r5 = address of the constant table, formed from the TOC pointer in r2
  552. addi 5, 5, .LConst_128_two64\@toc\@l
  553. ld 9, 0(3) # out[0] low
  554. ld 10, 8(3) # out[0] high
  555. ld 8, 48(5) # two64p48m16, low doubleword
  556. li 7, 0 # r7 = 0: high doubleword of every 64-bit input limb
  557. addc 9, 9, 8
  558. li 6, 1 # high doubleword of two64p48m16 (the 2^64 term)
  559. adde 10, 10, 6
  560. ld 11, 0(4) # in[0]
  561. subfc 8, 11, 9 # low: (out + K) - in, sets the carry used as borrow
  562. subfe 12, 7, 10 # high: minus zero, consuming the borrow
  563. std 8, 0(3) # out0
  564. std 12, 8(3)
  565. ld 9, 16(3)
  566. ld 10, 24(3)
  567. ld 8, 0(5) # two64m56m8
  568. addc 9, 9, 8
  569. addze 10, 10 # the constant fits in 64 bits, so only the carry reaches the high half
  570. ld 11, 8(4)
  571. subfc 11, 11, 9
  572. subfe 12, 7, 10
  573. std 11, 16(3) # out1
  574. std 12, 24(3)
  575. ld 9, 32(3)
  576. ld 10, 40(3)
  577. ld 8, 16(5) # two64m32m8
  578. addc 9, 9, 8
  579. addze 10, 10
  580. ld 11, 16(4)
  581. subfc 11, 11, 9
  582. subfe 12, 7, 10
  583. std 11, 32(3) # out2
  584. std 12, 40(3)
  585. ld 10, 48(3) # from here on r10 = out low, r8 = out high
  586. ld 8, 56(3)
  587. #ld 9, 32(5) # two64m8
  588. li 9, -256 # two64m8 as an immediate (0xffffffffffffff00); r9 stays live for out3-out6
  589. addc 10, 10, 9
  590. addze 8, 8
  591. ld 11, 24(4)
  592. subfc 11, 11, 10
  593. subfe 12, 7, 8
  594. std 11, 48(3) # out3
  595. std 12, 56(3)
  596. ld 10, 64(3)
  597. ld 8, 72(3)
  598. addc 10, 10, 9
  599. addze 8, 8
  600. ld 11, 32(4)
  601. subfc 11, 11, 10
  602. subfe 12, 7, 8
  603. std 11, 64(3) # out4
  604. std 12, 72(3)
  605. ld 10, 80(3)
  606. ld 8, 88(3)
  607. addc 10, 10, 9
  608. addze 8, 8
  609. ld 11, 40(4)
  610. subfc 11, 11, 10
  611. subfe 12, 7, 8
  612. std 11, 80(3) # out5
  613. std 12, 88(3)
  614. ld 10, 96(3)
  615. ld 8, 104(3)
  616. addc 10, 10, 9
  617. addze 9, 8 # last limb: the high half goes into r9, whose constant is no longer needed
  618. ld 11, 48(4)
  619. subfc 11, 11, 10
  620. subfe 12, 7, 9
  621. std 11, 96(3) # out6
  622. std 12, 104(3)
  623. blr
  624. .size p384_felem_diff_128_64,.-p384_felem_diff_128_64
  625. .data
  626. .align 4
  627. .LConst_128_two64: # used by p384_felem_diff_128_64; 16 bytes per entry, 32-bit words least significant first (read with ld, which assumes little-endian byte order)
  628. #two64m56m8 = 2^64 - 2^56 - 2^8 (offset 0)
  629. .long 0xffffff00, 0xfeffffff, 0x00000000, 0x00000000
  630. #two64m32m8 = 2^64 - 2^32 - 2^8 (offset 16)
  631. .long 0xffffff00, 0xfffffffe, 0x00000000, 0x00000000
  632. #two64m8 = 2^64 - 2^8 (offset 32; the only load of this entry is commented out in favour of li -256)
  633. .long 0xffffff00, 0xffffffff, 0x00000000, 0x00000000
  634. #two64p48m16 = 2^64 + 2^48 - 2^16 (offset 48; only the low doubleword is loaded, the 2^64 term comes from li 6, 1)
  635. .long 0xffff0000, 0x0000ffff, 0x00000001, 0x00000000
  636. .LConst_two60: # used by p384_felem_diff64; same 16-byte entry layout, only the low doubleword is meaningful
  637. #two60m52m4 = 2^60 - 2^52 - 2^4 (offset 0)
  638. .long 0xfffffff0, 0x0fefffff, 0x0, 0x0
  639. #two60p44m12 = 2^60 + 2^44 - 2^12 (offset 16)
  640. .long 0xfffff000, 0x10000fff, 0x0, 0x0
  641. #two60m28m4 = 2^60 - 2^28 - 2^4 (offset 32)
  642. .long 0xeffffff0, 0x0fffffff, 0x0, 0x0
  643. #two60m4 = 2^60 - 2^4 (offset 48)
  644. .long 0xfffffff0, 0x0fffffff, 0x0, 0x0
  645. .text
  646. # p384_felem_diff64: out[i] = out[i] + K[i] - in[i] for 7 x 64-bit limbs.
  647. # static void felem_diff64(felem out, const felem in)
  648. # r3 = out (updated in place), r4 = in. Leaf routine; clobbers r5, r8-r12.
  649. .globl p384_felem_diff64
  650. .type p384_felem_diff64, \@function
  651. .align 4
  652. p384_felem_diff64:
  653. addis 5, 2, .LConst_two60\@toc\@ha # r5 = address of the constant table, formed from the TOC pointer in r2
  654. addi 5, 5, .LConst_two60\@toc\@l
  655. ld 9, 0(3)
  656. ld 8, 16(5) # two60p44m12
  658. add 9, 9, 8 # plain 64-bit add/subtract: no carry is propagated between limbs
  659. ld 11, 0(4)
  660. subf 8, 11, 9 # (out + K) - in
  661. std 8, 0(3) # out0
  662. ld 9, 8(3)
  663. ld 8, 0(5) # two60m52m4
  664. add 9, 9, 8
  665. ld 11, 8(4)
  666. subf 11, 11, 9
  667. std 11, 8(3) # out1
  668. ld 9, 16(3)
  669. ld 8, 32(5) # two60m28m4
  670. add 9, 9, 8
  671. ld 11, 16(4)
  672. subf 11, 11, 9
  673. std 11, 16(3) # out2
  674. ld 10, 24(3)
  675. ld 9, 48(5) # two60m4; r9 stays live for out3-out6
  676. add 10, 10, 9
  677. ld 12, 24(4)
  678. subf 12, 12, 10
  679. std 12, 24(3) # out3
  680. ld 10, 32(3)
  681. add 10, 10, 9
  682. ld 11, 32(4)
  683. subf 11, 11, 10
  684. std 11, 32(3) # out4
  685. ld 10, 40(3)
  686. add 10, 10, 9
  687. ld 12, 40(4)
  688. subf 12, 12, 10
  689. std 12, 40(3) # out5
  690. ld 10, 48(3)
  691. add 10, 10, 9
  692. ld 11, 48(4)
  693. subf 11, 11, 10
  694. std 11, 48(3) # out6
  695. blr
  696. .size p384_felem_diff64,.-p384_felem_diff64
  697. .text
  698. # SHR o_h, o_l, in_h, in_l, nbits
  699. # Shift 128 bits right <nbits>: (o_h, o_l) = (in_h, in_l) >> nbits, logical shift.
  700. # o_l is written before in_h is read, so o_l must not be the same register as in_h.
  701. .macro SHR o_h o_l in_h in_l nbits
  702. srdi \\o_l, \\in_l, \\nbits # shift lower right <nbits>
  703. rldimi \\o_l, \\in_h, 64-\\nbits, 0 # insert <64-nbits> from hi
  704. srdi \\o_h, \\in_h, \\nbits # shift higher right <nbits>
  705. .endm
  706. # p384_felem_reduce: exported wrapper around _p384_felem_reduce_core.
  707. # static void felem_reduce(felem out, const widefelem in)
  708. # r3 = out and r4 = in are passed through untouched to the core.
  709. .globl p384_felem_reduce
  710. .type p384_felem_reduce,\@function
  711. .align 4
  712. p384_felem_reduce:
  713. stdu 1, -208(1) # open a 208-byte stack frame
  714. mflr 0 # keep the return address in r0 across the call
  715. std 14, 56(1) # save all non-volatile GPRs r14-r31 for the core
  716. std 15, 64(1)
  717. std 16, 72(1)
  718. std 17, 80(1)
  719. std 18, 88(1)
  720. std 19, 96(1)
  721. std 20, 104(1)
  722. std 21, 112(1)
  723. std 22, 120(1)
  724. std 23, 128(1)
  725. std 24, 136(1)
  726. std 25, 144(1)
  727. std 26, 152(1)
  728. std 27, 160(1)
  729. std 28, 168(1)
  730. std 29, 176(1)
  731. std 30, 184(1)
  732. std 31, 192(1)
  733. bl _p384_felem_reduce_core # overwrites LR
  734. mtlr 0 # put the return address back (relies on the core leaving r0 alone)
  735. ld 14, 56(1) # restore r14-r31
  736. ld 15, 64(1)
  737. ld 16, 72(1)
  738. ld 17, 80(1)
  739. ld 18, 88(1)
  740. ld 19, 96(1)
  741. ld 20, 104(1)
  742. ld 21, 112(1)
  743. ld 22, 120(1)
  744. ld 23, 128(1)
  745. ld 24, 136(1)
  746. ld 25, 144(1)
  747. ld 26, 152(1)
  748. ld 27, 160(1)
  749. ld 28, 168(1)
  750. ld 29, 176(1)
  751. ld 30, 184(1)
  752. ld 31, 192(1)
  753. addi 1, 1, 208 # release the frame
  754. blr
  755. .size p384_felem_reduce,.-p384_felem_reduce
  756. #
  757. # Felem reduction core function -
  758. # r3 and r4 need to be pre-loaded.
  759. #
  760. .type _p384_felem_reduce_core,\@function
  761. .align 4
  762. _p384_felem_reduce_core:
  763. addis 12, 2, .LConst\@toc\@ha
  764. addi 12, 12, .LConst\@toc\@l
  765. # load constant p
  766. ld 11, 8(12) # hi - two124m68
  767. # acc[6] = in[6] + two124m68;
  768. ld 26, 96(4) # in[6].l
  769. ld 27, 96+8(4) # in[6].h
  770. add 27, 27, 11
  771. # acc[5] = in[5] + two124m68;
  772. ld 24, 80(4) # in[5].l
  773. ld 25, 80+8(4) # in[5].h
  774. add 25, 25, 11
  775. # acc[4] = in[4] + two124m68;
  776. ld 22, 64(4) # in[4].l
  777. ld 23, 64+8(4) # in[4].h
  778. add 23, 23, 11
  779. # acc[3] = in[3] + two124m68;
  780. ld 20, 48(4) # in[3].l
  781. ld 21, 48+8(4) # in[3].h
  782. add 21, 21, 11
  783. ld 11, 48+8(12) # hi - two124m92m68
  784. # acc[2] = in[2] + two124m92m68;
  785. ld 18, 32(4) # in[2].l
  786. ld 19, 32+8(4) # in[2].h
  787. add 19, 19, 11
  788. ld 11, 16+8(12) # high - two124m116m68
  789. # acc[1] = in[1] + two124m116m68;
  790. ld 16, 16(4) # in[1].l
  791. ld 17, 16+8(4) # in[1].h
  792. add 17, 17, 11
  793. ld 11, 32+8(12) # high - two124p108m76
  794. # acc[0] = in[0] + two124p108m76;
  795. ld 14, 0(4) # in[0].l
  796. ld 15, 0+8(4) # in[0].h
  797. add 15, 15, 11
  798. # compute mask
  799. li 7, -1
  800. # Eliminate in[12]
  801. # acc[8] += in[12] >> 32;
  802. ld 5, 192(4) # in[12].l
  803. ld 6, 192+8(4) # in[12].h
  804. SHR 9, 10, 6, 5, 32
  805. ld 30, 128(4) # in[8].l
  806. ld 31, 136(4) # in[8].h
  807. addc 30, 30, 10
  808. adde 31, 31, 9
  809. # acc[7] += (in[12] & 0xffffffff) << 24;
  810. srdi 11, 7, 32 # 0xffffffff
  811. and 11, 11, 5
  812. sldi 11, 11, 24 # << 24
  813. ld 28, 112(4) # in[7].l
  814. ld 29, 120(4) # in[7].h
  815. addc 28, 28, 11
  816. addze 29, 29
  817. # acc[7] += in[12] >> 8;
  818. SHR 9, 10, 6, 5, 8
  819. addc 28, 28, 10
  820. adde 29, 29, 9
  821. # acc[6] += (in[12] & 0xff) << 48;
  822. andi. 11, 5, 0xff
  823. sldi 11, 11, 48
  824. addc 26, 26, 11
  825. addze 27, 27
  826. # acc[6] -= in[12] >> 16;
  827. SHR 9, 10, 6, 5, 16
  828. subfc 26, 10, 26
  829. subfe 27, 9, 27
  830. # acc[5] -= (in[12] & 0xffff) << 40;
  831. srdi 11, 7, 48 # 0xffff
  832. and 11, 11, 5
  833. sldi 11, 11, 40 # << 40
  834. li 9, 0
  835. subfc 24, 11, 24
  836. subfe 25, 9, 25
  837. # acc[6] += in[12] >> 48;
  838. SHR 9, 10, 6, 5, 48
  839. addc 26, 26, 10
  840. adde 27, 27, 9
  841. # acc[5] += (in[12] & 0xffffffffffff) << 8;
  842. srdi 11, 7, 16 # 0xffffffffffff
  843. and 11, 11, 5
  844. sldi 11, 11, 8 # << 8
  845. addc 24, 24, 11
  846. addze 25, 25
  847. # Eliminate in[11]
  848. # acc[7] += in[11] >> 32;
  849. ld 5, 176(4) # in[11].l
  850. ld 6, 176+8(4) # in[11].h
  851. SHR 9, 10, 6, 5, 32
  852. addc 28, 28, 10
  853. adde 29, 29, 9
  854. # acc[6] += (in[11] & 0xffffffff) << 24;
  855. srdi 11, 7, 32 # 0xffffffff
  856. and 11, 11, 5
  857. sldi 11, 11, 24 # << 24
  858. addc 26, 26, 11
  859. addze 27, 27
  860. # acc[6] += in[11] >> 8;
  861. SHR 9, 10, 6, 5, 8
  862. addc 26, 26, 10
  863. adde 27, 27, 9
  864. # acc[5] += (in[11] & 0xff) << 48;
  865. andi. 11, 5, 0xff
  866. sldi 11, 11, 48
  867. addc 24, 24, 11
  868. addze 25, 25
  869. # acc[5] -= in[11] >> 16;
  870. SHR 9, 10, 6, 5, 16
  871. subfc 24, 10, 24
  872. subfe 25, 9, 25
  873. # acc[4] -= (in[11] & 0xffff) << 40;
  874. srdi 11, 7, 48 # 0xffff
  875. and 11, 11, 5
  876. sldi 11, 11, 40 # << 40
  877. li 9, 0
  878. subfc 22, 11, 22
  879. subfe 23, 9, 23
  880. # acc[5] += in[11] >> 48;
  881. SHR 9, 10, 6, 5, 48
  882. addc 24, 24, 10
  883. adde 25, 25, 9
  884. # acc[4] += (in[11] & 0xffffffffffff) << 8;
  885. srdi 11, 7, 16 # 0xffffffffffff
  886. and 11, 11, 5
  887. sldi 11, 11, 8 # << 8
  888. addc 22, 22, 11
  889. addze 23, 23
  890. # Eliminate in[10]
  891. # acc[6] += in[10] >> 32;
  892. ld 5, 160(4) # in[10].l
  893. ld 6, 160+8(4) # in[10].h
  894. SHR 9, 10, 6, 5, 32
  895. addc 26, 26, 10
  896. adde 27, 27, 9
  897. # acc[5] += (in[10] & 0xffffffff) << 24;
  898. srdi 11, 7, 32 # 0xffffffff
  899. and 11, 11, 5
  900. sldi 11, 11, 24 # << 24
  901. addc 24, 24, 11
  902. addze 25, 25
  903. # acc[5] += in[10] >> 8;
  904. SHR 9, 10, 6, 5, 8
  905. addc 24, 24, 10
  906. adde 25, 25, 9
  907. # acc[4] += (in[10] & 0xff) << 48;
  908. andi. 11, 5, 0xff
  909. sldi 11, 11, 48
  910. addc 22, 22, 11
  911. addze 23, 23
  912. # acc[4] -= in[10] >> 16;
  913. SHR 9, 10, 6, 5, 16
  914. subfc 22, 10, 22
  915. subfe 23, 9, 23
  916. # acc[3] -= (in[10] & 0xffff) << 40;
  917. srdi 11, 7, 48 # 0xffff
  918. and 11, 11, 5
  919. sldi 11, 11, 40 # << 40
  920. li 9, 0
  921. subfc 20, 11, 20
  922. subfe 21, 9, 21
  923. # acc[4] += in[10] >> 48;
  924. SHR 9, 10, 6, 5, 48
  925. addc 22, 22, 10
  926. adde 23, 23, 9
  927. # acc[3] += (in[10] & 0xffffffffffff) << 8;
  928. srdi 11, 7, 16 # 0xffffffffffff
  929. and 11, 11, 5
  930. sldi 11, 11, 8 # << 8
  931. addc 20, 20, 11
  932. addze 21, 21
  933. # Eliminate in[9]
  934. # acc[5] += in[9] >> 32;
  935. ld 5, 144(4) # in[9].l
  936. ld 6, 144+8(4) # in[9].h
  937. SHR 9, 10, 6, 5, 32
  938. addc 24, 24, 10
  939. adde 25, 25, 9
  940. # acc[4] += (in[9] & 0xffffffff) << 24;
  941. srdi 11, 7, 32 # 0xffffffff
  942. and 11, 11, 5
  943. sldi 11, 11, 24 # << 24
  944. addc 22, 22, 11
  945. addze 23, 23
  946. # acc[4] += in[9] >> 8;
  947. SHR 9, 10, 6, 5, 8
  948. addc 22, 22, 10
  949. adde 23, 23, 9
  950. # acc[3] += (in[9] & 0xff) << 48;
  951. andi. 11, 5, 0xff
  952. sldi 11, 11, 48
  953. addc 20, 20, 11
  954. addze 21, 21
  955. # acc[3] -= in[9] >> 16;
  956. SHR 9, 10, 6, 5, 16
  957. subfc 20, 10, 20
  958. subfe 21, 9, 21
  959. # acc[2] -= (in[9] & 0xffff) << 40;
  960. srdi 11, 7, 48 # 0xffff
  961. and 11, 11, 5
  962. sldi 11, 11, 40 # << 40
  963. li 9, 0
  964. subfc 18, 11, 18
  965. subfe 19, 9, 19
  966. # acc[3] += in[9] >> 48;
  967. SHR 9, 10, 6, 5, 48
  968. addc 20, 20, 10
  969. adde 21, 21, 9
  970. # acc[2] += (in[9] & 0xffffffffffff) << 8;
  971. srdi 11, 7, 16 # 0xffffffffffff
  972. and 11, 11, 5
  973. sldi 11, 11, 8 # << 8
  974. addc 18, 18, 11
  975. addze 19, 19
  976. # Eliminate acc[8]
  977. # acc[4] += acc[8] >> 32;
  978. mr 5, 30 # acc[8].l
  979. mr 6, 31 # acc[8].h
  980. SHR 9, 10, 6, 5, 32
  981. addc 22, 22, 10
  982. adde 23, 23, 9
  983. # acc[3] += (acc[8] & 0xffffffff) << 24;
  984. srdi 11, 7, 32 # 0xffffffff
  985. and 11, 11, 5
  986. sldi 11, 11, 24 # << 24
  987. addc 20, 20, 11
  988. addze 21, 21
  989. # acc[3] += acc[8] >> 8;
  990. SHR 9, 10, 6, 5, 8
  991. addc 20, 20, 10
  992. adde 21, 21, 9
  993. # acc[2] += (acc[8] & 0xff) << 48;
  994. andi. 11, 5, 0xff
  995. sldi 11, 11, 48
  996. addc 18, 18, 11
  997. addze 19, 19
  998. # acc[2] -= acc[8] >> 16;
  999. SHR 9, 10, 6, 5, 16
  1000. subfc 18, 10, 18
  1001. subfe 19, 9, 19
  1002. # acc[1] -= (acc[8] & 0xffff) << 40;
  1003. srdi 11, 7, 48 # 0xffff
  1004. and 11, 11, 5
  1005. sldi 11, 11, 40 # << 40
  1006. li 9, 0
  1007. subfc 16, 11, 16
  1008. subfe 17, 9, 17
  1009. #acc[2] += acc[8] >> 48;
  1010. SHR 9, 10, 6, 5, 48
  1011. addc 18, 18, 10
  1012. adde 19, 19, 9
  1013. # acc[1] += (acc[8] & 0xffffffffffff) << 8;
  1014. srdi 11, 7, 16 # 0xffffffffffff
  1015. and 11, 11, 5
  1016. sldi 11, 11, 8 # << 8
  1017. addc 16, 16, 11
  1018. addze 17, 17
  1019. # Eliminate acc[7]
  1020. # acc[3] += acc[7] >> 32;
  1021. mr 5, 28 # acc[7].l
  1022. mr 6, 29 # acc[7].h
  1023. SHR 9, 10, 6, 5, 32
  1024. addc 20, 20, 10
  1025. adde 21, 21, 9
  1026. # acc[2] += (acc[7] & 0xffffffff) << 24;
  1027. srdi 11, 7, 32 # 0xffffffff
  1028. and 11, 11, 5
  1029. sldi 11, 11, 24 # << 24
  1030. addc 18, 18, 11
  1031. addze 19, 19
  1032. # acc[2] += acc[7] >> 8;
  1033. SHR 9, 10, 6, 5, 8
  1034. addc 18, 18, 10
  1035. adde 19, 19, 9
  1036. # acc[1] += (acc[7] & 0xff) << 48;
  1037. andi. 11, 5, 0xff
  1038. sldi 11, 11, 48
  1039. addc 16, 16, 11
  1040. addze 17, 17
  1041. # acc[1] -= acc[7] >> 16;
  1042. SHR 9, 10, 6, 5, 16
  1043. subfc 16, 10, 16
  1044. subfe 17, 9, 17
  1045. # acc[0] -= (acc[7] & 0xffff) << 40;
  1046. srdi 11, 7, 48 # 0xffff
  1047. and 11, 11, 5
  1048. sldi 11, 11, 40 # << 40
  1049. li 9, 0
  1050. subfc 14, 11, 14
  1051. subfe 15, 9, 15
  1052. # acc[1] += acc[7] >> 48;
  1053. SHR 9, 10, 6, 5, 48
  1054. addc 16, 16, 10
  1055. adde 17, 17, 9
  1056. # acc[0] += (acc[7] & 0xffffffffffff) << 8;
  1057. srdi 11, 7, 16 # 0xffffffffffff
  1058. and 11, 11, 5
  1059. sldi 11, 11, 8 # << 8
  1060. addc 14, 14, 11
  1061. addze 15, 15
  1062. #
  1063. # Carry 4 -> 5 -> 6
  1064. #
  1065. # acc[5] += acc[4] >> 56;
  1066. # acc[4] &= 0x00ffffffffffffff;
  1067. SHR 9, 10, 23, 22, 56
  1068. addc 24, 24, 10
  1069. adde 25, 25, 9
  1070. srdi 11, 7, 8 # 0x00ffffffffffffff
  1071. and 22, 22, 11
  1072. li 23, 0
  1073. # acc[6] += acc[5] >> 56;
  1074. # acc[5] &= 0x00ffffffffffffff;
  1075. SHR 9, 10, 25, 24, 56
  1076. addc 26, 26, 10
  1077. adde 27, 27, 9
  1078. and 24, 24, 11
  1079. li 25, 0
  1080. # [3]: Eliminate high bits of acc[6] */
  1081. # temp = acc[6] >> 48;
  1082. # acc[6] &= 0x0000ffffffffffff;
  1083. SHR 31, 30, 27, 26, 48 # temp = acc[6] >> 48
  1084. srdi 11, 7, 16 # 0x0000ffffffffffff
  1085. and 26, 26, 11
  1086. li 27, 0
  1087. # temp < 2^80
  1088. # acc[3] += temp >> 40;
  1089. SHR 9, 10, 31, 30, 40
  1090. addc 20, 20, 10
  1091. adde 21, 21, 9
  1092. # acc[2] += (temp & 0xffffffffff) << 16;
  1093. srdi 11, 7, 24 # 0xffffffffff
  1094. and 10, 30, 11
  1095. sldi 10, 10, 16
  1096. addc 18, 18, 10
  1097. addze 19, 19
  1098. # acc[2] += temp >> 16;
  1099. SHR 9, 10, 31, 30, 16
  1100. addc 18, 18, 10
  1101. adde 19, 19, 9
  1102. # acc[1] += (temp & 0xffff) << 40;
  1103. srdi 11, 7, 48 # 0xffff
  1104. and 10, 30, 11
  1105. sldi 10, 10, 40
  1106. addc 16, 16, 10
  1107. addze 17, 17
  1108. # acc[1] -= temp >> 24;
  1109. SHR 9, 10, 31, 30, 24
  1110. subfc 16, 10, 16
  1111. subfe 17, 9, 17
  1112. # acc[0] -= (temp & 0xffffff) << 32;
  1113. srdi 11, 7, 40 # 0xffffff
  1114. and 10, 30, 11
  1115. sldi 10, 10, 32
  1116. li 9, 0
  1117. subfc 14, 10, 14
  1118. subfe 15, 9, 15
  1119. # acc[0] += temp;
  1120. addc 14, 14, 30
  1121. adde 15, 15, 31
  1122. # Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6
  1123. #
  1124. # acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
  1125. SHR 9, 10, 15, 14, 56
  1126. addc 16, 16, 10
  1127. adde 17, 17, 9
  1128. # acc[0] &= 0x00ffffffffffffff;
  1129. srdi 11, 7, 8 # 0x00ffffffffffffff
  1130. and 14, 14, 11
  1131. li 15, 0
  1132. # acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
  1133. SHR 9, 10, 17, 16, 56
  1134. addc 18, 18, 10
  1135. adde 19, 19, 9
  1136. # acc[1] &= 0x00ffffffffffffff;
  1137. and 16, 16, 11
  1138. li 17, 0
  1139. # acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
  1140. SHR 9, 10, 19, 18, 56
  1141. addc 20, 20, 10
  1142. adde 21, 21, 9
  1143. # acc[2] &= 0x00ffffffffffffff;
  1144. and 18, 18, 11
  1145. li 19, 0
  1146. # acc[4] += acc[3] >> 56;
  1147. SHR 9, 10, 21, 20, 56
  1148. addc 22, 22, 10
  1149. adde 23, 23, 9
  1150. # acc[3] &= 0x00ffffffffffffff;
  1151. and 20, 20, 11
  1152. li 21, 0
  1153. # acc[5] += acc[4] >> 56;
  1154. SHR 9, 10, 23, 22, 56
  1155. addc 24, 24, 10
  1156. adde 25, 25, 9
  1157. # acc[4] &= 0x00ffffffffffffff;
  1158. and 22, 22, 11
  1159. # acc[6] += acc[5] >> 56;
  1160. SHR 9, 10, 25, 24, 56
  1161. addc 26, 26, 10
  1162. adde 27, 27, 9
  1163. # acc[5] &= 0x00ffffffffffffff;
  1164. and 24, 24, 11
  1165. std 14, 0(3)
  1166. std 16, 8(3)
  1167. std 18, 16(3)
  1168. std 20, 24(3)
  1169. std 22, 32(3)
  1170. std 24, 40(3)
  1171. std 26, 48(3)
  1172. blr
  1173. .size _p384_felem_reduce_core,.-_p384_felem_reduce_core
  # Constant table placed in .data, directly after _p384_felem_reduce_core.
  # Each entry is four 32-bit words (16 bytes).  Read least-significant word
  # first, every entry equals the 128-bit value spelled out by its label:
  # "twoN" is 2^N, "mK" subtracts 2^K and "pK" adds 2^K.
  # NOTE(review): the loads that consume .LConst are outside this chunk -
  # confirm the word order they expect before changing any value.
  1174. .data
  1175. .align 4
  1176. .LConst:
  1177. # two124m68: 2^124 - 2^68
  1178. .long 0x0, 0x0, 0xfffffff0, 0xfffffff
  1179. # two124m116m68: 2^124 - 2^116 - 2^68
  1180. .long 0x0, 0x0, 0xfffffff0, 0xfefffff
  1181. # two124p108m76: 2^124 + 2^108 - 2^76
  1182. .long 0x0, 0x0, 0xfffff000, 0x10000fff
  1183. # two124m92m68: 2^124 - 2^92 - 2^68
  1184. .long 0x0, 0x0, 0xeffffff0, 0xfffffff
  # Switch back to the code section for the routines that follow.
  1185. .text
  1186. #
  1187. # void p384_felem_square_reduce(felem out, const felem in)
  1188. #
  # Public entry point: runs _p384_felem_square_core into a stack scratch
  # buffer, then _p384_felem_reduce_core from that buffer into out.
  # Presumably out = in^2 in reduced form - the square core is not visible
  # here, so confirm against its definition.
  #
  # Register and stack contract visible in this routine:
  #   - 512-byte frame; r14-r31 are saved at 56..192(r1) and restored on exit.
  #   - LR is parked in r0 for the whole call and is never written to the
  #     stack, so neither core may modify r0.
  #   - out (r3 on entry) is kept at 496(r1); the scratch buffer starts at
  #     208(r1).
  #   - r4 (presumably in) reaches the square core unmodified.
  1189. .global p384_felem_square_reduce
  1190. .type p384_felem_square_reduce,\@function
  1191. .align 4
  1192. p384_felem_square_reduce:
  # Allocate the frame and keep the return address in r0.
  1193. stdu 1, -512(1)
  1194. mflr 0
  # Save r14-r31; the visible part of the reduce core uses all of them.
  1195. std 14, 56(1)
  1196. std 15, 64(1)
  1197. std 16, 72(1)
  1198. std 17, 80(1)
  1199. std 18, 88(1)
  1200. std 19, 96(1)
  1201. std 20, 104(1)
  1202. std 21, 112(1)
  1203. std 22, 120(1)
  1204. std 23, 128(1)
  1205. std 24, 136(1)
  1206. std 25, 144(1)
  1207. std 26, 152(1)
  1208. std 27, 160(1)
  1209. std 28, 168(1)
  1210. std 29, 176(1)
  1211. std 30, 184(1)
  1212. std 31, 192(1)
  # Stash out, then point r3 at the scratch buffer for the square core.
  1213. std 3, 496(1)
  1214. addi 3, 1, 208
  1215. bl _p384_felem_square_core
  # Reduce step: r4 = scratch buffer (assumes the core left r3 unchanged),
  # r3 = out; the reduce core stores seven doublewords at 0..48(r3).
  1216. mr 4, 3
  1217. ld 3, 496(1)
  1218. bl _p384_felem_reduce_core
  # Restore r14-r31, release the frame and return through the LR copy in r0.
  1219. ld 14, 56(1)
  1220. ld 15, 64(1)
  1221. ld 16, 72(1)
  1222. ld 17, 80(1)
  1223. ld 18, 88(1)
  1224. ld 19, 96(1)
  1225. ld 20, 104(1)
  1226. ld 21, 112(1)
  1227. ld 22, 120(1)
  1228. ld 23, 128(1)
  1229. ld 24, 136(1)
  1230. ld 25, 144(1)
  1231. ld 26, 152(1)
  1232. ld 27, 160(1)
  1233. ld 28, 168(1)
  1234. ld 29, 176(1)
  1235. ld 30, 184(1)
  1236. ld 31, 192(1)
  1237. addi 1, 1, 512
  1238. mtlr 0
  1239. blr
  1240. .size p384_felem_square_reduce,.-p384_felem_square_reduce
  1241. #
  1242. # void p384_felem_mul_reduce(felem out, const felem in1, const felem in2)
  1243. #
  # Public entry point: runs _p384_felem_mul_core into a stack scratch
  # buffer, then _p384_felem_reduce_core from that buffer into out.
  # Presumably out = in1 * in2 in reduced form - the multiply core is not
  # visible here, so confirm against its definition.
  #
  # Register and stack contract visible in this routine:
  #   - 512-byte frame; r14-r31 are saved at 56..192(r1) and restored on exit.
  #   - LR is parked in r0 for the whole call and is never written to the
  #     stack, so neither core may modify r0.
  #   - out (r3 on entry) is kept at 496(r1); the scratch buffer starts at
  #     208(r1).
  #   - r4 and r5 (presumably in1 and in2) reach the multiply core unmodified.
  1244. .global p384_felem_mul_reduce
  1245. .type p384_felem_mul_reduce,\@function
  1246. .align 5
  1247. p384_felem_mul_reduce:
  # Allocate the frame and keep the return address in r0.
  1248. stdu 1, -512(1)
  1249. mflr 0
  # Save r14-r31; the visible part of the reduce core uses all of them.
  1250. std 14, 56(1)
  1251. std 15, 64(1)
  1252. std 16, 72(1)
  1253. std 17, 80(1)
  1254. std 18, 88(1)
  1255. std 19, 96(1)
  1256. std 20, 104(1)
  1257. std 21, 112(1)
  1258. std 22, 120(1)
  1259. std 23, 128(1)
  1260. std 24, 136(1)
  1261. std 25, 144(1)
  1262. std 26, 152(1)
  1263. std 27, 160(1)
  1264. std 28, 168(1)
  1265. std 29, 176(1)
  1266. std 30, 184(1)
  1267. std 31, 192(1)
  # Stash out, then point r3 at the scratch buffer for the multiply core.
  1268. std 3, 496(1)
  1269. addi 3, 1, 208
  1270. bl _p384_felem_mul_core
  # Reduce step: r4 = scratch buffer (assumes the core left r3 unchanged),
  # r3 = out; the reduce core stores seven doublewords at 0..48(r3).
  1271. mr 4, 3
  1272. ld 3, 496(1)
  1273. bl _p384_felem_reduce_core
  # Restore r14-r31, release the frame and return through the LR copy in r0.
  1274. ld 14, 56(1)
  1275. ld 15, 64(1)
  1276. ld 16, 72(1)
  1277. ld 17, 80(1)
  1278. ld 18, 88(1)
  1279. ld 19, 96(1)
  1280. ld 20, 104(1)
  1281. ld 21, 112(1)
  1282. ld 22, 120(1)
  1283. ld 23, 128(1)
  1284. ld 24, 136(1)
  1285. ld 25, 144(1)
  1286. ld 26, 152(1)
  1287. ld 27, 160(1)
  1288. ld 28, 168(1)
  1289. ld 29, 176(1)
  1290. ld 30, 184(1)
  1291. ld 31, 192(1)
  1292. addi 1, 1, 512
  1293. mtlr 0
  1294. blr
  1295. .size p384_felem_mul_reduce,.-p384_felem_mul_reduce
  1296. ___
  # Post-process the generated text: every backtick-delimited span is replaced
  # by the result of evaluating its contents as a Perl expression.
  $code =~ s{\`([^\`]*)\`}{eval $1}gem;

  # Write out the finished assembly, then close STDOUT explicitly so that a
  # failed flush of buffered output is reported instead of silently lost.
  print $code;
  close STDOUT or die "error closing STDOUT: $!";