#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro\@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.9	3xNEON+1xIALU	6xNEON+2xIALU
#
# Apple A7		5.50/+49%	3.33		1.70
# Cortex-A53		8.40/+80%	4.72		4.72(*)
# Cortex-A57		8.06/+43%	4.90		4.43(**)
# Denver		4.50/+82%	2.63		2.67(*)
# X-Gene		9.50/+46%	8.82		8.89(*)
# Mongoose		8.00/+44%	3.64		3.25
# Kryo			8.17/+50%	4.83		4.65
#
# (*)	it's expected that doubling interleave factor doesn't help
#	all processors, only those with higher NEON latency and
#	higher instruction issue rate;
# (**)	expected improvement was actually higher;
$flavour=shift;
$output=shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
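
# Any call to an undefined sub, e.g. &add_32(...), falls through to
# AUTOLOAD above: underscores in the sub name become dots and the name is
# emitted verbatim as an instruction mnemonic, so &ror_32(...) turns into
# a "ror.32" line. These suffixed forms are a local convention that the
# regex pass at the bottom of this file resolves into real AArch64 syntax.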
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
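
# @x[0..15] are the sixteen 32-bit words of the working state kept in
# scalar registers (x18, the platform register, is skipped), while @d[0..7]
# is the input block (sigma, key, counter, nonce) packed into eight 64-bit
# doublewords.
#
# ROUND emits four independent quarter-rounds interleaved four-wide to
# hide ALU latency: called as ROUND(0,4,8,12) it is the column round, as
# ROUND(0,5,10,15) the diagonal round. The right-rotate amounts 16, 20,
# 24 and 25 are the 32-bit complements of ChaCha's left rotations by
# 16, 12, 8 and 7.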
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

	(
	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],16)",
	"&ror_32	(@x[$d1],@x[$d1],16)",
	"&ror_32	(@x[$d2],@x[$d2],16)",
	"&ror_32	(@x[$d3],@x[$d3],16)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],20)",
	"&ror_32	(@x[$b1],@x[$b1],20)",
	"&ror_32	(@x[$b2],@x[$b2],20)",
	"&ror_32	(@x[$b3],@x[$b3],20)",

	"&add_32	(@x[$a0],@x[$a0],@x[$b0])",
	"&add_32	(@x[$a1],@x[$a1],@x[$b1])",
	"&add_32	(@x[$a2],@x[$a2],@x[$b2])",
	"&add_32	(@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32	(@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32	(@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32	(@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32	(@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32	(@x[$d0],@x[$d0],24)",
	"&ror_32	(@x[$d1],@x[$d1],24)",
	"&ror_32	(@x[$d2],@x[$d2],24)",
	"&ror_32	(@x[$d3],@x[$d3],24)",

	"&add_32	(@x[$c0],@x[$c0],@x[$d0])",
	"&add_32	(@x[$c1],@x[$c1],@x[$d1])",
	"&add_32	(@x[$c2],@x[$c2],@x[$d2])",
	"&add_32	(@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32	(@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32	(@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32	(@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32	(@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32	(@x[$b0],@x[$b0],25)",
	"&ror_32	(@x[$b1],@x[$b1],25)",
	"&ror_32	(@x[$b2],@x[$b2],25)",
	"&ror_32	(@x[$b3],@x[$b3],25)"
	);
}
$code.=<<___;
#include "arm_arch.h"

.text

.extern	OPENSSL_armcap_P

.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.asciz	"ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	$len,.Labort
	adr	@x[0],.LOPENSSL_armcap_P
	cmp	$len,#192
	b.lo	.Lshort
#ifdef	__ILP32__
	ldrsw	@x[1],[@x[0]]
#else
	ldr	@x[1],[@x[0]]
#endif
	ldr	w17,[@x[1],@x[0]]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ldp	@d[6],@d[7],[$ctr]		// load counter
#ifdef	__ARMEB__
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#64
.Loop:
	sub	$ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
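# The two ROUND invocations above form one double round per .Loop
# iteration: columns (0,4,8,12), then diagonals (0,5,10,15). Ten
# iterations give ChaCha20's 20 rounds.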
$code.=<<___;
	cbnz	$ctr,.Loop

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	b.lo	.Ltail

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif

	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64

	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	$len,$len,#64
.Less_than_64:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len
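
	// The last 64-byte keystream block sits packed in @x[] and is
	// spilled to the stack below; .Loop_tail then XORs input with it a
	// byte at a time. $len is negative here and counts up to zero, and
	// is incremented before the strb, which is why $out was
	// pre-decremented by one above: load and store end up at the same
	// byte offset.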
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	stp	@x[0],@x[2],[sp,#0]
	stp	@x[4],@x[6],[sp,#16]
	stp	@x[8],@x[10],[sp,#32]
	stp	@x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

___
{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
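
# NEONROUND is the vector counterpart of ROUND: one quarter-round across a
# whole 4x32-bit register per step. Rotation by 16 is a rev32 on 16-bit
# lanes, the other rotations are ushr/sli pairs, and the trailing ext
# instructions rotate the b/c/d rows across lanes to switch between the
# column ($odd==0) and diagonal ($odd==1) arrangements.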
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add		('$a','$a','$b')",
	"&eor		('$d','$d','$a')",
	"&rev32_16	('$d','$d')",		# vrot ($d,16)

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',20)",
	"&sli		('$b','$t',12)",

	"&add		('$a','$a','$b')",
	"&eor		('$t','$d','$a')",
	"&ushr		('$d','$t',24)",
	"&sli		('$d','$t',8)",

	"&add		('$c','$c','$d')",
	"&eor		('$t','$b','$c')",
	"&ushr		('$b','$t',25)",
	"&sli		('$b','$t',7)",

	"&ext		('$c','$c','$c',8)",
	"&ext		('$d','$d','$d',$odd?4:12)",
	"&ext		('$b','$b','$b',$odd?12:4)"
	);
}
$code.=<<___;

.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	$len,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

.Loop_outer_neon:
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	mov	$A0,@K[0]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	mov	$A1,@K[0]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	mov	$A2,@K[0]
	mov.32	@x[6],@d[3]
	mov	$B0,@K[1]
	lsr	@x[7],@d[3],#32
	mov	$B1,@K[1]
	mov.32	@x[8],@d[4]
	mov	$B2,@K[1]
	lsr	@x[9],@d[4],#32
	mov	$D0,@K[3]
	mov.32	@x[10],@d[5]
	mov	$D1,@K[4]
	lsr	@x[11],@d[5],#32
	mov	$D2,@K[5]
	mov.32	@x[12],@d[6]
	mov	$C0,@K[2]
	lsr	@x[13],@d[6],#32
	mov	$C1,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C2,@K[2]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#10
	subs	$len,$len,#256
.Loop_neon:
	sub	$ctr,$ctr,#1
___
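# Each .Loop_outer_neon iteration yields four 64-byte blocks: three in
# NEON registers (counters @K[3..5], i.e. n+1..n+3) plus one in the scalar
# unit (counter n), i.e. the 3xNEON+1xIALU mode from the table above. The
# four instruction streams are interleaved below so the scalar and vector
# pipes issue in parallel.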
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&ROUND(0,4,8,12);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&ROUND(0,5,10,15);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}
$code.=<<___;
	cbnz	$ctr,.Loop_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	$A0,$A0,@K[0]
	add	@x[1],@x[1],@d[0],lsr#32
	add	$A1,$A1,@K[0]
	add.32	@x[2],@x[2],@d[1]
	add	$A2,$A2,@K[0]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[4],@x[4],@d[2]
	add	$C1,$C1,@K[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[6],@x[6],@d[3]
	add	$D0,$D0,@K[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	$D1,$D1,@K[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	$D2,$D2,@K[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	$B0,$B0,@K[1]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	$B1,$B1,@K[1]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$B2,$B2,@K[1]

	b.lo	.Ltail_neon

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	add	@K[3],@K[3],$ONE		// += 4
	stp	@x[8],@x[10],[$out,#32]
	add	@K[4],@K[4],$ONE
	stp	@x[12],@x[14],[$out,#48]
	add	@K[5],@K[5],$ONE
	add	$out,$out,#64

	st1.8	{$A0-$D0},[$out],#64
	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	eor	$A2,$A2,$A0
	eor	$B2,$B2,$B0
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret

.Ltail_neon:
	add	$len,$len,#256
	cmp	$len,#64
	b.lo	.Less_than_64

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#4			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_128

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A0,$A0,$T0
	eor	$B0,$B0,$T1
	eor	$C0,$C0,$T2
	eor	$D0,$D0,$T3
	st1.8	{$A0-$D0},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64
	cmp	$len,#64
	b.lo	.Less_than_192

	ld1.8	{$T0-$T3},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64
	b.eq	.Ldone_neon
	sub	$len,$len,#64

	st1.8	{$A2-$D2},[sp]
	b	.Last_neon

.Less_than_128:
	st1.8	{$A0-$D0},[sp]
	b	.Last_neon
.Less_than_192:
	st1.8	{$A1-$D1},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	sub	$out,$out,#1
	add	$inp,$inp,$len
	add	$out,$out,$len
	add	$ctr,sp,$len
	neg	$len,$len

.Loop_tail_neon:
	ldrb	w10,[$inp,$len]
	ldrb	w11,[$ctr,$len]
	add	$len,$len,#1
	eor	w10,w10,w11
	strb	w10,[$out,$len]
	cbnz	$len,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
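
# The 512-byte path runs six NEON "threads" plus the scalar unit, i.e. the
# 6xNEON+2xIALU mode from the table above. The 20 rounds are split between
# .Loop_upper_neon and .Loop_lower_neon (five double rounds each): the
# scalar block is finished and flushed after the upper half, then the
# scalar state is reloaded with counter+1, so the IALU contributes two
# blocks and NEON six, for eight blocks (512 bytes) per outer iteration.
# That is also why @K[3] below is incremented twice ("not typo"): the
# vector counters start at n+2.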
$code.=<<___;
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	@x[0],.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

.L512_or_more_neon:
	sub	sp,sp,#128+64

	ldp	@d[0],@d[1],[@x[0]]		// load sigma
	ld1	{@K[0]},[@x[0]],#16
	ldp	@d[2],@d[3],[$key]		// load key
	ldp	@d[4],@d[5],[$key,#16]
	ld1	{@K[1],@K[2]},[$key]
	ldp	@d[6],@d[7],[$ctr]		// load counter
	ld1	{@K[3]},[$ctr]
	ld1	{$ONE},[@x[0]]
#ifdef	__ARMEB__
	rev64	@K[0],@K[0]
	ror	@d[2],@d[2],#32
	ror	@d[3],@d[3],#32
	ror	@d[4],@d[4],#32
	ror	@d[5],@d[5],#32
	ror	@d[6],@d[6],#32
	ror	@d[7],@d[7],#32
#endif
	add	@K[3],@K[3],$ONE		// += 1
	stp	@K[0],@K[1],[sp,#0]		// off-load key block, invariant part
	add	@K[3],@K[3],$ONE		// not typo
	str	@K[2],[sp,#32]
	add	@K[4],@K[3],$ONE
	add	@K[5],@K[4],$ONE
	add	@K[6],@K[5],$ONE
	shl	$ONE,$ONE,#2			// 1 -> 4

	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
	stp	d10,d11,[sp,#128+16]
	stp	d12,d13,[sp,#128+32]
	stp	d14,d15,[sp,#128+48]

	sub	$len,$len,#512			// not typo

.Loop_outer_512_neon:
	mov	$A0,@K[0]
	mov	$A1,@K[0]
	mov	$A2,@K[0]
	mov	$A3,@K[0]
	mov	$A4,@K[0]
	mov	$A5,@K[0]
	mov	$B0,@K[1]
	mov.32	@x[0],@d[0]			// unpack key block
	mov	$B1,@K[1]
	lsr	@x[1],@d[0],#32
	mov	$B2,@K[1]
	mov.32	@x[2],@d[1]
	mov	$B3,@K[1]
	lsr	@x[3],@d[1],#32
	mov	$B4,@K[1]
	mov.32	@x[4],@d[2]
	mov	$B5,@K[1]
	lsr	@x[5],@d[2],#32
	mov	$D0,@K[3]
	mov.32	@x[6],@d[3]
	mov	$D1,@K[4]
	lsr	@x[7],@d[3],#32
	mov	$D2,@K[5]
	mov.32	@x[8],@d[4]
	mov	$D3,@K[6]
	lsr	@x[9],@d[4],#32
	mov	$C0,@K[2]
	mov.32	@x[10],@d[5]
	mov	$C1,@K[2]
	lsr	@x[11],@d[5],#32
	add	$D4,$D0,$ONE			// +4
	mov.32	@x[12],@d[6]
	add	$D5,$D1,$ONE			// +4
	lsr	@x[13],@d[6],#32
	mov	$C2,@K[2]
	mov.32	@x[14],@d[7]
	mov	$C3,@K[2]
	lsr	@x[15],@d[7],#32
	mov	$C4,@K[2]
	stp	@K[3],@K[4],[sp,#48]		// off-load key block, variable part
	mov	$C5,@K[2]
	str	@K[5],[sp,#80]

	mov	$ctr,#5
	subs	$len,$len,#512
.Loop_upper_neon:
	sub	$ctr,$ctr,#1
___
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
my $diff = ($#thread0+1)*6 - $#thread67 - 1;
my $i = 0;

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}
$code.=<<___;
	cbnz	$ctr,.Loop_upper_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	add	@x[1],@x[1],@d[0],lsr#32
	add.32	@x[2],@x[2],@d[1]
	add	@x[3],@x[3],@d[1],lsr#32
	add.32	@x[4],@x[4],@d[2]
	add	@x[5],@x[5],@d[2],lsr#32
	add.32	@x[6],@x[6],@d[3]
	add	@x[7],@x[7],@d[3],lsr#32
	add.32	@x[8],@x[8],@d[4]
	add	@x[9],@x[9],@d[4],lsr#32
	add.32	@x[10],@x[10],@d[5]
	add	@x[11],@x[11],@d[5],lsr#32
	add.32	@x[12],@x[12],@d[6]
	add	@x[13],@x[13],@d[6],lsr#32
	add.32	@x[14],@x[14],@d[7]
	add	@x[15],@x[15],@d[7],lsr#32

	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	@x[2],@x[2],@x[3],lsl#32
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	@x[4],@x[4],@x[5],lsl#32
	add	@x[6],@x[6],@x[7],lsl#32
	ldp	@x[5],@x[7],[$inp,#16]
	add	@x[8],@x[8],@x[9],lsl#32
	add	@x[10],@x[10],@x[11],lsl#32
	ldp	@x[9],@x[11],[$inp,#32]
	add	@x[12],@x[12],@x[13],lsl#32
	add	@x[14],@x[14],@x[15],lsl#32
	ldp	@x[13],@x[15],[$inp,#48]
	add	$inp,$inp,#64
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	@x[10],@x[10],@x[11]
	eor	@x[12],@x[12],@x[13]
	eor	@x[14],@x[14],@x[15]

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#1			// increment counter
	mov.32	@x[0],@d[0]			// unpack key block
	lsr	@x[1],@d[0],#32
	stp	@x[4],@x[6],[$out,#16]
	mov.32	@x[2],@d[1]
	lsr	@x[3],@d[1],#32
	stp	@x[8],@x[10],[$out,#32]
	mov.32	@x[4],@d[2]
	lsr	@x[5],@d[2],#32
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	mov.32	@x[6],@d[3]
	lsr	@x[7],@d[3],#32
	mov.32	@x[8],@d[4]
	lsr	@x[9],@d[4],#32
	mov.32	@x[10],@d[5]
	lsr	@x[11],@d[5],#32
	mov.32	@x[12],@d[6]
	lsr	@x[13],@d[6],#32
	mov.32	@x[14],@d[7]
	lsr	@x[15],@d[7],#32

	mov	$ctr,#5
.Loop_lower_neon:
	sub	$ctr,$ctr,#1
___
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}

@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

foreach (@thread0) {
	eval;			eval(shift(@thread67));
	eval(shift(@thread1));	eval(shift(@thread67));
	eval(shift(@thread2));	eval(shift(@thread67));
	eval(shift(@thread3));	eval(shift(@thread67));
	eval(shift(@thread4));	eval(shift(@thread67));
	eval(shift(@thread5));	eval(shift(@thread67));
}
$code.=<<___;
	cbnz	$ctr,.Loop_lower_neon

	add.32	@x[0],@x[0],@d[0]		// accumulate key block
	ldp	@K[0],@K[1],[sp,#0]
	add	@x[1],@x[1],@d[0],lsr#32
	ldp	@K[2],@K[3],[sp,#32]
	add.32	@x[2],@x[2],@d[1]
	ldp	@K[4],@K[5],[sp,#64]
	add	@x[3],@x[3],@d[1],lsr#32
	add	$A0,$A0,@K[0]
	add.32	@x[4],@x[4],@d[2]
	add	$A1,$A1,@K[0]
	add	@x[5],@x[5],@d[2],lsr#32
	add	$A2,$A2,@K[0]
	add.32	@x[6],@x[6],@d[3]
	add	$A3,$A3,@K[0]
	add	@x[7],@x[7],@d[3],lsr#32
	add	$A4,$A4,@K[0]
	add.32	@x[8],@x[8],@d[4]
	add	$A5,$A5,@K[0]
	add	@x[9],@x[9],@d[4],lsr#32
	add	$C0,$C0,@K[2]
	add.32	@x[10],@x[10],@d[5]
	add	$C1,$C1,@K[2]
	add	@x[11],@x[11],@d[5],lsr#32
	add	$C2,$C2,@K[2]
	add.32	@x[12],@x[12],@d[6]
	add	$C3,$C3,@K[2]
	add	@x[13],@x[13],@d[6],lsr#32
	add	$C4,$C4,@K[2]
	add.32	@x[14],@x[14],@d[7]
	add	$C5,$C5,@K[2]
	add	@x[15],@x[15],@d[7],lsr#32
	add	$D4,$D4,$ONE			// +4
	add	@x[0],@x[0],@x[1],lsl#32	// pack
	add	$D5,$D5,$ONE			// +4
	add	@x[2],@x[2],@x[3],lsl#32
	add	$D0,$D0,@K[3]
	ldp	@x[1],@x[3],[$inp,#0]		// load input
	add	$D1,$D1,@K[4]
	add	@x[4],@x[4],@x[5],lsl#32
	add	$D2,$D2,@K[5]
	add	@x[6],@x[6],@x[7],lsl#32
	add	$D3,$D3,@K[6]
	ldp	@x[5],@x[7],[$inp,#16]
	add	$D4,$D4,@K[3]
	add	@x[8],@x[8],@x[9],lsl#32
	add	$D5,$D5,@K[4]
	add	@x[10],@x[10],@x[11],lsl#32
	add	$B0,$B0,@K[1]
	ldp	@x[9],@x[11],[$inp,#32]
	add	$B1,$B1,@K[1]
	add	@x[12],@x[12],@x[13],lsl#32
	add	$B2,$B2,@K[1]
	add	@x[14],@x[14],@x[15],lsl#32
	add	$B3,$B3,@K[1]
	ldp	@x[13],@x[15],[$inp,#48]
	add	$B4,$B4,@K[1]
	add	$inp,$inp,#64
	add	$B5,$B5,@K[1]
#ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[2],@x[2]
	rev	@x[4],@x[4]
	rev	@x[6],@x[6]
	rev	@x[8],@x[8]
	rev	@x[10],@x[10]
	rev	@x[12],@x[12]
	rev	@x[14],@x[14]
#endif
	ld1.8	{$T0-$T3},[$inp],#64
	eor	@x[0],@x[0],@x[1]
	eor	@x[2],@x[2],@x[3]
	eor	@x[4],@x[4],@x[5]
	eor	@x[6],@x[6],@x[7]
	eor	@x[8],@x[8],@x[9]
	eor	$A0,$A0,$T0
	eor	@x[10],@x[10],@x[11]
	eor	$B0,$B0,$T1
	eor	@x[12],@x[12],@x[13]
	eor	$C0,$C0,$T2
	eor	@x[14],@x[14],@x[15]
	eor	$D0,$D0,$T3
	ld1.8	{$T0-$T3},[$inp],#64

	stp	@x[0],@x[2],[$out,#0]		// store output
	add	@d[6],@d[6],#7			// increment counter
	stp	@x[4],@x[6],[$out,#16]
	stp	@x[8],@x[10],[$out,#32]
	stp	@x[12],@x[14],[$out,#48]
	add	$out,$out,#64
	st1.8	{$A0-$D0},[$out],#64

	ld1.8	{$A0-$D0},[$inp],#64
	eor	$A1,$A1,$T0
	eor	$B1,$B1,$T1
	eor	$C1,$C1,$T2
	eor	$D1,$D1,$T3
	st1.8	{$A1-$D1},[$out],#64

	ld1.8	{$A1-$D1},[$inp],#64
	eor	$A2,$A2,$A0
	ldp	@K[0],@K[1],[sp,#0]
	eor	$B2,$B2,$B0
	ldp	@K[2],@K[3],[sp,#32]
	eor	$C2,$C2,$C0
	eor	$D2,$D2,$D0
	st1.8	{$A2-$D2},[$out],#64

	ld1.8	{$A2-$D2},[$inp],#64
	eor	$A3,$A3,$A1
	eor	$B3,$B3,$B1
	eor	$C3,$C3,$C1
	eor	$D3,$D3,$D1
	st1.8	{$A3-$D3},[$out],#64

	ld1.8	{$A3-$D3},[$inp],#64
	eor	$A4,$A4,$A2
	eor	$B4,$B4,$B2
	eor	$C4,$C4,$C2
	eor	$D4,$D4,$D2
	st1.8	{$A4-$D4},[$out],#64

	shl	$A0,$ONE,#1			// 4 -> 8
	eor	$A5,$A5,$A3
	eor	$B5,$B5,$B3
	eor	$C5,$C5,$C3
	eor	$D5,$D5,$D3
	st1.8	{$A5-$D5},[$out],#64

	add	@K[3],@K[3],$A0			// += 8
	add	@K[4],@K[4],$A0
	add	@K[5],@K[5],$A0
	add	@K[6],@K[6],$A0

	b.hs	.Loop_outer_512_neon

	adds	$len,$len,#512
	ushr	$A0,$ONE,#2			// 4 -> 1

	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
	ldp	d10,d11,[sp,#128+16]
	ldp	d12,d13,[sp,#128+32]
	ldp	d14,d15,[sp,#128+48]

	stp	@K[0],$ONE,[sp,#0]		// wipe off-load area
	stp	@K[0],$ONE,[sp,#32]
	stp	@K[0],$ONE,[sp,#64]

	b.eq	.Ldone_512_neon

	cmp	$len,#192
	sub	@K[3],@K[3],$A0			// -= 1
	sub	@K[4],@K[4],$A0
	sub	@K[5],@K[5],$A0
	add	sp,sp,#128
	b.hs	.Loop_outer_neon

	eor	@K[1],@K[1],@K[1]
	eor	@K[2],@K[2],@K[2]
	eor	@K[3],@K[3],@K[3]
	eor	@K[4],@K[4],@K[4]
	eor	@K[5],@K[5],@K[5]
	eor	@K[6],@K[6],@K[6]
	b	.Loop_outer

.Ldone_512_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#128+64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_512_neon,.-ChaCha20_512_neon
___
}
}}}
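
# Post-processing: resolve the local instruction-name conventions used
# above into plain AArch64 assembly.
#  - "op.32 xN,..."          -> "op wN,..."	(32-bit scalar forms)
#  - eor/ext/mov with .4s    -> .16b		(bitwise, element size moot)
#  - "ld1.8"/"st1.8" .4s     -> .16b		(byte-oriented vector I/O)
#  - ldr/ldp/str/stp vN.4s   -> qN		(plain 128-bit loads/stores)
#  - "rev32.16" with .4s     -> rev32 .8h	(16-bit lanes, rotate by 16)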
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1))	or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1))	or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1))	or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1))	or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# flush