#! /usr/bin/env perl
# Copyright 2014-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it likewise supports both 32- and
# 64-bit modes of operation. The latter is achieved by limiting the
# number of utilized registers to 16, which implies additional NEON
# load and integer instructions. This has no effect on the mighty
# Apple A7, where results are literally equal to the theoretical
# estimates based on AES instruction latencies and issue rates. On
# Cortex-A53, an in-order execution core, this costs up to 10-15%,
# which is partially compensated by implementing a dedicated code
# path for the 128-bit CBC encrypt case. On Cortex-A57, performance
# in parallelizable modes seems to be limited by the sheer number of
# NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc	CBC dec	CTR
# Apple A7	2.39	1.20	1.20
# Cortex-A53	1.32	1.29	1.46
# Cortex-A57(*)	1.95	0.85	0.93
# Denver	1.96	0.86	0.80
# Mongoose	1.33	1.20	1.20
# Kryo		1.26	0.94	1.00
#
# (*) original 3.64/1.34/1.32 results were for the r0p0 revision
#     and remain the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"		if ($flavour =~ /64/);
$code.=<<___				if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON mnemonics are mostly 32-bit, integer ones mostly 64-bit. The
# goal is to maintain both 32- and 64-bit code within a single module
# and transliterate the common code to either flavour with regex
# voodoo.
#
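# As an illustration (derived from the transliteration rules at the
# bottom of this file): a shared line such as
#
#	vld1.32	{q8},[$key_],#16
#
# is kept in essentially this form for 32-bit, with the post-index
# rewritten as "[r7]!", while for 64-bit it is transliterated to
#
#	ld1	{v16.4s},[x7],#16
#
# note that q8-q15 are renamed to v16-v23, apparently to keep clear
# of v8-v15, whose low halves are callee-saved by the AArch64 ABI.
#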
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
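# Note: the 32-bit flavour skips q4-q7, which alias the AAPCS
# callee-saved d8-d15 that this function does not save.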
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32	{$in1},[$out],#16
	sub	$out,$out,#8
#else
	vst1.32	{$in1},[$out],#8
#endif
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___;
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat1,$ivec,$ivec
___
$code.=<<___	if ($flavour !~ /64/);
	rev	$tctr0,$tctr0
___
$code.=<<___;
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
___
$code.=<<___	if ($flavour =~ /64/);
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
___
$code.=<<___	if ($flavour =~ /64/);
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
___
$code.=<<___	if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___;
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
___
$code.=<<___	if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
___
$code.=<<___;
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
___
$code.=<<___	if ($flavour =~ /64/);
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
___
$code.=<<___	if ($flavour !~ /64/);
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___;
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}

$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=> 0x4e285800,	"aese"	=> 0x4e284800,
	"aesimc"=> 0x4e287800,	"aesmc"	=> 0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
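
    # For reference: "aese v0.16b,v1.16b" would encode as
    # 0x4e284800|0|(1<<5), i.e. ".inst 0x4e284820". The substitution
    # that calls unaes() is commented out in the loop below, since the
    # ".arch armv8-a+crypto" directive above lets the assembler accept
    # these mnemonics directly.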
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
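	# (e.g. "cclr x8,eq" expands to "csel x8,xzr,x8,eq" and
	# "mov.lo x6,x2" to "csel x6,x2,x6,lo")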
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=> 0xf3b00340,	"aese"	=> 0xf3b00300,
	"aesimc"=> 0xf3b003c0,	"aesmc"	=> 0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so
	    # emit the raw bytes in that order. The correct solution
	    # would be the .inst directive, but older assemblers
	    # don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
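
    # For reference: "aese q0,q1" gives 0xf3b00300|(1<<1) = 0xf3b00302,
    # emitted as ".byte 0x02,0x03,0xb0,0xf3".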
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
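    # (e.g. "vtbl.8 q3,{q0},q1" becomes the pair "vtbl.8 d6,{q0},d2"
    # and "vtbl.8 d7,{q0},d3", since 32-bit vtbl writes only a D register)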
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
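    # (e.g. "vdup.32 q2,q0[3]" becomes "vdup.32 q2,d1[1]": Q-register
    # lane 3 lives in the odd lane of the high D half)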
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
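    # (e.g. "vmov.32 q1[3],r9" becomes "vmov.32 d3[1],r9")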
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";