sm3-armv8.pl 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. #! /usr/bin/env perl
  2. # Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # This module implements support for Armv8 SM3 instructions
  10. # $output is the last argument if it looks like a file (it has an extension)
  11. # $flavour is the first argument if it doesn't look like a file
  12. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  13. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  14. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  15. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  16. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  17. die "can't locate arm-xlate.pl";
  18. open OUT,"| \"$^X\" $xlate $flavour \"$output\""
  19. or die "can't call $xlate: $!";
  20. *STDOUT=*OUT;
  21. # Message expanding:
  22. # Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
  23. # Input: s0, s1, s2, s3
  24. # s0 = w0 | w1 | w2 | w3
  25. # s1 = w4 | w5 | w6 | w7
  26. # s2 = w8 | w9 | w10 | w11
  27. # s3 = w12 | w13 | w14 | w15
  28. # Output: s4
  29. sub msg_exp () {
  30. my $s0 = shift;
  31. my $s1 = shift;
  32. my $s2 = shift;
  33. my $s3 = shift;
  34. my $s4 = shift;
  35. my $vtmp1 = shift;
  36. my $vtmp2 = shift;
  37. $code.=<<___;
  38. // s4 = w7 | w8 | w9 | w10
  39. ext $s4.16b, $s1.16b, $s2.16b, #12
  40. // vtmp1 = w3 | w4 | w5 | w6
  41. ext $vtmp1.16b, $s0.16b, $s1.16b, #12
  42. // vtmp2 = w10 | w11 | w12 | w13
  43. ext $vtmp2.16b, $s2.16b, $s3.16b, #8
  44. sm3partw1 $s4.4s, $s0.4s, $s3.4s
  45. sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
  46. ___
  47. }
  48. # A round of compresson function
  49. # Input:
  50. # ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
  51. # vstate0 - vstate1, store digest status(A - H)
  52. # vconst0 - vconst1, interleaved used to store Tj <<< j
  53. # vtmp - temporary register
  54. # vw - for sm3tt1ab, vw = s0 eor s1
  55. # s0 - for sm3tt2ab, just be s0
  56. # i, choose wj' or wj from vw
  57. sub round () {
  58. my $ab = shift;
  59. my $vstate0 = shift;
  60. my $vstate1 = shift;
  61. my $vconst0 = shift;
  62. my $vconst1 = shift;
  63. my $vtmp = shift;
  64. my $vw = shift;
  65. my $s0 = shift;
  66. my $i = shift;
  67. $code.=<<___;
  68. sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
  69. shl $vconst1.4s, $vconst0.4s, #1
  70. sri $vconst1.4s, $vconst0.4s, #31
  71. sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
  72. sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
  73. ___
  74. }
  75. sub qround () {
  76. my $ab = shift;
  77. my $vstate0 = shift;
  78. my $vstate1 = shift;
  79. my $vconst0 = shift;
  80. my $vconst1 = shift;
  81. my $vtmp1 = shift;
  82. my $vtmp2 = shift;
  83. my $s0 = shift;
  84. my $s1 = shift;
  85. my $s2 = shift;
  86. my $s3 = shift;
  87. my $s4 = shift;
  88. if($s4) {
  89. &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
  90. }
  91. $code.=<<___;
  92. eor $vtmp1.16b, $s0.16b, $s1.16b
  93. ___
  94. &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
  95. $vtmp1, $s0, 0);
  96. &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
  97. $vtmp1, $s0, 1);
  98. &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
  99. $vtmp1, $s0, 2);
  100. &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
  101. $vtmp1, $s0, 3);
  102. }
  103. $code=<<___;
  104. #include "arm_arch.h"
  105. .arch armv8.2-a
  106. .text
  107. ___
  108. {{{
  109. my ($pstate,$pdata,$num)=("x0","x1","w2");
  110. my ($state1,$state2)=("v5","v6");
  111. my ($sconst1, $sconst2)=("s16","s17");
  112. my ($vconst1, $vconst2)=("v16","v17");
  113. my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
  114. my ($bkstate1,$bkstate2)=("v18","v19");
  115. my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
  116. my ($vtmp1,$vtmp2)=("v22","v23");
  117. my $constaddr="x8";
  118. # void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
  119. $code.=<<___;
  120. .globl ossl_hwsm3_block_data_order
  121. .type ossl_hwsm3_block_data_order,%function
  122. .align 5
  123. ossl_hwsm3_block_data_order:
  124. AARCH64_VALID_CALL_TARGET
  125. // load state
  126. ld1 {$state1.4s-$state2.4s}, [$pstate]
  127. rev64 $state1.4s, $state1.4s
  128. rev64 $state2.4s, $state2.4s
  129. ext $state1.16b, $state1.16b, $state1.16b, #8
  130. ext $state2.16b, $state2.16b, $state2.16b, #8
  131. adr $constaddr, .Tj
  132. ldp $sconst1, $sconst2, [$constaddr]
  133. .Loop:
  134. // load input
  135. ld1 {$s0.16b-$s3.16b}, [$pdata], #64
  136. sub $num, $num, #1
  137. mov $bkstate1.16b, $state1.16b
  138. mov $bkstate2.16b, $state2.16b
  139. #ifndef __ARMEB__
  140. rev32 $s0.16b, $s0.16b
  141. rev32 $s1.16b, $s1.16b
  142. rev32 $s2.16b, $s2.16b
  143. rev32 $s3.16b, $s3.16b
  144. #endif
  145. ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
  146. ___
  147. &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  148. $s0,$s1,$s2,$s3,$s4);
  149. &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  150. $s1,$s2,$s3,$s4,$s0);
  151. &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  152. $s2,$s3,$s4,$s0,$s1);
  153. &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  154. $s3,$s4,$s0,$s1,$s2);
  155. $code.=<<___;
  156. ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
  157. ___
  158. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  159. $s4,$s0,$s1,$s2,$s3);
  160. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  161. $s0,$s1,$s2,$s3,$s4);
  162. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  163. $s1,$s2,$s3,$s4,$s0);
  164. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  165. $s2,$s3,$s4,$s0,$s1);
  166. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  167. $s3,$s4,$s0,$s1,$s2);
  168. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  169. $s4,$s0,$s1,$s2,$s3);
  170. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  171. $s0,$s1,$s2,$s3,$s4);
  172. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  173. $s1,$s2,$s3,$s4,$s0);
  174. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  175. $s2,$s3,$s4,$s0,$s1);
  176. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  177. $s3,$s4);
  178. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  179. $s4,$s0);
  180. &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
  181. $s0,$s1);
  182. $code.=<<___;
  183. eor $state1.16b, $state1.16b, $bkstate1.16b
  184. eor $state2.16b, $state2.16b, $bkstate2.16b
  185. // any remained blocks?
  186. cbnz $num, .Loop
  187. // save state
  188. rev64 $state1.4s, $state1.4s
  189. rev64 $state2.4s, $state2.4s
  190. ext $state1.16b, $state1.16b, $state1.16b, #8
  191. ext $state2.16b, $state2.16b, $state2.16b, #8
  192. st1 {$state1.4s-$state2.4s}, [$pstate]
  193. ret
  194. .size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
  195. .align 3
  196. .Tj:
  197. .word 0x79cc4519, 0x9d8a7a87
  198. ___
  199. }}}
  200. #########################################
  201. my %sm3partopcode = (
  202. "sm3partw1" => 0xce60C000,
  203. "sm3partw2" => 0xce60C400);
  204. my %sm3ss1opcode = (
  205. "sm3ss1" => 0xce400000);
  206. my %sm3ttopcode = (
  207. "sm3tt1a" => 0xce408000,
  208. "sm3tt1b" => 0xce408400,
  209. "sm3tt2a" => 0xce408800,
  210. "sm3tt2b" => 0xce408C00);
  211. sub unsm3part {
  212. my ($mnemonic,$arg)=@_;
  213. $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
  214. &&
  215. sprintf ".inst\t0x%08x\t//%s %s",
  216. $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
  217. $mnemonic,$arg;
  218. }
  219. sub unsm3ss1 {
  220. my ($mnemonic,$arg)=@_;
  221. $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
  222. &&
  223. sprintf ".inst\t0x%08x\t//%s %s",
  224. $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
  225. $mnemonic,$arg;
  226. }
  227. sub unsm3tt {
  228. my ($mnemonic,$arg)=@_;
  229. $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
  230. &&
  231. sprintf ".inst\t0x%08x\t//%s %s",
  232. $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
  233. $mnemonic,$arg;
  234. }
  235. open SELF,$0;
  236. while(<SELF>) {
  237. next if (/^#!/);
  238. last if (!s/^#/\/\// and !/^$/);
  239. print;
  240. }
  241. close SELF;
  242. foreach(split("\n",$code)) {
  243. s/\`([^\`]*)\`/eval($1)/ge;
  244. s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
  245. s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
  246. s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
  247. print $_,"\n";
  248. }
  249. close STDOUT or die "error closing STDOUT: $!";