md5-loongarch64.pl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. #! /usr/bin/env perl
  2. # Author: Min Zhou <[email protected]>
  3. # Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
  4. #
  5. # Licensed under the OpenSSL license (the "License"). You may not use
  6. # this file except in compliance with the License. You can obtain a copy
  7. # in the file LICENSE in the source distribution or at
  8. # https://www.openssl.org/source/license.html
  9. # Reference to crypto/md5/asm/md5-x86_64.pl
  10. # MD5 optimized for LoongArch.
  11. use strict;
  12. my $code;
  13. my ($zero,$ra,$tp,$sp,$fp)=map("\$r$_",(0..3,22));
  14. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$r$_",(4..11));
  15. my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$x)=map("\$r$_",(12..21));
  16. # $output is the last argument if it looks like a file (it has an extension)
  17. my $output;
  18. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  19. open STDOUT,">$output";
  20. # round1_step() does:
  21. # dst = x + ((dst + F(x,y,z) + X[k] + T_i) <<< s)
  22. # $t1 = y ^ z
  23. # $t2 = dst + X[k_next]
  24. sub round1_step
  25. {
  26. my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
  27. my $T_i_h = ($T_i & 0xfffff000) >> 12;
  28. my $T_i_l = $T_i & 0xfff;
  29. # In LoongArch we have to use two instructions of lu12i.w and ori to load a
  30. # 32-bit immediate into a general register. Meanwhile, the instruction lu12i.w
  31. # treats the 20-bit immediate as a signed number. So if the T_i_h is greater
  32. # than or equal to (1<<19), we need provide lu12i.w a corresponding negative
  33. # number whose complement equals to the sign extension of T_i_h.
  34. # The details of the instruction lu12i.w can be found as following:
  35. # https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_lu12i_w_lu32i_d_lu52i_d
  36. $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
  37. $code .= " ld.w $t0,$a1,0 /* (NEXT STEP) X[0] */\n" if ($pos == -1);
  38. $code .= " xor $t1,$y,$z /* y ^ z */\n" if ($pos == -1);
  39. $code .= " add.w $t2,$dst,$t0 /* dst + X[k] */\n" if ($pos == -1);
  40. $code .= <<EOF;
  41. lu12i.w $t8,$T_i_h /* load bits [31:12] of constant */
  42. and $t1,$x,$t1 /* x & ... */
  43. ori $t8,$t8,$T_i_l /* load bits [11:0] of constant */
  44. xor $t1,$z,$t1 /* z ^ ... */
  45. add.w $t7,$t2,$t8 /* dst + X[k] + Const */
  46. ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
  47. add.w $dst,$t7,$t1 /* dst += ... */
  48. add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
  49. EOF
  50. $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
  51. if ($pos != 1) {
  52. $code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n";
  53. } else {
  54. $code .= " move $t0,$a7 /* (NEXT ROUND) $t0 = z' (copy of z) */\n";
  55. $code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z' (copy of not z) */\n";
  56. }
  57. $code .= " add.w $dst,$dst,$x /* dst += x */\n";
  58. }
  59. # round2_step() does:
  60. # dst = x + ((dst + G(x,y,z) + X[k] + T_i) <<< s)
  61. # $t0 = z' (copy of z for the next step)
  62. # $t1 = not z' (copy of not z for the next step)
  63. # $t2 = dst + X[k_next]
  64. sub round2_step
  65. {
  66. my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
  67. my $T_i_h = ($T_i & 0xfffff000) >> 12;
  68. my $T_i_l = $T_i & 0xfff;
  69. $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
  70. $code .= <<EOF;
  71. lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
  72. and $t0,$x,$t0 /* x & z */
  73. ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
  74. and $t1,$y,$t1 /* y & (not z) */
  75. add.w $t7,$t2,$t8 /* dst + X[k] + Const */
  76. or $t1,$t0,$t1 /* (y & (not z)) | (x & z) */
  77. ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
  78. add.w $dst,$t7,$t1 /* dst += ... */
  79. add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
  80. EOF
  81. $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
  82. if ($pos != 1) {
  83. $code .= " move $t0,$y /* (NEXT STEP) z' = $y */\n";
  84. $code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n";
  85. } else {
  86. $code .= " xor $t1,$a6,$a7 /* (NEXT ROUND) $t1 = y ^ z */\n";
  87. }
  88. $code .= " add.w $dst,$dst,$x /* dst += x */\n";
  89. }
  90. # round3_step() does:
  91. # dst = x + ((dst + H(x,y,z) + X[k] + T_i) <<< s)
  92. # $t1 = y ^ z
  93. # $t2 = dst + X[k_next]
  94. sub round3_step
  95. {
  96. my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
  97. my $T_i_h = ($T_i & 0xfffff000) >> 12;
  98. my $T_i_l = $T_i & 0xfff;
  99. $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
  100. $code .= <<EOF;
  101. lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
  102. xor $t1,$x,$t1 /* x ^ ... */
  103. ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
  104. add.w $t7,$t2,$t8 /* dst + X[k] + Const */
  105. ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */
  106. add.w $dst,$t7,$t1 /* dst += ... */
  107. add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */
  108. EOF
  109. $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
  110. if ($pos != 1) {
  111. $code .= " xor $t1,$x,$y /* (NEXT STEP) y ^ z */\n";
  112. } else {
  113. $code .= " nor $t1,$zero,$a7 /* (NEXT ROUND) $t1 = not z */\n";
  114. }
  115. $code .= " add.w $dst,$dst,$x /* dst += x */\n";
  116. }
  117. # round4_step() does:
  118. # dst = x + ((dst + I(x,y,z) + X[k] + T_i) <<< s)
  119. # $t1 = not z' (copy of not z for the next step)
  120. # $t2 = dst + X[k_next]
  121. sub round4_step
  122. {
  123. my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
  124. my $T_i_h = ($T_i & 0xfffff000) >> 12;
  125. my $T_i_l = $T_i & 0xfff;
  126. $T_i_h = -((1<<32) - (0xfff00000 | $T_i_h)) if ($T_i_h >= (1<<19));
  127. $code .= <<EOF;
  128. lu12i.w $t8,$T_i_h /* load bits [31:12] of Constant */
  129. or $t1,$x,$t1 /* x | ... */
  130. ori $t8,$t8,$T_i_l /* load bits [11:0] of Constant */
  131. xor $t1,$y,$t1 /* y ^ ... */
  132. add.w $t7,$t2,$t8 /* dst + X[k] + Const */
  133. EOF
  134. if ($pos != 1) {
  135. $code .= " ld.w $t0,$a1,$k_next*4 /* (NEXT STEP) X[$k_next] */\n";
  136. $code .= " add.w $dst,$t7,$t1 /* dst += ... */\n";
  137. $code .= " add.w $t2,$z,$t0 /* (NEXT STEP) dst + X[$k_next] */\n";
  138. $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
  139. $code .= " nor $t1,$zero,$y /* (NEXT STEP) not z' = not $y */\n";
  140. $code .= " add.w $dst,$dst,$x /* dst += x */\n";
  141. } else {
  142. $code .= " add.w $a4,$t3,$a4 /* (NEXT LOOP) add old value of A */\n";
  143. $code .= " add.w $dst,$t7,$t1 /* dst += ... */\n";
  144. $code .= " add.w $a7,$t6,$a7 /* (NEXT LOOP) add old value of D */\n";
  145. $code .= " rotri.w $dst,$dst,32-$s /* dst <<< s */\n";
  146. $code .= " addi.d $a1,$a1,64 /* (NEXT LOOP) ptr += 64 */\n";
  147. $code .= " add.w $dst,$dst,$x /* dst += x */\n";
  148. }
  149. }
  150. $code .= <<EOF;
  151. .text
  152. .globl ossl_md5_block_asm_data_order
  153. .type ossl_md5_block_asm_data_order function
  154. ossl_md5_block_asm_data_order:
  155. # $a0 = arg #1 (ctx, MD5_CTX pointer)
  156. # $a1 = arg #2 (ptr, data pointer)
  157. # $a2 = arg #3 (nbr, number of 16-word blocks to process)
  158. beqz $a2,.Lend # cmp nbr with 0, jmp if nbr == 0
  159. # ptr is '$a1'
  160. # end is '$a3'
  161. slli.d $t0,$a2,6
  162. add.d $a3,$a1,$t0
  163. # A is '$a4'
  164. # B is '$a5'
  165. # C is '$a6'
  166. # D is '$a7'
  167. ld.w $a4,$a0,0 # a4 = ctx->A
  168. ld.w $a5,$a0,4 # a5 = ctx->B
  169. ld.w $a6,$a0,8 # a6 = ctx->C
  170. ld.w $a7,$a0,12 # a7 = ctx->D
  171. # BEGIN of loop over 16-word blocks
  172. .align 6
  173. .Lloop:
  174. # save old values of A, B, C, D
  175. move $t3,$a4
  176. move $t4,$a5
  177. move $t5,$a6
  178. move $t6,$a7
  179. preld 0,$a1,0
  180. preld 0,$a1,64
  181. EOF
  182. round1_step(-1, $a4, $a5, $a6, $a7, '1', 0xd76aa478, '7');
  183. round1_step(0, $a7, $a4, $a5, $a6, '2', 0xe8c7b756, '12');
  184. round1_step(0, $a6, $a7, $a4, $a5, '3', 0x242070db, '17');
  185. round1_step(0, $a5, $a6, $a7, $a4, '4', 0xc1bdceee, '22');
  186. round1_step(0, $a4, $a5, $a6, $a7, '5', 0xf57c0faf, '7');
  187. round1_step(0, $a7, $a4, $a5, $a6, '6', 0x4787c62a, '12');
  188. round1_step(0, $a6, $a7, $a4, $a5, '7', 0xa8304613, '17');
  189. round1_step(0, $a5, $a6, $a7, $a4, '8', 0xfd469501, '22');
  190. round1_step(0, $a4, $a5, $a6, $a7, '9', 0x698098d8, '7');
  191. round1_step(0, $a7, $a4, $a5, $a6, '10', 0x8b44f7af, '12');
  192. round1_step(0, $a6, $a7, $a4, $a5, '11', 0xffff5bb1, '17');
  193. round1_step(0, $a5, $a6, $a7, $a4, '12', 0x895cd7be, '22');
  194. round1_step(0, $a4, $a5, $a6, $a7, '13', 0x6b901122, '7');
  195. round1_step(0, $a7, $a4, $a5, $a6, '14', 0xfd987193, '12');
  196. round1_step(0, $a6, $a7, $a4, $a5, '15', 0xa679438e, '17');
  197. round1_step(1, $a5, $a6, $a7, $a4, '1', 0x49b40821, '22');
  198. round2_step(-1, $a4, $a5, $a6, $a7, '6', 0xf61e2562, '5');
  199. round2_step(0, $a7, $a4, $a5, $a6, '11', 0xc040b340, '9');
  200. round2_step(0, $a6, $a7, $a4, $a5, '0', 0x265e5a51, '14');
  201. round2_step(0, $a5, $a6, $a7, $a4, '5', 0xe9b6c7aa, '20');
  202. round2_step(0, $a4, $a5, $a6, $a7, '10', 0xd62f105d, '5');
  203. round2_step(0, $a7, $a4, $a5, $a6, '15', 0x2441453, '9');
  204. round2_step(0, $a6, $a7, $a4, $a5, '4', 0xd8a1e681, '14');
  205. round2_step(0, $a5, $a6, $a7, $a4, '9', 0xe7d3fbc8, '20');
  206. round2_step(0, $a4, $a5, $a6, $a7, '14', 0x21e1cde6, '5');
  207. round2_step(0, $a7, $a4, $a5, $a6, '3', 0xc33707d6, '9');
  208. round2_step(0, $a6, $a7, $a4, $a5, '8', 0xf4d50d87, '14');
  209. round2_step(0, $a5, $a6, $a7, $a4, '13', 0x455a14ed, '20');
  210. round2_step(0, $a4, $a5, $a6, $a7, '2', 0xa9e3e905, '5');
  211. round2_step(0, $a7, $a4, $a5, $a6, '7', 0xfcefa3f8, '9');
  212. round2_step(0, $a6, $a7, $a4, $a5, '12', 0x676f02d9, '14');
  213. round2_step(1, $a5, $a6, $a7, $a4, '5', 0x8d2a4c8a, '20');
  214. round3_step(-1, $a4, $a5, $a6, $a7, '8', 0xfffa3942, '4');
  215. round3_step(0, $a7, $a4, $a5, $a6, '11', 0x8771f681, '11');
  216. round3_step(0, $a6, $a7, $a4, $a5, '14', 0x6d9d6122, '16');
  217. round3_step(0, $a5, $a6, $a7, $a4, '1', 0xfde5380c, '23');
  218. round3_step(0, $a4, $a5, $a6, $a7, '4', 0xa4beea44, '4');
  219. round3_step(0, $a7, $a4, $a5, $a6, '7', 0x4bdecfa9, '11');
  220. round3_step(0, $a6, $a7, $a4, $a5, '10', 0xf6bb4b60, '16');
  221. round3_step(0, $a5, $a6, $a7, $a4, '13', 0xbebfbc70, '23');
  222. round3_step(0, $a4, $a5, $a6, $a7, '0', 0x289b7ec6, '4');
  223. round3_step(0, $a7, $a4, $a5, $a6, '3', 0xeaa127fa, '11');
  224. round3_step(0, $a6, $a7, $a4, $a5, '6', 0xd4ef3085, '16');
  225. round3_step(0, $a5, $a6, $a7, $a4, '9', 0x4881d05, '23');
  226. round3_step(0, $a4, $a5, $a6, $a7, '12', 0xd9d4d039, '4');
  227. round3_step(0, $a7, $a4, $a5, $a6, '15', 0xe6db99e5, '11');
  228. round3_step(0, $a6, $a7, $a4, $a5, '2', 0x1fa27cf8, '16');
  229. round3_step(1, $a5, $a6, $a7, $a4, '0', 0xc4ac5665, '23');
  230. round4_step(-1, $a4, $a5, $a6, $a7, '7', 0xf4292244, '6');
  231. round4_step(0, $a7, $a4, $a5, $a6, '14', 0x432aff97, '10');
  232. round4_step(0, $a6, $a7, $a4, $a5, '5', 0xab9423a7, '15');
  233. round4_step(0, $a5, $a6, $a7, $a4, '12', 0xfc93a039, '21');
  234. round4_step(0, $a4, $a5, $a6, $a7, '3', 0x655b59c3, '6');
  235. round4_step(0, $a7, $a4, $a5, $a6, '10', 0x8f0ccc92, '10');
  236. round4_step(0, $a6, $a7, $a4, $a5, '1', 0xffeff47d, '15');
  237. round4_step(0, $a5, $a6, $a7, $a4, '8', 0x85845dd1, '21');
  238. round4_step(0, $a4, $a5, $a6, $a7, '15', 0x6fa87e4f, '6');
  239. round4_step(0, $a7, $a4, $a5, $a6, '6', 0xfe2ce6e0, '10');
  240. round4_step(0, $a6, $a7, $a4, $a5, '13', 0xa3014314, '15');
  241. round4_step(0, $a5, $a6, $a7, $a4, '4', 0x4e0811a1, '21');
  242. round4_step(0, $a4, $a5, $a6, $a7, '11', 0xf7537e82, '6');
  243. round4_step(0, $a7, $a4, $a5, $a6, '2', 0xbd3af235, '10');
  244. round4_step(0, $a6, $a7, $a4, $a5, '9', 0x2ad7d2bb, '15');
  245. round4_step(1, $a5, $a6, $a7, $a4, '0', 0xeb86d391, '21');
  246. $code .= <<EOF;
  247. # add old values of B, C
  248. add.w $a5,$t4,$a5
  249. add.w $a6,$t5,$a6
  250. bltu $a1,$a3,.Lloop # jmp if ptr < end
  251. st.w $a4,$a0,0 # ctx->A = A
  252. st.w $a5,$a0,4 # ctx->B = B
  253. st.w $a6,$a0,8 # ctx->C = C
  254. st.w $a7,$a0,12 # ctx->D = D
  255. .Lend:
  256. jr $ra
  257. .size ossl_md5_block_asm_data_order,.-ossl_md5_block_asm_data_order
  258. EOF
  259. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  260. print $code;
  261. close STDOUT;