sshsh256.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. /*
  2. * SHA-256 algorithm as described at
  3. *
  4. * http://csrc.nist.gov/cryptval/shs.html
  5. */
  6. #include "ssh.h"
  7. #include <assert.h>
  8. /* ----------------------------------------------------------------------
  9. * Core SHA256 algorithm: processes 16-word blocks into a message digest.
  10. */
  /*
   * 32-bit logical primitives used by the software SHA-256 rounds.
   *
   * ror(x,y)  rotate x right by y bits; y must be in 1..31 (y == 0 would
   *           shift by the full word width, which is undefined in C).
   * shr(x,y)  logical right shift.
   * Ch / Maj  the "choose" and "majority" bitwise functions.
   * bigsigmaN / smallsigmaN  the rotate/shift mixes used by the round
   *           function and by the message schedule respectively.
   *
   * Every argument is fully parenthesised, and x is converted to uint32
   * before either shift, so that expression arguments such as `n+1` or
   * signed operands cannot change the meaning of the expansion.
   */
  #define ror(x,y) ( (((uint32)(x)) << (32-(y))) | (((uint32)(x)) >> (y)) )
  #define shr(x,y) ( (((uint32)(x)) >> (y)) )
  #define Ch(x,y,z) ( ((x) & (y)) ^ (~(x) & (z)) )
  #define Maj(x,y,z) ( ((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)) )
  #define bigsigma0(x) ( ror((x),2) ^ ror((x),13) ^ ror((x),22) )
  #define bigsigma1(x) ( ror((x),6) ^ ror((x),11) ^ ror((x),25) )
  #define smallsigma0(x) ( ror((x),7) ^ ror((x),18) ^ shr((x),3) )
  #define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) )
  /*
   * Forward declarations of the two back ends that absorb input bytes into
   * a SHA256_State. SHA256_Init stores the address of one of them in
   * s->sha256, and SHA256_BinarySink_write calls through that pointer.
   */
  19. static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len);
  20. static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len);
  21. #ifndef WINSCP_VS
  22. void SHA256_Core_Init(SHA256_State *s) {
  23. s->h[0] = 0x6a09e667;
  24. s->h[1] = 0xbb67ae85;
  25. s->h[2] = 0x3c6ef372;
  26. s->h[3] = 0xa54ff53a;
  27. s->h[4] = 0x510e527f;
  28. s->h[5] = 0x9b05688c;
  29. s->h[6] = 0x1f83d9ab;
  30. s->h[7] = 0x5be0cd19;
  31. }
  32. #endif // !WINSCP_VS
  33. #ifndef WINSCP_VS
  34. void SHA256_Block(SHA256_State *s, uint32 *block);
  35. #else
  36. void SHA256_Block(SHA256_State *s, uint32 *block) {
  37. uint32 w[80];
  38. uint32 a,b,c,d,e,f,g,h;
  39. static const int k[] = {
  40. 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  41. 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  42. 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  43. 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  44. 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  45. 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  46. 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  47. 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  48. 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  49. 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  50. 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  51. 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  52. 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  53. 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  54. 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  55. 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  56. };
  57. int t;
  58. for (t = 0; t < 16; t++)
  59. w[t] = block[t];
  60. for (t = 16; t < 64; t++)
  61. w[t] = smallsigma1(w[t-2]) + w[t-7] + smallsigma0(w[t-15]) + w[t-16];
  62. a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
  63. e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];
  64. for (t = 0; t < 64; t+=8) {
  65. uint32 t1, t2;
  66. #define ROUND(j,a,b,c,d,e,f,g,h) \
  67. t1 = h + bigsigma1(e) + Ch(e,f,g) + k[j] + w[j]; \
  68. t2 = bigsigma0(a) + Maj(a,b,c); \
  69. d = d + t1; h = t1 + t2;
  70. ROUND(t+0, a,b,c,d,e,f,g,h);
  71. ROUND(t+1, h,a,b,c,d,e,f,g);
  72. ROUND(t+2, g,h,a,b,c,d,e,f);
  73. ROUND(t+3, f,g,h,a,b,c,d,e);
  74. ROUND(t+4, e,f,g,h,a,b,c,d);
  75. ROUND(t+5, d,e,f,g,h,a,b,c);
  76. ROUND(t+6, c,d,e,f,g,h,a,b);
  77. ROUND(t+7, b,c,d,e,f,g,h,a);
  78. }
  79. s->h[0] += a; s->h[1] += b; s->h[2] += c; s->h[3] += d;
  80. s->h[4] += e; s->h[5] += f; s->h[6] += g; s->h[7] += h;
  81. }
  82. #endif // !WINSCP_VS
  83. #ifndef WINSCP_VS
  84. /* ----------------------------------------------------------------------
  85. * Outer SHA256 algorithm: take an arbitrary length byte string,
  86. * convert it into 16-word blocks with the prescribed padding at
  87. * the end, and pass those blocks to the core SHA256 algorithm.
  88. */
  /* Size in bytes of one input block (16 words of 4 bytes each). */
  89. #define BLKSIZE 64
  /* BinarySink write callback; SHA256_Init installs it on the state. */
  90. static void SHA256_BinarySink_write(BinarySink *bs,
  91. const void *p, size_t len);
  92. void SHA256_Init(SHA256_State *s) {
  93. SHA256_Core_Init(s);
  94. s->blkused = 0;
  95. s->lenhi = s->lenlo = 0;
  96. if (supports_sha_ni())
  97. s->sha256 = &SHA256_ni;
  98. else
  99. s->sha256 = &SHA256_sw;
  100. BinarySink_INIT(s, SHA256_BinarySink_write);
  101. }
  102. static void SHA256_BinarySink_write(BinarySink *bs,
  103. const void *p, size_t len)
  104. {
  105. struct SHA256_State *s = BinarySink_DOWNCAST(bs, struct SHA256_State);
  106. unsigned char *q = (unsigned char *)p;
  107. uint32 lenw = len;
  108. assert(len == lenw);
  109. /*
  110. * Update the length field.
  111. */
  112. s->lenlo += lenw;
  113. s->lenhi += (s->lenlo < lenw);
  114. (*(s->sha256))(s, q, len);
  115. }
  116. static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len) {
  117. uint32 wordblock[16];
  118. int i;
  119. if (s->blkused && s->blkused+len < BLKSIZE) {
  120. /*
  121. * Trivial case: just add to the block.
  122. */
  123. memcpy(s->block + s->blkused, q, len);
  124. s->blkused += len;
  125. } else {
  126. /*
  127. * We must complete and process at least one block.
  128. */
  129. while (s->blkused + len >= BLKSIZE) {
  130. memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
  131. q += BLKSIZE - s->blkused;
  132. len -= BLKSIZE - s->blkused;
  133. /* Now process the block. Gather bytes big-endian into words */
  134. for (i = 0; i < 16; i++) {
  135. wordblock[i] =
  136. ( ((uint32)s->block[i*4+0]) << 24 ) |
  137. ( ((uint32)s->block[i*4+1]) << 16 ) |
  138. ( ((uint32)s->block[i*4+2]) << 8 ) |
  139. ( ((uint32)s->block[i*4+3]) << 0 );
  140. }
  141. SHA256_Block(s, wordblock);
  142. s->blkused = 0;
  143. }
  144. memcpy(s->block, q, len);
  145. s->blkused = len;
  146. }
  147. }
  148. void SHA256_Final(SHA256_State *s, unsigned char *digest) {
  149. int i;
  150. int pad;
  151. unsigned char c[64];
  152. uint32 lenhi, lenlo;
  153. if (s->blkused >= 56)
  154. pad = 56 + 64 - s->blkused;
  155. else
  156. pad = 56 - s->blkused;
  157. lenhi = (s->lenhi << 3) | (s->lenlo >> (32-3));
  158. lenlo = (s->lenlo << 3);
  159. memset(c, 0, pad);
  160. c[0] = 0x80;
  161. put_data(s, &c, pad);
  162. put_uint32(s, lenhi);
  163. put_uint32(s, lenlo);
  164. for (i = 0; i < 8; i++) {
  165. digest[i*4+0] = (s->h[i] >> 24) & 0xFF;
  166. digest[i*4+1] = (s->h[i] >> 16) & 0xFF;
  167. digest[i*4+2] = (s->h[i] >> 8) & 0xFF;
  168. digest[i*4+3] = (s->h[i] >> 0) & 0xFF;
  169. }
  170. }
  171. void SHA256_Simple(const void *p, int len, unsigned char *output) {
  172. SHA256_State s;
  173. SHA256_Init(&s);
  174. put_data(&s, p, len);
  175. SHA256_Final(&s, output);
  176. smemclr(&s, sizeof(s));
  177. }
  178. /*
  179. * Thin abstraction for things where hashes are pluggable.
  180. */
  181. struct sha256_hash {
  182. SHA256_State state;
  183. ssh_hash hash;
  184. };
  185. static ssh_hash *sha256_new(const struct ssh_hashalg *alg)
  186. {
  187. struct sha256_hash *h = snew(struct sha256_hash);
  188. SHA256_Init(&h->state);
  189. h->hash.vt = alg;
  190. BinarySink_DELEGATE_INIT(&h->hash, &h->state);
  191. return &h->hash;
  192. }
  193. static ssh_hash *sha256_copy(ssh_hash *hashold)
  194. {
  195. struct sha256_hash *hold, *hnew;
  196. ssh_hash *hashnew = sha256_new(hashold->vt);
  197. hold = FROMFIELD(hashold, struct sha256_hash, hash);
  198. hnew = FROMFIELD(hashnew, struct sha256_hash, hash);
  199. hnew->state = hold->state;
  200. BinarySink_COPIED(&hnew->state);
  201. return hashnew;
  202. }
  203. static void sha256_free(ssh_hash *hash)
  204. {
  205. struct sha256_hash *h = FROMFIELD(hash, struct sha256_hash, hash);
  206. smemclr(h, sizeof(*h));
  207. sfree(h);
  208. }
  209. static void sha256_final(ssh_hash *hash, unsigned char *output)
  210. {
  211. struct sha256_hash *h = FROMFIELD(hash, struct sha256_hash, hash);
  212. SHA256_Final(&h->state, output);
  213. sha256_free(hash);
  214. }
  215. const struct ssh_hashalg ssh_sha256 = {
  216. sha256_new, sha256_copy, sha256_final, sha256_free, 32, "SHA-256"
  217. };
  218. /* ----------------------------------------------------------------------
  219. * The above is the SHA-256 algorithm itself. Now we implement the
  220. * HMAC wrapper on it.
  221. */
  222. struct hmacsha256 {
  223. SHA256_State sha[3];
  224. ssh2_mac mac;
  225. };
  226. static ssh2_mac *hmacsha256_new(
  227. const struct ssh2_macalg *alg, ssh2_cipher *cipher)
  228. {
  229. struct hmacsha256 *ctx = snew(struct hmacsha256);
  230. ctx->mac.vt = alg;
  231. BinarySink_DELEGATE_INIT(&ctx->mac, &ctx->sha[2]);
  232. return &ctx->mac;
  233. }
  234. static void hmacsha256_free(ssh2_mac *mac)
  235. {
  236. struct hmacsha256 *ctx = FROMFIELD(mac, struct hmacsha256, mac);
  237. smemclr(ctx, sizeof(*ctx));
  238. sfree(ctx);
  239. }
  240. static void sha256_key_internal(struct hmacsha256 *ctx,
  241. const unsigned char *key, int len)
  242. {
  243. unsigned char foo[64];
  244. int i;
  245. memset(foo, 0x36, 64);
  246. for (i = 0; i < len && i < 64; i++)
  247. foo[i] ^= key[i];
  248. SHA256_Init(&ctx->sha[0]);
  249. put_data(&ctx->sha[0], foo, 64);
  250. memset(foo, 0x5C, 64);
  251. for (i = 0; i < len && i < 64; i++)
  252. foo[i] ^= key[i];
  253. SHA256_Init(&ctx->sha[1]);
  254. put_data(&ctx->sha[1], foo, 64);
  255. smemclr(foo, 64); /* burn the evidence */
  256. }
  257. static void hmacsha256_key(ssh2_mac *mac, const void *key)
  258. {
  259. struct hmacsha256 *ctx = FROMFIELD(mac, struct hmacsha256, mac);
  260. sha256_key_internal(ctx, key, ctx->mac.vt->keylen);
  261. }
  262. static void hmacsha256_start(ssh2_mac *mac)
  263. {
  264. struct hmacsha256 *ctx = FROMFIELD(mac, struct hmacsha256, mac);
  265. ctx->sha[2] = ctx->sha[0]; /* structure copy */
  266. BinarySink_COPIED(&ctx->sha[2]);
  267. }
  268. static void hmacsha256_genresult(ssh2_mac *mac, unsigned char *hmac)
  269. {
  270. struct hmacsha256 *ctx = FROMFIELD(mac, struct hmacsha256, mac);
  271. SHA256_State s;
  272. unsigned char intermediate[32];
  273. s = ctx->sha[2]; /* structure copy */
  274. BinarySink_COPIED(&s);
  275. SHA256_Final(&s, intermediate);
  276. s = ctx->sha[1]; /* structure copy */
  277. BinarySink_COPIED(&s);
  278. put_data(&s, intermediate, 32);
  279. SHA256_Final(&s, hmac);
  280. }
  281. const struct ssh2_macalg ssh_hmac_sha256 = {
  282. hmacsha256_new, hmacsha256_free, hmacsha256_key,
  283. hmacsha256_start, hmacsha256_genresult,
  284. "hmac-sha2-256", "[email protected]",
  285. 32, 32,
  286. "HMAC-SHA-256"
  287. };
  288. #endif // !WINSCP_VS
  #ifdef TEST
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <assert.h>
  /*
   * Self-test, built only when TEST is defined: hash two fixed strings
   * with SHA256_Simple and compare against the expected digests. Every
   * mismatching byte is reported on stderr, the total is printed on
   * stdout, and the exit status is non-zero if anything mismatched.
   */
  int main(void) {
      unsigned char digest[32];
      size_t i;
      int j, errors;
      struct {
          const char *teststring;
          unsigned char digest[32];
      } tests[] = {
          { "abc", {
              0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
              0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
              0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
              0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad,
          } },
          { "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq", {
              0x24, 0x8d, 0x6a, 0x61, 0xd2, 0x06, 0x38, 0xb8,
              0xe5, 0xc0, 0x26, 0x93, 0x0c, 0x3e, 0x60, 0x39,
              0xa3, 0x3c, 0xe4, 0x59, 0x64, 0xff, 0x21, 0x67,
              0xf6, 0xec, 0xed, 0xd4, 0x19, 0xdb, 0x06, 0xc1,
          } },
      };

      errors = 0;
      for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
          SHA256_Simple(tests[i].teststring,
                        strlen(tests[i].teststring), digest);
          for (j = 0; j < 32; j++) {
              if (digest[j] != tests[i].digest[j]) {
                  fprintf(stderr,
                          "\"%s\" digest byte %d should be 0x%02x, is 0x%02x\n",
                          tests[i].teststring, j, tests[i].digest[j], digest[j]);
                  errors++;
              }
          }
      }

      printf("%d errors\n", errors);
      /* Let scripts and CI see a failure through the exit status. */
      return errors ? EXIT_FAILURE : EXIT_SUCCESS;
  }
  #endif
  330. #ifdef COMPILER_SUPPORTS_SHA_NI
  331. #if defined _MSC_VER && defined _M_AMD64
  332. # include <intrin.h>
  333. #endif
  334. /*
  335. * Set target architecture for Clang and GCC
  336. */
  337. #if !defined(__clang__) && defined(__GNUC__)
  338. # pragma GCC target("sha")
  339. # pragma GCC target("sse4.1")
  340. #endif
  341. #if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
  342. # define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
  343. #else
  344. # define FUNC_ISA
  345. #endif
  346. #include <wmmintrin.h>
  347. #include <smmintrin.h>
  348. #include <immintrin.h>
  349. #if defined(__clang__) || defined(__GNUC__)
  350. #include <shaintrin.h>
  351. #endif
  352. /* SHA256 implementation using new instructions
  353. The code is based on Jeffrey Walton's SHA256 implementation:
  354. https://github.com/noloader/SHA-Intrinsics
  355. */
  /*
   * SHA-NI back end: absorb `len` bytes from q into *s using the x86 SHA
   * extension intrinsics.
   *
   * Buffering is the same as in SHA256_sw: bytes accumulate in s->block,
   * and each time BLKSIZE bytes are available one block is compressed.
   * The difference is that the block is compressed here with
   * _mm_sha256rnds2_epu32 / _mm_sha256msg1_epu32 / _mm_sha256msg2_epu32
   * instead of by calling SHA256_Block.
   *
   * The chaining state is kept in two vector registers for the whole loop
   * and is written back to s->h[] once, after the last full block.
   */
  356. FUNC_ISA
  357. static void SHA256_ni_(SHA256_State * s, const unsigned char *q, int len) {
  358. if (s->blkused && s->blkused+len < BLKSIZE) {
  359. /*
  360. * Trivial case: just add to the block.
  361. */
  362. memcpy(s->block + s->blkused, q, len);
  363. s->blkused += len;
  364. } else {
  365. __m128i STATE0, STATE1;
  366. __m128i MSG, TMP;
  367. __m128i MSG0, MSG1, MSG2, MSG3;
  368. __m128i ABEF_SAVE, CDGH_SAVE;
  /* Byte-shuffle mask that reverses the four bytes inside each 32-bit
   * lane, i.e. turns the big-endian message bytes into native words. */
  369. const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
  370. /* Load initial values */
  /* The lane comments below list the state words from the highest lane
   * to the lowest; A..H stand for s->h[0]..s->h[7]. The round
   * instruction is fed the state as the pairs ABEF and CDGH. */
  371. TMP = _mm_loadu_si128((const __m128i*) &s->h[0]);
  372. STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]);
  373. TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */
  374. STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */
  375. STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */
  376. STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */
  377. /*
  378. * We must complete and process at least one block.
  379. */
  380. while (s->blkused + len >= BLKSIZE) {
  381. memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
  382. q += BLKSIZE - s->blkused;
  383. len -= BLKSIZE - s->blkused;
  384. /* Save current state */
  385. ABEF_SAVE = STATE0;
  386. CDGH_SAVE = STATE1;
  /*
   * Each group below performs four rounds. The 64-bit literals passed
   * to _mm_set_epi64x pack the four round constants for that group
   * (the same values as the k[] table in SHA256_Block). One
   * _mm_sha256rnds2_epu32 call does two rounds using the low two lanes
   * of MSG; the 0x0E shuffle moves the upper two lanes down for the
   * second call.
   */
  387. /* Rounds 0-3 */
  388. MSG = _mm_loadu_si128((const __m128i*) (s->block + 0));
  389. MSG0 = _mm_shuffle_epi8(MSG, MASK);
  390. MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
  391. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  392. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  393. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  394. /* Rounds 4-7 */
  395. MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16));
  396. MSG1 = _mm_shuffle_epi8(MSG1, MASK);
  397. MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
  398. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  399. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  400. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  401. MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
  402. /* Rounds 8-11 */
  403. MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32));
  404. MSG2 = _mm_shuffle_epi8(MSG2, MASK);
  405. MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
  406. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  407. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  408. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  409. MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
  /*
   * From here on MSG0..MSG3 are recycled as a rolling window over the
   * message schedule: in each group, the msg1 / alignr+add / msg2 steps
   * produce the next four schedule words while the rounds consume the
   * current four.
   */
  410. /* Rounds 12-15 */
  411. MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48));
  412. MSG3 = _mm_shuffle_epi8(MSG3, MASK);
  413. MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
  414. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  415. TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
  416. MSG0 = _mm_add_epi32(MSG0, TMP);
  417. MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
  418. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  419. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  420. MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
  421. /* Rounds 16-19 */
  422. MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
  423. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  424. TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
  425. MSG1 = _mm_add_epi32(MSG1, TMP);
  426. MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
  427. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  428. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  429. MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
  430. /* Rounds 20-23 */
  431. MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
  432. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  433. TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
  434. MSG2 = _mm_add_epi32(MSG2, TMP);
  435. MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
  436. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  437. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  438. MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
  439. /* Rounds 24-27 */
  440. MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
  441. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  442. TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
  443. MSG3 = _mm_add_epi32(MSG3, TMP);
  444. MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
  445. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  446. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  447. MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
  448. /* Rounds 28-31 */
  449. MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
  450. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  451. TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
  452. MSG0 = _mm_add_epi32(MSG0, TMP);
  453. MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
  454. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  455. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  456. MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
  457. /* Rounds 32-35 */
  458. MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
  459. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  460. TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
  461. MSG1 = _mm_add_epi32(MSG1, TMP);
  462. MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
  463. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  464. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  465. MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
  466. /* Rounds 36-39 */
  467. MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
  468. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  469. TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
  470. MSG2 = _mm_add_epi32(MSG2, TMP);
  471. MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
  472. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  473. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  474. MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
  475. /* Rounds 40-43 */
  476. MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
  477. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  478. TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
  479. MSG3 = _mm_add_epi32(MSG3, TMP);
  480. MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
  481. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  482. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  483. MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
  484. /* Rounds 44-47 */
  485. MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
  486. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  487. TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
  488. MSG0 = _mm_add_epi32(MSG0, TMP);
  489. MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
  490. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  491. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  492. MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
  493. /* Rounds 48-51 */
  494. MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
  495. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  496. TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
  497. MSG1 = _mm_add_epi32(MSG1, TMP);
  498. MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
  499. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  500. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  501. MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
  /* The remaining groups no longer start new schedule words with msg1:
   * only the words needed up to round 63 are still completed. */
  502. /* Rounds 52-55 */
  503. MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
  504. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  505. TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
  506. MSG2 = _mm_add_epi32(MSG2, TMP);
  507. MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
  508. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  509. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  510. /* Rounds 56-59 */
  511. MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
  512. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  513. TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
  514. MSG3 = _mm_add_epi32(MSG3, TMP);
  515. MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
  516. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  517. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  518. /* Rounds 60-63 */
  519. MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
  520. STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
  521. MSG = _mm_shuffle_epi32(MSG, 0x0E);
  522. STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
  523. /* Combine state */
  524. STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
  525. STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
  526. s->blkused = 0;
  527. }
  /* Undo the ABEF/CDGH arrangement so that the two stores below write
   * the words back in s->h[0..7] order. */
  528. TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */
  529. STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */
  530. STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
  531. STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* HGFE */
  532. /* Save state */
  533. _mm_storeu_si128((__m128i*) &s->h[0], STATE0);
  534. _mm_storeu_si128((__m128i*) &s->h[4], STATE1);
  /* Buffer whatever is left (fewer than BLKSIZE bytes) for next time. */
  535. memcpy(s->block, q, len);
  536. s->blkused = len;
  537. }
  538. }
  539. /*
  540. * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
  541. */
  /* Plain entry point that forwards to the FUNC_ISA-attributed
   * SHA256_ni_; this is the function SHA256_Init takes the address of. */
  542. static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
  543. {
  544. SHA256_ni_(s, q, len);
  545. }
  546. #else /* COMPILER_SUPPORTS_SHA_NI */
  /*
   * Fallback used when COMPILER_SUPPORTS_SHA_NI is not defined. It only
   * asserts and ignores its arguments.
   * NOTE(review): assert() is compiled out under NDEBUG, in which case
   * this function silently does nothing. It is presumably unreachable
   * because supports_sha_ni() returns false in such builds - confirm.
   */
  547. static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
  548. {
  549. assert(0);
  550. }
  551. #endif /* COMPILER_SUPPORTS_SHA_NI */