|
|
@@ -7,259 +7,302 @@
|
|
|
#include "ssh.h"
|
|
|
#include <assert.h>
|
|
|
|
|
|
+/*
|
|
|
+ * Start by deciding whether we can support hardware SHA at all.
|
|
|
+ *
+ * The outcome is the macro HW_SHA256: HW_SHA256_NI when one of the
+ * compiler/target checks below passes (or _FORCE_SHA_NI is defined),
+ * and HW_SHA256_NONE when none does or _FORCE_SOFTWARE_SHA is defined.
+ * _FORCE_SOFTWARE_SHA wins over everything else because it is tested
+ * last.
+ */
|
|
|
+#define HW_SHA256_NONE 0
|
|
|
+#define HW_SHA256_NI 1
|
|
|
+
|
|
|
+#ifdef _FORCE_SHA_NI
|
|
|
+# define HW_SHA256 HW_SHA256_NI
|
|
|
+#elif defined(__clang__)
|
|
|
+# if __has_attribute(target) && __has_include(<wmmintrin.h>) && \
|
|
|
+ (defined(__x86_64__) || defined(__i386))
|
|
|
+# define HW_SHA256 HW_SHA256_NI /* NOTE(review): probes <wmmintrin.h>, yet the NI section also includes <shaintrin.h> - confirm */
|
|
|
+# endif
|
|
|
+#elif defined(__GNUC__)
|
|
|
+# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)) && \
|
|
|
+ (defined(__x86_64__) || defined(__i386))
|
|
|
+# define HW_SHA256 HW_SHA256_NI /* 4.9 is the first GCC with <shaintrin.h> and target("sha"), both used by the NI section */
|
|
|
+# endif
|
|
|
+#elif defined (_MSC_VER)
|
|
|
+# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
|
|
|
+# define HW_SHA256 HW_SHA256_NI
|
|
|
+# endif
|
|
|
+#endif
|
|
|
+
|
|
|
+#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
|
|
|
+# undef HW_SHA256
|
|
|
+# define HW_SHA256 HW_SHA256_NONE
|
|
|
+#endif
|
|
|
+
|
|
|
+/*
|
|
|
+ * The actual query function that asks if hardware acceleration is
|
|
|
+ * available.
|
|
|
+ *
+ * Only declared at this point. A true result makes sha256_select()
+ * use the hardware implementation; the only caller visible here is
+ * sha256_hw_available_cached().
+ */
|
|
|
+static bool sha256_hw_available(void);
|
|
|
+
|
|
|
+/*
|
|
|
+ * The top-level selection function, caching the results of
|
|
|
+ * sha256_hw_available() so it only has to run once.
|
|
|
+ *
+ * The cache is a pair of function-local statics with no locking.
+ * NOTE(review): confirm the first call cannot race with another
+ * thread.
+ */
|
|
|
+static bool sha256_hw_available_cached(void)
|
|
|
+{
|
|
|
+ static bool initialised = false; /* becomes true after the first probe */
|
|
|
+ static bool hw_available; /* meaningful only once initialised is true */
|
|
|
+ if (!initialised) {
|
|
|
+ hw_available = sha256_hw_available();
|
|
|
+ initialised = true;
|
|
|
+ }
|
|
|
+ return hw_available;
|
|
|
+}
|
|
|
+
|
|
|
+static ssh_hash *sha256_select(const ssh_hashalg *alg) /* 'alg' is not consulted */
|
|
|
+{ /* Create a hash from the hardware vtable if the cached probe allows it, else from the software one. */
|
|
|
+ const ssh_hashalg *real_alg =
|
|
|
+ sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw;
|
|
|
+
|
|
|
+ return ssh_hash_new(real_alg); /* the returned object belongs to real_alg, not to the selector */
|
|
|
+}
|
|
|
+
|
|
|
+const ssh_hashalg ssh_sha256 = { /* selector vtable: only its first slot is filled in */
|
|
|
+ sha256_select, NULL, NULL, NULL, /* NOTE(review): presumably new/copy/final/free, matching the order used by ssh_sha256_sw - confirm */
|
|
|
+ 32, 64, "SHA-256", /* NOTE(review): presumably digest bytes, block bytes, display name - confirm against ssh_hashalg */
|
|
|
+};
|
|
|
+
|
|
|
/* ----------------------------------------------------------------------
|
|
|
- * Core SHA256 algorithm: processes 16-word blocks into a message digest.
|
|
|
+ * Definitions likely to be helpful to multiple implementations.
|
|
|
*/
|
|
|
|
|
|
-#define ror(x,y) ( ((x) << (32-y)) | (((uint32_t)(x)) >> (y)) )
|
|
|
-#define shr(x,y) ( (((uint32_t)(x)) >> (y)) )
|
|
|
-#define Ch(x,y,z) ( ((x) & (y)) ^ (~(x) & (z)) )
|
|
|
-#define Maj(x,y,z) ( ((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)) )
|
|
|
-#define bigsigma0(x) ( ror((x),2) ^ ror((x),13) ^ ror((x),22) )
|
|
|
-#define bigsigma1(x) ( ror((x),6) ^ ror((x),11) ^ ror((x),25) )
|
|
|
-#define smallsigma0(x) ( ror((x),7) ^ ror((x),18) ^ shr((x),3) )
|
|
|
-#define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) )
|
|
|
-
|
|
|
-typedef struct SHA256_State {
|
|
|
- uint32_t h[8];
|
|
|
- unsigned char block[64];
|
|
|
- int blkused;
|
|
|
+static const uint32_t sha256_initial_state[] = { /* starting values of the eight state words; sha256_sw_new copies them into core[] */
|
|
|
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
|
|
|
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
|
|
|
+};
|
|
|
+
|
|
|
+static const uint32_t sha256_round_constants[] = { /* one constant per round, indexed by round number in sha256_sw_round */
|
|
|
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
|
|
|
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
|
|
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
|
|
|
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
|
|
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
|
|
|
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
|
|
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
|
|
|
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
|
|
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
|
|
|
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
|
|
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
|
|
|
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
|
|
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
|
|
|
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
|
|
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
|
|
|
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
|
|
|
+};
|
|
|
+
|
|
|
+#define SHA256_ROUNDS 64 /* length of the message schedule and bound of the round loop in sha256_sw_block */
|
|
|
+
|
|
|
+typedef struct sha256_block sha256_block;
|
|
|
+struct sha256_block { /* staging buffer for one 64-byte input block; the 'len' member is the running total of bytes written */
|
|
|
+ uint8_t block[64]; /* bytes accumulated towards the next full block */
|
|
|
+ size_t used; /* number of valid bytes currently held in block[] */
|
|
|
uint64_t len;
|
|
|
- void (*sha256)(struct SHA256_State * s, const unsigned char *p, int len);
|
|
|
- BinarySink_IMPLEMENTATION;
|
|
|
-} SHA256_State;
|
|
|
-
|
|
|
-static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len);
|
|
|
-static void SHA256_ni(SHA256_State *s, const unsigned char *q, int len);
|
|
|
-
|
|
|
-void SHA256_Core_Init(SHA256_State *s) {
|
|
|
- s->h[0] = 0x6a09e667;
|
|
|
- s->h[1] = 0xbb67ae85;
|
|
|
- s->h[2] = 0x3c6ef372;
|
|
|
- s->h[3] = 0xa54ff53a;
|
|
|
- s->h[4] = 0x510e527f;
|
|
|
- s->h[5] = 0x9b05688c;
|
|
|
- s->h[6] = 0x1f83d9ab;
|
|
|
- s->h[7] = 0x5be0cd19;
|
|
|
+};
|
|
|
+
|
|
|
+static inline void sha256_block_setup(sha256_block *blk) /* reset blk to "no data seen yet" */
|
|
|
+{
|
|
|
+ blk->used = 0; /* staging buffer empty */
|
|
|
+ blk->len = 0; /* no bytes counted so far */
|
|
|
}
|
|
|
|
|
|
-void SHA256_Block(SHA256_State *s, uint32_t *block) {
|
|
|
- uint32_t w[80];
|
|
|
- uint32_t a,b,c,d,e,f,g,h;
|
|
|
- static const int k[] = {
|
|
|
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
|
|
|
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
|
|
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
|
|
|
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
|
|
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
|
|
|
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
|
|
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
|
|
|
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
|
|
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
|
|
|
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
|
|
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
|
|
|
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
|
|
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
|
|
|
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
|
|
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
|
|
|
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
|
|
|
- };
|
|
|
-
|
|
|
- int t;
|
|
|
-
|
|
|
- for (t = 0; t < 16; t++)
|
|
|
- w[t] = block[t];
|
|
|
-
|
|
|
- for (t = 16; t < 64; t++)
|
|
|
- w[t] = smallsigma1(w[t-2]) + w[t-7] + smallsigma0(w[t-15]) + w[t-16];
|
|
|
-
|
|
|
- a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
|
|
|
- e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];
|
|
|
-
|
|
|
- for (t = 0; t < 64; t+=8) {
|
|
|
- uint32_t t1, t2;
|
|
|
-
|
|
|
-#define ROUND(j,a,b,c,d,e,f,g,h) \
|
|
|
- t1 = h + bigsigma1(e) + Ch(e,f,g) + k[j] + w[j]; \
|
|
|
- t2 = bigsigma0(a) + Maj(a,b,c); \
|
|
|
- d = d + t1; h = t1 + t2;
|
|
|
-
|
|
|
- ROUND(t+0, a,b,c,d,e,f,g,h);
|
|
|
- ROUND(t+1, h,a,b,c,d,e,f,g);
|
|
|
- ROUND(t+2, g,h,a,b,c,d,e,f);
|
|
|
- ROUND(t+3, f,g,h,a,b,c,d,e);
|
|
|
- ROUND(t+4, e,f,g,h,a,b,c,d);
|
|
|
- ROUND(t+5, d,e,f,g,h,a,b,c);
|
|
|
- ROUND(t+6, c,d,e,f,g,h,a,b);
|
|
|
- ROUND(t+7, b,c,d,e,f,g,h,a);
|
|
|
+static inline bool sha256_block_write(
|
|
|
+ sha256_block *blk, const void **vdata, size_t *len) /* in/out: *vdata advances and *len shrinks by the bytes consumed */
|
|
|
+{ /* Copy as much input as fits; return true when block[] has just become full and should be hashed. */
|
|
|
+ size_t blkleft = sizeof(blk->block) - blk->used; /* free space left in the buffer */
|
|
|
+ size_t chunk = *len < blkleft ? *len : blkleft; /* bytes taken on this call */
|
|
|
+
|
|
|
+ const uint8_t *p = *vdata;
|
|
|
+ memcpy(blk->block + blk->used, p, chunk);
|
|
|
+ *vdata = p + chunk;
|
|
|
+ *len -= chunk;
|
|
|
+ blk->used += chunk;
|
|
|
+ blk->len += chunk; /* running total, turned into the bit length by sha256_block_pad */
|
|
|
+
|
|
|
+ if (blk->used == sizeof(blk->block)) {
|
|
|
+ blk->used = 0; /* the caller hashes block[] next; the following write starts at offset 0 */
|
|
|
+ return true;
|
|
|
}
|
|
|
|
|
|
|
- s->h[0] += a; s->h[1] += b; s->h[2] += c; s->h[3] += d;
|
|
|
- s->h[4] += e; s->h[5] += f; s->h[6] += g; s->h[7] += h;
|
|
|
+ return false; /* block still partial, which means all of the input was consumed */
|
|
|
+}
|
|
|
+
|
|
|
+static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) /* emits the final padding and length field through 'bs' */
|
|
|
+{
|
|
|
+ uint64_t final_len = blk->len << 3; /* length in bits, read first: in the software path, bytes written via bs come back through sha256_block_write and bump blk->len */
|
|
|
+ size_t pad = 1 + (63 & (55 - blk->used)); /* the 0x80 byte plus enough zeros that used + pad is 56 modulo 64 */
|
|
|
+
|
|
|
+ put_byte(bs, 0x80);
|
|
|
+ for (size_t i = 1; i < pad; i++)
|
|
|
+ put_byte(bs, 0);
|
|
|
+ put_uint64(bs, final_len); /* the 8-byte length field fills the block up to 64 bytes */
|
|
|
+
|
|
|
+ assert(blk->used == 0 && "Should have exactly hit a block boundary");
|
|
|
}
|
|
|
|
|
|
/* ----------------------------------------------------------------------
|
|
|
- * Outer SHA256 algorithm: take an arbitrary length byte string,
|
|
|
- * convert it into 16-word blocks with the prescribed padding at
|
|
|
- * the end, and pass those blocks to the core SHA256 algorithm.
|
|
|
+ * Software implementation of SHA-256.
|
|
|
*/
|
|
|
|
|
|
-#define BLKSIZE 64
|
|
|
+static inline uint32_t ror(uint32_t x, unsigned y) /* rotate x right by y bits */
|
|
|
+{
|
|
|
+ return (x << (31 & -y)) | (x >> (31 & y)); /* masking both counts to 0..31 keeps every shift in range, including y == 0 */
|
|
|
+}
|
|
|
|
|
|
-static void SHA256_BinarySink_write(BinarySink *bs,
|
|
|
- const void *p, size_t len);
|
|
|
+static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) /* bitwise select */
|
|
|
+{
|
|
|
+ return if0 ^ (ctrl & (if1 ^ if0)); /* each result bit is taken from if1 where ctrl is 1, else from if0 */
|
|
|
+}
|
|
|
|
|
|
-void SHA256_Init(SHA256_State *s) {
|
|
|
- SHA256_Core_Init(s);
|
|
|
- s->blkused = 0;
|
|
|
- s->len = 0;
|
|
|
- if (supports_sha_ni())
|
|
|
- s->sha256 = &SHA256_ni;
|
|
|
- else
|
|
|
- s->sha256 = &SHA256_sw;
|
|
|
- BinarySink_INIT(s, SHA256_BinarySink_write);
|
|
|
+static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) /* bitwise majority vote */
|
|
|
+{
|
|
|
+ return (x & y) | (z & (x | y)); /* a result bit is 1 when at least two of the three inputs have it set */
|
|
|
}
|
|
|
|
|
|
-static void SHA256_BinarySink_write(BinarySink *bs,
|
|
|
- const void *p, size_t len)
|
|
|
+static inline uint32_t Sigma_0(uint32_t x) /* mixing function applied to 'a' in sha256_sw_round */
|
|
|
{
|
|
|
- struct SHA256_State *s = BinarySink_DOWNCAST(bs, struct SHA256_State);
|
|
|
- unsigned char *q = (unsigned char *)p;
|
|
|
+ return ror(x,2) ^ ror(x,13) ^ ror(x,22); /* XOR of three right-rotations of x */
|
|
|
+}
|
|
|
|
|
|
- /*
|
|
|
- * Update the length field.
|
|
|
- */
|
|
|
- s->len += len;
|
|
|
+static inline uint32_t Sigma_1(uint32_t x) /* mixing function applied to 'e' in sha256_sw_round */
|
|
|
+{
|
|
|
+ return ror(x,6) ^ ror(x,11) ^ ror(x,25); /* XOR of three right-rotations of x */
|
|
|
+}
|
|
|
|
|
|
- (*(s->sha256))(s, q, len);
|
|
|
+static inline uint32_t sigma_0(uint32_t x) /* used on w[t-15] when extending the message schedule */
|
|
|
+{
|
|
|
+ return ror(x,7) ^ ror(x,18) ^ (x >> 3); /* two rotations and one plain right shift, XORed together */
|
|
|
}
|
|
|
|
|
|
-static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len) {
|
|
|
- uint32_t wordblock[16];
|
|
|
- int i;
|
|
|
-
|
|
|
- if (s->blkused && s->blkused+len < BLKSIZE) {
|
|
|
- /*
|
|
|
- * Trivial case: just add to the block.
|
|
|
- */
|
|
|
- memcpy(s->block + s->blkused, q, len);
|
|
|
- s->blkused += len;
|
|
|
- } else {
|
|
|
- /*
|
|
|
- * We must complete and process at least one block.
|
|
|
- */
|
|
|
- while (s->blkused + len >= BLKSIZE) {
|
|
|
- memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
|
|
|
- q += BLKSIZE - s->blkused;
|
|
|
- len -= BLKSIZE - s->blkused;
|
|
|
- /* Now process the block. Gather bytes big-endian into words */
|
|
|
- for (i = 0; i < 16; i++) {
|
|
|
- wordblock[i] =
|
|
|
- ( ((uint32_t)s->block[i*4+0]) << 24 ) |
|
|
|
- ( ((uint32_t)s->block[i*4+1]) << 16 ) |
|
|
|
- ( ((uint32_t)s->block[i*4+2]) << 8 ) |
|
|
|
- ( ((uint32_t)s->block[i*4+3]) << 0 );
|
|
|
- }
|
|
|
- SHA256_Block(s, wordblock);
|
|
|
- s->blkused = 0;
|
|
|
- }
|
|
|
- memcpy(s->block, q, len);
|
|
|
- s->blkused = len;
|
|
|
- }
|
|
|
+static inline uint32_t sigma_1(uint32_t x) /* used on w[t-2] when extending the message schedule */
|
|
|
+{
|
|
|
+ return ror(x,17) ^ ror(x,19) ^ (x >> 10); /* two rotations and one plain right shift, XORed together */
|
|
|
}
|
|
|
|
|
|
-void SHA256_Final(SHA256_State *s, unsigned char *digest) {
|
|
|
- int i;
|
|
|
- int pad;
|
|
|
- unsigned char c[64];
|
|
|
- uint64_t len;
|
|
|
+static inline void sha256_sw_round(
|
|
|
+ unsigned round_index, const uint32_t *schedule,
|
|
|
+ uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
|
|
|
+ uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
|
|
|
+{ /* One round of the compression function, operating in place on the caller's eight working words. */
|
|
|
+ uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
|
|
|
+ sha256_round_constants[round_index] + schedule[round_index]; /* round_index selects both the constant and the schedule word */
|
|
|
+
|
|
|
+ uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
|
|
|
|
|
|
|
- if (s->blkused >= 56)
|
|
|
- pad = 56 + 64 - s->blkused;
|
|
|
- else
|
|
|
- pad = 56 - s->blkused;
|
|
|
+ *d += t1; /* only *d and *h are written; the other six words are read-only here */
|
|
|
+ *h = t1 + t2;
|
|
|
+}
|
|
|
|
|
|
- len = (s->len << 3);
|
|
|
+static void sha256_sw_block(uint32_t *core, const uint8_t *block) /* fold one 64-byte block into the 8-word state in core[] */
|
|
|
+{
|
|
|
+ uint32_t w[SHA256_ROUNDS]; /* message schedule, one word per round */
|
|
|
+ uint32_t a,b,c,d,e,f,g,h;
|
|
|
|
|
|
|
- memset(c, 0, pad);
|
|
|
- c[0] = 0x80;
|
|
|
- put_data(s, &c, pad);
|
|
|
+ for (size_t t = 0; t < 16; t++) /* the first 16 schedule words are read straight from the input block */
|
|
|
+ w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
|
|
|
|
|
|
|
- put_uint64(s, len);
|
|
|
+ for (size_t t = 16; t < SHA256_ROUNDS; t++) /* the remaining words are derived from earlier ones */
|
|
|
+ w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];
|
|
|
|
|
|
|
- for (i = 0; i < 8; i++) {
|
|
|
- digest[i*4+0] = (s->h[i] >> 24) & 0xFF;
|
|
|
- digest[i*4+1] = (s->h[i] >> 16) & 0xFF;
|
|
|
- digest[i*4+2] = (s->h[i] >> 8) & 0xFF;
|
|
|
- digest[i*4+3] = (s->h[i] >> 0) & 0xFF;
|
|
|
+ a = core[0]; b = core[1]; c = core[2]; d = core[3];
|
|
|
+ e = core[4]; f = core[5]; g = core[6]; h = core[7];
|
|
|
+
|
|
|
+ for (size_t t = 0; t < SHA256_ROUNDS; t += 8) { /* 8 rounds per pass: shifting the argument list by one each call stands in for moving values between a..h */
|
|
|
+ sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
|
|
|
+ sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
|
|
|
+ sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
|
|
|
+ sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
|
|
|
+ sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
|
|
|
+ sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
|
|
|
+ sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
|
|
|
+ sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
|
|
|
}
|
|
|
-}
|
|
|
|
|
|
|
-void SHA256_Simple(const void *p, int len, unsigned char *output) {
|
|
|
- SHA256_State s;
|
|
|
+ core[0] += a; core[1] += b; core[2] += c; core[3] += d; /* add the round output back into the running state */
|
|
|
+ core[4] += e; core[5] += f; core[6] += g; core[7] += h;
|
|
|
|
|
|
|
- SHA256_Init(&s);
|
|
|
- put_data(&s, p, len);
|
|
|
- SHA256_Final(&s, output);
|
|
|
- smemclr(&s, sizeof(s));
|
|
|
+ smemclr(w, sizeof(w)); /* clear the schedule, which was derived from the input data */
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * Thin abstraction for things where hashes are pluggable.
|
|
|
- */
|
|
|
-
|
|
|
-struct sha256_hash {
|
|
|
- SHA256_State state;
|
|
|
+typedef struct sha256_sw { /* state of one software SHA-256 hash in progress */
|
|
|
+ uint32_t core[8]; /* the eight running state words */
|
|
|
+ sha256_block blk; /* partial-block buffer and byte counter */
|
|
|
+ BinarySink_IMPLEMENTATION; /* NOTE(review): presumably what lets BinarySink_INIT/UPCAST/DOWNCAST treat this struct as a sink - confirm */
|
|
|
ssh_hash hash;
|
|
|
-};
|
|
|
+} sha256_sw;
|
|
|
+
|
|
|
+static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); /* declared early so sha256_sw_new can register it */
|
|
|
|
|
|
-static ssh_hash *sha256_new(const ssh_hashalg *alg)
|
|
|
+static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) /* allocate and initialise a software hash object */
|
|
|
{
|
|
|
- struct sha256_hash *h = snew(struct sha256_hash);
|
|
|
- SHA256_Init(&h->state);
|
|
|
- h->hash.vt = alg;
|
|
|
- BinarySink_DELEGATE_INIT(&h->hash, &h->state);
|
|
|
- return &h->hash;
|
|
|
+ sha256_sw *s = snew(sha256_sw);
|
|
|
+
|
|
|
+ memcpy(s->core, sha256_initial_state, sizeof(s->core)); /* start from the initial state words */
|
|
|
+
|
|
|
+ sha256_block_setup(&s->blk);
|
|
|
+
|
|
|
+ s->hash.vt = alg; /* record the vtable this object was created through */
|
|
|
+ BinarySink_INIT(s, sha256_sw_write); /* registers sha256_sw_write as the write function for s */
|
|
|
+ BinarySink_DELEGATE_INIT(&s->hash, s); /* NOTE(review): presumably routes writes on the ssh_hash to s - confirm */
|
|
|
+ return &s->hash; /* callers hold the embedded ssh_hash, not the container */
|
|
|
}
|
|
|
|
|
|
-static ssh_hash *sha256_copy(ssh_hash *hashold)
|
|
|
+static ssh_hash *sha256_sw_copy(ssh_hash *hash) /* duplicate a hash in progress; the original is left untouched */
|
|
|
{
|
|
|
- struct sha256_hash *hold, *hnew;
|
|
|
- ssh_hash *hashnew = sha256_new(hashold->vt);
|
|
|
+ sha256_sw *s = container_of(hash, sha256_sw, hash);
|
|
|
+ sha256_sw *copy = snew(sha256_sw);
|
|
|
+
|
|
|
+ memcpy(copy, s, sizeof(*copy)); /* bitwise clone, including core[] and the partial block */
|
|
|
+ BinarySink_COPIED(copy); /* NOTE(review): presumably re-points the cloned sink at the new object - confirm */
|
|
|
+ BinarySink_DELEGATE_INIT(&copy->hash, copy); /* same wiring as in sha256_sw_new, applied to the clone */
|
|
|
|
|
|
|
- hold = container_of(hashold, struct sha256_hash, hash);
|
|
|
- hnew = container_of(hashnew, struct sha256_hash, hash);
|
|
|
+ return &copy->hash; /* address of the clone's embedded ssh_hash */
|
|
|
+}
|
|
|
|
|
|
- hnew->state = hold->state;
|
|
|
- BinarySink_COPIED(&hnew->state);
|
|
|
+static void sha256_sw_free(ssh_hash *hash) /* clear and release a software hash object */
|
|
|
+{
|
|
|
+ sha256_sw *s = container_of(hash, sha256_sw, hash); /* recover the enclosing object from its embedded ssh_hash */
|
|
|
|
|
|
|
- return hashnew;
|
|
|
+ smemclr(s, sizeof(*s)); /* clear the whole object before it is freed */
|
|
|
+ sfree(s);
|
|
|
}
|
|
|
|
|
|
-static void sha256_free(ssh_hash *hash)
|
|
|
+static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len) /* sink callback: absorb len bytes starting at vp */
|
|
|
{
|
|
|
- struct sha256_hash *h = container_of(hash, struct sha256_hash, hash);
|
|
|
+ sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);
|
|
|
|
|
|
|
- smemclr(h, sizeof(*h));
|
|
|
- sfree(h);
|
|
|
+ while (len > 0) /* sha256_block_write advances vp and reduces len on every pass */
|
|
|
+ if (sha256_block_write(&s->blk, &vp, &len))
|
|
|
+ sha256_sw_block(s->core, s->blk.block); /* a full block is ready: hash it */
|
|
|
}
|
|
|
|
|
|
-static void sha256_final(ssh_hash *hash, unsigned char *output)
|
|
|
+static void sha256_sw_final(ssh_hash *hash, uint8_t *digest) /* write the 32-byte digest to 'digest', then dispose of the hash object */
|
|
|
{
|
|
|
- struct sha256_hash *h = container_of(hash, struct sha256_hash, hash);
|
|
|
- SHA256_Final(&h->state, output);
|
|
|
- sha256_free(hash);
|
|
|
+ sha256_sw *s = container_of(hash, sha256_sw, hash);
|
|
|
+
|
|
|
+ sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); /* padding is fed in through the object's own sink */
|
|
|
+ for (size_t i = 0; i < 8; i++)
|
|
|
+ PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]); /* 4 output bytes per state word */
|
|
|
+ sha256_sw_free(hash); /* finalising consumes the object; 'hash' must not be used afterwards */
|
|
|
}
|
|
|
|
|
|
-const ssh_hashalg ssh_sha256 = {
|
|
|
- sha256_new, sha256_copy, sha256_final, sha256_free, 32, 64, "SHA-256"
|
|
|
+const ssh_hashalg ssh_sha256_sw = { /* vtable for the software implementation; chosen by sha256_select when hardware support is absent */
|
|
|
+ sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
|
|
|
+ 32, 64, "SHA-256", /* NOTE(review): presumably digest bytes, block bytes, display name - confirm against ssh_hashalg */
|
|
|
};
|
|
|
|
|
|
-#ifdef COMPILER_SUPPORTS_SHA_NI
|
|
|
+/* ----------------------------------------------------------------------
|
|
|
+ * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
|
|
|
+ */
|
|
|
|
|
|
-#if defined _MSC_VER && defined _M_AMD64
|
|
|
-# include <intrin.h>
|
|
|
-#endif
|
|
|
+#if HW_SHA256 == HW_SHA256_NI
|
|
|
|
|
|
/*
|
|
|
* Set target architecture for Clang and GCC
|
|
|
@@ -269,7 +312,7 @@ const ssh_hashalg ssh_sha256 = {
|
|
|
# pragma GCC target("sse4.1")
|
|
|
#endif
|
|
|
|
|
|
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
|
|
|
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
|
|
|
# define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
|
|
|
#else
|
|
|
# define FUNC_ISA
|
|
|
@@ -278,236 +321,369 @@ const ssh_hashalg ssh_sha256 = {
|
|
|
#include <wmmintrin.h>
|
|
|
#include <smmintrin.h>
|
|
|
#include <immintrin.h>
|
|
|
-
|
|
|
#if defined(__clang__) || defined(__GNUC__)
|
|
|
#include <shaintrin.h>
|
|
|
#endif
|
|
|
|
|
|
+#if defined(__clang__) || defined(__GNUC__) /* these compilers take __cpuid/__cpuid_count from <cpuid.h>, with one argument per output register */
|
|
|
+#include <cpuid.h>
|
|
|
+#define GET_CPU_ID_0(out) \
|
|
|
+ __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
|
|
|
+#define GET_CPU_ID_7(out) \
|
|
|
+ __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
|
|
|
+#else /* other compilers: __cpuid/__cpuidex forms that take the output array first */
|
|
|
+#define GET_CPU_ID_0(out) __cpuid(out, 0) /* query leaf 0 */
|
|
|
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0) /* query leaf 7, subleaf 0 */
|
|
|
+#endif
|
|
|
+
|
|
|
+static bool sha256_hw_available(void) /* runtime CPUID probe for the SHA instructions */
|
|
|
+{
|
|
|
+ unsigned int CPUInfo[4]; /* receives the four CPUID output registers */
|
|
|
+ GET_CPU_ID_0(CPUInfo);
|
|
|
+ if (CPUInfo[0] < 7) /* leaf 0 reports the highest supported leaf; leaf 7 is needed below */
|
|
|
+ return false;
|
|
|
+
|
|
|
+ GET_CPU_ID_7(CPUInfo);
|
|
|
+ return CPUInfo[1] & (1 << 29); /* Check SHA: bit 29 of the second output word of leaf 7 */
|
|
|
+}
|
|
|
+
|
|
|
/* SHA256 implementation using new instructions
|
|
|
The code is based on Jeffrey Walton's SHA256 implementation:
|
|
|
https://github.com/noloader/SHA-Intrinsics
|
|
|
*/
|
|
|
FUNC_ISA
|
|
|
-static void SHA256_ni_(SHA256_State * s, const unsigned char *q, int len) {
|
|
|
- if (s->blkused && s->blkused+len < BLKSIZE) {
|
|
|
- /*
|
|
|
- * Trivial case: just add to the block.
|
|
|
- */
|
|
|
- memcpy(s->block + s->blkused, q, len);
|
|
|
- s->blkused += len;
|
|
|
- } else {
|
|
|
- __m128i STATE0, STATE1;
|
|
|
- __m128i MSG, TMP;
|
|
|
- __m128i MSG0, MSG1, MSG2, MSG3;
|
|
|
- __m128i ABEF_SAVE, CDGH_SAVE;
|
|
|
- const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
|
|
-
|
|
|
- /* Load initial values */
|
|
|
- TMP = _mm_loadu_si128((const __m128i*) &s->h[0]);
|
|
|
- STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]);
|
|
|
-
|
|
|
- TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */
|
|
|
- STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */
|
|
|
- STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */
|
|
|
- STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */
|
|
|
- /*
|
|
|
- * We must complete and process at least one block.
|
|
|
- */
|
|
|
- while (s->blkused + len >= BLKSIZE) {
|
|
|
- memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
|
|
|
- q += BLKSIZE - s->blkused;
|
|
|
- len -= BLKSIZE - s->blkused;
|
|
|
-
|
|
|
- /* Save current state */
|
|
|
- ABEF_SAVE = STATE0;
|
|
|
- CDGH_SAVE = STATE1;
|
|
|
-
|
|
|
- /* Rounds 0-3 */
|
|
|
- MSG = _mm_loadu_si128((const __m128i*) (s->block + 0));
|
|
|
- MSG0 = _mm_shuffle_epi8(MSG, MASK);
|
|
|
- MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
-
|
|
|
- /* Rounds 4-7 */
|
|
|
- MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16));
|
|
|
- MSG1 = _mm_shuffle_epi8(MSG1, MASK);
|
|
|
- MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
|
|
-
|
|
|
- /* Rounds 8-11 */
|
|
|
- MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32));
|
|
|
- MSG2 = _mm_shuffle_epi8(MSG2, MASK);
|
|
|
- MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
|
|
-
|
|
|
- /* Rounds 12-15 */
|
|
|
- MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48));
|
|
|
- MSG3 = _mm_shuffle_epi8(MSG3, MASK);
|
|
|
- MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
|
|
- MSG0 = _mm_add_epi32(MSG0, TMP);
|
|
|
- MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
|
|
-
|
|
|
- /* Rounds 16-19 */
|
|
|
- MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
|
|
- MSG1 = _mm_add_epi32(MSG1, TMP);
|
|
|
- MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
|
|
-
|
|
|
- /* Rounds 20-23 */
|
|
|
- MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
|
|
- MSG2 = _mm_add_epi32(MSG2, TMP);
|
|
|
- MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
|
|
-
|
|
|
- /* Rounds 24-27 */
|
|
|
- MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
|
|
- MSG3 = _mm_add_epi32(MSG3, TMP);
|
|
|
- MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
|
|
-
|
|
|
- /* Rounds 28-31 */
|
|
|
- MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
|
|
- MSG0 = _mm_add_epi32(MSG0, TMP);
|
|
|
- MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
|
|
-
|
|
|
- /* Rounds 32-35 */
|
|
|
- MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
|
|
- MSG1 = _mm_add_epi32(MSG1, TMP);
|
|
|
- MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
|
|
-
|
|
|
- /* Rounds 36-39 */
|
|
|
- MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
|
|
- MSG2 = _mm_add_epi32(MSG2, TMP);
|
|
|
- MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
|
|
-
|
|
|
- /* Rounds 40-43 */
|
|
|
- MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
|
|
- MSG3 = _mm_add_epi32(MSG3, TMP);
|
|
|
- MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
|
|
-
|
|
|
- /* Rounds 44-47 */
|
|
|
- MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
|
|
- MSG0 = _mm_add_epi32(MSG0, TMP);
|
|
|
- MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
|
|
-
|
|
|
- /* Rounds 48-51 */
|
|
|
- MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
|
|
- MSG1 = _mm_add_epi32(MSG1, TMP);
|
|
|
- MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
- MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
|
|
-
|
|
|
- /* Rounds 52-55 */
|
|
|
- MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
|
|
- MSG2 = _mm_add_epi32(MSG2, TMP);
|
|
|
- MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
-
|
|
|
- /* Rounds 56-59 */
|
|
|
- MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
|
|
- MSG3 = _mm_add_epi32(MSG3, TMP);
|
|
|
- MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
-
|
|
|
- /* Rounds 60-63 */
|
|
|
- MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
|
|
- STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
- MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
- STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
-
|
|
|
- /* Combine state */
|
|
|
- STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
|
|
|
- STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
|
|
|
-
|
|
|
- s->blkused = 0;
|
|
|
- }
|
|
|
-
|
|
|
- TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */
|
|
|
- STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */
|
|
|
- STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
|
|
|
- STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */
|
|
|
-
|
|
|
- /* Save state */
|
|
|
- _mm_storeu_si128((__m128i*) &s->h[0], STATE0);
|
|
|
- _mm_storeu_si128((__m128i*) &s->h[4], STATE1);
|
|
|
-
|
|
|
- memcpy(s->block, q, len);
|
|
|
- s->blkused = len;
|
|
|
- }
|
|
|
+static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
|
|
|
+{
|
|
|
+ __m128i STATE0, STATE1;
|
|
|
+ __m128i MSG, TMP;
|
|
|
+ __m128i MSG0, MSG1, MSG2, MSG3;
|
|
|
+ const __m128i *block = (const __m128i *)p;
|
|
|
+ const __m128i MASK = _mm_set_epi64x(
|
|
|
+ 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
|
|
|
+
|
|
|
+ /* Load initial values */
|
|
|
+ STATE0 = core[0];
|
|
|
+ STATE1 = core[1];
|
|
|
+
|
|
|
+ /* Rounds 0-3 */
|
|
|
+ MSG = _mm_loadu_si128(block);
|
|
|
+ MSG0 = _mm_shuffle_epi8(MSG, MASK);
|
|
|
+ MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
|
|
|
+ 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+
|
|
|
+ /* Rounds 4-7 */
|
|
|
+ MSG1 = _mm_loadu_si128(block + 1);
|
|
|
+ MSG1 = _mm_shuffle_epi8(MSG1, MASK);
|
|
|
+ MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
|
|
|
+ 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
|
|
+
|
|
|
+ /* Rounds 8-11 */
|
|
|
+ MSG2 = _mm_loadu_si128(block + 2);
|
|
|
+ MSG2 = _mm_shuffle_epi8(MSG2, MASK);
|
|
|
+ MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
|
|
|
+ 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
|
|
+
|
|
|
+ /* Rounds 12-15 */
|
|
|
+ MSG3 = _mm_loadu_si128(block + 3);
|
|
|
+ MSG3 = _mm_shuffle_epi8(MSG3, MASK);
|
|
|
+ MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
|
|
|
+ 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
|
|
+ MSG0 = _mm_add_epi32(MSG0, TMP);
|
|
|
+ MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
|
|
+
|
|
|
+ /* Rounds 16-19 */
|
|
|
+ MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
|
|
|
+ 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
|
|
+ MSG1 = _mm_add_epi32(MSG1, TMP);
|
|
|
+ MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
|
|
+
|
|
|
+ /* Rounds 20-23 */
|
|
|
+ MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
|
|
|
+ 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
|
|
+ MSG2 = _mm_add_epi32(MSG2, TMP);
|
|
|
+ MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
|
|
+
|
|
|
+ /* Rounds 24-27 */
|
|
|
+ MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
|
|
|
+ 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
|
|
+ MSG3 = _mm_add_epi32(MSG3, TMP);
|
|
|
+ MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
|
|
+
|
|
|
+ /* Rounds 28-31 */
|
|
|
+ MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
|
|
|
+ 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
|
|
+ MSG0 = _mm_add_epi32(MSG0, TMP);
|
|
|
+ MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
|
|
+
|
|
|
+ /* Rounds 32-35 */
|
|
|
+ MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
|
|
|
+ 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
|
|
+ MSG1 = _mm_add_epi32(MSG1, TMP);
|
|
|
+ MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
|
|
+
|
|
|
+ /* Rounds 36-39 */
|
|
|
+ MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
|
|
|
+ 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
|
|
+ MSG2 = _mm_add_epi32(MSG2, TMP);
|
|
|
+ MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
|
|
|
+
|
|
|
+ /* Rounds 40-43 */
|
|
|
+ MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
|
|
|
+ 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
|
|
+ MSG3 = _mm_add_epi32(MSG3, TMP);
|
|
|
+ MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
|
|
|
+
|
|
|
+ /* Rounds 44-47 */
|
|
|
+ MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
|
|
|
+ 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
|
|
|
+ MSG0 = _mm_add_epi32(MSG0, TMP);
|
|
|
+ MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
|
|
|
+
|
|
|
+ /* Rounds 48-51 */
|
|
|
+ MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
|
|
|
+ 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
|
|
|
+ MSG1 = _mm_add_epi32(MSG1, TMP);
|
|
|
+ MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+ MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
|
|
|
+
|
|
|
+ /* Rounds 52-55 */
|
|
|
+ MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
|
|
|
+ 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
|
|
|
+ MSG2 = _mm_add_epi32(MSG2, TMP);
|
|
|
+ MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+
|
|
|
+ /* Rounds 56-59 */
|
|
|
+ MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
|
|
|
+ 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
|
|
|
+ MSG3 = _mm_add_epi32(MSG3, TMP);
|
|
|
+ MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+
|
|
|
+ /* Rounds 60-63 */
|
|
|
+ MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
|
|
|
+ 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
|
|
|
+ STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
|
|
|
+ MSG = _mm_shuffle_epi32(MSG, 0x0E);
|
|
|
+ STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
|
|
|
+
|
|
|
+ /* Combine state */
|
|
|
+ core[0] = _mm_add_epi32(STATE0, core[0]);
|
|
|
+ core[1] = _mm_add_epi32(STATE1, core[1]);
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
|
|
|
- */
|
|
|
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
|
|
|
+typedef struct sha256_ni {
|
|
|
+ /*
|
|
|
+ * These two vectors store the 8 words of the SHA-256 state, but
|
|
|
+ * not in the same order they appear in the spec: the first vector
|
|
|
+ * holds A,B,E,F and the second vector C,D,G,H.
|
|
|
+ */
|
|
|
+ __m128i core[2];
|
|
|
+ sha256_block blk; /* input buffer driven by sha256_block_write / sha256_block_pad */
|
|
|
+ void *pointer_to_free; /* original smalloc result; this, not the struct address, goes to sfree */
|
|
|
+ BinarySink_IMPLEMENTATION; /* initialised with sha256_ni_write as the write function */
|
|
|
+ ssh_hash hash; /* embedded handle; container_of() maps it back to the sha256_ni */
|
|
|
+} sha256_ni;
|
|
|
+
|
|
|
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len); /* forward declaration: sha256_ni_new refers to it before the definition */
|
|
|
+
|
|
|
+static sha256_ni *sha256_ni_alloc(void) /* returns a 16-byte-aligned context; only pointer_to_free is set here */
|
|
|
{
|
|
|
- SHA256_ni_(s, q, len);
|
|
|
+ /*
|
|
|
+ * The __m128i variables in the context structure need to be
|
|
|
+ * 16-byte aligned, but not all malloc implementations that this
|
|
|
+ * code has to work with will guarantee to return a 16-byte
|
|
|
+ * aligned pointer. So we over-allocate, manually realign the
|
|
|
+ * pointer ourselves, and store the original one inside the
|
|
|
+ * context so we know how to free it later.
|
|
|
+ */
|
|
|
+ void *allocation = smalloc(sizeof(sha256_ni) + 15); /* 15 spare bytes leave room to round the address up */
|
|
|
+ uintptr_t alloc_address = (uintptr_t)allocation;
|
|
|
+ uintptr_t aligned_address = (alloc_address + 15) & ~15; /* round up to the next multiple of 16 */
|
|
|
+ sha256_ni *s = (sha256_ni *)aligned_address;
|
|
|
+ s->pointer_to_free = allocation; /* remember the unaligned pointer for sha256_ni_free */
|
|
|
+ return s;
|
|
|
}
|
|
|
|
|
|
-#else /* COMPILER_SUPPORTS_AES_NI */
|
|
|
+FUNC_ISA static ssh_hash *sha256_ni_new(const ssh_hashalg *alg) /* creates a fresh SHA-NI hash object, or returns NULL if hardware support is absent */
|
|
|
+{
|
|
|
+ if (!sha256_hw_available_cached())
|
|
|
+ return NULL; /* no hardware support: nothing is allocated */
|
|
|
+
|
|
|
+ sha256_ni *s = sha256_ni_alloc();
|
|
|
+
|
|
|
+ /* Initialise the core vectors in their storage order */
|
|
|
+ s->core[0] = _mm_set_epi64x(
|
|
|
+ 0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL); /* SHA-256 initial values for A,B,E,F */
|
|
|
+ s->core[1] = _mm_set_epi64x(
|
|
|
+ 0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL); /* SHA-256 initial values for C,D,G,H */
|
|
|
|
|
|
|
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
|
|
|
+ sha256_block_setup(&s->blk);
|
|
|
+
|
|
|
+ s->hash.vt = alg; /* record the vtable this object was created through */
|
|
|
+ BinarySink_INIT(s, sha256_ni_write); /* data written to the sink goes to sha256_ni_write */
|
|
|
+ BinarySink_DELEGATE_INIT(&s->hash, s);
|
|
|
+ return &s->hash; /* callers receive the embedded ssh_hash, not the sha256_ni */
|
|
|
+}
|
|
|
+
|
|
|
+static ssh_hash *sha256_ni_copy(ssh_hash *hash) /* duplicates a hash object into a newly allocated, separately freeable context */
|
|
|
{
|
|
|
- unreachable("SHA256_ni not compiled in");
|
|
|
+ sha256_ni *s = container_of(hash, sha256_ni, hash);
|
|
|
+ sha256_ni *copy = sha256_ni_alloc();
|
|
|
+
|
|
|
+ void *ptf_save = copy->pointer_to_free; /* the structure copy below would overwrite it with the source's pointer */
|
|
|
+ *copy = *s; /* structure copy */
|
|
|
+ copy->pointer_to_free = ptf_save; /* the copy must free its own allocation, not the original's */
|
|
|
+
|
|
|
+ BinarySink_COPIED(copy); /* presumably re-targets the sink at the new object — confirm against the macro */
|
|
|
+ BinarySink_DELEGATE_INIT(&copy->hash, copy);
|
|
|
+
|
|
|
+ return &copy->hash;
|
|
|
+}
|
|
|
+
|
|
|
+static void sha256_ni_free(ssh_hash *hash) /* clears and releases the context that contains this ssh_hash */
|
|
|
+{
|
|
|
+ sha256_ni *s = container_of(hash, sha256_ni, hash);
|
|
|
+
|
|
|
+ void *ptf = s->pointer_to_free; /* read it now: the next line clears the struct that holds it */
|
|
|
+ smemclr(s, sizeof(*s)); /* presumably zeroes the whole context so no hash state lingers — confirm smemclr */
|
|
|
+ sfree(ptf); /* free the original allocation, which may differ from the aligned address s */
|
|
|
+}
|
|
|
+
|
|
|
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len) /* BinarySink write function: feeds len bytes at vp into the hash */
|
|
|
+{
|
|
|
+ sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
|
|
|
+
|
|
|
+ while (len > 0)
|
|
|
+ if (sha256_block_write(&s->blk, &vp, &len)) /* presumably consumes input (updating vp/len) and reports a completed block — confirm */
|
|
|
+ sha256_ni_block(s->core, s->blk.block); /* mix the buffered block into the state vectors */
|
|
|
+}
|
|
|
+
|
|
|
+FUNC_ISA static void sha256_ni_final(ssh_hash *hash, uint8_t *digest) /* pads, writes 32 bytes to digest, then frees the hash object */
|
|
|
+{
|
|
|
+ sha256_ni *s = container_of(hash, sha256_ni, hash);
|
|
|
+
|
|
|
+ sha256_block_pad(&s->blk, BinarySink_UPCAST(s)); /* padding is emitted through this object's own sink */
|
|
|
+
|
|
|
+ /* Rearrange the words into the output order */
|
|
|
+ __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B); /* 0x1B reverses the four 32-bit lanes */
|
|
|
+ __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1); /* 0xB1 swaps each adjacent pair of lanes */
|
|
|
+ __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0); /* low 64 bits from feba, high 64 bits from dchg */
|
|
|
+ __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8); /* high half of feba joined with low half of dchg */
|
|
|
+
|
|
|
+ /* Byte-swap them into the output endianness */
|
|
|
+ const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); /* reverses the bytes within each 32-bit lane */
|
|
|
+ dcba = _mm_shuffle_epi8(dcba, mask);
|
|
|
+ hgfe = _mm_shuffle_epi8(hgfe, mask);
|
|
|
+
|
|
|
+ /* And store them */
|
|
|
+ __m128i *output = (__m128i *)digest;
|
|
|
+ _mm_storeu_si128(output, dcba); /* unaligned store: digest need not be 16-byte aligned */
|
|
|
+ _mm_storeu_si128(output+1, hgfe); /* second 16 bytes of the 32-byte digest */
|
|
|
+
|
|
|
+ sha256_ni_free(hash); /* the object is destroyed here; hash must not be used afterwards */
|
|
|
}
|
|
|
|
|
|
-#endif /* COMPILER_SUPPORTS_AES_NI */
|
|
|
+const ssh_hashalg ssh_sha256_hw = { /* vtable for the SHA-NI implementation */
|
|
|
+ sha256_ni_new, sha256_ni_copy, sha256_ni_final, sha256_ni_free,
|
|
|
+ 32, 64, "SHA-256", /* presumably digest length, block length and display name — confirm against ssh_hashalg */
|
|
|
+};
|
|
|
+
|
|
|
+/* ----------------------------------------------------------------------
|
|
|
+ * Stub functions if we have no hardware-accelerated SHA-256. In this
|
|
|
+ * case, sha256_stub_new returns NULL (though it should also never be
|
|
|
+ * selected by sha256_select, so the only thing that should even be
|
|
|
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
|
|
|
+ * functions should never be called at all.
|
|
|
+ */
|
|
|
+
|
|
|
+#elif HW_SHA256 == HW_SHA256_NONE
|
|
|
+
|
|
|
+static bool sha256_hw_available(void) /* build without hardware SHA-256: always reports unavailable */
|
|
|
+{
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+static ssh_hash *sha256_stub_new(const ssh_hashalg *alg) /* stub constructor: never creates a hash object */
|
|
|
+{
|
|
|
+ return NULL;
|
|
|
+}
|
|
|
+
|
|
|
+#define STUB_BODY { unreachable("Should never be called"); }
|
|
|
+
|
|
|
+static ssh_hash *sha256_stub_copy(ssh_hash *hash) STUB_BODY /* no object can exist, since sha256_stub_new returns NULL */
|
|
|
+static void sha256_stub_free(ssh_hash *hash) STUB_BODY /* same reasoning: there is never an object to free */
|
|
|
+static void sha256_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY /* same reasoning: there is never an object to finalise */
|
|
|
+
|
|
|
+const ssh_hashalg ssh_sha256_hw = { /* placeholder vtable used when no hardware SHA-256 is compiled in */
|
|
|
+ sha256_stub_new, sha256_stub_copy, sha256_stub_final, sha256_stub_free,
|
|
|
+ 32, 64, "SHA-256", /* same three trailing values as the SHA-NI vtable */
|
|
|
+};
|
|
|
+
|
|
|
+#endif /* HW_SHA256 */
|