
PuTTY snapshot cbbd464f (Rewrite the SHA-256 and SHA-1 hash function modules - 2019-01-24)

Source commit: 27f8edfc2f8a724b144bb281060e33ba0135f5a9
Author: Martin Prikryl
Parent commit: 169f22364b
3 changed files with 1131 additions and 896 deletions:
  1. source/putty/ssh.h (+4 -25)
  2. source/putty/sshsh256.c (+590 -414)
  3. source/putty/sshsha.c (+537 -457)
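The shape of the change, before the per-file diffs: the old code kept a single state struct per hash, with an internal function pointer switching between a software block function and an SHA-NI one, plus a global supports_sha_ni() exposed from ssh.h. The new code gives each hash three ssh_hashalg vtables: an abstract ssh_sha256/ssh_sha1 whose `new' method selects an implementation at runtime, and concrete _sw and _hw vtables that do the work. A minimal sketch of that pattern (all names illustrative, not PuTTY's real API):

    #include <stdbool.h>

    typedef struct hash hash;
    typedef struct hashalg hashalg;
    struct hashalg { hash *(*new)(const hashalg *alg); };
    struct hash { const hashalg *vt; };

    extern const hashalg alg_sw, alg_hw;   /* concrete implementations */

    static bool hw_available_cached(void)
    {
        static bool initialised, available;
        if (!initialised) {
            available = false;  /* stand-in for the real CPUID probe */
            initialised = true;
        }
        return available;
    }

    /* The abstract vtable's `new' method: pick a real vtable and
     * construct through it, so no caller ever holds an object whose
     * vt is the abstract one. */
    static hash *select_new(const hashalg *alg)
    {
        const hashalg *real = hw_available_cached() ? &alg_hw : &alg_sw;
        return real->new(real);
    }

    const hashalg alg_abstract = { select_new };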

source/putty/ssh.h (+4 -25)

@@ -558,8 +558,6 @@ struct ssh_cipher {
     const ssh_cipheralg *vt;
 };
 
-bool supports_sha_ni(void);
-
 struct ssh_cipheralg {
     ssh_cipher *(*new)(const ssh_cipheralg *alg);
     void (*free)(ssh_cipher *);
@@ -819,7 +817,11 @@ extern const ssh2_ciphers ssh2_arcfour;
 extern const ssh2_ciphers ssh2_ccp;
 extern const ssh_hashalg ssh_md5;
 extern const ssh_hashalg ssh_sha1;
+extern const ssh_hashalg ssh_sha1_hw;
+extern const ssh_hashalg ssh_sha1_sw;
 extern const ssh_hashalg ssh_sha256;
+extern const ssh_hashalg ssh_sha256_hw;
+extern const ssh_hashalg ssh_sha256_sw;
 extern const ssh_hashalg ssh_sha384;
 extern const ssh_hashalg ssh_sha512;
 extern const ssh_kexes ssh_diffiehellman_group1;
@@ -867,29 +869,6 @@ extern const char sshver[];
  */
 extern bool ssh_fallback_cmd(Backend *backend);
 
-/*
- * Check of compiler version
- */
-#ifdef _FORCE_SHA_NI
-#   define COMPILER_SUPPORTS_SHA_NI
-#elif defined(__clang__)
-#   if __has_attribute(target) && __has_include(<shaintrin.h>) && (defined(__x86_64__) || defined(__i386))
-#       define COMPILER_SUPPORTS_SHA_NI
-#   endif
-#elif defined(__GNUC__)
-#    if ((__GNUC__ >= 5) && (defined(__x86_64__) || defined(__i386)))
-#       define COMPILER_SUPPORTS_SHA_NI
-#    endif
-#elif defined (_MSC_VER)
-#   if (defined(_M_X64) || defined(_M_IX86)) && _MSC_VER >= 1900
-#      define COMPILER_SUPPORTS_SHA_NI
-#   endif
-#endif
-
-#ifdef _FORCE_SOFTWARE_SHA
-#   undef COMPILER_SUPPORTS_SHA_NI
-#endif
-
 /*
  * The PRNG type, defined in sshprng.c. Visible data fields are
  * 'savesize', which suggests how many random bytes you should request
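With the _hw and _sw vtables exported alongside the abstract ones, ordinary callers keep hashing through ssh_sha256/ssh_sha1 unchanged, while test code can pin a specific implementation by naming it directly. A usage sketch, assuming the ssh_hash_new/put_data/ssh_hash_final helpers that ssh.h supplies for this vtable:

    unsigned char digest[32];
    ssh_hash *h = ssh_hash_new(&ssh_sha256); /* picks hw or sw */
    put_data(h, "abc", 3);      /* ssh_hash instances are BinarySinks */
    ssh_hash_final(h, digest);  /* writes the digest and frees h */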

source/putty/sshsh256.c (+590 -414)

@@ -7,259 +7,302 @@
 #include "ssh.h"
 #include <assert.h>
 
+/*
+ * Start by deciding whether we can support hardware SHA at all.
+ */
+#define HW_SHA256_NONE 0
+#define HW_SHA256_NI 1
+
+#ifdef _FORCE_SHA_NI
+#   define HW_SHA256 HW_SHA256_NI
+#elif defined(__clang__)
+#   if __has_attribute(target) && __has_include(<wmmintrin.h>) &&       \
+    (defined(__x86_64__) || defined(__i386))
+#       define HW_SHA256 HW_SHA256_NI
+#   endif
+#elif defined(__GNUC__)
+#    if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
+    (defined(__x86_64__) || defined(__i386))
+#       define HW_SHA256 HW_SHA256_NI
+#    endif
+#elif defined (_MSC_VER)
+#   if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
+#      define HW_SHA256 HW_SHA256_NI
+#   endif
+#endif
+
+#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
+#   undef HW_SHA256
+#   define HW_SHA256 HW_SHA256_NONE
+#endif
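Both ends of this cascade can be forced at build time: -D_FORCE_SHA_NI asserts that the toolchain can emit SHA-NI code even when the version checks above cannot prove it, while -D_FORCE_SOFTWARE_SHA compiles the hardware path down to a stub. Either way, the runtime CPUID probe below still decides which implementation actually runs; these macros only control what gets compiled in. For example (a sketch; the exact invocation depends on the build system):

    cc -D_FORCE_SOFTWARE_SHA -c sshsh256.c   # hardware path becomes a stub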
+
+/*
+ * The actual query function that asks if hardware acceleration is
+ * available.
+ */
+static bool sha256_hw_available(void);
+
+/*
+ * A caching wrapper on sha256_hw_available(), so that the underlying
+ * query only has to run once.
+ */
+static bool sha256_hw_available_cached(void)
+{
+    static bool initialised = false;
+    static bool hw_available;
+    if (!initialised) {
+        hw_available = sha256_hw_available();
+        initialised = true;
+    }
+    return hw_available;
+}
+
+static ssh_hash *sha256_select(const ssh_hashalg *alg)
+{
+    const ssh_hashalg *real_alg =
+        sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw;
+
+    return ssh_hash_new(real_alg);
+}
+
+const ssh_hashalg ssh_sha256 = {
+    sha256_select, NULL, NULL, NULL,
+    32, 64, "SHA-256",
+};
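The three NULL slots are the copy, final and free methods. They can never be invoked: sha256_select never returns a hash whose vt points at this abstract ssh_sha256, since every object it hands back was constructed through the concrete hw or sw vtable, whose methods handle it from then on.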
+
 /* ----------------------------------------------------------------------
- * Core SHA256 algorithm: processes 16-word blocks into a message digest.
+ * Definitions likely to be helpful to multiple implementations.
  */
 
-#define ror(x,y) ( ((x) << (32-y)) | (((uint32_t)(x)) >> (y)) )
-#define shr(x,y) ( (((uint32_t)(x)) >> (y)) )
-#define Ch(x,y,z) ( ((x) & (y)) ^ (~(x) & (z)) )
-#define Maj(x,y,z) ( ((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)) )
-#define bigsigma0(x) ( ror((x),2) ^ ror((x),13) ^ ror((x),22) )
-#define bigsigma1(x) ( ror((x),6) ^ ror((x),11) ^ ror((x),25) )
-#define smallsigma0(x) ( ror((x),7) ^ ror((x),18) ^ shr((x),3) )
-#define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) )
-
-typedef struct SHA256_State {
-    uint32_t h[8];
-    unsigned char block[64];
-    int blkused;
+static const uint32_t sha256_initial_state[] = {
+    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
+};
+
+static const uint32_t sha256_round_constants[] = {
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
+
+#define SHA256_ROUNDS 64
+
+typedef struct sha256_block sha256_block;
+struct sha256_block {
+    uint8_t block[64];
+    size_t used;
     uint64_t len;
-    void (*sha256)(struct SHA256_State * s, const unsigned char *p, int len);
-    BinarySink_IMPLEMENTATION;
-} SHA256_State;
-
-static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len);
-static void SHA256_ni(SHA256_State *s, const unsigned char *q, int len);
-
-void SHA256_Core_Init(SHA256_State *s) {
-    s->h[0] = 0x6a09e667;
-    s->h[1] = 0xbb67ae85;
-    s->h[2] = 0x3c6ef372;
-    s->h[3] = 0xa54ff53a;
-    s->h[4] = 0x510e527f;
-    s->h[5] = 0x9b05688c;
-    s->h[6] = 0x1f83d9ab;
-    s->h[7] = 0x5be0cd19;
+};
+
+static inline void sha256_block_setup(sha256_block *blk)
+{
+    blk->used = 0;
+    blk->len = 0;
 }
 
-void SHA256_Block(SHA256_State *s, uint32_t *block) {
-    uint32_t w[80];
-    uint32_t a,b,c,d,e,f,g,h;
-    static const int k[] = {
-        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
-    };
-
-    int t;
-
-    for (t = 0; t < 16; t++)
-        w[t] = block[t];
-
-    for (t = 16; t < 64; t++)
-	w[t] = smallsigma1(w[t-2]) + w[t-7] + smallsigma0(w[t-15]) + w[t-16];
-
-    a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
-    e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];
-
-    for (t = 0; t < 64; t+=8) {
-        uint32_t t1, t2;
-
-#define ROUND(j,a,b,c,d,e,f,g,h) \
-	t1 = h + bigsigma1(e) + Ch(e,f,g) + k[j] + w[j]; \
-	t2 = bigsigma0(a) + Maj(a,b,c); \
-        d = d + t1; h = t1 + t2;
-
-	ROUND(t+0, a,b,c,d,e,f,g,h);
-	ROUND(t+1, h,a,b,c,d,e,f,g);
-	ROUND(t+2, g,h,a,b,c,d,e,f);
-	ROUND(t+3, f,g,h,a,b,c,d,e);
-	ROUND(t+4, e,f,g,h,a,b,c,d);
-	ROUND(t+5, d,e,f,g,h,a,b,c);
-	ROUND(t+6, c,d,e,f,g,h,a,b);
-	ROUND(t+7, b,c,d,e,f,g,h,a);
+static inline bool sha256_block_write(
+    sha256_block *blk, const void **vdata, size_t *len)
+{
+    size_t blkleft = sizeof(blk->block) - blk->used;
+    size_t chunk = *len < blkleft ? *len : blkleft;
+
+    const uint8_t *p = *vdata;
+    memcpy(blk->block + blk->used, p, chunk);
+    *vdata = p + chunk;
+    *len -= chunk;
+    blk->used += chunk;
+    blk->len += chunk;
+
+    if (blk->used == sizeof(blk->block)) {
+        blk->used = 0;
+        return true;
     }
 
-    s->h[0] += a; s->h[1] += b; s->h[2] += c; s->h[3] += d;
-    s->h[4] += e; s->h[5] += f; s->h[6] += g; s->h[7] += h;
+    return false;
+}
+
+static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs)
+{
+    uint64_t final_len = blk->len << 3;
+    size_t pad = 1 + (63 & (55 - blk->used));
+
+    put_byte(bs, 0x80);
+    for (size_t i = 1; i < pad; i++)
+        put_byte(bs, 0);
+    put_uint64(bs, final_len);
+
+    assert(blk->used == 0 && "Should have exactly hit a block boundary");
 }
 
 /* ----------------------------------------------------------------------
- * Outer SHA256 algorithm: take an arbitrary length byte string,
- * convert it into 16-word blocks with the prescribed padding at
- * the end, and pass those blocks to the core SHA256 algorithm.
+ * Software implementation of SHA-256.
  */
 
-#define BLKSIZE 64
+static inline uint32_t ror(uint32_t x, unsigned y)
+{
+    return (x << (31 & -y)) | (x >> (31 & y));
+}
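Masking both shift counts with 31 keeps them in 0..31, so the rotate is well defined for every y; ror(x, 0) degenerates to x | x. The deleted macro's (x) << (32-y) would shift a 32-bit value by 32 when y = 0, which is undefined behaviour in C. The rol in sshsha.c uses the same trick with the two operands swapped.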
 
-static void SHA256_BinarySink_write(BinarySink *bs,
-                                    const void *p, size_t len);
+static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
+{
+    return if0 ^ (ctrl & (if1 ^ if0));
+}
 
-void SHA256_Init(SHA256_State *s) {
-    SHA256_Core_Init(s);
-    s->blkused = 0;
-    s->len = 0;
-    if (supports_sha_ni())
-        s->sha256 = &SHA256_ni;
-    else
-        s->sha256 = &SHA256_sw;
-    BinarySink_INIT(s, SHA256_BinarySink_write);
+static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | (z & (x | y));
 }
 
-static void SHA256_BinarySink_write(BinarySink *bs,
-                                    const void *p, size_t len)
+static inline uint32_t Sigma_0(uint32_t x)
 {
-    struct SHA256_State *s = BinarySink_DOWNCAST(bs, struct SHA256_State);
-    unsigned char *q = (unsigned char *)p;
+    return ror(x,2) ^ ror(x,13) ^ ror(x,22);
+}
 
-    /*
-     * Update the length field.
-     */
-    s->len += len;
+static inline uint32_t Sigma_1(uint32_t x)
+{
+    return ror(x,6) ^ ror(x,11) ^ ror(x,25);
+}
 
-    (*(s->sha256))(s, q, len);
+static inline uint32_t sigma_0(uint32_t x)
+{
+    return ror(x,7) ^ ror(x,18) ^ (x >> 3);
 }
 
-static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len) {
-    uint32_t wordblock[16];
-    int i;
-
-    if (s->blkused && s->blkused+len < BLKSIZE) {
-        /*
-         * Trivial case: just add to the block.
-         */
-        memcpy(s->block + s->blkused, q, len);
-        s->blkused += len;
-    } else {
-        /*
-         * We must complete and process at least one block.
-         */
-        while (s->blkused + len >= BLKSIZE) {
-            memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
-            q += BLKSIZE - s->blkused;
-            len -= BLKSIZE - s->blkused;
-            /* Now process the block. Gather bytes big-endian into words */
-            for (i = 0; i < 16; i++) {
-                wordblock[i] =
-                    ( ((uint32_t)s->block[i*4+0]) << 24 ) |
-                    ( ((uint32_t)s->block[i*4+1]) << 16 ) |
-                    ( ((uint32_t)s->block[i*4+2]) <<  8 ) |
-                    ( ((uint32_t)s->block[i*4+3]) <<  0 );
-            }
-            SHA256_Block(s, wordblock);
-            s->blkused = 0;
-        }
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+static inline uint32_t sigma_1(uint32_t x)
+{
+    return ror(x,17) ^ ror(x,19) ^ (x >> 10);
 }
 
-void SHA256_Final(SHA256_State *s, unsigned char *digest) {
-    int i;
-    int pad;
-    unsigned char c[64];
-    uint64_t len;
+static inline void sha256_sw_round(
+    unsigned round_index, const uint32_t *schedule,
+    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+    uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h)
+{
+    uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
+        sha256_round_constants[round_index] + schedule[round_index];
+
+    uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
 
-    if (s->blkused >= 56)
-        pad = 56 + 64 - s->blkused;
-    else
-        pad = 56 - s->blkused;
+    *d += t1;
+    *h = t1 + t2;
+}
 
-    len = (s->len << 3);
+static void sha256_sw_block(uint32_t *core, const uint8_t *block)
+{
+    uint32_t w[SHA256_ROUNDS];
+    uint32_t a,b,c,d,e,f,g,h;
 
-    memset(c, 0, pad);
-    c[0] = 0x80;
-    put_data(s, &c, pad);
+    for (size_t t = 0; t < 16; t++)
+        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
 
-    put_uint64(s, len);
+    for (size_t t = 16; t < SHA256_ROUNDS; t++)
+	w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16];
 
-    for (i = 0; i < 8; i++) {
-	digest[i*4+0] = (s->h[i] >> 24) & 0xFF;
-	digest[i*4+1] = (s->h[i] >> 16) & 0xFF;
-	digest[i*4+2] = (s->h[i] >>  8) & 0xFF;
-	digest[i*4+3] = (s->h[i] >>  0) & 0xFF;
+    a = core[0]; b = core[1]; c = core[2]; d = core[3];
+    e = core[4]; f = core[5]; g = core[6]; h = core[7];
+
+    for (size_t t = 0; t < SHA256_ROUNDS; t += 8) {
+	sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
+	sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
+	sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
+	sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
+	sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
+	sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
+	sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
+	sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
     }
-}
 
-void SHA256_Simple(const void *p, int len, unsigned char *output) {
-    SHA256_State s;
+    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
+    core[4] += e; core[5] += f; core[6] += g; core[7] += h;
 
-    SHA256_Init(&s);
-    put_data(&s, p, len);
-    SHA256_Final(&s, output);
-    smemclr(&s, sizeof(s));
+    smemclr(w, sizeof(w));
 }
 
-/*
- * Thin abstraction for things where hashes are pluggable.
- */
-
-struct sha256_hash {
-    SHA256_State state;
+typedef struct sha256_sw {
+    uint32_t core[8];
+    sha256_block blk;
+    BinarySink_IMPLEMENTATION;
     ssh_hash hash;
-};
+} sha256_sw;
+
+static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len);
 
-static ssh_hash *sha256_new(const ssh_hashalg *alg)
+static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
 {
-    struct sha256_hash *h = snew(struct sha256_hash);
-    SHA256_Init(&h->state);
-    h->hash.vt = alg;
-    BinarySink_DELEGATE_INIT(&h->hash, &h->state);
-    return &h->hash;
+    sha256_sw *s = snew(sha256_sw);
+
+    memcpy(s->core, sha256_initial_state, sizeof(s->core));
+
+    sha256_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_sw_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
 }
 
-static ssh_hash *sha256_copy(ssh_hash *hashold)
+static ssh_hash *sha256_sw_copy(ssh_hash *hash)
 {
-    struct sha256_hash *hold, *hnew;
-    ssh_hash *hashnew = sha256_new(hashold->vt);
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
+    sha256_sw *copy = snew(sha256_sw);
+
+    memcpy(copy, s, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
 
-    hold = container_of(hashold, struct sha256_hash, hash);
-    hnew = container_of(hashnew, struct sha256_hash, hash);
+    return &copy->hash;
+}
 
-    hnew->state = hold->state;
-    BinarySink_COPIED(&hnew->state);
+static void sha256_sw_free(ssh_hash *hash)
+{
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
 
-    return hashnew;
+    smemclr(s, sizeof(*s));
+    sfree(s);
 }
 
-static void sha256_free(ssh_hash *hash)
+static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
 {
-    struct sha256_hash *h = container_of(hash, struct sha256_hash, hash);
+    sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);
 
-    smemclr(h, sizeof(*h));
-    sfree(h);
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_sw_block(s->core, s->blk.block);
 }
 
-static void sha256_final(ssh_hash *hash, unsigned char *output)
+static void sha256_sw_final(ssh_hash *hash, uint8_t *digest)
 {
-    struct sha256_hash *h = container_of(hash, struct sha256_hash, hash);
-    SHA256_Final(&h->state, output);
-    sha256_free(hash);
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (size_t i = 0; i < 8; i++)
+        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
+    sha256_sw_free(hash);
 }
 
-const ssh_hashalg ssh_sha256 = {
-    sha256_new, sha256_copy, sha256_final, sha256_free, 32, 64, "SHA-256"
+const ssh_hashalg ssh_sha256_sw = {
+    sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
+    32, 64, "SHA-256",
 };
 
-#ifdef COMPILER_SUPPORTS_SHA_NI
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
+ */
 
-#if defined _MSC_VER && defined _M_AMD64
-# include <intrin.h>
-#endif
+#if HW_SHA256 == HW_SHA256_NI
 
 /*
  * Set target architecture for Clang and GCC
@@ -269,7 +312,7 @@ const ssh_hashalg ssh_sha256 = {
 #    pragma GCC target("sse4.1")
 #endif
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 #    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 #    define FUNC_ISA
@@ -278,236 +321,369 @@ const ssh_hashalg ssh_sha256 = {
 #include <wmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
-
 #if defined(__clang__) || defined(__GNUC__)
 #include <shaintrin.h>
 #endif
 
+#if defined(__clang__) || defined(__GNUC__)
+#include <cpuid.h>
+#define GET_CPU_ID_0(out)                               \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out)                                       \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha256_hw_available(void)
+{
+    unsigned int CPUInfo[4];
+    GET_CPU_ID_0(CPUInfo);  
+    if (CPUInfo[0] < 7)
+        return false;
+
+    GET_CPU_ID_7(CPUInfo);
+    return CPUInfo[1] & (1 << 29); /* Check SHA */
+}
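CPUID leaf 0 returns the highest supported leaf in EAX, so the early return guards against CPUs too old to implement leaf 7 at all; in leaf 7, subleaf 0, bit 29 of EBX is the SHA-extensions feature flag (the same bit Linux reports as sha_ni in /proc/cpuinfo).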
+
 /* SHA256 implementation using new instructions
    The code is based on Jeffrey Walton's SHA256 implementation:
    https://github.com/noloader/SHA-Intrinsics
 */
 FUNC_ISA
-static void SHA256_ni_(SHA256_State * s, const unsigned char *q, int len) {
-    if (s->blkused && s->blkused+len < BLKSIZE) {
-        /*
-         * Trivial case: just add to the block.
-         */
-        memcpy(s->block + s->blkused, q, len);
-        s->blkused += len;
-    } else {
-        __m128i STATE0, STATE1;
-        __m128i MSG, TMP;
-        __m128i MSG0, MSG1, MSG2, MSG3;
-        __m128i ABEF_SAVE, CDGH_SAVE;
-        const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
-
-        /* Load initial values */
-        TMP = _mm_loadu_si128((const __m128i*) &s->h[0]);
-        STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]);
-
-        TMP = _mm_shuffle_epi32(TMP, 0xB1);          /* CDAB */
-        STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    /* EFGH */
-        STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    /* ABEF */
-        STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */
-        /*
-         * We must complete and process at least one block.
-         */
-        while (s->blkused + len >= BLKSIZE) {
-            memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
-            q += BLKSIZE - s->blkused;
-            len -= BLKSIZE - s->blkused;
-
-                /* Save current state */
-            ABEF_SAVE = STATE0;
-            CDGH_SAVE = STATE1;
-
-            /* Rounds 0-3 */
-            MSG = _mm_loadu_si128((const __m128i*) (s->block + 0));
-            MSG0 = _mm_shuffle_epi8(MSG, MASK);
-            MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-            /* Rounds 4-7 */
-            MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16));
-            MSG1 = _mm_shuffle_epi8(MSG1, MASK);
-            MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
-
-            /* Rounds 8-11 */
-            MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32));
-            MSG2 = _mm_shuffle_epi8(MSG2, MASK);
-            MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
-
-            /* Rounds 12-15 */
-            MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48));
-            MSG3 = _mm_shuffle_epi8(MSG3, MASK);
-            MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
-            MSG0 = _mm_add_epi32(MSG0, TMP);
-            MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
-
-            /* Rounds 16-19 */
-            MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
-            MSG1 = _mm_add_epi32(MSG1, TMP);
-            MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
-
-            /* Rounds 20-23 */
-            MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
-            MSG2 = _mm_add_epi32(MSG2, TMP);
-            MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
-
-            /* Rounds 24-27 */
-            MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
-            MSG3 = _mm_add_epi32(MSG3, TMP);
-            MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
-
-            /* Rounds 28-31 */
-            MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL,  0xD5A79147C6E00BF3ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
-            MSG0 = _mm_add_epi32(MSG0, TMP);
-            MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
-
-            /* Rounds 32-35 */
-            MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
-            MSG1 = _mm_add_epi32(MSG1, TMP);
-            MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
-
-            /* Rounds 36-39 */
-            MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
-            MSG2 = _mm_add_epi32(MSG2, TMP);
-            MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
-
-            /* Rounds 40-43 */
-            MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
-            MSG3 = _mm_add_epi32(MSG3, TMP);
-            MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
-
-            /* Rounds 44-47 */
-            MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
-            MSG0 = _mm_add_epi32(MSG0, TMP);
-            MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
-
-            /* Rounds 48-51 */
-            MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
-            MSG1 = _mm_add_epi32(MSG1, TMP);
-            MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-            MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
-
-            /* Rounds 52-55 */
-            MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
-            MSG2 = _mm_add_epi32(MSG2, TMP);
-            MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-            /* Rounds 56-59 */
-            MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
-            MSG3 = _mm_add_epi32(MSG3, TMP);
-            MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-            /* Rounds 60-63 */
-            MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
-            STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
-            MSG = _mm_shuffle_epi32(MSG, 0x0E);
-            STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
-
-            /* Combine state  */
-            STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
-            STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
-
-            s->blkused = 0;
-        }
-
-        TMP = _mm_shuffle_epi32(STATE0, 0x1B);       /* FEBA */
-        STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    /* DCHG */
-        STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
-        STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    /* ABEF */
-
-        /* Save state */
-        _mm_storeu_si128((__m128i*) &s->h[0], STATE0);
-        _mm_storeu_si128((__m128i*) &s->h[4], STATE1);
-
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+static inline void sha256_ni_block(__m128i *core, const uint8_t *p)
+{
+    __m128i STATE0, STATE1;
+    __m128i MSG, TMP;
+    __m128i MSG0, MSG1, MSG2, MSG3;
+    const __m128i *block = (const __m128i *)p;
+    const __m128i MASK = _mm_set_epi64x(
+        0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL);
+
+    /* Load initial values */
+    STATE0 = core[0];
+    STATE1 = core[1];
+
+    /* Rounds 0-3 */
+    MSG = _mm_loadu_si128(block);
+    MSG0 = _mm_shuffle_epi8(MSG, MASK);
+    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
+                            0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+    /* Rounds 4-7 */
+    MSG1 = _mm_loadu_si128(block + 1);
+    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
+                            0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
+
+    /* Rounds 8-11 */
+    MSG2 = _mm_loadu_si128(block + 2);
+    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
+                            0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
+
+    /* Rounds 12-15 */
+    MSG3 = _mm_loadu_si128(block + 3);
+    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
+    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
+                            0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
+    MSG0 = _mm_add_epi32(MSG0, TMP);
+    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
+
+    /* Rounds 16-19 */
+    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
+                            0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
+    MSG1 = _mm_add_epi32(MSG1, TMP);
+    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
+
+    /* Rounds 20-23 */
+    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
+                            0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
+    MSG2 = _mm_add_epi32(MSG2, TMP);
+    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
+
+    /* Rounds 24-27 */
+    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
+                            0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
+    MSG3 = _mm_add_epi32(MSG3, TMP);
+    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
+
+    /* Rounds 28-31 */
+    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
+                            0x1429296706CA6351ULL,  0xD5A79147C6E00BF3ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
+    MSG0 = _mm_add_epi32(MSG0, TMP);
+    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
+
+    /* Rounds 32-35 */
+    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
+                            0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
+    MSG1 = _mm_add_epi32(MSG1, TMP);
+    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
+
+    /* Rounds 36-39 */
+    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
+                            0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
+    MSG2 = _mm_add_epi32(MSG2, TMP);
+    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1);
+
+    /* Rounds 40-43 */
+    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
+                            0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
+    MSG3 = _mm_add_epi32(MSG3, TMP);
+    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2);
+
+    /* Rounds 44-47 */
+    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
+                            0x106AA070F40E3585ULL, 0xD6990624D192E819ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG3, MSG2, 4);
+    MSG0 = _mm_add_epi32(MSG0, TMP);
+    MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3);
+
+    /* Rounds 48-51 */
+    MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(
+                            0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG0, MSG3, 4);
+    MSG1 = _mm_add_epi32(MSG1, TMP);
+    MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+    MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0);
+
+    /* Rounds 52-55 */
+    MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(
+                            0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG1, MSG0, 4);
+    MSG2 = _mm_add_epi32(MSG2, TMP);
+    MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+    /* Rounds 56-59 */
+    MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(
+                            0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    TMP = _mm_alignr_epi8(MSG2, MSG1, 4);
+    MSG3 = _mm_add_epi32(MSG3, TMP);
+    MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+    /* Rounds 60-63 */
+    MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(
+                            0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL));
+    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
+    MSG = _mm_shuffle_epi32(MSG, 0x0E);
+    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
+
+    /* Combine state */
+    core[0] = _mm_add_epi32(STATE0, core[0]);
+    core[1] = _mm_add_epi32(STATE1, core[1]);
 }
 
-/*
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
- */
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
+typedef struct sha256_ni {
+    /*
+     * These two vectors store the 8 words of the SHA-256 state, but
+     * not in the same order they appear in the spec: the first word
+     * holds A,B,E,F and the second word C,D,G,H.
+     */
+    __m128i core[2];
+    sha256_block blk;
+    void *pointer_to_free;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha256_ni;
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);
+
+static sha256_ni *sha256_ni_alloc(void)
 {
-    SHA256_ni_(s, q, len);
+    /*
+     * The __m128i variables in the context structure need to be
+     * 16-byte aligned, but not all malloc implementations that this
+     * code has to work with will guarantee to return a 16-byte
+     * aligned pointer. So we over-allocate, manually realign the
+     * pointer ourselves, and store the original one inside the
+     * context so we know how to free it later.
+     */
+    void *allocation = smalloc(sizeof(sha256_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha256_ni *s = (sha256_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
 }
 
-#else /* COMPILER_SUPPORTS_AES_NI */
+FUNC_ISA static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
+{
+    if (!sha256_hw_available_cached())
+        return NULL;
+
+    sha256_ni *s = sha256_ni_alloc();
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
+    s->core[1] = _mm_set_epi64x(
+        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
 
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
+    sha256_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha256_ni_copy(ssh_hash *hash)
 {
-    unreachable("SHA256_ni not compiled in");
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+    sha256_ni *copy = sha256_ni_alloc();
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *s; /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_ni_free(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_ni_block(s->core, s->blk.block);
+}
+
+FUNC_ISA static void sha256_ni_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the words into the output order */
+    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
+    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
+    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
+    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
+
+    /* Byte-swap them into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    dcba = _mm_shuffle_epi8(dcba, mask);
+    hgfe = _mm_shuffle_epi8(hgfe, mask);
+
+    /* And store them */
+    __m128i *output = (__m128i *)digest;
+    _mm_storeu_si128(output, dcba);
+    _mm_storeu_si128(output+1, hgfe);
+
+    sha256_ni_free(hash);
 }
 
-#endif  /* COMPILER_SUPPORTS_AES_NI */
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_ni_new, sha256_ni_copy, sha256_ni_final, sha256_ni_free,
+    32, 64, "SHA-256",
+};
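Keeping the state permanently in the ABEF/CDGH lane order that _mm_sha256rnds2_epu32 consumes means the shuffles the old code performed on every call to SHA256_ni now happen exactly once per digest, in sha256_ni_final, where the words are rearranged into spec order and byte-swapped for output.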
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-256. In this
+ * case, sha256_stub_new returns NULL (though it should also never be
+ * selected by sha256_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
+ */
+
+#elif HW_SHA256 == HW_SHA256_NONE
+
+static bool sha256_hw_available(void)
+{
+    return false;
+}
+
+static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
+{
+    return NULL;
+}
+
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static ssh_hash *sha256_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha256_stub_free(ssh_hash *hash) STUB_BODY
+static void sha256_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_stub_new, sha256_stub_copy, sha256_stub_final, sha256_stub_free,
+    32, 64, "SHA-256",
+};
+
+#endif /* HW_SHA256 */

source/putty/sshsha.c (+537 -457)

@@ -1,294 +1,291 @@
 /*
- * SHA1 hash algorithm. Used in SSH-2 as a MAC, and the transform is
- * also used as a `stirring' function for the PuTTY random number
- * pool. Implemented directly from the specification by Simon
- * Tatham.
+ * SHA-1 algorithm as described at
+ * 
+ *   http://csrc.nist.gov/cryptval/shs.html
  */
 
 #include "ssh.h"
-
 #include <assert.h>
 
-typedef struct SHA_State {
-    uint32_t h[5];
-    unsigned char block[64];
-    int blkused;
-    uint64_t len;
-    void (*sha1)(struct SHA_State * s, const unsigned char *p, int len);
-    BinarySink_IMPLEMENTATION;
-} SHA_State;
+/*
+ * Start by deciding whether we can support hardware SHA at all.
+ */
+#define HW_SHA1_NONE 0
+#define HW_SHA1_NI 1
+
+#ifdef _FORCE_SHA_NI
+#   define HW_SHA1 HW_SHA1_NI
+#elif defined(__clang__)
+#   if __has_attribute(target) && __has_include(<wmmintrin.h>) &&       \
+    (defined(__x86_64__) || defined(__i386))
+#       define HW_SHA1 HW_SHA1_NI
+#   endif
+#elif defined(__GNUC__)
+#    if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
+    (defined(__x86_64__) || defined(__i386))
+#       define HW_SHA1 HW_SHA1_NI
+#    endif
+#elif defined (_MSC_VER)
+#   if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
+#      define HW_SHA1 HW_SHA1_NI
+#   endif
+#endif
+
+#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
+#   undef HW_SHA1
+#   define HW_SHA1 HW_SHA1_NONE
+#endif
+
+/*
+ * The actual query function that asks if hardware acceleration is
+ * available.
+ */
+static bool sha1_hw_available(void);
+
+/*
+ * A caching wrapper on sha1_hw_available(), so that the underlying
+ * query only has to run once.
+ */
+static bool sha1_hw_available_cached(void)
+{
+    static bool initialised = false;
+    static bool hw_available;
+    if (!initialised) {
+        hw_available = sha1_hw_available();
+        initialised = true;
+    }
+    return hw_available;
+}
+
+static ssh_hash *sha1_select(const ssh_hashalg *alg)
+{
+    const ssh_hashalg *real_alg =
+        sha1_hw_available_cached() ? &ssh_sha1_hw : &ssh_sha1_sw;
+
+    return ssh_hash_new(real_alg);
+}
+
+const ssh_hashalg ssh_sha1 = {
+    sha1_select, NULL, NULL, NULL,
+    20, 64, "SHA-1",
+};
 
 /* ----------------------------------------------------------------------
- * Core SHA algorithm: processes 16-word blocks into a message digest.
+ * Definitions likely to be helpful to multiple implementations.
  */
 
-#define rol(x,y) ( ((x) << (y)) | (((uint32_t)x) >> (32-y)) )
+static const uint32_t sha1_initial_state[] = {
+    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0,
+};
 
-static void sha1_sw(SHA_State * s, const unsigned char *q, int len);
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len);
+#define SHA1_ROUNDS_PER_STAGE 20
+#define SHA1_STAGE0_CONSTANT 0x5a827999
+#define SHA1_STAGE1_CONSTANT 0x6ed9eba1
+#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc
+#define SHA1_STAGE3_CONSTANT 0xca62c1d6
+#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE)
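These definitions mirror the structure of SHA-1 in FIPS 180: four stages of twenty rounds, each stage pairing one additive constant with one round function (Ch, then parity, then Maj, then parity again), which is exactly the order sha1_sw_block below applies them in.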
+
+typedef struct sha1_block sha1_block;
+struct sha1_block {
+    uint8_t block[64];
+    size_t used;
+    uint64_t len;
+};
 
-static void SHA_Core_Init(uint32_t h[5])
+static inline void sha1_block_setup(sha1_block *blk)
 {
-    h[0] = 0x67452301;
-    h[1] = 0xefcdab89;
-    h[2] = 0x98badcfe;
-    h[3] = 0x10325476;
-    h[4] = 0xc3d2e1f0;
+    blk->used = 0;
+    blk->len = 0;
 }
 
-void SHATransform(uint32_t * digest, uint32_t * block)
+static inline bool sha1_block_write(
+    sha1_block *blk, const void **vdata, size_t *len)
 {
-    uint32_t w[80];
-    uint32_t a, b, c, d, e;
-    int t;
-
-#ifdef RANDOM_DIAGNOSTICS
-    {
-        extern int random_diagnostics;
-        if (random_diagnostics) {
-            int i;
-            printf("SHATransform:");
-            for (i = 0; i < 5; i++)
-                printf(" %08x", digest[i]);
-            printf(" +");
-            for (i = 0; i < 16; i++)
-                printf(" %08x", block[i]);
-        }
+    size_t blkleft = sizeof(blk->block) - blk->used;
+    size_t chunk = *len < blkleft ? *len : blkleft;
+
+    const uint8_t *p = *vdata;
+    memcpy(blk->block + blk->used, p, chunk);
+    *vdata = p + chunk;
+    *len -= chunk;
+    blk->used += chunk;
+    blk->len += chunk;
+
+    if (blk->used == sizeof(blk->block)) {
+        blk->used = 0;
+        return true;
     }
-#endif
 
-    for (t = 0; t < 16; t++)
-	w[t] = block[t];
+    return false;
+}
 
-    for (t = 16; t < 80; t++) {
-	uint32_t tmp = w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16];
-	w[t] = rol(tmp, 1);
-    }
+static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs)
+{
+    uint64_t final_len = blk->len << 3;
+    size_t pad = 1 + (63 & (55 - blk->used));
 
-    a = digest[0];
-    b = digest[1];
-    c = digest[2];
-    d = digest[3];
-    e = digest[4];
-
-    for (t = 0; t < 20; t++) {
-	uint32_t tmp =
-	    rol(a, 5) + ((b & c) | (d & ~b)) + e + w[t] + 0x5a827999;
-	e = d;
-	d = c;
-	c = rol(b, 30);
-	b = a;
-	a = tmp;
-    }
-    for (t = 20; t < 40; t++) {
-	uint32_t tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0x6ed9eba1;
-	e = d;
-	d = c;
-	c = rol(b, 30);
-	b = a;
-	a = tmp;
-    }
-    for (t = 40; t < 60; t++) {
-	uint32_t tmp = rol(a,
-			 5) + ((b & c) | (b & d) | (c & d)) + e + w[t] +
-	    0x8f1bbcdc;
-	e = d;
-	d = c;
-	c = rol(b, 30);
-	b = a;
-	a = tmp;
-    }
-    for (t = 60; t < 80; t++) {
-	uint32_t tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0xca62c1d6;
-	e = d;
-	d = c;
-	c = rol(b, 30);
-	b = a;
-	a = tmp;
-    }
+    put_byte(bs, 0x80);
+    for (size_t i = 1; i < pad; i++)
+        put_byte(bs, 0);
+    put_uint64(bs, final_len);
 
-    digest[0] += a;
-    digest[1] += b;
-    digest[2] += c;
-    digest[3] += d;
-    digest[4] += e;
-
-#ifdef RANDOM_DIAGNOSTICS
-    {
-        extern int random_diagnostics;
-        if (random_diagnostics) {
-            int i;
-            printf(" =");
-            for (i = 0; i < 5; i++)
-                printf(" %08x", digest[i]);
-            printf("\n");
-        }
-    }
-#endif
+    assert(blk->used == 0 && "Should have exactly hit a block boundary");
 }
 
 /* ----------------------------------------------------------------------
- * Outer SHA algorithm: take an arbitrary length byte string,
- * convert it into 16-word blocks with the prescribed padding at
- * the end, and pass those blocks to the core SHA algorithm.
+ * Software implementation of SHA-1.
  */
 
-static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len);
-
-void SHA_Init(SHA_State * s)
+static inline uint32_t rol(uint32_t x, unsigned y)
 {
-    SHA_Core_Init(s->h);
-    s->blkused = 0;
-    s->len = 0;
-    if (supports_sha_ni())
-        s->sha1 = &sha1_ni;
-    else
-        s->sha1 = &sha1_sw;
-    BinarySink_INIT(s, SHA_BinarySink_write);
+    return (x << (31 & y)) | (x >> (31 & -y));
 }
 
-static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len)
+static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
 {
-    struct SHA_State *s = BinarySink_DOWNCAST(bs, struct SHA_State);
-    const unsigned char *q = (const unsigned char *) p;
-
-    /*
-     * Update the length field.
-     */
-    s->len += len;
+    return if0 ^ (ctrl & (if1 ^ if0));
+}
 
-    (*(s->sha1))(s, q, len);
+static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z)
+{
+    return (x & y) | (z & (x | y));
 }
 
-static void sha1_sw(SHA_State * s, const unsigned char *q, int len)
+static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z)
 {
-    uint32_t wordblock[16];
-    int i;
-
-    if (s->blkused && s->blkused + len < 64) {
-	/*
-	 * Trivial case: just add to the block.
-	 */
-	memcpy(s->block + s->blkused, q, len);
-	s->blkused += len;
-    } else {
-	/*
-	 * We must complete and process at least one block.
-	 */
-	while (s->blkused + len >= 64) {
-	    memcpy(s->block + s->blkused, q, 64 - s->blkused);
-	    q += 64 - s->blkused;
-	    len -= 64 - s->blkused;
-	    /* Now process the block. Gather bytes big-endian into words */
-	    for (i = 0; i < 16; i++) {
-		wordblock[i] =
-		    (((uint32_t) s->block[i * 4 + 0]) << 24) |
-		    (((uint32_t) s->block[i * 4 + 1]) << 16) |
-		    (((uint32_t) s->block[i * 4 + 2]) << 8) |
-		    (((uint32_t) s->block[i * 4 + 3]) << 0);
-	    }
-	    SHATransform(s->h, wordblock);
-	    s->blkused = 0;
-	}
-	memcpy(s->block, q, len);
-	s->blkused = len;
-    }
+    return (x ^ y ^ z);
 }
 
-void SHA_Final(SHA_State * s, unsigned char *output)
+static inline void sha1_sw_round(
+    unsigned round_index, const uint32_t *schedule,
+    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
+    uint32_t f, uint32_t constant)
 {
-    int i;
-    int pad;
-    unsigned char c[64];
-    uint64_t len;
+    *e = rol(*a, 5) + f + *e + schedule[round_index] + constant;
+    *b = rol(*b, 30);
+}
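Where the deleted loop shifted all five registers every round (e = d; d = c; c = rol(b, 30); and so on), each round here writes only two: the variable currently acting as e receives the new working value, the one acting as b is rotated in place, and the call sites in sha1_sw_block rotate which variable plays which role.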
 
-    if (s->blkused >= 56)
-	pad = 56 + 64 - s->blkused;
-    else
-	pad = 56 - s->blkused;
+static void sha1_sw_block(uint32_t *core, const uint8_t *block)
+{
+    uint32_t w[SHA1_ROUNDS];
+    uint32_t a,b,c,d,e;
 
-    len = (s->len << 3);
+    for (size_t t = 0; t < 16; t++)
+        w[t] = GET_32BIT_MSB_FIRST(block + 4*t);
 
-    memset(c, 0, pad);
-    c[0] = 0x80;
-    put_data(s, &c, pad);
+    for (size_t t = 16; t < SHA1_ROUNDS; t++)
+	w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
 
-    put_uint64(s, len);
+    a = core[0]; b = core[1]; c = core[2]; d = core[3];
+    e = core[4];
 
-    for (i = 0; i < 5; i++) {
-	output[i * 4] = (s->h[i] >> 24) & 0xFF;
-	output[i * 4 + 1] = (s->h[i] >> 16) & 0xFF;
-	output[i * 4 + 2] = (s->h[i] >> 8) & 0xFF;
-	output[i * 4 + 3] = (s->h[i]) & 0xFF;
+    size_t t = 0;
+    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
+        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT);
+        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT);
+        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT);
+        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT);
+        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT);
+    }
+    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
+        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT);
+        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT);
+        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT);
+        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT);
+        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT);
+    }
+    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
+        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT);
+        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT);
+        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT);
+        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT);
+        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT);
+    }
+    for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) {
+        sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT);
+        sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT);
+        sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT);
+        sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT);
+        sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT);
     }
-}
 
-void SHA_Simple(const void *p, int len, unsigned char *output)
-{
-    SHA_State s;
+    core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e;
 
-    SHA_Init(&s);
-    put_data(&s, p, len);
-    SHA_Final(&s, output);
-    smemclr(&s, sizeof(s));
+    smemclr(w, sizeof(w));
 }
 
-/*
- * Thin abstraction for things where hashes are pluggable.
- */
-
-struct sha1_hash {
-    SHA_State state;
+typedef struct sha1_sw {
+    uint32_t core[5];
+    sha1_block blk;
+    BinarySink_IMPLEMENTATION;
     ssh_hash hash;
-};
+} sha1_sw;
+
+static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len);
 
-static ssh_hash *sha1_new(const ssh_hashalg *alg)
+static ssh_hash *sha1_sw_new(const ssh_hashalg *alg)
 {
-    struct sha1_hash *h = snew(struct sha1_hash);
-    SHA_Init(&h->state);
-    h->hash.vt = alg;
-    BinarySink_DELEGATE_INIT(&h->hash, &h->state);
-    return &h->hash;
+    sha1_sw *s = snew(sha1_sw);
+
+    memcpy(s->core, sha1_initial_state, sizeof(s->core));
+
+    sha1_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_sw_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
 }
 
-static ssh_hash *sha1_copy(ssh_hash *hashold)
+static ssh_hash *sha1_sw_copy(ssh_hash *hash)
 {
-    struct sha1_hash *hold, *hnew;
-    ssh_hash *hashnew = sha1_new(hashold->vt);
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
+    sha1_sw *copy = snew(sha1_sw);
+
+    memcpy(copy, s, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
 
-    hold = container_of(hashold, struct sha1_hash, hash);
-    hnew = container_of(hashnew, struct sha1_hash, hash);
+    return &copy->hash;
+}
 
-    hnew->state = hold->state;
-    BinarySink_COPIED(&hnew->state);
+static void sha1_sw_free(ssh_hash *hash)
+{
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
 
-    return hashnew;
+    smemclr(s, sizeof(*s));
+    sfree(s);
 }
 
-static void sha1_free(ssh_hash *hash)
+static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
 {
-    struct sha1_hash *h = container_of(hash, struct sha1_hash, hash);
+    sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw);
 
-    smemclr(h, sizeof(*h));
-    sfree(h);
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_sw_block(s->core, s->blk.block);
 }
 
-static void sha1_final(ssh_hash *hash, unsigned char *output)
+static void sha1_sw_final(ssh_hash *hash, uint8_t *digest)
 {
-    struct sha1_hash *h = container_of(hash, struct sha1_hash, hash);
-    SHA_Final(&h->state, output);
-    sha1_free(hash);
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (size_t i = 0; i < 5; i++)
+        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
+    sha1_sw_free(hash);
 }
 
-const ssh_hashalg ssh_sha1 = {
-    sha1_new, sha1_copy, sha1_final, sha1_free, 20, 64, "SHA-1"
+const ssh_hashalg ssh_sha1_sw = {
+    sha1_sw_new, sha1_sw_copy, sha1_sw_final, sha1_sw_free,
+    20, 64, "SHA-1",
 };
 
-#ifdef COMPILER_SUPPORTS_SHA_NI
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI.
+ */
 
-#if defined _MSC_VER && defined _M_AMD64
-# include <intrin.h>
-#endif
+#if HW_SHA1 == HW_SHA1_NI
 
 /*
  * Set target architecture for Clang and GCC
@@ -298,7 +295,7 @@ const ssh_hashalg ssh_sha1 = {
 #    pragma GCC target("sse4.1")
 #endif
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 #    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 #    define FUNC_ISA
@@ -307,270 +304,353 @@ const ssh_hashalg ssh_sha1 = {
 #include <wmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
-
 #if defined(__clang__) || defined(__GNUC__)
 #include <shaintrin.h>
 #endif
 
-/*
- * Determinators of CPU type
- */
 #if defined(__clang__) || defined(__GNUC__)
-
 #include <cpuid.h>
-bool supports_sha_ni(void)
-{
-    unsigned int CPUInfo[4];
-    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-    if (CPUInfo[0] < 7)
-        return false;
-
-    __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-    return CPUInfo[1] & (1 << 29); /* SHA */
-}
-
-#else /* defined(__clang__) || defined(__GNUC__) */
+#define GET_CPU_ID_0(out)                               \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out)                                       \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
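The two macros paper over the incompatible cpuid intrinsics: the GCC/Clang versions from <cpuid.h> take the leaf number first and four output lvalues, while MSVC's __cpuid/__cpuidex fill an int[4] array. Either way the feature words end up indexed identically, as the detection function below relies on:

    /* After querying leaf 7, subleaf 0, the SHA extensions flag is
     * bit 29 of EBX, i.e. element 1 of the output array. */
    unsigned int info[4];   /* EAX, EBX, ECX, EDX in that order */
    GET_CPU_ID_7(info);
    bool has_sha = info[1] & (1u << 29);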
 
-bool supports_sha_ni(void)
+static bool sha1_hw_available(void)
 {
     unsigned int CPUInfo[4];
-    __cpuid(CPUInfo, 0);  
+    GET_CPU_ID_0(CPUInfo);
     if (CPUInfo[0] < 7)
         return false;
 
-    __cpuidex(CPUInfo, 7, 0);
+    GET_CPU_ID_7(CPUInfo);
     return CPUInfo[1] & (1 << 29); /* Check SHA */
 }
 
-#endif /* defined(__clang__) || defined(__GNUC__) */
-
 /* SHA1 implementation using new instructions
    The code is based on Jeffrey Walton's SHA1 implementation:
    https://github.com/noloader/SHA-Intrinsics
 */
 FUNC_ISA
-static void sha1_ni_(SHA_State * s, const unsigned char *q, int len)
+static inline void sha1_ni_block(__m128i *core, const uint8_t *p)
 {
-    if (s->blkused && s->blkused + len < 64) {
-      /*
-       * Trivial case: just add to the block.
-       */
-       memcpy(s->block + s->blkused, q, len);
-       s->blkused += len;
-    } else {
-        __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
-        const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
-
-        ABCD = _mm_loadu_si128((const __m128i*) s->h);
-        E0 = _mm_set_epi32(s->h[4], 0, 0, 0);
-        ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
-
-        /*
-         * We must complete and process at least one block.
-         */
-        while (s->blkused + len >= 64)
-        {
-            __m128i MSG0, MSG1, MSG2, MSG3;
-            memcpy(s->block + s->blkused, q, 64 - s->blkused);
-            q += 64 - s->blkused;
-            len -= 64 - s->blkused;
-
-            /* Save current state  */
-            ABCD_SAVE = ABCD;
-            E0_SAVE = E0;
-
-            /* Rounds 0-3 */
-            MSG0 = _mm_loadu_si128((const __m128i*)(s->block + 0));
-            MSG0 = _mm_shuffle_epi8(MSG0, MASK);
-            E0 = _mm_add_epi32(E0, MSG0);
-            E1 = ABCD;
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
-
-            /* Rounds 4-7 */
-            MSG1 = _mm_loadu_si128((const __m128i*)(s->block + 16));
-            MSG1 = _mm_shuffle_epi8(MSG1, MASK);
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-
-            /* Rounds 8-11 */
-            MSG2 = _mm_loadu_si128((const __m128i*)(s->block + 32));
-            MSG2 = _mm_shuffle_epi8(MSG2, MASK);
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
-
-            /* Rounds 12-15 */
-            MSG3 = _mm_loadu_si128((const __m128i*)(s->block + 48));
-            MSG3 = _mm_shuffle_epi8(MSG3, MASK);
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
-
-            /* Rounds 16-19 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
-
-            /* Rounds 20-23 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
-
-            /* Rounds 24-27 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
-
-            /* Rounds 28-31 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
-
-            /* Rounds 32-35 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
-
-            /* Rounds 36-39 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
-
-            /* Rounds 40-43 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
-
-            /* Rounds 44-47 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
-
-            /* Rounds 48-51 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
-
-            /* Rounds 52-55 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
-
-            /* Rounds 56-59 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
-
-            /* Rounds 60-63 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
-
-            /* Rounds 64-67 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
-
-            /* Rounds 68-71 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
-
-            /* Rounds 72-75 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
-
-            /* Rounds 76-79 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-
-            /* Combine state */
-            E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
-            ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
-
-            s->blkused = 0;
-        }
-
-        ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
-
-        /* Save state */
-        _mm_storeu_si128((__m128i*) s->h, ABCD);
-        s->h[4] = _mm_extract_epi32(E0, 3);
-
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+    __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3;
+    const __m128i MASK = _mm_set_epi64x(
+        0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
+
+    const __m128i *block = (const __m128i *)p;
+
+    /* Load initial values */
+    ABCD = core[0];
+    E0 = core[1];
+
+    /* Rounds 0-3 */
+    MSG0 = _mm_loadu_si128(block);
+    MSG0 = _mm_shuffle_epi8(MSG0, MASK);
+    E0 = _mm_add_epi32(E0, MSG0);
+    E1 = ABCD;
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+
+    /* Rounds 4-7 */
+    MSG1 = _mm_loadu_si128(block + 1);
+    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+
+    /* Rounds 8-11 */
+    MSG2 = _mm_loadu_si128(block + 2);
+    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+    /* Rounds 12-15 */
+    MSG3 = _mm_loadu_si128(block + 3);
+    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+    /* Rounds 16-19 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+    /* Rounds 20-23 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+    /* Rounds 24-27 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+    /* Rounds 28-31 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+    /* Rounds 32-35 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+    /* Rounds 36-39 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+    /* Rounds 40-43 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+    /* Rounds 44-47 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+    /* Rounds 48-51 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+    /* Rounds 52-55 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+    /* Rounds 56-59 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
+
+    /* Rounds 60-63 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
+
+    /* Rounds 64-67 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
+
+    /* Rounds 68-71 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
+
+    /* Rounds 72-75 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+
+    /* Rounds 76-79 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+
+    /* Combine state */
+    core[0] = _mm_add_epi32(ABCD, core[0]);
+    core[1] = _mm_sha1nexte_epu32(E0, core[1]);
 }
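The immediate operand of _mm_sha1rnds4_epu32 selects which of the four SHA-1 round constants (and matching boolean functions) the four rounds apply, which is why it steps 0, 1, 2, 3 every twenty rounds above:

    /* Constant chosen by the imm8 argument of _mm_sha1rnds4_epu32,
     * per the FIPS 180-4 schedule: */
    static const uint32_t sha1_rnds4_constant[4] = {
        0x5a827999,   /* imm8 0: Ch,     rounds  0-19 */
        0x6ed9eba1,   /* imm8 1: Parity, rounds 20-39 */
        0x8f1bbcdc,   /* imm8 2: Maj,    rounds 40-59 */
        0xca62c1d6,   /* imm8 3: Parity, rounds 60-79 */
    };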
 
-/*
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
- */
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len)
+typedef struct sha1_ni {
+    /*
+     * core[0] stores the first four words of the SHA-1 state. core[1]
+     * stores just the fifth word, in the vector lane at the highest
+     * address.
+     */
+    __m128i core[2];
+    sha1_block blk;
+    void *pointer_to_free;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha1_ni;
+
+static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len);
+
+static sha1_ni *sha1_ni_alloc(void)
 {
-    sha1_ni_(s, q, len);
+    /*
+     * The __m128i variables in the context structure need to be
+     * 16-byte aligned, but not all malloc implementations that this
+     * code has to work with will guarantee to return a 16-byte
+     * aligned pointer. So we over-allocate, manually realign the
+     * pointer ourselves, and store the original one inside the
+     * context so we know how to free it later.
+     */
+    void *allocation = smalloc(sizeof(sha1_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha1_ni *s = (sha1_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
 }
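The same over-allocate-and-realign trick in isolation: adding 15 and masking off the low four bits rounds any address up to the next 16-byte boundary, wasting at most 15 bytes. A self-contained sketch, not PuTTY API (needs <stdlib.h> and <stdint.h>):

    /* e.g. a raw pointer at 0x1009 becomes (0x1009 + 15) & ~15
     * = 0x1010, which is 16-byte aligned. */
    void *alloc16(size_t size, void **to_free)
    {
        void *raw = malloc(size + 15);
        if (!raw) return NULL;
        *to_free = raw;   /* this, not the aligned pointer, is freed */
        return (void *)(((uintptr_t)raw + 15) & ~(uintptr_t)15);
    }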
 
-#else /* COMPILER_SUPPORTS_AES_NI */
+FUNC_ISA static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
+{
+    if (!sha1_hw_available_cached())
+        return NULL;
+
+    sha1_ni *s = sha1_ni_alloc();
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
+    s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
+
+    sha1_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
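_mm_set_epi64x takes its high 64 bits first and _mm_set_epi32 its highest lane first, so after these initialisers the lanes hold (highest lane first):

    /* core[0] = [ 0x67452301 | 0xefcdab89 | 0x98badcfe | 0x10325476 ]
     * core[1] = [ 0xc3d2e1f0 |     0      |     0      |     0      ]
     * i.e. a..d in core[0] and e in core[1]'s highest lane, as the
     * struct comment above describes. */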
 
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len)
+static ssh_hash *sha1_ni_copy(ssh_hash *hash)
 {
-    unreachable("sha1_ni not compiled in");
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+    sha1_ni *copy = sha1_ni_alloc();
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *s; /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
 }
 
-bool supports_sha_ni(void)
+static void sha1_ni_free(ssh_hash *hash)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_ni_block(s->core, s->blk.block);
+}
+
+FUNC_ISA static void sha1_ni_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the first vector into its output order */
+    __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B);
+
+    /* Byte-swap it into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    abcd = _mm_shuffle_epi8(abcd, mask);
+
+    /* And store it */
+    _mm_storeu_si128((__m128i *)digest, abcd);
+
+    /* Finally, store the leftover word */
+    uint32_t e = _mm_extract_epi32(s->core[1], 3);
+    PUT_32BIT_MSB_FIRST(digest + 16, e);
+
+    sha1_ni_free(hash);
+}
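_mm_shuffle_epi32 with 0x1B (binary 00'01'10'11, i.e. destination lanes 0..3 pick source lanes 3,2,1,0) reverses the four lanes back into a,b,c,d order, and the epi8 mask then byte-swaps each lane into big-endian, so the three SIMD steps are a vectorised form of:

    /* Scalar equivalent of the shuffle + byte-swap + store above,
     * with state[] a hypothetical uint32_t[4] holding a,b,c,d: */
    for (size_t i = 0; i < 4; i++)
        PUT_32BIT_MSB_FIRST(digest + 4*i, state[i]);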
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_ni_new, sha1_ni_copy, sha1_ni_final, sha1_ni_free,
+    20, 64, "SHA-1",
+};
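Since sha1_ni_new returns NULL when SHA-NI is absent, something above it must choose between the two vtables; judging from the stub comment below, the earlier part of this file presumably defines a selector along these lines (a sketch, not the exact code):

    /* Presumed shape of the selection layer: pick the hardware
     * vtable when available, else the portable one. */
    static ssh_hash *sha1_select(const ssh_hashalg *alg)
    {
        const ssh_hashalg *real_alg =
            sha1_hw_available_cached() ? &ssh_sha1_hw : &ssh_sha1_sw;
        return real_alg->new(real_alg);
    }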
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-1. In this
+ * case, sha1_stub_new returns NULL (though it should also never be
+ * selected by sha1_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
+ */
+
+#elif HW_SHA1 == HW_SHA1_NONE
+
+static bool sha1_hw_available(void)
 {
     return false;
 }
 
-#endif  /* COMPILER_SUPPORTS_AES_NI */
+static ssh_hash *sha1_stub_new(const ssh_hashalg *alg)
+{
+    return NULL;
+}
+
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static ssh_hash *sha1_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha1_stub_free(ssh_hash *hash) STUB_BODY
+static void sha1_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_stub_new, sha1_stub_copy, sha1_stub_final, sha1_stub_free,
+    20, 64, "SHA-1",
+};
+
+#endif /* HW_SHA1 */