Pārlūkot izejas kodu

Bug 1986: Support for PPK version 3 keys

https://winscp.net/tracker/1986

Source commit: 96acc7e21d3a9ef78274bcda7e380d2660b585d0
Martin Prikryl 4 gadi atpakaļ
vecāks
revīzija
307619dd88

+ 6 - 0
source/Putty.cbproj

@@ -223,12 +223,18 @@
 			<BuildOrder>29</BuildOrder>
 			<BuildOrder>21</BuildOrder>
 		</CppCompile>
+		<CppCompile Include="putty\sshargon2.c">
+			<BuildOrder>29</BuildOrder>
+		</CppCompile>
 		<CppCompile Include="putty\sshauxcrypt.c">
 			<BuildOrder>78</BuildOrder>
 		</CppCompile>
 		<CppCompile Include="putty\sshbcrypt.c">
 			<BuildOrder>50</BuildOrder>
 		</CppCompile>
+		<CppCompile Include="putty\sshblake2.c">
+			<BuildOrder>50</BuildOrder>
+		</CppCompile>
 		<CppCompile Include="putty\sshblowf.c">
 			<BuildOrder>34</BuildOrder>
 			<BuildOrder>32</BuildOrder>

+ 11 - 4
source/core/PuttyIntf.cpp

@@ -712,10 +712,15 @@ void SaveKey(TKeyType KeyType, const UnicodeString & FileName,
     switch (KeyType)
     {
       case ktSSH2:
-        if (!ssh2_save_userkey(KeyFile, Ssh2Key, PassphrasePtr))
         {
-          int Error = errno;
-          throw EOSExtException(FMTLOAD(KEY_SAVE_ERROR, (FileName)), Error);
+          ppk_save_parameters Params = ppk_save_default_parameters;
+          // Other parameters are probably not relevant for version 2
+          Params.fmt_version = 2;
+          if (!ppk_save_f(KeyFile, Ssh2Key, PassphrasePtr, &Params))
+          {
+            int Error = errno;
+            throw EOSExtException(FMTLOAD(KEY_SAVE_ERROR, (FileName)), Error);
+          }
         }
         break;
 
@@ -901,8 +906,10 @@ UnicodeString __fastcall ParseOpenSshPubLine(const UnicodeString & Line, const s
   char * CommentPtr = NULL;
   const char * ErrorStr = NULL;
   strbuf * PubBlobBuf = strbuf_new();
+  BinarySource Source[1];
+  BinarySource_BARE_INIT(Source, UtfLine.c_str(), UtfLine.Length());
   UnicodeString Result;
-  if (!openssh_loadpub_line(UtfLine.c_str(), &AlgorithmName, BinarySink_UPCAST(PubBlobBuf), &CommentPtr, &ErrorStr))
+  if (!openssh_loadpub(Source, &AlgorithmName, BinarySink_UPCAST(PubBlobBuf), &CommentPtr, &ErrorStr))
   {
     throw Exception(UnicodeString(ErrorStr));
   }

+ 3 - 0
source/putty/defs.h

@@ -29,6 +29,8 @@ uintmax_t strtoumax(const char *nptr, char **endptr, int base);
 #ifndef WINSCP
 // Not needed by the code WinSCP uses
 #include <inttypes.h>
+#else
+#define PRIu32 "u"
 #endif
 /* Because we still support older MSVC libraries which don't recognise the
  * standard C "z" modifier for size_t-sized integers, we must use an
@@ -67,6 +69,7 @@ typedef struct FontSpec FontSpec;
 typedef struct bufchain_tag bufchain;
 
 typedef struct strbuf strbuf;
+typedef struct LoadedFile LoadedFile;
 
 typedef struct RSAKey RSAKey;
 

+ 49 - 0
source/putty/marshal.c

@@ -218,6 +218,55 @@ const char *BinarySource_get_asciz(BinarySource *src)
     return start;
 }
 
+static ptrlen BinarySource_get_chars_internal(
+    BinarySource *src, const char *set, bool include)
+{
+    const char *start = here;
+    while (avail(1)) {
+        bool present = NULL != strchr(set, *(const char *)consume(0));
+        if (present != include)
+            break;
+        (void) consume(1);
+    }
+    { // WINSCP
+    const char *end = here;
+    return make_ptrlen(start, end - start);
+    } // WINSCP
+}
+
+ptrlen BinarySource_get_chars(BinarySource *src, const char *include_set)
+{
+    return BinarySource_get_chars_internal(src, include_set, true);
+}
+
+ptrlen BinarySource_get_nonchars(BinarySource *src, const char *exclude_set)
+{
+    return BinarySource_get_chars_internal(src, exclude_set, false);
+}
+
+ptrlen BinarySource_get_chomped_line(BinarySource *src)
+{
+    const char *start, *end;
+
+    if (src->err)
+        return make_ptrlen(here, 0);
+
+    start = here;
+    end = memchr(start, '\n', src->len - src->pos);
+    if (end)
+        advance(end + 1 - start);
+    else
+        advance(src->len - src->pos);
+    end = here;
+
+    if (end > start && end[-1] == '\n')
+        end--;
+    if (end > start && end[-1] == '\r')
+        end--;
+
+    return make_ptrlen(start, end - start);
+}
+
 ptrlen BinarySource_get_pstring(BinarySource *src)
 {
     const unsigned char *ucp;

+ 9 - 0
source/putty/marshal.h

@@ -276,6 +276,12 @@ static inline void BinarySource_INIT__(BinarySource *src, ptrlen data)
     BinarySource_get_string(BinarySource_UPCAST(src))
 #define get_asciz(src) \
     BinarySource_get_asciz(BinarySource_UPCAST(src))
+#define get_chars(src, include) \
+    BinarySource_get_chars(BinarySource_UPCAST(src), include)
+#define get_nonchars(src, exclude) \
+    BinarySource_get_nonchars(BinarySource_UPCAST(src), exclude)
+#define get_chomped_line(src) \
+    BinarySource_get_chomped_line(BinarySource_UPCAST(src))
 #define get_pstring(src) \
     BinarySource_get_pstring(BinarySource_UPCAST(src))
 #define get_mp_ssh1(src) \
@@ -305,6 +311,9 @@ unsigned long BinarySource_get_uint32(BinarySource *);
 uint64_t BinarySource_get_uint64(BinarySource *);
 ptrlen BinarySource_get_string(BinarySource *);
 const char *BinarySource_get_asciz(BinarySource *);
+ptrlen BinarySource_get_chars(BinarySource *, const char *include_set);
+ptrlen BinarySource_get_nonchars(BinarySource *, const char *exclude_set);
+ptrlen BinarySource_get_chomped_line(BinarySource *);
 ptrlen BinarySource_get_pstring(BinarySource *);
 mp_int *BinarySource_get_mp_ssh1(BinarySource *src);
 mp_int *BinarySource_get_mp_ssh2(BinarySource *src);

+ 28 - 0
source/putty/misc.h

@@ -189,6 +189,10 @@ int string_length_for_printf(size_t);
  * string. */
 #define PTRLEN_LITERAL(stringlit) \
     TYPECHECK("" stringlit "", make_ptrlen(stringlit, sizeof(stringlit)-1))
+/* Make a ptrlen out of a compile-time string literal in a way that
+ * allows you to declare the ptrlen itself as a compile-time initialiser. */
+#define PTRLEN_DECL_LITERAL(stringlit) \
+    { TYPECHECK("" stringlit "", stringlit), sizeof(stringlit)-1 }
 /* Make a ptrlen out of a constant byte array. */
 #define PTRLEN_FROM_CONST_BYTES(a) make_ptrlen(a, sizeof(a))
 
@@ -415,4 +419,28 @@ static inline char *stripctrl_string(StripCtrlChars *sccpub, const char *str)
 #define pinitassert(P) const int __assert_dummy = 1/((int)(P))
 #endif
 
+/*
+ * A mechanism for loading a file from disk into a memory buffer where
+ * it can be picked apart as a BinarySource.
+ */
+struct LoadedFile {
+    char *data;
+    size_t len, max_size;
+    BinarySource_IMPLEMENTATION;
+};
+typedef enum {
+    LF_OK,      /* file loaded successfully */
+    LF_TOO_BIG, /* file didn't fit in buffer */
+    LF_ERROR,   /* error from stdio layer */
+} LoadFileStatus;
+
+/* Set the memory block of 'size' bytes at 'out' to the bitwise XOR of
+ * the two blocks of the same size at 'in1' and 'in2'.
+ *
+ * 'out' may point to exactly the same address as one of the inputs,
+ * but if the input and output blocks overlap in any other way, the
+ * result of this function is not guaranteed. No memmove-style effort
+ * is made to handle difficult overlap cases. */
+void memxor(uint8_t *out, const uint8_t *in1, const uint8_t *in2, size_t size);
+
 #endif

+ 124 - 33
source/putty/ssh.h

@@ -744,32 +744,43 @@ struct ssh_hash {
 
 struct ssh_hashalg {
     ssh_hash *(*new)(const ssh_hashalg *alg);
-    ssh_hash *(*copy)(ssh_hash *);
-    void (*final)(ssh_hash *, unsigned char *); /* ALSO FREES THE ssh_hash! */
+    void (*reset)(ssh_hash *);
+    void (*copyfrom)(ssh_hash *dest, ssh_hash *src);
+    void (*digest)(ssh_hash *, unsigned char *);
     void (*free)(ssh_hash *);
-    int hlen; /* output length in bytes */
-    int blocklen; /* length of the hash's input block, or 0 for N/A */
+    size_t hlen; /* output length in bytes */
+    size_t blocklen; /* length of the hash's input block, or 0 for N/A */
     const char *text_basename;     /* the semantic name of the hash */
     const char *annotation;   /* extra info, e.g. which of multiple impls */
     const char *text_name;    /* both combined, e.g. "SHA-n (unaccelerated)" */
+    const void *extra;        /* private to the hash implementation */
 };
 
 static inline ssh_hash *ssh_hash_new(const ssh_hashalg *alg)
-{ return alg->new(alg); }
-static inline ssh_hash *ssh_hash_copy(ssh_hash *h)
-{ return h->vt->copy(h); }
-static inline void ssh_hash_final(ssh_hash *h, unsigned char *out)
-{ h->vt->final(h, out); }
+{ ssh_hash *h = alg->new(alg); if (h) h->vt->reset(h); return h; }
+static inline ssh_hash *ssh_hash_copy(ssh_hash *orig)
+{ ssh_hash *h = orig->vt->new(orig->vt); h->vt->copyfrom(h, orig); return h; }
+static inline void ssh_hash_digest(ssh_hash *h, unsigned char *out)
+{ h->vt->digest(h, out); }
 static inline void ssh_hash_free(ssh_hash *h)
 { h->vt->free(h); }
 static inline const ssh_hashalg *ssh_hash_alg(ssh_hash *h)
 { return h->vt; }
 
+/* The reset and copyfrom vtable methods return void. But for call-site
+ * convenience, these wrappers return their input pointer. */
+static inline ssh_hash *ssh_hash_reset(ssh_hash *h)
+{ h->vt->reset(h); return h; }
+
+/* ssh_hash_final emits the digest _and_ frees the ssh_hash */
+static inline void ssh_hash_final(ssh_hash *h, unsigned char *out)
+{ h->vt->digest(h, out); h->vt->free(h); }
+
 /* Handy macros for defining all those text-name fields at once */
 #define HASHALG_NAMES_BARE(base) \
-    base, NULL, base
-#define HASHALG_NAMES_ANNOTATED(base, annotation) \
-    base, annotation, base " (" annotation ")"
+    /*.text_basename =*/ base, /*.annotation =*/ NULL, /*.text_name =*/ base
+#define HASHALG_NAMES_ANNOTATED(base, ann) \
+    /*.text_basename =*/ base, /*.annotation =*/ ann, /*.text_name =*/ base " (" ann ")"
 
 #ifndef WINSCP_VS
 
@@ -914,8 +925,20 @@ struct ssh2_userkey {
     char *comment;                     /* the key comment */
 };
 
+/* Argon2 password hashing function */
+typedef enum { Argon2d = 0, Argon2i = 1, Argon2id = 2 } Argon2Flavour;
+void argon2(Argon2Flavour, uint32_t mem, uint32_t passes,
+            uint32_t parallel, uint32_t taglen,
+            ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out);
+void argon2_choose_passes(
+    Argon2Flavour, uint32_t mem, uint32_t milliseconds, uint32_t *passes,
+    uint32_t parallel, uint32_t taglen, ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+    strbuf *out);
+/* The H' hash defined in Argon2, exposed just for testcrypt */
+strbuf *argon2_long_hash(unsigned length, ptrlen data);
+
 /* The maximum length of any hash algorithm. (bytes) */
-#define MAX_HASH_LEN (64)              /* longest is SHA-512 */
+#define MAX_HASH_LEN (114) /* longest is SHAKE256 with 114-byte output */
 
 extern const ssh_cipheralg ssh_3des_ssh1;
 extern const ssh_cipheralg ssh_blowfish_ssh1;
@@ -960,7 +983,12 @@ extern const ssh_hashalg ssh_sha256;
 extern const ssh_hashalg ssh_sha256_hw;
 extern const ssh_hashalg ssh_sha256_sw;
 extern const ssh_hashalg ssh_sha384;
+extern const ssh_hashalg ssh_sha384_hw;
+extern const ssh_hashalg ssh_sha384_sw;
 extern const ssh_hashalg ssh_sha512;
+extern const ssh_hashalg ssh_sha512_hw;
+extern const ssh_hashalg ssh_sha512_sw;
+extern const ssh_hashalg ssh_blake2b;
 extern const ssh_kexes ssh_diffiehellman_group1;
 extern const ssh_kexes ssh_diffiehellman_group14;
 extern const ssh_kexes ssh_diffiehellman_gex;
@@ -986,6 +1014,10 @@ extern const ssh2_macalg ssh_hmac_sha256;
 extern const ssh2_macalg ssh2_poly1305;
 extern const ssh_compression_alg ssh_zlib;
 
+/* Special constructor: BLAKE2b can be instantiated with any hash
+ * length up to 128 bytes */
+ssh_hash *blake2b_new_general(unsigned hashlen);
+
 /*
  * On some systems, you have to detect hardware crypto acceleration by
  * asking the local OS API rather than OS-agnostically asking the CPU
@@ -1159,13 +1191,6 @@ mp_int *dh_create_e(dh_ctx *, int nbits);
 const char *dh_validate_f(dh_ctx *, mp_int *f);
 mp_int *dh_find_K(dh_ctx *, mp_int *f);
 
-bool rsa_ssh1_encrypted(const Filename *filename, char **comment);
-int rsa_ssh1_loadpub(const Filename *filename, BinarySink *bs,
-                     char **commentptr, const char **errorstr);
-int rsa_ssh1_loadkey(const Filename *filename, RSAKey *key,
-                     const char *passphrase, const char **errorstr);
-bool rsa_ssh1_savekey(const Filename *filename, RSAKey *key, char *passphrase);
-
 static inline bool is_base64_char(char c)
 {
     return ((c >= '0' && c <= '9') ||
@@ -1184,14 +1209,68 @@ extern void base64_encode(FILE *fp, const unsigned char *data, int datalen,
 extern ssh2_userkey ssh2_wrong_passphrase;
 #define SSH2_WRONG_PASSPHRASE (&ssh2_wrong_passphrase)
 
-bool ssh2_userkey_encrypted(const Filename *filename, char **comment);
-ssh2_userkey *ssh2_load_userkey(
-    const Filename *filename, const char *passphrase, const char **errorstr);
-bool ssh2_userkey_loadpub(
-    const Filename *filename, char **algorithm, BinarySink *bs,
-    char **commentptr, const char **errorstr);
-bool ssh2_save_userkey(
-    const Filename *filename, ssh2_userkey *key, char *passphrase);
+bool ppk_encrypted_s(BinarySource *src, char **comment);
+bool ppk_encrypted_f(const Filename *filename, char **comment);
+#define ssh2_userkey_encrypted ppk_encrypted_f
+bool rsa1_encrypted_s(BinarySource *src, char **comment);
+bool rsa1_encrypted_f(const Filename *filename, char **comment);
+#define rsa_ssh1_encrypted rsa1_encrypted_f
+
+ssh2_userkey *ppk_load_s(BinarySource *src, const char *passphrase,
+                         const char **errorstr);
+ssh2_userkey *ppk_load_f(const Filename *filename, const char *passphrase,
+                         const char **errorstr);
+#define ssh2_load_userkey ppk_load_f
+int rsa1_load_s(BinarySource *src, RSAKey *key,
+                const char *passphrase, const char **errorstr);
+int rsa1_load_f(const Filename *filename, RSAKey *key,
+                const char *passphrase, const char **errorstr);
+#define rsa_ssh1_loadkey rsa1_load_f
+
+typedef struct ppk_save_parameters {
+    unsigned fmt_version;              /* currently 2 or 3 */
+
+    /*
+     * Parameters for fmt_version == 3
+     */
+    Argon2Flavour argon2_flavour;
+    uint32_t argon2_mem;               /* in Kbyte */
+    bool argon2_passes_auto;
+    union {
+        uint32_t argon2_passes;        /* if auto == false */
+        uint32_t argon2_milliseconds;  /* if auto == true */
+    };
+    uint32_t argon2_parallelism;
+
+    /* The ability to choose a specific salt is only intended for the
+     * use of the automated test of PuTTYgen. It's a (mild) security
+     * risk to do it with any passphrase you actually care about,
+     * because it invalidates the entire point of having a salt in the
+     * first place. */
+    const uint8_t *salt;
+    size_t saltlen;
+} ppk_save_parameters;
+extern const ppk_save_parameters ppk_save_default_parameters;
+
+strbuf *ppk_save_sb(ssh2_userkey *key, const char *passphrase,
+                    const ppk_save_parameters *params);
+bool ppk_save_f(const Filename *filename, ssh2_userkey *key,
+                const char *passphrase, const ppk_save_parameters *params);
+strbuf *rsa1_save_sb(RSAKey *key, const char *passphrase);
+bool rsa1_save_f(const Filename *filename, RSAKey *key,
+                 const char *passphrase);
+
+bool ppk_loadpub_s(BinarySource *src, char **algorithm, BinarySink *bs,
+                   char **commentptr, const char **errorstr);
+bool ppk_loadpub_f(const Filename *filename, char **algorithm, BinarySink *bs,
+                   char **commentptr, const char **errorstr);
+#define ssh2_userkey_loadpub ppk_loadpub_f
+int rsa1_loadpub_s(BinarySource *src, BinarySink *bs,
+                   char **commentptr, const char **errorstr);
+int rsa1_loadpub_f(const Filename *filename, BinarySink *bs,
+                   char **commentptr, const char **errorstr);
+#define rsa_ssh1_loadpub rsa1_loadpub_f
+
 const ssh_keyalg *find_pubkey_alg(const char *name);
 const ssh_keyalg *find_pubkey_alg_len(ptrlen name);
 
@@ -1238,6 +1317,15 @@ enum {
     SSH_KEYTYPE_SSH2_PUBLIC_RFC4716,
     SSH_KEYTYPE_SSH2_PUBLIC_OPENSSH
 };
+
+typedef enum {
+    SSH_FPTYPE_MD5,
+    SSH_FPTYPE_SHA256,
+} FingerprintType;
+
+#define SSH_FPTYPE_DEFAULT SSH_FPTYPE_SHA256
+#define SSH_N_FPTYPES (SSH_FPTYPE_SHA256 + 1)
+
 char *ssh1_pubkey_str(RSAKey *ssh1key);
 void ssh1_write_pubkey(FILE *fp, RSAKey *ssh1key);
 char *ssh2_pubkey_openssh_str(ssh2_userkey *key);
@@ -1247,10 +1335,11 @@ void ssh2_write_pubkey(FILE *fp, const char *comment,
 char *ssh2_fingerprint_blob(ptrlen);
 char *ssh2_fingerprint(ssh_key *key);
 int key_type(const Filename *filename);
+int key_type_s(BinarySource *src);
 const char *key_type_to_str(int type);
-bool openssh_loadpub_line(char * line, char **algorithm, // WINSCP
-                         BinarySink *bs,
-                         char **commentptr, const char **errorstr);
+bool openssh_loadpub(BinarySource *src, char **algorithm, // WINSCP
+                     BinarySink *bs,
+                     char **commentptr, const char **errorstr);
 
 bool import_possible(int type);
 int import_target_type(int type);
@@ -1270,8 +1359,10 @@ void des3_decrypt_pubkey_ossh(const void *key, const void *iv,
                               void *blk, int len);
 void des3_encrypt_pubkey_ossh(const void *key, const void *iv,
                               void *blk, int len);
-void aes256_encrypt_pubkey(const void *key, void *blk, int len);
-void aes256_decrypt_pubkey(const void *key, void *blk, int len);
+void aes256_encrypt_pubkey(const void *key, const void *iv,
+                           void *blk, int len);
+void aes256_decrypt_pubkey(const void *key, const void *iv,
+                           void *blk, int len);
 
 void des_encrypt_xdmauth(const void *key, void *blk, int len);
 void des_decrypt_xdmauth(const void *key, void *blk, int len);

+ 593 - 0
source/putty/sshargon2.c

@@ -0,0 +1,593 @@
+/*
+ * Implementation of the Argon2 password hash function.
+ *
+ * My sources for the algorithm description and test vectors (the latter in
+ * test/cryptsuite.py) were the reference implementation on Github, and also
+ * the Internet-Draft description:
+ *
+ *   https://github.com/P-H-C/phc-winner-argon2
+ *   https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-argon2-13
+ */
+
+#include <assert.h>
+
+#include "putty.h"
+#include "ssh.h"
+#include "marshal.h"
+
+/* ----------------------------------------------------------------------
+ * Argon2 uses data marshalling rules similar to SSH but with 32-bit integers
+ * stored little-endian. Start with some local BinarySink routines for storing
+ * a uint32 and a string in that fashion.
+ */
+
+static void BinarySink_put_uint32_le(BinarySink *bs, unsigned long val)
+{
+    unsigned char data[4];
+    PUT_32BIT_LSB_FIRST(data, val);
+    bs->write(bs, data, sizeof(data));
+}
+
+static void BinarySink_put_stringpl_le(BinarySink *bs, ptrlen pl)
+{
+    /* Check that the string length fits in a uint32, without doing a
+     * potentially implementation-defined shift of more than 31 bits */
+    assert((pl.len >> 31) < 2);
+
+    BinarySink_put_uint32_le(bs, pl.len);
+    bs->write(bs, pl.ptr, pl.len);
+}
+
+#define put_uint32_le(bs, val) \
+    BinarySink_put_uint32_le(BinarySink_UPCAST(bs), val)
+#define put_stringpl_le(bs, val) \
+    BinarySink_put_stringpl_le(BinarySink_UPCAST(bs), val)
+
+/* ----------------------------------------------------------------------
+ * Argon2 defines a hash-function family that's an extension of BLAKE2b to
+ * generate longer output digests, by repeatedly outputting half of a BLAKE2
+ * hash output and then re-hashing the whole thing until there are 64 or fewer
+ * bytes left to output. The spec calls this H' (a variant of the original
+ * hash it calls H, which is the unmodified BLAKE2b).
+ */
+
+static ssh_hash *hprime_new(unsigned length)
+{
+    ssh_hash *h = blake2b_new_general(length > 64 ? 64 : length);
+    put_uint32_le(h, length);
+    return h;
+}
+
+static void hprime_final(ssh_hash *h, unsigned length, void *vout)
+{
+    uint8_t *out = (uint8_t *)vout;
+
+    while (length > 64) {
+        uint8_t hashbuf[64];
+        ssh_hash_final(h, hashbuf);
+
+        memcpy(out, hashbuf, 32);
+        out += 32;
+        length -= 32;
+
+        h = blake2b_new_general(length > 64 ? 64 : length);
+        put_data(h, hashbuf, 64);
+
+        smemclr(hashbuf, sizeof(hashbuf));
+    }
+
+    ssh_hash_final(h, out);
+}
+
+/* Externally visible entry point for the long hash function. This is only
+ * used by testcrypt, so it would be overkill to set it up like a proper
+ * ssh_hash. */
+strbuf *argon2_long_hash(unsigned length, ptrlen data)
+{
+    ssh_hash *h = hprime_new(length);
+    put_datapl(h, data);
+    { // WINSCP
+    strbuf *out = strbuf_new();
+    hprime_final(h, length, strbuf_append(out, length));
+    return out;
+    } // WINSCP
+}
+
+/* ----------------------------------------------------------------------
+ * Argon2's own mixing function G, which operates on 1Kb blocks of data.
+ *
+ * The definition of G in the spec takes two 1Kb blocks as input and produces
+ * a 1Kb output block. The first thing that happens to the input blocks is
+ * that they get XORed together, and then only the XOR output is used, so you
+ * could perfectly well regard G as a 1Kb->1Kb function.
+ */
+
+static inline uint64_t ror(uint64_t x, unsigned rotation)
+{
+#pragma option push -w-ngu // WINSCP
+    unsigned lshift = 63 & -rotation, rshift = 63 & rotation;
+#pragma option pop // WINSCP
+    return (x << lshift) | (x >> rshift);
+}
+
+static inline uint64_t trunc32(uint64_t x)
+{
+    return x & 0xFFFFFFFF;
+}
+
+/* Internal function similar to the BLAKE2b round, which mixes up four 64-bit
+ * words */
+static inline void GB(uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d)
+{
+    *a += *b + 2 * trunc32(*a) * trunc32(*b);
+    *d = ror(*d ^ *a, 32);
+    *c += *d + 2 * trunc32(*c) * trunc32(*d);
+    *b = ror(*b ^ *c, 24);
+    *a += *b + 2 * trunc32(*a) * trunc32(*b);
+    *d = ror(*d ^ *a, 16);
+    *c += *d + 2 * trunc32(*c) * trunc32(*d);
+    *b = ror(*b ^ *c, 63);
+}
+
+/* Higher-level internal function which mixes up sixteen 64-bit words. This is
+ * applied to different subsets of the 128 words in a kilobyte block, and the
+ * API here is designed to make it easy to apply in the circumstances the spec
+ * requires. In every call, the sixteen words form eight pairs adjacent in
+ * memory, whose addresses are in arithmetic progression. So the 16 input
+ * words are in[0], in[1], in[instep], in[instep+1], ..., in[7*instep],
+ * in[7*instep+1], and the 16 output words similarly. */
+static inline void P(uint64_t *out, unsigned outstep,
+                     uint64_t *in, unsigned instep)
+{
+    unsigned i; // WINSCP
+    for (i = 0; i < 8; i++) {
+        out[i*outstep] = in[i*instep];
+        out[i*outstep+1] = in[i*instep+1];
+    }
+
+    GB(out+0*outstep+0, out+2*outstep+0, out+4*outstep+0, out+6*outstep+0);
+    GB(out+0*outstep+1, out+2*outstep+1, out+4*outstep+1, out+6*outstep+1);
+    GB(out+1*outstep+0, out+3*outstep+0, out+5*outstep+0, out+7*outstep+0);
+    GB(out+1*outstep+1, out+3*outstep+1, out+5*outstep+1, out+7*outstep+1);
+
+    GB(out+0*outstep+0, out+2*outstep+1, out+5*outstep+0, out+7*outstep+1);
+    GB(out+0*outstep+1, out+3*outstep+0, out+5*outstep+1, out+6*outstep+0);
+    GB(out+1*outstep+0, out+3*outstep+1, out+4*outstep+0, out+6*outstep+1);
+    GB(out+1*outstep+1, out+2*outstep+0, out+4*outstep+1, out+7*outstep+0);
+}
+
+/* The full G function, taking input blocks X and Y. The result of G is most
+ * often XORed into an existing output block, so this API is designed with
+ * that in mind: the mixing function's output is always XORed into whatever
+ * 1Kb of data is already at 'out'. */
+static void G_xor(uint8_t *out, const uint8_t *X, const uint8_t *Y)
+{
+    uint64_t R[128], Q[128], Z[128];
+
+    unsigned i; // WINSCP
+    for (i = 0; i < 128; i++)
+        R[i] = GET_64BIT_LSB_FIRST(X + 8*i) ^ GET_64BIT_LSB_FIRST(Y + 8*i);
+
+    for (i = 0; i < 8; i++) // WINSCP
+        P(Q+16*i, 2, R+16*i, 2);
+
+    for (i = 0; i < 8; i++) // WINSCP
+        P(Z+2*i, 16, Q+2*i, 16);
+
+    for (i = 0; i < 128; i++) // WINSCP
+        PUT_64BIT_LSB_FIRST(out + 8*i,
+                            GET_64BIT_LSB_FIRST(out + 8*i) ^ R[i] ^ Z[i]);
+
+    smemclr(R, sizeof(R));
+    smemclr(Q, sizeof(Q));
+    smemclr(Z, sizeof(Z));
+}
+
+/* ----------------------------------------------------------------------
+ * The main Argon2 function.
+ */
+
+static void argon2_internal(uint32_t p, uint32_t T, uint32_t m, uint32_t t,
+                            uint32_t y, ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+                            uint8_t *out)
+{
+    /*
+     * Start by hashing all the input data together: the four string arguments
+     * (password P, salt S, optional secret key K, optional associated data
+     * X), plus all the parameters for the function's memory and time usage.
+     *
+     * The output of this hash is the sole input to the subsequent mixing
+     * step: Argon2 does not preserve any more entropy from the inputs, it
+     * just makes it extra painful to get the final answer.
+     */
+    uint8_t h0[64];
+    {
+        ssh_hash *h = blake2b_new_general(64);
+        put_uint32_le(h, p);
+        put_uint32_le(h, T);
+        put_uint32_le(h, m);
+        put_uint32_le(h, t);
+        put_uint32_le(h, 0x13);        /* hash function version number */
+        put_uint32_le(h, y);
+        put_stringpl_le(h, P);
+        put_stringpl_le(h, S);
+        put_stringpl_le(h, K);
+        put_stringpl_le(h, X);
+        ssh_hash_final(h, h0);
+    }
+
+    { // WINSCP
+    struct blk { uint8_t data[1024]; };
+
+    /*
+     * Array of 1Kb blocks. The total size is (approximately) m, the
+     * caller-specified parameter for how much memory to use; the blocks are
+     * regarded as a rectangular array of p rows ('lanes') by q columns, where
+     * p is the 'parallelism' input parameter (the lanes can be processed
+     * concurrently up to a point) and q is whatever makes the product pq come
+     * to m.
+     *
+     * Additionally, each row is divided into four equal 'segments', which are
+     * important to the way the algorithm decides which blocks to use as input
+     * to each step of the function.
+     *
+     * The term 'slice' refers to a whole set of vertically aligned segments,
+     * i.e. slice 0 is the whole left quarter of the array, and slice 3 the
+     * whole right quarter.
+     */
+    size_t SL = m / (4*p); /* segment length: # of 1Kb blocks in a segment */
+    size_t q = 4 * SL;     /* width of the array: 4 segments times SL */
+    size_t mprime = q * p; /* total size of the array, approximately m */
+
+    /* Allocate the memory. */
+    struct blk *B = snewn(mprime, struct blk);
+    memset(B, 0, mprime * sizeof(struct blk));
+
+    /*
+     * Initial setup: fill the first two full columns of the array with data
+     * expanded from the starting hash h0. Each block is the result of using
+     * the long-output hash function H' to hash h0 itself plus the block's
+     * coordinates in the array.
+     */
+    { // WINSCP
+    size_t i; // WINSCP
+    for (i = 0; i < p; i++) {
+        ssh_hash *h = hprime_new(1024);
+        put_data(h, h0, 64);
+        put_uint32_le(h, 0);
+        put_uint32_le(h, i);
+        hprime_final(h, 1024, B[i].data);
+    }
+    for (i = 0; i < p; i++) { // WINSCP
+        ssh_hash *h = hprime_new(1024);
+        put_data(h, h0, 64);
+        put_uint32_le(h, 1);
+        put_uint32_le(h, i);
+        hprime_final(h, 1024, B[i+p].data);
+    }
+
+    /*
+     * Declarations for the main loop.
+     *
+     * The basic structure of the main loop is going to involve processing the
+     * array one whole slice (vertically divided quarter) at a time. Usually
+     * we'll write a new value into every single block in the slice, except
+     * that in the initial slice on the first pass, we've already written
+     * values into the first two columns during the initial setup above. So
+     * 'jstart' indicates the starting index in each segment we process; it
+     * starts off as 2 so that we don't overwrite the inital setup, and then
+     * after the first slice is done, we set it to 0, and it stays there.
+     *
+     * d_mode indicates whether we're being data-dependent (true) or
+     * data-independent (false). In the hybrid Argon2id mode, we start off
+     * independent, and then once we've mixed things up enough, switch over to
+     * dependent mode to force long serial chains of computation.
+     */
+    { // WINSCP
+    size_t jstart = 2;
+    bool d_mode = (y == 0);
+    struct blk out2i, tmp2i, in2i;
+
+    /* Outermost loop: t whole passes from left to right over the array */
+    size_t pass; // WINSCP
+    for (pass = 0; pass < t; pass++) {
+
+        /* Within that, we process the array in its four main slices */
+        unsigned slice; // WINSCP
+        for (slice = 0; slice < 4; slice++) {
+
+            /* In Argon2id mode, if we're half way through the first pass,
+             * this is the moment to switch d_mode from false to true */
+            if (pass == 0 && slice == 2 && y == 2)
+                d_mode = true;
+
+            /* Loop over every segment in the slice (i.e. every row). So i is
+             * the y-coordinate of each block we process. */
+            { // WINSCP
+            size_t i; // WINSCP
+            for (i = 0; i < p; i++) {
+
+                /* And within that segment, process the blocks from left to
+                 * right, starting at 'jstart' (usually 0, but 2 in the first
+                 * slice). */
+                size_t jpre; // WINSCP
+                for (jpre = jstart; jpre < SL; jpre++) {
+
+                    /* j is the x-coordinate of each block we process, made up
+                     * of the slice number and the index 'jpre' within the
+                     * segment. */
+                    size_t j = slice * SL + jpre;
+
+                    /* jm1 is j-1 (mod q) */
+                    uint32_t jm1 = (j == 0 ? q-1 : j-1);
+
+                    /*
+                     * Construct two 32-bit pseudorandom integers J1 and J2.
+                     * This is the part of the algorithm that varies between
+                     * the data-dependent and independent modes.
+                     */
+                    uint32_t J1, J2;
+                    if (d_mode) {
+                        /*
+                         * Data-dependent: grab the first 64 bits of the block
+                         * to the left of this one.
+                         */
+                        J1 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data);
+                        J2 = GET_32BIT_LSB_FIRST(B[i + p * jm1].data + 4);
+                    } else {
+                        /*
+                         * Data-independent: generate pseudorandom data by
+                         * hashing a sequence of preimage blocks that include
+                         * all our input parameters, plus the coordinates of
+                         * this point in the algorithm (array position and
+                         * pass number) to make all the hash outputs distinct.
+                         *
+                         * The hash we use is G itself, applied twice. So we
+                         * generate 1Kb of data at a time, which is enough for
+                         * 128 (J1,J2) pairs. Hence we only need to do the
+                         * hashing if our index within the segment is a
+                         * multiple of 128, or if we're at the very start of
+                         * the algorithm (in which case we started at 2 rather
+                         * than 0). After that we can just keep picking data
+                         * out of our most recent hash output.
+                         */
+                        if (jpre == jstart || jpre % 128 == 0) {
+                            /*
+                             * Hash preimage is mostly zeroes, with a
+                             * collection of assorted integer values we had
+                             * anyway.
+                             */
+                            memset(in2i.data, 0, sizeof(in2i.data));
+                            PUT_64BIT_LSB_FIRST(in2i.data +  0, pass);
+                            PUT_64BIT_LSB_FIRST(in2i.data +  8, i);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 16, slice);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 24, mprime);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 32, t);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 40, y);
+                            PUT_64BIT_LSB_FIRST(in2i.data + 48, jpre / 128 + 1);
+
+                            /*
+                             * Now apply G twice to generate the hash output
+                             * in out2i.
+                             */
+                            memset(tmp2i.data, 0, sizeof(tmp2i.data));
+                            G_xor(tmp2i.data, tmp2i.data, in2i.data);
+                            memset(out2i.data, 0, sizeof(out2i.data));
+                            G_xor(out2i.data, out2i.data, tmp2i.data);
+                        }
+
+                        /*
+                         * Extract J1 and J2 from the most recent hash output
+                         * (whether we've just computed it or not).
+                         */
+                        J1 = GET_32BIT_LSB_FIRST(
+                            out2i.data + 8 * (jpre % 128));
+                        J2 = GET_32BIT_LSB_FIRST(
+                            out2i.data + 8 * (jpre % 128) + 4);
+                    }
+
+                    /*
+                     * Now convert J1 and J2 into the index of an existing
+                     * block of the array to use as input to this step. This
+                     * is fairly fiddly.
+                     *
+                     * The easy part: the y-coordinate of the input block is
+                     * obtained by reducing J2 mod p, except that at the very
+                     * start of the algorithm (processing the first slice on
+                     * the first pass) we simply use the same y-coordinate as
+                     * our output block.
+                     *
+                     * Note that it's safe to use the ordinary % operator
+                     * here, without any concern for timing side channels: in
+                     * data-independent mode J2 is not correlated to any
+                     * secrets, and in data-dependent mode we're going to be
+                     * giving away side-channel data _anyway_ when we use it
+                     * as an array index (and by assumption we don't care,
+                     * because it's already massively randomised from the real
+                     * inputs).
+                     */
+                    { // WINSCP
+                    uint32_t index_l = (pass == 0 && slice == 0) ? i : J2 % p;
+
+                    /*
+                     * The hard part: which block in this array row do we use?
+                     *
+                     * First, we decide what the possible candidates are. This
+                     * requires some case analysis, and depends on whether the
+                     * array row is the same one we're writing into or not.
+                     *
+                     * If it's not the same row: we can't use any block from
+                     * the current slice (because the segments within a slice
+                     * have to be processable in parallel, so in a concurrent
+                     * implementation those blocks are potentially in the
+                     * process of being overwritten by other threads). But the
+                     * other three slices are fair game, except that in the
+                     * first pass, slices to the right of us won't have had
+                     * any values written into them yet at all.
+                     *
+                     * If it is the same row, we _are_ allowed to use blocks
+                     * from the current slice, but only the ones before our
+                     * current position.
+                     *
+                     * In both cases, we also exclude the individual _column_
+                     * just to the left of the current one. (The block
+                     * immediately to our left is going to be the _other_
+                     * input to G, but the spec also says that we avoid that
+                     * column even in a different row.)
+                     *
+                     * All of this means that we end up choosing from a
+                     * cyclically contiguous interval of blocks within this
+                     * lane, but the start and end points require some thought
+                     * to get them right.
+                     */
+
+                    /* Start position is the beginning of the _next_ slice
+                     * (containing data from the previous pass), unless we're
+                     * on pass 0, where the start position has to be 0. */
+                    uint32_t Wstart = (pass == 0 ? 0 : (slice + 1) % 4 * SL);
+
+                    /* End position splits up by cases. */
+                    uint32_t Wend;
+                    if (index_l == i) {
+                        /* Same lane as output: we can use anything up to (but
+                         * not including) the block immediately left of us. */
+                        Wend = jm1;
+                    } else {
+                        /* Different lane from output: we can use anything up
+                         * to the previous slice boundary, or one less than
+                         * that if we're at the very left edge of our slice
+                         * right now. */
+                        Wend = SL * slice;
+                        if (jpre == 0)
+                            Wend = (Wend + q-1) % q;
+                    }
+
+                    /* Total number of blocks available to choose from */
+                    { // WINSCP
+                    uint32_t Wsize = (Wend + q - Wstart) % q;
+
+                    /* Fiddly computation from the spec that chooses from the
+                     * available blocks, in a deliberately non-uniform
+                     * fashion, using J1 as pseudorandom input data. Output is
+                     * zz which is the index within our contiguous interval. */
+                    uint32_t x = ((uint64_t)J1 * J1) >> 32;
+                    uint32_t y = ((uint64_t)Wsize * x) >> 32;
+                    uint32_t zz = Wsize - 1 - y;
+
+                    /* And index_z is the actual x coordinate of the block we
+                     * want. */
+                    uint32_t index_z = (Wstart + zz) % q;
+
+                    /* Phew! Combine that block with the one immediately to
+                     * our left, and XOR over the top of whatever is already
+                     * in our current output block. */
+                    G_xor(B[i + p * j].data, B[i + p * jm1].data,
+                          B[index_l + p * index_z].data);
+                    } // WINSCP
+                    } // WINSCP
+                }
+            }
+
+            /* We've finished processing a slice. Reset jstart to 0. It will
+             * onily _not_ have been 0 if this was pass 0 slice 0, in which
+             * case it still had its initial value of 2 to avoid the starting
+             * data. */
+            jstart = 0;
+            } // WINSCP
+        }
+    }
+
+    /*
+     * The main output is all done. Final output works by taking the XOR of
+     * all the blocks in the rightmost column of the array, and then using
+     * that as input to our long hash H'. The output of _that_ is what we
+     * deliver to the caller.
+     */
+
+    { // WINSCP
+    struct blk C = B[p * (q-1)];
+    size_t i; // WINSCP
+    for (i = 1; i < p; i++)
+        memxor(C.data, C.data, B[i + p * (q-1)].data, 1024);
+
+    {
+        ssh_hash *h = hprime_new(T);
+        put_data(h, C.data, 1024);
+        hprime_final(h, T, out);
+    }
+
+    /*
+     * Clean up.
+     */
+    smemclr(out2i.data, sizeof(out2i.data));
+    smemclr(tmp2i.data, sizeof(tmp2i.data));
+    smemclr(in2i.data, sizeof(in2i.data));
+    smemclr(C.data, sizeof(C.data));
+    smemclr(B, mprime * sizeof(struct blk));
+    sfree(B);
+    } // WINSCP
+    } // WINSCP
+    } // WINSCP
+    } // WINSCP
+}
+
+/*
+ * Wrapper function that appends to a strbuf (which sshpubk.c will want).
+ */
+void argon2(Argon2Flavour flavour, uint32_t mem, uint32_t passes,
+            uint32_t parallel, uint32_t taglen,
+            ptrlen P, ptrlen S, ptrlen K, ptrlen X, strbuf *out)
+{
+    argon2_internal(parallel, taglen, mem, passes, flavour,
+                    P, S, K, X, strbuf_append(out, taglen));
+}
+
+/*
+ * Wrapper function which dynamically chooses the number of passes to run in
+ * order to hit an approximate total amount of CPU time. Writes the result
+ * into 'passes'.
+ */
+void argon2_choose_passes(
+    Argon2Flavour flavour, uint32_t mem,
+    uint32_t milliseconds, uint32_t *passes,
+    uint32_t parallel, uint32_t taglen,
+    ptrlen P, ptrlen S, ptrlen K, ptrlen X,
+    strbuf *out)
+{
+    unsigned long desired_time = (TICKSPERSEC * milliseconds) / 1000;
+
+    /*
+     * We only need the time taken to be approximately right, so we
+     * scale up the number of passes geometrically, which avoids
+     * taking O(t^2) time to find a pass count taking time t.
+     *
+     * Using the Fibonacci numbers is slightly nicer than the obvious
+     * approach of powers of 2, because it's still very easy to
+     * compute, and grows less fast (powers of 1.6 instead of 2), so
+     * you get just a touch more precision.
+     */
+    uint32_t a = 1, b = 1;
+
+    while (true) {
+        unsigned long start_time = GETTICKCOUNT();
+        argon2(flavour, mem, b, parallel, taglen, P, S, K, X, out);
+        { // WINSCP
+        unsigned long ticks = GETTICKCOUNT() - start_time;
+
+        /* But just in case computers get _too_ fast, we have to cap
+         * the growth before it gets past the uint32_t upper bound! So
+         * if computing a+b would overflow, stop here. */
+
+        if (ticks >= desired_time || a > (uint32_t)~b) {
+            *passes = b;
+            return;
+        } else {
+            strbuf_clear(out);
+
+            /* Next Fibonacci number: replace (a, b) with (b, a+b) */
+            b += a;
+            a = b - a;
+        }
+        } // WINSCP
+    }
+}

+ 5 - 7
source/putty/sshauxcrypt.c

@@ -11,14 +11,12 @@
 
 #include "ssh.h"
 
-static ssh_cipher *aes256_pubkey_cipher(const void *key)
+static ssh_cipher *aes256_pubkey_cipher(const void *key, const void *iv)
 {
     /*
      * PuTTY's own .PPK format for SSH-2 private key files is
      * encrypted with 256-bit AES in CBC mode.
      */
-    char iv[16];
-    memset(iv, 0, 16);
     { // WINSCP
     ssh_cipher *cipher = ssh_cipher_new(&ssh_aes256_cbc);
     ssh_cipher_setkey(cipher, key);
@@ -27,16 +25,16 @@ static ssh_cipher *aes256_pubkey_cipher(const void *key)
     } // WINSCP
 }
 
-void aes256_encrypt_pubkey(const void *key, void *blk, int len)
+void aes256_encrypt_pubkey(const void *key, const void *iv, void *blk, int len)
 {
-    ssh_cipher *c = aes256_pubkey_cipher(key);
+    ssh_cipher *c = aes256_pubkey_cipher(key, iv);
     ssh_cipher_encrypt(c, blk, len);
     ssh_cipher_free(c);
 }
 
-void aes256_decrypt_pubkey(const void *key, void *blk, int len)
+void aes256_decrypt_pubkey(const void *key, const void *iv, void *blk, int len)
 {
-    ssh_cipher *c = aes256_pubkey_cipher(key);
+    ssh_cipher *c = aes256_pubkey_cipher(key, iv);
     ssh_cipher_decrypt(c, blk, len);
     ssh_cipher_free(c);
 }

+ 242 - 0
source/putty/sshblake2.c

@@ -0,0 +1,242 @@
+/*
+ * BLAKE2 (RFC 7693) implementation for PuTTY.
+ *
+ * The BLAKE2 hash family includes BLAKE2s, in which the hash state is
+ * operated on as a collection of 32-bit integers, and BLAKE2b, based
+ * on 64-bit integers. At present this code implements BLAKE2b only.
+ */
+
+#include <assert.h>
+#include "ssh.h"
+
+static inline uint64_t ror(uint64_t x, unsigned rotation)
+{
+#pragma option push -w-ngu // WINSCP
+    unsigned lshift = 63 & -rotation, rshift = 63 & rotation;
+#pragma option pop // WINSCP
+    return (x << lshift) | (x >> rshift);
+}
+
+/* RFC 7963 section 2.1 */
+enum { R1 = 32, R2 = 24, R3 = 16, R4 = 63 };
+
+/* RFC 7693 section 2.6 */
+static const uint64_t iv[] = {
+    // WINSCP (ULL)
+    0x6a09e667f3bcc908ULL,                /* floor(2^64 * frac(sqrt(2)))  */
+    0xbb67ae8584caa73bULL,                /* floor(2^64 * frac(sqrt(3)))  */
+    0x3c6ef372fe94f82bULL,                /* floor(2^64 * frac(sqrt(5)))  */
+    0xa54ff53a5f1d36f1ULL,                /* floor(2^64 * frac(sqrt(7)))  */
+    0x510e527fade682d1ULL,                /* floor(2^64 * frac(sqrt(11))) */
+    0x9b05688c2b3e6c1fULL,                /* floor(2^64 * frac(sqrt(13))) */
+    0x1f83d9abfb41bd6bULL,                /* floor(2^64 * frac(sqrt(17))) */
+    0x5be0cd19137e2179ULL,                /* floor(2^64 * frac(sqrt(19))) */
+};
+
+/* RFC 7693 section 2.7 */
+static const unsigned char sigma[][16] = {
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
+    {14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3},
+    {11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4},
+    { 7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8},
+    { 9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13},
+    { 2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9},
+    {12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11},
+    {13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10},
+    { 6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5},
+    {10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0},
+    /* This array recycles if you have more than 10 rounds. BLAKE2b
+     * has 12, so we repeat the first two rows again. */
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
+    {14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3},
+};
+
+static inline void g_half(uint64_t v[16], unsigned a, unsigned b, unsigned c,
+                          unsigned d, uint64_t x, unsigned r1, unsigned r2)
+{
+    v[a] += v[b] + x;
+    v[d] ^= v[a];
+    v[d] = ror(v[d], r1);
+    v[c] += v[d];
+    v[b] ^= v[c];
+    v[b] = ror(v[b], r2);
+}
+
+static inline void g(uint64_t v[16], unsigned a, unsigned b, unsigned c,
+                     unsigned d, uint64_t x, uint64_t y)
+{
+    g_half(v, a, b, c, d, x, R1, R2);
+    g_half(v, a, b, c, d, y, R3, R4);
+}
+
+static inline void f(uint64_t h[8], uint64_t m[16], uint64_t offset_hi,
+                     uint64_t offset_lo, unsigned final)
+{
+    uint64_t v[16];
+    memcpy(v, h, 8 * sizeof(*v));
+    memcpy(v + 8, iv, 8 * sizeof(*v));
+    v[12] ^= offset_lo;
+    v[13] ^= offset_hi;
+    v[14] ^= -(uint64_t)final;
+    { // WINSCP
+    unsigned round; // WINSCP
+    for (round = 0; round < 12; round++) {
+        const unsigned char *s = sigma[round];
+        g(v,  0,  4,  8, 12, m[s[ 0]], m[s[ 1]]);
+        g(v,  1,  5,  9, 13, m[s[ 2]], m[s[ 3]]);
+        g(v,  2,  6, 10, 14, m[s[ 4]], m[s[ 5]]);
+        g(v,  3,  7, 11, 15, m[s[ 6]], m[s[ 7]]);
+        g(v,  0,  5, 10, 15, m[s[ 8]], m[s[ 9]]);
+        g(v,  1,  6, 11, 12, m[s[10]], m[s[11]]);
+        g(v,  2,  7,  8, 13, m[s[12]], m[s[13]]);
+        g(v,  3,  4,  9, 14, m[s[14]], m[s[15]]);
+    }
+    { // WINSCP
+    unsigned i; // WINSCP
+    for (i = 0; i < 8; i++)
+        h[i] ^= v[i] ^ v[i+8];
+    smemclr(v, sizeof(v));
+    } // WINSCP
+    } // WINSCP
+}
+
+static inline void f_outer(uint64_t h[8], uint8_t blk[128], uint64_t offset_hi,
+                           uint64_t offset_lo, unsigned final)
+{
+    uint64_t m[16];
+    unsigned i; // WINSCP
+    for (i = 0; i < 16; i++)
+        m[i] = GET_64BIT_LSB_FIRST(blk + 8*i);
+    f(h, m, offset_hi, offset_lo, final);
+    smemclr(m, sizeof(m));
+}
+
+typedef struct blake2b {
+    uint64_t h[8];
+    unsigned hashlen;
+
+    uint8_t block[128];
+    size_t used;
+    uint64_t lenhi, lenlo;
+
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} blake2b;
+
+static void blake2b_write(BinarySink *bs, const void *vp, size_t len);
+
+static ssh_hash *blake2b_new_inner(unsigned hashlen)
+{
+    assert(hashlen <= ssh_blake2b.hlen);
+
+    { // WINSCP
+    blake2b *s = snew(blake2b);
+    s->hash.vt = &ssh_blake2b;
+    s->hashlen = hashlen;
+    BinarySink_INIT(s, blake2b_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+    } // WINSCP
+}
+
+static ssh_hash *blake2b_new(const ssh_hashalg *alg)
+{
+    return blake2b_new_inner(alg->hlen);
+}
+
+ssh_hash *blake2b_new_general(unsigned hashlen)
+{
+    ssh_hash *h = blake2b_new_inner(hashlen);
+    ssh_hash_reset(h);
+    return h;
+}
+
+static void blake2b_reset(ssh_hash *hash)
+{
+    blake2b *s = container_of(hash, blake2b, hash);
+
+    /* Initialise the hash to the standard IV */
+    memcpy(s->h, iv, sizeof(s->h));
+
+    /* XOR in the parameters: secret key length (here always 0) in
+     * byte 1, and hash length in byte 0. */
+    s->h[0] ^= 0x01010000 ^ s->hashlen;
+
+    s->used = 0;
+    s->lenhi = s->lenlo = 0;
+}
+
+static void blake2b_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    blake2b *copy = container_of(hcopy, blake2b, hash);
+    blake2b *orig = container_of(horig, blake2b, hash);
+
+    memcpy(copy, orig, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+}
+
+static void blake2b_free(ssh_hash *hash)
+{
+    blake2b *s = container_of(hash, blake2b, hash);
+
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void blake2b_write(BinarySink *bs, const void *vp, size_t len)
+{
+    blake2b *s = BinarySink_DOWNCAST(bs, blake2b);
+    const uint8_t *p = vp;
+
+    while (len > 0) {
+        if (s->used == sizeof(s->block)) {
+            f_outer(s->h, s->block, s->lenhi, s->lenlo, 0);
+            s->used = 0;
+        }
+
+        { // WINSCP
+        size_t chunk = sizeof(s->block) - s->used;
+        if (chunk > len)
+            chunk = len;
+
+        memcpy(s->block + s->used, p, chunk);
+        s->used += chunk;
+        p += chunk;
+        len -= chunk;
+
+        s->lenlo += chunk;
+        s->lenhi += (s->lenlo < chunk);
+        } // WINSCP
+    }
+}
+
+static void blake2b_digest(ssh_hash *hash, uint8_t *digest)
+{
+    blake2b *s = container_of(hash, blake2b, hash);
+
+    memset(s->block + s->used, 0, sizeof(s->block) - s->used);
+    f_outer(s->h, s->block, s->lenhi, s->lenlo, 1);
+
+    { // WINSCP
+    uint8_t hash_pre[128];
+    unsigned i; // WINSCP
+    for (i = 0; i < 8; i++)
+        PUT_64BIT_LSB_FIRST(hash_pre + 8*i, s->h[i]);
+    memcpy(digest, hash_pre, s->hashlen);
+    smemclr(hash_pre, sizeof(hash_pre));
+    } // WINSCP
+}
+
+const ssh_hashalg ssh_blake2b = {
+    // WINSCP
+    /*.new =*/ blake2b_new,
+    /*.reset =*/ blake2b_reset,
+    /*.copyfrom =*/ blake2b_copyfrom,
+    /*.digest =*/ blake2b_digest,
+    /*.free =*/ blake2b_free,
+    /*.hlen =*/ 64,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_BARE("BLAKE2b-64"),
+    NULL, // WINSCP
+};

+ 201 - 223
source/putty/sshmd5.c

@@ -1,278 +1,256 @@
-#include <assert.h>
-#include "ssh.h"
-
 /*
  * MD5 implementation for PuTTY. Written directly from the spec by
  * Simon Tatham.
  */
 
-typedef struct {
-    uint32_t h[4];
-} MD5_Core_State;
+#include <assert.h>
+#include "ssh.h"
 
-struct MD5Context {
-    MD5_Core_State core;
-    unsigned char block[64];
-    int blkused;
-    uint64_t len;
-    BinarySink_IMPLEMENTATION;
+static const uint32_t md5_initial_state[] = {
+    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
 };
 
-/* ----------------------------------------------------------------------
- * Core MD5 algorithm: processes 16-word blocks into a message digest.
- */
-
-#define F(x,y,z) ( ((x) & (y)) | ((~(x)) & (z)) )
-#define G(x,y,z) ( ((x) & (z)) | ((~(z)) & (y)) )
-#define H(x,y,z) ( (x) ^ (y) ^ (z) )
-#define I(x,y,z) ( (y) ^ ( (x) | ~(z) ) )
-
-#define rol(x,y) ( ((x) << (y)) | (((uint32_t)x) >> (32-y)) )
+static const struct md5_round_constant {
+    uint32_t addition, rotation, msg_index;
+} md5_round_constants[] = {
+    { 0xd76aa478,  7,  0 }, { 0xe8c7b756, 12,  1 },
+    { 0x242070db, 17,  2 }, { 0xc1bdceee, 22,  3 },
+    { 0xf57c0faf,  7,  4 }, { 0x4787c62a, 12,  5 },
+    { 0xa8304613, 17,  6 }, { 0xfd469501, 22,  7 },
+    { 0x698098d8,  7,  8 }, { 0x8b44f7af, 12,  9 },
+    { 0xffff5bb1, 17, 10 }, { 0x895cd7be, 22, 11 },
+    { 0x6b901122,  7, 12 }, { 0xfd987193, 12, 13 },
+    { 0xa679438e, 17, 14 }, { 0x49b40821, 22, 15 },
+    { 0xf61e2562,  5,  1 }, { 0xc040b340,  9,  6 },
+    { 0x265e5a51, 14, 11 }, { 0xe9b6c7aa, 20,  0 },
+    { 0xd62f105d,  5,  5 }, { 0x02441453,  9, 10 },
+    { 0xd8a1e681, 14, 15 }, { 0xe7d3fbc8, 20,  4 },
+    { 0x21e1cde6,  5,  9 }, { 0xc33707d6,  9, 14 },
+    { 0xf4d50d87, 14,  3 }, { 0x455a14ed, 20,  8 },
+    { 0xa9e3e905,  5, 13 }, { 0xfcefa3f8,  9,  2 },
+    { 0x676f02d9, 14,  7 }, { 0x8d2a4c8a, 20, 12 },
+    { 0xfffa3942,  4,  5 }, { 0x8771f681, 11,  8 },
+    { 0x6d9d6122, 16, 11 }, { 0xfde5380c, 23, 14 },
+    { 0xa4beea44,  4,  1 }, { 0x4bdecfa9, 11,  4 },
+    { 0xf6bb4b60, 16,  7 }, { 0xbebfbc70, 23, 10 },
+    { 0x289b7ec6,  4, 13 }, { 0xeaa127fa, 11,  0 },
+    { 0xd4ef3085, 16,  3 }, { 0x04881d05, 23,  6 },
+    { 0xd9d4d039,  4,  9 }, { 0xe6db99e5, 11, 12 },
+    { 0x1fa27cf8, 16, 15 }, { 0xc4ac5665, 23,  2 },
+    { 0xf4292244,  6,  0 }, { 0x432aff97, 10,  7 },
+    { 0xab9423a7, 15, 14 }, { 0xfc93a039, 21,  5 },
+    { 0x655b59c3,  6, 12 }, { 0x8f0ccc92, 10,  3 },
+    { 0xffeff47d, 15, 10 }, { 0x85845dd1, 21,  1 },
+    { 0x6fa87e4f,  6,  8 }, { 0xfe2ce6e0, 10, 15 },
+    { 0xa3014314, 15,  6 }, { 0x4e0811a1, 21, 13 },
+    { 0xf7537e82,  6,  4 }, { 0xbd3af235, 10, 11 },
+    { 0x2ad7d2bb, 15,  2 }, { 0xeb86d391, 21,  9 },
+};
 
-#define subround(f,w,x,y,z,k,s,ti) \
-       w = x + rol(w + f(x,y,z) + block[k] + ti, s)
+typedef struct md5_block md5_block;
+struct md5_block {
+    uint8_t block[64];
+    size_t used;
+    uint64_t len;
+};
 
-static void MD5_Core_Init(MD5_Core_State * s)
+static inline void md5_block_setup(md5_block *blk)
 {
-    s->h[0] = 0x67452301;
-    s->h[1] = 0xefcdab89;
-    s->h[2] = 0x98badcfe;
-    s->h[3] = 0x10325476;
+    blk->used = 0;
+    blk->len = 0;
 }
 
-static void MD5_Block(MD5_Core_State *s, uint32_t *block)
+static inline bool md5_block_write(
+    md5_block *blk, const void **vdata, size_t *len)
 {
-    uint32_t a, b, c, d;
-
-    a = s->h[0];
-    b = s->h[1];
-    c = s->h[2];
-    d = s->h[3];
-
-    subround(F, a, b, c, d, 0, 7, 0xd76aa478);
-    subround(F, d, a, b, c, 1, 12, 0xe8c7b756);
-    subround(F, c, d, a, b, 2, 17, 0x242070db);
-    subround(F, b, c, d, a, 3, 22, 0xc1bdceee);
-    subround(F, a, b, c, d, 4, 7, 0xf57c0faf);
-    subround(F, d, a, b, c, 5, 12, 0x4787c62a);
-    subround(F, c, d, a, b, 6, 17, 0xa8304613);
-    subround(F, b, c, d, a, 7, 22, 0xfd469501);
-    subround(F, a, b, c, d, 8, 7, 0x698098d8);
-    subround(F, d, a, b, c, 9, 12, 0x8b44f7af);
-    subround(F, c, d, a, b, 10, 17, 0xffff5bb1);
-    subround(F, b, c, d, a, 11, 22, 0x895cd7be);
-    subround(F, a, b, c, d, 12, 7, 0x6b901122);
-    subround(F, d, a, b, c, 13, 12, 0xfd987193);
-    subround(F, c, d, a, b, 14, 17, 0xa679438e);
-    subround(F, b, c, d, a, 15, 22, 0x49b40821);
-    subround(G, a, b, c, d, 1, 5, 0xf61e2562);
-    subround(G, d, a, b, c, 6, 9, 0xc040b340);
-    subround(G, c, d, a, b, 11, 14, 0x265e5a51);
-    subround(G, b, c, d, a, 0, 20, 0xe9b6c7aa);
-    subround(G, a, b, c, d, 5, 5, 0xd62f105d);
-    subround(G, d, a, b, c, 10, 9, 0x02441453);
-    subround(G, c, d, a, b, 15, 14, 0xd8a1e681);
-    subround(G, b, c, d, a, 4, 20, 0xe7d3fbc8);
-    subround(G, a, b, c, d, 9, 5, 0x21e1cde6);
-    subround(G, d, a, b, c, 14, 9, 0xc33707d6);
-    subround(G, c, d, a, b, 3, 14, 0xf4d50d87);
-    subround(G, b, c, d, a, 8, 20, 0x455a14ed);
-    subround(G, a, b, c, d, 13, 5, 0xa9e3e905);
-    subround(G, d, a, b, c, 2, 9, 0xfcefa3f8);
-    subround(G, c, d, a, b, 7, 14, 0x676f02d9);
-    subround(G, b, c, d, a, 12, 20, 0x8d2a4c8a);
-    subround(H, a, b, c, d, 5, 4, 0xfffa3942);
-    subround(H, d, a, b, c, 8, 11, 0x8771f681);
-    subround(H, c, d, a, b, 11, 16, 0x6d9d6122);
-    subround(H, b, c, d, a, 14, 23, 0xfde5380c);
-    subround(H, a, b, c, d, 1, 4, 0xa4beea44);
-    subround(H, d, a, b, c, 4, 11, 0x4bdecfa9);
-    subround(H, c, d, a, b, 7, 16, 0xf6bb4b60);
-    subround(H, b, c, d, a, 10, 23, 0xbebfbc70);
-    subround(H, a, b, c, d, 13, 4, 0x289b7ec6);
-    subround(H, d, a, b, c, 0, 11, 0xeaa127fa);
-    subround(H, c, d, a, b, 3, 16, 0xd4ef3085);
-    subround(H, b, c, d, a, 6, 23, 0x04881d05);
-    subround(H, a, b, c, d, 9, 4, 0xd9d4d039);
-    subround(H, d, a, b, c, 12, 11, 0xe6db99e5);
-    subround(H, c, d, a, b, 15, 16, 0x1fa27cf8);
-    subround(H, b, c, d, a, 2, 23, 0xc4ac5665);
-    subround(I, a, b, c, d, 0, 6, 0xf4292244);
-    subround(I, d, a, b, c, 7, 10, 0x432aff97);
-    subround(I, c, d, a, b, 14, 15, 0xab9423a7);
-    subround(I, b, c, d, a, 5, 21, 0xfc93a039);
-    subround(I, a, b, c, d, 12, 6, 0x655b59c3);
-    subround(I, d, a, b, c, 3, 10, 0x8f0ccc92);
-    subround(I, c, d, a, b, 10, 15, 0xffeff47d);
-    subround(I, b, c, d, a, 1, 21, 0x85845dd1);
-    subround(I, a, b, c, d, 8, 6, 0x6fa87e4f);
-    subround(I, d, a, b, c, 15, 10, 0xfe2ce6e0);
-    subround(I, c, d, a, b, 6, 15, 0xa3014314);
-    subround(I, b, c, d, a, 13, 21, 0x4e0811a1);
-    subround(I, a, b, c, d, 4, 6, 0xf7537e82);
-    subround(I, d, a, b, c, 11, 10, 0xbd3af235);
-    subround(I, c, d, a, b, 2, 15, 0x2ad7d2bb);
-    subround(I, b, c, d, a, 9, 21, 0xeb86d391);
-
-    s->h[0] += a;
-    s->h[1] += b;
-    s->h[2] += c;
-    s->h[3] += d;
+    size_t blkleft = sizeof(blk->block) - blk->used;
+    size_t chunk = *len < blkleft ? *len : blkleft;
+
+    const uint8_t *p = *vdata;
+    memcpy(blk->block + blk->used, p, chunk);
+    *vdata = p + chunk;
+    *len -= chunk;
+    blk->used += chunk;
+    blk->len += chunk;
+
+    if (blk->used == sizeof(blk->block)) {
+        blk->used = 0;
+        return true;
+    }
+
+    return false;
 }
 
-/* ----------------------------------------------------------------------
- * Outer MD5 algorithm: take an arbitrary length byte string,
- * convert it into 16-word blocks with the prescribed padding at
- * the end, and pass those blocks to the core MD5 algorithm.
- */
+static inline void md5_block_pad(md5_block *blk, BinarySink *bs)
+{
+    uint64_t final_len = blk->len << 3;
+    size_t pad = 63 & (55 - blk->used);
 
-#define BLKSIZE 64
+    put_byte(bs, 0x80);
+    put_padding(bs, pad, 0);
 
-static void MD5_BinarySink_write(BinarySink *bs, const void *data, size_t len);
+    { // WINSCP
+    unsigned char buf[8];
+    PUT_64BIT_LSB_FIRST(buf, final_len);
+    put_data(bs, buf, 8);
+    smemclr(buf, 8);
+    } // WINSCP
 
-void MD5Init(struct MD5Context *s)
-{
-    MD5_Core_Init(&s->core);
-    s->blkused = 0;
-    s->len = 0;
-    BinarySink_INIT(s, MD5_BinarySink_write);
+    assert(blk->used == 0 && "Should have exactly hit a block boundary");
 }
 
-static void MD5_BinarySink_write(BinarySink *bs, const void *data, size_t len)
+static inline uint32_t rol(uint32_t x, unsigned y)
 {
-    struct MD5Context *s = BinarySink_DOWNCAST(bs, struct MD5Context);
-    const unsigned char *q = (const unsigned char *)data;
-    uint32_t wordblock[16];
-    uint32_t lenw = len;
-    int i;
-
-    assert(lenw == len);
-
-    /*
-     * Update the length field.
-     */
-    s->len += lenw;
-
-    if (s->blkused + len < BLKSIZE) {
-        /*
-         * Trivial case: just add to the block.
-         */
-        memcpy(s->block + s->blkused, q, len);
-        s->blkused += len;
-    } else {
-        /*
-         * We must complete and process at least one block.
-         */
-        while (s->blkused + len >= BLKSIZE) {
-            memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
-            q += BLKSIZE - s->blkused;
-            len -= BLKSIZE - s->blkused;
-            /* Now process the block. Gather bytes little-endian into words */
-            for (i = 0; i < 16; i++) {
-                wordblock[i] =
-                    (((uint32_t) s->block[i * 4 + 3]) << 24) |
-                    (((uint32_t) s->block[i * 4 + 2]) << 16) |
-                    (((uint32_t) s->block[i * 4 + 1]) << 8) |
-                    (((uint32_t) s->block[i * 4 + 0]) << 0);
-            }
-            MD5_Block(&s->core, wordblock);
-            s->blkused = 0;
-        }
-#ifdef MPEXT
-	if (len > 0)
-#endif
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+#pragma option push -w-ngu // WINSCP
+    return (x << (31 & y)) | (x >> (31 & -y));
+#pragma option pop // WINSCP
 }
 
-void MD5Final(unsigned char output[16], struct MD5Context *s)
+static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0)
 {
-    int i;
-    unsigned pad;
-    unsigned char c[64];
-    uint64_t len;
-
-    if (s->blkused >= 56)
-        pad = 56 + 64 - s->blkused;
-    else
-        pad = 56 - s->blkused;
-
-    len = (s->len << 3);
-
-    memset(c, 0, pad);
-    c[0] = 0x80;
-    put_data(s, c, pad);
-
-    PUT_64BIT_LSB_FIRST(c, len);
+    return if0 ^ (ctrl & (if1 ^ if0));
+}
 
-    put_data(s, c, 8);
+/* Parameter functions for the four MD5 round types */
+static inline uint32_t F(uint32_t x, uint32_t y, uint32_t z)
+{ return Ch(x, y, z); }
+static inline uint32_t G(uint32_t x, uint32_t y, uint32_t z)
+{ return Ch(z, x, y); }
+static inline uint32_t H(uint32_t x, uint32_t y, uint32_t z)
+{ return x ^ y ^ z; }
+static inline uint32_t I(uint32_t x, uint32_t y, uint32_t z)
+{ return y ^ (x | ~z); }
+
+static inline void md5_round(
+    unsigned round_index, const uint32_t *message,
+    uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
+    uint32_t (*f)(uint32_t, uint32_t, uint32_t))
+{
+    struct md5_round_constant rc = md5_round_constants[round_index];
 
-    for (i = 0; i < 4; i++) {
-        output[4 * i + 3] = (s->core.h[i] >> 24) & 0xFF;
-        output[4 * i + 2] = (s->core.h[i] >> 16) & 0xFF;
-        output[4 * i + 1] = (s->core.h[i] >> 8) & 0xFF;
-        output[4 * i + 0] = (s->core.h[i] >> 0) & 0xFF;
-    }
+    *a = *b + rol(*a + f(*b, *c, *d) + message[rc.msg_index] + rc.addition,
+                  rc.rotation);
 }
 
-void MD5Simple(void const *p, unsigned len, unsigned char output[16])
+static void md5_do_block(uint32_t *core, const uint8_t *block)
 {
-    struct MD5Context s;
+    uint32_t message_words[16];
+    size_t i; // WINSCP
+    for (i = 0; i < 16; i++)
+        message_words[i] = GET_32BIT_LSB_FIRST(block + 4*i);
+
+    { // WINSCP
+    uint32_t a = core[0], b = core[1], c = core[2], d = core[3];
+
+    size_t t = 0;
+    size_t u; // WINSCP
+    for (u = 0; u < 4; u++) {
+        md5_round(t++, message_words, &a, &b, &c, &d, F);
+        md5_round(t++, message_words, &d, &a, &b, &c, F);
+        md5_round(t++, message_words, &c, &d, &a, &b, F);
+        md5_round(t++, message_words, &b, &c, &d, &a, F);
+    }
+    for (u = 0; u < 4; u++) {
+        md5_round(t++, message_words, &a, &b, &c, &d, G);
+        md5_round(t++, message_words, &d, &a, &b, &c, G);
+        md5_round(t++, message_words, &c, &d, &a, &b, G);
+        md5_round(t++, message_words, &b, &c, &d, &a, G);
+    }
+    for (u = 0; u < 4; u++) {
+        md5_round(t++, message_words, &a, &b, &c, &d, H);
+        md5_round(t++, message_words, &d, &a, &b, &c, H);
+        md5_round(t++, message_words, &c, &d, &a, &b, H);
+        md5_round(t++, message_words, &b, &c, &d, &a, H);
+    }
+    for (u = 0; u < 4; u++) {
+        md5_round(t++, message_words, &a, &b, &c, &d, I);
+        md5_round(t++, message_words, &d, &a, &b, &c, I);
+        md5_round(t++, message_words, &c, &d, &a, &b, I);
+        md5_round(t++, message_words, &b, &c, &d, &a, I);
+    }
 
-    MD5Init(&s);
-    put_data(&s, (unsigned char const *)p, len);
-    MD5Final(output, &s);
-    smemclr(&s, sizeof(s));
-}
+    core[0] += a;
+    core[1] += b;
+    core[2] += c;
+    core[3] += d;
+    } // WINSCP
 
-/* ----------------------------------------------------------------------
- * Thin abstraction for things where hashes are pluggable.
- */
+    smemclr(message_words, sizeof(message_words));
+}
 
-struct md5_hash {
-    struct MD5Context state;
+typedef struct md5 {
+    uint32_t core[4];
+    md5_block blk;
+    BinarySink_IMPLEMENTATION;
     ssh_hash hash;
-};
+} md5;
+
+static void md5_write(BinarySink *bs, const void *vp, size_t len);
 
 static ssh_hash *md5_new(const ssh_hashalg *alg)
 {
-    struct md5_hash *h = snew(struct md5_hash);
-    MD5Init(&h->state);
-    h->hash.vt = alg;
-    BinarySink_DELEGATE_INIT(&h->hash, &h->state);
-    return &h->hash;
+    md5 *s = snew(md5);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, md5_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
 }
 
-static ssh_hash *md5_copy(ssh_hash *hashold)
+static void md5_reset(ssh_hash *hash)
 {
-    struct md5_hash *hold, *hnew;
-    ssh_hash *hashnew = md5_new(hashold->vt);
+    md5 *s = container_of(hash, md5, hash);
 
-    hold = container_of(hashold, struct md5_hash, hash);
-    hnew = container_of(hashnew, struct md5_hash, hash);
+    memcpy(s->core, md5_initial_state, sizeof(s->core));
+    md5_block_setup(&s->blk);
+}
 
-    hnew->state = hold->state;
-    BinarySink_COPIED(&hnew->state);
+static void md5_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    md5 *copy = container_of(hcopy, md5, hash);
+    md5 *orig = container_of(horig, md5, hash);
 
-    return hashnew;
+    memcpy(copy, orig, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
 }
 
 static void md5_free(ssh_hash *hash)
 {
-    struct md5_hash *h = container_of(hash, struct md5_hash, hash);
+    md5 *s = container_of(hash, md5, hash);
 
-    smemclr(h, sizeof(*h));
-    sfree(h);
+    smemclr(s, sizeof(*s));
+    sfree(s);
 }
 
-static void md5_final(ssh_hash *hash, unsigned char *output)
+static void md5_write(BinarySink *bs, const void *vp, size_t len)
 {
-    struct md5_hash *h = container_of(hash, struct md5_hash, hash);
-    MD5Final(output, &h->state);
-    md5_free(hash);
+    md5 *s = BinarySink_DOWNCAST(bs, md5);
+
+    while (len > 0)
+        if (md5_block_write(&s->blk, &vp, &len))
+            md5_do_block(s->core, s->blk.block);
+}
+
+static void md5_digest(ssh_hash *hash, uint8_t *digest)
+{
+    md5 *s = container_of(hash, md5, hash);
+
+    size_t i; // WINSCP
+    md5_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (i = 0; i < 4; i++)
+        PUT_32BIT_LSB_FIRST(digest + 4*i, s->core[i]);
 }
 
 const ssh_hashalg ssh_md5 = {
-    md5_new, md5_copy, md5_final, md5_free, 16, 64, HASHALG_NAMES_BARE("MD5"),
+    // WINSCP
+    /*.new =*/ md5_new,
+    /*.reset =*/ md5_reset,
+    /*.copyfrom =*/ md5_copyfrom,
+    /*.digest =*/ md5_digest,
+    /*.free =*/ md5_free,
+    /*.hlen =*/ 16,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_BARE("MD5"),
+    NULL, // WINSCP
 };

Failā izmaiņas netiks attēlotas, jo tās ir par lielu
+ 477 - 298
source/putty/sshpubk.c


+ 92 - 51
source/putty/sshsh256.c

@@ -105,8 +105,16 @@ static ssh_hash *sha256_select(const ssh_hashalg *alg)
 }
 
 const ssh_hashalg ssh_sha256 = {
-    sha256_select, NULL, NULL, NULL,
-    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
+    // WINSCP
+    /*.new =*/ sha256_select,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    /*.hlen =*/ 32,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
+    NULL,
 };
 
 #else
@@ -300,26 +308,28 @@ static ssh_hash *sha256_sw_new(const ssh_hashalg *alg)
 {
     sha256_sw *s = snew(sha256_sw);
 
-    memcpy(s->core, sha256_initial_state, sizeof(s->core));
-
-    sha256_block_setup(&s->blk);
-
     s->hash.vt = alg;
     BinarySink_INIT(s, sha256_sw_write);
     BinarySink_DELEGATE_INIT(&s->hash, s);
     return &s->hash;
 }
 
-static ssh_hash *sha256_sw_copy(ssh_hash *hash)
+static void sha256_sw_reset(ssh_hash *hash)
 {
     sha256_sw *s = container_of(hash, sha256_sw, hash);
-    sha256_sw *copy = snew(sha256_sw);
 
-    memcpy(copy, s, sizeof(*copy));
+    memcpy(s->core, sha256_initial_state, sizeof(s->core));
+    sha256_block_setup(&s->blk);
+}
+
+static void sha256_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha256_sw *copy = container_of(hcopy, sha256_sw, hash);
+    sha256_sw *orig = container_of(horig, sha256_sw, hash);
+
+    memcpy(copy, orig, sizeof(*copy));
     BinarySink_COPIED(copy);
     BinarySink_DELEGATE_INIT(&copy->hash, copy);
-
-    return &copy->hash;
 }
 
 static void sha256_sw_free(ssh_hash *hash)
@@ -339,7 +349,7 @@ static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
             sha256_sw_block(s->core, s->blk.block);
 }
 
-static void sha256_sw_final(ssh_hash *hash, uint8_t *digest)
+static void sha256_sw_digest(ssh_hash *hash, uint8_t *digest)
 {
     sha256_sw *s = container_of(hash, sha256_sw, hash);
 
@@ -348,13 +358,20 @@ static void sha256_sw_final(ssh_hash *hash, uint8_t *digest)
     size_t i; // WINSCP
     for (i = 0; i < 8; i++)
         PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
-    sha256_sw_free(hash);
     } // WINSCP
 }
 
 const ssh_hashalg ssh_sha256_sw = {
-    sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
-    32, 64, HASHALG_NAMES_BARE("SHA-256"), // WINSCP (removed "unaccelerated" annotation)
+    // WINSCP
+    /*.new =*/ sha256_sw_new,
+    /*.reset =*/ sha256_sw_reset,
+    /*.copyfrom =*/ sha256_sw_copyfrom,
+    /*.digest =*/ sha256_sw_digest,
+    /*.free =*/ sha256_sw_free,
+    /*.hlen =*/ 32,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_BARE("SHA-256"), // WINSCP (removed "unaccelerated" annotation)
+    NULL,
 };
 #endif // !WINSCP_VS
 
@@ -632,13 +649,24 @@ static sha256_ni *sha256_ni_alloc(void)
     return s;
 }
 
-FUNC_ISA /*WINSCP static*/ ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
+/*WINSCP static*/ ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
 {
     if (!sha256_hw_available_cached())
         return NULL;
 
     sha256_ni *s = sha256_ni_alloc();
 
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+
+    return &s->hash;
+}
+
+FUNC_ISA static void sha256_ni_reset(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
     /* Initialise the core vectors in their storage order */
     s->core[0] = _mm_set_epi64x(
         0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
@@ -646,26 +674,19 @@ FUNC_ISA /*WINSCP static*/ ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
         0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
 
     sha256_block_setup(&s->blk);
-
-    s->hash.vt = alg;
-    BinarySink_INIT(s, sha256_ni_write);
-    BinarySink_DELEGATE_INIT(&s->hash, s);
-    return &s->hash;
 }
 
-/*WINSCP static*/ ssh_hash *sha256_ni_copy(ssh_hash *hash)
+/*WINSCP static*/ void sha256_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
 {
-    sha256_ni *s = container_of(hash, sha256_ni, hash);
-    sha256_ni *copy = sha256_ni_alloc();
+    sha256_ni *copy = container_of(hcopy, sha256_ni, hash);
+    sha256_ni *orig = container_of(horig, sha256_ni, hash);
 
     void *ptf_save = copy->pointer_to_free;
-    *copy = *s; /* structure copy */
+    *copy = *orig; /* structure copy */
     copy->pointer_to_free = ptf_save;
 
     BinarySink_COPIED(copy);
     BinarySink_DELEGATE_INIT(&copy->hash, copy);
-
-    return &copy->hash;
 }
 
 /*WINSCP static*/ void sha256_ni_free(ssh_hash *hash)
@@ -686,7 +707,7 @@ static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
             sha256_ni_block(s->core, s->blk.block);
 }
 
-FUNC_ISA /*WINSCP static*/ void sha256_ni_final(ssh_hash *hash, uint8_t *digest)
+FUNC_ISA /*WINSCP static*/ void sha256_ni_digest(ssh_hash *hash, uint8_t *digest)
 {
     sha256_ni *s = container_of(hash, sha256_ni, hash);
 
@@ -707,8 +728,6 @@ FUNC_ISA /*WINSCP static*/ void sha256_ni_final(ssh_hash *hash, uint8_t *digest)
     __m128i *output = (__m128i *)digest;
     _mm_storeu_si128(output, dcba);
     _mm_storeu_si128(output+1, hgfe);
-
-    sha256_ni_free(hash);
 }
 
 #endif // WINSCP_VS
@@ -721,8 +740,14 @@ void sha256_ni_final(ssh_hash *hash, uint8_t *digest);
 void sha256_ni_free(ssh_hash *hash);
 
 const ssh_hashalg ssh_sha256_hw = {
-    sha256_ni_new, sha256_ni_copy, sha256_ni_final, sha256_ni_free,
-    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
+    .new = sha256_ni_new,
+    .reset = sha256_ni_reset,
+    .copyfrom = sha256_ni_copyfrom,
+    .digest = sha256_ni_digest,
+    .free = sha256_ni_free,
+    .hlen = 32,
+    .blocklen = 64,
+    HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -857,28 +882,31 @@ static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
 
     sha256_neon *s = snew(sha256_neon);
 
-    s->core.abcd = vld1q_u32(sha256_initial_state);
-    s->core.efgh = vld1q_u32(sha256_initial_state + 4);
-
-    sha256_block_setup(&s->blk);
-
     s->hash.vt = alg;
     BinarySink_INIT(s, sha256_neon_write);
     BinarySink_DELEGATE_INIT(&s->hash, s);
     return &s->hash;
 }
 
-static ssh_hash *sha256_neon_copy(ssh_hash *hash)
+static void sha256_neon_reset(ssh_hash *hash)
 {
     sha256_neon *s = container_of(hash, sha256_neon, hash);
-    sha256_neon *copy = snew(sha256_neon);
 
-    *copy = *s; /* structure copy */
+    s->core.abcd = vld1q_u32(sha256_initial_state);
+    s->core.efgh = vld1q_u32(sha256_initial_state + 4);
+
+    sha256_block_setup(&s->blk);
+}
+
+static void sha256_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha256_neon *copy = container_of(hcopy, sha256_neon, hash);
+    sha256_neon *orig = container_of(horig, sha256_neon, hash);
+
+    *copy = *orig; /* structure copy */
 
     BinarySink_COPIED(copy);
     BinarySink_DELEGATE_INIT(&copy->hash, copy);
-
-    return &copy->hash;
 }
 
 static void sha256_neon_free(ssh_hash *hash)
@@ -897,19 +925,24 @@ static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
             sha256_neon_block(&s->core, s->blk.block);
 }
 
-static void sha256_neon_final(ssh_hash *hash, uint8_t *digest)
+static void sha256_neon_digest(ssh_hash *hash, uint8_t *digest)
 {
     sha256_neon *s = container_of(hash, sha256_neon, hash);
 
     sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
     vst1q_u8(digest,      vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
     vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
-    sha256_neon_free(hash);
 }
 
 const ssh_hashalg ssh_sha256_hw = {
-    sha256_neon_new, sha256_neon_copy, sha256_neon_final, sha256_neon_free,
-    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
+    .new = sha256_neon_new,
+    .reset = sha256_neon_reset,
+    .copyfrom = sha256_neon_copyfrom,
+    .digest = sha256_neon_digest,
+    .free = sha256_neon_free,
+    .hlen = 32,
+    .blocklen = 64,
+    HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
 };
 
 #endif
@@ -938,14 +971,22 @@ static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
 
 #define STUB_BODY { unreachable("Should never be called"); }
 
-static ssh_hash *sha256_stub_copy(ssh_hash *hash) { STUB_BODY; return NULL; }
+static void sha256_stub_reset(ssh_hash *hash) STUB_BODY
+static void sha256_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
 static void sha256_stub_free(ssh_hash *hash) STUB_BODY
-static void sha256_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+static void sha256_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
 
 const ssh_hashalg ssh_sha256_hw = {
-    sha256_stub_new, sha256_stub_copy, sha256_stub_final, sha256_stub_free,
-    32, 64, HASHALG_NAMES_ANNOTATED(
-        "SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
+    // WINSCP
+    /*.new =*/ sha256_stub_new,
+    /*.reset =*/ sha256_stub_reset,
+    /*.copyfrom =*/ sha256_stub_copyfrom,
+    /*.digest =*/ sha256_stub_digest,
+    /*.free =*/ sha256_stub_free,
+    /*.hlen =*/ 32,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_ANNOTATED("SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
+    NULL,
 };
 
 #endif // !WINSCP_VS

+ 769 - 285
source/putty/sshsh512.c

@@ -9,361 +9,845 @@
 #include <assert.h>
 #include "ssh.h"
 
-#define BLKSIZE 128
+/*
+ * Start by deciding whether we can support hardware SHA at all.
+ */
+#define HW_SHA512_NONE 0
+#define HW_SHA512_NEON 1
+
+#ifdef _FORCE_SHA512_NEON
+#   define HW_SHA512 HW_SHA512_NEON
+#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    /* Arm can potentially support both endiannesses, but this code
+     * hasn't been tested on anything but little. If anyone wants to
+     * run big-endian, they'll need to fix it first. */
+#elif defined __ARM_FEATURE_SHA512
+    /* If the Arm SHA-512 extension is available already, we can
+     * support NEON SHA without having to enable anything by hand */
+#   define HW_SHA512 HW_SHA512_NEON
+#elif defined(__clang__)
+#   if __has_attribute(target) && __has_include(<arm_neon.h>) &&       \
+    (defined(__aarch64__))
+        /* clang can enable the crypto extension in AArch64 using
+         * __attribute__((target)) */
+#       define HW_SHA512 HW_SHA512_NEON
+#       define USE_CLANG_ATTR_TARGET_AARCH64
+#   endif
+#endif
+
+#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA512
+#   undef HW_SHA512
+#   define HW_SHA512 HW_SHA512_NONE
+#endif
 
-typedef struct {
-    uint64_t h[8];
-    unsigned char block[BLKSIZE];
-    int blkused;
-    uint64_t lenhi, lenlo;
-    BinarySink_IMPLEMENTATION;
-} SHA512_State;
+/*
+ * The actual query function that asks if hardware acceleration is
+ * available.
+ */
+static bool sha512_hw_available(void);
 
 /*
- * Arithmetic implementations. Note that AND, XOR and NOT can
- * overlap destination with one source, but the others can't.
+ * The top-level selection function, caching the results of
+ * sha512_hw_available() so it only has to run once.
  */
-#define add(r,x,y) ( r = (x) + (y) )
-#define rorB(r,x,y) ( r = ((x) >> (y)) | ((x) << (64-(y))) )
-#define rorL(r,x,y) ( r = ((x) >> (y)) | ((x) << (64-(y))) )
-#define shrB(r,x,y) ( r = (x) >> (y) )
-#define shrL(r,x,y) ( r = (x) >> (y) )
-#define and(r,x,y) ( r = (x) & (y) )
-#define xor(r,x,y) ( r = (x) ^ (y) )
-#define not(r,x) ( r = ~(x) )
-#define INIT(h,l) ((((uint64_t)(h)) << 32) | (l))
-#define BUILD(r,h,l) ( r = ((((uint64_t)(h)) << 32) | (l)) )
-#define EXTRACT(h,l,r) ( h = (r) >> 32, l = (r) & 0xFFFFFFFFU )
+static bool sha512_hw_available_cached(void)
+{
+    static bool initialised = false;
+    static bool hw_available;
+    if (!initialised) {
+        hw_available = sha512_hw_available();
+        initialised = true;
+    }
+    return hw_available;
+}
+
+struct sha512_select_options {
+    const ssh_hashalg *hw, *sw;
+};
+
+static ssh_hash *sha512_select(const ssh_hashalg *alg)
+{
+    const struct sha512_select_options *options =
+        (const struct sha512_select_options *)alg->extra;
+
+    const ssh_hashalg *real_alg =
+        sha512_hw_available_cached() ? options->hw : options->sw;
+
+    return ssh_hash_new(real_alg);
+}
+
+const struct sha512_select_options ssh_sha512_select_options = {
+    &ssh_sha512_hw, &ssh_sha512_sw,
+};
+const struct sha512_select_options ssh_sha384_select_options = {
+    &ssh_sha384_hw, &ssh_sha384_sw,
+};
+
+const ssh_hashalg ssh_sha512 = {
+    // WINSCP
+    /*.new =*/ sha512_select,
+    NULL, NULL, NULL, NULL, // WINSCP
+    /*.hlen =*/ 64,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_ANNOTATED("SHA-512", "dummy selector vtable"),
+    /*.extra =*/ &ssh_sha512_select_options,
+};
+
+const ssh_hashalg ssh_sha384 = {
+    // WINSCP
+    /*.new =*/ sha512_select,
+    NULL, NULL, NULL, NULL, // WINSCP
+    /*.hlen =*/ 48,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_ANNOTATED("SHA-384", "dummy selector vtable"),
+    /*.extra =*/ &ssh_sha384_select_options,
+};
 
 /* ----------------------------------------------------------------------
- * Core SHA512 algorithm: processes 16-doubleword blocks into a
- * message digest.
+ * Definitions likely to be helpful to multiple implementations.
  */
 
-#define Ch(r,t,x,y,z) ( not(t,x), and(r,t,z), and(t,x,y), xor(r,r,t) )
-#define Maj(r,t,x,y,z) ( and(r,x,y), and(t,x,z), xor(r,r,t), \
-                         and(t,y,z), xor(r,r,t) )
-#define bigsigma0(r,t,x) ( rorL(r,x,28), rorB(t,x,34), xor(r,r,t), \
-                           rorB(t,x,39), xor(r,r,t) )
-#define bigsigma1(r,t,x) ( rorL(r,x,14), rorL(t,x,18), xor(r,r,t), \
-                           rorB(t,x,41), xor(r,r,t) )
-#define smallsigma0(r,t,x) ( rorL(r,x,1), rorL(t,x,8), xor(r,r,t), \
-                             shrL(t,x,7), xor(r,r,t) )
-#define smallsigma1(r,t,x) ( rorL(r,x,19), rorB(t,x,61), xor(r,r,t), \
-                             shrL(t,x,6), xor(r,r,t) )
-
-static void SHA512_Core_Init(SHA512_State *s) {
-    static const uint64_t iv[] = {
-        INIT(0x6a09e667, 0xf3bcc908),
-        INIT(0xbb67ae85, 0x84caa73b),
-        INIT(0x3c6ef372, 0xfe94f82b),
-        INIT(0xa54ff53a, 0x5f1d36f1),
-        INIT(0x510e527f, 0xade682d1),
-        INIT(0x9b05688c, 0x2b3e6c1f),
-        INIT(0x1f83d9ab, 0xfb41bd6b),
-        INIT(0x5be0cd19, 0x137e2179),
-    };
-    int i;
-    for (i = 0; i < 8; i++)
-        s->h[i] = iv[i];
-}
-
-static void SHA384_Core_Init(SHA512_State *s) {
-    static const uint64_t iv[] = {
-        INIT(0xcbbb9d5d, 0xc1059ed8),
-        INIT(0x629a292a, 0x367cd507),
-        INIT(0x9159015a, 0x3070dd17),
-        INIT(0x152fecd8, 0xf70e5939),
-        INIT(0x67332667, 0xffc00b31),
-        INIT(0x8eb44a87, 0x68581511),
-        INIT(0xdb0c2e0d, 0x64f98fa7),
-        INIT(0x47b5481d, 0xbefa4fa4),
-    };
-    int i;
-    for (i = 0; i < 8; i++)
-        s->h[i] = iv[i];
-}
-
-static void SHA512_Block(SHA512_State *s, uint64_t *block) {
-    uint64_t w[80];
-    uint64_t a,b,c,d,e,f,g,h;
-    static const uint64_t k[] = {
-        INIT(0x428a2f98, 0xd728ae22), INIT(0x71374491, 0x23ef65cd),
-        INIT(0xb5c0fbcf, 0xec4d3b2f), INIT(0xe9b5dba5, 0x8189dbbc),
-        INIT(0x3956c25b, 0xf348b538), INIT(0x59f111f1, 0xb605d019),
-        INIT(0x923f82a4, 0xaf194f9b), INIT(0xab1c5ed5, 0xda6d8118),
-        INIT(0xd807aa98, 0xa3030242), INIT(0x12835b01, 0x45706fbe),
-        INIT(0x243185be, 0x4ee4b28c), INIT(0x550c7dc3, 0xd5ffb4e2),
-        INIT(0x72be5d74, 0xf27b896f), INIT(0x80deb1fe, 0x3b1696b1),
-        INIT(0x9bdc06a7, 0x25c71235), INIT(0xc19bf174, 0xcf692694),
-        INIT(0xe49b69c1, 0x9ef14ad2), INIT(0xefbe4786, 0x384f25e3),
-        INIT(0x0fc19dc6, 0x8b8cd5b5), INIT(0x240ca1cc, 0x77ac9c65),
-        INIT(0x2de92c6f, 0x592b0275), INIT(0x4a7484aa, 0x6ea6e483),
-        INIT(0x5cb0a9dc, 0xbd41fbd4), INIT(0x76f988da, 0x831153b5),
-        INIT(0x983e5152, 0xee66dfab), INIT(0xa831c66d, 0x2db43210),
-        INIT(0xb00327c8, 0x98fb213f), INIT(0xbf597fc7, 0xbeef0ee4),
-        INIT(0xc6e00bf3, 0x3da88fc2), INIT(0xd5a79147, 0x930aa725),
-        INIT(0x06ca6351, 0xe003826f), INIT(0x14292967, 0x0a0e6e70),
-        INIT(0x27b70a85, 0x46d22ffc), INIT(0x2e1b2138, 0x5c26c926),
-        INIT(0x4d2c6dfc, 0x5ac42aed), INIT(0x53380d13, 0x9d95b3df),
-        INIT(0x650a7354, 0x8baf63de), INIT(0x766a0abb, 0x3c77b2a8),
-        INIT(0x81c2c92e, 0x47edaee6), INIT(0x92722c85, 0x1482353b),
-        INIT(0xa2bfe8a1, 0x4cf10364), INIT(0xa81a664b, 0xbc423001),
-        INIT(0xc24b8b70, 0xd0f89791), INIT(0xc76c51a3, 0x0654be30),
-        INIT(0xd192e819, 0xd6ef5218), INIT(0xd6990624, 0x5565a910),
-        INIT(0xf40e3585, 0x5771202a), INIT(0x106aa070, 0x32bbd1b8),
-        INIT(0x19a4c116, 0xb8d2d0c8), INIT(0x1e376c08, 0x5141ab53),
-        INIT(0x2748774c, 0xdf8eeb99), INIT(0x34b0bcb5, 0xe19b48a8),
-        INIT(0x391c0cb3, 0xc5c95a63), INIT(0x4ed8aa4a, 0xe3418acb),
-        INIT(0x5b9cca4f, 0x7763e373), INIT(0x682e6ff3, 0xd6b2b8a3),
-        INIT(0x748f82ee, 0x5defb2fc), INIT(0x78a5636f, 0x43172f60),
-        INIT(0x84c87814, 0xa1f0ab72), INIT(0x8cc70208, 0x1a6439ec),
-        INIT(0x90befffa, 0x23631e28), INIT(0xa4506ceb, 0xde82bde9),
-        INIT(0xbef9a3f7, 0xb2c67915), INIT(0xc67178f2, 0xe372532b),
-        INIT(0xca273ece, 0xea26619c), INIT(0xd186b8c7, 0x21c0c207),
-        INIT(0xeada7dd6, 0xcde0eb1e), INIT(0xf57d4f7f, 0xee6ed178),
-        INIT(0x06f067aa, 0x72176fba), INIT(0x0a637dc5, 0xa2c898a6),
-        INIT(0x113f9804, 0xbef90dae), INIT(0x1b710b35, 0x131c471b),
-        INIT(0x28db77f5, 0x23047d84), INIT(0x32caab7b, 0x40c72493),
-        INIT(0x3c9ebe0a, 0x15c9bebc), INIT(0x431d67c4, 0x9c100d4c),
-        INIT(0x4cc5d4be, 0xcb3e42b6), INIT(0x597f299c, 0xfc657e2a),
-        INIT(0x5fcb6fab, 0x3ad6faec), INIT(0x6c44198c, 0x4a475817),
-    };
+static const uint64_t sha512_initial_state[] = {
+    0x6a09e667f3bcc908ULL,
+    0xbb67ae8584caa73bULL,
+    0x3c6ef372fe94f82bULL,
+    0xa54ff53a5f1d36f1ULL,
+    0x510e527fade682d1ULL,
+    0x9b05688c2b3e6c1fULL,
+    0x1f83d9abfb41bd6bULL,
+    0x5be0cd19137e2179ULL,
+};
 
-    int t;
+static const uint64_t sha384_initial_state[] = {
+    0xcbbb9d5dc1059ed8ULL,
+    0x629a292a367cd507ULL,
+    0x9159015a3070dd17ULL,
+    0x152fecd8f70e5939ULL,
+    0x67332667ffc00b31ULL,
+    0x8eb44a8768581511ULL,
+    0xdb0c2e0d64f98fa7ULL,
+    0x47b5481dbefa4fa4ULL,
+};
 
-    for (t = 0; t < 16; t++)
-        w[t] = block[t];
-
-    for (t = 16; t < 80; t++) {
-        uint64_t p, q, r, tmp;
-        smallsigma1(p, tmp, w[t-2]);
-        smallsigma0(q, tmp, w[t-15]);
-        add(r, p, q);
-        add(p, r, w[t-7]);
-        add(w[t], p, w[t-16]);
-    }
+static const uint64_t sha512_round_constants[] = {
+    0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+    0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+    0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+    0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+    0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+    0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+    0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+    0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+    0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+    0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+    0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+    0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+    0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+    0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+    0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+    0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+    0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+    0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+    0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+    0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+    0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+    0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+    0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+    0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+    0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+    0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+    0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+    0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+    0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+    0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+    0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+    0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+    0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+    0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+    0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+    0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+    0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+    0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+    0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+    0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
 
-    a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
-    e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];
-
-    for (t = 0; t < 80; t+=8) {
-        uint64_t tmp, p, q, r;
-
-#define ROUND(j,a,b,c,d,e,f,g,h) \
-        bigsigma1(p, tmp, e); \
-        Ch(q, tmp, e, f, g); \
-        add(r, p, q); \
-        add(p, r, k[j]) ; \
-        add(q, p, w[j]); \
-        add(r, q, h); \
-        bigsigma0(p, tmp, a); \
-        Maj(tmp, q, a, b, c); \
-        add(q, tmp, p); \
-        add(p, r, d); \
-        d = p; \
-        add(h, q, r);
-
-        ROUND(t+0, a,b,c,d,e,f,g,h);
-        ROUND(t+1, h,a,b,c,d,e,f,g);
-        ROUND(t+2, g,h,a,b,c,d,e,f);
-        ROUND(t+3, f,g,h,a,b,c,d,e);
-        ROUND(t+4, e,f,g,h,a,b,c,d);
-        ROUND(t+5, d,e,f,g,h,a,b,c);
-        ROUND(t+6, c,d,e,f,g,h,a,b);
-        ROUND(t+7, b,c,d,e,f,g,h,a);
-    }
+#define SHA512_ROUNDS 80
+
+typedef struct sha512_block sha512_block;
+struct sha512_block {
+    uint8_t block[128];
+    size_t used;
+    uint64_t lenhi, lenlo;
+};
+
+static inline void sha512_block_setup(sha512_block *blk)
+{
+    blk->used = 0;
+    blk->lenhi = blk->lenlo = 0;
+}
+
+static inline bool sha512_block_write(
+    sha512_block *blk, const void **vdata, size_t *len)
+{
+    size_t blkleft = sizeof(blk->block) - blk->used;
+    size_t chunk = *len < blkleft ? *len : blkleft;
+
+    const uint8_t *p = *vdata;
+    memcpy(blk->block + blk->used, p, chunk);
+    *vdata = p + chunk;
+    *len -= chunk;
+    blk->used += chunk;
 
-    {
-        uint64_t tmp;
-#define UPDATE(state, local) ( tmp = state, add(state, tmp, local) )
-        UPDATE(s->h[0], a); UPDATE(s->h[1], b);
-        UPDATE(s->h[2], c); UPDATE(s->h[3], d);
-        UPDATE(s->h[4], e); UPDATE(s->h[5], f);
-        UPDATE(s->h[6], g); UPDATE(s->h[7], h);
+    { // WINSCP
+    size_t chunkbits = chunk << 3;
+
+    blk->lenlo += chunkbits;
+    blk->lenhi += (blk->lenlo < chunkbits);
+
+    if (blk->used == sizeof(blk->block)) {
+        blk->used = 0;
+        return true;
     }
+
+    return false;
+    } // WINSCP
+}
+
+static inline void sha512_block_pad(sha512_block *blk, BinarySink *bs)
+{
+    uint64_t final_lenhi = blk->lenhi;
+    uint64_t final_lenlo = blk->lenlo;
+    size_t pad = 127 & (111 - blk->used);
+
+    put_byte(bs, 0x80);
+    put_padding(bs, pad, 0);
+    put_uint64(bs, final_lenhi);
+    put_uint64(bs, final_lenlo);
+
+    assert(blk->used == 0 && "Should have exactly hit a block boundary");
 }
 
 /* ----------------------------------------------------------------------
- * Outer SHA512 algorithm: take an arbitrary length byte string,
- * convert it into 16-doubleword blocks with the prescribed padding
- * at the end, and pass those blocks to the core SHA512 algorithm.
+ * Software implementation of SHA-512.
  */
 
-static void SHA512_BinarySink_write(BinarySink *bs,
-                                    const void *p, size_t len);
+static inline uint64_t ror(uint64_t x, unsigned y)
+{
+#pragma option push -w-ngu // WINSCP
+    return (x << (63 & -y)) | (x >> (63 & y));
+#pragma option pop // WINSCP
+}
 
-void SHA512_Init(SHA512_State *s) {
-    SHA512_Core_Init(s);
-    s->blkused = 0;
-    s->lenhi = s->lenlo = 0;
-    BinarySink_INIT(s, SHA512_BinarySink_write);
+static inline uint64_t Ch(uint64_t ctrl, uint64_t if1, uint64_t if0)
+{
+    return if0 ^ (ctrl & (if1 ^ if0));
 }
 
-void SHA384_Init(SHA512_State *s) {
-    SHA384_Core_Init(s);
-    s->blkused = 0;
-    s->lenhi = s->lenlo = 0;
-    BinarySink_INIT(s, SHA512_BinarySink_write);
+static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
+{
+    return (x & y) | (z & (x | y));
 }
 
-static void SHA512_BinarySink_write(BinarySink *bs,
-                                    const void *p, size_t len)
+static inline uint64_t Sigma_0(uint64_t x)
 {
-    SHA512_State *s = BinarySink_DOWNCAST(bs, SHA512_State);
-    unsigned char *q = (unsigned char *)p;
-    uint64_t wordblock[16];
-    int i;
+    return ror(x,28) ^ ror(x,34) ^ ror(x,39);
+}
 
-    /*
-     * Update the length field.
-     */
-    s->lenlo += len;
-    s->lenhi += (s->lenlo < len);
-
-    if (s->blkused && s->blkused+len < BLKSIZE) {
-        /*
-         * Trivial case: just add to the block.
-         */
-        memcpy(s->block + s->blkused, q, len);
-        s->blkused += len;
-    } else {
-        /*
-         * We must complete and process at least one block.
-         */
-        while (s->blkused + len >= BLKSIZE) {
-            memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused);
-            q += BLKSIZE - s->blkused;
-            len -= BLKSIZE - s->blkused;
-            /* Now process the block. Gather bytes big-endian into words */
-            for (i = 0; i < 16; i++)
-                wordblock[i] = GET_64BIT_MSB_FIRST(s->block + i*8);
-            SHA512_Block(s, wordblock);
-            s->blkused = 0;
-        }
-        memcpy(s->block, q, len);
-        s->blkused = len;
+static inline uint64_t Sigma_1(uint64_t x)
+{
+    return ror(x,14) ^ ror(x,18) ^ ror(x,41);
+}
+
+static inline uint64_t sigma_0(uint64_t x)
+{
+    return ror(x,1) ^ ror(x,8) ^ (x >> 7);
+}
+
+static inline uint64_t sigma_1(uint64_t x)
+{
+    return ror(x,19) ^ ror(x,61) ^ (x >> 6);
+}
+
+static inline void sha512_sw_round(
+    unsigned round_index, const uint64_t *schedule,
+    uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d,
+    uint64_t *e, uint64_t *f, uint64_t *g, uint64_t *h)
+{
+    uint64_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) +
+        sha512_round_constants[round_index] + schedule[round_index];
+
+    uint64_t t2 = Sigma_0(*a) + Maj(*a,*b,*c);
+
+    *d += t1;
+    *h = t1 + t2;
+}
+
+static void sha512_sw_block(uint64_t *core, const uint8_t *block)
+{
+    uint64_t w[SHA512_ROUNDS];
+    uint64_t a,b,c,d,e,f,g,h;
+
+    int t;
+
+    for (t = 0; t < 16; t++)
+        w[t] = GET_64BIT_MSB_FIRST(block + 8*t);
+
+    for (t = 16; t < SHA512_ROUNDS; t++)
+        w[t] = w[t-16] + w[t-7] + sigma_0(w[t-15]) + sigma_1(w[t-2]);
+
+    a = core[0]; b = core[1]; c = core[2]; d = core[3];
+    e = core[4]; f = core[5]; g = core[6]; h = core[7];
+
+    for (t = 0; t < SHA512_ROUNDS; t+=8) {
+        sha512_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h);
+        sha512_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g);
+        sha512_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f);
+        sha512_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e);
+        sha512_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d);
+        sha512_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c);
+        sha512_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b);
+        sha512_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a);
     }
+
+    core[0] += a; core[1] += b; core[2] += c; core[3] += d;
+    core[4] += e; core[5] += f; core[6] += g; core[7] += h;
+
+    smemclr(w, sizeof(w));
 }
 
-void SHA512_Final(SHA512_State *s, unsigned char *digest) {
-    int i;
-    int pad;
-    unsigned char c[BLKSIZE];
-    uint64_t lenhi, lenlo;
+typedef struct sha512_sw {
+    uint64_t core[8];
+    sha512_block blk;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha512_sw;
 
-    if (s->blkused >= BLKSIZE-16)
-        pad = (BLKSIZE-16) + BLKSIZE - s->blkused;
-    else
-        pad = (BLKSIZE-16) - s->blkused;
+static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len);
 
-    lenhi = (s->lenhi << 3) | (s->lenlo >> (32-3));
-    lenlo = (s->lenlo << 3);
+static ssh_hash *sha512_sw_new(const ssh_hashalg *alg)
+{
+    sha512_sw *s = snew(sha512_sw);
 
-    memset(c, 0, pad);
-    c[0] = 0x80;
-    put_data(s, &c, pad);
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha512_sw_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
 
-    put_uint64(s, lenhi);
-    put_uint64(s, lenlo);
+static void sha512_sw_reset(ssh_hash *hash)
+{
+    sha512_sw *s = container_of(hash, sha512_sw, hash);
 
-    for (i = 0; i < 8; i++)
-        PUT_64BIT_MSB_FIRST(digest + i*8, s->h[i]);
+    /* The 'extra' field in the ssh_hashalg indicates which
+     * initialisation vector we're using */
+    memcpy(s->core, hash->vt->extra, sizeof(s->core));
+    sha512_block_setup(&s->blk);
 }
 
-void SHA384_Final(SHA512_State *s, unsigned char *digest) {
-    unsigned char biggerDigest[512 / 8];
-    SHA512_Final(s, biggerDigest);
-    memcpy(digest, biggerDigest, 384 / 8);
+static void sha512_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha512_sw *copy = container_of(hcopy, sha512_sw, hash);
+    sha512_sw *orig = container_of(horig, sha512_sw, hash);
+
+    memcpy(copy, orig, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
 }
 
-void SHA512_Simple(const void *p, int len, unsigned char *output) {
-    SHA512_State s;
+static void sha512_sw_free(ssh_hash *hash)
+{
+    sha512_sw *s = container_of(hash, sha512_sw, hash);
 
-    SHA512_Init(&s);
-    put_data(&s, p, len);
-    SHA512_Final(&s, output);
-    smemclr(&s, sizeof(s));
+    smemclr(s, sizeof(*s));
+    sfree(s);
 }
 
-void SHA384_Simple(const void *p, int len, unsigned char *output) {
-    SHA512_State s;
+static void sha512_sw_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha512_sw *s = BinarySink_DOWNCAST(bs, sha512_sw);
 
-    SHA384_Init(&s);
-    put_data(&s, p, len);
-    SHA384_Final(&s, output);
-    smemclr(&s, sizeof(s));
+    while (len > 0)
+        if (sha512_block_write(&s->blk, &vp, &len))
+            sha512_sw_block(s->core, s->blk.block);
 }
 
+static void sha512_sw_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha512_sw *s = container_of(hash, sha512_sw, hash);
+
+    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
+    { // WINSCP
+    size_t i; // WINSCP
+    for (i = 0; i < hash->vt->hlen / 8; i++)
+        PUT_64BIT_MSB_FIRST(digest + 8*i, s->core[i]);
+    }  // WINSCP
+}
+
+const ssh_hashalg ssh_sha512_sw = {
+    // WINSCP
+    /*.new =*/ sha512_sw_new,
+    /*.reset =*/ sha512_sw_reset,
+    /*.copyfrom =*/ sha512_sw_copyfrom,
+    /*.digest =*/ sha512_sw_digest,
+    /*.free =*/ sha512_sw_free,
+    /*.hlen =*/ 64,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_ANNOTATED("SHA-512", "unaccelerated"),
+    /*.extra =*/ sha512_initial_state,
+};
+
+const ssh_hashalg ssh_sha384_sw = {
+    // WINSCP
+    /*.new =*/ sha512_sw_new,
+    /*.reset =*/ sha512_sw_reset,
+    /*.copyfrom =*/ sha512_sw_copyfrom,
+    /*.digest =*/ sha512_sw_digest,
+    /*.free =*/ sha512_sw_free,
+    /*.hlen =*/ 48,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_ANNOTATED("SHA-384", "unaccelerated"),
+    /*.extra =*/ sha384_initial_state,
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-512 using Arm NEON.
+ */
+
+#if HW_SHA512 == HW_SHA512_NEON
+
 /*
- * Thin abstraction for things where hashes are pluggable.
+ * Manually set the target architecture, if we decided above that we
+ * need to.
  */
+#ifdef USE_CLANG_ATTR_TARGET_AARCH64
+/*
+ * A spot of cheating: redefine some ACLE feature macros before
+ * including arm_neon.h. Otherwise we won't get the SHA intrinsics
+ * defined by that header, because it will be looking at the settings
+ * for the whole translation unit rather than the ones we're going to
+ * put on some particular functions using __attribute__((target)).
+ */
+#define __ARM_NEON 1
+#define __ARM_FEATURE_CRYPTO 1
+#define FUNC_ISA __attribute__ ((target("neon,sha3")))
+#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
+
+#ifndef FUNC_ISA
+#define FUNC_ISA
+#endif
+
+#ifdef USE_ARM64_NEON_H
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+static bool sha512_hw_available(void)
+{
+    /*
+     * For Arm, we delegate to a per-platform detection function (see
+     * explanation in sshaes.c).
+     */
+    return platform_sha512_hw_available();
+}
 
-struct sha512_hash {
-    SHA512_State state;
-    ssh_hash hash;
+#if defined __clang__
+/*
+ * As of 2020-12-24, I've found that clang doesn't provide the SHA-512
+ * NEON intrinsics. So I define my own set using inline assembler, and
+ * use #define to effectively rename them over the top of the standard
+ * names.
+ *
+ * The aim of that #define technique is that it should avoid a build
+ * failure if these intrinsics _are_ defined in <arm_neon.h>.
+ * Obviously it would be better in that situation to switch back to
+ * using the real intrinsics, but until I see a version of clang that
+ * supports them, I won't know what version number to test in the
+ * ifdef.
+ */
+static inline FUNC_ISA
+uint64x2_t vsha512su0q_u64_asm(uint64x2_t x, uint64x2_t y) {
+    __asm__("sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y));
+    return x;
+}
+static inline FUNC_ISA
+uint64x2_t vsha512su1q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
+    __asm__("sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z));
+    return x;
+}
+static inline FUNC_ISA
+uint64x2_t vsha512hq_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
+    __asm__("sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
+    return x;
+}
+static inline FUNC_ISA
+uint64x2_t vsha512h2q_u64_asm(uint64x2_t x, uint64x2_t y, uint64x2_t z) {
+    __asm__("sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z));
+    return x;
+}
+#undef vsha512su0q_u64
+#define vsha512su0q_u64 vsha512su0q_u64_asm
+#undef vsha512su1q_u64
+#define vsha512su1q_u64 vsha512su1q_u64_asm
+#undef vsha512hq_u64
+#define vsha512hq_u64 vsha512hq_u64_asm
+#undef vsha512h2q_u64
+#define vsha512h2q_u64 vsha512h2q_u64_asm
+#endif /* defined __clang__ */
+
+typedef struct sha512_neon_core sha512_neon_core;
+struct sha512_neon_core {
+    uint64x2_t ab, cd, ef, gh;
 };
 
-static ssh_hash *sha512_new(const ssh_hashalg *alg)
+FUNC_ISA
+static inline uint64x2_t sha512_neon_load_input(const uint8_t *p)
 {
-    struct sha512_hash *h = snew(struct sha512_hash);
-    SHA512_Init(&h->state);
-    h->hash.vt = alg;
-    BinarySink_DELEGATE_INIT(&h->hash, &h->state);
-    return &h->hash;
+    return vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(p)));
 }
 
-static ssh_hash *sha512_copy(ssh_hash *hashold)
+FUNC_ISA
+static inline uint64x2_t sha512_neon_schedule_update(
+    uint64x2_t m8, uint64x2_t m7, uint64x2_t m4, uint64x2_t m3, uint64x2_t m1)
 {
-    struct sha512_hash *hold, *hnew;
-    ssh_hash *hashnew = sha512_new(hashold->vt);
+    /*
+     * vsha512su0q_u64() takes words from a long way back in the
+     * schedule and performs the sigma_0 half of the computation of
+     * the next two 64-bit message-schedule words.
+     *
+     * vsha512su1q_u64() combines the result of that with the sigma_1
+     * steps, to output the finished version of those two words. The
+     * total amount of input data it requires fits nicely into three
+     * 128-bit vector registers, but one of those registers is
+     * misaligned compared to the 128-bit chunks that the message
+     * schedule is stored in. So we use vextq_u64 to make one of its
+     * input words out of the second half of m4 and the first half of
+     * m3.
+     */
+    return vsha512su1q_u64(vsha512su0q_u64(m8, m7), m1, vextq_u64(m4, m3, 1));
+}
+
+FUNC_ISA
+static inline void sha512_neon_round2(
+    unsigned round_index, uint64x2_t schedule_words,
+    uint64x2_t *ab, uint64x2_t *cd, uint64x2_t *ef, uint64x2_t *gh)
+{
+    /*
+     * vsha512hq_u64 performs the Sigma_1 and Ch half of the
+     * computation of two rounds of SHA-512 (including feeding back
+     * one of the outputs from the first of those half-rounds into the
+     * second one).
+     *
+     * vsha512h2q_u64 combines the result of that with the Sigma_0 and
+     * Maj steps, and outputs one 128-bit vector that replaces the gh
+     * piece of the input hash state, and a second that updates cd by
+     * addition.
+     *
+     * Similarly to vsha512su1q_u64 above, some of the input registers
+     * expected by these instructions are misaligned by 64 bits
+     * relative to the chunks we've divided the hash state into, so we
+     * have to start by making 'de' and 'fg' words out of our input
+     * cd,ef,gh, using vextq_u64.
+     *
+     * Also, one of the inputs to vsha512hq_u64 is expected to contain
+     * the results of summing gh + two round constants + two words of
+     * message schedule, but the two words of the message schedule
+     * have to be the opposite way round in the vector register from
+     * the way that vsha512su1q_u64 output them. Hence, there's
+     * another vextq_u64 in here that swaps the two halves of the
+     * initial_sum vector register.
+     *
+     * (This also means that I don't have to prepare a specially
+     * reordered version of the sha512_round_constants[] array: as
+     * long as I'm unavoidably doing a swap at run time _anyway_, I
+     * can load from the normally ordered version of that array, and
+     * just take care to fold in that data _before_ the swap rather
+     * than after.)
+     */
 
-    hold = container_of(hashold, struct sha512_hash, hash);
-    hnew = container_of(hashnew, struct sha512_hash, hash);
+    /* Load two round constants, with the first one in the low half */
+    uint64x2_t round_constants = vld1q_u64(
+        sha512_round_constants + round_index);
 
-    hnew->state = hold->state;
-    BinarySink_COPIED(&hnew->state);
+    /* Add schedule words to round constants */
+    uint64x2_t initial_sum = vaddq_u64(schedule_words, round_constants);
 
-    return hashnew;
+    /* Swap that sum around so the word used in the first of the two
+     * rounds is in the _high_ half of the vector, matching where h
+     * lives in the gh vector */
+    uint64x2_t swapped_initial_sum = vextq_u64(initial_sum, initial_sum, 1);
+
+    /* Add gh to that, now that they're matching ways round */
+    uint64x2_t sum = vaddq_u64(swapped_initial_sum, *gh);
+
+    /* Make the misaligned de and fg words */
+    uint64x2_t de = vextq_u64(*cd, *ef, 1);
+    uint64x2_t fg = vextq_u64(*ef, *gh, 1);
+
+    /* Now we're ready to put all the pieces together. The output from
+     * vsha512h2q_u64 can be used directly as the new gh, and the
+     * output from vsha512hq_u64 is simultaneously the intermediate
+     * value passed to h2 and the thing you have to add on to cd. */
+    uint64x2_t intermed = vsha512hq_u64(sum, fg, de);
+    *gh = vsha512h2q_u64(intermed, *cd, *ab);
+    *cd = vaddq_u64(*cd, intermed);
 }
 
-static void sha512_free(ssh_hash *hash)
+FUNC_ISA
+static inline void sha512_neon_block(sha512_neon_core *core, const uint8_t *p)
 {
-    struct sha512_hash *h = container_of(hash, struct sha512_hash, hash);
+    uint64x2_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+    uint64x2_t ab = core->ab, cd = core->cd, ef = core->ef, gh = core->gh;
+
+    s0 = sha512_neon_load_input(p + 16*0);
+    sha512_neon_round2(0, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_load_input(p + 16*1);
+    sha512_neon_round2(2, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_load_input(p + 16*2);
+    sha512_neon_round2(4, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_load_input(p + 16*3);
+    sha512_neon_round2(6, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_load_input(p + 16*4);
+    sha512_neon_round2(8, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_load_input(p + 16*5);
+    sha512_neon_round2(10, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_load_input(p + 16*6);
+    sha512_neon_round2(12, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_load_input(p + 16*7);
+    sha512_neon_round2(14, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(16, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(18, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(20, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(22, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(24, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(26, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(28, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(30, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(32, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(34, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(36, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(38, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(40, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(42, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(44, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(46, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(48, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(50, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(52, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(54, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(56, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(58, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(60, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(62, s7, &cd, &ef, &gh, &ab);
+    s0 = sha512_neon_schedule_update(s0, s1, s4, s5, s7);
+    sha512_neon_round2(64, s0, &ab, &cd, &ef, &gh);
+    s1 = sha512_neon_schedule_update(s1, s2, s5, s6, s0);
+    sha512_neon_round2(66, s1, &gh, &ab, &cd, &ef);
+    s2 = sha512_neon_schedule_update(s2, s3, s6, s7, s1);
+    sha512_neon_round2(68, s2, &ef, &gh, &ab, &cd);
+    s3 = sha512_neon_schedule_update(s3, s4, s7, s0, s2);
+    sha512_neon_round2(70, s3, &cd, &ef, &gh, &ab);
+    s4 = sha512_neon_schedule_update(s4, s5, s0, s1, s3);
+    sha512_neon_round2(72, s4, &ab, &cd, &ef, &gh);
+    s5 = sha512_neon_schedule_update(s5, s6, s1, s2, s4);
+    sha512_neon_round2(74, s5, &gh, &ab, &cd, &ef);
+    s6 = sha512_neon_schedule_update(s6, s7, s2, s3, s5);
+    sha512_neon_round2(76, s6, &ef, &gh, &ab, &cd);
+    s7 = sha512_neon_schedule_update(s7, s0, s3, s4, s6);
+    sha512_neon_round2(78, s7, &cd, &ef, &gh, &ab);
+
+    core->ab = vaddq_u64(core->ab, ab);
+    core->cd = vaddq_u64(core->cd, cd);
+    core->ef = vaddq_u64(core->ef, ef);
+    core->gh = vaddq_u64(core->gh, gh);
+}
 
-    smemclr(h, sizeof(*h));
-    sfree(h);
+typedef struct sha512_neon {
+    sha512_neon_core core;
+    sha512_block blk;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha512_neon;
+
+static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len);
+
+static ssh_hash *sha512_neon_new(const ssh_hashalg *alg)
+{
+    if (!sha512_hw_available_cached())
+        return NULL;
+
+    sha512_neon *s = snew(sha512_neon);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha512_neon_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
 }
 
-static void sha512_final(ssh_hash *hash, unsigned char *output)
+static void sha512_neon_reset(ssh_hash *hash)
 {
-    struct sha512_hash *h = container_of(hash, struct sha512_hash, hash);
-    SHA512_Final(&h->state, output);
-    sha512_free(hash);
+    sha512_neon *s = container_of(hash, sha512_neon, hash);
+    const uint64_t *iv = (const uint64_t *)hash->vt->extra;
+
+    s->core.ab = vld1q_u64(iv);
+    s->core.cd = vld1q_u64(iv+2);
+    s->core.ef = vld1q_u64(iv+4);
+    s->core.gh = vld1q_u64(iv+6);
+
+    sha512_block_setup(&s->blk);
 }
 
-const ssh_hashalg ssh_sha512 = {
-    sha512_new, sha512_copy, sha512_final, sha512_free,
-    64, BLKSIZE, HASHALG_NAMES_BARE("SHA-512"),
+static void sha512_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha512_neon *copy = container_of(hcopy, sha512_neon, hash);
+    sha512_neon *orig = container_of(horig, sha512_neon, hash);
+
+    *copy = *orig; /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+}
+
+static void sha512_neon_free(ssh_hash *hash)
+{
+    sha512_neon *s = container_of(hash, sha512_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha512_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha512_neon *s = BinarySink_DOWNCAST(bs, sha512_neon);
+
+    while (len > 0)
+        if (sha512_block_write(&s->blk, &vp, &len))
+            sha512_neon_block(&s->core, s->blk.block);
+}
+
+static void sha512_neon_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha512_neon *s = container_of(hash, sha512_neon, hash);
+
+    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
+    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
+    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
+    vst1q_u8(digest+48, vrev64q_u8(vreinterpretq_u8_u64(s->core.gh)));
+}
+
+static void sha384_neon_digest(ssh_hash *hash, uint8_t *digest)
+{
+    sha512_neon *s = container_of(hash, sha512_neon, hash);
+
+    sha512_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    vst1q_u8(digest,    vrev64q_u8(vreinterpretq_u8_u64(s->core.ab)));
+    vst1q_u8(digest+16, vrev64q_u8(vreinterpretq_u8_u64(s->core.cd)));
+    vst1q_u8(digest+32, vrev64q_u8(vreinterpretq_u8_u64(s->core.ef)));
+}
+
+const ssh_hashalg ssh_sha512_hw = {
+    .new = sha512_neon_new,
+    .reset = sha512_neon_reset,
+    .copyfrom = sha512_neon_copyfrom,
+    .digest = sha512_neon_digest,
+    .free = sha512_neon_free,
+    .hlen = 64,
+    .blocklen = 128,
+    HASHALG_NAMES_ANNOTATED("SHA-512", "NEON accelerated"),
+    .extra = sha512_initial_state,
 };
 
-static ssh_hash *sha384_new(const ssh_hashalg *alg)
+const ssh_hashalg ssh_sha384_hw = {
+    .new = sha512_neon_new,
+    .reset = sha512_neon_reset,
+    .copyfrom = sha512_neon_copyfrom,
+    .digest = sha384_neon_digest,
+    .free = sha512_neon_free,
+    .hlen = 48,
+    .blocklen = 128,
+    HASHALG_NAMES_ANNOTATED("SHA-384", "NEON accelerated"),
+    .extra = sha384_initial_state,
+};
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-512. In this
+ * case, sha512_hw_new returns NULL (though it should also never be
+ * selected by sha512_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
+ */
+
+#elif HW_SHA512 == HW_SHA512_NONE
+
+static bool sha512_hw_available(void)
 {
-    struct sha512_hash *h = snew(struct sha512_hash);
-    SHA384_Init(&h->state);
-    h->hash.vt = alg;
-    BinarySink_DELEGATE_INIT(&h->hash, &h->state);
-    return &h->hash;
+    return false;
 }
 
-static void sha384_final(ssh_hash *hash, unsigned char *output)
+static ssh_hash *sha512_stub_new(const ssh_hashalg *alg)
 {
-    struct sha512_hash *h = container_of(hash, struct sha512_hash, hash);
-    SHA384_Final(&h->state, output);
-    sha512_free(hash);
+    return NULL;
 }
 
-const ssh_hashalg ssh_sha384 = {
-    sha384_new, sha512_copy, sha384_final, sha512_free,
-    48, BLKSIZE, HASHALG_NAMES_BARE("SHA-384"),
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static void sha512_stub_reset(ssh_hash *hash) STUB_BODY
+static void sha512_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
+static void sha512_stub_free(ssh_hash *hash) STUB_BODY
+static void sha512_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha512_hw = {
+    // WINSCP
+    /*.new =*/ sha512_stub_new,
+    /*.reset =*/ sha512_stub_reset,
+    /*.copyfrom =*/ sha512_stub_copyfrom,
+    /*.digest =*/ sha512_stub_digest,
+    /*.free =*/ sha512_stub_free,
+    /*.hlen =*/ 64,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_ANNOTATED("SHA-512", "!NONEXISTENT ACCELERATED VERSION!"),
+    NULL, // WINSCP
 };
+
+const ssh_hashalg ssh_sha384_hw = {
+    // WINSCP
+    /*.new =*/ sha512_stub_new,
+    /*.reset =*/ sha512_stub_reset,
+    /*.copyfrom =*/ sha512_stub_copyfrom,
+    /*.digest =*/ sha512_stub_digest,
+    /*.free =*/ sha512_stub_free,
+    /*.hlen =*/ 48,
+    /*.blocklen =*/ 128,
+    HASHALG_NAMES_ANNOTATED("SHA-384", "!NONEXISTENT ACCELERATED VERSION!"),
+    NULL, // WINSCP
+};
+
+#endif /* HW_SHA512 */

+ 91 - 51
source/putty/sshsha.c

@@ -98,8 +98,16 @@ static ssh_hash *sha1_select(const ssh_hashalg *alg)
 }
 
 const ssh_hashalg ssh_sha1 = {
-    sha1_select, NULL, NULL, NULL,
-    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"),
+    // WINSCP
+    /*.new =*/ sha1_select,
+    NULL,
+    NULL,
+    NULL,
+    NULL,
+    /*.hlen =*/ 20,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"),
+    NULL,
 };
 
 /* ----------------------------------------------------------------------
@@ -266,26 +274,28 @@ static ssh_hash *sha1_sw_new(const ssh_hashalg *alg)
 {
     sha1_sw *s = snew(sha1_sw);
 
-    memcpy(s->core, sha1_initial_state, sizeof(s->core));
-
-    sha1_block_setup(&s->blk);
-
     s->hash.vt = alg;
     BinarySink_INIT(s, sha1_sw_write);
     BinarySink_DELEGATE_INIT(&s->hash, s);
     return &s->hash;
 }
 
-static ssh_hash *sha1_sw_copy(ssh_hash *hash)
+static void sha1_sw_reset(ssh_hash *hash)
 {
     sha1_sw *s = container_of(hash, sha1_sw, hash);
-    sha1_sw *copy = snew(sha1_sw);
 
-    memcpy(copy, s, sizeof(*copy));
+    memcpy(s->core, sha1_initial_state, sizeof(s->core));
+    sha1_block_setup(&s->blk);
+}
+
+static void sha1_sw_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha1_sw *copy = container_of(hcopy, sha1_sw, hash);
+    sha1_sw *orig = container_of(horig, sha1_sw, hash);
+
+    memcpy(copy, orig, sizeof(*copy));
     BinarySink_COPIED(copy);
     BinarySink_DELEGATE_INIT(&copy->hash, copy);
-
-    return &copy->hash;
 }
 
 static void sha1_sw_free(ssh_hash *hash)
@@ -305,7 +315,7 @@ static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
             sha1_sw_block(s->core, s->blk.block);
 }
 
-static void sha1_sw_final(ssh_hash *hash, uint8_t *digest)
+static void sha1_sw_digest(ssh_hash *hash, uint8_t *digest)
 {
     sha1_sw *s = container_of(hash, sha1_sw, hash);
 
@@ -314,13 +324,20 @@ static void sha1_sw_final(ssh_hash *hash, uint8_t *digest)
     size_t i; // WINSCP
     for (i = 0; i < 5; i++)
         PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
-    sha1_sw_free(hash);
     } // WINSCP
 }
 
 const ssh_hashalg ssh_sha1_sw = {
-    sha1_sw_new, sha1_sw_copy, sha1_sw_final, sha1_sw_free,
-    20, 64, HASHALG_NAMES_BARE("SHA-1"), // WINSCP (removed "unaccelerated" annotation)
+    // WINSCP
+    /*.new =*/ sha1_sw_new,
+    /*.reset =*/ sha1_sw_reset,
+    /*.copyfrom =*/ sha1_sw_copyfrom,
+    /*.digest =*/ sha1_sw_digest,
+    /*.free =*/ sha1_sw_free,
+    /*.hlen =*/ 20,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_BARE("SHA-1"), // WINSCP (removed "unaccelerated" annotation)
+    NULL,
 };
 
 /* ----------------------------------------------------------------------
@@ -583,39 +600,42 @@ static sha1_ni *sha1_ni_alloc(void)
     return s;
 }
 
-FUNC_ISA static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
+static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
 {
     if (!sha1_hw_available_cached())
         return NULL;
 
     sha1_ni *s = sha1_ni_alloc();
 
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+FUNC_ISA static void sha1_ni_reset(ssh_hash *hash)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
     /* Initialise the core vectors in their storage order */
     s->core[0] = _mm_set_epi64x(
         0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
     s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
 
     sha1_block_setup(&s->blk);
-
-    s->hash.vt = alg;
-    BinarySink_INIT(s, sha1_ni_write);
-    BinarySink_DELEGATE_INIT(&s->hash, s);
-    return &s->hash;
 }
 
-static ssh_hash *sha1_ni_copy(ssh_hash *hash)
+static void sha1_ni_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
 {
-    sha1_ni *s = container_of(hash, sha1_ni, hash);
-    sha1_ni *copy = sha1_ni_alloc();
+    sha1_ni *copy = container_of(hcopy, sha1_ni, hash);
+    sha1_ni *orig = container_of(horig, sha1_ni, hash);
 
     void *ptf_save = copy->pointer_to_free;
-    *copy = *s; /* structure copy */
+    *copy = *orig; /* structure copy */
     copy->pointer_to_free = ptf_save;
 
     BinarySink_COPIED(copy);
     BinarySink_DELEGATE_INIT(&copy->hash, copy);
-
-    return &copy->hash;
 }
 
 static void sha1_ni_free(ssh_hash *hash)
@@ -636,7 +656,7 @@ static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
             sha1_ni_block(s->core, s->blk.block);
 }
 
-FUNC_ISA static void sha1_ni_final(ssh_hash *hash, uint8_t *digest)
+FUNC_ISA static void sha1_ni_digest(ssh_hash *hash, uint8_t *digest)
 {
     sha1_ni *s = container_of(hash, sha1_ni, hash);
 
@@ -655,13 +675,17 @@ FUNC_ISA static void sha1_ni_final(ssh_hash *hash, uint8_t *digest)
     /* Finally, store the leftover word */
     uint32_t e = _mm_extract_epi32(s->core[1], 3);
     PUT_32BIT_MSB_FIRST(digest + 16, e);
-
-    sha1_ni_free(hash);
 }
 
 const ssh_hashalg ssh_sha1_hw = {
-    sha1_ni_new, sha1_ni_copy, sha1_ni_final, sha1_ni_free,
-    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"),
+    .new = sha1_ni_new,
+    .reset = sha1_ni_reset,
+    .copyfrom = sha1_ni_copyfrom,
+    .digest = sha1_ni_digest,
+    .free = sha1_ni_free,
+    .hlen = 20,
+    .blocklen = 64,
+    HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -823,28 +847,31 @@ static ssh_hash *sha1_neon_new(const ssh_hashalg *alg)
 
     sha1_neon *s = snew(sha1_neon);
 
-    s->core.abcd = vld1q_u32(sha1_initial_state);
-    s->core.e = sha1_initial_state[4];
-
-    sha1_block_setup(&s->blk);
-
     s->hash.vt = alg;
     BinarySink_INIT(s, sha1_neon_write);
     BinarySink_DELEGATE_INIT(&s->hash, s);
     return &s->hash;
 }
 
-static ssh_hash *sha1_neon_copy(ssh_hash *hash)
+static void sha1_neon_reset(ssh_hash *hash)
 {
     sha1_neon *s = container_of(hash, sha1_neon, hash);
-    sha1_neon *copy = snew(sha1_neon);
 
-    *copy = *s; /* structure copy */
+    s->core.abcd = vld1q_u32(sha1_initial_state);
+    s->core.e = sha1_initial_state[4];
+
+    sha1_block_setup(&s->blk);
+}
+
+static void sha1_neon_copyfrom(ssh_hash *hcopy, ssh_hash *horig)
+{
+    sha1_neon *copy = container_of(hcopy, sha1_neon, hash);
+    sha1_neon *orig = container_of(horig, sha1_neon, hash);
+
+    *copy = *orig; /* structure copy */
 
     BinarySink_COPIED(copy);
     BinarySink_DELEGATE_INIT(&copy->hash, copy);
-
-    return &copy->hash;
 }
 
 static void sha1_neon_free(ssh_hash *hash)
@@ -863,19 +890,24 @@ static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len)
             sha1_neon_block(&s->core, s->blk.block);
 }
 
-static void sha1_neon_final(ssh_hash *hash, uint8_t *digest)
+static void sha1_neon_digest(ssh_hash *hash, uint8_t *digest)
 {
     sha1_neon *s = container_of(hash, sha1_neon, hash);
 
     sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
     vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
     PUT_32BIT_MSB_FIRST(digest + 16, s->core.e);
-    sha1_neon_free(hash);
 }
 
 const ssh_hashalg ssh_sha1_hw = {
-    sha1_neon_new, sha1_neon_copy, sha1_neon_final, sha1_neon_free,
-    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"),
+    .new = sha1_neon_new,
+    .reset = sha1_neon_reset,
+    .copyfrom = sha1_neon_copyfrom,
+    .digest = sha1_neon_digest,
+    .free = sha1_neon_free,
+    .hlen = 20,
+    .blocklen = 64,
+    HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -900,14 +932,22 @@ static ssh_hash *sha1_stub_new(const ssh_hashalg *alg)
 
 #define STUB_BODY { unreachable("Should never be called"); }
 
-static ssh_hash *sha1_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha1_stub_reset(ssh_hash *hash) STUB_BODY
+static void sha1_stub_copyfrom(ssh_hash *hash, ssh_hash *orig) STUB_BODY
 static void sha1_stub_free(ssh_hash *hash) STUB_BODY
-static void sha1_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+static void sha1_stub_digest(ssh_hash *hash, uint8_t *digest) STUB_BODY
 
 const ssh_hashalg ssh_sha1_hw = {
-    sha1_stub_new, sha1_stub_copy, sha1_stub_final, sha1_stub_free,
-    20, 64, HASHALG_NAMES_ANNOTATED(
-        "SHA-1", "!NONEXISTENT ACCELERATED VERSION!"),
+    // WINSCP
+    /*.new =*/ sha1_stub_new,
+    /*.reset =*/ sha1_stub_reset,
+    /*.copyfrom =*/ sha1_stub_copyfrom,
+    /*.digest =*/ sha1_stub_digest,
+    /*.free =*/ sha1_stub_free,
+    /*.hlen =*/ 20,
+    /*.blocklen =*/ 64,
+    HASHALG_NAMES_ANNOTATED("SHA-1", "!NONEXISTENT ACCELERATED VERSION!"),
+    NULL,
 };
 
 #endif /* HW_SHA1 */

+ 26 - 0
source/putty/utils.c

@@ -1065,3 +1065,29 @@ size_t encode_utf8(void *output, unsigned long ch)
     }
     return p - start;
 }
+
+void memxor(uint8_t *out, const uint8_t *in1, const uint8_t *in2, size_t size)
+{
+    switch (size & 15) {
+      case 0:
+        while (size >= 16) {
+            size -= 16;
+                   *out++ = *in1++ ^ *in2++;
+          case 15: *out++ = *in1++ ^ *in2++;
+          case 14: *out++ = *in1++ ^ *in2++;
+          case 13: *out++ = *in1++ ^ *in2++;
+          case 12: *out++ = *in1++ ^ *in2++;
+          case 11: *out++ = *in1++ ^ *in2++;
+          case 10: *out++ = *in1++ ^ *in2++;
+          case 9:  *out++ = *in1++ ^ *in2++;
+          case 8:  *out++ = *in1++ ^ *in2++;
+          case 7:  *out++ = *in1++ ^ *in2++;
+          case 6:  *out++ = *in1++ ^ *in2++;
+          case 5:  *out++ = *in1++ ^ *in2++;
+          case 4:  *out++ = *in1++ ^ *in2++;
+          case 3:  *out++ = *in1++ ^ *in2++;
+          case 2:  *out++ = *in1++ ^ *in2++;
+          case 1:  *out++ = *in1++ ^ *in2++;
+        }
+    }
+}

Daži faili netika attēloti, jo izmaiņu fails ir pārāk liels