
PuTTY snapshot a5911f76 (Fix null dereference in ssh_unthrottle - 2019-01-29)

Source commit: fe665ca91c847abb60aa13f448c784a6ee5d63f0
Martin Prikryl, 6 years ago
Parent / current commit: c1f64d6b16

+ 10 - 27
source/putty/WINDOWS/winmiscs.c

@@ -16,33 +16,6 @@ void smemclr(void *b, size_t n) {
 }
 #endif
 
-#ifdef DEBUG
-static FILE *debug_fp = NULL;
-static HANDLE debug_hdl = INVALID_HANDLE_VALUE;
-static int debug_got_console = 0;
-
-void dputs(const char *buf)
-{
-    DWORD dw;
-
-    if (!debug_got_console) {
-	if (AllocConsole()) {
-	    debug_got_console = 1;
-	    debug_hdl = GetStdHandle(STD_OUTPUT_HANDLE);
-	}
-    }
-    if (!debug_fp) {
-	debug_fp = fopen("debug.log", "w");
-    }
-
-    if (debug_hdl != INVALID_HANDLE_VALUE) {
-	WriteFile(debug_hdl, buf, strlen(buf), &dw, NULL);
-    }
-    fputs(buf, debug_fp);
-    fflush(debug_fp);
-}
-#endif
-
 #ifdef MINEFIELD
 /*
  * Minefield - a Windows equivalent for Electric Fence
@@ -283,4 +256,14 @@ bool platform_aes_hw_available(void)
     return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
 }
 
+bool platform_sha256_hw_available(void)
+{
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+}
+
+bool platform_sha1_hw_available(void)
+{
+    return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+}
+
 #endif

+ 21 - 4
source/putty/mpint.c

@@ -36,6 +36,7 @@ static inline BignumInt mp_word(mp_int *x, size_t i)
 static mp_int *mp_make_sized(size_t nw)
 {
     mp_int *x = snew_plus(mp_int, nw * sizeof(BignumInt));
+    assert(nw);                   /* we outlaw the zero-word mp_int */
     x->nw = nw;
     x->w = snew_plus_get_aux(x);
     mp_clear(x);
@@ -140,8 +141,9 @@ void mp_cond_clear(mp_int *x, unsigned clear)
  */
 static mp_int *mp_from_bytes_int(ptrlen bytes, size_t m, size_t c)
 {
-    mp_int *n = mp_make_sized(
-        (bytes.len + BIGNUM_INT_BYTES - 1) / BIGNUM_INT_BYTES);
+    size_t nw = (bytes.len + BIGNUM_INT_BYTES - 1) / BIGNUM_INT_BYTES;
+    nw = size_t_max(nw, 1);
+    mp_int *n = mp_make_sized(nw);
     for (size_t i = 0; i < bytes.len; i++)
         n->w[i / BIGNUM_INT_BYTES] |=
             (BignumInt)(((const unsigned char *)bytes.ptr)[m*i+c]) <<
@@ -184,7 +186,7 @@ mp_int *mp_from_decimal_pl(ptrlen decimal)
     size_t words = bits / BIGNUM_INT_BITS + 1;
 
     mp_int *x = mp_make_sized(words);
-    for (size_t i = 0;; i++) {
+    for (size_t i = 0; i < decimal.len; i++) {
         mp_add_integer_into(x, x, ((char *)decimal.ptr)[i] - '0');
 
         if (i+1 == decimal.len)
@@ -211,6 +213,7 @@ mp_int *mp_from_hex_pl(ptrlen hex)
     assert(hex.len <= (~(size_t)0) / 4);
     size_t bits = hex.len * 4;
     size_t words = (bits + BIGNUM_INT_BITS - 1) / BIGNUM_INT_BITS;
+    words = size_t_max(words, 1);
     mp_int *x = mp_make_sized(words);
     for (size_t nibble = 0; nibble < hex.len; nibble++) {
         BignumInt digit = ((char *)hex.ptr)[hex.len-1 - nibble];
@@ -1077,7 +1080,8 @@ void mp_rshift_fixed_into(mp_int *r, mp_int *a, size_t bits)
 mp_int *mp_rshift_fixed(mp_int *x, size_t bits)
 {
     size_t words = bits / BIGNUM_INT_BITS;
-    mp_int *r = mp_make_sized(x->nw - size_t_min(x->nw, words));
+    size_t nw = x->nw - size_t_min(x->nw, words);
+    mp_int *r = mp_make_sized(size_t_max(nw, 1));
     mp_rshift_fixed_into(r, x, bits);
     return r;
 }
@@ -1148,6 +1152,7 @@ mp_int *mp_invert_mod_2to(mp_int *x, size_t p)
     assert(p > 0);
 
     size_t rw = (p + BIGNUM_INT_BITS - 1) / BIGNUM_INT_BITS;
+    rw = size_t_max(rw, 1);
     mp_int *r = mp_make_sized(rw);
 
     size_t mul_scratchsize = mp_mul_scratchspace(2*rw, rw, rw);
@@ -2109,6 +2114,11 @@ void mp_min_into(mp_int *r, mp_int *x, mp_int *y)
     mp_select_into(r, x, y, mp_cmp_hs(x, y));
 }
 
+void mp_max_into(mp_int *r, mp_int *x, mp_int *y)
+{
+    mp_select_into(r, y, x, mp_cmp_hs(x, y));
+}
+
 mp_int *mp_min(mp_int *x, mp_int *y)
 {
     mp_int *r = mp_make_sized(size_t_min(x->nw, y->nw));
@@ -2116,6 +2126,13 @@ mp_int *mp_min(mp_int *x, mp_int *y)
     return r;
 }
 
+mp_int *mp_max(mp_int *x, mp_int *y)
+{
+    mp_int *r = mp_make_sized(size_t_max(x->nw, y->nw));
+    mp_max_into(r, x, y);
+    return r;
+}
+
 mp_int *mp_power_2(size_t power)
 {
     mp_int *x = mp_new(power + 1);
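
A small illustration of what the size_t_max(nw, 1) clamps buy (not part of the diff; it only uses the mp_* API declared in mpint.h): zero-length inputs now yield a valid one-word zero instead of tripping the new assertion in mp_make_sized().

    mp_int *z = mp_from_bytes_be(make_ptrlen("", 0)); /* 0 bytes -> one-word zero */
    mp_int *r = mp_rshift_fixed(z, 64);               /* shift that empties it: still 1 word */
    assert(mp_eq_integer(z, 0) && mp_eq_integer(r, 0));
    mp_free(z);
    mp_free(r);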

+ 4 - 1
source/putty/mpint.h

@@ -152,10 +152,13 @@ unsigned mp_hs_integer(mp_int *x, uintmax_t n);
 unsigned mp_eq_integer(mp_int *x, uintmax_t n);
 
 /*
- * Take the minimum of two mp_ints, without using a conditional branch.
+ * Take the minimum or maximum of two mp_ints, without using a
+ * conditional branch.
  */
 void mp_min_into(mp_int *r, mp_int *x, mp_int *y);
+void mp_max_into(mp_int *r, mp_int *x, mp_int *y);
 mp_int *mp_min(mp_int *x, mp_int *y);
+mp_int *mp_max(mp_int *x, mp_int *y);
 
 /*
  * Diagnostic function. Writes out x in hex to the supplied stdio
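
For readers unfamiliar with the constant-time idiom behind these: mp_cmp_hs(x, y) yields 1 when x >= y, and mp_select_into picks one of its two source operands according to that flag by masking rather than branching, which is how the mp_min_into/mp_max_into bodies in mpint.c above are built. A minimal standalone sketch of the same branch-free selection on plain 32-bit words (illustrative only, not PuTTY code):

    #include <stdint.h>
    #include <stdio.h>

    /* Build an all-zeroes or all-ones mask from a 0/1 flag and mix the
     * operands with it, so the choice never becomes a conditional branch. */
    static uint32_t select_u32(uint32_t x, uint32_t y, unsigned which)
    {
        uint32_t mask = -(uint32_t)(which & 1);  /* 0x00000000 or 0xffffffff */
        return (x & ~mask) | (y & mask);         /* which==0 -> x, which==1 -> y */
    }

    int main(void)
    {
        uint32_t a = 17, b = 42;
        unsigned a_hs_b = (a >= b);              /* plays the role of mp_cmp_hs(x, y) */
        /* min picks y when x >= y; max just swaps the operands, as in the diff. */
        printf("min=%u max=%u\n",
               (unsigned)select_u32(a, b, a_hs_b),
               (unsigned)select_u32(b, a, a_hs_b));
        return 0;
    }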

+ 2 - 1
source/putty/ssh.c

@@ -1013,7 +1013,8 @@ static void ssh_unthrottle(Backend *be, int bufsize)
 {
     Ssh *ssh = container_of(be, Ssh, backend);
 
-    ssh_stdout_unthrottle(ssh->cl, bufsize);
+    if (ssh->cl)
+        ssh_stdout_unthrottle(ssh->cl, bufsize);
 }
 
 static bool ssh_connected(Backend *be)

+ 11 - 1
source/putty/ssh.h

@@ -664,7 +664,9 @@ struct ssh_hashalg {
     void (*free)(ssh_hash *);
     int hlen; /* output length in bytes */
     int blocklen; /* length of the hash's input block, or 0 for N/A */
-    const char *text_name;
+    const char *text_basename;     /* the semantic name of the hash */
+    const char *annotation;   /* extra info, e.g. which of multiple impls */
+    const char *text_name;    /* both combined, e.g. "SHA-n (unaccelerated)" */
 };
 
 #define ssh_hash_new(alg) ((alg)->new(alg))
@@ -673,6 +675,12 @@ struct ssh_hashalg {
 #define ssh_hash_free(ctx) ((ctx)->vt->free(ctx))
 #define ssh_hash_alg(ctx) ((ctx)->vt)
 
+/* Handy macros for defining all those text-name fields at once */
+#define HASHALG_NAMES_BARE(base) \
+    base, NULL, base
+#define HASHALG_NAMES_ANNOTATED(base, annotation) \
+    base, annotation, base " (" annotation ")"
+
 void hash_simple(const ssh_hashalg *alg, ptrlen data, void *output);
 
 struct ssh_kex {
@@ -856,6 +864,8 @@ extern const ssh_compression_alg ssh_zlib;
  * platform subdirectory.
  */
 bool platform_aes_hw_available(void);
+bool platform_sha256_hw_available(void);
+bool platform_sha1_hw_available(void);
 
 /*
  * PuTTY version number formatted as an SSH version string. 
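
To make the macros concrete, here is how the sshsh256.c vtable later in this commit expands (nothing beyond the definitions above is assumed):

    const ssh_hashalg ssh_sha256_sw = {
        sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
        32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
    };

    /* The last three initialisers become
     *     "SHA-256", "unaccelerated", "SHA-256" " (" "unaccelerated" ")"
     * i.e. text_basename = "SHA-256", annotation = "unaccelerated", and
     * text_name = "SHA-256 (unaccelerated)" by compile-time concatenation
     * of adjacent string literals. HASHALG_NAMES_BARE("MD5") fills in
     * "MD5", NULL, "MD5". */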

+ 2 - 3
source/putty/ssh2bpp.c

@@ -514,7 +514,7 @@ static void ssh2_bpp_handle_input(BinaryPacketProtocol *bpp)
 
         s->length = s->payload + 5;
 
-        DTS_CONSUME(s->stats, in, s->packetlen);
+        dts_consume(&s->stats->in, s->packetlen);
 
         s->pktin->sequence = s->in.sequence++;
 
@@ -762,8 +762,7 @@ static void ssh2_bpp_format_packet_inner(struct ssh2_bpp_state *s, PktOut *pkt)
 
     s->out.sequence++;       /* whether or not we MACed */
 
-    DTS_CONSUME(s->stats, out, origlen + padding);
-
+    dts_consume(&s->stats->out, origlen + padding);
 }
 
 static void ssh2_bpp_format_packet(struct ssh2_bpp_state *s, PktOut *pkt)

+ 5 - 5
source/putty/ssh2kex-client.c

@@ -94,7 +94,7 @@ void ssh2kex_coroutine(struct ssh2_transport_state *s, bool *aborted)
             ppl_logevent("Doing Diffie-Hellman key exchange using %d-bit "
                          "modulus and hash %s with a server-supplied group",
                          dh_modulus_bit_size(s->dh_ctx),
-                         s->kex_alg->hash->text_name);
+                         ssh_hash_alg(s->exhash)->text_name);
         } else {
             s->ppl.bpp->pls->kctx = SSH2_PKTCTX_DHGROUP;
             s->dh_ctx = dh_setup_group(s->kex_alg);
@@ -104,7 +104,7 @@ void ssh2kex_coroutine(struct ssh2_transport_state *s, bool *aborted)
             ppl_logevent("Doing Diffie-Hellman key exchange using %d-bit "
                          "modulus and hash %s with standard group \"%s\"",
                          dh_modulus_bit_size(s->dh_ctx),
-                         s->kex_alg->hash->text_name,
+                         ssh_hash_alg(s->exhash)->text_name,
                          s->kex_alg->groupname);
         }
 
@@ -180,7 +180,7 @@ void ssh2kex_coroutine(struct ssh2_transport_state *s, bool *aborted)
 
         ppl_logevent("Doing ECDH key exchange with curve %s and hash %s",
                      ssh_ecdhkex_curve_textname(s->kex_alg),
-                     s->kex_alg->hash->text_name);
+                     ssh_hash_alg(s->exhash)->text_name);
         s->ppl.bpp->pls->kctx = SSH2_PKTCTX_ECDHKEX;
 
         s->ecdh_key = ssh_ecdhkex_newkey(s->kex_alg);
@@ -314,7 +314,7 @@ void ssh2kex_coroutine(struct ssh2_transport_state *s, bool *aborted)
         }
 
         ppl_logevent("Doing GSSAPI (with Kerberos V5) Diffie-Hellman key "
-                     "exchange with hash %s", s->kex_alg->hash->text_name);
+                     "exchange with hash %s", ssh_hash_alg(s->exhash)->text_name);
         /* Now generate e for Diffie-Hellman. */
         seat_set_busy_status(s->ppl.seat, BUSY_CPU);
         s->e = dh_create_e(s->dh_ctx, s->nbits * 2);
@@ -513,7 +513,7 @@ void ssh2kex_coroutine(struct ssh2_transport_state *s, bool *aborted)
 
         assert(s->kex_alg->main_type == KEXTYPE_RSA);
         ppl_logevent("Doing RSA key exchange with hash %s",
-                     s->kex_alg->hash->text_name);
+                     ssh_hash_alg(s->exhash)->text_name);
         s->ppl.bpp->pls->kctx = SSH2_PKTCTX_RSAKEX;
         /*
          * RSA key exchange. First expect a KEXRSA_PUBKEY packet

+ 9 - 14
source/putty/ssh2transport.c

@@ -1261,8 +1261,7 @@ static void ssh2_transport_process_queue(PacketProtocolLayer *ppl)
     pktout = ssh_bpp_new_pktout(s->ppl.bpp, SSH2_MSG_NEWKEYS);
     pq_push(s->ppl.out_pq, pktout);
     /* Start counting down the outgoing-data limit for these cipher keys. */
-    s->stats->out.running = true;
-    s->stats->out.remaining = s->max_data_size;
+    dts_reset(&s->stats->out, s->max_data_size);
 
     /*
      * Force the BPP to synchronously marshal all packets up to and
@@ -1324,8 +1323,7 @@ static void ssh2_transport_process_queue(PacketProtocolLayer *ppl)
         return;
     }
     /* Start counting down the incoming-data limit for these cipher keys. */
-    s->stats->in.running = true;
-    s->stats->in.remaining = s->max_data_size;
+    dts_reset(&s->stats->in, s->max_data_size);
 
     /*
      * We've seen incoming NEWKEYS, so create and initialise
@@ -1490,10 +1488,10 @@ static void ssh2_transport_process_queue(PacketProtocolLayer *ppl)
         if (!s->rekey_class) {
             /* If we don't yet have any other reason to rekey, check
              * if we've hit our data limit in either direction. */
-            if (!s->stats->in.running) {
+            if (s->stats->in.expired) {
                 s->rekey_reason = "too much data received";
                 s->rekey_class = RK_NORMAL;
-            } else if (!s->stats->out.running) {
+            } else if (s->stats->out.expired) {
                 s->rekey_reason = "too much data sent";
                 s->rekey_class = RK_NORMAL;
             }
@@ -1511,9 +1509,8 @@ static void ssh2_transport_process_queue(PacketProtocolLayer *ppl)
                              s->rekey_reason);
                 /* Reset the counters, so that at least this message doesn't
                  * hit the event log _too_ often. */
-                s->stats->in.running = s->stats->out.running = true;
-                s->stats->in.remaining = s->stats->out.remaining =
-                    s->max_data_size;
+                dts_reset(&s->stats->in, s->max_data_size);
+                dts_reset(&s->stats->out, s->max_data_size);
                 (void) ssh2_transport_timer_update(s, 0);
                 s->rekey_class = RK_NONE;
             } else {
@@ -1910,11 +1907,9 @@ static void ssh2_transport_reconfigure(PacketProtocolLayer *ppl, Conf *conf)
         if (s->max_data_size < old_max_data_size) {
             unsigned long diff = old_max_data_size - s->max_data_size;
 
-            /* We must decrement both counters, so avoid short-circuit
-             * evaluation skipping one */
-            bool out_expired = DTS_CONSUME(s->stats, out, diff);
-            bool in_expired = DTS_CONSUME(s->stats, in, diff);
-            if (out_expired || in_expired)
+            dts_consume(&s->stats->out, diff);
+            dts_consume(&s->stats->in, diff);
+            if (s->stats->out.expired || s->stats->in.expired)
                 rekey_reason = "data limit lowered";
         } else {
             unsigned long diff = s->max_data_size - old_max_data_size;

+ 53 - 15
source/putty/sshaes.c

@@ -369,6 +369,29 @@ static inline void dumpslices_BignumInt(
  *
  * Source: 'A new combinational logic minimization technique with
  * applications to cryptology', https://eprint.iacr.org/2009/191
+ *
+ * As a minor speed optimisation, I use a modified version of the
+ * S-box which omits the additive constant 0x63, i.e. this S-box
+ * consists of only the field inversion and linear map components.
+ * Instead, the addition of the constant is deferred until after the
+ * subsequent ShiftRows and MixColumns stages, so that it happens at
+ * the same time as adding the next round key - and then we just make
+ * it _part_ of the round key, so it doesn't cost any extra
+ * instructions to add.
+ *
+ * (Obviously adding a constant to each byte commutes with ShiftRows,
+ * which only permutes the bytes. It also commutes with MixColumns:
+ * that's not quite so obvious, but since the effect of MixColumns is
+ * to multiply a constant polynomial M into each column, it is obvious
+ * that adding some polynomial K and then multiplying by M is
+ * equivalent to multiplying by M and then adding the product KM. And
+ * in fact, since the coefficients of M happen to sum to 1, it turns
+ * out that KM = K, so we don't even have to change the constant when
+ * we move it to the far side of MixColumns.)
+ *
+ * Of course, one knock-on effect of this is that the use of the S-box
+ * *during* key setup has to be corrected by manually adding on the
+ * constant afterwards!
  */
 
 /* Initial linear transformation for the forward S-box, from Fig 2 of
@@ -495,14 +518,14 @@ static inline void dumpslices_BignumInt(
         uintN_t t65 = t61 ^ t62;                        \
         uintN_t t66 = z1 ^ t63;                         \
         output[7] = t59 ^ t63;                          \
-        output[1] = ~(t56 ^ t62);                       \
-        output[0] = ~(t48 ^ t60);                       \
+        output[1] = t56 ^ t62;                          \
+        output[0] = t48 ^ t60;                          \
         uintN_t t67 = t64 ^ t65;                        \
         output[4] = t53 ^ t66;                          \
         output[3] = t51 ^ t66;                          \
         output[2] = t47 ^ t65;                          \
-        output[6] = ~(t64 ^ output[4]);                 \
-        output[5] = ~(t55 ^ t67);                       \
+        output[6] = t64 ^ output[4];                    \
+        output[5] = t55 ^ t67;                          \
         /* end */
 
 #define BITSLICED_SUBBYTES(output, input, uintN_t) do { \
@@ -520,23 +543,19 @@ static inline void dumpslices_BignumInt(
  * S_box.
  */
 #define SBOX_BACKWARD_TOP_TRANSFORM(input, uintN_t)     \
-    /* Initial subtraction of the constant */           \
-    uintN_t iv0 = ~input[0], iv1 = ~input[1];           \
-    uintN_t iv5 = ~input[5], iv6 = ~input[6];           \
-                                                        \
-    uintN_t y5 = input[4] ^ iv6;                        \
-    uintN_t y19 = input[3] ^ iv0;                       \
-    uintN_t itmp8 = y5 ^ iv0;                           \
-    uintN_t y4 = itmp8 ^ iv1;                           \
+    uintN_t y5 = input[4] ^ input[6];                   \
+    uintN_t y19 = input[3] ^ input[0];                  \
+    uintN_t itmp8 = y5 ^ input[0];                      \
+    uintN_t y4 = itmp8 ^ input[1];                      \
     uintN_t y9 = input[4] ^ input[3];                   \
     uintN_t y2 = y9 ^ y4;                               \
     uintN_t itmp9 = y2 ^ input[7];                      \
-    uintN_t y1 = y9 ^ iv0;                              \
+    uintN_t y1 = y9 ^ input[0];                         \
     uintN_t y6 = y5 ^ input[7];                         \
-    uintN_t y18 = y9 ^ iv5;                             \
+    uintN_t y18 = y9 ^ input[5];                        \
     uintN_t y7 = y18 ^ y2;                              \
     uintN_t y16 = y7 ^ y1;                              \
-    uintN_t y21 = y7 ^ iv1;                             \
+    uintN_t y21 = y7 ^ input[1];                        \
     uintN_t y3 = input[4] ^ input[7];                   \
     uintN_t y13 = y16 ^ y21;                            \
     uintN_t y8 = input[4] ^ y6;                         \
@@ -860,7 +879,15 @@ static void aes_sliced_key_setup(
             }
 
             if (sub) {
+                /* Apply the SubBytes transform to the key word. But
+                 * here we need to apply the _full_ SubBytes from the
+                 * spec, including the constant which our S-box leaves
+                 * out. */
                 BITSLICED_SUBBYTES(slices, slices, uint16_t);
+                slices[0] ^= 0xFFFF;
+                slices[1] ^= 0xFFFF;
+                slices[5] ^= 0xFFFF;
+                slices[6] ^= 0xFFFF;
             }
 
             if (rotate_and_round_constant) {
@@ -893,6 +920,17 @@ static void aes_sliced_key_setup(
     smemclr(inblk, sizeof(inblk));
     smemclr(slices, sizeof(slices));
 
+    /*
+     * Add the S-box constant to every round key after the first one,
+     * compensating for it being left out in the main cipher.
+     */
+    for (size_t i = 8; i < 8 * (sched_words/4); i += 8) {
+        sk->roundkeys_serial[i+0] ^= 0xFFFF;
+        sk->roundkeys_serial[i+1] ^= 0xFFFF;
+        sk->roundkeys_serial[i+5] ^= 0xFFFF;
+        sk->roundkeys_serial[i+6] ^= 0xFFFF;
+    }
+
     /*
      * Replicate that set of round keys into larger integers for the
      * parallel versions of the cipher.
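
The "KM = K" claim in the comment above is easy to check numerically: each MixColumns output byte is (2*a0) ^ (3*a1) ^ a2 ^ a3 over GF(2^8) for some rotation of the column's four bytes, and 2 ^ 3 ^ 1 ^ 1 = 1, so a column whose bytes are all the same constant is left unchanged. A small standalone verification (illustrative only, independent of the bitsliced code in the diff):

    #include <stdint.h>
    #include <stdio.h>

    /* GF(2^8) multiplication modulo the AES polynomial x^8+x^4+x^3+x+1 */
    static uint8_t gf_mul(uint8_t a, uint8_t b)
    {
        uint8_t r = 0;
        while (b) {
            if (b & 1) r ^= a;
            a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
            b >>= 1;
        }
        return r;
    }

    int main(void)
    {
        /* If every byte of a column is the constant c, one MixColumns output
         * byte is 2c ^ 3c ^ c ^ c = (2^3^1^1)*c = c. */
        for (unsigned c = 0; c < 256; c++) {
            uint8_t out = gf_mul(2, c) ^ gf_mul(3, c) ^ c ^ c;
            if (out != c) { printf("mismatch at %u\n", c); return 1; }
        }
        printf("constant columns are fixed points of MixColumns\n");
        return 0;
    }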

+ 34 - 12
source/putty/sshbpp.h

@@ -80,25 +80,47 @@ bool ssh2_bpp_check_unimplemented(BinaryPacketProtocol *bpp, PktIn *pktin);
  * purposes of triggering an SSH-2 rekey when either one gets over a
  * configured limit. In each direction, the flag 'running' indicates
  * that we haven't hit the limit yet, and 'remaining' tracks how much
- * longer until we do. The macro DTS_CONSUME subtracts a given amount
- * from the counter in a particular direction, and evaluates to a
- * boolean indicating whether the limit has been hit.
+ * longer until we do. The function dts_consume() subtracts a given
+ * amount from the counter in a particular direction, and sets
+ * 'expired' if the limit has been hit.
  *
  * The limit is sticky: once 'running' has flipped to false,
  * 'remaining' is no longer decremented, so it shouldn't dangerously
  * wrap round.
  */
+struct DataTransferStatsDirection {
+    bool running, expired;
+    unsigned long remaining;
+};
 struct DataTransferStats {
-    struct {
-        bool running;
-        unsigned long remaining;
-    } in, out;
+    struct DataTransferStatsDirection in, out;
 };
-#define DTS_CONSUME(stats, direction, size)             \
-    ((stats)->direction.running &&                      \
-     (stats)->direction.remaining <= (size) ?           \
-     ((stats)->direction.running = false, true) :       \
-     ((stats)->direction.remaining -= (size), false))
+static inline void dts_consume(struct DataTransferStatsDirection *s,
+                               unsigned long size_consumed)
+{
+    if (s->running) {
+        if (s->remaining <= size_consumed) {
+            s->running = false;
+            s->expired = true;
+        } else {
+            s->remaining -= size_consumed;
+        }
+    }
+}
+static inline void dts_reset(struct DataTransferStatsDirection *s,
+                             unsigned long starting_size)
+{
+    s->expired = false;
+    s->remaining = starting_size;
+    /*
+     * The semantics of setting CONF_ssh_rekey_data to zero are to
+     * disable data-volume based rekeying completely. So if the
+     * starting size is actually zero, we don't set 'running' to true
+     * in the first place, which means we won't ever set the expired
+     * flag.
+     */
+    s->running = (starting_size != 0);
+}
 
 BinaryPacketProtocol *ssh2_bpp_new(
     LogContext *logctx, struct DataTransferStats *stats, bool is_server);
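
The semantics are easiest to see in isolation. The snippet below re-uses the structure and the two helpers exactly as defined above and exercises the sticky expiry; only the main() harness is added for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    struct DataTransferStatsDirection {
        bool running, expired;
        unsigned long remaining;
    };

    static void dts_consume(struct DataTransferStatsDirection *s,
                            unsigned long size_consumed)
    {
        if (s->running) {
            if (s->remaining <= size_consumed) {
                s->running = false;
                s->expired = true;
            } else {
                s->remaining -= size_consumed;
            }
        }
    }

    static void dts_reset(struct DataTransferStatsDirection *s,
                          unsigned long starting_size)
    {
        s->expired = false;
        s->remaining = starting_size;
        s->running = (starting_size != 0); /* 0 = volume-based rekeying disabled */
    }

    int main(void)
    {
        struct DataTransferStatsDirection out;
        dts_reset(&out, 1000);
        dts_consume(&out, 600);  /* still running, 400 remaining */
        dts_consume(&out, 600);  /* limit hit: running=false, expired=true */
        dts_consume(&out, 600);  /* no further decrement: the flag is sticky */
        printf("expired=%d remaining=%lu\n", (int)out.expired, out.remaining);
        return 0;
    }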

+ 8 - 0
source/putty/sshdes.c

@@ -61,6 +61,14 @@
 #include "ssh.h"
 #include "mpint_i.h"               /* we reuse the BignumInt system */
 
+/* If you compile with -DDES_DIAGNOSTICS, intermediate results will be
+ * sent to debug() (so you also need to compile with -DDEBUG).
+ * Otherwise this ifdef will condition away all the debug() calls. */
+#ifndef DES_DIAGNOSTICS
+#undef debug
+#define debug(...) ((void)0)
+#endif
+
 /*
  * General utility functions.
  */
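
In other words, unless both DES_DIAGNOSTICS and DEBUG are defined, every debug(...) call in this file becomes a no-op expression. For example (hypothetical call site, not taken from the file):

    /* Inside some DES round function: */
    debug("round %d: L=%08x R=%08x\n", i, (unsigned)L, (unsigned)R);
    /* Normal build: the line compiles as ((void)0).
     * With -DDES_DIAGNOSTICS -DDEBUG: it reaches PuTTY's usual debug() output. */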

+ 16 - 6
source/putty/sshhmac.c

@@ -16,7 +16,7 @@ struct hmac {
 
 struct hmac_extra {
     const ssh_hashalg *hashalg_base;
-    const char *suffix;
+    const char *suffix, *annotation;
 };
 
 static ssh2_mac *hmac_new(const ssh2_macalg *alg, ssh_cipher *cipher)
@@ -44,8 +44,21 @@ static ssh2_mac *hmac_new(const ssh2_macalg *alg, ssh_cipher *cipher)
     ctx->digest = snewn(ctx->hashalg->hlen, uint8_t);
 
     ctx->text_name = strbuf_new();
-    strbuf_catf(ctx->text_name, "HMAC-%s%s",
-                ctx->hashalg->text_name, extra->suffix);
+    strbuf_catf(ctx->text_name, "HMAC-%s%s",
+                ctx->hashalg->text_basename, extra->suffix);
+    if (extra->annotation || ctx->hashalg->annotation) {
+        strbuf_catf(ctx->text_name, " (");
+        const char *sep = "";
+        if (extra->annotation) {
+            strbuf_catf(ctx->text_name, "%s%s", sep, extra->annotation);
+            sep = ", ";
+        }
+        if (ctx->hashalg->annotation) {
+            strbuf_catf(ctx->text_name, "%s%s", sep, ctx->hashalg->annotation);
+            sep = ", ";
+        }
+        strbuf_catf(ctx->text_name, ")");
+    }
 
     ctx->mac.vt = alg;
     BinarySink_DELEGATE_INIT(&ctx->mac, ctx->h_live);
@@ -56,7 +69,6 @@ static ssh2_mac *hmac_new(const ssh2_macalg *alg, ssh_cipher *cipher)
 static void hmac_free(ssh2_mac *mac)
 {
     struct hmac *ctx = container_of(mac, struct hmac, mac);
-    const struct hmac_extra *extra = (const struct hmac_extra *)mac->vt->extra;
 
     ssh_hash_free(ctx->h_outer);
     ssh_hash_free(ctx->h_inner);
@@ -75,7 +87,6 @@ static void hmac_free(ssh2_mac *mac)
 static void hmac_key(ssh2_mac *mac, ptrlen key)
 {
     struct hmac *ctx = container_of(mac, struct hmac, mac);
-    const struct hmac_extra *extra = (const struct hmac_extra *)mac->vt->extra;
 
     const uint8_t *kp;
     size_t klen;
@@ -149,7 +160,6 @@ static void hmac_start(ssh2_mac *mac)
 static void hmac_genresult(ssh2_mac *mac, unsigned char *output)
 {
     struct hmac *ctx = container_of(mac, struct hmac, mac);
-    const struct hmac_extra *extra = (const struct hmac_extra *)mac->vt->extra;
     ssh_hash *htmp;
 
     /* Leave h_live in place, so that the SSH-2 BPP can continue
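
The net effect on the display names, for illustration (the suffix and annotation strings below are examples, not a list of actual registered MACs):

    /*   text_basename "SHA-256", suffix "",    no annotations
     *       -> "HMAC-SHA-256"
     *   text_basename "SHA-1",   suffix "-96", hash annotation "unaccelerated"
     *       -> "HMAC-SHA-1-96 (unaccelerated)"
     *   both a MAC annotation and a hash annotation present
     *       -> "HMAC-<basename><suffix> (<mac note>, <hash note>)"   */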

+ 1 - 1
source/putty/sshmd5.c

@@ -271,5 +271,5 @@ static void md5_final(ssh_hash *hash, unsigned char *output)
 }
 
 const ssh_hashalg ssh_md5 = {
-    md5_new, md5_copy, md5_final, md5_free, 16, 64, "MD5"
+    md5_new, md5_copy, md5_final, md5_free, 16, 64, HASHALG_NAMES_BARE("MD5"),
 };

+ 5 - 4
source/putty/sshpubk.c

@@ -371,13 +371,14 @@ bool rsa_ssh1_savekey(const Filename *filename, RSAKey *key,
      * Done. Write the result to the file.
      */
     fp = f_open(filename, "wb", true);
+    bool ret = false;
     if (fp) {
-	bool ret = (fwrite(buf->u, 1, buf->len, fp) == (size_t) (buf->len));
+        ret = (fwrite(buf->u, 1, buf->len, fp) == (size_t) (buf->len));
         if (fclose(fp))
             ret = false;
-	return ret;
-    } else
-	return false;
+    }
+    strbuf_free(buf);
+    return ret;
 }
 
 /* ----------------------------------------------------------------------

+ 6 - 3
source/putty/sshrsa.c

@@ -328,9 +328,12 @@ bool rsa_verify(RSAKey *key)
      * should instead flip them round into the canonical order of
      * p > q. This also involves regenerating iqmp.
      */
-    unsigned swap_pq = mp_cmp_hs(key->q, key->p);
-    mp_cond_swap(key->p, key->q, swap_pq);
-    mp_free(key->iqmp);
+    mp_int *p_new = mp_max(key->p, key->q);
+    mp_int *q_new = mp_min(key->p, key->q);
+    mp_free(key->p);
+    mp_free(key->q);
+    key->p = p_new;
+    key->q = q_new;
     key->iqmp = mp_invert(key->q, key->p);
 
     return ok;

+ 228 - 10
source/putty/sshsh256.c

@@ -12,6 +12,7 @@
  */
 #define HW_SHA256_NONE 0
 #define HW_SHA256_NI 1
+#define HW_SHA256_NEON 2
 
 #ifdef _FORCE_SHA_NI
 #   define HW_SHA256 HW_SHA256_NI
@@ -21,8 +22,7 @@
 #       define HW_SHA256 HW_SHA256_NI
 #   endif
 #elif defined(__GNUC__)
-#    if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
-    (defined(__x86_64__) || defined(__i386))
+#    if (__GNUC__ >= 5) && (defined(__x86_64__) || defined(__i386))
 #       define HW_SHA256 HW_SHA256_NI
 #    endif
 #elif defined (_MSC_VER)
@@ -31,6 +31,37 @@
 #   endif
 #endif
 
+#ifdef _FORCE_SHA_NEON
+#   define HW_SHA256 HW_SHA256_NEON
+#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    /* Arm can potentially support both endiannesses, but this code
+     * hasn't been tested on anything but little. If anyone wants to
+     * run big-endian, they'll need to fix it first. */
+#elif defined __ARM_FEATURE_CRYPTO
+    /* If the Arm crypto extension is available already, we can
+     * support NEON SHA without having to enable anything by hand */
+#   define HW_SHA256 HW_SHA256_NEON
+#elif defined(__clang__)
+#   if __has_attribute(target) && __has_include(<arm_neon.h>) &&       \
+    (defined(__aarch64__))
+        /* clang can enable the crypto extension in AArch64 using
+         * __attribute__((target)) */
+#       define HW_SHA256 HW_SHA256_NEON
+#       define USE_CLANG_ATTR_TARGET_AARCH64
+#   endif
+#elif defined _MSC_VER
+    /* Visual Studio supports the crypto extension when targeting
+     * AArch64, but as of VS2017, the AArch32 header doesn't quite
+     * manage it (declaring the shae/shad intrinsics without a round
+     * key operand). */
+#   if defined _M_ARM64
+#       define HW_SHA256 HW_SHA256_NEON
+#       if defined _M_ARM64
+#           define USE_ARM64_NEON_H /* unusual header name in this case */
+#       endif
+#   endif
+#endif
+
 #if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256
 #   undef HW_SHA256
 #   define HW_SHA256 HW_SHA256_NONE
@@ -67,7 +98,7 @@ static ssh_hash *sha256_select(const ssh_hashalg *alg)
 
 const ssh_hashalg ssh_sha256 = {
     sha256_select, NULL, NULL, NULL,
-    32, 64, "SHA-256",
+    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "dummy selector vtable"),
 };
 
 /* ----------------------------------------------------------------------
@@ -295,7 +326,7 @@ static void sha256_sw_final(ssh_hash *hash, uint8_t *digest)
 
 const ssh_hashalg ssh_sha256_sw = {
     sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
-    32, 64, "SHA-256",
+    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "unaccelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -307,13 +338,12 @@ const ssh_hashalg ssh_sha256_sw = {
 /*
  * Set target architecture for Clang and GCC
  */
-#if !defined(__clang__) && defined(__GNUC__)
+#if defined(__clang__) || defined(__GNUC__)
+#    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
+#if !defined(__clang__)
 #    pragma GCC target("sha")
 #    pragma GCC target("sse4.1")
 #endif
-
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-#    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 #    define FUNC_ISA
 #endif
@@ -652,7 +682,194 @@ FUNC_ISA static void sha256_ni_final(ssh_hash *hash, uint8_t *digest)
 
 const ssh_hashalg ssh_sha256_hw = {
     sha256_ni_new, sha256_ni_copy, sha256_ni_final, sha256_ni_free,
-    32, 64, "SHA-256",
+    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "SHA-NI accelerated"),
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-256 using Arm NEON.
+ */
+
+#elif HW_SHA256 == HW_SHA256_NEON
+
+/*
+ * Manually set the target architecture, if we decided above that we
+ * need to.
+ */
+#ifdef USE_CLANG_ATTR_TARGET_AARCH64
+/*
+ * A spot of cheating: redefine some ACLE feature macros before
+ * including arm_neon.h. Otherwise we won't get the SHA intrinsics
+ * defined by that header, because it will be looking at the settings
+ * for the whole translation unit rather than the ones we're going to
+ * put on some particular functions using __attribute__((target)).
+ */
+#define __ARM_NEON 1
+#define __ARM_FEATURE_CRYPTO 1
+#define FUNC_ISA __attribute__ ((target("neon,crypto")))
+#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
+
+#ifndef FUNC_ISA
+#define FUNC_ISA
+#endif
+
+#ifdef USE_ARM64_NEON_H
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+static bool sha256_hw_available(void)
+{
+    /*
+     * For Arm, we delegate to a per-platform detection function (see
+     * explanation in sshaes.c).
+     */
+    return platform_sha256_hw_available();
+}
+
+typedef struct sha256_neon_core sha256_neon_core;
+struct sha256_neon_core {
+    uint32x4_t abcd, efgh;
+};
+
+FUNC_ISA
+static inline uint32x4_t sha256_neon_load_input(const uint8_t *p)
+{
+    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
+}
+
+FUNC_ISA
+static inline uint32x4_t sha256_neon_schedule_update(
+    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
+{
+    return vsha256su1q_u32(vsha256su0q_u32(m4, m3), m2, m1);
+}
+
+FUNC_ISA
+static inline sha256_neon_core sha256_neon_round4(
+    sha256_neon_core old, uint32x4_t sched, unsigned round)
+{
+    sha256_neon_core new;
+
+    uint32x4_t round_input = vaddq_u32(
+        sched, vld1q_u32(sha256_round_constants + round));
+    new.abcd = vsha256hq_u32 (old.abcd, old.efgh, round_input);
+    new.efgh = vsha256h2q_u32(old.efgh, old.abcd, round_input);
+    return new;
+}
+
+FUNC_ISA
+static inline void sha256_neon_block(sha256_neon_core *core, const uint8_t *p)
+{
+    uint32x4_t s0, s1, s2, s3;
+    sha256_neon_core cr = *core;
+
+    s0 = sha256_neon_load_input(p);
+    cr = sha256_neon_round4(cr, s0, 0);
+    s1 = sha256_neon_load_input(p+16);
+    cr = sha256_neon_round4(cr, s1, 4);
+    s2 = sha256_neon_load_input(p+32);
+    cr = sha256_neon_round4(cr, s2, 8);
+    s3 = sha256_neon_load_input(p+48);
+    cr = sha256_neon_round4(cr, s3, 12);
+    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha256_neon_round4(cr, s0, 16);
+    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha256_neon_round4(cr, s1, 20);
+    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha256_neon_round4(cr, s2, 24);
+    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha256_neon_round4(cr, s3, 28);
+    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha256_neon_round4(cr, s0, 32);
+    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha256_neon_round4(cr, s1, 36);
+    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha256_neon_round4(cr, s2, 40);
+    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha256_neon_round4(cr, s3, 44);
+    s0 = sha256_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha256_neon_round4(cr, s0, 48);
+    s1 = sha256_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha256_neon_round4(cr, s1, 52);
+    s2 = sha256_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha256_neon_round4(cr, s2, 56);
+    s3 = sha256_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha256_neon_round4(cr, s3, 60);
+
+    core->abcd = vaddq_u32(core->abcd, cr.abcd);
+    core->efgh = vaddq_u32(core->efgh, cr.efgh);
+}
+
+typedef struct sha256_neon {
+    sha256_neon_core core;
+    sha256_block blk;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha256_neon;
+
+static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len);
+
+static ssh_hash *sha256_neon_new(const ssh_hashalg *alg)
+{
+    if (!sha256_hw_available_cached())
+        return NULL;
+
+    sha256_neon *s = snew(sha256_neon);
+
+    s->core.abcd = vld1q_u32(sha256_initial_state);
+    s->core.efgh = vld1q_u32(sha256_initial_state + 4);
+
+    sha256_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_neon_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha256_neon_copy(ssh_hash *hash)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+    sha256_neon *copy = snew(sha256_neon);
+
+    *copy = *s; /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_neon_free(ssh_hash *hash)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha256_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_neon *s = BinarySink_DOWNCAST(bs, sha256_neon);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_neon_block(&s->core, s->blk.block);
+}
+
+static void sha256_neon_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_neon *s = container_of(hash, sha256_neon, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    vst1q_u8(digest,      vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
+    vst1q_u8(digest + 16, vrev32q_u8(vreinterpretq_u8_u32(s->core.efgh)));
+    sha256_neon_free(hash);
+}
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_neon_new, sha256_neon_copy, sha256_neon_final, sha256_neon_free,
+    32, 64, HASHALG_NAMES_ANNOTATED("SHA-256", "NEON accelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -683,7 +900,8 @@ static void sha256_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
 
 const ssh_hashalg ssh_sha256_hw = {
     sha256_stub_new, sha256_stub_copy, sha256_stub_final, sha256_stub_free,
-    32, 64, "SHA-256",
+    32, 64, HASHALG_NAMES_ANNOTATED(
+        "SHA-256", "!NONEXISTENT ACCELERATED VERSION!"),
 };
 
 #endif /* HW_SHA256 */
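
For orientation, the "dummy selector vtable" named in the annotation works roughly like this; a sketch only, since sha256_select's body sits outside the hunks shown (the real function lives near the top of sshsh256.c):

    static ssh_hash *sha256_select(const ssh_hashalg *alg)
    {
        const ssh_hashalg *real_alg =
            sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw;
        return ssh_hash_new(real_alg);
    }

Callers simply instantiate ssh_sha256 and get whichever implementation the runtime check selects, with text_name reporting which one it was.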

+ 4 - 2
source/putty/sshsh512.c

@@ -343,7 +343,8 @@ static void sha512_final(ssh_hash *hash, unsigned char *output)
 }
 
 const ssh_hashalg ssh_sha512 = {
-    sha512_new, sha512_copy, sha512_final, sha512_free, 64, BLKSIZE, "SHA-512"
+    sha512_new, sha512_copy, sha512_final, sha512_free,
+    64, BLKSIZE, HASHALG_NAMES_BARE("SHA-512"),
 };
 
 static ssh_hash *sha384_new(const ssh_hashalg *alg)
@@ -363,5 +364,6 @@ static void sha384_final(ssh_hash *hash, unsigned char *output)
 }
 
 const ssh_hashalg ssh_sha384 = {
-    sha384_new, sha512_copy, sha384_final, sha512_free, 48, BLKSIZE, "SHA-384"
+    sha384_new, sha512_copy, sha384_final, sha512_free,
+    48, BLKSIZE, HASHALG_NAMES_BARE("SHA-384"),
 };

+ 258 - 10
source/putty/sshsha.c

@@ -12,6 +12,7 @@
  */
 #define HW_SHA1_NONE 0
 #define HW_SHA1_NI 1
+#define HW_SHA1_NEON 2
 
 #ifdef _FORCE_SHA_NI
 #   define HW_SHA1 HW_SHA1_NI
@@ -21,8 +22,7 @@
 #       define HW_SHA1 HW_SHA1_NI
 #   endif
 #elif defined(__GNUC__)
-#    if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
-    (defined(__x86_64__) || defined(__i386))
+#    if (__GNUC__ >= 5) && (defined(__x86_64__) || defined(__i386))
 #       define HW_SHA1 HW_SHA1_NI
 #    endif
 #elif defined (_MSC_VER)
@@ -31,6 +31,37 @@
 #   endif
 #endif
 
+#ifdef _FORCE_SHA_NEON
+#   define HW_SHA1 HW_SHA1_NEON
+#elif defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    /* Arm can potentially support both endiannesses, but this code
+     * hasn't been tested on anything but little. If anyone wants to
+     * run big-endian, they'll need to fix it first. */
+#elif defined __ARM_FEATURE_CRYPTO
+    /* If the Arm crypto extension is available already, we can
+     * support NEON SHA without having to enable anything by hand */
+#   define HW_SHA1 HW_SHA1_NEON
+#elif defined(__clang__)
+#   if __has_attribute(target) && __has_include(<arm_neon.h>) &&       \
+    (defined(__aarch64__))
+        /* clang can enable the crypto extension in AArch64 using
+         * __attribute__((target)) */
+#       define HW_SHA1 HW_SHA1_NEON
+#       define USE_CLANG_ATTR_TARGET_AARCH64
+#   endif
+#elif defined _MSC_VER
+    /* Visual Studio supports the crypto extension when targeting
+     * AArch64, but as of VS2017, the AArch32 header doesn't quite
+     * manage it (declaring the shae/shad intrinsics without a round
+     * key operand). */
+#   if defined _M_ARM64
+#       define HW_SHA1 HW_SHA1_NEON
+#       if defined _M_ARM64
+#           define USE_ARM64_NEON_H /* unusual header name in this case */
+#       endif
+#   endif
+#endif
+
 #if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
 #   undef HW_SHA1
 #   define HW_SHA1 HW_SHA1_NONE
@@ -67,7 +98,7 @@ static ssh_hash *sha1_select(const ssh_hashalg *alg)
 
 const ssh_hashalg ssh_sha1 = {
     sha1_select, NULL, NULL, NULL,
-    20, 64, "SHA-1",
+    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "dummy selector vtable"),
 };
 
 /* ----------------------------------------------------------------------
@@ -278,7 +309,7 @@ static void sha1_sw_final(ssh_hash *hash, uint8_t *digest)
 
 const ssh_hashalg ssh_sha1_sw = {
     sha1_sw_new, sha1_sw_copy, sha1_sw_final, sha1_sw_free,
-    20, 64, "SHA-1",
+    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "unaccelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -290,13 +321,13 @@ const ssh_hashalg ssh_sha1_sw = {
 /*
  * Set target architecture for Clang and GCC
  */
-#if !defined(__clang__) && defined(__GNUC__)
+
+#if defined(__clang__) || defined(__GNUC__)
+#    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
+#if !defined(__clang__)
 #    pragma GCC target("sha")
 #    pragma GCC target("sse4.1")
 #endif
-
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
-#    define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 #    define FUNC_ISA
 #endif
@@ -619,7 +650,223 @@ FUNC_ISA static void sha1_ni_final(ssh_hash *hash, uint8_t *digest)
 
 const ssh_hashalg ssh_sha1_hw = {
     sha1_ni_new, sha1_ni_copy, sha1_ni_final, sha1_ni_free,
-    20, 64, "SHA-1",
+    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "SHA-NI accelerated"),
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-1 using Arm NEON.
+ */
+
+#elif HW_SHA1 == HW_SHA1_NEON
+
+/*
+ * Manually set the target architecture, if we decided above that we
+ * need to.
+ */
+#ifdef USE_CLANG_ATTR_TARGET_AARCH64
+/*
+ * A spot of cheating: redefine some ACLE feature macros before
+ * including arm_neon.h. Otherwise we won't get the SHA intrinsics
+ * defined by that header, because it will be looking at the settings
+ * for the whole translation unit rather than the ones we're going to
+ * put on some particular functions using __attribute__((target)).
+ */
+#define __ARM_NEON 1
+#define __ARM_FEATURE_CRYPTO 1
+#define FUNC_ISA __attribute__ ((target("neon,crypto")))
+#endif /* USE_CLANG_ATTR_TARGET_AARCH64 */
+
+#ifndef FUNC_ISA
+#define FUNC_ISA
+#endif
+
+#ifdef USE_ARM64_NEON_H
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+static bool sha1_hw_available(void)
+{
+    /*
+     * For Arm, we delegate to a per-platform detection function (see
+     * explanation in sshaes.c).
+     */
+    return platform_sha1_hw_available();
+}
+
+typedef struct sha1_neon_core sha1_neon_core;
+struct sha1_neon_core {
+    uint32x4_t abcd;
+    uint32_t e;
+};
+
+
+FUNC_ISA
+static inline uint32x4_t sha1_neon_load_input(const uint8_t *p)
+{
+    return vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(p)));
+}
+
+FUNC_ISA
+static inline uint32x4_t sha1_neon_schedule_update(
+    uint32x4_t m4, uint32x4_t m3, uint32x4_t m2, uint32x4_t m1)
+{
+    return vsha1su1q_u32(vsha1su0q_u32(m4, m3, m2), m1);
+}
+
+/*
+ * SHA-1 has three different kinds of round, differing in whether they
+ * use the Ch, Maj or Par functions defined above. Each one uses a
+ * separate NEON instruction, so we define three inline functions for
+ * the different round types using this macro.
+ *
+ * The two batches of Par-type rounds also use a different constant,
+ * but that's passed in as an operand, so we don't need a fourth
+ * inline function just for that.
+ */
+#define SHA1_NEON_ROUND_FN(type)                                        \
+    FUNC_ISA static inline sha1_neon_core sha1_neon_round4_##type(      \
+        sha1_neon_core old, uint32x4_t sched, uint32x4_t constant)      \
+    {                                                                   \
+        sha1_neon_core new;                                             \
+        uint32x4_t round_input = vaddq_u32(sched, constant);            \
+        new.abcd = vsha1##type##q_u32(old.abcd, old.e, round_input);    \
+        new.e = vsha1h_u32(vget_lane_u32(vget_low_u32(old.abcd), 0));   \
+        return new;                                                     \
+    }
+SHA1_NEON_ROUND_FN(c)
+SHA1_NEON_ROUND_FN(p)
+SHA1_NEON_ROUND_FN(m)
+
+FUNC_ISA
+static inline void sha1_neon_block(sha1_neon_core *core, const uint8_t *p)
+{
+    uint32x4_t constant, s0, s1, s2, s3;
+    sha1_neon_core cr = *core;
+
+    constant = vdupq_n_u32(SHA1_STAGE0_CONSTANT);
+    s0 = sha1_neon_load_input(p);
+    cr = sha1_neon_round4_c(cr, s0, constant);
+    s1 = sha1_neon_load_input(p + 16);
+    cr = sha1_neon_round4_c(cr, s1, constant);
+    s2 = sha1_neon_load_input(p + 32);
+    cr = sha1_neon_round4_c(cr, s2, constant);
+    s3 = sha1_neon_load_input(p + 48);
+    cr = sha1_neon_round4_c(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_c(cr, s0, constant);
+
+    constant = vdupq_n_u32(SHA1_STAGE1_CONSTANT);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_p(cr, s1, constant);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_p(cr, s2, constant);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_p(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_p(cr, s0, constant);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_p(cr, s1, constant);
+
+    constant = vdupq_n_u32(SHA1_STAGE2_CONSTANT);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_m(cr, s2, constant);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_m(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_m(cr, s0, constant);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_m(cr, s1, constant);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_m(cr, s2, constant);
+
+    constant = vdupq_n_u32(SHA1_STAGE3_CONSTANT);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_p(cr, s3, constant);
+    s0 = sha1_neon_schedule_update(s0, s1, s2, s3);
+    cr = sha1_neon_round4_p(cr, s0, constant);
+    s1 = sha1_neon_schedule_update(s1, s2, s3, s0);
+    cr = sha1_neon_round4_p(cr, s1, constant);
+    s2 = sha1_neon_schedule_update(s2, s3, s0, s1);
+    cr = sha1_neon_round4_p(cr, s2, constant);
+    s3 = sha1_neon_schedule_update(s3, s0, s1, s2);
+    cr = sha1_neon_round4_p(cr, s3, constant);
+
+    core->abcd = vaddq_u32(core->abcd, cr.abcd);
+    core->e += cr.e;
+}
+
+typedef struct sha1_neon {
+    sha1_neon_core core;
+    sha1_block blk;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha1_neon;
+
+static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len);
+
+static ssh_hash *sha1_neon_new(const ssh_hashalg *alg)
+{
+    if (!sha1_hw_available_cached())
+        return NULL;
+
+    sha1_neon *s = snew(sha1_neon);
+
+    s->core.abcd = vld1q_u32(sha1_initial_state);
+    s->core.e = sha1_initial_state[4];
+
+    sha1_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_neon_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha1_neon_copy(ssh_hash *hash)
+{
+    sha1_neon *s = container_of(hash, sha1_neon, hash);
+    sha1_neon *copy = snew(sha1_neon);
+
+    *copy = *s; /* structure copy */
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha1_neon_free(ssh_hash *hash)
+{
+    sha1_neon *s = container_of(hash, sha1_neon, hash);
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha1_neon_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_neon *s = BinarySink_DOWNCAST(bs, sha1_neon);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_neon_block(&s->core, s->blk.block);
+}
+
+static void sha1_neon_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_neon *s = container_of(hash, sha1_neon, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+    vst1q_u8(digest, vrev32q_u8(vreinterpretq_u8_u32(s->core.abcd)));
+    PUT_32BIT_MSB_FIRST(digest + 16, s->core.e);
+    sha1_neon_free(hash);
+}
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_neon_new, sha1_neon_copy, sha1_neon_final, sha1_neon_free,
+    20, 64, HASHALG_NAMES_ANNOTATED("SHA-1", "NEON accelerated"),
 };
 
 /* ----------------------------------------------------------------------
@@ -650,7 +897,8 @@ static void sha1_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
 
 const ssh_hashalg ssh_sha1_hw = {
     sha1_stub_new, sha1_stub_copy, sha1_stub_final, sha1_stub_free,
-    20, 64, "SHA-1",
+    20, 64, HASHALG_NAMES_ANNOTATED(
+        "SHA-1", "!NONEXISTENT ACCELERATED VERSION!"),
 };
 
 #endif /* HW_SHA1 */
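
The SHA1_STAGE*_CONSTANT names used in sha1_neon_block() are defined elsewhere in sshsha.c. For reference, they should correspond to the standard FIPS 180-4 round constants, matching the order in which the c/p/m round helpers are invoked (values reproduced here as an aide-memoire, on the assumption that the macro names map onto them one-to-one):

    #define SHA1_STAGE0_CONSTANT 0x5a827999 /* rounds  0-19: Ch  -> sha1_neon_round4_c */
    #define SHA1_STAGE1_CONSTANT 0x6ed9eba1 /* rounds 20-39: Par -> sha1_neon_round4_p */
    #define SHA1_STAGE2_CONSTANT 0x8f1bbcdc /* rounds 40-59: Maj -> sha1_neon_round4_m */
    #define SHA1_STAGE3_CONSTANT 0xca62c1d6 /* rounds 60-79: Par -> sha1_neon_round4_p */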

+ 27 - 0
source/putty/windows/winmisc.c

@@ -405,3 +405,30 @@ void unescape_registry_key(const char *in, strbuf *out)
 	}
     }
 }
+
+#ifdef DEBUG
+static FILE *debug_fp = NULL;
+static HANDLE debug_hdl = INVALID_HANDLE_VALUE;
+static int debug_got_console = 0;
+
+void dputs(const char *buf)
+{
+    DWORD dw;
+
+    if (!debug_got_console) {
+	if (AllocConsole()) {
+	    debug_got_console = 1;
+	    debug_hdl = GetStdHandle(STD_OUTPUT_HANDLE);
+	}
+    }
+    if (!debug_fp) {
+	debug_fp = fopen("debug.log", "w");
+    }
+
+    if (debug_hdl != INVALID_HANDLE_VALUE) {
+	WriteFile(debug_hdl, buf, strlen(buf), &dw, NULL);
+    }
+    fputs(buf, debug_fp);
+    fflush(debug_fp);
+}
+#endif

+ 2 - 0
source/putty/windows/winnet.c

@@ -1337,6 +1337,8 @@ static void sk_net_close(Socket *sock)
     if (s->child)
 	sk_net_close(&s->child->sock);
 
+    bufchain_clear(&s->output_data);
+
     del234(sktree, s);
     do_select(s->s, false);
     p_closesocket(s->s);