Browse Source

OpenSSL 1.1.1d

Source commit: 3245567ef0fa996469bea6758d78701360f3d88c
Martin Prikryl 6 years ago
parent
commit
d269acd797
100 changed files with 1665 additions and 9715 deletions
  1. 108 3
      libs/openssl/CHANGES
  2. 2 2
      libs/openssl/Configurations/00-base-templates.conf
  3. 5 1
      libs/openssl/Configurations/10-main.conf
  4. 1 1
      libs/openssl/Configurations/15-ios.conf
  5. 1 1
      libs/openssl/Configurations/50-win-onecore.conf
  6. 1 1
      libs/openssl/Configurations/common0.tmpl
  7. 102 4
      libs/openssl/Configurations/unix-Makefile.tmpl
  8. 1 4
      libs/openssl/Configure
  9. 25 4
      libs/openssl/INSTALL
  10. 18 1
      libs/openssl/NEWS
  11. 15 0
      libs/openssl/NOTES.WIN
  12. 1 1
      libs/openssl/README
  13. 8 1
      libs/openssl/apps/apps.c
  14. 1 3
      libs/openssl/apps/apps.h
  15. 2 2
      libs/openssl/apps/ca.c
  16. 2 2
      libs/openssl/apps/dgst.c
  17. 1 1
      libs/openssl/apps/enc.c
  18. 4 2
      libs/openssl/apps/ocsp.c
  19. 1 2
      libs/openssl/apps/openssl.c
  20. 1 1
      libs/openssl/apps/pkcs12.c
  21. 13 3
      libs/openssl/apps/req.c
  22. 4 16
      libs/openssl/apps/s_apps.h
  23. 2 1
      libs/openssl/apps/s_cb.c
  24. 1 1
      libs/openssl/apps/s_client.c
  25. 1 1
      libs/openssl/apps/speed.c
  26. 2 2
      libs/openssl/apps/storeutl.c
  27. 4 4
      libs/openssl/config
  28. 0 3000
      libs/openssl/crypto/aes/asm/aes-586.pl
  29. 3 3
      libs/openssl/crypto/aes/asm/aes-s390x.pl
  30. 0 2916
      libs/openssl/crypto/aes/asm/aes-x86_64.pl
  31. 0 3239
      libs/openssl/crypto/aes/asm/bsaes-x86_64.pl
  32. 26 21
      libs/openssl/crypto/asn1/a_time.c
  33. 7 3
      libs/openssl/crypto/asn1/a_type.c
  34. 15 4
      libs/openssl/crypto/asn1/x_bignum.c
  35. 20 6
      libs/openssl/crypto/bio/b_addr.c
  36. 2 2
      libs/openssl/crypto/bio/bss_dgram.c
  37. 28 33
      libs/openssl/crypto/bio/bss_file.c
  38. 1 3
      libs/openssl/crypto/bio/bss_mem.c
  39. 1 1
      libs/openssl/crypto/bn/asm/mips.pl
  40. 2 2
      libs/openssl/crypto/bn/bn_div.c
  41. 2 2
      libs/openssl/crypto/bn/bn_lcl.h
  42. 70 31
      libs/openssl/crypto/bn/bn_lib.c
  43. 6 2
      libs/openssl/crypto/bn/bn_prime.c
  44. 2 5
      libs/openssl/crypto/bn/bn_rand.c
  45. 3 2
      libs/openssl/crypto/bn/bn_sqrt.c
  46. 134 2
      libs/openssl/crypto/cms/cms_att.c
  47. 18 2
      libs/openssl/crypto/cms/cms_env.c
  48. 4 1
      libs/openssl/crypto/cms/cms_err.c
  49. 5 4
      libs/openssl/crypto/cms/cms_lcl.h
  50. 32 6
      libs/openssl/crypto/cms/cms_sd.c
  51. 5 1
      libs/openssl/crypto/cms/cms_smime.c
  52. 1 1
      libs/openssl/crypto/conf/conf_sap.c
  53. 7 1
      libs/openssl/crypto/ctype.c
  54. 11 26
      libs/openssl/crypto/dh/dh_check.c
  55. 26 26
      libs/openssl/crypto/dh/dh_gen.c
  56. 11 2
      libs/openssl/crypto/dh/dh_key.c
  57. 3 3
      libs/openssl/crypto/dh/dh_lib.c
  58. 2 2
      libs/openssl/crypto/dsa/dsa_ameth.c
  59. 3 1
      libs/openssl/crypto/dsa/dsa_err.c
  60. 9 1
      libs/openssl/crypto/dsa/dsa_ossl.c
  61. 2 3
      libs/openssl/crypto/dso/dso_dlfcn.c
  62. 1 2
      libs/openssl/crypto/ec/asm/ecp_nistz256-sparcv9.pl
  63. 1 1
      libs/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl
  64. 3 3
      libs/openssl/crypto/ec/asm/x25519-ppc64.pl
  65. 68 5
      libs/openssl/crypto/ec/ec_asn1.c
  66. 113 1
      libs/openssl/crypto/ec/ec_curve.c
  67. 3 1
      libs/openssl/crypto/ec/ec_lcl.h
  68. 96 7
      libs/openssl/crypto/ec/ec_lib.c
  69. 1 1
      libs/openssl/crypto/ec/ecdh_ossl.c
  70. 12 4
      libs/openssl/crypto/ec/ecdsa_ossl.c
  71. 26 36
      libs/openssl/crypto/ec/ecp_nistp224.c
  72. 27 36
      libs/openssl/crypto/ec/ecp_nistp256.c
  73. 28 37
      libs/openssl/crypto/ec/ecp_nistp521.c
  74. 13 13
      libs/openssl/crypto/ec/ecp_nistputil.c
  75. 1 1
      libs/openssl/crypto/ec/ecx_meth.c
  76. 1 1
      libs/openssl/crypto/engine/eng_devcrypto.c
  77. 5 3
      libs/openssl/crypto/engine/eng_openssl.c
  78. 9 7
      libs/openssl/crypto/err/err.c
  79. 12 0
      libs/openssl/crypto/err/openssl.txt
  80. 3 3
      libs/openssl/crypto/evp/bio_ok.c
  81. 83 8
      libs/openssl/crypto/evp/e_aes.c
  82. 11 2
      libs/openssl/crypto/evp/e_aria.c
  83. 6 1
      libs/openssl/crypto/evp/e_chacha20_poly1305.c
  84. 5 1
      libs/openssl/crypto/evp/e_rc5.c
  85. 10 1
      libs/openssl/crypto/evp/evp_err.c
  86. 8 1
      libs/openssl/crypto/evp/evp_lib.c
  87. 4 1
      libs/openssl/crypto/evp/m_sha3.c
  88. 3 1
      libs/openssl/crypto/include/internal/ctype.h
  89. 3 3
      libs/openssl/crypto/include/internal/rand_int.h
  90. 5 1
      libs/openssl/crypto/include/internal/sm2err.h
  91. 1 2
      libs/openssl/crypto/init.c
  92. 4 4
      libs/openssl/crypto/lhash/lhash.c
  93. 1 1
      libs/openssl/crypto/o_str.c
  94. 4 1
      libs/openssl/crypto/pem/pvkfmt.c
  95. 9 5
      libs/openssl/crypto/pkcs7/pk7_doit.c
  96. 8 5
      libs/openssl/crypto/rand/drbg_lib.c
  97. 2 1
      libs/openssl/crypto/rand/rand_err.c
  98. 26 20
      libs/openssl/crypto/rand/rand_lcl.h
  99. 125 23
      libs/openssl/crypto/rand/rand_lib.c
  100. 136 49
      libs/openssl/crypto/rand/rand_unix.c

+ 108 - 3
libs/openssl/CHANGES

@@ -7,6 +7,101 @@
  https://github.com/openssl/openssl/commits/ and pick the appropriate
  release branch.
 
+ Changes between 1.1.1c and 1.1.1d [10 Sep 2019]
+
+  *) Fixed a fork protection issue. OpenSSL 1.1.1 introduced a rewritten random
+     number generator (RNG). This was intended to include protection in the
+     event of a fork() system call in order to ensure that the parent and child
+     processes did not share the same RNG state. However this protection was not
+     being used in the default case.
+
+     A partial mitigation for this issue is that the output from a high
+     precision timer is mixed into the RNG state so the likelihood of a parent
+     and child process sharing state is significantly reduced.
+
+     If an application already calls OPENSSL_init_crypto() explicitly using
+     OPENSSL_INIT_ATFORK then this problem does not occur at all.
+     (CVE-2019-1549)
+     [Matthias St. Pierre]
+
+  *) For built-in EC curves, ensure an EC_GROUP built from the curve name is
+     used even when parsing explicit parameters, when loading a serialized key
+     or calling `EC_GROUP_new_from_ecpkparameters()`/
+     `EC_GROUP_new_from_ecparameters()`.
+     This prevents bypass of security hardening and performance gains,
+     especially for curves with specialized EC_METHODs.
+     By default, if a key encoded with explicit parameters is loaded and later
+     serialized, the output is still encoded with explicit parameters, even if
+     internally a "named" EC_GROUP is used for computation.
+     [Nicola Tuveri]
+
+  *) Compute ECC cofactors if not provided during EC_GROUP construction. Before
+     this change, EC_GROUP_set_generator would accept order and/or cofactor as
+     NULL. After this change, only the cofactor parameter can be NULL. It also
+     does some minimal sanity checks on the passed order.
+     (CVE-2019-1547)
+     [Billy Bob Brumley]
+
+  *) Fixed a padding oracle in PKCS7_dataDecode and CMS_decrypt_set1_pkey.
+     An attack is simple, if the first CMS_recipientInfo is valid but the
+     second CMS_recipientInfo is chosen ciphertext. If the second
+     recipientInfo decodes to PKCS #1 v1.5 form plaintext, the correct
+     encryption key will be replaced by garbage, and the message cannot be
+     decoded, but if the RSA decryption fails, the correct encryption key is
+     used and the recipient will not notice the attack.
+     As a work around for this potential attack the length of the decrypted
+     key must be equal to the cipher default key length, in case the
+     certificate is not given and all recipientInfo are tried out.
+     The old behaviour can be re-enabled in the CMS code by setting the
+     CMS_DEBUG_DECRYPT flag.
+     (CVE-2019-1563)
+     [Bernd Edlinger]
+
+  *) Early start up entropy quality from the DEVRANDOM seed source has been
+     improved for older Linux systems.  The RAND subsystem will wait for
+     /dev/random to be producing output before seeding from /dev/urandom.
+     The seeded state is stored for future library initialisations using
+     a system global shared memory segment.  The shared memory identifier
+     can be configured by defining OPENSSL_RAND_SEED_DEVRANDOM_SHM_ID to
+     the desired value.  The default identifier is 114.
+     [Paul Dale]
+
+  *) Correct the extended master secret constant on EBCDIC systems. Without this
+     fix TLS connections between an EBCDIC system and a non-EBCDIC system that
+     negotiate EMS will fail. Unfortunately this also means that TLS connections
+     between EBCDIC systems with this fix, and EBCDIC systems without this
+     fix will fail if they negotiate EMS.
+     [Matt Caswell]
+
+  *) Use Windows installation paths in the mingw builds
+
+     Mingw isn't a POSIX environment per se, which means that Windows
+     paths should be used for installation.
+     (CVE-2019-1552)
+     [Richard Levitte]
+
+  *) Changed DH_check to accept parameters with order q and 2q subgroups.
+     With order 2q subgroups the bit 0 of the private key is not secret
+     but DH_generate_key works around that by clearing bit 0 of the
+     private key for those. This avoids leaking bit 0 of the private key.
+     [Bernd Edlinger]
+
+  *) Significantly reduce secure memory usage by the randomness pools.
+     [Paul Dale]
+
+  *) Revert the DEVRANDOM_WAIT feature for Linux systems
+
+     The DEVRANDOM_WAIT feature added a select() call to wait for the
+     /dev/random device to become readable before reading from the
+     /dev/urandom device.
+
+     It turned out that this change had negative side effects on
+     performance which were not acceptable. After some discussion it
+     was decided to revert this feature and leave it up to the OS
+     or the platform maintainer to ensure a proper initialization
+     during early boot time.
+     [Matthias St. Pierre]
+
  Changes between 1.1.1b and 1.1.1c [28 May 2019]
 
   *) Add build tests for C++.  These are generated files that only do one
@@ -75,6 +170,16 @@
      (CVE-2019-1543)
      [Matt Caswell]
 
+  *) Add DEVRANDOM_WAIT feature for Linux systems
+
+     On older Linux systems where the getrandom() system call is not available,
+     OpenSSL normally uses the /dev/urandom device for seeding its CSPRNG.
+     Contrary to getrandom(), the /dev/urandom device will not block during
+     early boot when the kernel CSPRNG has not been seeded yet.
+
+     To mitigate this known weakness, use select() to wait for /dev/random to
+     become readable before reading from /dev/urandom.
+
   *) Ensure that SM2 only uses SM3 as digest algorithm
      [Paul Yang]
 
@@ -322,7 +427,7 @@
         SSL_set_ciphersuites()
      [Matt Caswell]
 
-  *) Memory allocation failures consistenly add an error to the error
+  *) Memory allocation failures consistently add an error to the error
      stack.
      [Rich Salz]
 
@@ -6860,7 +6965,7 @@
      reason texts, thereby removing some of the footprint that may not
      be interesting if those errors aren't displayed anyway.
 
-     NOTE: it's still possible for any application or module to have it's
+     NOTE: it's still possible for any application or module to have its
      own set of error texts inserted.  The routines are there, just not
      used by default when no-err is given.
      [Richard Levitte]
@@ -8826,7 +8931,7 @@ des-cbc           3624.96k     5258.21k     5530.91k     5624.30k     5628.26k
  Changes between 0.9.6g and 0.9.6h  [5 Dec 2002]
 
   *) New function OPENSSL_cleanse(), which is used to cleanse a section of
-     memory from it's contents.  This is done with a counter that will
+     memory from its contents.  This is done with a counter that will
      place alternating values in each byte.  This can be used to solve
      two issues: 1) the removal of calls to memset() by highly optimizing
      compilers, and 2) cleansing with other values than 0, since those can

+ 2 - 2
libs/openssl/Configurations/00-base-templates.conf

@@ -198,7 +198,7 @@ my %targets=(
 	bn_asm_src	=> "bn-586.s co-586.s x86-mont.s x86-gf2m.s",
 	ec_asm_src	=> "ecp_nistz256.c ecp_nistz256-x86.s",
 	des_asm_src	=> "des-586.s crypt586.s",
-	aes_asm_src	=> "aes-586.s vpaes-x86.s aesni-x86.s",
+	aes_asm_src	=> "aes_core.c aes_cbc.c vpaes-x86.s aesni-x86.s",
 	bf_asm_src	=> "bf-586.s",
 	md5_asm_src	=> "md5-586.s",
 	cast_asm_src	=> "cast-586.s",
@@ -223,7 +223,7 @@ my %targets=(
 	cpuid_asm_src   => "x86_64cpuid.s",
 	bn_asm_src      => "asm/x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s rsaz-avx2.s",
 	ec_asm_src      => "ecp_nistz256.c ecp_nistz256-x86_64.s x25519-x86_64.s",
-	aes_asm_src     => "aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
+	aes_asm_src     => "aes_core.c aes_cbc.c vpaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
 	md5_asm_src     => "md5-x86_64.s",
 	sha1_asm_src    => "sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s sha1-mb-x86_64.s sha256-mb-x86_64.s",
 	rc4_asm_src     => "rc4-x86_64.s rc4-md5-x86_64.s",

+ 5 - 1
libs/openssl/Configurations/10-main.conf

@@ -1111,7 +1111,7 @@ my %targets = (
         thread_scheme    => "pthreads",
         dso_scheme       => "dlfcn",
         shared_target    => "self",
-        module_ldflags   => "-Wl,-G,-bsymbolic,-bexpall",
+        module_ldflags   => "-Wl,-G,-bsymbolic,-bnoentry",
         shared_ldflag    => "-Wl,-G,-bsymbolic,-bnoentry",
         shared_defflag   => "-Wl,-bE:",
         shared_extension => ".so.\$(SHLIB_VERSION_NUMBER)",
@@ -1397,6 +1397,10 @@ my %targets = (
         shared_extension => ".dll",
         multilib         => "",
         apps_aux_src     => add("win32_init.c"),
+        # "WOW" stands for "Windows on Windows", and that word engages
+        # some installation path heuristics in unix-Makefile.tmpl...
+        build_scheme     => add("WOW", { separator => undef }),
+
     },
     "mingw64" => {
         # As for OPENSSL_USE_APPLINK. Applink makes it possible to use

+ 1 - 1
libs/openssl/Configurations/15-ios.conf

@@ -1,6 +1,6 @@
 #### iPhoneOS/iOS
 #
-# It takes recent enough XCode to use following two targets. It shouldn't
+# It takes recent enough Xcode to use following two targets. It shouldn't
 # be a problem by now, but if they don't work, original targets below
 # that depend on manual definition of environment variables should still
 # work...

+ 1 - 1
libs/openssl/Configurations/50-win-onecore.conf

@@ -1,6 +1,6 @@
 # Windows OneCore targets.
 #
-# OneCore is new API stability "contract" that transends Desktop, IoT and
+# OneCore is new API stability "contract" that transcends Desktop, IoT and
 # Mobile[?] Windows editions. It's a set of "umbrella" libraries that
 # export subset of Win32 API that are common to all Windows 10 devices.
 #

+ 1 - 1
libs/openssl/Configurations/common0.tmpl

@@ -22,7 +22,7 @@
  our @generated =
      sort ( ( grep { defined $unified_info{generate}->{$_} }
               sort keys %generatables ),
-            # Scripts are assumed to be generated, so add thhem too
+            # Scripts are assumed to be generated, so add them too
             ( grep { defined $unified_info{sources}->{$_} }
               @{$unified_info{scripts}} ) );
 

+ 102 - 4
libs/openssl/Configurations/unix-Makefile.tmpl

@@ -14,6 +14,26 @@
      our $dsoext = $target{dso_extension} || ".so";
      our $makedepprog = $disabled{makedepend} ? undef : $config{makedepprog};
 
+     # $mingw_installroot and $mingw_commonroot are relevant for mingw only.
+     my $build_scheme = $target{build_scheme};
+     my $install_flavour = $build_scheme->[$#$build_scheme]; # last element
+     my $mingw_installenv = $install_flavour eq "WOW" ? "ProgramFiles(x86)"
+                                                      : "ProgramW6432";
+     my $mingw_commonenv = $install_flavour eq "WOW" ? "CommonProgramFiles(x86)"
+                                                     : "CommonProgramW6432";
+     our $mingw_installroot =
+         defined($ENV{$mingw_installenv}) ? $mingw_installenv : 'ProgramFiles';
+     our $mingw_commonroot =
+         defined($ENV{$mingw_commonenv}) ? $mingw_commonenv : 'CommonProgramFiles';
+     my $mingw_installdflt =
+         $install_flavour eq "WOW" ? "C:/Program Files (x86)"
+                                   : "C:/Program Files";
+     my $mingw_commondflt = "$mingw_installdflt/Common Files";
+
+     # expand variables early
+     $mingw_installroot = $ENV{$mingw_installroot} // $mingw_installdflt;
+     $mingw_commonroot = $ENV{$mingw_commonroot} // $mingw_commondflt;
+
      sub windowsdll { $config{target} =~ /^(?:Cygwin|mingw)/ }
 
      # Shared AIX support is special. We put libcrypto[64].so.ver into
@@ -132,6 +152,7 @@ APPS_OPENSSL={- use File::Spec::Functions;
 # Normally it is left empty.
 DESTDIR=
 
+{- output_off() if $config{target} =~ /^mingw/; "" -}
 # Do not edit these manually. Use Configure with --prefix or --openssldir
 # to change this!  Short explanation in the top comment in Configure
 INSTALLTOP={- # $prefix is used in the OPENSSLDIR perl snippet
@@ -175,6 +196,83 @@ ENGINESDIR=$(libdir)/engines-{- $sover_dirname -}
 # Convenience variable for those who want to set the rpath in shared
 # libraries and applications
 LIBRPATH=$(libdir)
+{- output_on() if $config{target} =~ /^mingw/;
+   output_off() if $config{target} !~ /^mingw/;
+   "" -}
+# Do not edit these manually. Use Configure with --prefix or --openssldir
+# to change this!  Short explanation in the top comment in Configure
+INSTALLTOP_dev={- # $prefix is used in the OPENSSLDIR perl snippet
+                  #
+                  use File::Spec::Win32;
+                  my $prefix_default = "$mingw_installroot/OpenSSL";
+                  our $prefix =
+                      File::Spec::Win32->canonpath($config{prefix}
+                                                  || $prefix_default);
+                  our ($prefix_dev, $prefix_dir, $prefix_file) =
+                      File::Spec::Win32->splitpath($prefix, 1);
+                  $prefix =~ s|\\|/|g;
+                  $prefix_dir =~ s|\\|/|g;
+                  $prefix_dev -}
+INSTALLTOP_dir={- my $x = File::Spec::Win32->canonpath($prefix_dir);
+                  $x =~ s|\\|/|g;
+                  $x -}
+OPENSSLDIR_dev={- #
+                  # The logic here is that if no --openssldir was given,
+                  # OPENSSLDIR will get the value "$mingw_commonroot/SSL".
+                  # If --openssldir was given and the value is an absolute
+                  # path, OPENSSLDIR will get its value without change.
+                  # If the value from --openssldir is a relative path,
+                  # OPENSSLDIR will get $prefix with the --openssldir
+                  # value appended as a subdirectory.
+                  #
+                  use File::Spec::Win32;
+                  our $openssldir =
+                      $config{openssldir} ?
+                          (File::Spec::Win32->file_name_is_absolute($config{openssldir}) ?
+                               File::Spec::Win32->canonpath($config{openssldir})
+                               : File::Spec::Win32->catdir($prefix, $config{openssldir}))
+                          : File::Spec::Win32->canonpath("$mingw_commonroot/SSL");
+                  our ($openssldir_dev, $openssldir_dir, $openssldir_file) =
+                      File::Spec::Win32->splitpath($openssldir, 1);
+                  $openssldir =~ s|\\|/|g;
+                  $openssldir_dir =~ s|\\|/|g;
+                  $openssldir_dev -}
+OPENSSLDIR_dir={- my $x = File::Spec::Win32->canonpath($openssldir_dir);
+                  $x =~ s|\\|/|g;
+                  $x -}
+LIBDIR={- our $libdir = $config{libdir} || "lib";
+          File::Spec::Win32->file_name_is_absolute($libdir) ? "" : $libdir -}
+ENGINESDIR_dev={- use File::Spec::Win32;
+                  our $enginesdir =
+                      File::Spec::Win32->catdir($prefix,$libdir,
+                                                "engines-$sover_dirname");
+                  our ($enginesdir_dev, $enginesdir_dir, $enginesdir_file) =
+                      File::Spec::Win32->splitpath($enginesdir, 1);
+                  $enginesdir =~ s|\\|/|g;
+                  $enginesdir_dir =~ s|\\|/|g;
+                  $enginesdir_dev -}
+ENGINESDIR_dir={- my $x = File::Spec::Win32->canonpath($enginesdir_dir);
+                  $x =~ s|\\|/|g;
+                  $x -}
+# In a Windows environment, $(DESTDIR) is harder to concatenate with other
+# directory variables, because both may contain devices.  What we do here is
+# to adapt INSTALLTOP, OPENSSLDIR and ENGINESDIR depending on if $(DESTDIR)
+# has a value or not, to ensure that concatenation will always work further
+# down.
+ifneq "$(DESTDIR)" ""
+INSTALLTOP=$(INSTALLTOP_dir)
+OPENSSLDIR=$(OPENSSLDIR_dir)
+ENGINESDIR=$(ENGINESDIR_dir)
+else
+INSTALLTOP=$(INSTALLTOP_dev)$(INSTALLTOP_dir)
+OPENSSLDIR=$(OPENSSLDIR_dev)$(OPENSSLDIR_dir)
+ENGINESDIR=$(ENGINESDIR_dev)$(ENGINESDIR_dir)
+endif
+
+# $(libdir) is chosen to be compatible with the GNU coding standards
+libdir={- File::Spec::Win32->file_name_is_absolute($libdir)
+          ? $libdir : '$(INSTALLTOP)/$(LIBDIR)' -}
+{- output_on() if $config{target} !~ /^mingw/; "" -}
 
 MANDIR=$(INSTALLTOP)/share/man
 DOCDIR=$(INSTALLTOP)/share/doc/$(BASENAME)
@@ -418,13 +516,13 @@ libclean:
 clean: libclean
 	$(RM) $(PROGRAMS) $(TESTPROGS) $(ENGINES) $(SCRIPTS)
 	$(RM) $(GENERATED_MANDATORY) $(GENERATED)
-	-$(RM) `find . -name .git -prune -o -name '*{- $depext -}' -print`
-	-$(RM) `find . -name .git -prune -o -name '*{- $objext -}' -print`
+	-$(RM) `find . -name '*{- $depext -}' \! -name '.*' -print`
+	-$(RM) `find . -name '*{- $objext -}' \! -name '.*' -print`
 	$(RM) core
 	$(RM) tags TAGS doc-nits
 	$(RM) -r test/test-runs
 	$(RM) openssl.pc libcrypto.pc libssl.pc
-	-$(RM) `find . -name .git -prune -o -type l -print`
+	-$(RM) `find . -type l \! -name '.*' -print`
 	$(RM) $(TARFILE)
 
 distclean: clean
@@ -449,7 +547,7 @@ uninstall_sw: uninstall_runtime uninstall_engines uninstall_dev
 install_docs: install_man_docs install_html_docs
 
 uninstall_docs: uninstall_man_docs uninstall_html_docs
-	$(RM) -r -v $(DESTDIR)$(DOCDIR)
+	$(RM) -r $(DESTDIR)$(DOCDIR)
 
 install_ssldirs:
 	@$(PERL) $(SRCDIR)/util/mkdir-p.pl $(DESTDIR)$(OPENSSLDIR)/certs

+ 1 - 4
libs/openssl/Configure

@@ -87,9 +87,6 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lx
 #               linked openssl executable has rather debugging value than
 #               production quality.
 #
-# DEBUG_SAFESTACK use type-safe stacks to enforce type-safety on stack items
-#               provided to stack calls. Generates unique stack functions for
-#               each possible stack type.
 # BN_LLONG      use the type 'long long' in crypto/bn/bn.h
 # RC4_CHAR      use 'char' instead of 'int' for RC4_INT in crypto/rc4/rc4.h
 # Following are set automatically by this script
@@ -145,13 +142,13 @@ my @gcc_devteam_warn = qw(
 #       -Wunused-macros -- no, too tricky for BN and _XOPEN_SOURCE etc
 #       -Wextended-offsetof -- no, needed in CMS ASN1 code
 my @clang_devteam_warn = qw(
+    -Wno-unknown-warning-option
     -Wswitch-default
     -Wno-parentheses-equality
     -Wno-language-extension-token
     -Wno-extended-offsetof
     -Wconditional-uninitialized
     -Wincompatible-pointer-types-discards-qualifiers
-    -Wno-unknown-warning-option
     -Wmissing-variable-declarations
 );
 

+ 25 - 4
libs/openssl/INSTALL

@@ -98,6 +98,9 @@
     $ nmake test
     $ nmake install
 
+ Note that in order to perform the install step above you need to have
+ appropriate permissions to write to the installation directory.
+
  If any of these steps fails, see section Installation in Detail below.
 
  This will build and install OpenSSL in the default location, which is:
@@ -107,6 +110,12 @@
            OpenSSL version number with underscores instead of periods.
   Windows: C:\Program Files\OpenSSL or C:\Program Files (x86)\OpenSSL
 
+ The installation directory should be appropriately protected to ensure
+ unprivileged users cannot make changes to OpenSSL binaries or files, or install
+ engines. If you already have a pre-installed version of OpenSSL as part of
+ your Operating System it is recommended that you do not overwrite the system
+ version and instead install to somewhere else.
+
  If you want to install it anywhere else, run config like this:
 
   On Unix:
@@ -135,7 +144,10 @@
                    Don't build with support for deprecated APIs below the
                    specified version number. For example "--api=1.1.0" will
                    remove support for all APIs that were deprecated in OpenSSL
-                   version 1.1.0 or below.
+                   version 1.1.0 or below. This is a rather specialized option
+                   for developers. If you just intend to remove all deprecated
+                   APIs entirely (up to the current version), it is easier
+                   to add the 'no-deprecated' option instead (see below).
 
   --cross-compile-prefix=PREFIX
                    The PREFIX to include in front of commands for your
@@ -229,7 +241,7 @@
                                source exists.
                    getrandom:  Use the L<getrandom(2)> or equivalent system
                                call.
-                   devrandom:  Use the the first device from the DEVRANDOM list
+                   devrandom:  Use the first device from the DEVRANDOM list
                                which can be opened to read random bytes. The
                                DEVRANDOM preprocessor constant expands to
                                "/dev/urandom","/dev/random","/dev/srandom" on
@@ -908,8 +920,11 @@
        $ mms install                                    ! OpenVMS
        $ nmake install                                  # Windows
 
-     This will install all the software components in this directory
-     tree under PREFIX (the directory given with --prefix or its
+     Note that in order to perform the install step above you need to have
+     appropriate permissions to write to the installation directory.
+
+     The above commands will install all the software components in this
+     directory tree under PREFIX (the directory given with --prefix or its
      default):
 
        Unix:
@@ -965,6 +980,12 @@
                         for private key files.
          misc           Various scripts.
 
+     The installation directory should be appropriately protected to ensure
+     unprivileged users cannot make changes to OpenSSL binaries or files, or
+     install engines. If you already have a pre-installed version of OpenSSL as
+     part of your Operating System it is recommended that you do not overwrite
+     the system version and instead install to somewhere else.
+
      Package builders who want to configure the library for standard
      locations, but have the package installed somewhere else so that
      it can easily be packaged, can use

+ 18 - 1
libs/openssl/NEWS

@@ -5,6 +5,23 @@
   This file gives a brief overview of the major changes between each OpenSSL
   release. For more details please read the CHANGES file.
 
+  Major changes between OpenSSL 1.1.1c and OpenSSL 1.1.1d [10 Sep 2019]
+
+      o Fixed a fork protection issue (CVE-2019-1549)
+      o Fixed a padding oracle in PKCS7_dataDecode and CMS_decrypt_set1_pkey
+        (CVE-2019-1563)
+      o For built-in EC curves, ensure an EC_GROUP built from the curve name is
+        used even when parsing explicit parameters
+      o Compute ECC cofactors if not provided during EC_GROUP construction
+        (CVE-2019-1547)
+      o Early start up entropy quality from the DEVRANDOM seed source has been
+        improved for older Linux systems
+      o Correct the extended master secret constant on EBCDIC systems
+      o Use Windows installation paths in the mingw builds (CVE-2019-1552)
+      o Changed DH_check to accept parameters with order q and 2q subgroups
+      o Significantly reduce secure memory usage by the randomness pools
+      o Revert the DEVRANDOM_WAIT feature for Linux systems
+
   Major changes between OpenSSL 1.1.1b and OpenSSL 1.1.1c [28 May 2019]
 
       o Prevent over long nonces in ChaCha20-Poly1305 (CVE-2019-1543)
@@ -601,7 +618,7 @@
 
   Major changes between OpenSSL 0.9.7h and OpenSSL 0.9.7i [14 Oct 2005]:
 
-      o Give EVP_MAX_MD_SIZE it's old value, except for a FIPS build.
+      o Give EVP_MAX_MD_SIZE its old value, except for a FIPS build.
 
   Major changes between OpenSSL 0.9.7g and OpenSSL 0.9.7h [11 Oct 2005]:
 

+ 15 - 0
libs/openssl/NOTES.WIN

@@ -109,6 +109,21 @@
    This naturally implies that you've installed corresponding add-on
    packages.
 
+ Independently of the method chosen to build for mingw, the installation
+ paths are similar to those used when building with VC-* targets, except
+ that in case the fallbacks mentioned there aren't possible (typically
+ when cross compiling on Linux), the paths will be the following:
+
+ For mingw:
+
+     PREFIX:      C:/Program Files (x86)/OpenSSL
+     OPENSSLDIR   C:/Program Files (x86)/Common Files/SSL
+
+ For mingw64:
+
+     PREFIX:      C:/Program Files/OpenSSL
+     OPENSSLDIR   C:/Program Files/Common Files/SSL
+
  Linking your application
  ========================
 

+ 1 - 1
libs/openssl/README

@@ -1,5 +1,5 @@
 
- OpenSSL 1.1.1c 28 May 2019
+ OpenSSL 1.1.1d 10 Sep 2019
 
  Copyright (c) 1998-2019 The OpenSSL Project
  Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson

+ 8 - 1
libs/openssl/apps/apps.c

@@ -40,7 +40,6 @@
 #endif
 #include <openssl/bn.h>
 #include <openssl/ssl.h>
-#include "s_apps.h"
 #include "apps.h"
 
 #ifdef _WIN32
@@ -48,6 +47,14 @@ static int WIN32_rename(const char *from, const char *to);
 # define rename(from,to) WIN32_rename((from),(to))
 #endif
 
+#if defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_MSDOS)
+# include <conio.h>
+#endif
+
+#if defined(OPENSSL_SYS_MSDOS) && !defined(_WIN32)
+# define _kbhit kbhit
+#endif
+
 typedef struct {
     const char *name;
     unsigned long flag;

+ 1 - 3
libs/openssl/apps/apps.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -444,11 +444,9 @@ void destroy_ui_method(void);
 const UI_METHOD *get_ui_method(void);
 
 int chopup_args(ARGS *arg, char *buf);
-# ifdef HEADER_X509_H
 int dump_cert_text(BIO *out, X509 *x);
 void print_name(BIO *out, const char *title, X509_NAME *nm,
                 unsigned long lflags);
-# endif
 void print_bignum_var(BIO *, const BIGNUM *, const char*,
                       int, unsigned char *);
 void print_array(BIO *, const char *, int, const unsigned char *);

+ 2 - 2
libs/openssl/apps/ca.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -722,7 +722,7 @@ end_of_options:
 
     /*****************************************************************/
     if (req || gencrl) {
-        if (spkac_file != NULL) {
+        if (spkac_file != NULL && outfile != NULL) {
             output_der = 1;
             batch = 1;
         }

+ 2 - 2
libs/openssl/apps/dgst.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -421,7 +421,7 @@ int do_fp(BIO *out, unsigned char *buf, BIO *bp, int sep, int binout,
     size_t len;
     int i;
 
-    for (;;) {
+    while (BIO_pending(bp) || !BIO_eof(bp)) {
         i = BIO_read(bp, (char *)buf, BUFSIZE);
         if (i < 0) {
             BIO_printf(bio_err, "Read Error in %s\n", file);

+ 1 - 1
libs/openssl/apps/enc.c

@@ -586,7 +586,7 @@ int enc_main(int argc, char **argv)
     if (benc != NULL)
         wbio = BIO_push(benc, wbio);
 
-    for (;;) {
+    while (BIO_pending(rbio) || !BIO_eof(rbio)) {
         inl = BIO_read(rbio, (char *)buff, bsize);
         if (inl <= 0)
             break;

+ 4 - 2
libs/openssl/apps/ocsp.c

@@ -1416,9 +1416,11 @@ static int do_responder(OCSP_REQUEST **preq, BIO **pcbio, BIO *acbio,
         *q = '\0';
 
         /*
-         * Skip "GET / HTTP..." requests often used by load-balancers
+         * Skip "GET / HTTP..." requests often used by load-balancers.  Note:
+         * 'p' was incremented above to point to the first byte *after* the
+         * leading slash, so with 'GET / ' it is now an empty string.
          */
-        if (p[1] == '\0')
+        if (p[0] == '\0')
             goto out;
 
         len = urldecode(p);

+ 1 - 2
libs/openssl/apps/openssl.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -22,7 +22,6 @@
 # include <openssl/engine.h>
 #endif
 #include <openssl/err.h>
-#include "s_apps.h"
 /* Needed to get the other O_xxx flags. */
 #ifdef OPENSSL_SYS_VMS
 # include <unixio.h>

+ 1 - 1
libs/openssl/apps/pkcs12.c

@@ -838,7 +838,7 @@ static int alg_print(const X509_ALGOR *alg)
                 goto done;
             }
             BIO_printf(bio_err, ", Salt length: %d, Cost(N): %ld, "
-                       "Block size(r): %ld, Paralelizm(p): %ld",
+                       "Block size(r): %ld, Parallelism(p): %ld",
                        ASN1_STRING_length(kdf->salt),
                        ASN1_INTEGER_get(kdf->costParameter),
                        ASN1_INTEGER_get(kdf->blockSize),

+ 13 - 3
libs/openssl/apps/req.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -881,9 +881,19 @@ int req_main(int argc, char **argv)
 
     if (text) {
         if (x509)
-            X509_print_ex(out, x509ss, get_nameopt(), reqflag);
+            ret = X509_print_ex(out, x509ss, get_nameopt(), reqflag);
         else
-            X509_REQ_print_ex(out, req, get_nameopt(), reqflag);
+            ret = X509_REQ_print_ex(out, req, get_nameopt(), reqflag);
+
+        if (ret == 0) {
+            if (x509)
+              BIO_printf(bio_err, "Error printing certificate\n");
+            else
+              BIO_printf(bio_err, "Error printing certificate request\n");
+
+            ERR_print_errors(bio_err);
+            goto end;
+        }
     }
 
     if (subject) {

+ 4 - 16
libs/openssl/apps/s_apps.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -9,13 +9,7 @@
 
 #include <openssl/opensslconf.h>
 
-#if defined(OPENSSL_SYS_WINDOWS) || defined(OPENSSL_SYS_MSDOS)
-# include <conio.h>
-#endif
-
-#if defined(OPENSSL_SYS_MSDOS) && !defined(_WIN32)
-# define _kbhit kbhit
-#endif
+#include <openssl/ssl.h>
 
 #define PORT            "4433"
 #define PROTOCOL        "tcp"
@@ -24,17 +18,15 @@ typedef int (*do_server_cb)(int s, int stype, int prot, unsigned char *context);
 int do_server(int *accept_sock, const char *host, const char *port,
               int family, int type, int protocol, do_server_cb cb,
               unsigned char *context, int naccept, BIO *bio_s_out);
-#ifdef HEADER_X509_H
+
 int verify_callback(int ok, X509_STORE_CTX *ctx);
-#endif
-#ifdef HEADER_SSL_H
+
 int set_cert_stuff(SSL_CTX *ctx, char *cert_file, char *key_file);
 int set_cert_key_stuff(SSL_CTX *ctx, X509 *cert, EVP_PKEY *key,
                        STACK_OF(X509) *chain, int build_chain);
 int ssl_print_sigalgs(BIO *out, SSL *s);
 int ssl_print_point_formats(BIO *out, SSL *s);
 int ssl_print_groups(BIO *out, SSL *s, int noshared);
-#endif
 int ssl_print_tmp_key(BIO *out, SSL *s);
 int init_client(int *sock, const char *host, const char *port,
                 const char *bindhost, const char *bindport,
@@ -44,13 +36,11 @@ int should_retry(int i);
 long bio_dump_callback(BIO *bio, int cmd, const char *argp,
                        int argi, long argl, long ret);
 
-#ifdef HEADER_SSL_H
 void apps_ssl_info_callback(const SSL *s, int where, int ret);
 void msg_cb(int write_p, int version, int content_type, const void *buf,
             size_t len, SSL *ssl, void *arg);
 void tlsext_cb(SSL *s, int client_server, int type, const unsigned char *data,
                int len, void *arg);
-#endif
 
 int generate_cookie_callback(SSL *ssl, unsigned char *cookie,
                              unsigned int *cookie_len);
@@ -75,7 +65,6 @@ int args_excert(int option, SSL_EXCERT **pexc);
 int load_excert(SSL_EXCERT **pexc);
 void print_verify_detail(SSL *s, BIO *bio);
 void print_ssl_summary(SSL *s);
-#ifdef HEADER_SSL_H
 int config_ctx(SSL_CONF_CTX *cctx, STACK_OF(OPENSSL_STRING) *str, SSL_CTX *ctx);
 int ssl_ctx_add_crls(SSL_CTX *ctx, STACK_OF(X509_CRL) *crls,
                      int crl_download);
@@ -86,4 +75,3 @@ int ssl_load_stores(SSL_CTX *ctx, const char *vfyCApath,
 void ssl_ctx_security_debug(SSL_CTX *ctx, int verbose);
 int set_keylog_file(SSL_CTX *ctx, const char *keylog_file);
 void print_ca_names(BIO *bio, SSL *s);
-#endif

+ 2 - 1
libs/openssl/apps/s_cb.c

@@ -1525,7 +1525,8 @@ void print_ca_names(BIO *bio, SSL *s)
     int i;
 
     if (sk == NULL || sk_X509_NAME_num(sk) == 0) {
-        BIO_printf(bio, "---\nNo %s certificate CA names sent\n", cs);
+        if (!SSL_is_server(s))
+            BIO_printf(bio, "---\nNo %s certificate CA names sent\n", cs);
         return;
     }
 

+ 1 - 1
libs/openssl/apps/s_client.c

@@ -2345,7 +2345,7 @@ int s_client_main(int argc, char **argv)
             (void)BIO_flush(fbio);
             /*
              * The first line is the HTTP response.  According to RFC 7230,
-             * it's formated exactly like this:
+             * it's formatted exactly like this:
              *
              * HTTP/d.d ddd Reason text\r\n
              */

+ 1 - 1
libs/openssl/apps/speed.c

@@ -1790,7 +1790,7 @@ int speed_main(int argc, char **argv)
         }
 
         buflen = lengths[size_num - 1];
-        if (buflen < 36)    /* size of random vector in RSA bencmark */
+        if (buflen < 36)    /* size of random vector in RSA benchmark */
             buflen = 36;
         buflen += MAX_MISALIGNMENT + 1;
         loopargs[i].buf_malloc = app_malloc(buflen, "input buffer");

+ 2 - 2
libs/openssl/apps/storeutl.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -125,7 +125,7 @@ int storeutl_main(int argc, char *argv[])
                 }
                 /*
                  * If expected wasn't set at this point, it means the map
-                 * isn't syncronised with the possible options leading here.
+                 * isn't synchronised with the possible options leading here.
                  */
                 OPENSSL_assert(expected != 0);
             }

+ 4 - 4
libs/openssl/config

@@ -498,12 +498,12 @@ case "$GUESSOS" in
 	    OUT="darwin64-x86_64-cc"
 	fi ;;
   armv6+7-*-iphoneos)
-	__CNF_CFLAGS="$__CNF_CFLAGS -arch%20armv6 -arch%20armv7"
-	__CNF_CXXFLAGS="$__CNF_CXXFLAGS -arch%20armv6 -arch%20armv7"
+	__CNF_CFLAGS="$__CNF_CFLAGS -arch armv6 -arch armv7"
+	__CNF_CXXFLAGS="$__CNF_CXXFLAGS -arch armv6 -arch armv7"
 	OUT="iphoneos-cross" ;;
   *-*-iphoneos)
-	__CNF_CFLAGS="$__CNF_CFLAGS -arch%20${MACHINE}"
-	__CNF_CXXFLAGS="$__CNF_CXXFLAGS -arch%20${MACHINE}"
+	__CNF_CFLAGS="$__CNF_CFLAGS -arch ${MACHINE}"
+	__CNF_CXXFLAGS="$__CNF_CXXFLAGS -arch ${MACHINE}"
 	OUT="iphoneos-cross" ;;
   arm64-*-iphoneos|*-*-ios64)
 	OUT="ios64-cross" ;;

+ 0 - 3000
libs/openssl/crypto/aes/asm/aes-586.pl

@@ -1,3000 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <[email protected]> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 4.3.
-#
-# You might fail to appreciate this module performance from the first
-# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
-# to be *the* best Intel C compiler without -KPIC, performance appears
-# to be virtually identical... But try to re-configure with shared
-# library support... Aha! Intel compiler "suddenly" lags behind by 30%
-# [on P4, more on others]:-) And if compared to position-independent
-# code generated by GNU C, this code performs *more* than *twice* as
-# fast! Yes, all this buzz about PIC means that unlike other hand-
-# coded implementations, this one was explicitly designed to be safe
-# to use even in shared library context... This also means that this
-# code isn't necessarily absolutely fastest "ever," because in order
-# to achieve position independence an extra register has to be
-# off-loaded to stack, which affects the benchmark result.
-#
-# Special note about instruction choice. Do you recall RC4_INT code
-# performing poorly on P4? It might be the time to figure out why.
-# RC4_INT code implies effective address calculations in base+offset*4
-# form. Trouble is that it seems that offset scaling turned out to be
-# the critical path... At least eliminating scaling resulted in 2.8x RC4
-# performance improvement [as you might recall]. As AES code is hungry
-# for scaling too, I [try to] avoid the latter by favoring off-by-2
-# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
-#
-# As was shown by Dean Gaudet, the above note turned out to be
-# void. Performance improvement with off-by-2 shifts was observed on
-# intermediate implementation, which was spilling yet another register
-# to stack... Final offset*4 code below runs just a tad faster on P4,
-# but exhibits up to 10% improvement on other cores.
-#
-# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
-# This made it possible to implement little-endian variant of the
-# algorithm without modifying the base C code. Motivating factor for
-# the undertaken effort was that it appeared that in tight IA-32
-# register window little-endian flavor could achieve slightly higher
-# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
-#
-# Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance improvement of CBC benchmark results. 40% was
-# observed on P4 core, where "overall" improvement coefficient, i.e. if
-# compared to PIC generated by GCC and in CBC mode, was observed to be
-# as large as 4x:-) CBC performance is virtually identical to ECB now
-# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
-# Opteron, because certain function prologues and epilogues are
-# effectively taken out of the loop...
-#
-# Version 3.2 implements compressed tables and prefetch of these tables
-# in CBC[!] mode. Former means that 3/4 of table references are now
-# misaligned, which unfortunately has negative impact on elder IA-32
-# implementations, Pentium suffered 30% penalty, PIII - 10%.
-#
-# Version 3.3 avoids L1 cache aliasing between stack frame and
-# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
-# latter is achieved by copying the key schedule to controlled place in
-# stack. This unfortunately has rather strong impact on small block CBC
-# performance, ~2x deterioration on 16-byte block if compared to 3.3.
-#
-# Version 3.5 checks if there is L1 cache aliasing between user-supplied
-# key schedule and S-boxes and abstains from copying the former if
-# there is no. This allows end-user to consciously retain small block
-# performance by aligning key schedule in specific manner.
-#
-# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
-#
-# Current ECB performance numbers for 128-bit key in CPU cycles per
-# processed byte [measure commonly used by AES benchmarkers] are:
-#
-#		small footprint		fully unrolled
-# P4		24			22
-# AMD K8	20			19
-# PIII		25			23
-# Pentium	81			78
-#
-# Version 3.7 reimplements outer rounds as "compact." Meaning that
-# first and last rounds reference compact 256 bytes S-box. This means
-# that first round consumes a lot more CPU cycles and that encrypt
-# and decrypt performance becomes asymmetric. Encrypt performance
-# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
-# aggressively pre-fetched.
-#
-# Version 4.0 effectively rolls back to 3.6 and instead implements
-# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
-# which use exclusively 256 byte S-box. These functions are to be
-# called in modes not concealing plain text, such as ECB, or when
-# we're asked to process smaller amount of data [or unconditionally
-# on hyper-threading CPU]. Currently it's called unconditionally from
-# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
-# still needs to be modified to switch between slower and faster
-# mode when appropriate... But in either case benchmark landscape
-# changes dramatically and below numbers are CPU cycles per processed
-# byte for 128-bit key.
-#
-#		ECB encrypt	ECB decrypt	CBC large chunk
-# P4		52[54]		83[95]		23
-# AMD K8	46[41]		66[70]		18
-# PIII		41[50]		60[77]		24
-# Core 2	31[36]		45[64]		18.5
-# Atom		76[100]		96[138]		60
-# Pentium	115		150		77
-#
-# Version 4.1 switches to compact S-box even in key schedule setup.
-#
-# Version 4.2 prefetches compact S-box in every SSE round or in other
-# words every cache-line is *guaranteed* to be accessed within ~50
-# cycles window. Why just SSE? Because it's needed on hyper-threading
-# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
-#
-# Version 4.3 implements switch between compact and non-compact block
-# functions in AES_cbc_encrypt depending on how much data was asked
-# to be processed in one stroke.
-#
-######################################################################
-# Timing attacks are classified in two classes: synchronous when
-# attacker consciously initiates cryptographic operation and collects
-# timing data of various character afterwards, and asynchronous when
-# malicious code is executed on same CPU simultaneously with AES,
-# instruments itself and performs statistical analysis of this data.
-#
-# As far as synchronous attacks go the root to the AES timing
-# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
-# are referred to in single 128-bit block operation. Well, in C
-# implementation with 4 distinct tables it's actually as little as 40
-# references per 256 elements table, but anyway... Secondly, even
-# though S-box elements are clustered into smaller amount of cache-
-# lines, smaller than 160 and even 40, it turned out that for certain
-# plain-text pattern[s] or simply put chosen plain-text and given key
-# few cache-lines remain unaccessed during block operation. Now, if
-# attacker can figure out this access pattern, he can deduce the key
-# [or at least part of it]. The natural way to mitigate this kind of
-# attacks is to minimize the amount of cache-lines in S-box and/or
-# prefetch them to ensure that every one is accessed for more uniform
-# timing. But note that *if* plain-text was concealed in such way that
-# input to block function is distributed *uniformly*, then attack
-# wouldn't apply. Now note that some encryption modes, most notably
-# CBC, do mask the plain-text in this exact way [secure cipher output
-# is distributed uniformly]. Yes, one still might find input that
-# would reveal the information about given key, but if amount of
-# candidate inputs to be tried is larger than amount of possible key
-# combinations then attack becomes infeasible. This is why revised
-# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
-# of data is to be processed in one stroke. The current size limit of
-# 512 bytes is chosen to provide same [diminishingly low] probability
-# for cache-line to remain untouched in large chunk operation with
-# large S-box as for single block operation with compact S-box and
-# surely needs more careful consideration...
-#
-# As for asynchronous attacks. There are two flavours: attacker code
-# being interleaved with AES on hyper-threading CPU at *instruction*
-# level, and two processes time sharing single core. As for latter.
-# Two vectors. 1. Given that attacker process has higher priority,
-# yield execution to process performing AES just before timer fires
-# off the scheduler, immediately regain control of CPU and analyze the
-# cache state. For this attack to be efficient attacker would have to
-# effectively slow down the operation by several *orders* of magnitude,
-# by ratio of time slice to duration of handful of AES rounds, which is
-# unlikely to remain unnoticed. Not to mention that this also means
-# that he would spend correspondingly more time to collect enough
-# statistical data to mount the attack. It's probably appropriate to
-# say that if an adversary reckons that this attack is beneficial and
-# is willing to risk being noticed, you probably have larger problems
-# in having given him the mere opportunity. In other words, the
-# suggested code design expects you
-# to preclude/mitigate this attack by overall system security design.
-# 2. Attacker manages to make his code interrupt driven. In order for
-# this kind of attack to be feasible, interrupt rate has to be high
-# enough, again comparable to duration of handful of AES rounds. But
-# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
-# generates interrupts at such raging rate...
-#
-# And now back to the former, hyper-threading CPU or more specifically
-# Intel P4. Recall that asynchronous attack implies that malicious
-# code instruments itself. And naturally instrumentation granularity
-# has to be noticeably lower than duration of codepath accessing S-box.
-# Given that all cache-lines are accessed during that time that is.
-# Current implementation accesses *all* cache-lines within ~50 cycles
-# window, which is actually *less* than RDTSC latency on Intel P4!
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-$output = pop;
-open OUT,">$output";
-*STDOUT=*OUT;
-
-&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
-&static_label("AES_Te");
-&static_label("AES_Td");
-
-$s0="eax";
-$s1="ebx";
-$s2="ecx";
-$s3="edx";
-$key="edi";
-$acc="esi";
-$tbl="ebp";
-
-# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
-# by caller
-$__ra=&DWP(0,"esp");	# return address
-$__s0=&DWP(4,"esp");	# s0 backing store
-$__s1=&DWP(8,"esp");	# s1 backing store
-$__s2=&DWP(12,"esp");	# s2 backing store
-$__s3=&DWP(16,"esp");	# s3 backing store
-$__key=&DWP(20,"esp");	# pointer to key schedule
-$__end=&DWP(24,"esp");	# pointer to end of key schedule
-$__tbl=&DWP(28,"esp");	# %ebp backing store
-
-# stack frame layout in AES_[en|de]crypt routines, which differs from
-# above by 4 and overlaps by %ebp backing store
-$_tbl=&DWP(24,"esp");
-$_esp=&DWP(28,"esp");
-
-sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
-
-$speed_limit=512;	# chunks smaller than $speed_limit are
-			# processed with compact routine in CBC mode
-$small_footprint=1;	# $small_footprint=1 code is ~5% slower [on
-			# recent µ-archs], but ~5 times smaller!
-			# I favor compact code to minimize cache
-			# contention and in hope to "collect" 5% back
-			# in real-life applications...
-
-$vertical_spin=0;	# shift "vertically" defaults to 0, because of
-			# its proof-of-concept status...
-# Note that there is no decvert(), as well as last encryption round is
-# performed with "horizontal" shifts. This is because this "vertical"
-# implementation [one which groups shifts on a given $s[i] to form a
-# "column," unlike "horizontal" one, which groups shifts on different
-# $s[i] to form a "row"] is work in progress. It was observed to run
-# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
-# whole 12% slower:-( So we face a trade-off... Shall it be resolved
-# some day? Till then the code is considered experimental and by
-# default remains dormant...
-
-sub encvert()
-{ my ($te,@s) = @_;
-  my ($v0,$v1) = ($acc,$key);
-
-	&mov	($v0,$s[3]);				# copy s3
-	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
-	&mov	($v1,$s[0]);				# copy s0
-	&mov	(&DWP(8,"esp"),$s[1]);			# save s1
-
-	&movz	($s[2],&HB($s[0]));
-	&and	($s[0],0xFF);
-	&mov	($s[0],&DWP(0,$te,$s[0],8));		# s0>>0
-	&shr	($v1,16);
-	&mov	($s[3],&DWP(3,$te,$s[2],8));		# s0>>8
-	&movz	($s[1],&HB($v1));
-	&and	($v1,0xFF);
-	&mov	($s[2],&DWP(2,$te,$v1,8));		# s0>>16
-	 &mov	($v1,$v0);
-	&mov	($s[1],&DWP(1,$te,$s[1],8));		# s0>>24
-
-	&and	($v0,0xFF);
-	&xor	($s[3],&DWP(0,$te,$v0,8));		# s3>>0
-	&movz	($v0,&HB($v1));
-	&shr	($v1,16);
-	&xor	($s[2],&DWP(3,$te,$v0,8));		# s3>>8
-	&movz	($v0,&HB($v1));
-	&and	($v1,0xFF);
-	&xor	($s[1],&DWP(2,$te,$v1,8));		# s3>>16
-	 &mov	($v1,&DWP(4,"esp"));			# restore s2
-	&xor	($s[0],&DWP(1,$te,$v0,8));		# s3>>24
-
-	&mov	($v0,$v1);
-	&and	($v1,0xFF);
-	&xor	($s[2],&DWP(0,$te,$v1,8));		# s2>>0
-	&movz	($v1,&HB($v0));
-	&shr	($v0,16);
-	&xor	($s[1],&DWP(3,$te,$v1,8));		# s2>>8
-	&movz	($v1,&HB($v0));
-	&and	($v0,0xFF);
-	&xor	($s[0],&DWP(2,$te,$v0,8));		# s2>>16
-	 &mov	($v0,&DWP(8,"esp"));			# restore s1
-	&xor	($s[3],&DWP(1,$te,$v1,8));		# s2>>24
-
-	&mov	($v1,$v0);
-	&and	($v0,0xFF);
-	&xor	($s[1],&DWP(0,$te,$v0,8));		# s1>>0
-	&movz	($v0,&HB($v1));
-	&shr	($v1,16);
-	&xor	($s[0],&DWP(3,$te,$v0,8));		# s1>>8
-	&movz	($v0,&HB($v1));
-	&and	($v1,0xFF);
-	&xor	($s[3],&DWP(2,$te,$v1,8));		# s1>>16
-	 &mov	($key,$__key);				# reincarnate v1 as key
-	&xor	($s[2],&DWP(1,$te,$v0,8));		# s1>>24
-}
-
-# Another experimental routine, which features "horizontal spin," but
-# eliminates one reference to stack. Strangely enough runs slower...
-sub enchoriz()
-{ my ($v0,$v1) = ($key,$acc);
-
-	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
-	&rotr	($s2,8);			#  8,11,10, 9
-	&mov	($v1,&DWP(0,$te,$v0,8));	#  0
-	&movz	($v0,&HB($s1));			#  7, 6, 5*, 4
-	&rotr	($s3,16);			# 13,12,15,14
-	&xor	($v1,&DWP(3,$te,$v0,8));	#  5
-	&movz	($v0,&HB($s2));			#  8,11,10*, 9
-	&rotr	($s0,16);			#  1, 0, 3, 2
-	&xor	($v1,&DWP(2,$te,$v0,8));	# 10
-	&movz	($v0,&HB($s3));			# 13,12,15*,14
-	&xor	($v1,&DWP(1,$te,$v0,8));	# 15, t[0] collected
-	&mov	($__s0,$v1);			# t[0] saved
-
-	&movz	($v0,&LB($s1));			#  7, 6, 5, 4*
-	&shr	($s1,16);			#  -, -, 7, 6
-	&mov	($v1,&DWP(0,$te,$v0,8));	#  4
-	&movz	($v0,&LB($s3));			# 13,12,15,14*
-	&xor	($v1,&DWP(2,$te,$v0,8));	# 14
-	&movz	($v0,&HB($s0));			#  1, 0, 3*, 2
-	&and	($s3,0xffff0000);		# 13,12, -, -
-	&xor	($v1,&DWP(1,$te,$v0,8));	#  3
-	&movz	($v0,&LB($s2));			#  8,11,10, 9*
-	&or	($s3,$s1);			# 13,12, 7, 6
-	&xor	($v1,&DWP(3,$te,$v0,8));	#  9, t[1] collected
-	&mov	($s1,$v1);			#  s[1]=t[1]
-
-	&movz	($v0,&LB($s0));			#  1, 0, 3, 2*
-	&shr	($s2,16);			#  -, -, 8,11
-	&mov	($v1,&DWP(2,$te,$v0,8));	#  2
-	&movz	($v0,&HB($s3));			# 13,12, 7*, 6
-	&xor	($v1,&DWP(1,$te,$v0,8));	#  7
-	&movz	($v0,&HB($s2));			#  -, -, 8*,11
-	&xor	($v1,&DWP(0,$te,$v0,8));	#  8
-	&mov	($v0,$s3);
-	&shr	($v0,24);			# 13
-	&xor	($v1,&DWP(3,$te,$v0,8));	# 13, t[2] collected
-
-	&movz	($v0,&LB($s2));			#  -, -, 8,11*
-	&shr	($s0,24);			#  1*
-	&mov	($s2,&DWP(1,$te,$v0,8));	# 11
-	&xor	($s2,&DWP(3,$te,$s0,8));	#  1
-	&mov	($s0,$__s0);			# s[0]=t[0]
-	&movz	($v0,&LB($s3));			# 13,12, 7, 6*
-	&shr	($s3,16);			#   ,  ,13,12
-	&xor	($s2,&DWP(2,$te,$v0,8));	#  6
-	&mov	($key,$__key);			# reincarnate v0 as key
-	&and	($s3,0xff);			#   ,  ,13,12*
-	&mov	($s3,&DWP(0,$te,$s3,8));	# 12
-	&xor	($s3,$s2);			# s[2]=t[3] collected
-	&mov	($s2,$v1);			# s[2]=t[2]
-}
-
-# More experimental code... SSE one... Even though this one eliminates
-# *all* references to stack, it's not faster...
-sub sse_encbody()
-{
-	&movz	($acc,&LB("eax"));		#  0
-	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  0
-	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
-	&movz	("edx",&HB("eax"));		#  1
-	&mov	("edx",&DWP(3,$tbl,"edx",8));	#  1
-	&shr	("eax",16);			#  5, 4
-
-	&movz	($acc,&LB("ebx"));		# 10
-	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 10
-	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
-	&movz	($acc,&HB("ebx"));		# 11
-	&xor	("edx",&DWP(1,$tbl,$acc,8));	# 11
-	&shr	("ebx",16);			# 15,14
-
-	&movz	($acc,&HB("eax"));		#  5
-	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  5
-	&movq	("mm3",QWP(16,$key));
-	&movz	($acc,&HB("ebx"));		# 15
-	&xor	("ecx",&DWP(1,$tbl,$acc,8));	# 15
-	&movd	("mm0","ecx");			# t[0] collected
-
-	&movz	($acc,&LB("eax"));		#  4
-	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  4
-	&movd	("eax","mm2");			#  7, 6, 3, 2
-	&movz	($acc,&LB("ebx"));		# 14
-	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 14
-	&movd	("ebx","mm6");			# 13,12, 9, 8
-
-	&movz	($acc,&HB("eax"));		#  3
-	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  3
-	&movz	($acc,&HB("ebx"));		#  9
-	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  9
-	&movd	("mm1","ecx");			# t[1] collected
-
-	&movz	($acc,&LB("eax"));		#  2
-	&mov	("ecx",&DWP(2,$tbl,$acc,8));	#  2
-	&shr	("eax",16);			#  7, 6
-	&punpckldq	("mm0","mm1");		# t[0,1] collected
-	&movz	($acc,&LB("ebx"));		#  8
-	&xor	("ecx",&DWP(0,$tbl,$acc,8));	#  8
-	&shr	("ebx",16);			# 13,12
-
-	&movz	($acc,&HB("eax"));		#  7
-	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  7
-	&pxor	("mm0","mm3");
-	&movz	("eax",&LB("eax"));		#  6
-	&xor	("edx",&DWP(2,$tbl,"eax",8));	#  6
-	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
-	&movz	($acc,&HB("ebx"));		# 13
-	&xor	("ecx",&DWP(3,$tbl,$acc,8));	# 13
-	&xor	("ecx",&DWP(24,$key));		# t[2]
-	&movd	("mm4","ecx");			# t[2] collected
-	&movz	("ebx",&LB("ebx"));		# 12
-	&xor	("edx",&DWP(0,$tbl,"ebx",8));	# 12
-	&shr	("ecx",16);
-	&movd	("eax","mm1");			#  5, 4, 1, 0
-	&mov	("ebx",&DWP(28,$key));		# t[3]
-	&xor	("ebx","edx");
-	&movd	("mm5","ebx");			# t[3] collected
-	&and	("ebx",0xffff0000);
-	&or	("ebx","ecx");
-
-	&punpckldq	("mm4","mm5");		# t[2,3] collected
-}
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub enccompact()
-{ my $Fn = \&mov;
-  while ($#_>5) { pop(@_); $Fn=sub{}; }
-  my ($i,$te,@s)=@_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-	# $Fn is used in first compact round and its purpose is to
-	# void restoration of some values from stack, so that after
-	# 4xenccompact with extra argument $key value is left there...
-	if ($i==3)  {	&$Fn	($key,$__key);			}##%edx
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
-	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
-			&movz	($out,&BP(-128,$te,$out,1));
-
-	if ($i==3)  {	$tmp=$s[1];				}##%eax
-			&movz	($tmp,&HB($s[1]));
-			&movz	($tmp,&BP(-128,$te,$tmp,1));
-			&shl	($tmp,8);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
-	else        {	&mov	($tmp,$s[2]);
-			&shr	($tmp,16);			}
-	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
-			&and	($tmp,0xFF);
-			&movz	($tmp,&BP(-128,$te,$tmp,1));
-			&shl	($tmp,16);
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
-	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]);
-			&shr	($tmp,24);			}
-			&movz	($tmp,&BP(-128,$te,$tmp,1));
-			&shl	($tmp,24);
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
-	if ($i==3)  {	&mov	($s[3],$acc);			}
-	&comment();
-}
-
-sub enctransform()
-{ my @s = ($s0,$s1,$s2,$s3);
-  my $i = shift;
-  my $tmp = $tbl;
-  my $r2  = $key ;
-
-	&and	($tmp,$s[$i]);
-	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&and	($r2,0xfefefefe);
-	&sub	($acc,$tmp);
-	&mov	($tmp,$s[$i]);
-	&and	($acc,0x1b1b1b1b);
-	&rotr	($tmp,16);
-	&xor	($acc,$r2);	# r2
-	&mov	($r2,$s[$i]);
-
-	&xor	($s[$i],$acc);	# r0 ^ r2
-	&rotr	($r2,16+8);
-	&xor	($acc,$tmp);
-	&rotl	($s[$i],24);
-	&xor	($acc,$r2);
-	&mov	($tmp,0x80808080)	if ($i!=1);
-	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
-}
-
-&function_begin_B("_x86_AES_encrypt_compact");
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	($__end,$acc);			# end of key schedule
-
-	# prefetch Te4
-	&mov	($key,&DWP(0-128,$tbl));
-	&mov	($acc,&DWP(32-128,$tbl));
-	&mov	($key,&DWP(64-128,$tbl));
-	&mov	($acc,&DWP(96-128,$tbl));
-	&mov	($key,&DWP(128-128,$tbl));
-	&mov	($acc,&DWP(160-128,$tbl));
-	&mov	($key,&DWP(192-128,$tbl));
-	&mov	($acc,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-
-		&enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
-		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
-		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
-		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
-		&mov	($tbl,0x80808080);
-		&enctransform(2);
-		&enctransform(3);
-		&enctransform(0);
-		&enctransform(1);
-		&mov 	($key,$__key);
-		&mov	($tbl,$__tbl);
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-	&cmp	($key,$__end);
-	&mov	($__key,$key);
-	&jb	(&label("loop"));
-
-	&enccompact(0,$tbl,$s0,$s1,$s2,$s3);
-	&enccompact(1,$tbl,$s1,$s2,$s3,$s0);
-	&enccompact(2,$tbl,$s2,$s3,$s0,$s1);
-	&enccompact(3,$tbl,$s3,$s0,$s1,$s2);
-
-	&xor	($s0,&DWP(16,$key));
-	&xor	($s1,&DWP(20,$key));
-	&xor	($s2,&DWP(24,$key));
-	&xor	($s3,&DWP(28,$key));
-
-	&ret	();
-&function_end_B("_x86_AES_encrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-#
-# Performance is not actually extraordinary in comparison to pure
-# x86 code. In particular encrypt performance is virtually the same.
-# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
-# better on PIII:-) And additionally on the pros side this code
-# eliminates redundant references to stack and thus relieves/
-# minimizes the pressure on the memory bus.
-#
-# MMX register layout                           lsb
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |          mm4          |          mm0          |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |     s3    |     s2    |     s1    |     s0    |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-#
-# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
-# In these terms encryption and decryption "compact" permutation
-# matrices can be depicted as follows:
-#
-# encryption              lsb	# decryption              lsb
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t0 || 15 | 10 |  5 |  0 |	# | t0 ||  7 | 10 | 13 |  0 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t1 ||  3 | 14 |  9 |  4 |	# | t1 || 11 | 14 |  1 |  4 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t2 ||  7 |  2 | 13 |  8 |	# | t2 || 15 |  2 |  5 |  8 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-# | t3 || 11 |  6 |  1 | 12 |	# | t3 ||  3 |  6 |  9 | 12 |
-# +----++----+----+----+----+	# +----++----+----+----+----+
-#
-######################################################################
-# Why not xmm registers? Short answer. It was actually tested and
-# was not any faster, but *contrary*, most notably on Intel CPUs.
-# Longer answer. Main advantage of using mm registers is that movd
-# latency is lower, especially on Intel P4. While arithmetic
-# instructions are twice as many, they can be scheduled every cycle
-# and not every second one when they are operating on xmm register,
-# so that "arithmetic throughput" remains virtually the same. And
-# finally the code can be executed even on older SSE-only CPUs:-)
-
-sub sse_enccompact()	# emits byte lookups at -128($tbl) on the state in mm0/mm4; t[0..3] are repacked into mm0/mm4
-{
-	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
-	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
-	&movd	("eax","mm1");			#  5, 4, 1, 0
-	&movd	("ebx","mm5");			# 15,14,11,10
-	&mov	($__key,$key);			# $key is used as a scratch index register below
-
-	&movz	($acc,&LB("eax"));		#  0
-	&movz	("edx",&HB("eax"));		#  1
-	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
-	&movz	($key,&LB("ebx"));		# 10
-	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
-	&shr	("eax",16);			#  5, 4
-	&shl	("edx",8);			#  1
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
-	&movz	($key,&HB("ebx"));		# 11
-	&shl	($acc,16);			# 10
-	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
-	&or	("ecx",$acc);			# 10
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
-	&movz	($key,&HB("eax"));		#  5
-	&shl	($acc,24);			# 11
-	&shr	("ebx",16);			# 15,14
-	&or	("edx",$acc);			# 11
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
-	&movz	($key,&HB("ebx"));		# 15
-	&shl	($acc,8);			#  5
-	&or	("ecx",$acc);			#  5
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
-	&movz	($key,&LB("eax"));		#  4
-	&shl	($acc,24);			# 15
-	&or	("ecx",$acc);			# 15
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
-	&movz	($key,&LB("ebx"));		# 14
-	&movd	("eax","mm2");			#  7, 6, 3, 2
-	&movd	("mm0","ecx");			# t[0] collected
-	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
-	&movz	($key,&HB("eax"));		#  3
-	&shl	("ecx",16);			# 14
-	&movd	("ebx","mm6");			# 13,12, 9, 8
-	&or	("ecx",$acc);			# 14
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
-	&movz	($key,&HB("ebx"));		#  9
-	&shl	($acc,24);			#  3
-	&or	("ecx",$acc);			#  3
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
-	&movz	($key,&LB("ebx"));		#  8
-	&shl	($acc,8);			#  9
-	&shr	("ebx",16);			# 13,12
-	&or	("ecx",$acc);			#  9
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
-	&movz	($key,&LB("eax"));		#  2
-	&shr	("eax",16);			#  7, 6
-	&movd	("mm1","ecx");			# t[1] collected
-	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
-	&movz	($key,&HB("eax"));		#  7
-	&shl	("ecx",16);			#  2
-	&and	("eax",0xff);			#  6
-	&or	("ecx",$acc);			#  2
-
-	&punpckldq	("mm0","mm1");		# t[0,1] collected
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
-	&movz	($key,&HB("ebx"));		# 13
-	&shl	($acc,24);			#  7
-	&and	("ebx",0xff);			# 12
-	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
-	&or	("ecx",$acc);			#  7
-	&shl	("eax",16);			#  6
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
-	&or	("edx","eax");			#  6
-	&shl	($acc,8);			# 13
-	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
-	&or	("ecx",$acc);			# 13
-	&or	("edx","ebx");			# 12
-	&mov	($key,$__key);			# restore $key saved on entry
-	&movd	("mm4","ecx");			# t[2] collected
-	&movd	("mm5","edx");			# t[3] collected
-
-	&punpckldq	("mm4","mm5");		# t[2,3] collected
-}
-
-					if (!$x86only) {
-&function_begin_B("_sse_AES_encrypt_compact");	# "compact" encrypt of the block held in mm0 (low 8 bytes) and mm4 (high 8 bytes)
-	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
-	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8
-
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));	# acc = 2*rounds-2
-	&lea	($acc,&DWP(0,$key,$acc,8));	# acc = key+8*acc = key+16*(rounds-1)
-	&mov	($__end,$acc);			# end of key schedule
-
-	&mov	($s0,0x1b1b1b1b);		# magic constant
-	&mov	(&DWP(8,"esp"),$s0);		# low half of the qword read into mm2 in the loop
-	&mov	(&DWP(12,"esp"),$s0);		# high half of the same qword
-
-	# prefetch Te4
-	&mov	($s0,&DWP(0-128,$tbl));		# loaded values are overwritten without being used
-	&mov	($s1,&DWP(32-128,$tbl));
-	&mov	($s2,&DWP(64-128,$tbl));
-	&mov	($s3,&DWP(96-128,$tbl));
-	&mov	($s0,&DWP(128-128,$tbl));
-	&mov	($s1,&DWP(160-128,$tbl));
-	&mov	($s2,&DWP(192-128,$tbl));
-	&mov	($s3,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-		&sse_enccompact();		# table lookups with the "encryption" byte permutation, result in mm0/mm4
-		&add	($key,16);		# advance to the next round key
-		&cmp	($key,$__end);
-		&ja	(&label("out"));	# past end of schedule: skip the transform below
-
-		&movq	("mm2",&QWP(8,"esp"));	# 0x1b in every byte (stored above)
-		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
-		&movq	("mm1","mm0");		&movq	("mm5","mm4");	# r0
-		&pcmpgtb("mm3","mm0");		&pcmpgtb("mm7","mm4");	# 0xff in every byte whose top bit is set (0 > byte, signed)
-		&pand	("mm3","mm2");		&pand	("mm7","mm2");	# 0x1b in those bytes, 0 elsewhere
-		&pshufw	("mm2","mm0",0xb1);	&pshufw	("mm6","mm4",0xb1);# ROTATE(r0,16)
-		&paddb	("mm0","mm0");		&paddb	("mm4","mm4");	# byte-wise doubling
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# = r2
-		&pshufw	("mm3","mm2",0xb1);	&pshufw	("mm7","mm6",0xb1);# r0
-		&pxor	("mm1","mm0");		&pxor	("mm5","mm4");	# r0^r2
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(r0,16)
-
-		&movq	("mm2","mm3");		&movq	("mm6","mm7");
-		&pslld	("mm3",8);		&pslld	("mm7",8);
-		&psrld	("mm2",24);		&psrld	("mm6",24);
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= r0<<8
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= r0>>24
-
-		&movq	("mm3","mm1");		&movq	("mm7","mm5");
-		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));	# round key for this round
-		&psrld	("mm1",8);		&psrld	("mm5",8);
-		&mov	($s0,&DWP(0-128,$tbl));	# NOTE(review): this and the three $s1-$s3 loads below look like table re-touches whose results are unused - confirm
-		&pslld	("mm3",24);		&pslld	("mm7",24);
-		&mov	($s1,&DWP(64-128,$tbl));
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= (r2^r0)<<8
-		&mov	($s2,&DWP(128-128,$tbl));
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= (r2^r0)>>24
-		&mov	($s3,&DWP(192-128,$tbl));
-
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# xor with the round key loaded above
-	&jmp	(&label("loop"));
-
-	&set_label("out",16);
-	&pxor	("mm0",&QWP(0,$key));		# xor with the last round key ($key was already advanced)
-	&pxor	("mm4",&QWP(8,$key));
-
-	&ret	();
-&function_end_B("_sse_AES_encrypt_compact");
-					}
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub encstep()	# emits code for output word $i (0..3) of one table-driven round
-{ my ($i,$te,@s) = @_;	# $te: table base register, @s: the four state registers in the order given by the caller
-  my $tmp = $key;	# the $key register doubles as scratch, except where $tmp is rebound for $i==3
-  my $out = $i==3?$s[0]:$acc;	# word 3 is accumulated in place in $s[0], the others in $acc
-
-	# lines marked with #%e?x[i] denote "reordered" instructions...
-	if ($i==3)  {	&mov	($key,$__key);			}##%edx
-	else        {	&mov	($out,$s[0]);
-			&and	($out,0xFF);			}
-	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
-	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
-			&mov	($out,&DWP(0,$te,$out,8));	# 8-byte table stride, read at byte offset 0
-
-	if ($i==3)  {	$tmp=$s[1];				}##%eax
-			&movz	($tmp,&HB($s[1]));
-			&xor	($out,&DWP(3,$te,$tmp,8));	# same table read at byte offset 3; presumably yields a rotated entry - confirm against _data_word
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
-	else        {	&mov	($tmp,$s[2]);
-			&shr	($tmp,16);			}
-	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
-			&and	($tmp,0xFF);
-			&xor	($out,&DWP(2,$te,$tmp,8));	# byte offset 2
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
-	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]);
-			&shr	($tmp,24)			}
-			&xor	($out,&DWP(1,$te,$tmp,8));	# byte offset 1
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}	# park words 0 and 1 at 4(esp) and 8(esp)
-	if ($i==3)  {	&mov	($s[3],$acc);			}	# presumably $acc still holds word 2 from the preceding encstep(2,...) - confirm
-			&comment();
-}
-
-sub enclast()	# emits code for output word $i (0..3) of the final round: one masked byte per table read
-{ my ($i,$te,@s)=@_;	# $te: table base register, @s: the four state registers in the order given by the caller
-  my $tmp = $key;	# the $key register doubles as scratch, except where $tmp is rebound for $i==3
-  my $out = $i==3?$s[0]:$acc;	# word 3 is accumulated in place in $s[0], the others in $acc
-
-	if ($i==3)  {	&mov	($key,$__key);			}##%edx
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
-	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
-			&mov	($out,&DWP(2,$te,$out,8));	# 8-byte table stride, read at byte offset 2
-			&and	($out,0x000000ff);		# keep only bits 0-7
-
-	if ($i==3)  {	$tmp=$s[1];				}##%eax
-			&movz	($tmp,&HB($s[1]));
-			&mov	($tmp,&DWP(0,$te,$tmp,8));	# read at byte offset 0
-			&and	($tmp,0x0000ff00);		# keep only bits 8-15
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$__s0);		}##%ebx
-	else        {	&mov	($tmp,$s[2]);
-			&shr	($tmp,16);			}
-	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
-			&and	($tmp,0xFF);
-			&mov	($tmp,&DWP(0,$te,$tmp,8));	# read at byte offset 0
-			&and	($tmp,0x00ff0000);		# keep only bits 16-23
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
-	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]);
-			&shr	($tmp,24);			}
-			&mov	($tmp,&DWP(2,$te,$tmp,8));	# read at byte offset 2
-			&and	($tmp,0xff000000);		# keep only bits 24-31
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}	# park words 0 and 1 at 4(esp) and 8(esp)
-	if ($i==3)  {	&mov	($s[3],$acc);			}	# presumably $acc still holds word 2 from the preceding enclast(2,...) - confirm
-}
-
-&function_begin_B("_x86_AES_encrypt");	# table-driven ("vanilla") encrypt of the block held in $s0-$s3
-	if ($vertical_spin) {
-		# I need high parts of volatile registers to be accessible...
-		&exch	($s1="edi",$key="ebx");	# the assignments also rebind the Perl-side register names
-		&mov	($s2="esi",$acc="ecx");
-	}
-
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-
-	if ($small_footprint) {			# looped variant
-	    &lea	($acc,&DWP(-2,$acc,$acc));	# acc = 2*rounds-2
-	    &lea	($acc,&DWP(0,$key,$acc,8));	# acc = key+8*acc = key+16*(rounds-1)
-	    &mov	($__end,$acc);		# end of key schedule
-
-	    &set_label("loop",16);
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-	    &cmp	($key,$__end);		# keep looping while rd_key is below end of key schedule
-	    &mov	($__key,$key);		# store advanced rd_key
-	    &jb		(&label("loop"));
-	}
-	else {					# unrolled variant, dispatched on key->rounds
-	    &cmp	($acc,10);
-	    &jle	(&label("10rounds"));
-	    &cmp	($acc,12);
-	    &jle	(&label("12rounds"));
-
-	&set_label("14rounds",4);		# two rounds, then falls through to "12rounds"
-	    for ($i=1;$i<3;$i++) {
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);		# step over the two round keys just consumed
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("12rounds",4);		# two rounds, then falls through to "10rounds"
-	    for ($i=1;$i<3;$i++) {
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);		# step over the two round keys just consumed
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("10rounds",4);
-	    for ($i=1;$i<10;$i++) {		# nine rounds, round keys at offsets 16..144
-		if ($vertical_spin) {
-		    &encvert($tbl,$s0,$s1,$s2,$s3);
-		} else {
-		    &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-		    &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-		    &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-		    &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-		}
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	}
-
-	if ($vertical_spin) {
-	    # "reincarnate" some registers for "horizontal" spin...
-	    &mov	($s1="ebx",$key="edi");	# the assignments also rebind the Perl-side register names
-	    &mov	($s2="ecx",$acc="esi");
-	}
-	&enclast(0,$tbl,$s0,$s1,$s2,$s3);	# final round
-	&enclast(1,$tbl,$s1,$s2,$s3,$s0);
-	&enclast(2,$tbl,$s2,$s3,$s0,$s1);
-	&enclast(3,$tbl,$s3,$s0,$s1,$s2);
-
-	&add	($key,$small_footprint?16:160);	# looped: next round key; unrolled: 160 follows the 16..144 offsets used in "10rounds"
-	&xor	($s0,&DWP(0,$key));		# xor with the last round key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&ret	();
-
-&set_label("AES_Te",64);	# Yes! I keep it in the code segment!
-	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
-	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
-	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
-	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
-	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
-	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
-	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
-	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
-	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
-	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
-	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
-	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
-	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
-	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
-	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
-	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
-	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
-	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
-	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
-	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
-	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
-	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
-	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
-	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
-	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
-	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
-	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
-	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
-	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
-	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
-	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
-	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
-	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
-	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
-	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
-	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
-	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
-	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
-	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
-	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
-	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
-	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
-	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
-	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
-	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
-	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
-	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
-	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
-	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
-	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
-	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
-	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
-	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
-	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
-	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
-	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
-	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
-	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
-	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
-	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
-	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
-	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
-	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
-	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
-
-#Te4	# four copies of Te4 to choose from to avoid L1 aliasing
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-#rcon:
-	&data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-	&data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
-	&data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
-	&data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
-&function_end_B("_x86_AES_encrypt");
-
-# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
-&function_begin("AES_encrypt");
-	&mov	($acc,&wparam(0));		# load inp
-	&mov	($key,&wparam(2));		# load key
-
-	&mov	($s0,"esp");			# remember the incoming stack pointer
-	&sub	("esp",36);
-	&and	("esp",-64);			# align to cache-line
-
-	# place stack frame just "above" the key schedule
-	&lea	($s1,&DWP(-64-63,$key));
-	&sub	($s1,"esp");
-	&neg	($s1);
-	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
-	&sub	("esp",$s1);
-	&add	("esp",4);	# 4 is reserved for caller's return address
-	&mov	($_esp,$s0);			# save stack pointer
-
-	&call   (&label("pic_point"));          # make it PIC!
-	&set_label("pic_point");
-	&blindpop($tbl);			# $tbl = address of pic_point (return address pushed by the call)
-	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);	# presumably leaves the address of OPENSSL_ia32cap_P in $s0 - confirm
-	&lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));	# $tbl = address of AES_Te
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	&lea	($s1,&DWP(768-4,"esp"));
-	&sub	($s1,$tbl);
-	&and	($s1,0x300);			# 0x000/0x100/0x200/0x300: one of the four 256-byte Te4 copies
-	&lea	($tbl,&DWP(2048+128,$tbl,$s1));	# step over the word table (presumably 2048 bytes - confirm against _data_word); +128 matches the -128 lookup displacements
-
-					if (!$x86only) {
-	&bt	(&DWP(0,$s0),25);	# check for SSE bit
-	&jnc	(&label("x86"));		# bit clear: take the integer-only path
-
-	&movq	("mm0",&QWP(0,$acc));		# load input data
-	&movq	("mm4",&QWP(8,$acc));
-	&call	("_sse_AES_encrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&movq	(&QWP(0,$acc),"mm0");		# write output data
-	&movq	(&QWP(8,$acc),"mm4");
-	&emms	();
-	&function_end_A();			# presumably emits the epilogue and return here so this path does not fall into "x86" - confirm
-					}
-	&set_label("x86",16);
-	&mov	($_tbl,$tbl);			# save table pointer in the frame
-	&mov	($s0,&DWP(0,$acc));		# load input data
-	&mov	($s1,&DWP(4,$acc));
-	&mov	($s2,&DWP(8,$acc));
-	&mov	($s3,&DWP(12,$acc));
-	&call	("_x86_AES_encrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&mov	(&DWP(0,$acc),$s0);		# write output data
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-&function_end("AES_encrypt");
-
-#--------------------------------------------------------------------#
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub deccompact()	# emits code for output word $i (0..3) of one "compact" decrypt round
-{ my $Fn = \&mov;	# emitter used for the optional stack reloads below
-  while ($#_>5) { pop(@_); $Fn=sub{}; }	# more than six arguments: drop the extras and turn $Fn into a no-op
-  my ($i,$td,@s)=@_;	# $td: table base register, @s: the four state registers in the order given by the caller
-  my $tmp = $key;	# the $key register doubles as scratch, except where $tmp is rebound for $i==3
-  my $out = $i==3?$s[0]:$acc;	# word 3 is accumulated in place in $s[0], the others in $acc
-
-	# $Fn is used in first compact round and its purpose is to
-	# suppress the reloading of some values from stack, so that after
-	# 4xdeccompact with extra argument $key, $s0 and $s1 values
-	# are left there...
-	if($i==3)   {	&$Fn	($key,$__key);			}
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-			&movz	($out,&BP(-128,$td,$out,1));	# byte lookup, becomes bits 0-7
-
-	if ($i==3)  {	$tmp=$s[1];				}
-			&movz	($tmp,&HB($s[1]));
-			&movz	($tmp,&BP(-128,$td,$tmp,1));	# byte lookup
-			&shl	($tmp,8);			# becomes bits 8-15
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
-	else        {	&mov	($tmp,$s[2]);			}	# '&' sigil added: same call, now consistent with every other emitter call
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);
-			&movz	($tmp,&BP(-128,$td,$tmp,1));	# byte lookup
-			&shl	($tmp,16);			# becomes bits 16-23
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &$Fn ($s[2],$__s1);		}
-	else        {	&mov	($tmp,$s[3]);			}
-			&shr	($tmp,24);
-			&movz	($tmp,&BP(-128,$td,$tmp,1));	# byte lookup
-			&shl	($tmp,24);			# becomes bits 24-31
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}	# park words 0 and 1 at 4(esp) and 8(esp)
-	if ($i==3)  {	&$Fn	($s[3],$__s0);			}
-}
-
-# must be called with 2,3,0,1 as argument sequence!!!
-sub dectransform()	# emits the per-word transform applied after deccompact to state word $i
-{ my @s = ($s0,$s1,$s2,$s3);
-  my $i = shift;
-  my $tmp = $key;	# the $key register is used as scratch here
-  my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);	# scratch registers borrowed from other state words;
-  my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);	# element access ($s[..]) replaces the one-element slices (@s[..])
-  my $tp8 = $tbl;	# the table pointer register is borrowed too (callers reload it from $__tbl)
-
-	&mov	($tmp,0x80808080);
-	&and	($tmp,$s[$i]);		# top bit of every byte of tp1
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&lea	($tp2,&DWP(0,$s[$i],$s[$i]));	# tp1+tp1
-	&sub	($acc,$tmp);		# 0x7f in every byte whose top bit was set
-	&and	($tp2,0xfefefefe);	# drop bits carried in from the neighbouring byte
-	&and	($acc,0x1b1b1b1b);	# 0x1b in every byte whose top bit was set
-	&xor	($tp2,$acc);		# tp2 = byte-wise doubling of tp1 with 0x1b reduction
-	&mov	($tmp,0x80808080);
-
-	&and	($tmp,$tp2);		# same doubling step, tp2 -> tp4
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&lea	($tp4,&DWP(0,$tp2,$tp2));
-	&sub	($acc,$tmp);
-	&and	($tp4,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &xor	($tp2,$s[$i]);	# tp2^tp1
-	&xor	($tp4,$acc);
-	&mov	($tmp,0x80808080);
-
-	&and	($tmp,$tp4);		# same doubling step, tp4 -> tp8
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&lea	($tp8,&DWP(0,$tp4,$tp4));
-	&sub	($acc,$tmp);
-	&and	($tp8,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &xor	($tp4,$s[$i]);	# tp4^tp1
-	 &rotl	($s[$i],8);	# = ROTATE(tp1,8)
-	&xor	($tp8,$acc);
-
-	&xor	($s[$i],$tp2);
-	&xor	($tp2,$tp8);
-	&xor	($s[$i],$tp4);
-	&xor	($tp4,$tp8);
-	&rotl	($tp2,24);
-	&xor	($s[$i],$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
-	&rotl	($tp4,16);
-	&xor	($s[$i],$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
-	&rotl	($tp8,8);
-	&xor	($s[$i],$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
-	 &mov	($s[0],$__s0)			if($i==2); #prefetch $s0
-	 &mov	($s[1],$__s1)			if($i==3); #prefetch $s1
-	 &mov	($s[2],$__s2)			if($i==1);	# reload $s2 from its stack slot
-	&xor	($s[$i],$tp8);	# ^= ROTATE(tp8,8)
-
-	&mov	($s[3],$__s3)			if($i==1);	# reload $s3 from its stack slot
-	&mov	(&DWP(4+4*$i,"esp"),$s[$i])	if($i>=2);	# park words 2 and 3 at 12(esp) and 16(esp)
-}
-
-&function_begin_B("_x86_AES_decrypt_compact");	# integer-only "compact" decrypt of the block held in $s0-$s3
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-
-	&lea	($acc,&DWP(-2,$acc,$acc));	# acc = 2*rounds-2
-	&lea	($acc,&DWP(0,$key,$acc,8));	# acc = key+8*acc = key+16*(rounds-1)
-	&mov	($__end,$acc);			# end of key schedule
-
-	# prefetch Td4
-	&mov	($key,&DWP(0-128,$tbl));	# the loaded values are not used: $key and $acc
-	&mov	($acc,&DWP(32-128,$tbl));	# are overwritten by each following load, only
-	&mov	($key,&DWP(64-128,$tbl));	# the memory touch at 32-byte strides matters
-	&mov	($acc,&DWP(96-128,$tbl));
-	&mov	($key,&DWP(128-128,$tbl));
-	&mov	($acc,&DWP(160-128,$tbl));
-	&mov	($key,&DWP(192-128,$tbl));
-	&mov	($acc,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-
-		&deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);	# extra trailing argument turns deccompact's $Fn reloads into no-ops
-		&deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
-		&deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
-		&deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
-		&dectransform(2);		# 2,3,0,1 order is required by dectransform
-		&dectransform(3);
-		&dectransform(0);
-		&dectransform(1);
-		&mov 	($key,$__key);		# reload rd_key
-		&mov	($tbl,$__tbl);		# reload table pointer (dectransform uses $tbl as scratch)
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));	# xor with the round key
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-	&cmp	($key,$__end);			# keep looping while rd_key is below end of key schedule
-	&mov	($__key,$key);			# store advanced rd_key
-	&jb	(&label("loop"));
-
-	&deccompact(0,$tbl,$s0,$s3,$s2,$s1);	# last round: no dectransform afterwards
-	&deccompact(1,$tbl,$s1,$s0,$s3,$s2);
-	&deccompact(2,$tbl,$s2,$s1,$s0,$s3);
-	&deccompact(3,$tbl,$s3,$s2,$s1,$s0);
-
-	&xor	($s0,&DWP(16,$key));		# xor with the round key 16 bytes past rd_key
-	&xor	($s1,&DWP(20,$key));
-	&xor	($s2,&DWP(24,$key));
-	&xor	($s3,&DWP(28,$key));
-
-	&ret	();
-&function_end_B("_x86_AES_decrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-
-sub sse_deccompact()	# emits one pass of byte-wise lookups at $tbl-128: input bytes come from mm0/mm4, results are regathered into mm0 (t[0],t[1]) and mm4 (t[2],t[3])
-{
-	&pshufw	("mm1","mm0",0x0c);		#  7, 6, 1, 0
-	&pshufw	("mm5","mm4",0x09);		# 13,12,11,10
-	&movd	("eax","mm1");			#  7, 6, 1, 0
-	&movd	("ebx","mm5");			# 13,12,11,10
-	&mov	($__key,$key);			# park key pointer: $key serves as a scratch index below
-
-	&movz	($acc,&LB("eax"));		#  0
-	&movz	("edx",&HB("eax"));		#  1
-	&pshufw	("mm2","mm0",0x06);		#  3, 2, 5, 4
-	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
-	&movz	($key,&LB("ebx"));		# 10
-	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
-	&shr	("eax",16);			#  7, 6
-	&shl	("edx",8);			#  1
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
-	&movz	($key,&HB("ebx"));		# 11
-	&shl	($acc,16);			# 10
-	&pshufw	("mm6","mm4",0x03);		# 9, 8,15,14
-	&or	("ecx",$acc);			# 10
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
-	&movz	($key,&HB("eax"));		#  7
-	&shl	($acc,24);			# 11
-	&shr	("ebx",16);			# 13,12
-	&or	("edx",$acc);			# 11
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
-	&movz	($key,&HB("ebx"));		# 13
-	&shl	($acc,24);			#  7
-	&or	("ecx",$acc);			#  7
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
-	&movz	($key,&LB("eax"));		#  6
-	&shl	($acc,8);			# 13
-	&movd	("eax","mm2");			#  3, 2, 5, 4
-	&or	("ecx",$acc);			# 13
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  6
-	&movz	($key,&LB("ebx"));		# 12
-	&shl	($acc,16);			#  6
-	&movd	("ebx","mm6");			#  9, 8,15,14
-	&movd	("mm0","ecx");			# t[0] collected
-	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 12
-	&movz	($key,&LB("eax"));		#  4
-	&or	("ecx",$acc);			# 12
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
-	&movz	($key,&LB("ebx"));		# 14
-	&or	("edx",$acc);			#  4
-	&movz	($acc,&BP(-128,$tbl,$key,1));	# 14
-	&movz	($key,&HB("eax"));		#  5
-	&shl	($acc,16);			# 14
-	&shr	("eax",16);			#  3, 2
-	&or	("edx",$acc);			# 14
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
-	&movz	($key,&HB("ebx"));		# 15
-	&shr	("ebx",16);			#  9, 8
-	&shl	($acc,8);			#  5
-	&movd	("mm1","edx");			# t[1] collected
-	&movz	("edx",&BP(-128,$tbl,$key,1));	# 15
-	&movz	($key,&HB("ebx"));		#  9
-	&shl	("edx",24);			# 15
-	&and	("ebx",0xff);			#  8
-	&or	("edx",$acc);			# 15
-
-	&punpckldq	("mm0","mm1");		# t[0,1] collected
-
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
-	&movz	($key,&LB("eax"));		#  2
-	&shl	($acc,8);			#  9
-	&movz	("eax",&HB("eax"));		#  3
-	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	#  8
-	&or	("ecx",$acc);			#  9
-	&movz	($acc,&BP(-128,$tbl,$key,1));	#  2
-	&or	("edx","ebx");			#  8
-	&shl	($acc,16);			#  2
-	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  3
-	&or	("edx",$acc);			#  2
-	&shl	("eax",24);			#  3
-	&or	("ecx","eax");			#  3
-	&mov	($key,$__key);			# restore the key pointer parked at the top
-	&movd	("mm4","edx");			# t[2] collected
-	&movd	("mm5","ecx");			# t[3] collected
-
-	&punpckldq	("mm4","mm5");		# t[2,3] collected
-}
-
-					if (!$x86only) {	# this function is emitted only when $x86only is false
-&function_begin_B("_sse_AES_decrypt_compact");	# decrypts the block held in mm0 (bytes 0-7) and mm4 (bytes 8-15)
-	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
-	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8
-
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));	# acc = 2*rounds-2
-	&lea	($acc,&DWP(0,$key,$acc,8));	# acc = key+16*(rounds-1)
-	&mov	($__end,$acc);			# end of key schedule
-
-	&mov	($s0,0x1b1b1b1b);		# magic constant: 0x1b in every byte, xor-ed in below whenever doubling a byte overflows
-	&mov	(&DWP(8,"esp"),$s0);		# stored twice so that 8(%esp) holds
-	&mov	(&DWP(12,"esp"),$s0);		# a full 64-bit mask for the movq loads below
-
-	# prefetch Td4
-	&mov	($s0,&DWP(0-128,$tbl));
-	&mov	($s1,&DWP(32-128,$tbl));
-	&mov	($s2,&DWP(64-128,$tbl));
-	&mov	($s3,&DWP(96-128,$tbl));
-	&mov	($s0,&DWP(128-128,$tbl));
-	&mov	($s1,&DWP(160-128,$tbl));
-	&mov	($s2,&DWP(192-128,$tbl));
-	&mov	($s3,&DWP(224-128,$tbl));
-
-	&set_label("loop",16);
-		&sse_deccompact();
-		&add	($key,16);		# advance to the next round key
-		&cmp	($key,$__end);
-		&ja	(&label("out"));	# past the end of the schedule: last round skips the transform below
-
-		# ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
-		&movq	("mm3","mm0");		&movq	("mm7","mm4");
-		&movq	("mm2","mm0",1);	&movq	("mm6","mm4",1);
-		&movq	("mm1","mm0");		&movq	("mm5","mm4");
-		&pshufw	("mm0","mm0",0xb1);	&pshufw	("mm4","mm4",0xb1);# = ROTATE(tp0,16)
-		&pslld	("mm2",8);		&pslld	("mm6",8);
-		&psrld	("mm3",8);		&psrld	("mm7",8);
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp0<<8
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp0>>8
-		&pslld	("mm2",16);		&pslld	("mm6",16);
-		&psrld	("mm3",16);		&psrld	("mm7",16);
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp0<<24
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp0>>24
-
-		&movq	("mm3",&QWP(8,"esp"));	# load the 0x1b mask
-		&pxor	("mm2","mm2");		&pxor	("mm6","mm6");	# zero
-		&pcmpgtb("mm2","mm1");		&pcmpgtb("mm6","mm5");	# 0xff in every byte whose top bit is set
-		&pand	("mm2","mm3");		&pand	("mm6","mm3");	# 0x1b for exactly those bytes
-		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");	# double every byte
-		&pxor	("mm1","mm2");		&pxor	("mm5","mm6");	# tp2
-		&movq	("mm3","mm1");		&movq	("mm7","mm5");
-		&movq	("mm2","mm1");		&movq	("mm6","mm5");
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp2
-		&pslld	("mm3",24);		&pslld	("mm7",24);
-		&psrld	("mm2",8);		&psrld	("mm6",8);
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp2<<24
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= tp2>>8
-
-		&movq	("mm2",&QWP(8,"esp"));	# load the 0x1b mask
-		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
-		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
-		&pand	("mm3","mm2");		&pand	("mm7","mm2");
-		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
-		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp4
-		&pshufw	("mm3","mm1",0xb1);	&pshufw	("mm7","mm5",0xb1);
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp4
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= ROTATE(tp4,16)
-
-		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
-		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
-		&pand	("mm3","mm2");		&pand	("mm7","mm2");
-		&paddb	("mm1","mm1");		&paddb	("mm5","mm5");
-		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp8
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8
-		&movq	("mm3","mm1");		&movq	("mm7","mm5");
-		&pshufw	("mm2","mm1",0xb1);	&pshufw	("mm6","mm5",0xb1);
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(tp8,16)
-		&pslld	("mm1",8);		&pslld	("mm5",8);
-		&psrld	("mm3",8);		&psrld	("mm7",8);
-		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));	# fetch this round's key early
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8<<8
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp8>>8
-		&mov	($s0,&DWP(0-128,$tbl));	# NOTE(review): presumably re-touches Td4 so it stays cached - confirm
-		&pslld	("mm1",16);		&pslld	("mm5",16);
-		&mov	($s1,&DWP(64-128,$tbl));
-		&psrld	("mm3",16);		&psrld	("mm7",16);
-		&mov	($s2,&DWP(128-128,$tbl));
-		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp8<<24
-		&mov	($s3,&DWP(192-128,$tbl));
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= tp8>>24
-
-		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# xor with the round key fetched above
-	&jmp	(&label("loop"));
-
-	&set_label("out",16);
-	&pxor	("mm0",&QWP(0,$key));		# xor with the last round key
-	&pxor	("mm4",&QWP(8,$key));
-
-	&ret	();
-&function_end_B("_sse_AES_decrypt_compact");
-					}
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub decstep()	# emits the code that builds output word $i of one table-driven round
-{ my ($i,$td,@s) = @_;	# $i: word index, $td: table base register, @s: the four source state registers in lookup order
-  my $tmp = $key;	# $key doubles as the scratch register while $i<3
-  my $out = $i==3?$s[0]:$acc;	# the last word is built in place in $s[0], the others in $acc
-
-	# no instructions are reordered, as performance appears
-	# optimal... or rather that all attempts to reorder didn't
-	# result in better performance [which by the way is not a
-	# bit lower than encryption].
-	if($i==3)   {	&mov	($key,$__key);			}	# scratch use is over: reload the key pointer
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-			&mov	($out,&DWP(0,$td,$out,8));	# low byte of $s[0] indexes 8-byte entries
-
-	if ($i==3)  {	$tmp=$s[1];				}	# from here on the state registers serve as their own scratch
-			&movz	($tmp,&HB($s[1]));
-			&xor	($out,&DWP(3,$td,$tmp,8));	# second byte of $s[1], entry read at byte offset 3
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}	# $acc still holds the word built for $i==2
-	else        {	&mov	($tmp,$s[2]);			}
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);
-			&xor	($out,&DWP(2,$td,$tmp,8));	# third byte of $s[2], entry read at byte offset 2
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}	# presumably the word parked for $i==1 - confirm $__s1 slot
-	else        {	&mov	($tmp,$s[3]);			}
-			&shr	($tmp,24);
-			&xor	($out,&DWP(1,$td,$tmp,8));	# top byte of $s[3], entry read at byte offset 1
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}	# park words 0 and 1 at 4(%esp) and 8(%esp)
-	if ($i==3)  {	&mov	($s[3],$__s0);			}	# presumably the word parked for $i==0 - confirm $__s0 slot
-			&comment();
-}
-
-sub declast()	# emits the code that builds output word $i of the last round, using single-byte table lookups
-{ my ($i,$td,@s)=@_;	# $i: word index, $td: table base register, @s: the four source state registers in lookup order
-  my $tmp = $key;	# $key doubles as the scratch register while $i<3
-  my $out = $i==3?$s[0]:$acc;	# the last word is built in place in $s[0], the others in $acc
-
-	if($i==0)   {	&lea	($td,&DWP(2048+128,$td));	# move $td 2048+128 bytes forward for the touches below
-			&mov	($tmp,&DWP(0-128,$td));		# eight loads, 32 bytes apart, spanning 256 bytes
-			&mov	($acc,&DWP(32-128,$td));
-			&mov	($tmp,&DWP(64-128,$td));
-			&mov	($acc,&DWP(96-128,$td));
-			&mov	($tmp,&DWP(128-128,$td));
-			&mov	($acc,&DWP(160-128,$td));
-			&mov	($tmp,&DWP(192-128,$td));
-			&mov	($acc,&DWP(224-128,$td));
-			&lea	($td,&DWP(-128,$td));		}	# drop the 128 bias: $td is now 2048 past its entry value
-	if($i==3)   {	&mov	($key,$__key);			}	# scratch use is over: reload the key pointer
-	else        {	&mov	($out,$s[0]);			}
-			&and	($out,0xFF);
-			&movz	($out,&BP(0,$td,$out,1));	# low byte of $s[0] -> bits 0-7
-
-	if ($i==3)  {	$tmp=$s[1];				}
-			&movz	($tmp,&HB($s[1]));
-			&movz	($tmp,&BP(0,$td,$tmp,1));
-			&shl	($tmp,8);			# second byte of $s[1] -> bits 8-15
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}	# $acc still holds the word built for $i==2
-	else        {	mov	($tmp,$s[2]);			}	# NOTE(review): no '&' sigil here, unlike every other emitter call; presumably still resolved the same way - confirm
-			&shr	($tmp,16);
-			&and	($tmp,0xFF);
-			&movz	($tmp,&BP(0,$td,$tmp,1));
-			&shl	($tmp,16);			# third byte of $s[2] -> bits 16-23
-			&xor	($out,$tmp);
-
-	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}	# presumably the word parked for $i==1 - confirm $__s1 slot
-	else        {	&mov	($tmp,$s[3]);			}
-			&shr	($tmp,24);
-			&movz	($tmp,&BP(0,$td,$tmp,1));
-			&shl	($tmp,24);			# top byte of $s[3] -> bits 24-31
-			&xor	($out,$tmp);
-	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}	# park words 0 and 1 at 4(%esp) and 8(%esp)
-	if ($i==3)  {	&mov	($s[3],$__s0);				# presumably the word parked for $i==0 - confirm $__s0 slot
-			&lea	($td,&DWP(-2048,$td));		}	# undo the net +2048 applied when $i==0
-}
-
-&function_begin_B("_x86_AES_decrypt");	# table-driven decryption of the block held in $s0..$s3; looped or fully unrolled depending on $small_footprint
-	# note that caller is expected to allocate stack frame for me!
-	&mov	($__key,$key);			# save key
-
-	&xor	($s0,&DWP(0,$key));		# xor with key
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&mov	($acc,&DWP(240,$key));		# load key->rounds
-
-	if ($small_footprint) {
-	    &lea	($acc,&DWP(-2,$acc,$acc));	# acc = 2*rounds-2
-	    &lea	($acc,&DWP(0,$key,$acc,8));	# acc = key+16*(rounds-1)
-	    &mov	($__end,$acc);		# end of key schedule
-	    &set_label("loop",16);
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&add	($key,16);		# advance rd_key
-		&xor	($s0,&DWP(0,$key));	# xor with this round's key
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-	    &cmp	($key,$__end);		# reached the end of the key schedule?
-	    &mov	($__key,$key);		# save advanced key; mov does not disturb the flags set by cmp
-	    &jb		(&label("loop"));	# loop while key < end (unsigned compare)
-	}
-	else {
-	    &cmp	($acc,10);
-	    &jle	(&label("10rounds"));	# rounds <= 10: skip both extra pairs of rounds
-	    &cmp	($acc,12);
-	    &jle	(&label("12rounds"));	# rounds <= 12: skip the first extra pair
-
-	&set_label("14rounds",4);
-	    for ($i=1;$i<3;$i++) {		# two unrolled rounds, round keys at key+16 and key+32
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("12rounds",4);
-	    for ($i=1;$i<3;$i++) {		# two unrolled rounds, round keys at key+16 and key+32
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	    &add	($key,32);
-	    &mov	($__key,$key);		# advance rd_key
-	&set_label("10rounds",4);
-	    for ($i=1;$i<10;$i++) {		# nine unrolled rounds, round keys at key+16 .. key+144
-		&decstep(0,$tbl,$s0,$s3,$s2,$s1);
-		&decstep(1,$tbl,$s1,$s0,$s3,$s2);
-		&decstep(2,$tbl,$s2,$s1,$s0,$s3);
-		&decstep(3,$tbl,$s3,$s2,$s1,$s0);
-		&xor	($s0,&DWP(16*$i+0,$key));
-		&xor	($s1,&DWP(16*$i+4,$key));
-		&xor	($s2,&DWP(16*$i+8,$key));
-		&xor	($s3,&DWP(16*$i+12,$key));
-	    }
-	}
-
-	&declast(0,$tbl,$s0,$s3,$s2,$s1);
-	&declast(1,$tbl,$s1,$s0,$s3,$s2);
-	&declast(2,$tbl,$s2,$s1,$s0,$s3);
-	&declast(3,$tbl,$s3,$s2,$s1,$s0);
-
-	&add	($key,$small_footprint?16:160);	# last round key: the next slot when the loop advanced $key, else 160 bytes past the unrolled base
-	&xor	($s0,&DWP(0,$key));
-	&xor	($s1,&DWP(4,$key));
-	&xor	($s2,&DWP(8,$key));
-	&xor	($s3,&DWP(12,$key));
-
-	&ret	();
-
-&set_label("AES_Td",64);	# Yes! I keep it in the code segment!
-	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
-	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
-	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
-	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
-	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
-	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
-	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
-	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
-	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
-	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
-	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
-	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
-	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
-	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
-	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
-	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
-	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
-	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
-	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
-	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
-	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
-	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
-	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
-	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
-	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
-	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
-	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
-	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
-	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
-	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
-	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
-	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
-	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
-	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
-	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
-	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
-	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
-	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
-	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
-	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
-	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
-	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
-	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
-	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
-	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
-	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
-	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
-	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
-	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
-	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
-	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
-	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
-	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
-	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
-	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
-	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
-	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
-	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
-	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
-	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
-	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
-	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
-	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
-	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
-
-#Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-&function_end_B("_x86_AES_decrypt");
-
-# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
-&function_begin("AES_decrypt");	# public entry: reads 16 bytes at inp, decrypts them via one of the *_compact routines, writes 16 bytes at out
-	&mov	($acc,&wparam(0));		# load inp
-	&mov	($key,&wparam(2));		# load key
-
-	&mov	($s0,"esp");			# remember the incoming stack pointer
-	&sub	("esp",36);
-	&and	("esp",-64);			# align to cache-line
-
-	# place stack frame just "above" the key schedule
-	&lea	($s1,&DWP(-64-63,$key));
-	&sub	($s1,"esp");
-	&neg	($s1);
-	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
-	&sub	("esp",$s1);
-	&add	("esp",4);	# 4 is reserved for caller's return address
-	&mov	($_esp,$s0);	# save stack pointer
-
-	&call   (&label("pic_point"));          # make it PIC!
-	&set_label("pic_point");
-	&blindpop($tbl);			# presumably pops the return address pushed by the call, i.e. pic_point - confirm
-	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
-	&lea    ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));	# $tbl = address of AES_Td
-
-	# pick Td4 copy which can't "overlap" with stack frame or key schedule
-	&lea	($s1,&DWP(768-4,"esp"));
-	&sub	($s1,$tbl);
-	&and	($s1,0x300);			# 0, 256, 512 or 768
-	&lea	($tbl,&DWP(2048+128,$tbl,$s1));	# $tbl = AES_Td+2048+128 plus the offset chosen above
-
-					if (!$x86only) {
-	&bt	(&DWP(0,$s0),25);	# check for SSE bit
-	&jnc	(&label("x86"));	# bit clear: take the integer path
-
-	&movq	("mm0",&QWP(0,$acc));		# load input data
-	&movq	("mm4",&QWP(8,$acc));
-	&call	("_sse_AES_decrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&movq	(&QWP(0,$acc),"mm0");		# write output data
-	&movq	(&QWP(8,$acc),"mm4");
-	&emms	();
-	&function_end_A();			# presumably emits the epilogue and return for this path - confirm
-					}
-	&set_label("x86",16);
-	&mov	($_tbl,$tbl);			# store $tbl for the callee (presumably the slot it reads back as $__tbl - confirm)
-	&mov	($s0,&DWP(0,$acc));		# load input data
-	&mov	($s1,&DWP(4,$acc));
-	&mov	($s2,&DWP(8,$acc));
-	&mov	($s3,&DWP(12,$acc));
-	&call	("_x86_AES_decrypt_compact");
-	&mov	("esp",$_esp);			# restore stack pointer
-	&mov	($acc,&wparam(1));		# load out
-	&mov	(&DWP(0,$acc),$s0);		# write output data
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-&function_end("AES_decrypt");
-
-# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
-#			size_t length, const AES_KEY *key,
-#			unsigned char *ivp,const int enc);
-{
-# stack frame layout
-#             -4(%esp)		# return address	 0(%esp)
-#              0(%esp)		# s0 backing store	 4(%esp)
-#              4(%esp)		# s1 backing store	 8(%esp)
-#              8(%esp)		# s2 backing store	12(%esp)
-#             12(%esp)		# s3 backing store	16(%esp)
-#             16(%esp)		# key backup		20(%esp)
-#             20(%esp)		# end of key schedule	24(%esp)
-#             24(%esp)		# %ebp backup		28(%esp)
-#             28(%esp)		# %esp backup
-my $_inp=&DWP(32,"esp");	# copy of wparam(0)
-my $_out=&DWP(36,"esp");	# copy of wparam(1)
-my $_len=&DWP(40,"esp");	# copy of wparam(2)
-my $_key=&DWP(44,"esp");	# copy of wparam(3)
-my $_ivp=&DWP(48,"esp");	# copy of wparam(4)
-my $_tmp=&DWP(52,"esp");	# volatile variable
-#
-my $ivec=&DWP(60,"esp");	# ivec[16]
-my $aes_key=&DWP(76,"esp");	# copy of aes_key
-my $mark=&DWP(76+240,"esp");	# copy of aes_key->rounds
-
-&function_begin("AES_cbc_encrypt");
-	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
-	&cmp	($s2,0);
-	&je	(&label("drop_out"));
-
-	&call   (&label("pic_point"));		# make it PIC!
-	&set_label("pic_point");
-	&blindpop($tbl);
-	&picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
-
-	&cmp	(&wparam(5),0);
-	&lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
-	&jne	(&label("picked_te"));
-	&lea	($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
-	&set_label("picked_te");
-
-	# one can argue if this is required
-	&pushf	();
-	&cld	();
-
-	&cmp	($s2,$speed_limit);
-	&jb	(&label("slow_way"));
-	&test	($s2,15);
-	&jnz	(&label("slow_way"));
-					if (!$x86only) {
-	&bt	(&DWP(0,$s0),28);	# check for hyper-threading bit
-	&jc	(&label("slow_way"));
-					}
-	# pre-allocate aligned stack frame...
-	&lea	($acc,&DWP(-80-244,"esp"));
-	&and	($acc,-64);
-
-	# ... and make sure it doesn't alias with $tbl modulo 4096
-	&mov	($s0,$tbl);
-	&lea	($s1,&DWP(2048+256,$tbl));
-	&mov	($s3,$acc);
-	&and	($s0,0xfff);		# s = %ebp&0xfff
-	&and	($s1,0xfff);		# e = (%ebp+2048+256)&0xfff
-	&and	($s3,0xfff);		# p = %esp&0xfff
-
-	&cmp	($s3,$s1);		# if (p>=e) %esp =- (p-e);
-	&jb	(&label("tbl_break_out"));
-	&sub	($s3,$s1);
-	&sub	($acc,$s3);
-	&jmp	(&label("tbl_ok"));
-	&set_label("tbl_break_out",4);	# else %esp -= (p-s)&0xfff + framesz;
-	&sub	($s3,$s0);
-	&and	($s3,0xfff);
-	&add	($s3,384);
-	&sub	($acc,$s3);
-	&set_label("tbl_ok",4);
-
-	&lea	($s3,&wparam(0));	# obtain pointer to parameter block
-	&exch	("esp",$acc);		# allocate stack frame
-	&add	("esp",4);		# reserve for return address!
-	&mov	($_tbl,$tbl);		# save %ebp
-	&mov	($_esp,$acc);		# save %esp
-
-	&mov	($s0,&DWP(0,$s3));	# load inp
-	&mov	($s1,&DWP(4,$s3));	# load out
-	#&mov	($s2,&DWP(8,$s3));	# load len
-	&mov	($key,&DWP(12,$s3));	# load key
-	&mov	($acc,&DWP(16,$s3));	# load ivp
-	&mov	($s3,&DWP(20,$s3));	# load enc flag
-
-	&mov	($_inp,$s0);		# save copy of inp
-	&mov	($_out,$s1);		# save copy of out
-	&mov	($_len,$s2);		# save copy of len
-	&mov	($_key,$key);		# save copy of key
-	&mov	($_ivp,$acc);		# save copy of ivp
-
-	&mov	($mark,0);		# copy of aes_key->rounds = 0;
-	# do we copy key schedule to stack?
-	&mov	($s1 eq "ebx" ? $s1 : "",$key);
-	&mov	($s2 eq "ecx" ? $s2 : "",244/4);
-	&sub	($s1,$tbl);
-	&mov	("esi",$key);
-	&and	($s1,0xfff);
-	&lea	("edi",$aes_key);
-	&cmp	($s1,2048+256);
-	&jb	(&label("do_copy"));
-	&cmp	($s1,4096-244);
-	&jb	(&label("skip_copy"));
-	&set_label("do_copy",4);
-		&mov	($_key,"edi");
-		&data_word(0xA5F3F689);	# rep movsd
-	&set_label("skip_copy");
-
-	&mov	($key,16);
-	&set_label("prefetch_tbl",4);
-		&mov	($s0,&DWP(0,$tbl));
-		&mov	($s1,&DWP(32,$tbl));
-		&mov	($s2,&DWP(64,$tbl));
-		&mov	($acc,&DWP(96,$tbl));
-		&lea	($tbl,&DWP(128,$tbl));
-		&sub	($key,1);
-	&jnz	(&label("prefetch_tbl"));
-	&sub	($tbl,2048);
-
-	&mov	($acc,$_inp);
-	&mov	($key,$_ivp);
-
-	&cmp	($s3,0);
-	&je	(&label("fast_decrypt"));
-
-#----------------------------- ENCRYPT -----------------------------#
-	&mov	($s0,&DWP(0,$key));		# load iv
-	&mov	($s1,&DWP(4,$key));
-
-	&set_label("fast_enc_loop",16);
-		&mov	($s2,&DWP(8,$key));
-		&mov	($s3,&DWP(12,$key));
-
-		&xor	($s0,&DWP(0,$acc));	# xor input data
-		&xor	($s1,&DWP(4,$acc));
-		&xor	($s2,&DWP(8,$acc));
-		&xor	($s3,&DWP(12,$acc));
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_encrypt");
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($key,$_out);		# load out
-
-		&mov	(&DWP(0,$key),$s0);	# save output data
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($s2,$_len);		# load len
-		&mov	($_inp,$acc);		# save inp
-		&lea	($s3,&DWP(16,$key));	# advance out
-		&mov	($_out,$s3);		# save out
-		&sub	($s2,16);		# decrease len
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("fast_enc_loop"));
-	&mov	($acc,$_ivp);		# load ivp
-	&mov	($s2,&DWP(8,$key));	# restore last 2 dwords
-	&mov	($s3,&DWP(12,$key));
-	&mov	(&DWP(0,$acc),$s0);	# save ivec
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-
-	&cmp	($mark,0);		# was the key schedule copied?
-	&mov	("edi",$_key);
-	&je	(&label("skip_ezero"));
-	# zero copy of key schedule
-	&mov	("ecx",240/4);
-	&xor	("eax","eax");
-	&align	(4);
-	&data_word(0xABF3F689);		# rep stosd
-	&set_label("skip_ezero");
-	&mov	("esp",$_esp);
-	&popf	();
-    &set_label("drop_out");
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-#----------------------------- DECRYPT -----------------------------#
-&set_label("fast_decrypt",16);
-
-	&cmp	($acc,$_out);
-	&je	(&label("fast_dec_in_place"));	# in-place processing...
-
-	&mov	($_tmp,$key);
-
-	&align	(4);
-	&set_label("fast_dec_loop",16);
-		&mov	($s0,&DWP(0,$acc));	# read input
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_decrypt");
-
-		&mov	($key,$_tmp);		# load ivp
-		&mov	($acc,$_len);		# load len
-		&xor	($s0,&DWP(0,$key));	# xor iv
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-		&mov	($key,$_out);		# load out
-		&mov	($acc,$_inp);		# load inp
-
-		&mov	(&DWP(0,$key),$s0);	# write output
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($s2,$_len);		# load len
-		&mov	($_tmp,$acc);		# save ivp
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&lea	($key,&DWP(16,$key));	# advance out
-		&mov	($_out,$key);		# save out
-		&sub	($s2,16);		# decrease len
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("fast_dec_loop"));
-	&mov	($key,$_tmp);		# load temp ivp
-	&mov	($acc,$_ivp);		# load user ivp
-	&mov	($s0,&DWP(0,$key));	# load iv
-	&mov	($s1,&DWP(4,$key));
-	&mov	($s2,&DWP(8,$key));
-	&mov	($s3,&DWP(12,$key));
-	&mov	(&DWP(0,$acc),$s0);	# copy back to user
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-	&jmp	(&label("fast_dec_out"));
-
-    &set_label("fast_dec_in_place",16);
-	&set_label("fast_dec_in_place_loop");
-		&mov	($s0,&DWP(0,$acc));	# read input
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&lea	($key,$ivec);
-		&mov	(&DWP(0,$key),$s0);	# copy to temp
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_decrypt");
-
-		&mov	($key,$_ivp);		# load ivp
-		&mov	($acc,$_out);		# load out
-		&xor	($s0,&DWP(0,$key));	# xor iv
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-		&mov	(&DWP(0,$acc),$s0);	# write output
-		&mov	(&DWP(4,$acc),$s1);
-		&mov	(&DWP(8,$acc),$s2);
-		&mov	(&DWP(12,$acc),$s3);
-
-		&lea	($acc,&DWP(16,$acc));	# advance out
-		&mov	($_out,$acc);		# save out
-
-		&lea	($acc,$ivec);
-		&mov	($s0,&DWP(0,$acc));	# read temp
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&mov	(&DWP(0,$key),$s0);	# copy iv
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($s2,$_len);		# load len
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&sub	($s2,16);		# decrease len
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("fast_dec_in_place_loop"));
-
-    &set_label("fast_dec_out",4);
-	&cmp	($mark,0);		# was the key schedule copied?
-	&mov	("edi",$_key);
-	&je	(&label("skip_dzero"));
-	# zero copy of key schedule
-	&mov	("ecx",240/4);
-	&xor	("eax","eax");
-	&align	(4);
-	&data_word(0xABF3F689);		# rep stosd
-	&set_label("skip_dzero");
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-#--------------------------- SLOW ROUTINE ---------------------------#
-&set_label("slow_way",16);
-
-	&mov	($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
-	&mov	($key,&wparam(3));	# load key
-
-	# pre-allocate aligned stack frame...
-	&lea	($acc,&DWP(-80,"esp"));
-	&and	($acc,-64);
-
-	# ... and make sure it doesn't alias with $key modulo 1024
-	&lea	($s1,&DWP(-80-63,$key));
-	&sub	($s1,$acc);
-	&neg	($s1);
-	&and	($s1,0x3C0);	# modulo 1024, but aligned to cache-line
-	&sub	($acc,$s1);
-
-	# pick S-box copy which can't overlap with stack frame or $key
-	&lea	($s1,&DWP(768,$acc));
-	&sub	($s1,$tbl);
-	&and	($s1,0x300);
-	&lea	($tbl,&DWP(2048+128,$tbl,$s1));
-
-	&lea	($s3,&wparam(0));	# pointer to parameter block
-
-	&exch	("esp",$acc);
-	&add	("esp",4);		# reserve for return address!
-	&mov	($_tbl,$tbl);		# save %ebp
-	&mov	($_esp,$acc);		# save %esp
-	&mov	($_tmp,$s0);		# save OPENSSL_ia32cap
-
-	&mov	($s0,&DWP(0,$s3));	# load inp
-	&mov	($s1,&DWP(4,$s3));	# load out
-	#&mov	($s2,&DWP(8,$s3));	# load len
-	#&mov	($key,&DWP(12,$s3));	# load key
-	&mov	($acc,&DWP(16,$s3));	# load ivp
-	&mov	($s3,&DWP(20,$s3));	# load enc flag
-
-	&mov	($_inp,$s0);		# save copy of inp
-	&mov	($_out,$s1);		# save copy of out
-	&mov	($_len,$s2);		# save copy of len
-	&mov	($_key,$key);		# save copy of key
-	&mov	($_ivp,$acc);		# save copy of ivp
-
-	&mov	($key,$acc);
-	&mov	($acc,$s0);
-
-	&cmp	($s3,0);
-	&je	(&label("slow_decrypt"));
-
-#--------------------------- SLOW ENCRYPT ---------------------------#
-	&cmp	($s2,16);
-	&mov	($s3,$s1);
-	&jb	(&label("slow_enc_tail"));
-
-					if (!$x86only) {
-	&bt	($_tmp,25);		# check for SSE bit
-	&jnc	(&label("slow_enc_x86"));
-
-	&movq	("mm0",&QWP(0,$key));	# load iv
-	&movq	("mm4",&QWP(8,$key));
-
-	&set_label("slow_enc_loop_sse",16);
-		&pxor	("mm0",&QWP(0,$acc));	# xor input data
-		&pxor	("mm4",&QWP(8,$acc));
-
-		&mov	($key,$_key);
-		&call	("_sse_AES_encrypt_compact");
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($key,$_out);		# load out
-		&mov	($s2,$_len);		# load len
-
-		&movq	(&QWP(0,$key),"mm0");	# save output data
-		&movq	(&QWP(8,$key),"mm4");
-
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&lea	($s3,&DWP(16,$key));	# advance out
-		&mov	($_out,$s3);		# save out
-		&sub	($s2,16);		# decrease len
-		&cmp	($s2,16);
-		&mov	($_len,$s2);		# save len
-	&jae	(&label("slow_enc_loop_sse"));
-	&test	($s2,15);
-	&jnz	(&label("slow_enc_tail"));
-	&mov	($acc,$_ivp);		# load ivp
-	&movq	(&QWP(0,$acc),"mm0");	# save ivec
-	&movq	(&QWP(8,$acc),"mm4");
-	&emms	();
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-					}
-    &set_label("slow_enc_x86",16);
-	&mov	($s0,&DWP(0,$key));	# load iv
-	&mov	($s1,&DWP(4,$key));
-
-	&set_label("slow_enc_loop_x86",4);
-		&mov	($s2,&DWP(8,$key));
-		&mov	($s3,&DWP(12,$key));
-
-		&xor	($s0,&DWP(0,$acc));	# xor input data
-		&xor	($s1,&DWP(4,$acc));
-		&xor	($s2,&DWP(8,$acc));
-		&xor	($s3,&DWP(12,$acc));
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_encrypt_compact");
-
-		&mov	($acc,$_inp);		# load inp
-		&mov	($key,$_out);		# load out
-
-		&mov	(&DWP(0,$key),$s0);	# save output data
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($s2,$_len);		# load len
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&lea	($s3,&DWP(16,$key));	# advance out
-		&mov	($_out,$s3);		# save out
-		&sub	($s2,16);		# decrease len
-		&cmp	($s2,16);
-		&mov	($_len,$s2);		# save len
-	&jae	(&label("slow_enc_loop_x86"));
-	&test	($s2,15);
-	&jnz	(&label("slow_enc_tail"));
-	&mov	($acc,$_ivp);		# load ivp
-	&mov	($s2,&DWP(8,$key));	# restore last dwords
-	&mov	($s3,&DWP(12,$key));
-	&mov	(&DWP(0,$acc),$s0);	# save ivec
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-    &set_label("slow_enc_tail",16);
-	&emms	()	if (!$x86only);
-	&mov	($key eq "edi"? $key:"",$s3);	# load out to edi
-	&mov	($s1,16);
-	&sub	($s1,$s2);
-	&cmp	($key,$acc eq "esi"? $acc:"");	# compare with inp
-	&je	(&label("enc_in_place"));
-	&align	(4);
-	&data_word(0xA4F3F689);	# rep movsb	# copy input
-	&jmp	(&label("enc_skip_in_place"));
-    &set_label("enc_in_place");
-	&lea	($key,&DWP(0,$key,$s2));
-    &set_label("enc_skip_in_place");
-	&mov	($s2,$s1);
-	&xor	($s0,$s0);
-	&align	(4);
-	&data_word(0xAAF3F689);	# rep stosb	# zero tail
-
-	&mov	($key,$_ivp);			# restore ivp
-	&mov	($acc,$s3);			# output as input
-	&mov	($s0,&DWP(0,$key));
-	&mov	($s1,&DWP(4,$key));
-	&mov	($_len,16);			# len=16
-	&jmp	(&label("slow_enc_loop_x86"));	# one more spin...
-
-#--------------------------- SLOW DECRYPT ---------------------------#
-&set_label("slow_decrypt",16);
-					if (!$x86only) {
-	&bt	($_tmp,25);		# check for SSE bit
-	&jnc	(&label("slow_dec_loop_x86"));
-
-	&set_label("slow_dec_loop_sse",4);
-		&movq	("mm0",&QWP(0,$acc));	# read input
-		&movq	("mm4",&QWP(8,$acc));
-
-		&mov	($key,$_key);
-		&call	("_sse_AES_decrypt_compact");
-
-		&mov	($acc,$_inp);		# load inp
-		&lea	($s0,$ivec);
-		&mov	($s1,$_out);		# load out
-		&mov	($s2,$_len);		# load len
-		&mov	($key,$_ivp);		# load ivp
-
-		&movq	("mm1",&QWP(0,$acc));	# re-read input
-		&movq	("mm5",&QWP(8,$acc));
-
-		&pxor	("mm0",&QWP(0,$key));	# xor iv
-		&pxor	("mm4",&QWP(8,$key));
-
-		&movq	(&QWP(0,$key),"mm1");	# copy input to iv
-		&movq	(&QWP(8,$key),"mm5");
-
-		&sub	($s2,16);		# decrease len
-		&jc	(&label("slow_dec_partial_sse"));
-
-		&movq	(&QWP(0,$s1),"mm0");	# write output
-		&movq	(&QWP(8,$s1),"mm4");
-
-		&lea	($s1,&DWP(16,$s1));	# advance out
-		&mov	($_out,$s1);		# save out
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-		&mov	($_len,$s2);		# save len
-	&jnz	(&label("slow_dec_loop_sse"));
-	&emms	();
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-    &set_label("slow_dec_partial_sse",16);
-	&movq	(&QWP(0,$s0),"mm0");	# save output to temp
-	&movq	(&QWP(8,$s0),"mm4");
-	&emms	();
-
-	&add	($s2 eq "ecx" ? "ecx":"",16);
-	&mov	("edi",$s1);		# out
-	&mov	("esi",$s0);		# temp
-	&align	(4);
-	&data_word(0xA4F3F689);		# rep movsb # copy partial output
-
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-					}
-	&set_label("slow_dec_loop_x86",16);
-		&mov	($s0,&DWP(0,$acc));	# read input
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&lea	($key,$ivec);
-		&mov	(&DWP(0,$key),$s0);	# copy to temp
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($key,$_key);		# load key
-		&call	("_x86_AES_decrypt_compact");
-
-		&mov	($key,$_ivp);		# load ivp
-		&mov	($acc,$_len);		# load len
-		&xor	($s0,&DWP(0,$key));	# xor iv
-		&xor	($s1,&DWP(4,$key));
-		&xor	($s2,&DWP(8,$key));
-		&xor	($s3,&DWP(12,$key));
-
-		&sub	($acc,16);
-		&jc	(&label("slow_dec_partial_x86"));
-
-		&mov	($_len,$acc);		# save len
-		&mov	($acc,$_out);		# load out
-
-		&mov	(&DWP(0,$acc),$s0);	# write output
-		&mov	(&DWP(4,$acc),$s1);
-		&mov	(&DWP(8,$acc),$s2);
-		&mov	(&DWP(12,$acc),$s3);
-
-		&lea	($acc,&DWP(16,$acc));	# advance out
-		&mov	($_out,$acc);		# save out
-
-		&lea	($acc,$ivec);
-		&mov	($s0,&DWP(0,$acc));	# read temp
-		&mov	($s1,&DWP(4,$acc));
-		&mov	($s2,&DWP(8,$acc));
-		&mov	($s3,&DWP(12,$acc));
-
-		&mov	(&DWP(0,$key),$s0);	# copy it to iv
-		&mov	(&DWP(4,$key),$s1);
-		&mov	(&DWP(8,$key),$s2);
-		&mov	(&DWP(12,$key),$s3);
-
-		&mov	($acc,$_inp);		# load inp
-		&lea	($acc,&DWP(16,$acc));	# advance inp
-		&mov	($_inp,$acc);		# save inp
-	&jnz	(&label("slow_dec_loop_x86"));
-	&mov	("esp",$_esp);
-	&popf	();
-	&function_end_A();
-	&pushf	();			# kludge, never executed
-
-    &set_label("slow_dec_partial_x86",16);
-	&lea	($acc,$ivec);
-	&mov	(&DWP(0,$acc),$s0);	# save output to temp
-	&mov	(&DWP(4,$acc),$s1);
-	&mov	(&DWP(8,$acc),$s2);
-	&mov	(&DWP(12,$acc),$s3);
-
-	&mov	($acc,$_inp);
-	&mov	($s0,&DWP(0,$acc));	# re-read input
-	&mov	($s1,&DWP(4,$acc));
-	&mov	($s2,&DWP(8,$acc));
-	&mov	($s3,&DWP(12,$acc));
-
-	&mov	(&DWP(0,$key),$s0);	# copy it to iv
-	&mov	(&DWP(4,$key),$s1);
-	&mov	(&DWP(8,$key),$s2);
-	&mov	(&DWP(12,$key),$s3);
-
-	&mov	("ecx",$_len);
-	&mov	("edi",$_out);
-	&lea	("esi",$ivec);
-	&align	(4);
-	&data_word(0xA4F3F689);		# rep movsb # copy partial output
-
-	&mov	("esp",$_esp);
-	&popf	();
-&function_end("AES_cbc_encrypt");
-}
-
-#------------------------------------------------------------------#
-
-# enckey: emit the core step of the encryption key expansion.
-# Each of the four bytes of %edx is substituted through the byte table
-# addressed at -128($tbl) and XORed into %eax with a one-byte rotation:
-# byte 0 of %edx lands in bits 24..31, byte 1 in bits 0..7, byte 2 in
-# bits 8..15 and byte 3 in bits 16..23.  Finally the round constant
-# selected by %ecx is XORed into %eax.
-# Clobbers %ebx and %esi; %edx is left shifted right by 16.
-sub enckey()
-{
-	&movz	("esi",&LB("edx"));		# rk[i]>>0
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&movz	("esi",&HB("edx"));		# rk[i]>>8
-	&shl	("ebx",24);
-	&xor	("eax","ebx");
-
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&shr	("edx",16);
-	&movz	("esi",&LB("edx"));		# rk[i]>>16
-	&xor	("eax","ebx");
-
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&movz	("esi",&HB("edx"));		# rk[i]>>24
-	&shl	("ebx",8);
-	&xor	("eax","ebx");
-
-	&movz	("ebx",&BP(-128,$tbl,"esi",1));
-	&shl	("ebx",16);
-	&xor	("eax","ebx");
-
-	&xor	("eax",&DWP(1024-128,$tbl,"ecx",4));	# rcon
-}
-
-# _x86_AES_set_encrypt_key: key-expansion routine shared by the public
-# AES_set_encrypt_key and AES_set_decrypt_key entry points.
-# Reads the user key pointer, the key length in bits and the key schedule
-# pointer from wparam(1), wparam(2) and wparam(3).
-# NOTE(review): these indices are presumably shifted by one relative to the
-# C prototype because the public wrappers `call` this routine, leaving an
-# extra return address on the stack -- confirm against x86asm.pl.
-# Returns in %eax: 0 on success, -1 if either pointer is NULL, -2 if the
-# bit count is not 128, 192 or 256.  On success the round count (10, 12 or
-# 14) is stored at byte offset 240 of the key schedule.
-&function_begin("_x86_AES_set_encrypt_key");
-	&mov	("esi",&wparam(1));		# user supplied key
-	&mov	("edi",&wparam(3));		# private key schedule
-
-	&test	("esi",-1);
-	&jz	(&label("badpointer"));
-	&test	("edi",-1);
-	&jz	(&label("badpointer"));
-
-	# position-independent address of AES_Te, then biased by 2048+128 so
-	# that the byte table used by enckey sits at -128($tbl)
-	&call	(&label("pic_point"));
-	&set_label("pic_point");
-	&blindpop($tbl);
-	&lea	($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
-	&lea	($tbl,&DWP(2048+128,$tbl));
-
-	# prefetch Te4
-	&mov	("eax",&DWP(0-128,$tbl));
-	&mov	("ebx",&DWP(32-128,$tbl));
-	&mov	("ecx",&DWP(64-128,$tbl));
-	&mov	("edx",&DWP(96-128,$tbl));
-	&mov	("eax",&DWP(128-128,$tbl));
-	&mov	("ebx",&DWP(160-128,$tbl));
-	&mov	("ecx",&DWP(192-128,$tbl));
-	&mov	("edx",&DWP(224-128,$tbl));
-
-	&mov	("ecx",&wparam(2));		# number of bits in key
-	&cmp	("ecx",128);
-	&je	(&label("10rounds"));
-	&cmp	("ecx",192);
-	&je	(&label("12rounds"));
-	&cmp	("ecx",256);
-	&je	(&label("14rounds"));
-	&mov	("eax",-2);			# invalid number of bits
-	&jmp	(&label("exit"));
-
-    &set_label("10rounds");
-	&mov	("eax",&DWP(0,"esi"));		# copy first 4 dwords
-	&mov	("ebx",&DWP(4,"esi"));
-	&mov	("ecx",&DWP(8,"esi"));
-	&mov	("edx",&DWP(12,"esi"));
-	&mov	(&DWP(0,"edi"),"eax");
-	&mov	(&DWP(4,"edi"),"ebx");
-	&mov	(&DWP(8,"edi"),"ecx");
-	&mov	(&DWP(12,"edi"),"edx");
-
-	# %eax and %edx still hold rk[0] and rk[3] from the copy above, so the
-	# first pass enters after the loop's two loads; %ecx counts iterations
-	&xor	("ecx","ecx");
-	&jmp	(&label("10shortcut"));
-
-	&align	(4);
-	&set_label("10loop");
-		&mov	("eax",&DWP(0,"edi"));		# rk[0]
-		&mov	("edx",&DWP(12,"edi"));		# rk[3]
-	&set_label("10shortcut");
-		&enckey	();
-
-		&mov	(&DWP(16,"edi"),"eax");		# rk[4]
-		&xor	("eax",&DWP(4,"edi"));
-		&mov	(&DWP(20,"edi"),"eax");		# rk[5]
-		&xor	("eax",&DWP(8,"edi"));
-		&mov	(&DWP(24,"edi"),"eax");		# rk[6]
-		&xor	("eax",&DWP(12,"edi"));
-		&mov	(&DWP(28,"edi"),"eax");		# rk[7]
-		&inc	("ecx");
-		&add	("edi",16);
-		&cmp	("ecx",10);
-	&jl	(&label("10loop"));
-
-	# %edi has advanced by 10*16 bytes, so 80(%edi) is offset 240
-	&mov	(&DWP(80,"edi"),10);		# setup number of rounds
-	&xor	("eax","eax");
-	&jmp	(&label("exit"));
-
-    &set_label("12rounds");
-	&mov	("eax",&DWP(0,"esi"));		# copy first 6 dwords
-	&mov	("ebx",&DWP(4,"esi"));
-	&mov	("ecx",&DWP(8,"esi"));
-	&mov	("edx",&DWP(12,"esi"));
-	&mov	(&DWP(0,"edi"),"eax");
-	&mov	(&DWP(4,"edi"),"ebx");
-	&mov	(&DWP(8,"edi"),"ecx");
-	&mov	(&DWP(12,"edi"),"edx");
-	&mov	("ecx",&DWP(16,"esi"));
-	&mov	("edx",&DWP(20,"esi"));
-	&mov	(&DWP(16,"edi"),"ecx");
-	&mov	(&DWP(20,"edi"),"edx");
-
-	# %eax = rk[0] and %edx = rk[5] are already loaded by the copy above
-	&xor	("ecx","ecx");
-	&jmp	(&label("12shortcut"));
-
-	&align	(4);
-	&set_label("12loop");
-		&mov	("eax",&DWP(0,"edi"));		# rk[0]
-		&mov	("edx",&DWP(20,"edi"));		# rk[5]
-	&set_label("12shortcut");
-		&enckey	();
-
-		&mov	(&DWP(24,"edi"),"eax");		# rk[6]
-		&xor	("eax",&DWP(4,"edi"));
-		&mov	(&DWP(28,"edi"),"eax");		# rk[7]
-		&xor	("eax",&DWP(8,"edi"));
-		&mov	(&DWP(32,"edi"),"eax");		# rk[8]
-		&xor	("eax",&DWP(12,"edi"));
-		&mov	(&DWP(36,"edi"),"eax");		# rk[9]
-
-		# the final pass stops here, after writing only four words
-		&cmp	("ecx",7);
-		&je	(&label("12break"));
-		&inc	("ecx");
-
-		&xor	("eax",&DWP(16,"edi"));
-		&mov	(&DWP(40,"edi"),"eax");		# rk[10]
-		&xor	("eax",&DWP(20,"edi"));
-		&mov	(&DWP(44,"edi"),"eax");		# rk[11]
-
-		&add	("edi",24);
-	&jmp	(&label("12loop"));
-
-	&set_label("12break");
-	# %edi has advanced by 7*24 bytes, so 72(%edi) is offset 240
-	&mov	(&DWP(72,"edi"),12);		# setup number of rounds
-	&xor	("eax","eax");
-	&jmp	(&label("exit"));
-
-    &set_label("14rounds");
-	&mov	("eax",&DWP(0,"esi"));		# copy first 8 dwords
-	&mov	("ebx",&DWP(4,"esi"));
-	&mov	("ecx",&DWP(8,"esi"));
-	&mov	("edx",&DWP(12,"esi"));
-	&mov	(&DWP(0,"edi"),"eax");
-	&mov	(&DWP(4,"edi"),"ebx");
-	&mov	(&DWP(8,"edi"),"ecx");
-	&mov	(&DWP(12,"edi"),"edx");
-	&mov	("eax",&DWP(16,"esi"));
-	&mov	("ebx",&DWP(20,"esi"));
-	&mov	("ecx",&DWP(24,"esi"));
-	&mov	("edx",&DWP(28,"esi"));
-	&mov	(&DWP(16,"edi"),"eax");
-	&mov	(&DWP(20,"edi"),"ebx");
-	&mov	(&DWP(24,"edi"),"ecx");
-	&mov	(&DWP(28,"edi"),"edx");
-
-	# %edx = rk[7] is already loaded by the copy above; %eax is not (it
-	# holds the fifth key dword), so the shortcut re-loads rk[0]
-	&xor	("ecx","ecx");
-	&jmp	(&label("14shortcut"));
-
-	&align	(4);
-	&set_label("14loop");
-		&mov	("edx",&DWP(28,"edi"));		# rk[7]
-	&set_label("14shortcut");
-		&mov	("eax",&DWP(0,"edi"));		# rk[0]
-
-		&enckey	();
-
-		&mov	(&DWP(32,"edi"),"eax");		# rk[8]
-		&xor	("eax",&DWP(4,"edi"));
-		&mov	(&DWP(36,"edi"),"eax");		# rk[9]
-		&xor	("eax",&DWP(8,"edi"));
-		&mov	(&DWP(40,"edi"),"eax");		# rk[10]
-		&xor	("eax",&DWP(12,"edi"));
-		&mov	(&DWP(44,"edi"),"eax");		# rk[11]
-
-		# the final pass stops here, after writing only four words
-		&cmp	("ecx",6);
-		&je	(&label("14break"));
-		&inc	("ecx");
-
-		# second half of the pass: rk[4] ^= byte-substituted rk[11],
-		# with each byte kept in its own position (no rotation) and no
-		# round constant
-		&mov	("edx","eax");
-		&mov	("eax",&DWP(16,"edi"));		# rk[4]
-		&movz	("esi",&LB("edx"));		# rk[11]>>0
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&movz	("esi",&HB("edx"));		# rk[11]>>8
-		&xor	("eax","ebx");
-
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&shr	("edx",16);
-		&shl	("ebx",8);
-		&movz	("esi",&LB("edx"));		# rk[11]>>16
-		&xor	("eax","ebx");
-
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&movz	("esi",&HB("edx"));		# rk[11]>>24
-		&shl	("ebx",16);
-		&xor	("eax","ebx");
-
-		&movz	("ebx",&BP(-128,$tbl,"esi",1));
-		&shl	("ebx",24);
-		&xor	("eax","ebx");
-
-		&mov	(&DWP(48,"edi"),"eax");		# rk[12]
-		&xor	("eax",&DWP(20,"edi"));
-		&mov	(&DWP(52,"edi"),"eax");		# rk[13]
-		&xor	("eax",&DWP(24,"edi"));
-		&mov	(&DWP(56,"edi"),"eax");		# rk[14]
-		&xor	("eax",&DWP(28,"edi"));
-		&mov	(&DWP(60,"edi"),"eax");		# rk[15]
-
-		&add	("edi",32);
-	&jmp	(&label("14loop"));
-
-	&set_label("14break");
-	# %edi has advanced by 6*32 bytes, so 48(%edi) is offset 240
-	&mov	(&DWP(48,"edi"),14);		# setup number of rounds
-	&xor	("eax","eax");
-	&jmp	(&label("exit"));
-
-    &set_label("badpointer");
-	&mov	("eax",-1);
-    &set_label("exit");
-&function_end("_x86_AES_set_encrypt_key");
-
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-# Public entry point: calls _x86_AES_set_encrypt_key and returns straight
-# away, so that routine's %eax result becomes the return value.
-&function_begin_B("AES_set_encrypt_key");
-	&call	("_x86_AES_set_encrypt_key");
-	&ret	();
-&function_end_B("AES_set_encrypt_key");
-
-# deckey: emit the transformation of one 32-bit round-key word, in place,
-# for the decryption key schedule.
-#   $i    - index of the word within the current 16-byte chunk
-#   $key  - register pointing at the chunk
-#   $tp1  - register already holding word $i (see modulo scheduling below)
-#   $tp2, $tp4, $tp8 - scratch registers
-# tp2, tp4 and tp8 are obtained by repeatedly doubling every byte of the
-# previous value, XORing 0x1b into each byte whose top bit was set.  The
-# result, built up as annotated on the individual lines, is stored to
-# 4*$i($key).
-# The next word, 4*($i+1)($key), is loaded early and left in $tp2; the
-# callers pass that same register as $tp1 of the following deckey call.
-# Clobbers $acc and $tbl ($tbl is borrowed as a temporary).
-sub deckey()
-{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
-  my $tmp = $tbl;
-
-	# tp2 = tp1 with every byte doubled
-	&mov	($tmp,0x80808080);
-	&and	($tmp,$tp1);
-	&lea	($tp2,&DWP(0,$tp1,$tp1));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&sub	($acc,$tmp);
-	&and	($tp2,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	&xor	($tp2,$acc);
-	&mov	($tmp,0x80808080);
-
-	# tp4 = tp2 with every byte doubled
-	&and	($tmp,$tp2);
-	&lea	($tp4,&DWP(0,$tp2,$tp2));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	&sub	($acc,$tmp);
-	&and	($tp4,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &xor	($tp2,$tp1);	# tp2^tp1
-	&xor	($tp4,$acc);
-	&mov	($tmp,0x80808080);
-
-	# tp8 = tp4 with every byte doubled
-	&and	($tmp,$tp4);
-	&lea	($tp8,&DWP(0,$tp4,$tp4));
-	&mov	($acc,$tmp);
-	&shr	($tmp,7);
-	 &xor	($tp4,$tp1);	# tp4^tp1
-	&sub	($acc,$tmp);
-	&and	($tp8,0xfefefefe);
-	&and	($acc,0x1b1b1b1b);
-	 &rotl	($tp1,8);	# = ROTATE(tp1,8)
-	&xor	($tp8,$acc);
-
-	&mov	($tmp,&DWP(4*($i+1),$key));	# modulo-scheduled load
-
-	&xor	($tp1,$tp2);
-	&xor	($tp2,$tp8);
-	&xor	($tp1,$tp4);
-	&rotl	($tp2,24);
-	&xor	($tp4,$tp8);
-	&xor	($tp1,$tp8);	# ^= tp8^(tp4^tp1)^(tp2^tp1)
-	&rotl	($tp4,16);
-	&xor	($tp1,$tp2);	# ^= ROTATE(tp8^tp2^tp1,24)
-	&rotl	($tp8,8);
-	&xor	($tp1,$tp4);	# ^= ROTATE(tp8^tp4^tp1,16)
-	&mov	($tp2,$tmp);
-	&xor	($tp1,$tp8);	# ^= ROTATE(tp8,8)
-
-	&mov	(&DWP(4*$i,$key),$tp1);
-}
-
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-# Builds the decryption key schedule in three steps:
-#   1. expand the key with _x86_AES_set_encrypt_key, returning its error
-#      code unchanged if %eax is non-zero;
-#   2. reverse the order of the 16-byte round-key chunks;
-#   3. run deckey over every word of chunks 1 .. rounds-1, leaving the
-#      first and last chunk untouched.
-# Returns 0 in %eax on success.
-&function_begin_B("AES_set_decrypt_key");
-	&call	("_x86_AES_set_encrypt_key");
-	&cmp	("eax",0);
-	&je	(&label("proceed"));
-	&ret	();
-
-    &set_label("proceed");
-	&push	("ebp");
-	&push	("ebx");
-	&push	("esi");
-	&push	("edi");
-
-	# %ecx = rounds*4, hence %edi = key + rounds*16
-	&mov	("esi",&wparam(2));
-	&mov	("ecx",&DWP(240,"esi"));	# pull number of rounds
-	&lea	("ecx",&DWP(0,"","ecx",4));
-	&lea	("edi",&DWP(0,"esi","ecx",4));	# pointer to last chunk
-
-	# swap chunks pairwise from both ends until the pointers meet
-	&set_label("invert",4);			# invert order of chunks
-		&mov	("eax",&DWP(0,"esi"));
-		&mov	("ebx",&DWP(4,"esi"));
-		&mov	("ecx",&DWP(0,"edi"));
-		&mov	("edx",&DWP(4,"edi"));
-		&mov	(&DWP(0,"edi"),"eax");
-		&mov	(&DWP(4,"edi"),"ebx");
-		&mov	(&DWP(0,"esi"),"ecx");
-		&mov	(&DWP(4,"esi"),"edx");
-		&mov	("eax",&DWP(8,"esi"));
-		&mov	("ebx",&DWP(12,"esi"));
-		&mov	("ecx",&DWP(8,"edi"));
-		&mov	("edx",&DWP(12,"edi"));
-		&mov	(&DWP(8,"edi"),"eax");
-		&mov	(&DWP(12,"edi"),"ebx");
-		&mov	(&DWP(8,"esi"),"ecx");
-		&mov	(&DWP(12,"esi"),"edx");
-		&add	("esi",16);
-		&sub	("edi",16);
-		&cmp	("esi","edi");
-	&jne	(&label("invert"));
-
-	# loop bound = key + 8*(2*rounds-2) = key + 16*(rounds-1); it is kept
-	# in the stack slot of the key argument, which is no longer needed
-	&mov	($key,&wparam(2));
-	&mov	($acc,&DWP(240,$key));		# pull number of rounds
-	&lea	($acc,&DWP(-2,$acc,$acc));
-	&lea	($acc,&DWP(0,$key,$acc,8));
-	&mov	(&wparam(2),$acc);
-
-	&mov	($s0,&DWP(16,$key));		# modulo-scheduled load
-	&set_label("permute",4);		# permute the key schedule
-		&add	($key,16);
-		# each call leaves the next word in the register that the
-		# following call receives as its input
-		&deckey	(0,$key,$s0,$s1,$s2,$s3);
-		&deckey	(1,$key,$s1,$s2,$s3,$s0);
-		&deckey	(2,$key,$s2,$s3,$s0,$s1);
-		&deckey	(3,$key,$s3,$s0,$s1,$s2);
-		&cmp	($key,&wparam(2));
-	&jb	(&label("permute"));
-
-	&xor	("eax","eax");			# return success
-&function_end("AES_set_decrypt_key");
-# Embed an identification string in the generated module.
-&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
-
-# NOTE(review): asm_finish is defined outside this file; presumably it
-# writes out the accumulated assembly -- confirm against x86asm.pl.
-&asm_finish();
-
-close STDOUT;

+ 3 - 3
libs/openssl/crypto/aes/asm/aes-s390x.pl

@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2007-2019 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
@@ -38,14 +38,14 @@
 # Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
 # for 128-bit keys, if hardware support is detected.
 
-# Januray 2009.
+# January 2009.
 #
 # Add support for hardware AES192/256 and reschedule instructions to
 # minimize/avoid Address Generation Interlock hazard and to favour
 # dual-issue z10 pipeline. This gave ~25% improvement on z10 and
 # almost 50% on z9. The gain is smaller on z10, because being dual-
 # issue z10 makes it impossible to eliminate the interlock condition:
-# critial path is not long enough. Yet it spends ~24 cycles per byte
+# critical path is not long enough. Yet it spends ~24 cycles per byte
 # processed with 128-bit key.
 #
 # Unlike previous version hardware support detection takes place only

+ 0 - 2916
libs/openssl/crypto/aes/asm/aes-x86_64.pl

@@ -1,2916 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2005-2019 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 2.1.
-#
-# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
-# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
-# [you'll notice a lot of resemblance], such as compressed S-boxes
-# in little-endian byte order, prefetch of these tables in CBC mode,
-# as well as avoiding L1 cache aliasing between stack frame and key
-# schedule and already mentioned tables, compressed Td4...
-#
-# Performance in number of cycles per processed byte for 128-bit key:
-#
-#		ECB encrypt	ECB decrypt	CBC large chunk
-# AMD64		33		43		13.0
-# EM64T		38		56		18.6(*)
-# Core 2	30		42		14.5(*)
-# Atom		65		86		32.1(*)
-#
-# (*) with hyper-threading off
-
-# Command line: [flavour] output.  A first argument that contains a dot is
-# taken to be the output file name and the flavour is left undefined.
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-# $win64 is set for a nasm/masm/mingw64 flavour or an output name ending
-# in .asm.
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-# Look for x86_64-xlate.pl next to this script, then in ../../perlasm.
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-# From here on STDOUT is a pipe into the translator script.
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
-
-$verticalspin=1;	# unlike 32-bit version $verticalspin performs
-			# ~15% better on both AMD and Intel cores
-$speed_limit=512;	# see aes-586.pl for details
-
-# Generated assembly is accumulated in $code.
-$code=".text\n";
-
-# Register allocation.  $mask80/$maskfe/$mask1b are the 64-bit names of
-# the registers whose 32-bit halves are $acc0/$acc1/$acc2.
-$s0="%eax";
-$s1="%ebx";
-$s2="%ecx";
-$s3="%edx";
-$acc0="%esi";	$mask80="%rsi";
-$acc1="%edi";	$maskfe="%rdi";
-$acc2="%ebp";	$mask1b="%rbp";
-$inp="%r8";
-$out="%r9";
-$t0="%r10d";
-$t1="%r11d";
-$t2="%r12d";
-$rnds="%r13d";
-$sbox="%r14";
-$key="%r15";
-
-# hi: map %eax/%rax .. %edx/%rdx to the high-byte name %ah .. %dh.
-sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1h/;	$r; }
-# lo: map a register name to its low-byte name: %eax -> %al,
-# %esi -> %sil, %r10d -> %r10b.
-sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%\1l/;
-			$r =~ s/%[er]([sd]i)/%\1l/;
-			$r =~ s/%(r[0-9]+)[d]?/%\1b/;	$r; }
-# LO: map a 64-bit register name to its 32-bit name: %rsi -> %esi,
-# %r8 -> %r8d.
-sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
-			$r =~ s/%r([0-9]+)/%r\1d/;	$r; }
-# _data_word: for each argument append one ".long" line to $code that
-# contains the 32-bit value twice.
-sub _data_word()
-{ my $i;
-    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
-}
-# data_word: append a single ".long" line to $code listing all arguments
-# as comma-separated 32-bit hex values.
-sub data_word()
-{ my $i;
-  my $last=pop(@_);	# printed separately so the line has no trailing comma
-    $code.=".long\t";
-    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
-    $code.=sprintf"0x%08x\n",$last;
-}
-
-# data_byte: append a single ".byte" line to $code listing all arguments,
-# each masked to its low 8 bits, as comma-separated hex values.
-sub data_byte()
-{ my $i;
-  my $last=pop(@_);	# printed separately so the line has no trailing comma
-    $code.=".byte\t";
-    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
-    $code.=sprintf"0x%02x\n",$last&0xff;
-}
-
-# encvert: append one table-driven encryption round to $code, with the
-# four state words $s0..$s3 processed side by side.
-# Every state byte indexes $sbox with scale 8 and the entry is read at
-# byte offset 0, 3, 2 or 1 depending on the byte's position in its word.
-# $key is advanced by 16 and the round-key words at 0..12($key) are XORed
-# with the accumulated $t0..$t3 to form the new $s0..$s3.
-# Uses %r8d as a fourth temporary, which destroys $inp.
-sub encvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	# favor 3-way issue Opteron pipeline...
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	mov	0($sbox,$acc0,8),$t0
-	mov	0($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	movzb	`&lo("$s3")`,$acc2
-	xor	3($sbox,$acc0,8),$t0
-	xor	3($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s3")`,$acc0
-	shr	\$16,$s2
-	movzb	`&hi("$s0")`,$acc2
-	xor	3($sbox,$acc0,8),$t2
-	shr	\$16,$s3
-	xor	3($sbox,$acc2,8),$t3
-
-	shr	\$16,$s1
-	lea	16($key),$key
-	shr	\$16,$s0
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	xor	2($sbox,$acc0,8),$t0
-	xor	2($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	movzb	`&lo("$s1")`,$acc2
-	xor	1($sbox,$acc0,8),$t0
-	xor	1($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t3
-
-	mov	12($key),$s3
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	mov	0($key),$s0
-	xor	1($sbox,$acc1,8),$t2
-	xor	1($sbox,$acc2,8),$t3
-
-	mov	4($key),$s1
-	mov	8($key),$s2
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# enclastvert: append the final encryption round to $code, in the same
-# side-by-side arrangement as encvert.
-# Table entries are read at byte offset 0 or 2 and then either
-# zero-extended from one byte or masked to one byte lane (0x0000ff00,
-# 0x00ff0000, 0xff000000), so each lookup contributes exactly one byte of
-# the result.  The result is XORed with the round key at 16..28($key);
-# $key itself is not advanced.
-# Uses %r8d as a fourth temporary, which destroys $inp.
-sub enclastvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	movzb	2($sbox,$acc0,8),$t0
-	movzb	2($sbox,$acc1,8),$t1
-	movzb	2($sbox,$acc2,8),$t2
-
-	movzb	`&lo("$s3")`,$acc0
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	movzb	2($sbox,$acc0,8),$t3
-	mov	0($sbox,$acc1,8),$acc1	#$t0
-	mov	0($sbox,$acc2,8),$acc2	#$t1
-
-	and	\$0x0000ff00,$acc1
-	and	\$0x0000ff00,$acc2
-
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-	shr	\$16,$s2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	shr	\$16,$s3
-	mov	0($sbox,$acc0,8),$acc0	#$t2
-	mov	0($sbox,$acc1,8),$acc1	#$t3
-
-	and	\$0x0000ff00,$acc0
-	and	\$0x0000ff00,$acc1
-	shr	\$16,$s1
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-	shr	\$16,$s0
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	mov	0($sbox,$acc0,8),$acc0	#$t0
-	mov	0($sbox,$acc1,8),$acc1	#$t1
-	mov	0($sbox,$acc2,8),$acc2	#$t2
-
-	and	\$0x00ff0000,$acc0
-	and	\$0x00ff0000,$acc1
-	and	\$0x00ff0000,$acc2
-
-	xor	$acc0,$t0
-	xor	$acc1,$t1
-	xor	$acc2,$t2
-
-	movzb	`&lo("$s1")`,$acc0
-	movzb	`&hi("$s3")`,$acc1
-	movzb	`&hi("$s0")`,$acc2
-	mov	0($sbox,$acc0,8),$acc0	#$t3
-	mov	2($sbox,$acc1,8),$acc1	#$t0
-	mov	2($sbox,$acc2,8),$acc2	#$t1
-
-	and	\$0x00ff0000,$acc0
-	and	\$0xff000000,$acc1
-	and	\$0xff000000,$acc2
-
-	xor	$acc0,$t3
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	mov	16+12($key),$s3
-	mov	2($sbox,$acc0,8),$acc0	#$t2
-	mov	2($sbox,$acc1,8),$acc1	#$t3
-	mov	16+0($key),$s0
-
-	and	\$0xff000000,$acc0
-	and	\$0xff000000,$acc1
-
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-
-	mov	16+4($key),$s1
-	mov	16+8($key),$s2
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# encstep: append the code computing one output word of a regular
-# encryption round (the alternative to encvert when $verticalspin is off).
-#   $i - index 0..3 of the word being produced
-#   @s - the four state registers, rotated so that $s[0] is word $i
-# Words 0..2 are produced in $t0..$t2.  Word 3 is produced directly in
-# $s[0], using $s[1..3] as scratch, after which $t0..$t2 are moved into
-# $s[1..3].  The step for $i==0 also advances $key by 16, and every step
-# XORs in the round-key word at 4*$i($key).
-sub encstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	# the last step may destroy the remaining state registers
-	if ($i==3) {
-		$tmp0=$s[1];
-		$tmp1=$s[2];
-		$tmp2=$s[3];
-	}
-	$code.="	movzb	".&lo($s[0]).",$out\n";
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	lea	16($key),$key\n"	if ($i==0);
-
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	mov	0($sbox,$out,8),$out\n";
-
-	$code.="	shr	\$16,$tmp1\n";
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-	$code.="	xor	3($sbox,$tmp0,8),$out\n";
-
-	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-	$code.="	xor	4*$i($key),$out\n";
-
-	$code.="	xor	2($sbox,$tmp1,8),$out\n";
-	$code.="	xor	1($sbox,$tmp2,8),$out\n";
-
-	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# enclast: append the code computing one output word of the final
-# encryption round (the alternative to enclastvert when $verticalspin is
-# off).  Arguments and register usage are the same as for encstep.
-# Each table lookup is masked to a single byte lane before being combined.
-# No round key is applied and $key is not advanced; the caller XORs the
-# last round key itself.
-sub enclast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	# the last step may destroy the remaining state registers
-	if ($i==3) {
-		$tmp0=$s[1];
-		$tmp1=$s[2];
-		$tmp2=$s[3];
-	}
-	$code.="	movzb	".&lo($s[0]).",$out\n";
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-
-	$code.="	mov	2($sbox,$out,8),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-	$code.="	and	\$0x000000ff,$out\n";
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
-	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
-	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";
-
-	$code.="	and	\$0x0000ff00,$tmp0\n";
-	$code.="	and	\$0x00ff0000,$tmp1\n";
-	$code.="	and	\$0xff000000,$tmp2\n";
-
-	$code.="	xor	$tmp0,$out\n";
-	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
-	$code.="	xor	$tmp1,$out\n";
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	xor	$tmp2,$out\n";
-	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# Emit _x86_64_AES_encrypt, the internal table-driven block encryption
-# routine.  It XORs the state in $s0..$s3 with the first round key, loads
-# the round count from 240($key), runs the regular round rounds-1 times
-# and finishes with the last-round sequence.
-$code.=<<___;
-.type	_x86_64_AES_encrypt,\@abi-omnipotent
-.align	16
-_x86_64_AES_encrypt:
-	xor	0($key),$s0			# xor with key
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-
-	mov	240($key),$rnds			# load key->rounds
-	sub	\$1,$rnds
-	jmp	.Lenc_loop
-.align	16
-.Lenc_loop:
-___
-	# loop body: one regular round in either layout
-	if ($verticalspin) { &encvert(); }
-	else {	&encstep(0,$s0,$s1,$s2,$s3);
-		&encstep(1,$s1,$s2,$s3,$s0);
-		&encstep(2,$s2,$s3,$s0,$s1);
-		&encstep(3,$s3,$s0,$s1,$s2);
-	}
-$code.=<<___;
-	sub	\$1,$rnds
-	jnz	.Lenc_loop
-___
-	# final round; enclast does not apply the round key, so it is XORed
-	# explicitly in that case (enclastvert does it itself)
-	if ($verticalspin) { &enclastvert(); }
-	else {	&enclast(0,$s0,$s1,$s2,$s3);
-		&enclast(1,$s1,$s2,$s3,$s0);
-		&enclast(2,$s2,$s3,$s0,$s1);
-		&enclast(3,$s3,$s0,$s1,$s2);
-		$code.=<<___;
-		xor	16+0($key),$s0		# xor with key
-		xor	16+4($key),$s1
-		xor	16+8($key),$s2
-		xor	16+12($key),$s3
-___
-	}
-$code.=<<___;
-	.byte	0xf3,0xc3			# rep ret
-.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
-___
-
-# enccompactvert: append the byte-substitution part of a "compact" round
-# to $code.  Every state byte is replaced by a byte-wide lookup in $sbox
-# (scale 1) and the results are reassembled into $s0..$s3 with shifts by
-# 8, 16 and 24.  No round key is applied here.
-# The temporaries %r8d, %r9d and %r13d are the registers otherwise named
-# $inp, $out and $rnds, so those values are destroyed.
-#
-# it's possible to implement this by shifting tN by 8, filling least
-# significant byte with byte load and finally bswap-ing at the end,
-# but such partial register load kills Core 2...
-sub enccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$t0
-	movzb	`&lo("$s1")`,$t1
-	movzb	`&lo("$s2")`,$t2
-	movzb	`&lo("$s3")`,$t3
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	shr	\$16,$s2
-	movzb	`&hi("$s3")`,$acc2
-	movzb	($sbox,$t0,1),$t0
-	movzb	($sbox,$t1,1),$t1
-	movzb	($sbox,$t2,1),$t2
-	movzb	($sbox,$t3,1),$t3
-
-	movzb	($sbox,$acc0,1),$t4	#$t0
-	movzb	`&hi("$s0")`,$acc0
-	movzb	($sbox,$acc1,1),$t5	#$t1
-	movzb	`&lo("$s2")`,$acc1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-
-	shl	\$8,$t4
-	shr	\$16,$s3
-	shl	\$8,$t5
-	xor	$t4,$t0
-	shr	\$16,$s0
-	movzb	`&lo("$s3")`,$t4
-	shr	\$16,$s1
-	xor	$t5,$t1
-	shl	\$8,$acc2
-	movzb	`&lo("$s0")`,$t5
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	xor	$acc2,$t2
-
-	shl	\$8,$acc0
-	movzb	`&lo("$s1")`,$acc2
-	shl	\$16,$acc1
-	xor	$acc0,$t3
-	movzb	($sbox,$t4,1),$t4	#$t1
-	movzb	`&hi("$s3")`,$acc0
-	movzb	($sbox,$t5,1),$t5	#$t2
-	xor	$acc1,$t0
-
-	shr	\$8,$s2
-	movzb	`&hi("$s0")`,$acc1
-	shl	\$16,$t4
-	shr	\$8,$s1
-	shl	\$16,$t5
-	xor	$t4,$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t3
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$acc1	#$t1
-	movzb	($sbox,$s2,1),$s3	#$t3
-	movzb	($sbox,$s1,1),$s2	#$t2
-
-	shl	\$16,$acc2
-	xor	$t5,$t2
-	shl	\$24,$acc0
-	xor	$acc2,$t3
-	shl	\$24,$acc1
-	xor	$acc0,$t0
-	shl	\$24,$s3
-	xor	$acc1,$t1
-	shl	\$24,$s2
-	mov	$t0,$s0
-	mov	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# enctransform_ref: append the column transform of a single state word.
-#   $sn - register holding the word; it is transformed in place
-# With r2 = $sn with every byte doubled (0x1b XORed into each byte whose
-# top bit was set), the emitted code computes
-#   $sn = rol($sn ^ r2, 24) ^ r2 ^ ror($sn, 16) ^ ror($sn, 24)
-# Uses %r8d, %r9d and %r13d as scratch.
-# NOTE(review): judging by the name this is the straightforward one-word
-# form kept for reference next to enctransform -- confirm it is unused.
-sub enctransform_ref()
-{ my $sn = shift;
-  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	mov	$sn,$acc
-	and	\$0x80808080,$acc
-	mov	$acc,$tmp
-	shr	\$7,$tmp
-	lea	($sn,$sn),$r2
-	sub	$tmp,$acc
-	and	\$0xfefefefe,$r2
-	and	\$0x1b1b1b1b,$acc
-	mov	$sn,$tmp
-	xor	$acc,$r2
-
-	xor	$r2,$sn
-	rol	\$24,$sn
-	xor	$r2,$sn
-	ror	\$16,$tmp
-	xor	$tmp,$sn
-	ror	\$8,$tmp
-	xor	$tmp,$sn
-___
-}
-
-# unlike decrypt case it does not pay off to parallelize enctransform
-#
-# enctransform() takes no arguments and appends to $code the column-mixing
-# step for all four state words $s0..$s3, updating them in place.  For each
-# word it performs the same computation as enctransform_ref(): a byte-wise
-# doubling (masks 0x80808080 / 0xfefefefe / 0x1b1b1b1b) followed by
-#
-#   s = rol(s ^ dbl, 24) ^ dbl ^ ror(s_orig, 16) ^ ror(s_orig, 24)
-#
-# The words are handled as two pairs ($s0,$s1 then $s2,$s3) whose
-# instructions are interleaved; the lines indented by one extra space start
-# the second pair while the first pair is still being finished.
-# $t3 is an alias for $acc2, and %r8d/%r9d hold the doubled values.
-# Near the end four loads from 0/64/128/192($sbox) are issued into
-# $acc0, $acc1, %r8d and %r9d; their results are not used by this
-# sequence (the inline note calls them a Te4 prefetch).
-sub enctransform()
-{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
-
-$code.=<<___;
-	mov	\$0x80808080,$t0
-	mov	\$0x80808080,$t1
-	and	$s0,$t0
-	and	$s1,$t1
-	mov	$t0,$acc0
-	mov	$t1,$acc1
-	shr	\$7,$t0
-	lea	($s0,$s0),$r20
-	shr	\$7,$t1
-	lea	($s1,$s1),$r21
-	sub	$t0,$acc0
-	sub	$t1,$acc1
-	and	\$0xfefefefe,$r20
-	and	\$0xfefefefe,$r21
-	and	\$0x1b1b1b1b,$acc0
-	and	\$0x1b1b1b1b,$acc1
-	mov	$s0,$t0
-	mov	$s1,$t1
-	xor	$acc0,$r20
-	xor	$acc1,$r21
-
-	xor	$r20,$s0
-	xor	$r21,$s1
-	 mov	\$0x80808080,$t2
-	rol	\$24,$s0
-	 mov	\$0x80808080,$t3
-	rol	\$24,$s1
-	 and	$s2,$t2
-	 and	$s3,$t3
-	xor	$r20,$s0
-	xor	$r21,$s1
-	 mov	$t2,$acc0
-	ror	\$16,$t0
-	 mov	$t3,$acc1
-	ror	\$16,$t1
-	 lea	($s2,$s2),$r20
-	 shr	\$7,$t2
-	xor	$t0,$s0
-	 shr	\$7,$t3
-	xor	$t1,$s1
-	ror	\$8,$t0
-	 lea	($s3,$s3),$r21
-	ror	\$8,$t1
-	 sub	$t2,$acc0
-	 sub	$t3,$acc1
-	xor	$t0,$s0
-	xor	$t1,$s1
-
-	and	\$0xfefefefe,$r20
-	and	\$0xfefefefe,$r21
-	and	\$0x1b1b1b1b,$acc0
-	and	\$0x1b1b1b1b,$acc1
-	mov	$s2,$t2
-	mov	$s3,$t3
-	xor	$acc0,$r20
-	xor	$acc1,$r21
-
-	ror	\$16,$t2
-	xor	$r20,$s2
-	ror	\$16,$t3
-	xor	$r21,$s3
-	rol	\$24,$s2
-	mov	0($sbox),$acc0			# prefetch Te4
-	rol	\$24,$s3
-	xor	$r20,$s2
-	mov	64($sbox),$acc1
-	xor	$r21,$s3
-	mov	128($sbox),$r20
-	xor	$t2,$s2
-	ror	\$8,$t2
-	xor	$t3,$s3
-	ror	\$8,$t3
-	xor	$t2,$s2
-	mov	192($sbox),$r21
-	xor	$t3,$s3
-___
-}
-
-# Emit the internal routine _x86_64_AES_encrypt_compact.
-#
-# Inputs are passed in registers rather than per the C ABI: the state in
-# $s0..$s3, the current round-key pointer in $key and the byte table in
-# $sbox; the result is returned in $s0..$s3.  The routine first touches
-# the 256-byte table at a 32-byte stride (the loaded values are discarded),
-# then loops:
-#
-#   state ^= 16 bytes at $key;  $key += 16;
-#   enccompactvert();                 # byte substitution + byte shuffle
-#   if ($key == end of key schedule) leave the loop;
-#   enctransform();                   # column mixing
-#
-# and finally xors in the last 16 bytes of round key at $key.
-$code.=<<___;
-.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
-.align	16
-_x86_64_AES_encrypt_compact:
-.cfi_startproc
-	lea	128($sbox),$inp			# size optimization
-	mov	0-128($inp),$acc1		# prefetch Te4
-	mov	32-128($inp),$acc2
-	mov	64-128($inp),$t0
-	mov	96-128($inp),$t1
-	mov	128-128($inp),$acc1
-	mov	160-128($inp),$acc2
-	mov	192-128($inp),$t0
-	mov	224-128($inp),$t1
-	jmp	.Lenc_loop_compact
-.align	16
-.Lenc_loop_compact:
-		xor	0($key),$s0		# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-		lea	16($key),$key
-___
-		&enccompactvert();
-		# The compare below reads 16(%rsp) inside the callee.  AES_encrypt
-		# stores the end-of-key-schedule pointer at its own 8(%rsp) and
-		# `call` pushes an 8-byte return address, hence the offset of 16.
-$code.=<<___;
-		cmp	16(%rsp),$key
-		je	.Lenc_compact_done
-___
-		&enctransform();
-$code.=<<___;
-	jmp	.Lenc_loop_compact
-.align	16
-.Lenc_compact_done:
-	xor	0($key),$s0
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
-___
-
-# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
-#
-# Emit the public single-block entry point (also exported under the hidden
-# alias asm_AES_encrypt).  Arguments arrive as %rdi=inp, %rsi=out, %rdx=key.
-#
-# Outline of the emitted code:
-#  - push %rbx, %rbp, %r12-%r15 and remember the entry %rsp in %rax;
-#  - align %rsp down to 64 bytes, lower it by a further multiple of 64
-#    (0..0x3c0) computed from the distance between the key and the stack,
-#    then reserve 32 bytes laid out as
-#        0(%rsp)  key schedule pointer
-#        8(%rsp)  end of key schedule = key + 16*rounds (rounds at key+240)
-#       16(%rsp)  out
-#       24(%rsp)  entry %rsp
-#  - load the 16 input bytes into $s0..$s3;
-#  - point $sbox at .LAES_Te+2048 plus ((%rsp+768-$sbox) & 0x300), i.e. one
-#    of four copies spaced 256 bytes apart;
-#  - call _x86_64_AES_encrypt_compact and store $s0..$s3 to out;
-#  - reload the saved registers from -48..-8 relative to the entry %rsp
-#    (they are not popped, because %rsp was realigned) and return.
-$code.=<<___;
-.globl	AES_encrypt
-.type	AES_encrypt,\@function,3
-.align	16
-.globl	asm_AES_encrypt
-.hidden	asm_AES_encrypt
-asm_AES_encrypt:
-AES_encrypt:
-.cfi_startproc
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-
-	# allocate frame "above" key schedule
-	lea	-63(%rdx),%rcx	# %rdx is key argument
-	and	\$-64,%rsp
-	sub	%rsp,%rcx
-	neg	%rcx
-	and	\$0x3c0,%rcx
-	sub	%rcx,%rsp
-	sub	\$32,%rsp
-
-	mov	%rsi,16(%rsp)	# save out
-	mov	%rax,24(%rsp)	# save original stack pointer
-.cfi_cfa_expression	%rsp+24,deref,+8
-.Lenc_prologue:
-
-	mov	%rdx,$key
-	mov	240($key),$rnds	# load rounds
-
-	mov	0(%rdi),$s0	# load input vector
-	mov	4(%rdi),$s1
-	mov	8(%rdi),$s2
-	mov	12(%rdi),$s3
-
-	shl	\$4,$rnds
-	lea	($key,$rnds),%rbp
-	mov	$key,(%rsp)	# key schedule
-	mov	%rbp,8(%rsp)	# end of key schedule
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	lea	.LAES_Te+2048(%rip),$sbox
-	lea	768(%rsp),%rbp
-	sub	$sbox,%rbp
-	and	\$0x300,%rbp
-	lea	($sbox,%rbp),$sbox
-
-	call	_x86_64_AES_encrypt_compact
-
-	mov	16(%rsp),$out	# restore out
-	mov	24(%rsp),%rsi	# restore saved stack pointer
-.cfi_def_cfa	%rsi,8
-	mov	$s0,0($out)	# write output vector
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp
-.cfi_def_cfa_register	%rsp
-.Lenc_epilogue:
-	ret
-.cfi_endproc
-.size	AES_encrypt,.-AES_encrypt
-___
-
-#------------------------------------------------------------------#
-
-# decvert() takes no arguments and appends to $code one full table-driven
-# decryption round for the "vertical" code layout (all four words are
-# processed in interleaved fashion).
-#
-# Each output word is the xor of four 32-bit loads from the table at $sbox.
-# Entries are addressed with scale 8 and byte displacements 0, 3, 2 and 1;
-# presumably every 8-byte entry holds its 32-bit value twice so that the
-# displaced loads yield byte-rotated variants -- the table itself is defined
-# outside this section, TODO confirm.  The index bytes are
-#
-#   $t0: byte0($s0) @+0, byte1($s3) @+3, byte2($s2) @+2, byte3($s1) @+1
-#   $t1: byte0($s1) @+0, byte1($s0) @+3, byte2($s3) @+2, byte3($s2) @+1
-#   $t2: byte0($s2) @+0, byte1($s1) @+3, byte2($s0) @+2, byte3($s3) @+1
-#   $t3: byte0($s3) @+0, byte1($s2) @+3, byte2($s1) @+2, byte3($s0) @+1
-#
-# $key is advanced by 16 in the middle of the sequence and the four words
-# at the new $key are xor-ed in, leaving the round output in $s0..$s3.
-# The fourth temporary lives in %r8d, which per the note below destroys
-# $inp.
-sub decvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	# favor 3-way issue Opteron pipeline...
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	mov	0($sbox,$acc0,8),$t0
-	mov	0($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	movzb	`&lo("$s3")`,$acc2
-	xor	3($sbox,$acc0,8),$t0
-	xor	3($sbox,$acc1,8),$t1
-	mov	0($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s1")`,$acc0
-	shr	\$16,$s0
-	movzb	`&hi("$s2")`,$acc2
-	xor	3($sbox,$acc0,8),$t2
-	shr	\$16,$s3
-	xor	3($sbox,$acc2,8),$t3
-
-	shr	\$16,$s1
-	lea	16($key),$key
-	shr	\$16,$s2
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	xor	2($sbox,$acc0,8),$t0
-	xor	2($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t2
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	movzb	`&lo("$s1")`,$acc2
-	xor	1($sbox,$acc0,8),$t0
-	xor	1($sbox,$acc1,8),$t1
-	xor	2($sbox,$acc2,8),$t3
-
-	movzb	`&hi("$s3")`,$acc0
-	mov	12($key),$s3
-	movzb	`&hi("$s0")`,$acc2
-	xor	1($sbox,$acc0,8),$t2
-	mov	0($key),$s0
-	xor	1($sbox,$acc2,8),$t3
-
-	xor	$t0,$s0
-	mov	4($key),$s1
-	mov	8($key),$s2
-	xor	$t2,$s2
-	xor	$t1,$s1
-	xor	$t3,$s3
-___
-}
-
-# declastvert() takes no arguments and appends to $code the final
-# decryption round for the "vertical" code layout.
-#
-# $sbox is temporarily advanced by 2048 so that single bytes can be read
-# with scale-1 `movzb` loads from the byte table located there, and is
-# moved back by 2048 before the sequence ends.  Bytes are selected in the
-# same pattern as in decvert() (byte0 of word i, byte1 of word i-1, byte2
-# of word i-2, byte3 of word i-3, indices mod 4), substituted, shifted to
-# bit positions 0/8/16/24 and xor-ed together.  The result is xor-ed with
-# the four round-key words at 16($key)..28($key); $key itself is not
-# advanced here.  Output is left in $s0..$s3.
-# The fourth temporary lives in %r8d, which per the note below destroys
-# $inp.
-sub declastvert()
-{ my $t3="%r8d";	# zaps $inp!
-
-$code.=<<___;
-	lea	2048($sbox),$sbox	# size optimization
-	movzb	`&lo("$s0")`,$acc0
-	movzb	`&lo("$s1")`,$acc1
-	movzb	`&lo("$s2")`,$acc2
-	movzb	($sbox,$acc0,1),$t0
-	movzb	($sbox,$acc1,1),$t1
-	movzb	($sbox,$acc2,1),$t2
-
-	movzb	`&lo("$s3")`,$acc0
-	movzb	`&hi("$s3")`,$acc1
-	movzb	`&hi("$s0")`,$acc2
-	movzb	($sbox,$acc0,1),$t3
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	movzb	($sbox,$acc2,1),$acc2	#$t1
-
-	shl	\$8,$acc1
-	shl	\$8,$acc2
-
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-	shr	\$16,$s3
-
-	movzb	`&hi("$s1")`,$acc0
-	movzb	`&hi("$s2")`,$acc1
-	shr	\$16,$s0
-	movzb	($sbox,$acc0,1),$acc0	#$t2
-	movzb	($sbox,$acc1,1),$acc1	#$t3
-
-	shl	\$8,$acc0
-	shl	\$8,$acc1
-	shr	\$16,$s1
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-	shr	\$16,$s2
-
-	movzb	`&lo("$s2")`,$acc0
-	movzb	`&lo("$s3")`,$acc1
-	movzb	`&lo("$s0")`,$acc2
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$acc1	#$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-
-	shl	\$16,$acc0
-	shl	\$16,$acc1
-	shl	\$16,$acc2
-
-	xor	$acc0,$t0
-	xor	$acc1,$t1
-	xor	$acc2,$t2
-
-	movzb	`&lo("$s1")`,$acc0
-	movzb	`&hi("$s1")`,$acc1
-	movzb	`&hi("$s2")`,$acc2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	movzb	($sbox,$acc2,1),$acc2	#$t1
-
-	shl	\$16,$acc0
-	shl	\$24,$acc1
-	shl	\$24,$acc2
-
-	xor	$acc0,$t3
-	xor	$acc1,$t0
-	xor	$acc2,$t1
-
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	mov	16+12($key),$s3
-	movzb	($sbox,$acc0,1),$acc0	#$t2
-	movzb	($sbox,$acc1,1),$acc1	#$t3
-	mov	16+0($key),$s0
-
-	shl	\$24,$acc0
-	shl	\$24,$acc1
-
-	xor	$acc0,$t2
-	xor	$acc1,$t3
-
-	mov	16+4($key),$s1
-	mov	16+8($key),$s2
-	lea	-2048($sbox),$sbox
-	xor	$t0,$s0
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# decstep($i,@s): append to $code the computation of output word $i
-# (0..3) of one table-driven decryption round, for the code layout used
-# when $verticalspin is false.
-#
-# @s lists the four state registers in the order their bytes are consumed:
-# byte0 of $s[0], byte1 of $s[1], byte2 of $s[2] and byte3 of $s[3].  The
-# emitted code computes
-#
-#   out = T[byte0] ^ T+3[byte1] ^ T+2[byte2] ^ T+1[byte3]
-#
-# where T is the scale-8 table at $sbox and "+n" is a byte displacement.
-#
-# For $i = 0..2 the result goes to $t0..$t2 and the sources are first
-# copied to scratch registers so the state stays intact.  For $i == 3 no
-# later step needs the old state, so the result is built in $s[0] itself,
-# $s[1]..$s[3] double as the scratch registers, and afterwards $t2, $t1
-# and $t0 are moved into $s[1], $s[2] and $s[3] to complete the new state.
-sub decstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	$code.="	mov	$s[0],$out\n"		if ($i!=3);
-			$tmp1=$s[2]			if ($i==3);
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	and	\$0xFF,$out\n";
-
-	$code.="	mov	0($sbox,$out,8),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-			$tmp2=$s[3]			if ($i==3);
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-			$tmp0=$s[1]			if ($i==3);
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	and	\$0xFF,$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	xor	3($sbox,$tmp0,8),$out\n";
-	$code.="	xor	2($sbox,$tmp1,8),$out\n";
-	$code.="	xor	1($sbox,$tmp2,8),$out\n";
-
-	# last step only: hand the three parked results back to the state
-	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# declast($i,@s): append to $code the computation of output word $i
-# (0..3) of the final decryption round, for the code layout used when
-# $verticalspin is false.
-#
-# The register roles and the $i == 3 special case are the same as in
-# decstep().  The difference is the table: single bytes are read with
-# scale-1 `movzb` loads from the byte table at 2048($sbox), then shifted
-# left by 8, 16 and 24 and xor-ed into the result:
-#
-#   out = B[byte0 of $s[0]] ^ B[byte1 of $s[1]]<<8
-#       ^ B[byte2 of $s[2]]<<16 ^ B[byte3 of $s[3]]<<24
-sub declast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-	$code.="	mov	$s[0],$out\n"		if ($i!=3);
-			$tmp1=$s[2]			if ($i==3);
-	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
-	$code.="	and	\$0xFF,$out\n";
-
-	$code.="	movzb	2048($sbox,$out,1),$out\n";
-	$code.="	shr	\$16,$tmp1\n";
-			$tmp2=$s[3]			if ($i==3);
-	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
-
-			$tmp0=$s[1]			if ($i==3);
-	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
-	$code.="	and	\$0xFF,$tmp1\n";
-	$code.="	shr	\$24,$tmp2\n";
-
-	$code.="	movzb	2048($sbox,$tmp0,1),$tmp0\n";
-	$code.="	movzb	2048($sbox,$tmp1,1),$tmp1\n";
-	$code.="	movzb	2048($sbox,$tmp2,1),$tmp2\n";
-
-	$code.="	shl	\$8,$tmp0\n";
-	$code.="	shl	\$16,$tmp1\n";
-	$code.="	shl	\$24,$tmp2\n";
-
-	# for $i == 3 the moves that rebuild the state are interleaved with
-	# the xors that finish $out
-	$code.="	xor	$tmp0,$out\n";
-	$code.="	mov	$t2,$s[1]\n"		if ($i==3);
-	$code.="	xor	$tmp1,$out\n";
-	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
-	$code.="	xor	$tmp2,$out\n";
-	$code.="	mov	$t0,$s[3]\n"		if ($i==3);
-	$code.="\n";
-}
-
-# Emit the internal table-driven routine _x86_64_AES_decrypt.
-#
-# Register inputs: state in $s0..$s3, key schedule pointer in $key, table
-# base in $sbox; the result is returned in $s0..$s3.  The emitted code
-# xors in the first round key, loads key->rounds from offset 240 and
-# runs rounds-1 iterations of the main round followed by one final round.
-#
-# $verticalspin (set outside this section) selects the code layout:
-#  - true:  decvert()/declastvert(), which advance $key and apply the round
-#           key themselves;
-#  - false: four decstep()/declast() calls per round, with the argument
-#           order rotating the state registers, followed by explicit
-#           key-advance and xor instructions emitted here.
-$code.=<<___;
-.type	_x86_64_AES_decrypt,\@abi-omnipotent
-.align	16
-_x86_64_AES_decrypt:
-	xor	0($key),$s0			# xor with key
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-
-	mov	240($key),$rnds			# load key->rounds
-	sub	\$1,$rnds
-	jmp	.Ldec_loop
-.align	16
-.Ldec_loop:
-___
-	if ($verticalspin) { &decvert(); }
-	else {	&decstep(0,$s0,$s3,$s2,$s1);
-		&decstep(1,$s1,$s0,$s3,$s2);
-		&decstep(2,$s2,$s1,$s0,$s3);
-		&decstep(3,$s3,$s2,$s1,$s0);
-		$code.=<<___;
-		lea	16($key),$key
-		xor	0($key),$s0			# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-___
-	}
-$code.=<<___;
-	sub	\$1,$rnds
-	jnz	.Ldec_loop
-___
-	if ($verticalspin) { &declastvert(); }
-	else {	&declast(0,$s0,$s3,$s2,$s1);
-		&declast(1,$s1,$s0,$s3,$s2);
-		&declast(2,$s2,$s1,$s0,$s3);
-		&declast(3,$s3,$s2,$s1,$s0);
-		$code.=<<___;
-		xor	16+0($key),$s0			# xor with key
-		xor	16+4($key),$s1
-		xor	16+8($key),$s2
-		xor	16+12($key),$s3
-___
-	}
-$code.=<<___;
-	.byte	0xf3,0xc3			# rep ret
-.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
-___
-
-# deccompactvert() takes no arguments and appends one straight-line
-# instruction sequence to $code.  It is the decryption counterpart of
-# enccompactvert(): every byte of $s0..$s3 is substituted through the byte
-# table addressed by $sbox (scale-1 `movzb` loads) and the results are
-# reassembled into new state words with the opposite byte rotation:
-#
-#   new $s0 = byte0($s0) | byte1($s3) | byte2($s2) | byte3($s1)
-#   new $s1 = byte0($s1) | byte1($s0) | byte2($s3) | byte3($s2)
-#   new $s2 = byte0($s2) | byte1($s1) | byte2($s0) | byte3($s3)
-#   new $s3 = byte0($s3) | byte1($s2) | byte2($s1) | byte3($s0)
-#
-# (each byteN() above is taken after the table lookup).  Besides $t0..$t2
-# and $acc0..$acc2 the sequence uses %r8d, %r9d and %r13d as scratch.
-sub deccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-	movzb	`&lo("$s0")`,$t0
-	movzb	`&lo("$s1")`,$t1
-	movzb	`&lo("$s2")`,$t2
-	movzb	`&lo("$s3")`,$t3
-	movzb	`&hi("$s3")`,$acc0
-	movzb	`&hi("$s0")`,$acc1
-	shr	\$16,$s3
-	movzb	`&hi("$s1")`,$acc2
-	movzb	($sbox,$t0,1),$t0
-	movzb	($sbox,$t1,1),$t1
-	movzb	($sbox,$t2,1),$t2
-	movzb	($sbox,$t3,1),$t3
-
-	movzb	($sbox,$acc0,1),$t4	#$t0
-	movzb	`&hi("$s2")`,$acc0
-	movzb	($sbox,$acc1,1),$t5	#$t1
-	movzb	($sbox,$acc2,1),$acc2	#$t2
-	movzb	($sbox,$acc0,1),$acc0	#$t3
-
-	shr	\$16,$s2
-	shl	\$8,$t5
-	shl	\$8,$t4
-	movzb	`&lo("$s2")`,$acc1
-	shr	\$16,$s0
-	xor	$t4,$t0
-	shr	\$16,$s1
-	movzb	`&lo("$s3")`,$t4
-
-	shl	\$8,$acc2
-	xor	$t5,$t1
-	shl	\$8,$acc0
-	movzb	`&lo("$s0")`,$t5
-	movzb	($sbox,$acc1,1),$acc1	#$t0
-	xor	$acc2,$t2
-	movzb	`&lo("$s1")`,$acc2
-
-	shl	\$16,$acc1
-	xor	$acc0,$t3
-	movzb	($sbox,$t4,1),$t4	#$t1
-	movzb	`&hi("$s1")`,$acc0
-	movzb	($sbox,$acc2,1),$acc2	#$t3
-	xor	$acc1,$t0
-	movzb	($sbox,$t5,1),$t5	#$t2
-	movzb	`&hi("$s2")`,$acc1
-
-	shl	\$16,$acc2
-	shl	\$16,$t4
-	shl	\$16,$t5
-	xor	$acc2,$t3
-	movzb	`&hi("$s3")`,$acc2
-	xor	$t4,$t1
-	shr	\$8,$s0
-	xor	$t5,$t2
-
-	movzb	($sbox,$acc0,1),$acc0	#$t0
-	movzb	($sbox,$acc1,1),$s1	#$t1
-	movzb	($sbox,$acc2,1),$s2	#$t2
-	movzb	($sbox,$s0,1),$s3	#$t3
-
-	mov	$t0,$s0
-	shl	\$24,$acc0
-	shl	\$24,$s1
-	shl	\$24,$s2
-	xor	$acc0,$s0
-	shl	\$24,$s3
-	xor	$t1,$s1
-	xor	$t2,$s2
-	xor	$t3,$s3
-___
-}
-
-# parallelized version! input is pair of 64-bit values: %rax=s1.s0
-# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
-# %ecx=s2 and %edx=s3.
-#
-# dectransform([$prefetch]): append to $code the inverse column-mixing
-# step, working on two 32-bit words per 64-bit register.
-#
-# Three successive byte-wise doublings (masks $mask80, $maskfe, $mask1b,
-# which must already hold the replicated 0x80/0xfe/0x1b constants) produce
-# tp2, tp4 and tp8 from the input tp1.  The halves are then split with
-# `shr 32` and each 32-bit result is formed as
-#
-#   rol(tp1^tp8, 8) ^ (tp8^tp4^tp2) ^ rol(tp2^tp1^tp8, 24)
-#                   ^ rol(tp4^tp1^tp8, 16)
-#
-# Note that $acc0 is re-declared locally as %rbx and therefore does not
-# refer to the file-level $acc0 inside this sub.
-#
-# When $prefetch is true, five extra loads from 0/64/128/192/256($sbox) are
-# woven into the tail.  Their destinations include $mask80, $maskfe and
-# $mask1b, so the masks are destroyed; the caller that passes 1 reloads
-# them before every call, while the caller that passes nothing loads them
-# once before its loop.
-sub dectransform()
-{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
-  my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
-  my $prefetch = shift;
-
-$code.=<<___;
-	mov	$mask80,$tp40
-	mov	$mask80,$tp48
-	and	$tp10,$tp40
-	and	$tp18,$tp48
-	mov	$tp40,$acc0
-	mov	$tp48,$acc8
-	shr	\$7,$tp40
-	lea	($tp10,$tp10),$tp20
-	shr	\$7,$tp48
-	lea	($tp18,$tp18),$tp28
-	sub	$tp40,$acc0
-	sub	$tp48,$acc8
-	and	$maskfe,$tp20
-	and	$maskfe,$tp28
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp20
-	xor	$acc8,$tp28
-	mov	$mask80,$tp80
-	mov	$mask80,$tp88
-
-	and	$tp20,$tp80
-	and	$tp28,$tp88
-	mov	$tp80,$acc0
-	mov	$tp88,$acc8
-	shr	\$7,$tp80
-	lea	($tp20,$tp20),$tp40
-	shr	\$7,$tp88
-	lea	($tp28,$tp28),$tp48
-	sub	$tp80,$acc0
-	sub	$tp88,$acc8
-	and	$maskfe,$tp40
-	and	$maskfe,$tp48
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp40
-	xor	$acc8,$tp48
-	mov	$mask80,$tp80
-	mov	$mask80,$tp88
-
-	and	$tp40,$tp80
-	and	$tp48,$tp88
-	mov	$tp80,$acc0
-	mov	$tp88,$acc8
-	shr	\$7,$tp80
-	 xor	$tp10,$tp20		# tp2^=tp1
-	shr	\$7,$tp88
-	 xor	$tp18,$tp28		# tp2^=tp1
-	sub	$tp80,$acc0
-	sub	$tp88,$acc8
-	lea	($tp40,$tp40),$tp80
-	lea	($tp48,$tp48),$tp88
-	 xor	$tp10,$tp40		# tp4^=tp1
-	 xor	$tp18,$tp48		# tp4^=tp1
-	and	$maskfe,$tp80
-	and	$maskfe,$tp88
-	and	$mask1b,$acc0
-	and	$mask1b,$acc8
-	xor	$acc0,$tp80
-	xor	$acc8,$tp88
-
-	xor	$tp80,$tp10		# tp1^=tp8
-	xor	$tp88,$tp18		# tp1^=tp8
-	xor	$tp80,$tp20		# tp2^tp1^=tp8
-	xor	$tp88,$tp28		# tp2^tp1^=tp8
-	mov	$tp10,$acc0
-	mov	$tp18,$acc8
-	xor	$tp80,$tp40		# tp4^tp1^=tp8
-	shr	\$32,$acc0
-	xor	$tp88,$tp48		# tp4^tp1^=tp8
-	shr	\$32,$acc8
-	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
-	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
-	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
-	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
-	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
-	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-
-	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
-	xor	`&LO("$tp80")`,`&LO("$tp10")`
-	shr	\$32,$tp80
-	xor	`&LO("$tp88")`,`&LO("$tp18")`
-	shr	\$32,$tp88
-	xor	`&LO("$tp80")`,`&LO("$acc0")`
-	xor	`&LO("$tp88")`,`&LO("$acc8")`
-
-	mov	$tp20,$tp80
-	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp28,$tp88
-	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
-	shr	\$32,$tp80
-	xor	`&LO("$tp20")`,`&LO("$tp10")`
-	shr	\$32,$tp88
-	xor	`&LO("$tp28")`,`&LO("$tp18")`
-	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp40,$tp20
-	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
-	mov	$tp48,$tp28
-	shr	\$32,$tp20
-	xor	`&LO("$tp80")`,`&LO("$acc0")`
-	shr	\$32,$tp28
-	xor	`&LO("$tp88")`,`&LO("$acc8")`
-
-	`"mov	0($sbox),$mask80"	if ($prefetch)`
-	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	64($sbox),$maskfe"	if ($prefetch)`
-	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	128($sbox),$mask1b"	if ($prefetch)`
-	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
-	`"mov	192($sbox),$tp80"	if ($prefetch)`
-	xor	`&LO("$tp40")`,`&LO("$tp10")`
-	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
-	xor	`&LO("$tp48")`,`&LO("$tp18")`
-	`"mov	256($sbox),$tp88"	if ($prefetch)`
-	xor	`&LO("$tp20")`,`&LO("$acc0")`
-	xor	`&LO("$tp28")`,`&LO("$acc8")`
-___
-}
-
-# Emit the internal routine _x86_64_AES_decrypt_compact.
-#
-# Register inputs: state in $s0..$s3, current round-key pointer in $key,
-# byte table in $sbox; the result is returned in $s0..$s3.  After touching
-# the 256-byte table at a 32-byte stride (loaded values are discarded) the
-# routine loops:
-#
-#   state ^= 16 bytes at $key;  $key += 16;
-#   deccompactvert();                 # byte substitution + byte shuffle
-#   if ($key == 16(%rsp)) leave the loop;
-#   reload the three masks from 256+0/8/16($sbox);
-#   pack the state as %rax = s1.s0 and %rcx = s3.s2;
-#   dectransform(1);                  # inverse column mixing + prefetch
-#
-# and finally xors in the last 16 bytes of round key at $key.
-$code.=<<___;
-.type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
-.align	16
-_x86_64_AES_decrypt_compact:
-.cfi_startproc
-	lea	128($sbox),$inp			# size optimization
-	mov	0-128($inp),$acc1		# prefetch Td4
-	mov	32-128($inp),$acc2
-	mov	64-128($inp),$t0
-	mov	96-128($inp),$t1
-	mov	128-128($inp),$acc1
-	mov	160-128($inp),$acc2
-	mov	192-128($inp),$t0
-	mov	224-128($inp),$t1
-	jmp	.Ldec_loop_compact
-
-.align	16
-.Ldec_loop_compact:
-		xor	0($key),$s0		# xor with key
-		xor	4($key),$s1
-		xor	8($key),$s2
-		xor	12($key),$s3
-		lea	16($key),$key
-___
-		&deccompactvert();
-		# 16(%rsp) below is the caller's 8(%rsp) -- where AES_decrypt
-		# stores the end of the key schedule -- seen past the 8-byte
-		# return address.  The masks are reloaded on every iteration
-		# because dectransform(1) overwrites them with prefetch loads.
-$code.=<<___;
-		cmp	16(%rsp),$key
-		je	.Ldec_compact_done
-
-		mov	256+0($sbox),$mask80
-		shl	\$32,%rbx
-		shl	\$32,%rdx
-		mov	256+8($sbox),$maskfe
-		or	%rbx,%rax
-		or	%rdx,%rcx
-		mov	256+16($sbox),$mask1b
-___
-		&dectransform(1);
-$code.=<<___;
-	jmp	.Ldec_loop_compact
-.align	16
-.Ldec_compact_done:
-	xor	0($key),$s0
-	xor	4($key),$s1
-	xor	8($key),$s2
-	xor	12($key),$s3
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
-___
-
-# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
-#
-# Emit the public single-block entry point (also exported under the hidden
-# alias asm_AES_decrypt).  Arguments arrive as %rdi=inp, %rsi=out, %rdx=key.
-#
-# The prologue, the 32-byte frame (0: key schedule, 8: end of key schedule,
-# 16: out, 24: entry %rsp) and the epilogue mirror AES_encrypt.  The table
-# selection differs: starting from .LAES_Td+2048 an offset
-# x = (%rsp+768-$sbox) & 0x300 is added and then x>>3 once more, so the
-# four candidate copies sit 0x120 (256+32) bytes apart rather than 256.
-# The extra 32 bytes presumably hold the mask constants that
-# _x86_64_AES_decrypt_compact reads at 256+0/8/16($sbox) -- the table
-# layout is outside this section, TODO confirm.
-$code.=<<___;
-.globl	AES_decrypt
-.type	AES_decrypt,\@function,3
-.align	16
-.globl	asm_AES_decrypt
-.hidden	asm_AES_decrypt
-asm_AES_decrypt:
-AES_decrypt:
-.cfi_startproc
-	mov	%rsp,%rax
-.cfi_def_cfa_register	%rax
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-
-	# allocate frame "above" key schedule
-	lea	-63(%rdx),%rcx	# %rdx is key argument
-	and	\$-64,%rsp
-	sub	%rsp,%rcx
-	neg	%rcx
-	and	\$0x3c0,%rcx
-	sub	%rcx,%rsp
-	sub	\$32,%rsp
-
-	mov	%rsi,16(%rsp)	# save out
-	mov	%rax,24(%rsp)	# save original stack pointer
-.cfi_cfa_expression	%rsp+24,deref,+8
-.Ldec_prologue:
-
-	mov	%rdx,$key
-	mov	240($key),$rnds	# load rounds
-
-	mov	0(%rdi),$s0	# load input vector
-	mov	4(%rdi),$s1
-	mov	8(%rdi),$s2
-	mov	12(%rdi),$s3
-
-	shl	\$4,$rnds
-	lea	($key,$rnds),%rbp
-	mov	$key,(%rsp)	# key schedule
-	mov	%rbp,8(%rsp)	# end of key schedule
-
-	# pick Td4 copy which can't "overlap" with stack frame or key schedule
-	lea	.LAES_Td+2048(%rip),$sbox
-	lea	768(%rsp),%rbp
-	sub	$sbox,%rbp
-	and	\$0x300,%rbp
-	lea	($sbox,%rbp),$sbox
-	shr	\$3,%rbp	# recall "magic" constants!
-	add	%rbp,$sbox
-
-	call	_x86_64_AES_decrypt_compact
-
-	mov	16(%rsp),$out	# restore out
-	mov	24(%rsp),%rsi	# restore saved stack pointer
-.cfi_def_cfa	%rsi,8
-	mov	$s0,0($out)	# write output vector
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	mov	-48(%rsi),%r15
-.cfi_restore	%r15
-	mov	-40(%rsi),%r14
-.cfi_restore	%r14
-	mov	-32(%rsi),%r13
-.cfi_restore	%r13
-	mov	-24(%rsi),%r12
-.cfi_restore	%r12
-	mov	-16(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	-8(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	(%rsi),%rsp
-.cfi_def_cfa_register	%rsp
-.Ldec_epilogue:
-	ret
-.cfi_endproc
-.size	AES_decrypt,.-AES_decrypt
-___
-#------------------------------------------------------------------#
-
-# enckey() takes no arguments and appends to $code the key-expansion core
-# shared by the three key-size loops of _x86_64_AES_set_encrypt_key.
-#
-# Register contract of the emitted code:
-#   %edx  in:  the last word of the previous key block (destroyed)
-#   %eax  in:  the word to be updated; out: the updated word
-#   %rbp  table base biased by +128 (byte table at -128(%rbp), round
-#         constants as dwords at 1024-128(%rbp))
-#   %rcx  index of the round constant to apply
-#   %ebx, %esi  scratch
-#
-# Each byte of %edx is substituted through the byte table and xor-ed into
-# %eax at a rotated position: byte0 -> bits 24..31, byte1 -> bits 0..7,
-# byte2 -> bits 8..15, byte3 -> bits 16..23.  Finally the round constant
-# selected by %rcx is xor-ed in.
-sub enckey()
-{
-$code.=<<___;
-	movz	%dl,%esi		# rk[i]>>0
-	movzb	-128(%rbp,%rsi),%ebx
-	movz	%dh,%esi		# rk[i]>>8
-	shl	\$24,%ebx
-	xor	%ebx,%eax
-
-	movzb	-128(%rbp,%rsi),%ebx
-	shr	\$16,%edx
-	movz	%dl,%esi		# rk[i]>>16
-	xor	%ebx,%eax
-
-	movzb	-128(%rbp,%rsi),%ebx
-	movz	%dh,%esi		# rk[i]>>24
-	shl	\$8,%ebx
-	xor	%ebx,%eax
-
-	movzb	-128(%rbp,%rsi),%ebx
-	shl	\$16,%ebx
-	xor	%ebx,%eax
-
-	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
-___
-}
-
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-#
-# Emit the public wrapper AES_set_encrypt_key and the internal worker
-# _x86_64_AES_set_encrypt_key.
-#
-# Wrapper: pushes %rbx, %rbp, %r12-%r15, reserves 8 more bytes, calls the
-# worker, reloads only %rbp and %rbx (from 40 and 48(%rsp)) and releases
-# all 56 bytes.  The inline notes explain that the %r12/%r13 pushes are
-# redundant and exist so an exception handler can be shared.
-#
-# Worker: receives the C arguments unchanged (%rdi=userKey, %esi=bits,
-# %rdx=key) and returns in %rax
-#    0  on success,
-#   -1  if userKey or key is NULL,
-#   -2  if bits is not 128, 192 or 256.
-# It copies the user key to the start of the schedule and then expands it
-# with enckey(), advancing %rdi through the schedule.  The round count
-# (10/12/14) is written relative to the advanced %rdi; in each case the
-# displacement works out to offset 240 from the start of the schedule
-# (10*16+80, 7*24+72 and 6*32+48 respectively).
-$code.=<<___;
-.globl	AES_set_encrypt_key
-.type	AES_set_encrypt_key,\@function,3
-.align	16
-AES_set_encrypt_key:
-.cfi_startproc
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12			# redundant, but allows to share
-.cfi_push	%r12
-	push	%r13			# exception handler...
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	sub	\$8,%rsp
-.cfi_adjust_cfa_offset	8
-.Lenc_key_prologue:
-
-	call	_x86_64_AES_set_encrypt_key
-
-	mov	40(%rsp),%rbp
-.cfi_restore	%rbp
-	mov	48(%rsp),%rbx
-.cfi_restore	%rbx
-	add	\$56,%rsp
-.cfi_adjust_cfa_offset	-56
-.Lenc_key_epilogue:
-	ret
-.cfi_endproc
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
-
-.type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
-.align	16
-_x86_64_AES_set_encrypt_key:
-.cfi_startproc
-	mov	%esi,%ecx			# %ecx=bits
-	mov	%rdi,%rsi			# %rsi=userKey
-	mov	%rdx,%rdi			# %rdi=key
-
-	test	\$-1,%rsi
-	jz	.Lbadpointer
-	test	\$-1,%rdi
-	jz	.Lbadpointer
-
-	lea	.LAES_Te(%rip),%rbp
-	lea	2048+128(%rbp),%rbp
-
-	# prefetch Te4
-	mov	0-128(%rbp),%eax
-	mov	32-128(%rbp),%ebx
-	mov	64-128(%rbp),%r8d
-	mov	96-128(%rbp),%edx
-	mov	128-128(%rbp),%eax
-	mov	160-128(%rbp),%ebx
-	mov	192-128(%rbp),%r8d
-	mov	224-128(%rbp),%edx
-
-	cmp	\$128,%ecx
-	je	.L10rounds
-	cmp	\$192,%ecx
-	je	.L12rounds
-	cmp	\$256,%ecx
-	je	.L14rounds
-	mov	\$-2,%rax			# invalid number of bits
-	jmp	.Lexit
-
-.L10rounds:
-	mov	0(%rsi),%rax			# copy first 4 dwords
-	mov	8(%rsi),%rdx
-	mov	%rax,0(%rdi)
-	mov	%rdx,8(%rdi)
-
-	shr	\$32,%rdx
-	xor	%ecx,%ecx
-	jmp	.L10shortcut
-.align	4
-.L10loop:
-		mov	0(%rdi),%eax			# rk[0]
-		mov	12(%rdi),%edx			# rk[3]
-.L10shortcut:
-___
-		# 128-bit keys: one enckey() per 16-byte round key, 10 iterations
-		&enckey	();
-$code.=<<___;
-		mov	%eax,16(%rdi)			# rk[4]
-		xor	4(%rdi),%eax
-		mov	%eax,20(%rdi)			# rk[5]
-		xor	8(%rdi),%eax
-		mov	%eax,24(%rdi)			# rk[6]
-		xor	12(%rdi),%eax
-		mov	%eax,28(%rdi)			# rk[7]
-		add	\$1,%ecx
-		lea	16(%rdi),%rdi
-		cmp	\$10,%ecx
-	jl	.L10loop
-
-	movl	\$10,80(%rdi)			# setup number of rounds
-	xor	%rax,%rax
-	jmp	.Lexit
-
-.L12rounds:
-	mov	0(%rsi),%rax			# copy first 6 dwords
-	mov	8(%rsi),%rbx
-	mov	16(%rsi),%rdx
-	mov	%rax,0(%rdi)
-	mov	%rbx,8(%rdi)
-	mov	%rdx,16(%rdi)
-
-	shr	\$32,%rdx
-	xor	%ecx,%ecx
-	jmp	.L12shortcut
-.align	4
-.L12loop:
-		mov	0(%rdi),%eax			# rk[0]
-		mov	20(%rdi),%edx			# rk[5]
-.L12shortcut:
-___
-		# 192-bit keys: one enckey() per 24-byte step; the loop exits
-		# early (after four words) on the pass where %ecx is 7
-		&enckey	();
-		# NOTE(review): in the chunk below, the `mov 28(%rdi),%edx` at
-		# .L14loop carries the label rk[4], but byte offset 28 is word
-		# index 7 (compare `mov %eax,28(%rdi)`, labelled rk[7], in the
-		# other loops).  Only the label is off; the load itself is the
-		# last word of the previous 32-byte block.
-$code.=<<___;
-		mov	%eax,24(%rdi)			# rk[6]
-		xor	4(%rdi),%eax
-		mov	%eax,28(%rdi)			# rk[7]
-		xor	8(%rdi),%eax
-		mov	%eax,32(%rdi)			# rk[8]
-		xor	12(%rdi),%eax
-		mov	%eax,36(%rdi)			# rk[9]
-
-		cmp	\$7,%ecx
-		je	.L12break
-		add	\$1,%ecx
-
-		xor	16(%rdi),%eax
-		mov	%eax,40(%rdi)			# rk[10]
-		xor	20(%rdi),%eax
-		mov	%eax,44(%rdi)			# rk[11]
-
-		lea	24(%rdi),%rdi
-	jmp	.L12loop
-.L12break:
-	movl	\$12,72(%rdi)		# setup number of rounds
-	xor	%rax,%rax
-	jmp	.Lexit
-
-.L14rounds:
-	mov	0(%rsi),%rax			# copy first 8 dwords
-	mov	8(%rsi),%rbx
-	mov	16(%rsi),%rcx
-	mov	24(%rsi),%rdx
-	mov	%rax,0(%rdi)
-	mov	%rbx,8(%rdi)
-	mov	%rcx,16(%rdi)
-	mov	%rdx,24(%rdi)
-
-	shr	\$32,%rdx
-	xor	%ecx,%ecx
-	jmp	.L14shortcut
-.align	4
-.L14loop:
-		mov	0(%rdi),%eax			# rk[0]
-		mov	28(%rdi),%edx			# rk[4]
-.L14shortcut:
-___
-		# 256-bit keys: enckey() produces the first four words of each
-		# 32-byte step; the second four use the inline substitution
-		# below, which has no byte rotation and no round constant
-		&enckey	();
-$code.=<<___;
-		mov	%eax,32(%rdi)			# rk[8]
-		xor	4(%rdi),%eax
-		mov	%eax,36(%rdi)			# rk[9]
-		xor	8(%rdi),%eax
-		mov	%eax,40(%rdi)			# rk[10]
-		xor	12(%rdi),%eax
-		mov	%eax,44(%rdi)			# rk[11]
-
-		cmp	\$6,%ecx
-		je	.L14break
-		add	\$1,%ecx
-
-		mov	%eax,%edx
-		mov	16(%rdi),%eax			# rk[4]
-		movz	%dl,%esi			# rk[11]>>0
-		movzb	-128(%rbp,%rsi),%ebx
-		movz	%dh,%esi			# rk[11]>>8
-		xor	%ebx,%eax
-
-		movzb	-128(%rbp,%rsi),%ebx
-		shr	\$16,%edx
-		shl	\$8,%ebx
-		movz	%dl,%esi			# rk[11]>>16
-		xor	%ebx,%eax
-
-		movzb	-128(%rbp,%rsi),%ebx
-		movz	%dh,%esi			# rk[11]>>24
-		shl	\$16,%ebx
-		xor	%ebx,%eax
-
-		movzb	-128(%rbp,%rsi),%ebx
-		shl	\$24,%ebx
-		xor	%ebx,%eax
-
-		mov	%eax,48(%rdi)			# rk[12]
-		xor	20(%rdi),%eax
-		mov	%eax,52(%rdi)			# rk[13]
-		xor	24(%rdi),%eax
-		mov	%eax,56(%rdi)			# rk[14]
-		xor	28(%rdi),%eax
-		mov	%eax,60(%rdi)			# rk[15]
-
-		lea	32(%rdi),%rdi
-	jmp	.L14loop
-.L14break:
-	movl	\$14,48(%rdi)		# setup number of rounds
-	xor	%rax,%rax
-	jmp	.Lexit
-
-.Lbadpointer:
-	mov	\$-1,%rax
-.Lexit:
-	.byte	0xf3,0xc3			# rep ret
-.cfi_endproc
-.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
-___
-
-# deckey_ref($i,$ptr,$te,$td): append to $code a sequence that applies the
-# inverse column-mixing transform to the single 32-bit word at $i($ptr),
-# in place.
-#
-# The word is loaded as tp1; three byte-wise doublings (masks 0x80808080,
-# 0xfefefefe, 0x1b1b1b1b) give tp2, tp4 and tp8, and the stored result is
-#
-#   rol(tp1^tp8, 8) ^ (tp8^tp4^tp2) ^ rol(tp2^tp1^tp8, 24)
-#                   ^ rol(tp4^tp1^tp8, 16)
-#
-# Registers used: %eax, %ebx, %edi, %edx and %r8d.  The $te and $td
-# parameters are accepted but never referenced in the body, and no call to
-# this sub is visible in this section; the name suggests it is the
-# one-word-at-a-time reference form of dectransform().
-sub deckey_ref()
-{ my ($i,$ptr,$te,$td) = @_;
-  my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
-$code.=<<___;
-	mov	$i($ptr),$tp1
-	mov	$tp1,$acc
-	and	\$0x80808080,$acc
-	mov	$acc,$tp4
-	shr	\$7,$tp4
-	lea	0($tp1,$tp1),$tp2
-	sub	$tp4,$acc
-	and	\$0xfefefefe,$tp2
-	and	\$0x1b1b1b1b,$acc
-	xor	$tp2,$acc
-	mov	$acc,$tp2
-
-	and	\$0x80808080,$acc
-	mov	$acc,$tp8
-	shr	\$7,$tp8
-	lea	0($tp2,$tp2),$tp4
-	sub	$tp8,$acc
-	and	\$0xfefefefe,$tp4
-	and	\$0x1b1b1b1b,$acc
-	 xor	$tp1,$tp2		# tp2^tp1
-	xor	$tp4,$acc
-	mov	$acc,$tp4
-
-	and	\$0x80808080,$acc
-	mov	$acc,$tp8
-	shr	\$7,$tp8
-	sub	$tp8,$acc
-	lea	0($tp4,$tp4),$tp8
-	 xor	$tp1,$tp4		# tp4^tp1
-	and	\$0xfefefefe,$tp8
-	and	\$0x1b1b1b1b,$acc
-	xor	$acc,$tp8
-
-	xor	$tp8,$tp1		# tp1^tp8
-	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
-	xor	$tp8,$tp2		# tp2^tp1^tp8
-	xor	$tp8,$tp4		# tp4^tp1^tp8
-	xor	$tp2,$tp8
-	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
-
-	xor	$tp8,$tp1
-	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
-	xor	$tp2,$tp1
-	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
-	xor	$tp4,$tp1
-
-	mov	$tp1,$i($ptr)
-___
-}
-
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-#
-# Emit the public routine AES_set_decrypt_key.  The emitted code:
-#  1. saves %rbx, %rbp, %r12-%r15 and the key pointer (%rdx) on the stack;
-#  2. calls _x86_64_AES_set_encrypt_key with the unchanged arguments and,
-#     if it returns non-zero, jumps to the epilogue with that value still
-#     in %rax;
-#  3. .Linvert: reverses the order of the 16-byte round keys by swapping
-#     from both ends (key and key + 16*rounds) until the pointers meet;
-#  4. loads the three 64-bit masks from offsets 40/48/56 past
-#     .LAES_Te+2048+1024 (the inline note labels that address rcon);
-#  5. .Lpermute: applies dectransform() to rounds-1 round keys, starting
-#     with the second one, so the first and last keys are left as they are;
-#     no prefetch argument is passed, which keeps the masks valid across
-#     iterations;
-#  6. returns 0, reloading all six saved registers from 8..48(%rsp).
-$code.=<<___;
-.globl	AES_set_decrypt_key
-.type	AES_set_decrypt_key,\@function,3
-.align	16
-AES_set_decrypt_key:
-.cfi_startproc
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	push	%rdx			# save key schedule
-.cfi_adjust_cfa_offset	8
-.Ldec_key_prologue:
-
-	call	_x86_64_AES_set_encrypt_key
-	mov	(%rsp),%r8		# restore key schedule
-	cmp	\$0,%eax
-	jne	.Labort
-
-	mov	240(%r8),%r14d		# pull number of rounds
-	xor	%rdi,%rdi
-	lea	(%rdi,%r14d,4),%rcx
-	mov	%r8,%rsi
-	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
-.align	4
-.Linvert:
-		mov	0(%rsi),%rax
-		mov	8(%rsi),%rbx
-		mov	0(%rdi),%rcx
-		mov	8(%rdi),%rdx
-		mov	%rax,0(%rdi)
-		mov	%rbx,8(%rdi)
-		mov	%rcx,0(%rsi)
-		mov	%rdx,8(%rsi)
-		lea	16(%rsi),%rsi
-		lea	-16(%rdi),%rdi
-		cmp	%rsi,%rdi
-	jne	.Linvert
-
-	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon
-
-	mov	40(%rax),$mask80
-	mov	48(%rax),$maskfe
-	mov	56(%rax),$mask1b
-
-	mov	%r8,$key
-	sub	\$1,%r14d
-.align	4
-.Lpermute:
-		lea	16($key),$key
-		mov	0($key),%rax
-		mov	8($key),%rcx
-___
-		&dectransform ();
-$code.=<<___;
-		mov	%eax,0($key)
-		mov	%ebx,4($key)
-		mov	%ecx,8($key)
-		mov	%edx,12($key)
-		sub	\$1,%r14d
-	jnz	.Lpermute
-
-	xor	%rax,%rax
-.Labort:
-	mov	8(%rsp),%r15
-.cfi_restore	%r15
-	mov	16(%rsp),%r14
-.cfi_restore	%r14
-	mov	24(%rsp),%r13
-.cfi_restore	%r13
-	mov	32(%rsp),%r12
-.cfi_restore	%r12
-	mov	40(%rsp),%rbp
-.cfi_restore	%rbp
-	mov	48(%rsp),%rbx
-.cfi_restore	%rbx
-	add	\$56,%rsp
-.cfi_adjust_cfa_offset	-56
-.Ldec_key_epilogue:
-	ret
-.cfi_endproc
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
-___
-
-# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
-#			size_t length, const AES_KEY *key,
-#			unsigned char *ivp,const int enc);
-{
-# stack frame layout
-# -8(%rsp)		return address
-my $keyp="0(%rsp)";		# one to pass as $key
-my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
-my $_rsp="16(%rsp)";		# saved %rsp
-my $_inp="24(%rsp)";		# copy of 1st parameter, inp
-my $_out="32(%rsp)";		# copy of 2nd parameter, out
-my $_len="40(%rsp)";		# copy of 3rd parameter, length
-my $_key="48(%rsp)";		# copy of 4th parameter, key
-my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
-my $ivec="64(%rsp)";		# ivec[16]
-my $aes_key="80(%rsp)";		# copy of aes_key
-my $mark="80+240(%rsp)";	# copy of aes_key->rounds
-
-$code.=<<___;
-.globl	AES_cbc_encrypt
-.type	AES_cbc_encrypt,\@function,6
-.align	16
-.extern	OPENSSL_ia32cap_P
-.globl	asm_AES_cbc_encrypt
-.hidden	asm_AES_cbc_encrypt
-asm_AES_cbc_encrypt:
-AES_cbc_encrypt:
-.cfi_startproc
-	cmp	\$0,%rdx	# check length
-	je	.Lcbc_epilogue
-	pushfq
-# This could be .cfi_push 49, but libunwind fails on registers it does not
-# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
-.cfi_adjust_cfa_offset	8
-	push	%rbx
-.cfi_push	%rbx
-	push	%rbp
-.cfi_push	%rbp
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-.Lcbc_prologue:
-
-	cld
-	mov	%r9d,%r9d	# clear upper half of enc
-
-	lea	.LAES_Te(%rip),$sbox
-	lea	.LAES_Td(%rip),%r10
-	cmp	\$0,%r9
-	cmoveq	%r10,$sbox
-
-.cfi_remember_state
-	mov	OPENSSL_ia32cap_P(%rip),%r10d
-	cmp	\$$speed_limit,%rdx
-	jb	.Lcbc_slow_prologue
-	test	\$15,%rdx
-	jnz	.Lcbc_slow_prologue
-	bt	\$28,%r10d
-	jc	.Lcbc_slow_prologue
-
-	# allocate aligned stack frame...
-	lea	-88-248(%rsp),$key
-	and	\$-64,$key
-
-	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
-	mov	$sbox,%r10
-	lea	2304($sbox),%r11
-	mov	$key,%r12
-	and	\$0xFFF,%r10	# s = $sbox&0xfff
-	and	\$0xFFF,%r11	# e = ($sbox+2048)&0xfff
-	and	\$0xFFF,%r12	# p = %rsp&0xfff
-
-	cmp	%r11,%r12	# if (p=>e) %rsp =- (p-e);
-	jb	.Lcbc_te_break_out
-	sub	%r11,%r12
-	sub	%r12,$key
-	jmp	.Lcbc_te_ok
-.Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
-	sub	%r10,%r12
-	and	\$0xFFF,%r12
-	add	\$320,%r12
-	sub	%r12,$key
-.align	4
-.Lcbc_te_ok:
-
-	xchg	%rsp,$key
-.cfi_def_cfa_register	$key
-	#add	\$8,%rsp	# reserve for return address!
-	mov	$key,$_rsp	# save %rsp
-.cfi_cfa_expression	$_rsp,deref,+64
-.Lcbc_fast_body:
-	mov	%rdi,$_inp	# save copy of inp
-	mov	%rsi,$_out	# save copy of out
-	mov	%rdx,$_len	# save copy of len
-	mov	%rcx,$_key	# save copy of key
-	mov	%r8,$_ivp	# save copy of ivp
-	movl	\$0,$mark	# copy of aes_key->rounds = 0;
-	mov	%r8,%rbp	# rearrange input arguments
-	mov	%r9,%rbx
-	mov	%rsi,$out
-	mov	%rdi,$inp
-	mov	%rcx,$key
-
-	mov	240($key),%eax		# key->rounds
-	# do we copy key schedule to stack?
-	mov	$key,%r10
-	sub	$sbox,%r10
-	and	\$0xfff,%r10
-	cmp	\$2304,%r10
-	jb	.Lcbc_do_ecopy
-	cmp	\$4096-248,%r10
-	jb	.Lcbc_skip_ecopy
-.align	4
-.Lcbc_do_ecopy:
-		mov	$key,%rsi
-		lea	$aes_key,%rdi
-		lea	$aes_key,$key
-		mov	\$240/8,%ecx
-		.long	0x90A548F3	# rep movsq
-		mov	%eax,(%rdi)	# copy aes_key->rounds
-.Lcbc_skip_ecopy:
-	mov	$key,$keyp	# save key pointer
-
-	mov	\$18,%ecx
-.align	4
-.Lcbc_prefetch_te:
-		mov	0($sbox),%r10
-		mov	32($sbox),%r11
-		mov	64($sbox),%r12
-		mov	96($sbox),%r13
-		lea	128($sbox),$sbox
-		sub	\$1,%ecx
-	jnz	.Lcbc_prefetch_te
-	lea	-2304($sbox),$sbox
-
-	cmp	\$0,%rbx
-	je	.LFAST_DECRYPT
-
-#----------------------------- ENCRYPT -----------------------------#
-	mov	0(%rbp),$s0		# load iv
-	mov	4(%rbp),$s1
-	mov	8(%rbp),$s2
-	mov	12(%rbp),$s3
-
-.align	4
-.Lcbc_fast_enc_loop:
-		xor	0($inp),$s0
-		xor	4($inp),$s1
-		xor	8($inp),$s2
-		xor	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# if ($verticalspin) save inp
-
-		call	_x86_64_AES_encrypt
-
-		mov	$_inp,$inp	# if ($verticalspin) restore inp
-		mov	$_len,%r10
-		mov	$s0,0($out)
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-		sub	\$16,%r10
-		test	\$-16,%r10
-		mov	%r10,$_len
-	jnz	.Lcbc_fast_enc_loop
-	mov	$_ivp,%rbp	# restore ivp
-	mov	$s0,0(%rbp)	# save ivec
-	mov	$s1,4(%rbp)
-	mov	$s2,8(%rbp)
-	mov	$s3,12(%rbp)
-
-	jmp	.Lcbc_fast_cleanup
-
-#----------------------------- DECRYPT -----------------------------#
-.align	16
-.LFAST_DECRYPT:
-	cmp	$inp,$out
-	je	.Lcbc_fast_dec_in_place
-
-	mov	%rbp,$ivec
-.align	4
-.Lcbc_fast_dec_loop:
-		mov	0($inp),$s0	# read input
-		mov	4($inp),$s1
-		mov	8($inp),$s2
-		mov	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# if ($verticalspin) save inp
-
-		call	_x86_64_AES_decrypt
-
-		mov	$ivec,%rbp	# load ivp
-		mov	$_inp,$inp	# if ($verticalspin) restore inp
-		mov	$_len,%r10	# load len
-		xor	0(%rbp),$s0	# xor iv
-		xor	4(%rbp),$s1
-		xor	8(%rbp),$s2
-		xor	12(%rbp),$s3
-		mov	$inp,%rbp	# current input, next iv
-
-		sub	\$16,%r10
-		mov	%r10,$_len	# update len
-		mov	%rbp,$ivec	# update ivp
-
-		mov	$s0,0($out)	# write output
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-	jnz	.Lcbc_fast_dec_loop
-	mov	$_ivp,%r12		# load user ivp
-	mov	0(%rbp),%r10		# load iv
-	mov	8(%rbp),%r11
-	mov	%r10,0(%r12)		# copy back to user
-	mov	%r11,8(%r12)
-	jmp	.Lcbc_fast_cleanup
-
-.align	16
-.Lcbc_fast_dec_in_place:
-	mov	0(%rbp),%r10		# copy iv to stack
-	mov	8(%rbp),%r11
-	mov	%r10,0+$ivec
-	mov	%r11,8+$ivec
-.align	4
-.Lcbc_fast_dec_in_place_loop:
-		mov	0($inp),$s0	# load input
-		mov	4($inp),$s1
-		mov	8($inp),$s2
-		mov	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# if ($verticalspin) save inp
-
-		call	_x86_64_AES_decrypt
-
-		mov	$_inp,$inp	# if ($verticalspin) restore inp
-		mov	$_len,%r10
-		xor	0+$ivec,$s0
-		xor	4+$ivec,$s1
-		xor	8+$ivec,$s2
-		xor	12+$ivec,$s3
-
-		mov	0($inp),%r11	# load input
-		mov	8($inp),%r12
-		sub	\$16,%r10
-		jz	.Lcbc_fast_dec_in_place_done
-
-		mov	%r11,0+$ivec	# copy input to iv
-		mov	%r12,8+$ivec
-
-		mov	$s0,0($out)	# save output [zaps input]
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-		mov	%r10,$_len
-	jmp	.Lcbc_fast_dec_in_place_loop
-.Lcbc_fast_dec_in_place_done:
-	mov	$_ivp,%rdi
-	mov	%r11,0(%rdi)	# copy iv back to user
-	mov	%r12,8(%rdi)
-
-	mov	$s0,0($out)	# save output [zaps input]
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-.align	4
-.Lcbc_fast_cleanup:
-	cmpl	\$0,$mark	# was the key schedule copied?
-	lea	$aes_key,%rdi
-	je	.Lcbc_exit
-		mov	\$240/8,%ecx
-		xor	%rax,%rax
-		.long	0x90AB48F3	# rep stosq
-
-	jmp	.Lcbc_exit
-
-#--------------------------- SLOW ROUTINE ---------------------------#
-.align	16
-.Lcbc_slow_prologue:
-.cfi_restore_state
-	# allocate aligned stack frame...
-	lea	-88(%rsp),%rbp
-	and	\$-64,%rbp
-	# ... just "above" key schedule
-	lea	-88-63(%rcx),%r10
-	sub	%rbp,%r10
-	neg	%r10
-	and	\$0x3c0,%r10
-	sub	%r10,%rbp
-
-	xchg	%rsp,%rbp
-.cfi_def_cfa_register	%rbp
-	#add	\$8,%rsp	# reserve for return address!
-	mov	%rbp,$_rsp	# save %rsp
-.cfi_cfa_expression	$_rsp,deref,+64
-.Lcbc_slow_body:
-	#mov	%rdi,$_inp	# save copy of inp
-	#mov	%rsi,$_out	# save copy of out
-	#mov	%rdx,$_len	# save copy of len
-	#mov	%rcx,$_key	# save copy of key
-	mov	%r8,$_ivp	# save copy of ivp
-	mov	%r8,%rbp	# rearrange input arguments
-	mov	%r9,%rbx
-	mov	%rsi,$out
-	mov	%rdi,$inp
-	mov	%rcx,$key
-	mov	%rdx,%r10
-
-	mov	240($key),%eax
-	mov	$key,$keyp	# save key pointer
-	shl	\$4,%eax
-	lea	($key,%rax),%rax
-	mov	%rax,$keyend
-
-	# pick Te4 copy which can't "overlap" with stack frame or key schedule
-	lea	2048($sbox),$sbox
-	lea	768-8(%rsp),%rax
-	sub	$sbox,%rax
-	and	\$0x300,%rax
-	lea	($sbox,%rax),$sbox
-
-	cmp	\$0,%rbx
-	je	.LSLOW_DECRYPT
-
-#--------------------------- SLOW ENCRYPT ---------------------------#
-	test	\$-16,%r10		# check upon length
-	mov	0(%rbp),$s0		# load iv
-	mov	4(%rbp),$s1
-	mov	8(%rbp),$s2
-	mov	12(%rbp),$s3
-	jz	.Lcbc_slow_enc_tail	# short input...
-
-.align	4
-.Lcbc_slow_enc_loop:
-		xor	0($inp),$s0
-		xor	4($inp),$s1
-		xor	8($inp),$s2
-		xor	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# save inp
-		mov	$out,$_out	# save out
-		mov	%r10,$_len	# save len
-
-		call	_x86_64_AES_encrypt_compact
-
-		mov	$_inp,$inp	# restore inp
-		mov	$_out,$out	# restore out
-		mov	$_len,%r10	# restore len
-		mov	$s0,0($out)
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-		sub	\$16,%r10
-		test	\$-16,%r10
-	jnz	.Lcbc_slow_enc_loop
-	test	\$15,%r10
-	jnz	.Lcbc_slow_enc_tail
-	mov	$_ivp,%rbp	# restore ivp
-	mov	$s0,0(%rbp)	# save ivec
-	mov	$s1,4(%rbp)
-	mov	$s2,8(%rbp)
-	mov	$s3,12(%rbp)
-
-	jmp	.Lcbc_exit
-
-.align	4
-.Lcbc_slow_enc_tail:
-	mov	%rax,%r11
-	mov	%rcx,%r12
-	mov	%r10,%rcx
-	mov	$inp,%rsi
-	mov	$out,%rdi
-	.long	0x9066A4F3		# rep movsb
-	mov	\$16,%rcx		# zero tail
-	sub	%r10,%rcx
-	xor	%rax,%rax
-	.long	0x9066AAF3		# rep stosb
-	mov	$out,$inp		# this is not a mistake!
-	mov	\$16,%r10		# len=16
-	mov	%r11,%rax
-	mov	%r12,%rcx
-	jmp	.Lcbc_slow_enc_loop	# one more spin...
-#--------------------------- SLOW DECRYPT ---------------------------#
-.align	16
-.LSLOW_DECRYPT:
-	shr	\$3,%rax
-	add	%rax,$sbox		# recall "magic" constants!
-
-	mov	0(%rbp),%r11		# copy iv to stack
-	mov	8(%rbp),%r12
-	mov	%r11,0+$ivec
-	mov	%r12,8+$ivec
-
-.align	4
-.Lcbc_slow_dec_loop:
-		mov	0($inp),$s0	# load input
-		mov	4($inp),$s1
-		mov	8($inp),$s2
-		mov	12($inp),$s3
-		mov	$keyp,$key	# restore key
-		mov	$inp,$_inp	# save inp
-		mov	$out,$_out	# save out
-		mov	%r10,$_len	# save len
-
-		call	_x86_64_AES_decrypt_compact
-
-		mov	$_inp,$inp	# restore inp
-		mov	$_out,$out	# restore out
-		mov	$_len,%r10
-		xor	0+$ivec,$s0
-		xor	4+$ivec,$s1
-		xor	8+$ivec,$s2
-		xor	12+$ivec,$s3
-
-		mov	0($inp),%r11	# load input
-		mov	8($inp),%r12
-		sub	\$16,%r10
-		jc	.Lcbc_slow_dec_partial
-		jz	.Lcbc_slow_dec_done
-
-		mov	%r11,0+$ivec	# copy input to iv
-		mov	%r12,8+$ivec
-
-		mov	$s0,0($out)	# save output [can zap input]
-		mov	$s1,4($out)
-		mov	$s2,8($out)
-		mov	$s3,12($out)
-
-		lea	16($inp),$inp
-		lea	16($out),$out
-	jmp	.Lcbc_slow_dec_loop
-.Lcbc_slow_dec_done:
-	mov	$_ivp,%rdi
-	mov	%r11,0(%rdi)		# copy iv back to user
-	mov	%r12,8(%rdi)
-
-	mov	$s0,0($out)		# save output [can zap input]
-	mov	$s1,4($out)
-	mov	$s2,8($out)
-	mov	$s3,12($out)
-
-	jmp	.Lcbc_exit
-
-.align	4
-.Lcbc_slow_dec_partial:
-	mov	$_ivp,%rdi
-	mov	%r11,0(%rdi)		# copy iv back to user
-	mov	%r12,8(%rdi)
-
-	mov	$s0,0+$ivec		# save output to stack
-	mov	$s1,4+$ivec
-	mov	$s2,8+$ivec
-	mov	$s3,12+$ivec
-
-	mov	$out,%rdi
-	lea	$ivec,%rsi
-	lea	16(%r10),%rcx
-	.long	0x9066A4F3	# rep movsb
-	jmp	.Lcbc_exit
-
-.align	16
-.Lcbc_exit:
-	mov	$_rsp,%rsi
-.cfi_def_cfa	%rsi,64
-	mov	(%rsi),%r15
-.cfi_restore	%r15
-	mov	8(%rsi),%r14
-.cfi_restore	%r14
-	mov	16(%rsi),%r13
-.cfi_restore	%r13
-	mov	24(%rsi),%r12
-.cfi_restore	%r12
-	mov	32(%rsi),%rbp
-.cfi_restore	%rbp
-	mov	40(%rsi),%rbx
-.cfi_restore	%rbx
-	lea	48(%rsi),%rsp
-.cfi_def_cfa	%rsp,16
-.Lcbc_popfq:
-	popfq
-# This could be .cfi_pop 49, but libunwind fails on registers it does not
-# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
-.cfi_adjust_cfa_offset	-8
-.Lcbc_epilogue:
-	ret
-.cfi_endproc
-.size	AES_cbc_encrypt,.-AES_cbc_encrypt
-___
-}
-
-$code.=<<___;
-.align	64
-.LAES_Te:
-___
-	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
-	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
-	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
-	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
-	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
-	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
-	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
-	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
-	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
-	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
-	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
-	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
-	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
-	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
-	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
-	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
-	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
-	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
-	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
-	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
-	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
-	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
-	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
-	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
-	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
-	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
-	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
-	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
-	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
-	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
-	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
-	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
-	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
-	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
-	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
-	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
-	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
-	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
-	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
-	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
-	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
-	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
-	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
-	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
-	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
-	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
-	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
-	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
-	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
-	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
-	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
-	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
-	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
-	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
-	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
-	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
-	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
-	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
-	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
-	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
-	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
-	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
-	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
-	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
-
-#Te4:	# four copies of Te4 to choose from to avoid L1 aliasing
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-#rcon:	# first ten words (0x01 .. 0x36) are the round constants; the six repeated-byte words after them (0x80.., 0xfe.., 0x1b..) are presumably masks for the compact routines - TODO confirm against their users, which are outside this view
-$code.=<<___;
-	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
-	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
-	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
-	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
-___
-$code.=<<___;
-.align	64
-.LAES_Td:
-___
-	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
-	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
-	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
-	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
-	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
-	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
-	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
-	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
-	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
-	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
-	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
-	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
-	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
-	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
-	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
-	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
-	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
-	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
-	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
-	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
-	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
-	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
-	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
-	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
-	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
-	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
-	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
-	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
-	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
-	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
-	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
-	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
-	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
-	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
-	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
-	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
-	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
-	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
-	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
-	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
-	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
-	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
-	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
-	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
-	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
-	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
-	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
-	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
-	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
-	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
-	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
-	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
-	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
-	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
-	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
-	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
-	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
-	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
-	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
-	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
-	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
-	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
-	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
-	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
-
-#Td4:	# four copies of Td4 to choose from to avoid L1 aliasing
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-.asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align	64
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern	__imp_RtlVirtualUnwind
-.type	block_se_handler,\@abi-omnipotent	# Win64 SEH handler: writes the interrupted frame's Rsp, Rsi, Rdi (and, inside the body, Rbx/Rbp/R12-R15) back into the CONTEXT record
-.align	16
-block_se_handler:	# the CONTEXT pointer is in r8 and the DISPATCHER_CONTEXT pointer in r9
-	push	%rsi	# save the Win64 non-volatile registers and the flags before using them
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp	# 64 bytes of scratch; NOTE(review): presumably used and released at .Lcommon_seh_exit, which is outside this view
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]: 32-bit offset that is added to ImageBase below
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_block_prologue	# yes: continue with rax = context->Rax
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]: 32-bit offset that is added to ImageBase below
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_block_prologue	# yes: continue with rax = context->Rsp
-
-	mov	24(%rax),%rax		# pull saved real stack pointer
-
-	mov	-8(%rax),%rbx	# Rip is between the two labels: reload the six registers stored just below the saved stack pointer
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_block_prologue:	# common tail for all three cases; rax is the value reported as context->Rsp
-	mov	8(%rax),%rdi	# rdi and rsi are read from 8 and 16 bytes above rax
-	mov	16(%rax),%rsi	# NOTE(review): presumably the slots where the guarded routine spilled rdi/rsi on entry - confirm against its prologue
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	jmp	.Lcommon_seh_exit	# shared exit path, defined outside this view
-.size	block_se_handler,.-block_se_handler
-
-.type	key_se_handler,\@abi-omnipotent
-.align	16
-key_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_key_prologue
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_key_prologue
-
-	lea	56(%rax),%rax
-
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_key_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	jmp	.Lcommon_seh_exit
-.size	key_se_handler,.-key_se_handler
-
-.type	cbc_se_handler,\@abi-omnipotent
-.align	16
-cbc_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	lea	.Lcbc_prologue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
-	jb	.Lin_cbc_prologue
-
-	lea	.Lcbc_fast_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
-	jb	.Lin_cbc_frame_setup
-
-	lea	.Lcbc_slow_prologue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
-	jb	.Lin_cbc_body
-
-	lea	.Lcbc_slow_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
-	jb	.Lin_cbc_frame_setup
-
-.Lin_cbc_body:
-	mov	152($context),%rax	# pull context->Rsp
-
-	lea	.Lcbc_epilogue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
-	jae	.Lin_cbc_prologue
-
-	lea	8(%rax),%rax
-
-	lea	.Lcbc_popfq(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
-	jae	.Lin_cbc_prologue
-
-	mov	`16-8`(%rax),%rax	# biased $_rsp
-	lea	56(%rax),%rax
-
-.Lin_cbc_frame_setup:
-	mov	-16(%rax),%rbx
-	mov	-24(%rax),%rbp
-	mov	-32(%rax),%r12
-	mov	-40(%rax),%r13
-	mov	-48(%rax),%r14
-	mov	-56(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_cbc_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-.Lcommon_seh_exit:
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	pop	%rdi
-	pop	%rsi
-	ret
-.size	cbc_se_handler,.-cbc_se_handler
-
-.section	.pdata
-.align	4
-	.rva	.LSEH_begin_AES_encrypt
-	.rva	.LSEH_end_AES_encrypt
-	.rva	.LSEH_info_AES_encrypt
-
-	.rva	.LSEH_begin_AES_decrypt
-	.rva	.LSEH_end_AES_decrypt
-	.rva	.LSEH_info_AES_decrypt
-
-	.rva	.LSEH_begin_AES_set_encrypt_key
-	.rva	.LSEH_end_AES_set_encrypt_key
-	.rva	.LSEH_info_AES_set_encrypt_key
-
-	.rva	.LSEH_begin_AES_set_decrypt_key
-	.rva	.LSEH_end_AES_set_decrypt_key
-	.rva	.LSEH_info_AES_set_decrypt_key
-
-	.rva	.LSEH_begin_AES_cbc_encrypt
-	.rva	.LSEH_end_AES_cbc_encrypt
-	.rva	.LSEH_info_AES_cbc_encrypt
-
-.section	.xdata
-.align	8
-.LSEH_info_AES_encrypt:
-	.byte	9,0,0,0
-	.rva	block_se_handler
-	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
-.LSEH_info_AES_decrypt:
-	.byte	9,0,0,0
-	.rva	block_se_handler
-	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
-	.byte	9,0,0,0
-	.rva	key_se_handler
-	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
-	.byte	9,0,0,0
-	.rva	key_se_handler
-	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
-.LSEH_info_AES_cbc_encrypt:
-	.byte	9,0,0,0
-	.rva	cbc_se_handler
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
-
-close STDOUT;

+ 0 - 3239
libs/openssl/crypto/aes/asm/bsaes-x86_64.pl

@@ -1,3239 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-
-###################################################################
-### AES-128 [originally in CTR mode]				###
-### bitsliced implementation for Intel Core 2 processors	###
-### requires support of SSE extensions up to SSSE3		###
-### Author: Emilia Käsper and Peter Schwabe			###
-### Date: 2009-03-19						###
-### Public domain						###
-###								###
-### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
-### further information.					###
-###################################################################
-#
-# September 2011.
-#
-# Started as transliteration to "perlasm" the original code has
-# undergone following changes:
-#
-# - code was made position-independent;
-# - rounds were folded into a loop resulting in >5x size reduction
-#   from 12.5KB to 2.2KB;
-# - above was possible thanks to mixcolumns() modification that
-#   allowed to feed its output back to aesenc[last], this was
-#   achieved at cost of two additional inter-registers moves;
-# - some instruction reordering and interleaving;
-# - this module doesn't implement key setup subroutine, instead it
-#   relies on conversion of "conventional" key schedule as returned
-#   by AES_set_encrypt_key (see discussion below);
-# - first and last round keys are treated differently, which allowed
-#   to skip one shiftrows(), reduce bit-sliced key schedule and
-#   speed-up conversion by 22%;
-# - support for 192- and 256-bit keys was added;
-#
-# Resulting performance in CPU cycles spent to encrypt one byte out
-# of 4096-byte buffer with 128-bit key is:
-#
-#		Emilia's	this(*)		difference
-#
-# Core 2    	9.30		8.69		+7%
-# Nehalem(**) 	7.63		6.88		+11%
-# Atom	    	17.1		16.4		+4%
-# Silvermont	-		12.9
-# Goldmont	-		8.85
-#
-# (*)	Comparison is not completely fair, because "this" is ECB,
-#	i.e. no extra processing such as counter values calculation
-#	and xor-ing input as in Emilia's CTR implementation is
-#	performed. However, the CTR calculations stand for not more
-#	than 1% of total time, so comparison is *rather* fair.
-#
-# (**)	Results were collected on Westmere, which is considered to
-#	be equivalent to Nehalem for this code.
-#
-# As for key schedule conversion subroutine. Interface to OpenSSL
-# relies on per-invocation on-the-fly conversion. This naturally
-# has impact on performance, especially for short inputs. Conversion
-# time in CPU cycles and its ratio to CPU cycles spent in 8x block
-# function is:
-#
-# 		conversion	conversion/8x block
-# Core 2	240		0.22
-# Nehalem	180		0.20
-# Atom		430		0.20
-#
-# The ratio values mean that 128-byte blocks will be processed
-# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
-# etc. Then keep in mind that input sizes not divisible by 128 are
-# *effectively* slower, especially shortest ones, e.g. consecutive
-# 144-byte blocks are processed 44% slower than one would expect,
-# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
-# it's still faster than ["hyper-threading-safe" code path in]
-# aes-x86_64.pl on all lengths above 64 bytes...
-#
-# October 2011.
-#
-# Add decryption procedure. Performance in CPU cycles spent to decrypt
-# one byte out of 4096-byte buffer with 128-bit key is:
-#
-# Core 2	9.98
-# Nehalem	7.80
-# Atom		17.9
-# Silvermont	14.0
-# Goldmont	10.2
-#
-# November 2011.
-#
-# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
-# suboptimal, but XTS is meant to be used with larger blocks...
-#
-#						<appro@openssl.org>
-
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
-
-my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
-my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
-my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
-
-{
-my ($key,$rounds,$const)=("%rax","%r10d","%r11");
-
-sub Sbox {	# appends InBasisChange, Inv_GF256 and OutBasisChange code, in that order
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
-my @b=@_[0..7];	# args 0..7: the eight registers b0..b7 named above
-my @t=@_[8..11];	# args 8..11: temporaries t0..t3 handed to Inv_GF256
-my @s=@_[12..15];	# args 12..15: temporaries s0..s3 handed to Inv_GF256
-	&InBasisChange	(@b);
-	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);	# register order = InBasisChange's documented output order
-	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);	# register order = Inv_GF256's documented output, lsb first
-}
-
-sub InBasisChange {	# appends a fixed sequence of 13 register-to-register pxor to $code
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
-my @b=@_[0..7];	# only the first eight arguments are used; no temporaries needed
-$code.=<<___;
-	pxor	@b[6], @b[5]
-	pxor	@b[1], @b[2]
-	pxor	@b[0], @b[3]
-	pxor	@b[2], @b[6]
-	pxor 	@b[0], @b[5]
-
-	pxor	@b[3], @b[6]
-	pxor	@b[7], @b[3]
-	pxor	@b[5], @b[7]
-	pxor	@b[4], @b[3]
-	pxor	@b[5], @b[4]
-	pxor	@b[1], @b[3]
-
-	pxor	@b[7], @b[2]
-	pxor	@b[5], @b[1]
-___
-}
-
-sub OutBasisChange {	# appends a fixed sequence of 11 register-to-register pxor to $code
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
-my @b=@_[0..7];	# only the first eight arguments are used; no temporaries needed
-$code.=<<___;
-	pxor	@b[6], @b[0]
-	pxor	@b[4], @b[1]
-	pxor	@b[0], @b[2]
-	pxor	@b[6], @b[4]
-	pxor	@b[1], @b[6]
-
-	pxor	@b[5], @b[1]
-	pxor	@b[3], @b[5]
-	pxor	@b[7], @b[3]
-	pxor	@b[5], @b[7]
-	pxor	@b[5], @b[2]
-
-	pxor	@b[7], @b[4]
-___
-}
-
-sub InvSbox {	# appends InvInBasisChange, Inv_GF256 and InvOutBasisChange code, in that order
-# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
-my @b=@_[0..7];	# args 0..7: the eight registers b0..b7 named above
-my @t=@_[8..11];	# args 8..11: temporaries t0..t3 handed to Inv_GF256
-my @s=@_[12..15];	# args 12..15: temporaries s0..s3 handed to Inv_GF256
-	&InvInBasisChange	(@b);
-	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);	# same index permutation InvInBasisChange applies to its arguments
-	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);	# the permutation above rotated by four: Inv_GF256 documents its output with the two halves swapped
-}
-
-sub InvInBasisChange {		# OutBasisChange in reverse
-my @b=@_[5,1,2,6,3,7,0,4];	# arguments are re-indexed first, so @b[i] below is not caller's argument i
-$code.=<<___
-	pxor	@b[7], @b[4]
-
-	pxor	@b[5], @b[7]
-	pxor	@b[5], @b[2]
-	pxor	@b[7], @b[3]
-	pxor	@b[3], @b[5]
-	pxor	@b[5], @b[1]
-
-	pxor	@b[1], @b[6]
-	pxor	@b[0], @b[2]
-	pxor	@b[6], @b[4]
-	pxor	@b[6], @b[0]
-	pxor	@b[4], @b[1]
-___
-}
-
-sub InvOutBasisChange {		# InBasisChange in reverse
-my @b=@_[2,5,7,3,6,1,0,4];	# arguments are re-indexed first, so @b[i] below is not caller's argument i
-$code.=<<___;
-	pxor	@b[5], @b[1]
-	pxor	@b[7], @b[2]
-
-	pxor	@b[1], @b[3]
-	pxor	@b[5], @b[4]
-	pxor	@b[5], @b[7]
-	pxor	@b[4], @b[3]
-	 pxor 	@b[0], @b[5]
-	pxor	@b[7], @b[3]
-	 pxor	@b[2], @b[6]
-	 pxor	@b[1], @b[2]
-	pxor	@b[3], @b[6]
-
-	pxor	@b[0], @b[3]
-	pxor	@b[6], @b[5]
-___
-}
-
-sub Mul_GF4 {
-#;*************************************************************
-#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
-#;*************************************************************
-my ($x0,$x1,$y0,$y1,$t0)=@_;	# $x0,$x1 are overwritten with the result, $t0 is clobbered, $y0,$y1 are only read
-$code.=<<___;
-	movdqa	$y0, $t0
-	pxor 	$y1, $t0
-	pand	$x0, $t0
-	pxor	$x1, $x0
-	pand	$y0, $x1
-	pand	$y1, $x0
-	pxor	$x1, $x0
-	pxor	$t0, $x1
-___
-}
-
-sub Mul_GF4_N {				# not used, see next subroutine
-# multiply and scale by N
-my ($x0,$x1,$y0,$y1,$t0)=@_;	# same operand roles as Mul_GF4; only the last two pxor differ from it
-$code.=<<___;
-	movdqa	$y0, $t0
-	pxor	$y1, $t0
-	pand	$x0, $t0
-	pxor	$x1, $x0
-	pand	$y0, $x1
-	pand	$y1, $x0
-	pxor	$x0, $x1
-	pxor	$t0, $x0
-___
-}
-
-sub Mul_GF4_N_GF4 {
-# interleaved Mul_GF4_N and Mul_GF4
-my ($x0,$x1,$y0,$y1,$t0,
-    $x2,$x3,$y2,$y3,$t1)=@_;	# first five: Mul_GF4_N operands; last five: Mul_GF4 operands (the extra-indented instructions)
-$code.=<<___;
-	movdqa	$y0, $t0
-	 movdqa	$y2, $t1
-	pxor	$y1, $t0
-	 pxor 	$y3, $t1
-	pand	$x0, $t0
-	 pand	$x2, $t1
-	pxor	$x1, $x0
-	 pxor	$x3, $x2
-	pand	$y0, $x1
-	 pand	$y2, $x3
-	pand	$y1, $x0
-	 pand	$y3, $x2
-	pxor	$x0, $x1
-	 pxor	$x3, $x2
-	pxor	$t0, $x0
-	 pxor	$t1, $x3
-___
-}
-sub Mul_GF16_2 {	# processes @x[0..3] and then @x[4..7] against the same four @y registers
-my @x=@_[0..7];	# args 0..7: updated in place
-my @y=@_[8..11];	# args 8..11: @y[0],@y[1] are xor-ed with @y[2],@y[3] twice below, which restores them
-my @t=@_[12..15];	# args 12..15: scratch registers
-$code.=<<___;
-	movdqa	@x[0], @t[0]
-	movdqa	@x[1], @t[1]
-___
-	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
-$code.=<<___;
-	pxor	@x[2], @t[0]
-	pxor	@x[3], @t[1]
-	pxor	@y[2], @y[0]
-	pxor	@y[3], @y[1]
-___
-	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
-			 @x[2], @x[3], @y[2], @y[3], @t[2]);	# called without '&', unlike the other calls here; same effect
-$code.=<<___;
-	pxor	@t[0], @x[0]
-	pxor	@t[0], @x[2]
-	pxor	@t[1], @x[1]
-	pxor	@t[1], @x[3]
-
-	movdqa	@x[4], @t[0]
-	movdqa	@x[5], @t[1]
-	pxor	@x[6], @t[0]
-	pxor	@x[7], @t[1]
-___
-	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
-			 @x[6], @x[7], @y[2], @y[3], @t[2]);	# @y[0],@y[1] still hold the xor-ed values from the first half
-$code.=<<___;
-	pxor	@y[2], @y[0]
-	pxor	@y[3], @y[1]
-___
-	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);	# runs after the second xor, i.e. on the original @y[0],@y[1]
-$code.=<<___;
-	pxor	@t[0], @x[4]
-	pxor	@t[0], @x[6]
-	pxor	@t[1], @x[5]
-	pxor	@t[1], @x[7]
-___
-}
-sub Inv_GF256 {
-#;********************************************************************
-#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
-#;********************************************************************
-my @x=@_[0..7];	# args 0..7: input registers, overwritten by the final Mul_GF16_2 call
-my @t=@_[8..11];	# args 8..11: temporaries t0..t3
-my @s=@_[12..15];	# args 12..15: temporaries s0..s3
-# direct optimizations from hardware
-$code.=<<___;
-	movdqa	@x[4], @t[3]
-	movdqa	@x[5], @t[2]
-	movdqa	@x[1], @t[1]
-	movdqa	@x[7], @s[1]
-	movdqa	@x[0], @s[0]
-
-	pxor	@x[6], @t[3]
-	pxor	@x[7], @t[2]
-	pxor	@x[3], @t[1]
-	 movdqa	@t[3], @s[2]
-	pxor	@x[6], @s[1]
-	 movdqa	@t[2], @t[0]
-	pxor	@x[2], @s[0]
-	 movdqa	@t[3], @s[3]
-
-	por	@t[1], @t[2]
-	por	@s[0], @t[3]
-	pxor	@t[0], @s[3]
-	pand	@s[0], @s[2]
-	pxor	@t[1], @s[0]
-	pand	@t[1], @t[0]
-	pand	@s[0], @s[3]
-	movdqa	@x[3], @s[0]
-	pxor	@x[2], @s[0]
-	pand	@s[0], @s[1]
-	pxor	@s[1], @t[3]
-	pxor	@s[1], @t[2]
-	movdqa	@x[4], @s[1]
-	movdqa	@x[1], @s[0]
-	pxor	@x[5], @s[1]
-	pxor	@x[0], @s[0]
-	movdqa	@s[1], @t[1]
-	pand	@s[0], @s[1]
-	por	@s[0], @t[1]
-	pxor	@s[1], @t[0]
-	pxor	@s[3], @t[3]
-	pxor	@s[2], @t[2]
-	pxor	@s[3], @t[1]
-	movdqa	@x[7], @s[0]
-	pxor	@s[2], @t[0]
-	movdqa	@x[6], @s[1]
-	pxor	@s[2], @t[1]
-	movdqa	@x[5], @s[2]
-	pand	@x[3], @s[0]
-	movdqa	@x[4], @s[3]
-	pand	@x[2], @s[1]
-	pand	@x[1], @s[2]
-	por	@x[0], @s[3]
-	pxor	@s[0], @t[3]
-	pxor	@s[1], @t[2]
-	pxor	@s[2], @t[1]
-	pxor	@s[3], @t[0]
-
-	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
-
-	# new smaller inversion
-
-	movdqa	@t[3], @s[0]
-	pand	@t[1], @t[3]
-	pxor	@t[2], @s[0]
-
-	movdqa	@t[0], @s[2]
-	movdqa	@s[0], @s[3]
-	pxor	@t[3], @s[2]
-	pand	@s[2], @s[3]
-
-	movdqa	@t[1], @s[1]
-	pxor	@t[2], @s[3]
-	pxor	@t[0], @s[1]
-
-	pxor	@t[2], @t[3]
-
-	pand	@t[3], @s[1]
-
-	movdqa	@s[2], @t[2]
-	pxor	@t[0], @s[1]
-
-	pxor	@s[1], @t[2]
-	pxor	@s[1], @t[1]
-
-	pand	@t[0], @t[2]
-
-	pxor	@t[2], @s[2]
-	pxor	@t[2], @t[1]
-
-	pand	@s[3], @s[2]
-
-	pxor	@s[0], @s[2]
-___
-# output in s3, s2, s1, t1
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
-	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);	# y operands = s3,s2,s1,t1 (see above); scratch = s0,t0,t2,t3
-
-### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
-}
-
-# AES linear components
-
-sub ShiftRows {	# besides the byte shuffle, also xors in 0x80 bytes at ($key) and advances $key past them
-my @x=@_[0..7];	# args 0..7: registers updated in place
-my $mask=pop;	# last argument: register holding the pshufb mask; $key comes from the enclosing scope
-$code.=<<___;
-	pxor	0x00($key),@x[0]
-	pxor	0x10($key),@x[1]
-	pxor	0x20($key),@x[2]
-	pxor	0x30($key),@x[3]
-	pshufb	$mask,@x[0]
-	pshufb	$mask,@x[1]
-	pxor	0x40($key),@x[4]
-	pxor	0x50($key),@x[5]
-	pshufb	$mask,@x[2]
-	pshufb	$mask,@x[3]
-	pxor	0x60($key),@x[6]
-	pxor	0x70($key),@x[7]
-	pshufb	$mask,@x[4]
-	pshufb	$mask,@x[5]
-	pshufb	$mask,@x[6]
-	pshufb	$mask,@x[7]
-	lea	0x80($key),$key
-___
-}
-
-sub MixColumns {
-# modified to emit output in order suitable for feeding back to aesenc[last]
-my @x=@_[0..7];	# args 0..7: registers updated in place
-my @t=@_[8..15];	# args 8..15: eight scratch registers
-my $inv=@_[16];	# optional; when true the second tail below is emitted instead of the first (InvMixColumns passes 1)
-$code.=<<___;
-	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
-	pshufd	\$0x93, @x[1], @t[1]
-	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
-	pshufd	\$0x93, @x[2], @t[2]
-	 pxor	@t[1], @x[1]
-	pshufd	\$0x93, @x[3], @t[3]
-	 pxor	@t[2], @x[2]
-	pshufd	\$0x93, @x[4], @t[4]
-	 pxor	@t[3], @x[3]
-	pshufd	\$0x93, @x[5], @t[5]
-	 pxor	@t[4], @x[4]
-	pshufd	\$0x93, @x[6], @t[6]
-	 pxor	@t[5], @x[5]
-	pshufd	\$0x93, @x[7], @t[7]
-	 pxor	@t[6], @x[6]
-	 pxor	@t[7], @x[7]
-
-	pxor	@x[0], @t[1]
-	pxor	@x[7], @t[0]
-	pxor	@x[7], @t[1]
-	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
-	pxor	@x[1], @t[2]
-	 pshufd	\$0x4E, @x[1], @x[1]
-	pxor	@x[4], @t[5]
-	 pxor	@t[0], @x[0]
-	pxor	@x[5], @t[6]
-	 pxor	@t[1], @x[1]
-	pxor	@x[3], @t[4]
-	 pshufd	\$0x4E, @x[4], @t[0]
-	pxor	@x[6], @t[7]
-	 pshufd	\$0x4E, @x[5], @t[1]
-	pxor	@x[2], @t[3]
-	 pshufd	\$0x4E, @x[3], @x[4]
-	pxor	@x[7], @t[3]
-	 pshufd	\$0x4E, @x[7], @x[5]
-	pxor	@x[7], @t[4]
-	 pshufd	\$0x4E, @x[6], @x[3]
-	pxor	@t[4], @t[0]
-	 pshufd	\$0x4E, @x[2], @x[6]
-	pxor	@t[5], @t[1]
-___
-$code.=<<___ if (!$inv);
-	pxor	@t[3], @x[4]
-	pxor	@t[7], @x[5]
-	pxor	@t[6], @x[3]
-	 movdqa	@t[0], @x[2]
-	pxor	@t[2], @x[6]
-	 movdqa	@t[1], @x[7]
-___
-$code.=<<___ if ($inv);
-	pxor	@x[4], @t[3]
-	pxor	@t[7], @x[5]
-	pxor	@x[3], @t[6]
-	 movdqa	@t[0], @x[3]
-	pxor	@t[2], @x[6]
-	 movdqa	@t[6], @x[2]
-	 movdqa	@t[1], @x[7]
-	 movdqa	@x[6], @x[4]
-	 movdqa	@t[3], @x[6]
-___
-}
-
-sub InvMixColumns_orig {	# NOTE(review): no call to this sub is visible here; _bsaes_decrypt8 uses InvMixColumns instead
-my @x=@_[0..7];	# args 0..7: input registers
-my @t=@_[8..15];	# args 8..15: eight scratch registers; the final movdqa block copies them into @XMM[0..7]
-
-$code.=<<___;
-	# multiplication by 0x0e
-	pshufd	\$0x93, @x[7], @t[7]
-	movdqa	@x[2], @t[2]
-	pxor	@x[5], @x[7]		# 7 5
-	pxor	@x[5], @x[2]		# 2 5
-	pshufd	\$0x93, @x[0], @t[0]
-	movdqa	@x[5], @t[5]
-	pxor	@x[0], @x[5]		# 5 0		[1]
-	pxor	@x[1], @x[0]		# 0 1
-	pshufd	\$0x93, @x[1], @t[1]
-	pxor	@x[2], @x[1]		# 1 25
-	pxor	@x[6], @x[0]		# 01 6		[2]
-	pxor	@x[3], @x[1]		# 125 3		[4]
-	pshufd	\$0x93, @x[3], @t[3]
-	pxor	@x[0], @x[2]		# 25 016	[3]
-	pxor	@x[7], @x[3]		# 3 75
-	pxor	@x[6], @x[7]		# 75 6		[0]
-	pshufd	\$0x93, @x[6], @t[6]
-	movdqa	@x[4], @t[4]
-	pxor	@x[4], @x[6]		# 6 4
-	pxor	@x[3], @x[4]		# 4 375		[6]
-	pxor	@x[7], @x[3]		# 375 756=36
-	pxor	@t[5], @x[6]		# 64 5		[7]
-	pxor	@t[2], @x[3]		# 36 2
-	pxor	@t[4], @x[3]		# 362 4		[5]
-	pshufd	\$0x93, @t[5], @t[5]
-___
-					my @y = @x[7,5,0,2,1,3,4,6];	# @y[i] is the register tagged [i] in the 0x0e step above
-$code.=<<___;
-	# multiplication by 0x0b
-	pxor	@y[0], @y[1]
-	pxor	@t[0], @y[0]
-	pxor	@t[1], @y[1]
-	pshufd	\$0x93, @t[2], @t[2]
-	pxor	@t[5], @y[0]
-	pxor	@t[6], @y[1]
-	pxor	@t[7], @y[0]
-	pshufd	\$0x93, @t[4], @t[4]
-	pxor	@t[6], @t[7]		# clobber t[7]
-	pxor	@y[0], @y[1]
-
-	pxor	@t[0], @y[3]
-	pshufd	\$0x93, @t[0], @t[0]
-	pxor	@t[1], @y[2]
-	pxor	@t[1], @y[4]
-	pxor	@t[2], @y[2]
-	pshufd	\$0x93, @t[1], @t[1]
-	pxor	@t[2], @y[3]
-	pxor	@t[2], @y[5]
-	pxor	@t[7], @y[2]
-	pshufd	\$0x93, @t[2], @t[2]
-	pxor	@t[3], @y[3]
-	pxor	@t[3], @y[6]
-	pxor	@t[3], @y[4]
-	pshufd	\$0x93, @t[3], @t[3]
-	pxor	@t[4], @y[7]
-	pxor	@t[4], @y[5]
-	pxor	@t[7], @y[7]
-	pxor	@t[5], @y[3]
-	pxor	@t[4], @y[4]
-	pxor	@t[5], @t[7]		# clobber t[7] even more
-
-	pxor	@t[7], @y[5]
-	pshufd	\$0x93, @t[4], @t[4]
-	pxor	@t[7], @y[6]
-	pxor	@t[7], @y[4]
-
-	pxor	@t[5], @t[7]
-	pshufd	\$0x93, @t[5], @t[5]
-	pxor	@t[6], @t[7]		# restore t[7]
-
-	# multiplication by 0x0d
-	pxor	@y[7], @y[4]
-	pxor	@t[4], @y[7]
-	pshufd	\$0x93, @t[6], @t[6]
-	pxor	@t[0], @y[2]
-	pxor	@t[5], @y[7]
-	pxor	@t[2], @y[2]
-	pshufd	\$0x93, @t[7], @t[7]
-
-	pxor	@y[1], @y[3]
-	pxor	@t[1], @y[1]
-	pxor	@t[0], @y[0]
-	pxor	@t[0], @y[3]
-	pxor	@t[5], @y[1]
-	pxor	@t[5], @y[0]
-	pxor	@t[7], @y[1]
-	pshufd	\$0x93, @t[0], @t[0]
-	pxor	@t[6], @y[0]
-	pxor	@y[1], @y[3]
-	pxor	@t[1], @y[4]
-	pshufd	\$0x93, @t[1], @t[1]
-
-	pxor	@t[7], @y[7]
-	pxor	@t[2], @y[4]
-	pxor	@t[2], @y[5]
-	pshufd	\$0x93, @t[2], @t[2]
-	pxor	@t[6], @y[2]
-	pxor	@t[3], @t[6]		# clobber t[6]
-	pxor	@y[7], @y[4]
-	pxor	@t[6], @y[3]
-
-	pxor	@t[6], @y[6]
-	pxor	@t[5], @y[5]
-	pxor	@t[4], @y[6]
-	pshufd	\$0x93, @t[4], @t[4]
-	pxor	@t[6], @y[5]
-	pxor	@t[7], @y[6]
-	pxor	@t[3], @t[6]		# restore t[6]
-
-	pshufd	\$0x93, @t[5], @t[5]
-	pshufd	\$0x93, @t[6], @t[6]
-	pshufd	\$0x93, @t[7], @t[7]
-	pshufd	\$0x93, @t[3], @t[3]
-
-	# multiplication by 0x09
-	pxor	@y[1], @y[4]
-	pxor	@y[1], @t[1]		# t[1]=y[1]
-	pxor	@t[5], @t[0]		# clobber t[0]
-	pxor	@t[5], @t[1]
-	pxor	@t[0], @y[3]
-	pxor	@y[0], @t[0]		# t[0]=y[0]
-	pxor	@t[6], @t[1]
-	pxor	@t[7], @t[6]		# clobber t[6]
-	pxor	@t[1], @y[4]
-	pxor	@t[4], @y[7]
-	pxor	@y[4], @t[4]		# t[4]=y[4]
-	pxor	@t[3], @y[6]
-	pxor	@y[3], @t[3]		# t[3]=y[3]
-	pxor	@t[2], @y[5]
-	pxor	@y[2], @t[2]		# t[2]=y[2]
-	pxor	@t[7], @t[3]
-	pxor	@y[5], @t[5]		# t[5]=y[5]
-	pxor	@t[6], @t[2]
-	pxor	@t[6], @t[5]
-	pxor	@y[6], @t[6]		# t[6]=y[6]
-	pxor	@y[7], @t[7]		# t[7]=y[7]
-
-	movdqa	@t[0],@XMM[0]
-	movdqa	@t[1],@XMM[1]
-	movdqa	@t[2],@XMM[2]
-	movdqa	@t[3],@XMM[3]
-	movdqa	@t[4],@XMM[4]
-	movdqa	@t[5],@XMM[5]
-	movdqa	@t[6],@XMM[6]
-	movdqa	@t[7],@XMM[7]
-___
-}
-
-sub InvMixColumns {	# emits the 05-00-04-00 multiplication, then reuses MixColumns for the remaining factor
-my @x=@_[0..7];	# args 0..7: registers updated in place
-my @t=@_[8..15];	# args 8..15: eight scratch registers, reused by the MixColumns call
-
-# Thanks to Jussi Kivilinna for providing pointer to
-#
-# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
-# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
-# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
-# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
-
-$code.=<<___;
-	# multiplication by 0x05-0x00-0x04-0x00
-	pshufd	\$0x4E, @x[0], @t[0]
-	pshufd	\$0x4E, @x[6], @t[6]
-	pxor	@x[0], @t[0]
-	pshufd	\$0x4E, @x[7], @t[7]
-	pxor	@x[6], @t[6]
-	pshufd	\$0x4E, @x[1], @t[1]
-	pxor	@x[7], @t[7]
-	pshufd	\$0x4E, @x[2], @t[2]
-	pxor	@x[1], @t[1]
-	pshufd	\$0x4E, @x[3], @t[3]
-	pxor	@x[2], @t[2]
-	 pxor	@t[6], @x[0]
-	 pxor	@t[6], @x[1]
-	pshufd	\$0x4E, @x[4], @t[4]
-	pxor	@x[3], @t[3]
-	 pxor	@t[0], @x[2]
-	 pxor	@t[1], @x[3]
-	pshufd	\$0x4E, @x[5], @t[5]
-	pxor	@x[4], @t[4]
-	 pxor	@t[7], @x[1]
-	 pxor	@t[2], @x[4]
-	pxor	@x[5], @t[5]
-
-	 pxor	@t[7], @x[2]
-	 pxor	@t[6], @x[3]
-	 pxor	@t[6], @x[4]
-	 pxor	@t[3], @x[5]
-	 pxor	@t[4], @x[6]
-	 pxor	@t[7], @x[4]
-	 pxor	@t[7], @x[5]
-	 pxor	@t[5], @x[7]
-___
-	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
-}
-
-sub aesenc {				# not used
-my @b=@_[0..7];	# args 0..7: state registers
-my @t=@_[8..15];	# args 8..15: scratch; @t[0] doubles as the shuffle mask register
-$code.=<<___;
-	movdqa	0x30($const),@t[0]	# .LSR
-___
-	&ShiftRows	(@b,@t[0]);
-	&Sbox		(@b,@t);
-	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);	# register order = Sbox's documented output order
-}
-
-sub aesenclast {			# not used
-my @b=@_[0..7];	# args 0..7: state registers
-my @t=@_[8..15];	# args 8..15: scratch; @t[0] doubles as the shuffle mask register
-$code.=<<___;
-	movdqa	0x40($const),@t[0]	# .LSRM0
-___
-	&ShiftRows	(@b,@t[0]);
-	&Sbox		(@b,@t);	# no MixColumns follows; the xors below walk @b in Sbox's documented output order
-$code.=<<___
-	pxor	0x00($key),@b[0]
-	pxor	0x10($key),@b[1]
-	pxor	0x20($key),@b[4]
-	pxor	0x30($key),@b[6]
-	pxor	0x40($key),@b[3]
-	pxor	0x50($key),@b[7]
-	pxor	0x60($key),@b[2]
-	pxor	0x70($key),@b[5]
-___
-}
-
-sub swapmove {	# swaps the bits of $a selected by $mask with the bits of $b selected by $mask shifted left by $n
-my ($a,$b,$n,$mask,$t)=@_;	# $a,$b are updated in place; $n is an immediate shift count; $t is scratch
-$code.=<<___;
-	movdqa	$b,$t
-	psrlq	\$$n,$b
-	pxor  	$a,$b
-	pand	$mask,$b
-	pxor	$b,$a
-	psllq	\$$n,$b
-	pxor	$t,$b
-___
-}
-sub swapmove2x {	# two swapmove sequences interleaved: ($a0,$b0,$t0) and ($a1,$b1,$t1) share $n and $mask
-my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;	# $t0,$t1 are scratch; $mask is only read
-$code.=<<___;
-	movdqa	$b0,$t0
-	psrlq	\$$n,$b0
-	 movdqa	$b1,$t1
-	 psrlq	\$$n,$b1
-	pxor  	$a0,$b0
-	 pxor  	$a1,$b1
-	pand	$mask,$b0
-	 pand	$mask,$b1
-	pxor	$b0,$a0
-	psllq	\$$n,$b0
-	 pxor	$b1,$a1
-	 psllq	\$$n,$b1
-	pxor	$t0,$b0
-	 pxor	$t1,$b1
-___
-}
-
-sub bitslice {	# six swapmove2x passes over eight registers, with shift counts 1, 2 and 4
-my @x=reverse(@_[0..7]);	# the eight data registers are indexed in reversed order below
-my ($t0,$t1,$t2,$t3)=@_[8..11];	# $t0,$t1 receive mask constants from $const; $t2,$t3 are swapmove scratch
-$code.=<<___;
-	movdqa	0x00($const),$t0	# .LBS0
-	movdqa	0x10($const),$t1	# .LBS1
-___
-	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
-	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-$code.=<<___;
-	movdqa	0x20($const),$t0	# .LBS2
-___
-	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
-	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-
-	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);	# $t0 now holds the .LBS2 constant loaded just above
-	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
-}
-
-$code.=<<___;
-.text
-
-.extern	asm_AES_encrypt
-.extern	asm_AES_decrypt
-
-.type	_bsaes_encrypt8,\@abi-omnipotent
-.align	64
-_bsaes_encrypt8:
-.cfi_startproc
-	lea	.LBS0(%rip), $const	# constants table
-
-	movdqa	($key), @XMM[9]		# round 0 key
-	lea	0x10($key), $key
-	movdqa	0x50($const), @XMM[8]	# .LM0SR
-	pxor	@XMM[9], @XMM[0]	# xor with round0 key
-	pxor	@XMM[9], @XMM[1]
-	pxor	@XMM[9], @XMM[2]
-	pxor	@XMM[9], @XMM[3]
-	 pshufb	@XMM[8], @XMM[0]
-	 pshufb	@XMM[8], @XMM[1]
-	pxor	@XMM[9], @XMM[4]
-	pxor	@XMM[9], @XMM[5]
-	 pshufb	@XMM[8], @XMM[2]
-	 pshufb	@XMM[8], @XMM[3]
-	pxor	@XMM[9], @XMM[6]
-	pxor	@XMM[9], @XMM[7]
-	 pshufb	@XMM[8], @XMM[4]
-	 pshufb	@XMM[8], @XMM[5]
-	 pshufb	@XMM[8], @XMM[6]
-	 pshufb	@XMM[8], @XMM[7]
-_bsaes_encrypt8_bitslice:
-___
-	&bitslice	(@XMM[0..7, 8..11]);
-$code.=<<___;
-	dec	$rounds
-	jmp	.Lenc_sbox
-.align	16
-.Lenc_loop:
-___
-	&ShiftRows	(@XMM[0..7, 8]);
-$code.=".Lenc_sbox:\n";
-	&Sbox		(@XMM[0..7, 8..15]);
-$code.=<<___;
-	dec	$rounds
-	jl	.Lenc_done
-___
-	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
-$code.=<<___;
-	movdqa	0x30($const), @XMM[8]	# .LSR
-	jnz	.Lenc_loop
-	movdqa	0x40($const), @XMM[8]	# .LSRM0
-	jmp	.Lenc_loop
-.align	16
-.Lenc_done:
-___
-	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
-	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
-$code.=<<___;
-	movdqa	($key), @XMM[8]		# last round key
-	pxor	@XMM[8], @XMM[4]
-	pxor	@XMM[8], @XMM[6]
-	pxor	@XMM[8], @XMM[3]
-	pxor	@XMM[8], @XMM[7]
-	pxor	@XMM[8], @XMM[2]
-	pxor	@XMM[8], @XMM[5]
-	pxor	@XMM[8], @XMM[0]
-	pxor	@XMM[8], @XMM[1]
-	ret
-.cfi_endproc
-.size	_bsaes_encrypt8,.-_bsaes_encrypt8
-
-.type	_bsaes_decrypt8,\@abi-omnipotent
-.align	64
-_bsaes_decrypt8:
-.cfi_startproc
-	lea	.LBS0(%rip), $const	# constants table
-
-	movdqa	($key), @XMM[9]		# round 0 key
-	lea	0x10($key), $key
-	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
-	pxor	@XMM[9], @XMM[0]	# xor with round0 key
-	pxor	@XMM[9], @XMM[1]
-	pxor	@XMM[9], @XMM[2]
-	pxor	@XMM[9], @XMM[3]
-	 pshufb	@XMM[8], @XMM[0]
-	 pshufb	@XMM[8], @XMM[1]
-	pxor	@XMM[9], @XMM[4]
-	pxor	@XMM[9], @XMM[5]
-	 pshufb	@XMM[8], @XMM[2]
-	 pshufb	@XMM[8], @XMM[3]
-	pxor	@XMM[9], @XMM[6]
-	pxor	@XMM[9], @XMM[7]
-	 pshufb	@XMM[8], @XMM[4]
-	 pshufb	@XMM[8], @XMM[5]
-	 pshufb	@XMM[8], @XMM[6]
-	 pshufb	@XMM[8], @XMM[7]
-___
-	&bitslice	(@XMM[0..7, 8..11]);
-$code.=<<___;
-	dec	$rounds
-	jmp	.Ldec_sbox
-.align	16
-.Ldec_loop:
-___
-	&ShiftRows	(@XMM[0..7, 8]);
-$code.=".Ldec_sbox:\n";
-	&InvSbox	(@XMM[0..7, 8..15]);
-$code.=<<___;
-	dec	$rounds
-	jl	.Ldec_done
-___
-	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
-$code.=<<___;
-	movdqa	-0x10($const), @XMM[8]	# .LISR
-	jnz	.Ldec_loop
-	movdqa	-0x20($const), @XMM[8]	# .LISRM0
-	jmp	.Ldec_loop
-.align	16
-.Ldec_done:
-___
-	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
-$code.=<<___;
-	movdqa	($key), @XMM[8]		# last round key
-	pxor	@XMM[8], @XMM[6]
-	pxor	@XMM[8], @XMM[4]
-	pxor	@XMM[8], @XMM[2]
-	pxor	@XMM[8], @XMM[7]
-	pxor	@XMM[8], @XMM[3]
-	pxor	@XMM[8], @XMM[5]
-	pxor	@XMM[8], @XMM[0]
-	pxor	@XMM[8], @XMM[1]
-	ret
-.cfi_endproc
-.size	_bsaes_decrypt8,.-_bsaes_decrypt8
-___
-}
-{
-my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
-
-sub bitslice_key {	# NOTE(review): no call to this sub is visible here; _bsaes_key_convert below is written out by hand
-my @x=reverse(@_[0..7]);	# the eight registers are indexed in reversed order below
-my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];	# three mask registers (never loaded here, so caller-supplied) and two scratch registers
-
-	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);	# NOTE(review): swapmove reads five arguments, so $t3 is ignored by this call
-$code.=<<___;
-	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
-	movdqa	@x[0], @x[2]
-	movdqa	@x[1], @x[3]
-___
-	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-
-	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
-$code.=<<___;
-	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-	movdqa	@x[0], @x[4]
-	movdqa	@x[2], @x[6]
-	movdqa	@x[1], @x[5]
-	movdqa	@x[3], @x[7]
-___
-	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
-	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
-}
-
-$code.=<<___;
-.type	_bsaes_key_convert,\@abi-omnipotent
-.align	16
-_bsaes_key_convert:
-.cfi_startproc
-	lea	.Lmasks(%rip), $const
-	movdqu	($inp), %xmm7		# load round 0 key
-	lea	0x10($inp), $inp
-	movdqa	0x00($const), %xmm0	# 0x01...
-	movdqa	0x10($const), %xmm1	# 0x02...
-	movdqa	0x20($const), %xmm2	# 0x04...
-	movdqa	0x30($const), %xmm3	# 0x08...
-	movdqa	0x40($const), %xmm4	# .LM0
-	pcmpeqd	%xmm5, %xmm5		# .LNOT
-
-	movdqu	($inp), %xmm6		# load round 1 key
-	movdqa	%xmm7, ($out)		# save round 0 key
-	lea	0x10($out), $out
-	dec	$rounds
-	jmp	.Lkey_loop
-.align	16
-.Lkey_loop:
-	pshufb	%xmm4, %xmm6		# .LM0
-
-	movdqa	%xmm0,	%xmm8
-	movdqa	%xmm1,	%xmm9
-
-	pand	%xmm6,	%xmm8
-	pand	%xmm6,	%xmm9
-	movdqa	%xmm2,	%xmm10
-	pcmpeqb	%xmm0,	%xmm8
-	psllq	\$4,	%xmm0		# 0x10...
-	movdqa	%xmm3,	%xmm11
-	pcmpeqb	%xmm1,	%xmm9
-	psllq	\$4,	%xmm1		# 0x20...
-
-	pand	%xmm6,	%xmm10
-	pand	%xmm6,	%xmm11
-	movdqa	%xmm0,	%xmm12
-	pcmpeqb	%xmm2,	%xmm10
-	psllq	\$4,	%xmm2		# 0x40...
-	movdqa	%xmm1,	%xmm13
-	pcmpeqb	%xmm3,	%xmm11
-	psllq	\$4,	%xmm3		# 0x80...
-
-	movdqa	%xmm2,	%xmm14
-	movdqa	%xmm3,	%xmm15
-	 pxor	%xmm5,	%xmm8		# "pnot"
-	 pxor	%xmm5,	%xmm9
-
-	pand	%xmm6,	%xmm12
-	pand	%xmm6,	%xmm13
-	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
-	pcmpeqb	%xmm0,	%xmm12
-	psrlq	\$4,	%xmm0		# 0x01...
-	 movdqa	%xmm9, 0x10($out)
-	pcmpeqb	%xmm1,	%xmm13
-	psrlq	\$4,	%xmm1		# 0x02...
-	 lea	0x10($inp), $inp
-
-	pand	%xmm6,	%xmm14
-	pand	%xmm6,	%xmm15
-	 movdqa	%xmm10, 0x20($out)
-	pcmpeqb	%xmm2,	%xmm14
-	psrlq	\$4,	%xmm2		# 0x04...
-	 movdqa	%xmm11, 0x30($out)
-	pcmpeqb	%xmm3,	%xmm15
-	psrlq	\$4,	%xmm3		# 0x08...
-	 movdqu	($inp), %xmm6		# load next round key
-
-	pxor	%xmm5, %xmm13		# "pnot"
-	pxor	%xmm5, %xmm14
-	movdqa	%xmm12, 0x40($out)
-	movdqa	%xmm13, 0x50($out)
-	movdqa	%xmm14, 0x60($out)
-	movdqa	%xmm15, 0x70($out)
-	lea	0x80($out),$out
-	dec	$rounds
-	jnz	.Lkey_loop
-
-	movdqa	0x50($const), %xmm7	# .L63
-	#movdqa	%xmm6, ($out)		# don't save last round key
-	ret
-.cfi_endproc
-.size	_bsaes_key_convert,.-_bsaes_key_convert
-___
-}
-
-# Benchmark-only helpers. The constant 0 in the condition below means this
-# heredoc is never appended to the generated code.
-if (0 && !$win64) {	# following four functions are unsupported interface
-			# used for benchmarking...
-$code.=<<___;
-# bsaes_enc_key_convert: call _bsaes_key_convert with the round count read
-# from offset 240 of the key, then store the fixed-up last round key at the
-# address left in rax.
-.globl	bsaes_enc_key_convert
-.type	bsaes_enc_key_convert,\@function,2
-.align	16
-bsaes_enc_key_convert:
-	mov	240($inp),%r10d		# pass rounds
-	mov	$inp,%rcx		# pass key
-	mov	$out,%rax		# pass key schedule
-	call	_bsaes_key_convert
-	pxor	%xmm6,%xmm7		# fix up last round key
-	movdqa	%xmm7,(%rax)		# save last round key
-	ret
-.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
-
-# bsaes_encrypt_128: encrypt 0x80 bytes (eight 16-byte loads) per iteration
-# with a hard-coded round count of 10. The length is counted in bytes and
-# the loop repeats while it stays above zero after subtracting 0x80.
-.globl	bsaes_encrypt_128
-.type	bsaes_encrypt_128,\@function,4
-.align	16
-bsaes_encrypt_128:
-.Lenc128_loop:
-	movdqu	0x00($inp), @XMM[0]	# load input
-	movdqu	0x10($inp), @XMM[1]
-	movdqu	0x20($inp), @XMM[2]
-	movdqu	0x30($inp), @XMM[3]
-	movdqu	0x40($inp), @XMM[4]
-	movdqu	0x50($inp), @XMM[5]
-	movdqu	0x60($inp), @XMM[6]
-	movdqu	0x70($inp), @XMM[7]
-	mov	$key, %rax		# pass the $key
-	lea	0x80($inp), $inp
-	mov	\$10,%r10d
-
-	call	_bsaes_encrypt8
-
-	# results are stored from registers 0,1,4,6,3,7,2,5 in that order
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	movdqu	@XMM[3], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[2], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-	sub	\$0x80,$len
-	ja	.Lenc128_loop
-	ret
-.size	bsaes_encrypt_128,.-bsaes_encrypt_128
-
-# bsaes_dec_key_convert: same conversion call as the encrypt variant, but
-# the fix-up is applied to the round 0 key at the start of the schedule and
-# the last round key is stored unmodified.
-.globl	bsaes_dec_key_convert
-.type	bsaes_dec_key_convert,\@function,2
-.align	16
-bsaes_dec_key_convert:
-	mov	240($inp),%r10d		# pass rounds
-	mov	$inp,%rcx		# pass key
-	mov	$out,%rax		# pass key schedule
-	call	_bsaes_key_convert
-	pxor	($out),%xmm7		# fix up round 0 key
-	movdqa	%xmm6,(%rax)		# save last round key
-	movdqa	%xmm7,($out)
-	ret
-.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
-
-# bsaes_decrypt_128: decrypt counterpart of bsaes_encrypt_128, again with a
-# hard-coded round count of 10 and a byte length stepped by 0x80.
-.globl	bsaes_decrypt_128
-.type	bsaes_decrypt_128,\@function,4
-.align	16
-bsaes_decrypt_128:
-.Ldec128_loop:
-	movdqu	0x00($inp), @XMM[0]	# load input
-	movdqu	0x10($inp), @XMM[1]
-	movdqu	0x20($inp), @XMM[2]
-	movdqu	0x30($inp), @XMM[3]
-	movdqu	0x40($inp), @XMM[4]
-	movdqu	0x50($inp), @XMM[5]
-	movdqu	0x60($inp), @XMM[6]
-	movdqu	0x70($inp), @XMM[7]
-	mov	$key, %rax		# pass the $key
-	lea	0x80($inp), $inp
-	mov	\$10,%r10d
-
-	call	_bsaes_decrypt8
-
-	# results are stored from registers 0,1,6,4,2,7,3,5 in that order
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[3], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-	sub	\$0x80,$len
-	ja	.Ldec128_loop
-	ret
-.size	bsaes_decrypt_128,.-bsaes_decrypt_128
-___
-}
-{
-######################################################################
-#
-# OpenSSL interface
-#
-# $arg1..$arg6 name the registers that hold the six arguments of the entry
-# points below; which set is used depends on $win64. $arg6 is spelled as a
-# 32-bit register (%r11d or %r9d).
-my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
-						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
-# Working copies of the first four arguments live in %r12..%r15.
-my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
-
-if ($ecb) {
-$code.=<<___;
-# bsaes_ecb_encrypt_blocks: ECB-encrypt a run of 16-byte blocks.
-#   arg1 = input pointer, arg2 = output pointer,
-#   arg3 = number of 16-byte blocks, arg4 = AES key (rounds at offset 240).
-# Fewer than 8 blocks are encrypted one at a time through asm_AES_encrypt.
-# Otherwise a bit-sliced key schedule is built on the stack and the data
-# goes through _bsaes_encrypt8 eight blocks at a time, followed by a tail
-# of 1..7 blocks. The stack area between rsp and rbp is zeroed on exit.
-.globl	bsaes_ecb_encrypt_blocks
-.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
-.align	16
-bsaes_ecb_encrypt_blocks:
-.cfi_startproc
-	mov	%rsp, %rax
-.Lecb_enc_prologue:
-	push	%rbp
-.cfi_push	%rbp
-	push	%rbx
-.cfi_push	%rbx
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	lea	-0x48(%rsp),%rsp	# 0x48 bytes of scratch below the pushes
-.cfi_adjust_cfa_offset	0x48
-___
-# Win64 only: reserve 0xa0 more bytes and save xmm6..xmm15.
-$code.=<<___ if ($win64);
-	lea	-0xa0(%rsp), %rsp
-	movaps	%xmm6, 0x40(%rsp)
-	movaps	%xmm7, 0x50(%rsp)
-	movaps	%xmm8, 0x60(%rsp)
-	movaps	%xmm9, 0x70(%rsp)
-	movaps	%xmm10, 0x80(%rsp)
-	movaps	%xmm11, 0x90(%rsp)
-	movaps	%xmm12, 0xa0(%rsp)
-	movaps	%xmm13, 0xb0(%rsp)
-	movaps	%xmm14, 0xc0(%rsp)
-	movaps	%xmm15, 0xd0(%rsp)
-.Lecb_enc_body:
-___
-$code.=<<___;
-	mov	%rsp,%rbp		# backup %rsp
-.cfi_def_cfa_register	%rbp
-	mov	240($arg4),%eax		# rounds
-	mov	$arg1,$inp		# backup arguments
-	mov	$arg2,$out
-	mov	$arg3,$len
-	mov	$arg4,$key
-	cmp	\$8,$arg3
-	jb	.Lecb_enc_short		# fewer than 8 blocks: no bit-slicing
-
-	mov	%eax,%ebx		# backup rounds
-	shl	\$7,%rax		# 128 bytes per inner round key
-	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
-	sub	%rax,%rsp
-	mov	%rsp,%rax		# pass key schedule
-	mov	$key,%rcx		# pass key
-	mov	%ebx,%r10d		# pass rounds
-	call	_bsaes_key_convert
-	pxor	%xmm6,%xmm7		# fix up last round key
-	movdqa	%xmm7,(%rax)		# save last round key
-
-	sub	\$8,$len		# bias the count so the loop tests the borrow
-.Lecb_enc_loop:
-	movdqu	0x00($inp), @XMM[0]	# load input
-	movdqu	0x10($inp), @XMM[1]
-	movdqu	0x20($inp), @XMM[2]
-	movdqu	0x30($inp), @XMM[3]
-	movdqu	0x40($inp), @XMM[4]
-	movdqu	0x50($inp), @XMM[5]
-	mov	%rsp, %rax		# pass key schedule
-	movdqu	0x60($inp), @XMM[6]
-	mov	%ebx,%r10d		# pass rounds
-	movdqu	0x70($inp), @XMM[7]
-	lea	0x80($inp), $inp
-
-	call	_bsaes_encrypt8
-
-	# results are stored from registers 0,1,4,6,3,7,2,5 in that order
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	movdqu	@XMM[3], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[2], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-	sub	\$8,$len
-	jnc	.Lecb_enc_loop		# no borrow: at least 8 more blocks
-
-	add	\$8,$len		# undo the bias: 0..7 blocks remain
-	jz	.Lecb_enc_done
-
-	# Tail dispatch. movdqu leaves the flags alone, so every cmp below
-	# feeds both the jb and the je that follow it.
-	movdqu	0x00($inp), @XMM[0]	# load input
-	mov	%rsp, %rax		# pass key schedule
-	mov	%ebx,%r10d		# pass rounds
-	cmp	\$2,$len
-	jb	.Lecb_enc_one
-	movdqu	0x10($inp), @XMM[1]
-	je	.Lecb_enc_two
-	movdqu	0x20($inp), @XMM[2]
-	cmp	\$4,$len
-	jb	.Lecb_enc_three
-	movdqu	0x30($inp), @XMM[3]
-	je	.Lecb_enc_four
-	movdqu	0x40($inp), @XMM[4]
-	cmp	\$6,$len
-	jb	.Lecb_enc_five
-	movdqu	0x50($inp), @XMM[5]
-	je	.Lecb_enc_six
-	movdqu	0x60($inp), @XMM[6]
-	# seven blocks left
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	movdqu	@XMM[3], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[2], 0x60($out)
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_six:
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	movdqu	@XMM[3], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_five:
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	movdqu	@XMM[3], 0x40($out)
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_four:
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_three:
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_two:
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_one:
-	call	_bsaes_encrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	jmp	.Lecb_enc_done
-.align	16
-.Lecb_enc_short:
-	# one block per asm_AES_encrypt call; rsp still equals rbp here
-	lea	($inp), $arg1
-	lea	($out), $arg2
-	lea	($key), $arg3
-	call	asm_AES_encrypt
-	lea	16($inp), $inp
-	lea	16($out), $out
-	dec	$len
-	jnz	.Lecb_enc_short
-
-.Lecb_enc_done:
-	lea	(%rsp),%rax
-	pxor	%xmm0, %xmm0
-.Lecb_enc_bzero:			# wipe key schedule [if any]
-	movdqa	%xmm0, 0x00(%rax)
-	movdqa	%xmm0, 0x10(%rax)
-	lea	0x20(%rax), %rax
-	cmp	%rax, %rbp
-	# Keep going while rbp is above rax, so everything from rsp up to rbp
-	# is zeroed. Same condition as the wipe loops in bsaes_cbc_encrypt and
-	# bsaes_ctr32_encrypt_blocks. On the short path rsp equals rbp, so the
-	# loop clears 0x20 bytes of scratch once and stops.
-	ja	.Lecb_enc_bzero
-
-	lea	0x78(%rbp),%rax		# skip 0x48 scratch + six pushes (0x30)
-.cfi_def_cfa	%rax,8
-___
-# Win64 only: restore xmm6..xmm15 and skip the extra 0xa0 bytes.
-$code.=<<___ if ($win64);
-	movaps	0x40(%rbp), %xmm6
-	movaps	0x50(%rbp), %xmm7
-	movaps	0x60(%rbp), %xmm8
-	movaps	0x70(%rbp), %xmm9
-	movaps	0x80(%rbp), %xmm10
-	movaps	0x90(%rbp), %xmm11
-	movaps	0xa0(%rbp), %xmm12
-	movaps	0xb0(%rbp), %xmm13
-	movaps	0xc0(%rbp), %xmm14
-	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rax), %rax
-.Lecb_enc_tail:
-___
-$code.=<<___;
-	# reload the six pushed registers relative to the entry-time rsp
-	mov	-48(%rax), %r15
-.cfi_restore	%r15
-	mov	-40(%rax), %r14
-.cfi_restore	%r14
-	mov	-32(%rax), %r13
-.cfi_restore	%r13
-	mov	-24(%rax), %r12
-.cfi_restore	%r12
-	mov	-16(%rax), %rbx
-.cfi_restore	%rbx
-	mov	-8(%rax), %rbp
-.cfi_restore	%rbp
-	lea	(%rax), %rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
-.Lecb_enc_epilogue:
-	ret
-.cfi_endproc
-.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
-
-# bsaes_ecb_decrypt_blocks: ECB-decrypt a run of 16-byte blocks.
-#   arg1 = input pointer, arg2 = output pointer,
-#   arg3 = number of 16-byte blocks, arg4 = AES key (rounds at offset 240).
-# Fewer than 8 blocks are decrypted one at a time through asm_AES_decrypt.
-# Otherwise a bit-sliced key schedule is built on the stack and the data
-# goes through _bsaes_decrypt8 eight blocks at a time, followed by a tail
-# of 1..7 blocks. The stack area between rsp and rbp is zeroed on exit.
-.globl	bsaes_ecb_decrypt_blocks
-.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
-.align	16
-bsaes_ecb_decrypt_blocks:
-.cfi_startproc
-	mov	%rsp, %rax
-.Lecb_dec_prologue:
-	push	%rbp
-.cfi_push	%rbp
-	push	%rbx
-.cfi_push	%rbx
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	lea	-0x48(%rsp),%rsp	# 0x48 bytes of scratch below the pushes
-.cfi_adjust_cfa_offset	0x48
-___
-# Win64 only: reserve 0xa0 more bytes and save xmm6..xmm15.
-$code.=<<___ if ($win64);
-	lea	-0xa0(%rsp), %rsp
-	movaps	%xmm6, 0x40(%rsp)
-	movaps	%xmm7, 0x50(%rsp)
-	movaps	%xmm8, 0x60(%rsp)
-	movaps	%xmm9, 0x70(%rsp)
-	movaps	%xmm10, 0x80(%rsp)
-	movaps	%xmm11, 0x90(%rsp)
-	movaps	%xmm12, 0xa0(%rsp)
-	movaps	%xmm13, 0xb0(%rsp)
-	movaps	%xmm14, 0xc0(%rsp)
-	movaps	%xmm15, 0xd0(%rsp)
-.Lecb_dec_body:
-___
-$code.=<<___;
-	mov	%rsp,%rbp		# backup %rsp
-.cfi_def_cfa_register	%rbp
-	mov	240($arg4),%eax		# rounds
-	mov	$arg1,$inp		# backup arguments
-	mov	$arg2,$out
-	mov	$arg3,$len
-	mov	$arg4,$key
-	cmp	\$8,$arg3
-	jb	.Lecb_dec_short		# fewer than 8 blocks: no bit-slicing
-
-	mov	%eax,%ebx		# backup rounds
-	shl	\$7,%rax		# 128 bytes per inner round key
-	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
-	sub	%rax,%rsp
-	mov	%rsp,%rax		# pass key schedule
-	mov	$key,%rcx		# pass key
-	mov	%ebx,%r10d		# pass rounds
-	call	_bsaes_key_convert
-	pxor	(%rsp),%xmm7		# fix up 0 round key
-	movdqa	%xmm6,(%rax)		# save last round key
-	movdqa	%xmm7,(%rsp)
-
-	sub	\$8,$len		# bias the count so the loop tests the borrow
-.Lecb_dec_loop:
-	movdqu	0x00($inp), @XMM[0]	# load input
-	movdqu	0x10($inp), @XMM[1]
-	movdqu	0x20($inp), @XMM[2]
-	movdqu	0x30($inp), @XMM[3]
-	movdqu	0x40($inp), @XMM[4]
-	movdqu	0x50($inp), @XMM[5]
-	mov	%rsp, %rax		# pass key schedule
-	movdqu	0x60($inp), @XMM[6]
-	mov	%ebx,%r10d		# pass rounds
-	movdqu	0x70($inp), @XMM[7]
-	lea	0x80($inp), $inp
-
-	call	_bsaes_decrypt8
-
-	# results are stored from registers 0,1,6,4,2,7,3,5 in that order
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[3], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-	sub	\$8,$len
-	jnc	.Lecb_dec_loop		# no borrow: at least 8 more blocks
-
-	add	\$8,$len		# undo the bias: 0..7 blocks remain
-	jz	.Lecb_dec_done
-
-	# Tail dispatch. movdqu leaves the flags alone, so every cmp below
-	# feeds both the jb and the je that follow it.
-	movdqu	0x00($inp), @XMM[0]	# load input
-	mov	%rsp, %rax		# pass key schedule
-	mov	%ebx,%r10d		# pass rounds
-	cmp	\$2,$len
-	jb	.Lecb_dec_one
-	movdqu	0x10($inp), @XMM[1]
-	je	.Lecb_dec_two
-	movdqu	0x20($inp), @XMM[2]
-	cmp	\$4,$len
-	jb	.Lecb_dec_three
-	movdqu	0x30($inp), @XMM[3]
-	je	.Lecb_dec_four
-	movdqu	0x40($inp), @XMM[4]
-	cmp	\$6,$len
-	jb	.Lecb_dec_five
-	movdqu	0x50($inp), @XMM[5]
-	je	.Lecb_dec_six
-	movdqu	0x60($inp), @XMM[6]
-	# seven blocks left
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[3], 0x60($out)
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_six:
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_five:
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_four:
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_three:
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_two:
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_one:
-	call	_bsaes_decrypt8
-	movdqu	@XMM[0], 0x00($out)	# write output
-	jmp	.Lecb_dec_done
-.align	16
-.Lecb_dec_short:
-	# one block per asm_AES_decrypt call; rsp still equals rbp here
-	lea	($inp), $arg1
-	lea	($out), $arg2
-	lea	($key), $arg3
-	call	asm_AES_decrypt
-	lea	16($inp), $inp
-	lea	16($out), $out
-	dec	$len
-	jnz	.Lecb_dec_short
-
-.Lecb_dec_done:
-	lea	(%rsp),%rax
-	pxor	%xmm0, %xmm0
-.Lecb_dec_bzero:			# wipe key schedule [if any]
-	movdqa	%xmm0, 0x00(%rax)
-	movdqa	%xmm0, 0x10(%rax)
-	lea	0x20(%rax), %rax
-	cmp	%rax, %rbp
-	# Keep going while rbp is above rax, so everything from rsp up to rbp
-	# is zeroed. Same condition as the wipe loops in bsaes_cbc_encrypt and
-	# bsaes_ctr32_encrypt_blocks. On the short path rsp equals rbp, so the
-	# loop clears 0x20 bytes of scratch once and stops.
-	ja	.Lecb_dec_bzero
-
-	lea	0x78(%rbp),%rax		# skip 0x48 scratch + six pushes (0x30)
-.cfi_def_cfa	%rax,8
-___
-# Win64 only: restore xmm6..xmm15 and skip the extra 0xa0 bytes.
-$code.=<<___ if ($win64);
-	movaps	0x40(%rbp), %xmm6
-	movaps	0x50(%rbp), %xmm7
-	movaps	0x60(%rbp), %xmm8
-	movaps	0x70(%rbp), %xmm9
-	movaps	0x80(%rbp), %xmm10
-	movaps	0x90(%rbp), %xmm11
-	movaps	0xa0(%rbp), %xmm12
-	movaps	0xb0(%rbp), %xmm13
-	movaps	0xc0(%rbp), %xmm14
-	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rax), %rax
-.Lecb_dec_tail:
-___
-$code.=<<___;
-	# reload the six pushed registers relative to the entry-time rsp
-	mov	-48(%rax), %r15
-.cfi_restore	%r15
-	mov	-40(%rax), %r14
-.cfi_restore	%r14
-	mov	-32(%rax), %r13
-.cfi_restore	%r13
-	mov	-24(%rax), %r12
-.cfi_restore	%r12
-	mov	-16(%rax), %rbx
-.cfi_restore	%rbx
-	mov	-8(%rax), %rbp
-.cfi_restore	%rbp
-	lea	(%rax), %rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
-.Lecb_dec_epilogue:
-	ret
-.cfi_endproc
-.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
-___
-}
-$code.=<<___;
-# bsaes_cbc_encrypt: CBC entry point. Only decryption of at least 128
-# bytes is handled here. When the direction flag (arg6) is non-zero, or
-# arg3 is below 128, control jumps to asm_AES_cbc_encrypt instead.
-#   arg1 = input pointer, arg2 = output pointer, arg3 = length in bytes,
-#   arg4 = AES key (rounds at offset 240), arg5 = IV pointer.
-# On return the IV buffer holds the last ciphertext block that was consumed.
-.extern	asm_AES_cbc_encrypt
-.globl	bsaes_cbc_encrypt
-.type	bsaes_cbc_encrypt,\@abi-omnipotent
-.align	16
-bsaes_cbc_encrypt:
-.cfi_startproc
-___
-$code.=<<___ if ($win64);
-	mov	48(%rsp),$arg6		# pull direction flag
-___
-$code.=<<___;
-	cmp	\$0,$arg6
-	jne	asm_AES_cbc_encrypt
-	cmp	\$128,$arg3
-	jb	asm_AES_cbc_encrypt
-
-	mov	%rsp, %rax
-.Lcbc_dec_prologue:
-	push	%rbp
-.cfi_push	%rbp
-	push	%rbx
-.cfi_push	%rbx
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	lea	-0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset	0x48
-___
-# Win64 only: fetch the IV pointer from the stack, reserve 0xa0 more bytes
-# and save xmm6..xmm15.
-$code.=<<___ if ($win64);
-	mov	0xa0(%rsp),$arg5	# pull ivp
-	lea	-0xa0(%rsp), %rsp
-	movaps	%xmm6, 0x40(%rsp)
-	movaps	%xmm7, 0x50(%rsp)
-	movaps	%xmm8, 0x60(%rsp)
-	movaps	%xmm9, 0x70(%rsp)
-	movaps	%xmm10, 0x80(%rsp)
-	movaps	%xmm11, 0x90(%rsp)
-	movaps	%xmm12, 0xa0(%rsp)
-	movaps	%xmm13, 0xb0(%rsp)
-	movaps	%xmm14, 0xc0(%rsp)
-	movaps	%xmm15, 0xd0(%rsp)
-.Lcbc_dec_body:
-___
-$code.=<<___;
-	mov	%rsp, %rbp		# backup %rsp
-.cfi_def_cfa_register	%rbp
-	mov	240($arg4), %eax	# rounds
-	mov	$arg1, $inp		# backup arguments
-	mov	$arg2, $out
-	mov	$arg3, $len
-	mov	$arg4, $key
-	mov	$arg5, %rbx
-	# the shift drops any remainder below 16 bytes
-	shr	\$4, $len		# bytes to blocks
-
-	mov	%eax, %edx		# rounds
-	shl	\$7, %rax		# 128 bytes per inner round key
-	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
-	sub	%rax, %rsp
-
-	mov	%rsp, %rax		# pass key schedule
-	mov	$key, %rcx		# pass key
-	mov	%edx, %r10d		# pass rounds
-	call	_bsaes_key_convert
-	pxor	(%rsp),%xmm7		# fix up 0 round key
-	movdqa	%xmm6,(%rax)		# save last round key
-	movdqa	%xmm7,(%rsp)
-
-	movdqu	(%rbx), @XMM[15]	# load IV
-	sub	\$8,$len
-.Lcbc_dec_loop:
-	movdqu	0x00($inp), @XMM[0]	# load input
-	movdqu	0x10($inp), @XMM[1]
-	movdqu	0x20($inp), @XMM[2]
-	movdqu	0x30($inp), @XMM[3]
-	movdqu	0x40($inp), @XMM[4]
-	movdqu	0x50($inp), @XMM[5]
-	mov	%rsp, %rax		# pass key schedule
-	movdqu	0x60($inp), @XMM[6]
-	mov	%edx,%r10d		# pass rounds
-	movdqu	0x70($inp), @XMM[7]
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-
-	call	_bsaes_decrypt8
-
-	# The first result is xored with the saved IV and every later one with
-	# the preceding ciphertext block, which is read from the input again.
-	# Results sit in registers 0,1,6,4,2,7,3,5 in that order.
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[8], @XMM[1]
-	movdqu	0x20($inp), @XMM[10]
-	pxor	@XMM[9], @XMM[6]
-	movdqu	0x30($inp), @XMM[11]
-	pxor	@XMM[10], @XMM[4]
-	movdqu	0x40($inp), @XMM[12]
-	pxor	@XMM[11], @XMM[2]
-	movdqu	0x50($inp), @XMM[13]
-	pxor	@XMM[12], @XMM[7]
-	movdqu	0x60($inp), @XMM[14]
-	pxor	@XMM[13], @XMM[3]
-	movdqu	0x70($inp), @XMM[15]	# IV
-	pxor	@XMM[14], @XMM[5]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	lea	0x80($inp), $inp
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[3], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-	sub	\$8,$len
-	jnc	.Lcbc_dec_loop
-
-	add	\$8,$len
-	jz	.Lcbc_dec_done
-
-	# Tail dispatch for 1..7 remaining blocks. movdqu leaves the flags
-	# alone, so every cmp below feeds both the jb and the je after it.
-	movdqu	0x00($inp), @XMM[0]	# load input
-	mov	%rsp, %rax		# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-	cmp	\$2,$len
-	jb	.Lcbc_dec_one
-	movdqu	0x10($inp), @XMM[1]
-	je	.Lcbc_dec_two
-	movdqu	0x20($inp), @XMM[2]
-	cmp	\$4,$len
-	jb	.Lcbc_dec_three
-	movdqu	0x30($inp), @XMM[3]
-	je	.Lcbc_dec_four
-	movdqu	0x40($inp), @XMM[4]
-	cmp	\$6,$len
-	jb	.Lcbc_dec_five
-	movdqu	0x50($inp), @XMM[5]
-	je	.Lcbc_dec_six
-	movdqu	0x60($inp), @XMM[6]
-	# seven blocks left
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-	call	_bsaes_decrypt8
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[8], @XMM[1]
-	movdqu	0x20($inp), @XMM[10]
-	pxor	@XMM[9], @XMM[6]
-	movdqu	0x30($inp), @XMM[11]
-	pxor	@XMM[10], @XMM[4]
-	movdqu	0x40($inp), @XMM[12]
-	pxor	@XMM[11], @XMM[2]
-	movdqu	0x50($inp), @XMM[13]
-	pxor	@XMM[12], @XMM[7]
-	movdqu	0x60($inp), @XMM[15]	# IV
-	pxor	@XMM[13], @XMM[3]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[3], 0x60($out)
-	jmp	.Lcbc_dec_done
-.align	16
-.Lcbc_dec_six:
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-	call	_bsaes_decrypt8
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[8], @XMM[1]
-	movdqu	0x20($inp), @XMM[10]
-	pxor	@XMM[9], @XMM[6]
-	movdqu	0x30($inp), @XMM[11]
-	pxor	@XMM[10], @XMM[4]
-	movdqu	0x40($inp), @XMM[12]
-	pxor	@XMM[11], @XMM[2]
-	movdqu	0x50($inp), @XMM[15]	# IV
-	pxor	@XMM[12], @XMM[7]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	jmp	.Lcbc_dec_done
-.align	16
-.Lcbc_dec_five:
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-	call	_bsaes_decrypt8
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[8], @XMM[1]
-	movdqu	0x20($inp), @XMM[10]
-	pxor	@XMM[9], @XMM[6]
-	movdqu	0x30($inp), @XMM[11]
-	pxor	@XMM[10], @XMM[4]
-	movdqu	0x40($inp), @XMM[15]	# IV
-	pxor	@XMM[11], @XMM[2]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	jmp	.Lcbc_dec_done
-.align	16
-.Lcbc_dec_four:
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-	call	_bsaes_decrypt8
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[8], @XMM[1]
-	movdqu	0x20($inp), @XMM[10]
-	pxor	@XMM[9], @XMM[6]
-	movdqu	0x30($inp), @XMM[15]	# IV
-	pxor	@XMM[10], @XMM[4]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	jmp	.Lcbc_dec_done
-.align	16
-.Lcbc_dec_three:
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-	call	_bsaes_decrypt8
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[8], @XMM[1]
-	movdqu	0x20($inp), @XMM[15]	# IV
-	pxor	@XMM[9], @XMM[6]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	jmp	.Lcbc_dec_done
-.align	16
-.Lcbc_dec_two:
-	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
-	call	_bsaes_decrypt8
-	pxor	0x20(%rbp), @XMM[0]	# ^= IV
-	movdqu	0x00($inp), @XMM[8]	# re-load input
-	movdqu	0x10($inp), @XMM[15]	# IV
-	pxor	@XMM[8], @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	jmp	.Lcbc_dec_done
-.align	16
-.Lcbc_dec_one:
-	# A single block goes through asm_AES_decrypt. Its result is buffered
-	# at 0x20(rbp), xored with the current IV and written out. The
-	# ciphertext block loaded before the dispatch becomes the next IV.
-	lea	($inp), $arg1
-	lea	0x20(%rbp), $arg2	# buffer output
-	lea	($key), $arg3
-	call	asm_AES_decrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[15]	# ^= IV
-	movdqu	@XMM[15], ($out)	# write output
-	movdqa	@XMM[0], @XMM[15]	# IV
-
-.Lcbc_dec_done:
-	movdqu	@XMM[15], (%rbx)	# return IV
-	lea	(%rsp), %rax
-	pxor	%xmm0, %xmm0
-.Lcbc_dec_bzero:			# wipe key schedule [if any]
-	movdqa	%xmm0, 0x00(%rax)
-	movdqa	%xmm0, 0x10(%rax)
-	lea	0x20(%rax), %rax
-	cmp	%rax, %rbp
-	ja	.Lcbc_dec_bzero		# loop while rax is still below rbp
-
-	lea	0x78(%rbp),%rax		# skip 0x48 scratch + six pushes (0x30)
-.cfi_def_cfa	%rax,8
-___
-# Win64 only: restore xmm6..xmm15 and skip the extra 0xa0 bytes.
-$code.=<<___ if ($win64);
-	movaps	0x40(%rbp), %xmm6
-	movaps	0x50(%rbp), %xmm7
-	movaps	0x60(%rbp), %xmm8
-	movaps	0x70(%rbp), %xmm9
-	movaps	0x80(%rbp), %xmm10
-	movaps	0x90(%rbp), %xmm11
-	movaps	0xa0(%rbp), %xmm12
-	movaps	0xb0(%rbp), %xmm13
-	movaps	0xc0(%rbp), %xmm14
-	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rax), %rax
-.Lcbc_dec_tail:
-___
-$code.=<<___;
-	mov	-48(%rax), %r15
-.cfi_restore	%r15
-	mov	-40(%rax), %r14
-.cfi_restore	%r14
-	mov	-32(%rax), %r13
-.cfi_restore	%r13
-	mov	-24(%rax), %r12
-.cfi_restore	%r12
-	mov	-16(%rax), %rbx
-.cfi_restore	%rbx
-	mov	-8(%rax), %rbp
-.cfi_restore	%rbp
-	lea	(%rax), %rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
-.Lcbc_dec_epilogue:
-	ret
-.cfi_endproc
-.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-
-# bsaes_ctr32_encrypt_blocks: CTR mode over whole 16-byte blocks.
-#   arg1 = input pointer, arg2 = output pointer,
-#   arg3 = number of 16-byte blocks, arg4 = AES key (rounds at offset 240),
-#   arg5 = pointer to the 16-byte counter block.
-# The counter block is copied to 0x20(rbp). The short path below increments
-# its last four bytes as a byte-swapped 32-bit value. Nothing in this
-# routine stores the counter back through arg5.
-# Fewer than 8 blocks go one at a time through asm_AES_encrypt. Otherwise
-# eight counter values are encrypted per _bsaes_encrypt8_bitslice call.
-.globl	bsaes_ctr32_encrypt_blocks
-.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
-.align	16
-bsaes_ctr32_encrypt_blocks:
-.cfi_startproc
-	mov	%rsp, %rax
-.Lctr_enc_prologue:
-	push	%rbp
-.cfi_push	%rbp
-	push	%rbx
-.cfi_push	%rbx
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	lea	-0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset	0x48
-___
-# Win64 only: fetch the counter pointer from the stack, reserve 0xa0 more
-# bytes and save xmm6..xmm15.
-$code.=<<___ if ($win64);
-	mov	0xa0(%rsp),$arg5	# pull ivp
-	lea	-0xa0(%rsp), %rsp
-	movaps	%xmm6, 0x40(%rsp)
-	movaps	%xmm7, 0x50(%rsp)
-	movaps	%xmm8, 0x60(%rsp)
-	movaps	%xmm9, 0x70(%rsp)
-	movaps	%xmm10, 0x80(%rsp)
-	movaps	%xmm11, 0x90(%rsp)
-	movaps	%xmm12, 0xa0(%rsp)
-	movaps	%xmm13, 0xb0(%rsp)
-	movaps	%xmm14, 0xc0(%rsp)
-	movaps	%xmm15, 0xd0(%rsp)
-.Lctr_enc_body:
-___
-$code.=<<___;
-	mov	%rsp, %rbp		# backup %rsp
-.cfi_def_cfa_register	%rbp
-	movdqu	($arg5), %xmm0		# load counter
-	mov	240($arg4), %eax	# rounds
-	mov	$arg1, $inp		# backup arguments
-	mov	$arg2, $out
-	mov	$arg3, $len
-	mov	$arg4, $key
-	movdqa	%xmm0, 0x20(%rbp)	# copy counter
-	cmp	\$8, $arg3
-	jb	.Lctr_enc_short
-
-	mov	%eax, %ebx		# rounds
-	shl	\$7, %rax		# 128 bytes per inner round key
-	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
-	sub	%rax, %rsp
-
-	mov	%rsp, %rax		# pass key schedule
-	mov	$key, %rcx		# pass key
-	mov	%ebx, %r10d		# pass rounds
-	call	_bsaes_key_convert
-	pxor	%xmm6,%xmm7		# fix up last round key
-	movdqa	%xmm7,(%rax)		# save last round key
-
-	movdqa	(%rsp), @XMM[9]		# load round0 key
-	lea	.LADD1(%rip), %r11
-	movdqa	0x20(%rbp), @XMM[0]	# counter copy
-	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
-	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
-	pshufb	@XMM[8], @XMM[0]
-	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
-	jmp	.Lctr_enc_loop
-.align	16
-.Lctr_enc_loop:
-	movdqa	@XMM[0], 0x20(%rbp)	# save counter
-	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
-	movdqa	@XMM[0], @XMM[2]
-	paddd	0x00(%r11), @XMM[1]	# .LADD1
-	movdqa	@XMM[0], @XMM[3]
-	paddd	0x10(%r11), @XMM[2]	# .LADD2
-	movdqa	@XMM[0], @XMM[4]
-	paddd	0x20(%r11), @XMM[3]	# .LADD3
-	movdqa	@XMM[0], @XMM[5]
-	paddd	0x30(%r11), @XMM[4]	# .LADD4
-	movdqa	@XMM[0], @XMM[6]
-	paddd	0x40(%r11), @XMM[5]	# .LADD5
-	movdqa	@XMM[0], @XMM[7]
-	paddd	0x50(%r11), @XMM[6]	# .LADD6
-	paddd	0x60(%r11), @XMM[7]	# .LADD7
-
-	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
-	# to flip byte order in 32-bit counter
-	movdqa	(%rsp), @XMM[9]		# round 0 key
-	lea	0x10(%rsp), %rax	# pass key schedule
-	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
-	pxor	@XMM[9], @XMM[0]	# xor with round0 key
-	pxor	@XMM[9], @XMM[1]
-	pxor	@XMM[9], @XMM[2]
-	pxor	@XMM[9], @XMM[3]
-	 pshufb	@XMM[8], @XMM[0]
-	 pshufb	@XMM[8], @XMM[1]
-	pxor	@XMM[9], @XMM[4]
-	pxor	@XMM[9], @XMM[5]
-	 pshufb	@XMM[8], @XMM[2]
-	 pshufb	@XMM[8], @XMM[3]
-	pxor	@XMM[9], @XMM[6]
-	pxor	@XMM[9], @XMM[7]
-	 pshufb	@XMM[8], @XMM[4]
-	 pshufb	@XMM[8], @XMM[5]
-	 pshufb	@XMM[8], @XMM[6]
-	 pshufb	@XMM[8], @XMM[7]
-	lea	.LBS0(%rip), %r11	# constants table
-	mov	%ebx,%r10d		# pass rounds
-
-	call	_bsaes_encrypt8_bitslice
-
-	sub	\$8,$len
-	jc	.Lctr_enc_loop_done	# fewer than 8 blocks were left
-
-	# Full group: xor the eight results with the input. They sit in
-	# registers 0,1,4,6,3,7,2,5 in that order.
-	movdqu	0x00($inp), @XMM[8]	# load input
-	movdqu	0x10($inp), @XMM[9]
-	movdqu	0x20($inp), @XMM[10]
-	movdqu	0x30($inp), @XMM[11]
-	movdqu	0x40($inp), @XMM[12]
-	movdqu	0x50($inp), @XMM[13]
-	movdqu	0x60($inp), @XMM[14]
-	movdqu	0x70($inp), @XMM[15]
-	lea	0x80($inp),$inp
-	pxor	@XMM[0], @XMM[8]
-	movdqa	0x20(%rbp), @XMM[0]	# load counter
-	pxor	@XMM[9], @XMM[1]
-	movdqu	@XMM[8], 0x00($out)	# write output
-	pxor	@XMM[10], @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	@XMM[11], @XMM[6]
-	movdqu	@XMM[4], 0x20($out)
-	pxor	@XMM[12], @XMM[3]
-	movdqu	@XMM[6], 0x30($out)
-	pxor	@XMM[13], @XMM[7]
-	movdqu	@XMM[3], 0x40($out)
-	pxor	@XMM[14], @XMM[2]
-	movdqu	@XMM[7], 0x50($out)
-	pxor	@XMM[15], @XMM[5]
-	movdqu	@XMM[2], 0x60($out)
-	lea	.LADD1(%rip), %r11
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-	paddd	0x70(%r11), @XMM[0]	# .LADD8
-	# No instruction since the sub of 8 above changes the flags, so this
-	# branch repeats the loop unless the block count just reached zero.
-	jnz	.Lctr_enc_loop
-
-	jmp	.Lctr_enc_done
-.align	16
-.Lctr_enc_loop_done:
-	# Partial group: 1..7 blocks remain once the 8 is added back. Each
-	# cmp below feeds the jb and the later je because movdqu and pxor do
-	# not change the flags.
-	add	\$8, $len
-	movdqu	0x00($inp), @XMM[8]	# load input
-	pxor	@XMM[8], @XMM[0]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	cmp	\$2,$len
-	jb	.Lctr_enc_done
-	movdqu	0x10($inp), @XMM[9]
-	pxor	@XMM[9], @XMM[1]
-	movdqu	@XMM[1], 0x10($out)
-	je	.Lctr_enc_done
-	movdqu	0x20($inp), @XMM[10]
-	pxor	@XMM[10], @XMM[4]
-	movdqu	@XMM[4], 0x20($out)
-	cmp	\$4,$len
-	jb	.Lctr_enc_done
-	movdqu	0x30($inp), @XMM[11]
-	pxor	@XMM[11], @XMM[6]
-	movdqu	@XMM[6], 0x30($out)
-	je	.Lctr_enc_done
-	movdqu	0x40($inp), @XMM[12]
-	pxor	@XMM[12], @XMM[3]
-	movdqu	@XMM[3], 0x40($out)
-	cmp	\$6,$len
-	jb	.Lctr_enc_done
-	movdqu	0x50($inp), @XMM[13]
-	pxor	@XMM[13], @XMM[7]
-	movdqu	@XMM[7], 0x50($out)
-	je	.Lctr_enc_done
-	movdqu	0x60($inp), @XMM[14]
-	pxor	@XMM[14], @XMM[2]
-	movdqu	@XMM[2], 0x60($out)
-	jmp	.Lctr_enc_done
-
-.align	16
-.Lctr_enc_short:
-	# Encrypt the counter block at 0x20(rbp) into 0x30(rbp), xor that with
-	# one input block, then bump the 32-bit counter word at offset 0x2c.
-	lea	0x20(%rbp), $arg1
-	lea	0x30(%rbp), $arg2
-	lea	($key), $arg3
-	call	asm_AES_encrypt
-	movdqu	($inp), @XMM[1]
-	lea	16($inp), $inp
-	mov	0x2c(%rbp), %eax	# load 32-bit counter
-	bswap	%eax
-	pxor	0x30(%rbp), @XMM[1]
-	inc	%eax			# increment
-	movdqu	@XMM[1], ($out)
-	bswap	%eax
-	lea	16($out), $out
-	# rsp was never lowered on this path, so 0x2c(rsp) is the same slot
-	# as the 0x2c(rbp) that was loaded above
-	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
-	dec	$len
-	jnz	.Lctr_enc_short
-
-.Lctr_enc_done:
-	lea	(%rsp), %rax
-	pxor	%xmm0, %xmm0
-.Lctr_enc_bzero:			# wipe key schedule [if any]
-	movdqa	%xmm0, 0x00(%rax)
-	movdqa	%xmm0, 0x10(%rax)
-	lea	0x20(%rax), %rax
-	cmp	%rax, %rbp
-	ja	.Lctr_enc_bzero		# loop while rax is still below rbp
-
-	lea	0x78(%rbp),%rax		# skip 0x48 scratch + six pushes (0x30)
-.cfi_def_cfa	%rax,8
-___
-# Win64 only: restore xmm6..xmm15 and skip the extra 0xa0 bytes.
-$code.=<<___ if ($win64);
-	movaps	0x40(%rbp), %xmm6
-	movaps	0x50(%rbp), %xmm7
-	movaps	0x60(%rbp), %xmm8
-	movaps	0x70(%rbp), %xmm9
-	movaps	0x80(%rbp), %xmm10
-	movaps	0x90(%rbp), %xmm11
-	movaps	0xa0(%rbp), %xmm12
-	movaps	0xb0(%rbp), %xmm13
-	movaps	0xc0(%rbp), %xmm14
-	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rax), %rax
-.Lctr_enc_tail:
-___
-$code.=<<___;
-	mov	-48(%rax), %r15
-.cfi_restore	%r15
-	mov	-40(%rax), %r14
-.cfi_restore	%r14
-	mov	-32(%rax), %r13
-.cfi_restore	%r13
-	mov	-24(%rax), %r12
-.cfi_restore	%r12
-	mov	-16(%rax), %rbx
-.cfi_restore	%rbx
-	mov	-8(%rax), %rbp
-.cfi_restore	%rbp
-	lea	(%rax), %rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
-.Lctr_enc_epilogue:
-	ret
-.cfi_endproc
-.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-___
-######################################################################
-# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
-#	const AES_KEY *key1, const AES_KEY *key2,
-#	const unsigned char iv[16]);
-#
-my ($twmask,$twres,$twtmp)=@XMM[13..15];
-$arg6=~s/d$//;
-
-$code.=<<___;
-.globl	bsaes_xts_encrypt
-.type	bsaes_xts_encrypt,\@abi-omnipotent
-.align	16
-bsaes_xts_encrypt:
-.cfi_startproc
-	mov	%rsp, %rax
-.Lxts_enc_prologue:
-	push	%rbp
-.cfi_push	%rbp
-	push	%rbx
-.cfi_push	%rbx
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	lea	-0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset	0x48
-___
-$code.=<<___ if ($win64);
-	mov	0xa0(%rsp),$arg5	# pull key2
-	mov	0xa8(%rsp),$arg6	# pull ivp
-	lea	-0xa0(%rsp), %rsp
-	movaps	%xmm6, 0x40(%rsp)
-	movaps	%xmm7, 0x50(%rsp)
-	movaps	%xmm8, 0x60(%rsp)
-	movaps	%xmm9, 0x70(%rsp)
-	movaps	%xmm10, 0x80(%rsp)
-	movaps	%xmm11, 0x90(%rsp)
-	movaps	%xmm12, 0xa0(%rsp)
-	movaps	%xmm13, 0xb0(%rsp)
-	movaps	%xmm14, 0xc0(%rsp)
-	movaps	%xmm15, 0xd0(%rsp)
-.Lxts_enc_body:
-___
-$code.=<<___;
-	mov	%rsp, %rbp		# backup %rsp
-.cfi_def_cfa_register	%rbp
-	mov	$arg1, $inp		# backup arguments
-	mov	$arg2, $out
-	mov	$arg3, $len
-	mov	$arg4, $key
-
-	lea	($arg6), $arg1
-	lea	0x20(%rbp), $arg2
-	lea	($arg5), $arg3
-	call	asm_AES_encrypt		# generate initial tweak
-
-	mov	240($key), %eax		# rounds
-	mov	$len, %rbx		# backup $len
-
-	mov	%eax, %edx		# rounds
-	shl	\$7, %rax		# 128 bytes per inner round key
-	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
-	sub	%rax, %rsp
-
-	mov	%rsp, %rax		# pass key schedule
-	mov	$key, %rcx		# pass key
-	mov	%edx, %r10d		# pass rounds
-	call	_bsaes_key_convert
-	pxor	%xmm6, %xmm7		# fix up last round key
-	movdqa	%xmm7, (%rax)		# save last round key
-
-	and	\$-16, $len
-	sub	\$0x80, %rsp		# place for tweak[8]
-	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
-
-	pxor	$twtmp, $twtmp
-	movdqa	.Lxts_magic(%rip), $twmask
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-
-	sub	\$0x80, $len
-	jc	.Lxts_enc_short
-	jmp	.Lxts_enc_loop
-
-.align	16
-.Lxts_enc_loop:
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-	pshufd	\$0x13, $twtmp, $twres
-	pxor	$twtmp, $twtmp
-	movdqa	@XMM[7], @XMM[$i]
-	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-	pxor	$twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
-___
-    $code.=<<___ if ($i>=2);
-	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-	movdqu	0x60($inp), @XMM[8+6]
-	pxor	@XMM[8+5], @XMM[5]
-	movdqu	0x70($inp), @XMM[8+7]
-	lea	0x80($inp), $inp
-	movdqa	@XMM[7], 0x70(%rsp)
-	pxor	@XMM[8+6], @XMM[6]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	pxor	@XMM[8+7], @XMM[7]
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[6]
-	movdqu	@XMM[4], 0x20($out)
-	pxor	0x40(%rsp), @XMM[3]
-	movdqu	@XMM[6], 0x30($out)
-	pxor	0x50(%rsp), @XMM[7]
-	movdqu	@XMM[3], 0x40($out)
-	pxor	0x60(%rsp), @XMM[2]
-	movdqu	@XMM[7], 0x50($out)
-	pxor	0x70(%rsp), @XMM[5]
-	movdqu	@XMM[2], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-
-	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
-	pxor	$twtmp, $twtmp
-	movdqa	.Lxts_magic(%rip), $twmask
-	pcmpgtd	@XMM[7], $twtmp
-	pshufd	\$0x13, $twtmp, $twres
-	pxor	$twtmp, $twtmp
-	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-	pxor	$twres, @XMM[7]
-
-	sub	\$0x80,$len
-	jnc	.Lxts_enc_loop
-
-.Lxts_enc_short:
-	add	\$0x80, $len
-	jz	.Lxts_enc_done
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-	pshufd	\$0x13, $twtmp, $twres
-	pxor	$twtmp, $twtmp
-	movdqa	@XMM[7], @XMM[$i]
-	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-	pxor	$twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
-	cmp	\$`0x10*$i`,$len
-	je	.Lxts_enc_$i
-___
-    $code.=<<___ if ($i>=2);
-	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-	movdqu	0x60($inp), @XMM[8+6]
-	pxor	@XMM[8+5], @XMM[5]
-	movdqa	@XMM[7], 0x70(%rsp)
-	lea	0x70($inp), $inp
-	pxor	@XMM[8+6], @XMM[6]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[6]
-	movdqu	@XMM[4], 0x20($out)
-	pxor	0x40(%rsp), @XMM[3]
-	movdqu	@XMM[6], 0x30($out)
-	pxor	0x50(%rsp), @XMM[7]
-	movdqu	@XMM[3], 0x40($out)
-	pxor	0x60(%rsp), @XMM[2]
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[2], 0x60($out)
-	lea	0x70($out), $out
-
-	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_enc_done
-.align	16
-.Lxts_enc_6:
-	pxor	@XMM[8+4], @XMM[4]
-	lea	0x60($inp), $inp
-	pxor	@XMM[8+5], @XMM[5]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[6]
-	movdqu	@XMM[4], 0x20($out)
-	pxor	0x40(%rsp), @XMM[3]
-	movdqu	@XMM[6], 0x30($out)
-	pxor	0x50(%rsp), @XMM[7]
-	movdqu	@XMM[3], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	lea	0x60($out), $out
-
-	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_enc_done
-.align	16
-.Lxts_enc_5:
-	pxor	@XMM[8+3], @XMM[3]
-	lea	0x50($inp), $inp
-	pxor	@XMM[8+4], @XMM[4]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[6]
-	movdqu	@XMM[4], 0x20($out)
-	pxor	0x40(%rsp), @XMM[3]
-	movdqu	@XMM[6], 0x30($out)
-	movdqu	@XMM[3], 0x40($out)
-	lea	0x50($out), $out
-
-	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_enc_done
-.align	16
-.Lxts_enc_4:
-	pxor	@XMM[8+2], @XMM[2]
-	lea	0x40($inp), $inp
-	pxor	@XMM[8+3], @XMM[3]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[6]
-	movdqu	@XMM[4], 0x20($out)
-	movdqu	@XMM[6], 0x30($out)
-	lea	0x40($out), $out
-
-	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_enc_done
-.align	16
-.Lxts_enc_3:
-	pxor	@XMM[8+1], @XMM[1]
-	lea	0x30($inp), $inp
-	pxor	@XMM[8+2], @XMM[2]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[4]
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[4], 0x20($out)
-	lea	0x30($out), $out
-
-	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_enc_done
-.align	16
-.Lxts_enc_2:
-	pxor	@XMM[8+0], @XMM[0]
-	lea	0x20($inp), $inp
-	pxor	@XMM[8+1], @XMM[1]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_encrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	lea	0x20($out), $out
-
-	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_enc_done
-.align	16
-.Lxts_enc_1:
-	pxor	@XMM[0], @XMM[8]
-	lea	0x10($inp), $inp
-	movdqa	@XMM[8], 0x20(%rbp)
-	lea	0x20(%rbp), $arg1
-	lea	0x20(%rbp), $arg2
-	lea	($key), $arg3
-	call	asm_AES_encrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
-	#pxor	@XMM[8], @XMM[0]
-	#lea	0x80(%rsp), %rax	# pass key schedule
-	#mov	%edx, %r10d		# pass rounds
-	#call	_bsaes_encrypt8
-	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	lea	0x10($out), $out
-
-	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
-
-.Lxts_enc_done:
-	and	\$15, %ebx
-	jz	.Lxts_enc_ret
-	mov	$out, %rdx
-
-.Lxts_enc_steal:
-	movzb	($inp), %eax
-	movzb	-16(%rdx), %ecx
-	lea	1($inp), $inp
-	mov	%al, -16(%rdx)
-	mov	%cl, 0(%rdx)
-	lea	1(%rdx), %rdx
-	sub	\$1,%ebx
-	jnz	.Lxts_enc_steal
-
-	movdqu	-16($out), @XMM[0]
-	lea	0x20(%rbp), $arg1
-	pxor	@XMM[7], @XMM[0]
-	lea	0x20(%rbp), $arg2
-	movdqa	@XMM[0], 0x20(%rbp)
-	lea	($key), $arg3
-	call	asm_AES_encrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[7]
-	movdqu	@XMM[7], -16($out)
-
-.Lxts_enc_ret:
-	lea	(%rsp), %rax
-	pxor	%xmm0, %xmm0
-.Lxts_enc_bzero:			# wipe key schedule [if any]
-	movdqa	%xmm0, 0x00(%rax)
-	movdqa	%xmm0, 0x10(%rax)
-	lea	0x20(%rax), %rax
-	cmp	%rax, %rbp
-	ja	.Lxts_enc_bzero
-
-	lea	0x78(%rbp),%rax
-.cfi_def_cfa	%rax,8
-___
-$code.=<<___ if ($win64);
-	movaps	0x40(%rbp), %xmm6
-	movaps	0x50(%rbp), %xmm7
-	movaps	0x60(%rbp), %xmm8
-	movaps	0x70(%rbp), %xmm9
-	movaps	0x80(%rbp), %xmm10
-	movaps	0x90(%rbp), %xmm11
-	movaps	0xa0(%rbp), %xmm12
-	movaps	0xb0(%rbp), %xmm13
-	movaps	0xc0(%rbp), %xmm14
-	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rax), %rax
-.Lxts_enc_tail:
-___
-$code.=<<___;
-	mov	-48(%rax), %r15
-.cfi_restore	%r15
-	mov	-40(%rax), %r14
-.cfi_restore	%r14
-	mov	-32(%rax), %r13
-.cfi_restore	%r13
-	mov	-24(%rax), %r12
-.cfi_restore	%r12
-	mov	-16(%rax), %rbx
-.cfi_restore	%rbx
-	mov	-8(%rax), %rbp
-.cfi_restore	%rbp
-	lea	(%rax), %rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
-.Lxts_enc_epilogue:
-	ret
-.cfi_endproc
-.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl	bsaes_xts_decrypt
-.type	bsaes_xts_decrypt,\@abi-omnipotent
-.align	16
-bsaes_xts_decrypt:
-.cfi_startproc
-	mov	%rsp, %rax
-.Lxts_dec_prologue:
-	push	%rbp
-.cfi_push	%rbp
-	push	%rbx
-.cfi_push	%rbx
-	push	%r12
-.cfi_push	%r12
-	push	%r13
-.cfi_push	%r13
-	push	%r14
-.cfi_push	%r14
-	push	%r15
-.cfi_push	%r15
-	lea	-0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset	0x48
-___
-$code.=<<___ if ($win64);
-	mov	0xa0(%rsp),$arg5	# pull key2
-	mov	0xa8(%rsp),$arg6	# pull ivp
-	lea	-0xa0(%rsp), %rsp
-	movaps	%xmm6, 0x40(%rsp)
-	movaps	%xmm7, 0x50(%rsp)
-	movaps	%xmm8, 0x60(%rsp)
-	movaps	%xmm9, 0x70(%rsp)
-	movaps	%xmm10, 0x80(%rsp)
-	movaps	%xmm11, 0x90(%rsp)
-	movaps	%xmm12, 0xa0(%rsp)
-	movaps	%xmm13, 0xb0(%rsp)
-	movaps	%xmm14, 0xc0(%rsp)
-	movaps	%xmm15, 0xd0(%rsp)
-.Lxts_dec_body:
-___
-$code.=<<___;
-	mov	%rsp, %rbp		# backup %rsp
-	mov	$arg1, $inp		# backup arguments
-	mov	$arg2, $out
-	mov	$arg3, $len
-	mov	$arg4, $key
-
-	lea	($arg6), $arg1
-	lea	0x20(%rbp), $arg2
-	lea	($arg5), $arg3
-	call	asm_AES_encrypt		# generate initial tweak
-
-	mov	240($key), %eax		# rounds
-	mov	$len, %rbx		# backup $len
-
-	mov	%eax, %edx		# rounds
-	shl	\$7, %rax		# 128 bytes per inner round key
-	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
-	sub	%rax, %rsp
-
-	mov	%rsp, %rax		# pass key schedule
-	mov	$key, %rcx		# pass key
-	mov	%edx, %r10d		# pass rounds
-	call	_bsaes_key_convert
-	pxor	(%rsp), %xmm7		# fix up round 0 key
-	movdqa	%xmm6, (%rax)		# save last round key
-	movdqa	%xmm7, (%rsp)
-
-	xor	%eax, %eax		# if ($len%16) len-=16;
-	and	\$-16, $len
-	test	\$15, %ebx
-	setnz	%al
-	shl	\$4, %rax
-	sub	%rax, $len
-
-	sub	\$0x80, %rsp		# place for tweak[8]
-	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
-
-	pxor	$twtmp, $twtmp
-	movdqa	.Lxts_magic(%rip), $twmask
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-
-	sub	\$0x80, $len
-	jc	.Lxts_dec_short
-	jmp	.Lxts_dec_loop
-
-.align	16
-.Lxts_dec_loop:
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-	pshufd	\$0x13, $twtmp, $twres
-	pxor	$twtmp, $twtmp
-	movdqa	@XMM[7], @XMM[$i]
-	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-	pxor	$twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
-___
-    $code.=<<___ if ($i>=2);
-	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-	movdqu	0x60($inp), @XMM[8+6]
-	pxor	@XMM[8+5], @XMM[5]
-	movdqu	0x70($inp), @XMM[8+7]
-	lea	0x80($inp), $inp
-	movdqa	@XMM[7], 0x70(%rsp)
-	pxor	@XMM[8+6], @XMM[6]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	pxor	@XMM[8+7], @XMM[7]
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[6]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[4]
-	movdqu	@XMM[6], 0x20($out)
-	pxor	0x40(%rsp), @XMM[2]
-	movdqu	@XMM[4], 0x30($out)
-	pxor	0x50(%rsp), @XMM[7]
-	movdqu	@XMM[2], 0x40($out)
-	pxor	0x60(%rsp), @XMM[3]
-	movdqu	@XMM[7], 0x50($out)
-	pxor	0x70(%rsp), @XMM[5]
-	movdqu	@XMM[3], 0x60($out)
-	movdqu	@XMM[5], 0x70($out)
-	lea	0x80($out), $out
-
-	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
-	pxor	$twtmp, $twtmp
-	movdqa	.Lxts_magic(%rip), $twmask
-	pcmpgtd	@XMM[7], $twtmp
-	pshufd	\$0x13, $twtmp, $twres
-	pxor	$twtmp, $twtmp
-	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-	pxor	$twres, @XMM[7]
-
-	sub	\$0x80,$len
-	jnc	.Lxts_dec_loop
-
-.Lxts_dec_short:
-	add	\$0x80, $len
-	jz	.Lxts_dec_done
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-	pshufd	\$0x13, $twtmp, $twres
-	pxor	$twtmp, $twtmp
-	movdqa	@XMM[7], @XMM[$i]
-	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
-	pxor	$twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
-	cmp	\$`0x10*$i`,$len
-	je	.Lxts_dec_$i
-___
-    $code.=<<___ if ($i>=2);
-	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-	movdqu	0x60($inp), @XMM[8+6]
-	pxor	@XMM[8+5], @XMM[5]
-	movdqa	@XMM[7], 0x70(%rsp)
-	lea	0x70($inp), $inp
-	pxor	@XMM[8+6], @XMM[6]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[6]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[4]
-	movdqu	@XMM[6], 0x20($out)
-	pxor	0x40(%rsp), @XMM[2]
-	movdqu	@XMM[4], 0x30($out)
-	pxor	0x50(%rsp), @XMM[7]
-	movdqu	@XMM[2], 0x40($out)
-	pxor	0x60(%rsp), @XMM[3]
-	movdqu	@XMM[7], 0x50($out)
-	movdqu	@XMM[3], 0x60($out)
-	lea	0x70($out), $out
-
-	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_dec_done
-.align	16
-.Lxts_dec_6:
-	pxor	@XMM[8+4], @XMM[4]
-	lea	0x60($inp), $inp
-	pxor	@XMM[8+5], @XMM[5]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[6]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[4]
-	movdqu	@XMM[6], 0x20($out)
-	pxor	0x40(%rsp), @XMM[2]
-	movdqu	@XMM[4], 0x30($out)
-	pxor	0x50(%rsp), @XMM[7]
-	movdqu	@XMM[2], 0x40($out)
-	movdqu	@XMM[7], 0x50($out)
-	lea	0x60($out), $out
-
-	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_dec_done
-.align	16
-.Lxts_dec_5:
-	pxor	@XMM[8+3], @XMM[3]
-	lea	0x50($inp), $inp
-	pxor	@XMM[8+4], @XMM[4]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[6]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[4]
-	movdqu	@XMM[6], 0x20($out)
-	pxor	0x40(%rsp), @XMM[2]
-	movdqu	@XMM[4], 0x30($out)
-	movdqu	@XMM[2], 0x40($out)
-	lea	0x50($out), $out
-
-	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_dec_done
-.align	16
-.Lxts_dec_4:
-	pxor	@XMM[8+2], @XMM[2]
-	lea	0x40($inp), $inp
-	pxor	@XMM[8+3], @XMM[3]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[6]
-	movdqu	@XMM[1], 0x10($out)
-	pxor	0x30(%rsp), @XMM[4]
-	movdqu	@XMM[6], 0x20($out)
-	movdqu	@XMM[4], 0x30($out)
-	lea	0x40($out), $out
-
-	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_dec_done
-.align	16
-.Lxts_dec_3:
-	pxor	@XMM[8+1], @XMM[1]
-	lea	0x30($inp), $inp
-	pxor	@XMM[8+2], @XMM[2]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	pxor	0x20(%rsp), @XMM[6]
-	movdqu	@XMM[1], 0x10($out)
-	movdqu	@XMM[6], 0x20($out)
-	lea	0x30($out), $out
-
-	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_dec_done
-.align	16
-.Lxts_dec_2:
-	pxor	@XMM[8+0], @XMM[0]
-	lea	0x20($inp), $inp
-	pxor	@XMM[8+1], @XMM[1]
-	lea	0x80(%rsp), %rax	# pass key schedule
-	mov	%edx, %r10d		# pass rounds
-
-	call	_bsaes_decrypt8
-
-	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	pxor	0x10(%rsp), @XMM[1]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	movdqu	@XMM[1], 0x10($out)
-	lea	0x20($out), $out
-
-	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
-	jmp	.Lxts_dec_done
-.align	16
-.Lxts_dec_1:
-	pxor	@XMM[0], @XMM[8]
-	lea	0x10($inp), $inp
-	movdqa	@XMM[8], 0x20(%rbp)
-	lea	0x20(%rbp), $arg1
-	lea	0x20(%rbp), $arg2
-	lea	($key), $arg3
-	call	asm_AES_decrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
-	#pxor	@XMM[8], @XMM[0]
-	#lea	0x80(%rsp), %rax	# pass key schedule
-	#mov	%edx, %r10d		# pass rounds
-	#call	_bsaes_decrypt8
-	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
-	movdqu	@XMM[0], 0x00($out)	# write output
-	lea	0x10($out), $out
-
-	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
-
-.Lxts_dec_done:
-	and	\$15, %ebx
-	jz	.Lxts_dec_ret
-
-	pxor	$twtmp, $twtmp
-	movdqa	.Lxts_magic(%rip), $twmask
-	pcmpgtd	@XMM[7], $twtmp
-	pshufd	\$0x13, $twtmp, $twres
-	movdqa	@XMM[7], @XMM[6]
-	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
-	pand	$twmask, $twres		# isolate carry and residue
-	movdqu	($inp), @XMM[0]
-	pxor	$twres, @XMM[7]
-
-	lea	0x20(%rbp), $arg1
-	pxor	@XMM[7], @XMM[0]
-	lea	0x20(%rbp), $arg2
-	movdqa	@XMM[0], 0x20(%rbp)
-	lea	($key), $arg3
-	call	asm_AES_decrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[7]
-	mov	$out, %rdx
-	movdqu	@XMM[7], ($out)
-
-.Lxts_dec_steal:
-	movzb	16($inp), %eax
-	movzb	(%rdx), %ecx
-	lea	1($inp), $inp
-	mov	%al, (%rdx)
-	mov	%cl, 16(%rdx)
-	lea	1(%rdx), %rdx
-	sub	\$1,%ebx
-	jnz	.Lxts_dec_steal
-
-	movdqu	($out), @XMM[0]
-	lea	0x20(%rbp), $arg1
-	pxor	@XMM[6], @XMM[0]
-	lea	0x20(%rbp), $arg2
-	movdqa	@XMM[0], 0x20(%rbp)
-	lea	($key), $arg3
-	call	asm_AES_decrypt		# doesn't touch %xmm
-	pxor	0x20(%rbp), @XMM[6]
-	movdqu	@XMM[6], ($out)
-
-.Lxts_dec_ret:
-	lea	(%rsp), %rax
-	pxor	%xmm0, %xmm0
-.Lxts_dec_bzero:			# wipe key schedule [if any]
-	movdqa	%xmm0, 0x00(%rax)
-	movdqa	%xmm0, 0x10(%rax)
-	lea	0x20(%rax), %rax
-	cmp	%rax, %rbp
-	ja	.Lxts_dec_bzero
-
-	lea	0x78(%rbp),%rax
-.cfi_def_cfa	%rax,8
-___
-$code.=<<___ if ($win64);
-	movaps	0x40(%rbp), %xmm6
-	movaps	0x50(%rbp), %xmm7
-	movaps	0x60(%rbp), %xmm8
-	movaps	0x70(%rbp), %xmm9
-	movaps	0x80(%rbp), %xmm10
-	movaps	0x90(%rbp), %xmm11
-	movaps	0xa0(%rbp), %xmm12
-	movaps	0xb0(%rbp), %xmm13
-	movaps	0xc0(%rbp), %xmm14
-	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rax), %rax
-.Lxts_dec_tail:
-___
-$code.=<<___;
-	mov	-48(%rax), %r15
-.cfi_restore	%r15
-	mov	-40(%rax), %r14
-.cfi_restore	%r14
-	mov	-32(%rax), %r13
-.cfi_restore	%r13
-	mov	-24(%rax), %r12
-.cfi_restore	%r12
-	mov	-16(%rax), %rbx
-.cfi_restore	%rbx
-	mov	-8(%rax), %rbp
-.cfi_restore	%rbp
-	lea	(%rax), %rsp		# restore %rsp
-.cfi_def_cfa_register	%rsp
-.Lxts_dec_epilogue:
-	ret
-.cfi_endproc
-.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
-___
-}
-$code.=<<___;
-.type	_bsaes_const,\@object
-.align	64
-_bsaes_const:
-.LM0ISR:	# InvShiftRows constants
-	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISRM0:
-	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
-.LISR:
-	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
-.LBS0:		# bit-slice constants
-	.quad	0x5555555555555555, 0x5555555555555555
-.LBS1:
-	.quad	0x3333333333333333, 0x3333333333333333
-.LBS2:
-	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-.LSR:		# shiftrows constants
-	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0SR:
-	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-.LSWPUP:	# byte-swap upper dword
-	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
-.LSWPUPM0SR:
-	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
-.LADD1:		# counter increment constants
-	.quad	0x0000000000000000, 0x0000000100000000
-.LADD2:
-	.quad	0x0000000000000000, 0x0000000200000000
-.LADD3:
-	.quad	0x0000000000000000, 0x0000000300000000
-.LADD4:
-	.quad	0x0000000000000000, 0x0000000400000000
-.LADD5:
-	.quad	0x0000000000000000, 0x0000000500000000
-.LADD6:
-	.quad	0x0000000000000000, 0x0000000600000000
-.LADD7:
-	.quad	0x0000000000000000, 0x0000000700000000
-.LADD8:
-	.quad	0x0000000000000000, 0x0000000800000000
-.Lxts_magic:
-	.long	0x87,0,1,0
-.Lmasks:
-	.quad	0x0101010101010101, 0x0101010101010101
-	.quad	0x0202020202020202, 0x0202020202020202
-	.quad	0x0404040404040404, 0x0404040404040404
-	.quad	0x0808080808080808, 0x0808080808080808
-.LM0:
-	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
-.L63:
-	.quad	0x6363636363636363, 0x6363636363636363
-.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
-.align	64
-.size	_bsaes_const,.-_bsaes_const
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern	__imp_RtlVirtualUnwind
-.type	se_handler,\@abi-omnipotent
-.align	16
-se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<=prologue label
-	jbe	.Lin_prologue
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_prologue
-
-	mov	8(%r11),%r10d		# HandlerData[2]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=tail label
-	jae	.Lin_tail
-
-	mov	160($context),%rax	# pull context->Rbp
-
-	lea	0x40(%rax),%rsi		# %xmm save area
-	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
-	.long	0xa548f3fc		# cld; rep movsq
-	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
-
-.Lin_tail:
-	mov	-48(%rax),%rbp
-	mov	-40(%rax),%rbx
-	mov	-32(%rax),%r12
-	mov	-24(%rax),%r13
-	mov	-16(%rax),%r14
-	mov	-8(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lin_prologue:
-	mov	%rax,152($context)	# restore context->Rsp
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	pop	%rdi
-	pop	%rsi
-	ret
-.size	se_handler,.-se_handler
-
-.section	.pdata
-.align	4
-___
-$code.=<<___ if ($ecb);
-	.rva	.Lecb_enc_prologue
-	.rva	.Lecb_enc_epilogue
-	.rva	.Lecb_enc_info
-
-	.rva	.Lecb_dec_prologue
-	.rva	.Lecb_dec_epilogue
-	.rva	.Lecb_dec_info
-___
-$code.=<<___;
-	.rva	.Lcbc_dec_prologue
-	.rva	.Lcbc_dec_epilogue
-	.rva	.Lcbc_dec_info
-
-	.rva	.Lctr_enc_prologue
-	.rva	.Lctr_enc_epilogue
-	.rva	.Lctr_enc_info
-
-	.rva	.Lxts_enc_prologue
-	.rva	.Lxts_enc_epilogue
-	.rva	.Lxts_enc_info
-
-	.rva	.Lxts_dec_prologue
-	.rva	.Lxts_dec_epilogue
-	.rva	.Lxts_dec_info
-
-.section	.xdata
-.align	8
-___
-$code.=<<___ if ($ecb);
-.Lecb_enc_info:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
-	.rva	.Lecb_enc_tail
-	.long	0
-.Lecb_dec_info:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
-	.rva	.Lecb_dec_tail
-	.long	0
-___
-$code.=<<___;
-.Lcbc_dec_info:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
-	.rva	.Lcbc_dec_tail
-	.long	0
-.Lctr_enc_info:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
-	.rva	.Lctr_enc_tail
-	.long	0
-.Lxts_enc_info:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
-	.rva	.Lxts_enc_tail
-	.long	0
-.Lxts_dec_info:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
-	.rva	.Lxts_dec_tail
-	.long	0
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
-
-close STDOUT;

+ 26 - 21
libs/openssl/crypto/asn1/a_time.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1999-2017 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1999-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -67,7 +67,7 @@ static void determine_days(struct tm *tm)
     }
     c = y / 100;
     y %= 100;
-    /* Zeller's congruance */
+    /* Zeller's congruence */
     tm->tm_wday = (d + (13 * m) / 5 + y + y / 4 + c / 4 + 5 * c + 6) % 7;
 }
 
@@ -79,7 +79,11 @@ int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *d)
     char *a;
     int n, i, i2, l, o, min_l = 11, strict = 0, end = 6, btz = 5, md;
     struct tm tmp;
-
+#if defined(CHARSET_EBCDIC)
+    const char upper_z = 0x5A, num_zero = 0x30, period = 0x2E, minus = 0x2D, plus = 0x2B;
+#else
+    const char upper_z = 'Z', num_zero = '0', period = '.', minus = '-', plus = '+';
+#endif
     /*
      * ASN1_STRING_FLAG_X509_TIME is used to enforce RFC 5280
      * time string format, in which:
@@ -120,20 +124,20 @@ int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *d)
     if (l < min_l)
         goto err;
     for (i = 0; i < end; i++) {
-        if (!strict && (i == btz) && ((a[o] == 'Z') || (a[o] == '+') || (a[o] == '-'))) {
+        if (!strict && (i == btz) && ((a[o] == upper_z) || (a[o] == plus) || (a[o] == minus))) {
             i++;
             break;
         }
-        if (!ossl_isdigit(a[o]))
+        if (!ascii_isdigit(a[o]))
             goto err;
-        n = a[o] - '0';
+        n = a[o] - num_zero;
         /* incomplete 2-digital number */
         if (++o == l)
             goto err;
 
-        if (!ossl_isdigit(a[o]))
+        if (!ascii_isdigit(a[o]))
             goto err;
-        n = (n * 10) + a[o] - '0';
+        n = (n * 10) + a[o] - num_zero;
         /* no more bytes to read, but we haven't seen time-zone yet */
         if (++o == l)
             goto err;
@@ -185,14 +189,14 @@ int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *d)
      * Optional fractional seconds: decimal point followed by one or more
      * digits.
      */
-    if (d->type == V_ASN1_GENERALIZEDTIME && a[o] == '.') {
+    if (d->type == V_ASN1_GENERALIZEDTIME && a[o] == period) {
         if (strict)
             /* RFC 5280 forbids fractional seconds */
             goto err;
         if (++o == l)
             goto err;
         i = o;
-        while ((o < l) && ossl_isdigit(a[o]))
+        while ((o < l) && ascii_isdigit(a[o]))
             o++;
         /* Must have at least one digit after decimal point */
         if (i == o)
@@ -207,10 +211,10 @@ int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *d)
      * 'o' can point to '\0' is either the subsequent if or the first
      * else if is true.
      */
-    if (a[o] == 'Z') {
+    if (a[o] == upper_z) {
         o++;
-    } else if (!strict && ((a[o] == '+') || (a[o] == '-'))) {
-        int offsign = a[o] == '-' ? 1 : -1;
+    } else if (!strict && ((a[o] == plus) || (a[o] == minus))) {
+        int offsign = a[o] == minus ? 1 : -1;
         int offset = 0;
 
         o++;
@@ -223,13 +227,13 @@ int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *d)
         if (o + 4 != l)
             goto err;
         for (i = end; i < end + 2; i++) {
-            if (!ossl_isdigit(a[o]))
+            if (!ascii_isdigit(a[o]))
                 goto err;
-            n = a[o] - '0';
+            n = a[o] - num_zero;
             o++;
-            if (!ossl_isdigit(a[o]))
+            if (!ascii_isdigit(a[o]))
                 goto err;
-            n = (n * 10) + a[o] - '0';
+            n = (n * 10) + a[o] - num_zero;
             i2 = (d->type == V_ASN1_UTCTIME) ? i + 1 : i;
             if ((n < min[i2]) || (n > max[i2]))
                 goto err;
@@ -300,7 +304,7 @@ ASN1_TIME *asn1_time_from_tm(ASN1_TIME *s, struct tm *ts, int type)
                                     ts->tm_mday, ts->tm_hour, ts->tm_min,
                                     ts->tm_sec);
 
-#ifdef CHARSET_EBCDIC_not
+#ifdef CHARSET_EBCDIC
     ebcdic2ascii(tmps->data, tmps->data, tmps->length);
 #endif
     return tmps;
@@ -467,6 +471,7 @@ int ASN1_TIME_print(BIO *bp, const ASN1_TIME *tm)
     char *v;
     int gmt = 0, l;
     struct tm stm;
+    const char upper_z = 0x5A, period = 0x2E;
 
     if (!asn1_time_to_tm(&stm, tm)) {
         /* asn1_time_to_tm will check the time type */
@@ -475,7 +480,7 @@ int ASN1_TIME_print(BIO *bp, const ASN1_TIME *tm)
 
     l = tm->length;
     v = (char *)tm->data;
-    if (v[l - 1] == 'Z')
+    if (v[l - 1] == upper_z)
         gmt = 1;
 
     if (tm->type == V_ASN1_GENERALIZEDTIME) {
@@ -486,10 +491,10 @@ int ASN1_TIME_print(BIO *bp, const ASN1_TIME *tm)
          * Try to parse fractional seconds. '14' is the place of
          * 'fraction point' in a GeneralizedTime string.
          */
-        if (tm->length > 15 && v[14] == '.') {
+        if (tm->length > 15 && v[14] == period) {
             f = &v[14];
             f_len = 1;
-            while (14 + f_len < l && ossl_isdigit(f[f_len]))
+            while (14 + f_len < l && ascii_isdigit(f[f_len]))
                 ++f_len;
         }
 

+ 7 - 3
libs/openssl/crypto/asn1/a_type.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -15,7 +15,9 @@
 
 int ASN1_TYPE_get(const ASN1_TYPE *a)
 {
-    if ((a->value.ptr != NULL) || (a->type == V_ASN1_NULL))
+    if (a->type == V_ASN1_BOOLEAN
+            || a->type == V_ASN1_NULL
+            || a->value.ptr != NULL)
         return a->type;
     else
         return 0;
@@ -23,7 +25,9 @@ int ASN1_TYPE_get(const ASN1_TYPE *a)
 
 void ASN1_TYPE_set(ASN1_TYPE *a, int type, void *value)
 {
-    if (a->value.ptr != NULL) {
+    if (a->type != V_ASN1_BOOLEAN
+            && a->type != V_ASN1_NULL
+            && a->value.ptr != NULL) {
         ASN1_TYPE **tmp_a = &a;
         asn1_primitive_free((ASN1_VALUE **)tmp_a, NULL, 0);
     }

+ 15 - 4
libs/openssl/crypto/asn1/x_bignum.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2000-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -130,9 +130,20 @@ static int bn_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
 static int bn_secure_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len,
                          int utype, char *free_cont, const ASN1_ITEM *it)
 {
-    if (!*pval)
-        bn_secure_new(pval, it);
-    return bn_c2i(pval, cont, len, utype, free_cont, it);
+    int ret;
+    BIGNUM *bn;
+
+    if (!*pval && !bn_secure_new(pval, it))
+        return 0;
+
+    ret = bn_c2i(pval, cont, len, utype, free_cont, it);
+    if (!ret)
+        return 0;
+
+    /* Set constant-time flag for all secure BIGNUMS */
+    bn = (BIGNUM *)*pval;
+    BN_set_flags(bn, BN_FLG_CONSTTIME);
+    return ret;
 }
 
 static int bn_print(BIO *out, ASN1_VALUE **pval, const ASN1_ITEM *it,

+ 20 - 6
libs/openssl/crypto/bio/b_addr.c

@@ -675,7 +675,7 @@ int BIO_lookup_ex(const char *host, const char *service, int lookup_type,
 
     if (1) {
 #ifdef AI_PASSIVE
-        int gai_ret = 0;
+        int gai_ret = 0, old_ret = 0;
         struct addrinfo hints;
 
         memset(&hints, 0, sizeof(hints));
@@ -683,12 +683,12 @@ int BIO_lookup_ex(const char *host, const char *service, int lookup_type,
         hints.ai_family = family;
         hints.ai_socktype = socktype;
         hints.ai_protocol = protocol;
-#ifdef AI_ADDRCONFIG
-#ifdef AF_UNSPEC
+# ifdef AI_ADDRCONFIG
+#  ifdef AF_UNSPEC
         if (family == AF_UNSPEC)
-#endif
+#  endif
             hints.ai_flags |= AI_ADDRCONFIG;
-#endif
+# endif
 
         if (lookup_type == BIO_LOOKUP_SERVER)
             hints.ai_flags |= AI_PASSIVE;
@@ -696,19 +696,33 @@ int BIO_lookup_ex(const char *host, const char *service, int lookup_type,
         /* Note that |res| SHOULD be a 'struct addrinfo **' thanks to
          * macro magic in bio_lcl.h
          */
+      retry:
         switch ((gai_ret = getaddrinfo(host, service, &hints, res))) {
 # ifdef EAI_SYSTEM
         case EAI_SYSTEM:
             SYSerr(SYS_F_GETADDRINFO, get_last_socket_error());
             BIOerr(BIO_F_BIO_LOOKUP_EX, ERR_R_SYS_LIB);
             break;
+# endif
+# ifdef EAI_MEMORY
+        case EAI_MEMORY:
+            BIOerr(BIO_F_BIO_LOOKUP_EX, ERR_R_MALLOC_FAILURE);
+            break;
 # endif
         case 0:
             ret = 1;             /* Success */
             break;
         default:
+# if defined(AI_ADDRCONFIG) && defined(AI_NUMERICHOST)
+            if (hints.ai_flags & AI_ADDRCONFIG) {
+                hints.ai_flags &= ~AI_ADDRCONFIG;
+                hints.ai_flags |= AI_NUMERICHOST;
+                old_ret = gai_ret;
+                goto retry;
+            }
+# endif
             BIOerr(BIO_F_BIO_LOOKUP_EX, ERR_R_SYS_LIB);
-            ERR_add_error_data(1, gai_strerror(gai_ret));
+            ERR_add_error_data(1, gai_strerror(old_ret ? old_ret : gai_ret));
             break;
         }
     } else {

+ 2 - 2
libs/openssl/crypto/bio/bss_dgram.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2005-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -784,7 +784,7 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
      * reasons. When BIO_CTRL_DGRAM_SET_PEEK_MODE was first defined its value
      * was incorrectly clashing with BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE. The
      * value has been updated to a non-clashing value. However to preserve
-     * binary compatiblity we now respond to both the old value and the new one
+     * binary compatibility we now respond to both the old value and the new one
      */
     case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
     case BIO_CTRL_DGRAM_SET_PEEK_MODE:

+ 28 - 33
libs/openssl/crypto/bio/bss_file.c

@@ -7,10 +7,7 @@
  * https://www.openssl.org/source/license.html
  */
 
-#ifndef HEADER_BSS_FILE_C
-# define HEADER_BSS_FILE_C
-
-# if defined(__linux) || defined(__sun) || defined(__hpux)
+#if defined(__linux) || defined(__sun) || defined(__hpux)
 /*
  * Following definition aliases fopen to fopen64 on above mentioned
  * platforms. This makes it possible to open and sequentially access files
@@ -23,17 +20,17 @@
  * of 32-bit platforms which allow for sequential access of large files
  * without extra "magic" comprise *BSD, Darwin, IRIX...
  */
-#  ifndef _FILE_OFFSET_BITS
-#   define _FILE_OFFSET_BITS 64
-#  endif
+# ifndef _FILE_OFFSET_BITS
+#  define _FILE_OFFSET_BITS 64
 # endif
+#endif
 
-# include <stdio.h>
-# include <errno.h>
-# include "bio_lcl.h"
-# include <openssl/err.h>
+#include <stdio.h>
+#include <errno.h>
+#include "bio_lcl.h"
+#include <openssl/err.h>
 
-# if !defined(OPENSSL_NO_STDIO)
+#if !defined(OPENSSL_NO_STDIO)
 
 static int file_write(BIO *h, const char *buf, int num);
 static int file_read(BIO *h, char *buf, int size);
@@ -72,9 +69,9 @@ BIO *BIO_new_file(const char *filename, const char *mode)
         SYSerr(SYS_F_FOPEN, get_last_sys_error());
         ERR_add_error_data(5, "fopen('", filename, "','", mode, "')");
         if (errno == ENOENT
-# ifdef ENXIO
+#ifdef ENXIO
             || errno == ENXIO
-# endif
+#endif
             )
             BIOerr(BIO_F_BIO_NEW_FILE, BIO_R_NO_SUCH_FILE);
         else
@@ -212,33 +209,33 @@ static long file_ctrl(BIO *b, int cmd, long num, void *ptr)
         b->shutdown = (int)num & BIO_CLOSE;
         b->ptr = ptr;
         b->init = 1;
-#  if BIO_FLAGS_UPLINK!=0
-#   if defined(__MINGW32__) && defined(__MSVCRT__) && !defined(_IOB_ENTRIES)
-#    define _IOB_ENTRIES 20
-#   endif
+# if BIO_FLAGS_UPLINK!=0
+#  if defined(__MINGW32__) && defined(__MSVCRT__) && !defined(_IOB_ENTRIES)
+#   define _IOB_ENTRIES 20
+#  endif
         /* Safety net to catch purely internal BIO_set_fp calls */
-#   if defined(_MSC_VER) && _MSC_VER>=1900
+#  if defined(_MSC_VER) && _MSC_VER>=1900
         if (ptr == stdin || ptr == stdout || ptr == stderr)
             BIO_clear_flags(b, BIO_FLAGS_UPLINK);
-#   elif defined(_IOB_ENTRIES)
+#  elif defined(_IOB_ENTRIES)
         if ((size_t)ptr >= (size_t)stdin &&
             (size_t)ptr < (size_t)(stdin + _IOB_ENTRIES))
             BIO_clear_flags(b, BIO_FLAGS_UPLINK);
-#   endif
 #  endif
-#  ifdef UP_fsetmod
+# endif
+# ifdef UP_fsetmod
         if (b->flags & BIO_FLAGS_UPLINK)
             UP_fsetmod(b->ptr, (char)((num & BIO_FP_TEXT) ? 't' : 'b'));
         else
-#  endif
+# endif
         {
-#  if defined(OPENSSL_SYS_WINDOWS)
+# if defined(OPENSSL_SYS_WINDOWS)
             int fd = _fileno((FILE *)ptr);
             if (num & BIO_FP_TEXT)
                 _setmode(fd, _O_TEXT);
             else
                 _setmode(fd, _O_BINARY);
-#  elif defined(OPENSSL_SYS_MSDOS)
+# elif defined(OPENSSL_SYS_MSDOS)
             int fd = fileno((FILE *)ptr);
             /* Set correct text/binary mode */
             if (num & BIO_FP_TEXT)
@@ -251,11 +248,11 @@ static long file_ctrl(BIO *b, int cmd, long num, void *ptr)
                 } else
                     _setmode(fd, _O_BINARY);
             }
-#  elif defined(OPENSSL_SYS_WIN32_CYGWIN)
+# elif defined(OPENSSL_SYS_WIN32_CYGWIN)
             int fd = fileno((FILE *)ptr);
             if (!(num & BIO_FP_TEXT))
                 setmode(fd, O_BINARY);
-#  endif
+# endif
         }
         break;
     case BIO_C_SET_FILENAME:
@@ -277,15 +274,15 @@ static long file_ctrl(BIO *b, int cmd, long num, void *ptr)
             ret = 0;
             break;
         }
-#  if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS)
+# if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS)
         if (!(num & BIO_FP_TEXT))
             OPENSSL_strlcat(p, "b", sizeof(p));
         else
             OPENSSL_strlcat(p, "t", sizeof(p));
-#  elif defined(OPENSSL_SYS_WIN32_CYGWIN)
+# elif defined(OPENSSL_SYS_WIN32_CYGWIN)
         if (!(num & BIO_FP_TEXT))
             OPENSSL_strlcat(p, "b", sizeof(p));
-#  endif
+# endif
         fp = openssl_fopen(ptr, p);
         if (fp == NULL) {
             SYSerr(SYS_F_FOPEN, get_last_sys_error());
@@ -422,6 +419,4 @@ BIO *BIO_new_file(const char *filename, const char *mode)
     return NULL;
 }
 
-# endif                         /* OPENSSL_NO_STDIO */
-
-#endif                          /* HEADER_BSS_FILE_C */
+#endif                         /* OPENSSL_NO_STDIO */

+ 1 - 3
libs/openssl/crypto/bio/bss_mem.c

@@ -259,9 +259,7 @@ static long mem_ctrl(BIO *b, int cmd, long num, void *ptr)
         bm = bbm->buf;
         if (bm->data != NULL) {
             if (!(b->flags & BIO_FLAGS_MEM_RDONLY)) {
-                if (b->flags & BIO_FLAGS_NONCLEAR_RST) {
-                    bm->length = bm->max;
-                } else {
+                if (!(b->flags & BIO_FLAGS_NONCLEAR_RST)) {
                     memset(bm->data, 0, bm->max);
                     bm->length = 0;
                 }

+ 1 - 1
libs/openssl/crypto/bn/asm/mips.pl

@@ -801,7 +801,7 @@ $code.=<<___;
 #if 0
 /*
  * The bn_div_3_words entry point is re-used for constant-time interface.
- * Implementation is retained as hystorical reference.
+ * Implementation is retained as historical reference.
  */
 .align 5
 .globl	bn_div_3_words

+ 2 - 2
libs/openssl/crypto/bn/bn_div.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -258,7 +258,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
  *
  *     - availability of constant-time bn_div_3_words;
  *     - dividend is at least as "wide" as divisor, limb-wise, zero-padded
- *       if so requied, which shouldn't be a privacy problem, because
+ *       if so required, which shouldn't be a privacy problem, because
  *       divisor's length is considered public;
  */
 int bn_div_fixed_top(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,

+ 2 - 2
libs/openssl/crypto/bn/bn_lcl.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -295,7 +295,7 @@ struct bn_gencb_st {
                  (b) >  23 ? 3 : 1)
 
 /*
- * BN_mod_exp_mont_conttime is based on the assumption that the L1 data cache
+ * BN_mod_exp_mont_consttime is based on the assumption that the L1 data cache
  * line width of the target processor is at least the following value.
  */
 # define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH      ( 64 )

+ 70 - 31
libs/openssl/crypto/bn/bn_lib.c

@@ -132,20 +132,66 @@ int BN_num_bits_word(BN_ULONG l)
     return bits;
 }
 
+/*
+ * Compute the bit length of `a` by walking all `a->dmax` limbs and selecting
+ * each limb's contribution with masks instead of branching on `a->top`:
+ * limbs below index `a->top - 1` count BN_BITS2 bits each, the limb at
+ * `a->top - 1` counts BN_num_bits_word() bits, and limbs above it count
+ * nothing. The result is masked to 0 when `a->top` is 0.
+ *
+ * This function still leaks `a->dmax`: it is the caller's responsibility to
+ * expand the input `a` in advance to a public length.
+ */
+static ossl_inline
+int bn_num_bits_consttime(const BIGNUM *a)
+{
+    int j, ret;
+    unsigned int mask, past_i;
+    int i = a->top - 1;
+    bn_check_top(a);
+
+    for (j = 0, past_i = 0, ret = 0; j < a->dmax; j++) {
+        mask = constant_time_eq_int(i, j); /* 0xff..ff if i==j, 0x0 otherwise */
+
+        ret += BN_BITS2 & (~mask & ~past_i); /* limb below the top one */
+        ret += BN_num_bits_word(a->d[j]) & mask; /* the top limb itself */
+
+        past_i |= mask; /* past_i will become 0xff..ff after i==j */
+    }
+
+    /*
+     * if BN_is_zero(a) => i is -1 and ret contains garbage, so we mask the
+     * final result.
+     */
+    mask = ~(constant_time_eq_int(i, ((int)-1)));
+
+    return ret & mask;
+}
+
 int BN_num_bits(const BIGNUM *a)
 {
     int i = a->top - 1; /* index of the top limb in use */
     bn_check_top(a);
 
+    if (a->flags & BN_FLG_CONSTTIME) {
+        /*
+         * We assume that BIGNUMs flagged as CONSTTIME have also been expanded
+         * so that a->dmax is not leaking secret information.
+         *
+         * In other words, it's the caller's responsibility to ensure `a` has
+         * been preallocated in advance to a public length if we hit this
+         * branch.
+         *
+         */
+        return bn_num_bits_consttime(a);
+    }
+
     if (BN_is_zero(a))
         return 0;
+
     /* BN_BITS2 bits for each limb below the top one, plus the top limb's bits */
     return ((i * BN_BITS2) + BN_num_bits_word(a->d[i]));
 }
 
-static void bn_free_d(BIGNUM *a)
+static void bn_free_d(BIGNUM *a, int clear)
 {
     if (BN_get_flags(a, BN_FLG_SECURE))
-        OPENSSL_secure_free(a->d);
+        OPENSSL_secure_clear_free(a->d, a->dmax * sizeof(a->d[0]));
+    else if (clear != 0)
+        OPENSSL_clear_free(a->d, a->dmax * sizeof(a->d[0]));
     else
         OPENSSL_free(a->d);
 }
@@ -155,10 +201,8 @@ void BN_clear_free(BIGNUM *a)
 {
     if (a == NULL)
         return;
-    if (a->d != NULL && !BN_get_flags(a, BN_FLG_STATIC_DATA)) {
-        OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
-        bn_free_d(a);
-    }
+    if (a->d != NULL && !BN_get_flags(a, BN_FLG_STATIC_DATA))
+        bn_free_d(a, 1);
     if (BN_get_flags(a, BN_FLG_MALLOCED)) {
         OPENSSL_cleanse(a, sizeof(*a));
         OPENSSL_free(a);
@@ -170,7 +214,7 @@ void BN_free(BIGNUM *a)
     if (a == NULL)
         return;
     if (!BN_get_flags(a, BN_FLG_STATIC_DATA))
-        bn_free_d(a);
+        bn_free_d(a, 0);
     if (a->flags & BN_FLG_MALLOCED)
         OPENSSL_free(a);
 }
@@ -248,10 +292,8 @@ BIGNUM *bn_expand2(BIGNUM *b, int words)
         BN_ULONG *a = bn_expand_internal(b, words);
         if (!a)
             return NULL;
-        if (b->d) {
-            OPENSSL_cleanse(b->d, b->dmax * sizeof(b->d[0]));
-            bn_free_d(b);
-        }
+        if (b->d != NULL)
+            bn_free_d(b, 1);
         b->d = a;
         b->dmax = words;
     }
@@ -416,8 +458,11 @@ BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
     return ret;
 }
 
+typedef enum {big, little} endianess_t;
+
 /* ignore negative */
-static int bn2binpad(const BIGNUM *a, unsigned char *to, int tolen)
+static
+int bn2binpad(const BIGNUM *a, unsigned char *to, int tolen, endianess_t endianess)
 {
     int n;
     size_t i, lasti, j, atop, mask;
@@ -449,10 +494,17 @@ static int bn2binpad(const BIGNUM *a, unsigned char *to, int tolen)
 
     lasti = atop - 1;
     atop = a->top * BN_BYTES;
-    for (i = 0, j = 0, to += tolen; j < (size_t)tolen; j++) {
+    if (endianess == big)
+        to += tolen; /* start from the end of the buffer */
+    for (i = 0, j = 0; j < (size_t)tolen; j++) {
+        unsigned char val;
         l = a->d[i / BN_BYTES];
         mask = 0 - ((j - atop) >> (8 * sizeof(i) - 1));
-        *--to = (unsigned char)(l >> (8 * (i % BN_BYTES)) & mask);
+        val = (unsigned char)(l >> (8 * (i % BN_BYTES)) & mask);
+        if (endianess == big)
+            *--to = val;
+        else
+            *to++ = val;
         i += (i - lasti) >> (8 * sizeof(i) - 1); /* stay on last limb */
     }
 
@@ -463,12 +515,12 @@ int BN_bn2binpad(const BIGNUM *a, unsigned char *to, int tolen)
 {
     if (tolen < 0)
         return -1;
-    return bn2binpad(a, to, tolen);
+    return bn2binpad(a, to, tolen, big);
 }
 
 int BN_bn2bin(const BIGNUM *a, unsigned char *to)
 {
-    return bn2binpad(a, to, -1);
+    return bn2binpad(a, to, -1, big);
 }
 
 BIGNUM *BN_lebin2bn(const unsigned char *s, int len, BIGNUM *ret)
@@ -520,22 +572,9 @@ BIGNUM *BN_lebin2bn(const unsigned char *s, int len, BIGNUM *ret)
 
 int BN_bn2lebinpad(const BIGNUM *a, unsigned char *to, int tolen)
 {
-    int i;
-    BN_ULONG l;
-    bn_check_top(a);
-    i = BN_num_bytes(a);
-    if (tolen < i)
+    if (tolen < 0)
         return -1;
-    /* Add trailing zeroes if necessary */
-    if (tolen > i)
-        memset(to + i, 0, tolen - i);
-    to += i;
-    while (i--) {
-        l = a->d[i / BN_BYTES];
-        to--;
-        *to = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
-    }
-    return tolen;
+    return bn2binpad(a, to, tolen, little);
 }
 
 int BN_ucmp(const BIGNUM *a, const BIGNUM *b)

+ 6 - 2
libs/openssl/crypto/bn/bn_prime.c

@@ -63,8 +63,12 @@ int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe,
         /* There are no prime numbers this small. */
         BNerr(BN_F_BN_GENERATE_PRIME_EX, BN_R_BITS_TOO_SMALL);
         return 0;
-    } else if (bits == 2 && safe) {
-        /* The smallest safe prime (7) is three bits. */
+    } else if (add == NULL && safe && bits < 6 && bits != 3) {
+        /*
+         * The smallest safe prime (7) is three bits.
+         * But the following two safe primes with less than 6 bits (11, 23)
+         * are unreachable for BN_rand with BN_RAND_TOP_TWO.
+         */
         BNerr(BN_F_BN_GENERATE_PRIME_EX, BN_R_BITS_TOO_SMALL);
         return 0;
     }

+ 2 - 5
libs/openssl/crypto/bn/bn_rand.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -225,8 +225,7 @@ int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range,
         goto err;
 
     /* We copy |priv| into a local buffer to avoid exposing its length. */
-    todo = sizeof(priv->d[0]) * priv->top;
-    if (todo > sizeof(private_bytes)) {
+    if (BN_bn2binpad(priv, private_bytes, sizeof(private_bytes)) < 0) {
         /*
          * No reasonable DSA or ECDSA key should have a private key this
          * large and we don't handle this case in order to avoid leaking the
@@ -235,8 +234,6 @@ int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range,
         BNerr(BN_F_BN_GENERATE_DSA_NONCE, BN_R_PRIVATE_KEY_TOO_LARGE);
         goto err;
     }
-    memcpy(private_bytes, priv->d, todo);
-    memset(private_bytes + todo, 0, sizeof(private_bytes) - todo);
 
     for (done = 0; done < num_k_bytes;) {
         if (RAND_priv_bytes(random_bytes, sizeof(random_bytes)) != 1)

+ 3 - 2
libs/openssl/crypto/bn/bn_sqrt.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2000-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2000-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -125,7 +125,8 @@ BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
          *         = a.
          *
          * (This is due to A.O.L. Atkin,
-         * <URL: http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
+         * Subject: Square Roots and Cognate Matters modulo p=8n+5.
+         * URL: https://listserv.nodak.edu/cgi-bin/wa.exe?A2=ind9211&L=NMBRTHRY&P=4026
          * November 1992.)
          */
 

+ 134 - 2
libs/openssl/crypto/cms/cms_att.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2008-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -13,6 +13,56 @@
 #include <openssl/err.h>
 #include <openssl/cms.h>
 #include "cms_lcl.h"
+#include "internal/nelem.h"
+
+/*-
+ * Attribute flags.
+ * CMS attribute restrictions are discussed in
+ *  - RFC 5652 Section 11.
+ * ESS attribute restrictions are discussed in
+ *  - RFC 2634 Section 1.3.4  AND
+ *  - RFC 5035 Section 5.4
+ */
+/* This is a signed attribute */
+#define CMS_ATTR_F_SIGNED         0x01
+/* This is an unsigned attribute */
+#define CMS_ATTR_F_UNSIGNED       0x02
+/* Must be present if there are any other attributes of the same type */
+#define CMS_ATTR_F_REQUIRED_COND  0x10
+/* There can only be one instance of this attribute */
+#define CMS_ATTR_F_ONLY_ONE       0x20
+/* The Attribute's value must have exactly one entry */
+#define CMS_ATTR_F_ONE_ATTR_VALUE 0x40
+
+/* Attribute rules for the different attributes */
+static const struct {
+    int nid;   /* The attribute id */
+    int flags; /* Bitwise OR of the CMS_ATTR_F_* values that apply */
+} cms_attribute_properties[] = {
+    /* See RFC 5652 Section 11 */
+    { NID_pkcs9_contentType, CMS_ATTR_F_SIGNED
+                             | CMS_ATTR_F_ONLY_ONE
+                             | CMS_ATTR_F_ONE_ATTR_VALUE
+                             | CMS_ATTR_F_REQUIRED_COND },
+    { NID_pkcs9_messageDigest, CMS_ATTR_F_SIGNED
+                               | CMS_ATTR_F_ONLY_ONE
+                               | CMS_ATTR_F_ONE_ATTR_VALUE
+                               | CMS_ATTR_F_REQUIRED_COND },
+    { NID_pkcs9_signingTime, CMS_ATTR_F_SIGNED
+                             | CMS_ATTR_F_ONLY_ONE
+                             | CMS_ATTR_F_ONE_ATTR_VALUE },
+    { NID_pkcs9_countersignature, CMS_ATTR_F_UNSIGNED },
+    /* ESS */
+    { NID_id_smime_aa_signingCertificate, CMS_ATTR_F_SIGNED
+                                          | CMS_ATTR_F_ONLY_ONE
+                                          | CMS_ATTR_F_ONE_ATTR_VALUE },
+    { NID_id_smime_aa_signingCertificateV2, CMS_ATTR_F_SIGNED
+                                            | CMS_ATTR_F_ONLY_ONE
+                                            | CMS_ATTR_F_ONE_ATTR_VALUE },
+    { NID_id_smime_aa_receiptRequest, CMS_ATTR_F_SIGNED
+                                      | CMS_ATTR_F_ONLY_ONE
+                                      | CMS_ATTR_F_ONE_ATTR_VALUE }
+};
 
 /* CMS SignedData Attribute utilities */
 
@@ -149,4 +199,86 @@ void *CMS_unsigned_get0_data_by_OBJ(CMS_SignerInfo *si, ASN1_OBJECT *oid,
     return X509at_get0_data_by_OBJ(si->unsignedAttrs, oid, lastpos, type);
 }
 
-/* Specific attribute cases */
+/*
+ * Retrieve an attribute by nid from a stack of attributes starting at index
+ * *lastpos + 1.
+ * Returns the attribute or NULL if there is no attribute.
+ * If an attribute was found *lastpos returns the index of the found attribute.
+ * If no attribute was found *lastpos is left unchanged.
+ */
+static X509_ATTRIBUTE *cms_attrib_get(int nid,
+                                      const STACK_OF(X509_ATTRIBUTE) *attrs,
+                                      int *lastpos)
+{
+    X509_ATTRIBUTE *at;
+    int loc;
+
+    loc = X509at_get_attr_by_NID(attrs, nid, *lastpos);
+    if (loc < 0)
+        return NULL;
+
+    at = X509at_get_attr(attrs, loc);
+    *lastpos = loc;
+    return at;
+}
+
+static int cms_check_attribute(int nid, int flags, int type,
+                               const STACK_OF(X509_ATTRIBUTE) *attrs,
+                               int have_attrs)
+{
+    int lastpos = -1; /* -1 makes the first lookup start at index 0 */
+    X509_ATTRIBUTE *at = cms_attrib_get(nid, attrs, &lastpos);
+
+    if (at != NULL) {
+        int count = X509_ATTRIBUTE_count(at);
+
+        /*
+         * Is this attribute allowed? |type| is the kind of set being checked
+         * (CMS_ATTR_F_SIGNED or CMS_ATTR_F_UNSIGNED) and must be enabled in
+         * the attribute's |flags|. Any violation below returns 0.
+         */
+        if (((flags & type) == 0)
+            /*
+             * check if multiple attributes of the same type are allowed; the
+             * second lookup resumes after the first match
+             */
+            || (((flags & CMS_ATTR_F_ONLY_ONE) != 0)
+                && cms_attrib_get(nid, attrs, &lastpos) != NULL)
+            /* Check if attribute should have exactly one value in its set */
+            || (((flags & CMS_ATTR_F_ONE_ATTR_VALUE) != 0)
+                && count != 1)
+            /* There should be at least one value */
+            || count == 0)
+        return 0;
+    } else {
+        /*
+         * fail if a required attribute is missing: it is only mandatory when
+         * the set being checked is non-empty (|have_attrs|) and is the kind
+         * of set (|type|) the attribute belongs to
+         */
+        if (have_attrs
+            && ((flags & CMS_ATTR_F_REQUIRED_COND) != 0)
+            && (flags & type) != 0)
+            return 0;
+    }
+    return 1;
+}
+
+/*
+ * Check that the signerinfo attributes obey the attribute rules which includes
+ * the following checks
+ * - If any signed attributes exist then there must be a Content Type
+ * and Message Digest attribute in the signed attributes.
+ * - The countersignature attribute is an optional unsigned attribute only.
+ * - Content Type, Message Digest, and Signing time attributes are signed
+ *     attributes. Only one instance of each is allowed, with each of these
+ *     attributes containing a single attribute value in its set.
+ *
+ * Each entry of cms_attribute_properties is checked against both the signed
+ * and the unsigned attribute sets.
+ * Returns 1 if every rule holds; otherwise raises CMS_R_ATTRIBUTE_ERROR and
+ * returns 0.
+ */
+int CMS_si_check_attributes(const CMS_SignerInfo *si)
+{
+    int i;
+    int have_signed_attrs = (CMS_signed_get_attr_count(si) > 0);
+    int have_unsigned_attrs = (CMS_unsigned_get_attr_count(si) > 0);
+
+    for (i = 0; i < (int)OSSL_NELEM(cms_attribute_properties); ++i) {
+        int nid = cms_attribute_properties[i].nid;
+        int flags = cms_attribute_properties[i].flags;
+
+        if (!cms_check_attribute(nid, flags, CMS_ATTR_F_SIGNED,
+                                 si->signedAttrs, have_signed_attrs)
+            || !cms_check_attribute(nid, flags, CMS_ATTR_F_UNSIGNED,
+                                    si->unsignedAttrs, have_unsigned_attrs)) {
+            CMSerr(CMS_F_CMS_SI_CHECK_ATTRIBUTES, CMS_R_ATTRIBUTE_ERROR);
+            return 0;
+        }
+    }
+    return 1;
+}

+ 18 - 2
libs/openssl/crypto/cms/cms_env.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2008-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2008-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -363,6 +363,7 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
     unsigned char *ek = NULL;
     size_t eklen;
     int ret = 0;
+    size_t fixlen = 0;
     CMS_EncryptedContentInfo *ec;
     ec = cms->d.envelopedData->encryptedContentInfo;
 
@@ -371,6 +372,19 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
         return 0;
     }
 
+    if (cms->d.envelopedData->encryptedContentInfo->havenocert
+            && !cms->d.envelopedData->encryptedContentInfo->debug) {
+        X509_ALGOR *calg = ec->contentEncryptionAlgorithm;
+        const EVP_CIPHER *ciph = EVP_get_cipherbyobj(calg->algorithm);
+
+        if (ciph == NULL) {
+            CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_UNKNOWN_CIPHER);
+            return 0;
+        }
+
+        fixlen = EVP_CIPHER_key_length(ciph);
+    }
+
     ktri->pctx = EVP_PKEY_CTX_new(pkey, NULL);
     if (ktri->pctx == NULL)
         return 0;
@@ -401,7 +415,9 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
 
     if (EVP_PKEY_decrypt(ktri->pctx, ek, &eklen,
                          ktri->encryptedKey->data,
-                         ktri->encryptedKey->length) <= 0) {
+                         ktri->encryptedKey->length) <= 0
+            || eklen == 0
+            || (fixlen != 0 && eklen != fixlen)) {
         CMSerr(CMS_F_CMS_RECIPIENTINFO_KTRI_DECRYPT, CMS_R_CMS_LIB);
         goto err;
     }

+ 4 - 1
libs/openssl/crypto/cms/cms_err.c

@@ -1,6 +1,6 @@
 /*
  * Generated by util/mkerr.pl DO NOT EDIT
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -146,6 +146,8 @@ static const ERR_STRING_DATA CMS_str_functs[] = {
     {ERR_PACK(ERR_LIB_CMS, CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT, 0),
      "CMS_SignerInfo_verify_content"},
     {ERR_PACK(ERR_LIB_CMS, CMS_F_CMS_SIGN_RECEIPT, 0), "CMS_sign_receipt"},
+    {ERR_PACK(ERR_LIB_CMS, CMS_F_CMS_SI_CHECK_ATTRIBUTES, 0),
+     "CMS_si_check_attributes"},
     {ERR_PACK(ERR_LIB_CMS, CMS_F_CMS_STREAM, 0), "CMS_stream"},
     {ERR_PACK(ERR_LIB_CMS, CMS_F_CMS_UNCOMPRESS, 0), "CMS_uncompress"},
     {ERR_PACK(ERR_LIB_CMS, CMS_F_CMS_VERIFY, 0), "CMS_verify"},
@@ -155,6 +157,7 @@ static const ERR_STRING_DATA CMS_str_functs[] = {
 
 static const ERR_STRING_DATA CMS_str_reasons[] = {
     {ERR_PACK(ERR_LIB_CMS, 0, CMS_R_ADD_SIGNER_ERROR), "add signer error"},
+    {ERR_PACK(ERR_LIB_CMS, 0, CMS_R_ATTRIBUTE_ERROR), "attribute error"},
     {ERR_PACK(ERR_LIB_CMS, 0, CMS_R_CERTIFICATE_ALREADY_PRESENT),
     "certificate already present"},
     {ERR_PACK(ERR_LIB_CMS, 0, CMS_R_CERTIFICATE_HAS_NO_KEYID),

+ 5 - 4
libs/openssl/crypto/cms/cms_lcl.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 2008-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2008-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -125,6 +125,8 @@ struct CMS_EncryptedContentInfo_st {
     size_t keylen;
     /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */
     int debug;
+    /* Set to 1 if we have no cert and need extra safety measures for MMA */
+    int havenocert;
 };
 
 struct CMS_RecipientInfo_st {
@@ -317,8 +319,6 @@ struct CMS_OtherKeyAttribute_st {
 
 /* ESS structures */
 
-# ifdef HEADER_X509V3_H
-
 struct CMS_ReceiptRequest_st {
     ASN1_OCTET_STRING *signedContentIdentifier;
     CMS_ReceiptsFrom *receiptsFrom;
@@ -332,7 +332,6 @@ struct CMS_ReceiptsFrom_st {
         STACK_OF(GENERAL_NAMES) *receiptList;
     } d;
 };
-# endif
 
 struct CMS_Receipt_st {
     int32_t version;
@@ -416,6 +415,8 @@ int cms_RecipientInfo_kari_encrypt(CMS_ContentInfo *cms,
 /* PWRI routines */
 int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
                                  int en_de);
+/* SignerInfo routines */
+int CMS_si_check_attributes(const CMS_SignerInfo *si);
 
 DECLARE_ASN1_ITEM(CMS_CertificateChoices)
 DECLARE_ASN1_ITEM(CMS_DigestedData)

+ 32 - 6
libs/openssl/crypto/cms/cms_sd.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2008-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -109,6 +109,27 @@ static void cms_sd_set_version(CMS_SignedData *sd)
 
 }
 
+/*
+ * RFC 5652 Section 11.1 Content Type
+ * The content-type attribute within signed-data MUST
+ *   1) be present if there are signed attributes
+ *   2) match the content type in the signed-data,
+ *   3) be a signed attribute.
+ *   4) not have more than one copy of the attribute.
+ *
+ * Note that since the CMS_SignerInfo_sign() always adds the "signing time"
+ * attribute, the content type attribute MUST be added also.
+ * Assumptions: This assumes that the attribute does not already exist.
+ *
+ * Returns 1 when CMS_signed_add1_attr_by_NID() reports a positive result,
+ * 0 otherwise.
+ */
+static int cms_set_si_contentType_attr(CMS_ContentInfo *cms, CMS_SignerInfo *si)
+{
+    ASN1_OBJECT *ctype = cms->d.signedData->encapContentInfo->eContentType;
+
+    /* Add the contentType attribute */
+    return CMS_signed_add1_attr_by_NID(si, NID_pkcs9_contentType,
+                                       V_ASN1_OBJECT, ctype, -1) > 0;
+}
+
 /* Copy an existing messageDigest value */
 
 static int cms_copy_messageDigest(CMS_ContentInfo *cms, CMS_SignerInfo *si)
@@ -328,6 +349,8 @@ CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms,
         if (flags & CMS_REUSE_DIGEST) {
             if (!cms_copy_messageDigest(cms, si))
                 goto err;
+            if (!cms_set_si_contentType_attr(cms, si))
+                goto err;
             if (!(flags & (CMS_PARTIAL | CMS_KEY_PARAM)) &&
                 !CMS_SignerInfo_sign(si))
                 goto err;
@@ -558,8 +581,6 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
      */
 
     if (CMS_signed_get_attr_count(si) >= 0) {
-        ASN1_OBJECT *ctype =
-            cms->d.signedData->encapContentInfo->eContentType;
         unsigned char md[EVP_MAX_MD_SIZE];
         unsigned int mdlen;
         if (!EVP_DigestFinal_ex(mctx, md, &mdlen))
@@ -568,9 +589,9 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
                                          V_ASN1_OCTET_STRING, md, mdlen))
             goto err;
         /* Copy content type across */
-        if (CMS_signed_add1_attr_by_NID(si, NID_pkcs9_contentType,
-                                        V_ASN1_OBJECT, ctype, -1) <= 0)
+        if (!cms_set_si_contentType_attr(cms, si))
             goto err;
+
         if (!CMS_SignerInfo_sign(si))
             goto err;
     } else if (si->pctx) {
@@ -650,6 +671,9 @@ int CMS_SignerInfo_sign(CMS_SignerInfo *si)
             goto err;
     }
 
+    if (!CMS_si_check_attributes(si))
+        goto err;
+
     if (si->pctx)
         pctx = si->pctx;
     else {
@@ -696,7 +720,6 @@ int CMS_SignerInfo_sign(CMS_SignerInfo *si)
     OPENSSL_free(abuf);
     EVP_MD_CTX_reset(mctx);
     return 0;
-
 }
 
 int CMS_SignerInfo_verify(CMS_SignerInfo *si)
@@ -711,6 +734,9 @@ int CMS_SignerInfo_verify(CMS_SignerInfo *si)
         return -1;
     }
 
+    if (!CMS_si_check_attributes(si))
+        return -1;
+
     md = EVP_get_digestbyobj(si->digestAlgorithm->algorithm);
     if (md == NULL)
         return -1;

+ 5 - 1
libs/openssl/crypto/cms/cms_smime.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2008-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2008-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -743,6 +743,10 @@ int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert,
         cms->d.envelopedData->encryptedContentInfo->debug = 1;
     else
         cms->d.envelopedData->encryptedContentInfo->debug = 0;
+    if (!cert)
+        cms->d.envelopedData->encryptedContentInfo->havenocert = 1;
+    else
+        cms->d.envelopedData->encryptedContentInfo->havenocert = 0;
     if (!pk && !cert && !dcont && !out)
         return 1;
     if (pk && !CMS_decrypt_set1_pkey(cms, pk, cert))

+ 1 - 1
libs/openssl/crypto/conf/conf_sap.c

@@ -42,7 +42,7 @@ void OPENSSL_config(const char *appname)
 
 int openssl_config_int(const OPENSSL_INIT_SETTINGS *settings)
 {
-    int ret;
+    int ret = 0;
     const char *filename;
     const char *appname;
     unsigned long flags;

+ 7 - 1
libs/openssl/crypto/ctype.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2017-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -272,3 +272,9 @@ int ossl_toupper(int c)
 {
     return ossl_islower(c) ? c ^ case_change : c;
 }
+
+int ascii_isdigit(const char inchar) {
+    if (inchar > 0x2F && inchar < 0x3A) /* 0x30..0x39 are ASCII '0'..'9' */
+        return 1;
+    return 0; /* anything else is not an ASCII digit */
+}

+ 11 - 26
libs/openssl/crypto/dh/dh_check.c

@@ -24,7 +24,8 @@ int DH_check_params_ex(const DH *dh)
 {
     int errflags = 0;
 
-    (void)DH_check_params(dh, &errflags);
+    if (!DH_check_params(dh, &errflags))
+        return 0;
 
     if ((errflags & DH_CHECK_P_NOT_PRIME) != 0)
         DHerr(DH_F_DH_CHECK_PARAMS_EX, DH_R_CHECK_P_NOT_PRIME);
@@ -67,18 +68,14 @@ int DH_check_params(const DH *dh, int *ret)
 
 /*-
  * Check that p is a safe prime and
- * if g is 2, 3 or 5, check that it is a suitable generator
- * where
- * for 2, p mod 24 == 11
- * for 3, p mod 12 == 5
- * for 5, p mod 10 == 3 or 7
- * should hold.
+ * g is a suitable generator.
  */
 int DH_check_ex(const DH *dh)
 {
     int errflags = 0;
 
-    (void)DH_check(dh, &errflags);
+    if (!DH_check(dh, &errflags))
+        return 0;
 
     if ((errflags & DH_NOT_SUITABLE_GENERATOR) != 0)
         DHerr(DH_F_DH_CHECK_EX, DH_R_NOT_SUITABLE_GENERATOR);
@@ -102,10 +99,11 @@ int DH_check(const DH *dh, int *ret)
 {
     int ok = 0, r;
     BN_CTX *ctx = NULL;
-    BN_ULONG l;
     BIGNUM *t1 = NULL, *t2 = NULL;
 
-    *ret = 0;
+    if (!DH_check_params(dh, ret))
+        return 0;
+
     ctx = BN_CTX_new();
     if (ctx == NULL)
         goto err;
@@ -139,21 +137,7 @@ int DH_check(const DH *dh, int *ret)
             *ret |= DH_CHECK_INVALID_Q_VALUE;
         if (dh->j && BN_cmp(dh->j, t1))
             *ret |= DH_CHECK_INVALID_J_VALUE;
-
-    } else if (BN_is_word(dh->g, DH_GENERATOR_2)) {
-        l = BN_mod_word(dh->p, 24);
-        if (l == (BN_ULONG)-1)
-            goto err;
-        if (l != 11)
-            *ret |= DH_NOT_SUITABLE_GENERATOR;
-    } else if (BN_is_word(dh->g, DH_GENERATOR_5)) {
-        l = BN_mod_word(dh->p, 10);
-        if (l == (BN_ULONG)-1)
-            goto err;
-        if ((l != 3) && (l != 7))
-            *ret |= DH_NOT_SUITABLE_GENERATOR;
-    } else
-        *ret |= DH_UNABLE_TO_CHECK_GENERATOR;
+    }
 
     r = BN_is_prime_ex(dh->p, DH_NUMBER_ITERATIONS_FOR_PRIME, ctx, NULL);
     if (r < 0)
@@ -180,7 +164,8 @@ int DH_check_pub_key_ex(const DH *dh, const BIGNUM *pub_key)
 {
     int errflags = 0;
 
-    (void)DH_check(dh, &errflags);
+    if (!DH_check_pub_key(dh, pub_key, &errflags))
+        return 0;
 
     if ((errflags & DH_CHECK_PUBKEY_TOO_SMALL) != 0)
         DHerr(DH_F_DH_CHECK_PUB_KEY_EX, DH_R_CHECK_PUBKEY_TOO_SMALL);

+ 26 - 26
libs/openssl/crypto/dh/dh_gen.c

@@ -30,30 +30,33 @@ int DH_generate_parameters_ex(DH *ret, int prime_len, int generator,
 
 /*-
  * We generate DH parameters as follows
- * find a prime q which is prime_len/2 bits long.
- * p=(2*q)+1 or (p-1)/2 = q
- * For this case, g is a generator if
- * g^((p-1)/q) mod p != 1 for values of q which are the factors of p-1.
- * Since the factors of p-1 are q and 2, we just need to check
- * g^2 mod p != 1 and g^q mod p != 1.
+ * find a prime p which is prime_len bits long,
+ * where q=(p-1)/2 is also prime.
+ * In the following we assume that g is not 0, 1 or p-1, since it
+ * would generate only trivial subgroups.
+ * For this case, g is a generator of the order-q subgroup if
+ * g^q mod p == 1.
+ * Or in terms of the Legendre symbol: (g/p) == 1.
  *
  * Having said all that,
  * there is another special case method for the generators 2, 3 and 5.
- * for 2, p mod 24 == 11
- * for 3, p mod 12 == 5  <<<<< does not work for safe primes.
- * for 5, p mod 10 == 3 or 7
+ * Using the quadratic reciprocity law it is possible to solve
+ * (g/p) == 1 for the special values 2, 3, 5:
+ * (2/p) == 1 if p mod 8 == 1 or 7.
+ * (3/p) == 1 if p mod 12 == 1 or 11.
+ * (5/p) == 1 if p mod 5 == 1 or 4.
+ * See for instance: https://en.wikipedia.org/wiki/Legendre_symbol
  *
- * Thanks to Phil Karn for the pointers about the
- * special generators and for answering some of my questions.
+ * Since all safe primes > 7 must satisfy p mod 12 == 11
+ * and all safe primes > 11 must satisfy p mod 5 != 1
+ * we can further improve the condition for g = 2, 3 and 5:
+ * for 2, p mod 24 == 23
+ * for 3, p mod 12 == 11
+ * for 5, p mod 60 == 59
  *
- * I've implemented the second simple method :-).
- * Since DH should be using a safe prime (both p and q are prime),
- * this generator function can take a very very long time to run.
- */
-/*
- * Actually there is no reason to insist that 'generator' be a generator.
- * It's just as OK (and in some sense better) to use a generator of the
- * order-q subgroup.
+ * However for compatibility with previous versions we use:
+ * for 2, p mod 24 == 11
+ * for 5, p mod 60 == 23
  */
 static int dh_builtin_genparams(DH *ret, int prime_len, int generator,
                                 BN_GENCB *cb)
@@ -88,13 +91,10 @@ static int dh_builtin_genparams(DH *ret, int prime_len, int generator,
             goto err;
         g = 2;
     } else if (generator == DH_GENERATOR_5) {
-        if (!BN_set_word(t1, 10))
+        if (!BN_set_word(t1, 60))
             goto err;
-        if (!BN_set_word(t2, 3))
+        if (!BN_set_word(t2, 23))
             goto err;
-        /*
-         * BN_set_word(t3,7); just have to miss out on these ones :-(
-         */
         g = 5;
     } else {
         /*
@@ -102,9 +102,9 @@ static int dh_builtin_genparams(DH *ret, int prime_len, int generator,
          * not: since we are using safe primes, it will generate either an
          * order-q or an order-2q group, which both is OK
          */
-        if (!BN_set_word(t1, 2))
+        if (!BN_set_word(t1, 12))
             goto err;
-        if (!BN_set_word(t2, 1))
+        if (!BN_set_word(t2, 11))
             goto err;
         g = generator;
     }

+ 11 - 2
libs/openssl/crypto/dh/dh_key.c

@@ -125,6 +125,15 @@ static int generate_key(DH *dh)
             l = dh->length ? dh->length : BN_num_bits(dh->p) - 1;
             if (!BN_priv_rand(priv_key, l, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY))
                 goto err;
+            /*
+             * We handle just one known case where g is a quadratic non-residue:
+             * for g = 2: p % 8 == 3
+             */
+            if (BN_is_word(dh->g, DH_GENERATOR_2) && !BN_is_bit_set(dh->p, 2)) {
+                /* clear bit 0, since it won't be a secret anyway */
+                if (!BN_clear_bit(priv_key, 0))
+                    goto err;
+            }
         }
     }
 
@@ -136,11 +145,11 @@ static int generate_key(DH *dh)
         BN_with_flags(prk, priv_key, BN_FLG_CONSTTIME);
 
         if (!dh->meth->bn_mod_exp(dh, pub_key, dh->g, prk, dh->p, ctx, mont)) {
-            BN_free(prk);
+            BN_clear_free(prk);
             goto err;
         }
         /* We MUST free prk before any further use of priv_key */
-        BN_free(prk);
+        BN_clear_free(prk);
     }
 
     dh->pub_key = pub_key;

+ 3 - 3
libs/openssl/crypto/dh/dh_lib.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -234,11 +234,11 @@ void DH_get0_key(const DH *dh, const BIGNUM **pub_key, const BIGNUM **priv_key)
 int DH_set0_key(DH *dh, BIGNUM *pub_key, BIGNUM *priv_key)
 {
     if (pub_key != NULL) {
-        BN_free(dh->pub_key);
+        BN_clear_free(dh->pub_key);
         dh->pub_key = pub_key;
     }
     if (priv_key != NULL) {
-        BN_free(dh->priv_key);
+        BN_clear_free(dh->priv_key);
         dh->priv_key = priv_key;
     }
 

+ 2 - 2
libs/openssl/crypto/dsa/dsa_ameth.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2006-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -503,7 +503,7 @@ static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
 
     case ASN1_PKEY_CTRL_DEFAULT_MD_NID:
         *(int *)arg2 = NID_sha256;
-        return 2;
+        return 1;
 
     default:
         return -2;

+ 3 - 1
libs/openssl/crypto/dsa/dsa_err.c

@@ -1,6 +1,6 @@
 /*
  * Generated by util/mkerr.pl DO NOT EDIT
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -52,6 +52,8 @@ static const ERR_STRING_DATA DSA_str_reasons[] = {
     "invalid digest type"},
     {ERR_PACK(ERR_LIB_DSA, 0, DSA_R_INVALID_PARAMETERS), "invalid parameters"},
     {ERR_PACK(ERR_LIB_DSA, 0, DSA_R_MISSING_PARAMETERS), "missing parameters"},
+    {ERR_PACK(ERR_LIB_DSA, 0, DSA_R_MISSING_PRIVATE_KEY),
+    "missing private key"},
     {ERR_PACK(ERR_LIB_DSA, 0, DSA_R_MODULUS_TOO_LARGE), "modulus too large"},
     {ERR_PACK(ERR_LIB_DSA, 0, DSA_R_NO_PARAMETERS_SET), "no parameters set"},
     {ERR_PACK(ERR_LIB_DSA, 0, DSA_R_PARAMETER_ENCODING_ERROR),

+ 9 - 1
libs/openssl/crypto/dsa/dsa_ossl.c

@@ -72,6 +72,10 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
         reason = DSA_R_MISSING_PARAMETERS;
         goto err;
     }
+    if (dsa->priv_key == NULL) {
+        reason = DSA_R_MISSING_PRIVATE_KEY;
+        goto err;
+    }
 
     ret = DSA_SIG_new();
     if (ret == NULL)
@@ -195,6 +199,10 @@ static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in,
         DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_INVALID_PARAMETERS);
         return 0;
     }
+    if (dsa->priv_key == NULL) {
+        DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_MISSING_PRIVATE_KEY);
+        return 0;
+    }
 
     k = BN_new();
     l = BN_new();
@@ -248,7 +256,7 @@ static int dsa_sign_setup(DSA *dsa, BN_CTX *ctx_in,
      * one bit longer than the modulus.
      *
      * There are some concerns about the efficacy of doing this.  More
-     * specificly refer to the discussion starting with:
+     * specifically refer to the discussion starting with:
      *     https://github.com/openssl/openssl/pull/7486#discussion_r228323705
      * The fix is to rework BN so these gymnastics aren't required.
      */

+ 2 - 3
libs/openssl/crypto/dso/dso_dlfcn.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2000-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2000-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -27,8 +27,7 @@
 #  endif
 #  include <dlfcn.h>
 #  define HAVE_DLINFO 1
-#  if defined(__CYGWIN__) || \
-     defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
+#  if defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
      (defined(__osf__) && !defined(RTLD_NEXT))     || \
      (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
         defined(__ANDROID__)

+ 1 - 2
libs/openssl/crypto/ec/asm/ecp_nistz256-sparcv9.pl

@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2015-2019 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
@@ -2301,7 +2301,6 @@ my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
 # !in1infty, !in2infty and result of check for zero.
 
 $code.=<<___;
-.globl	ecp_nistz256_point_add_vis3
 .align	32
 ecp_nistz256_point_add_vis3:
 	save	%sp,-STACK64_FRAME-32*18-32,%sp

+ 1 - 1
libs/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl

@@ -1301,7 +1301,7 @@ ecp_nistz256_ord_mul_montx:
 
 	################################# reduction
 	mulx	8*0+128(%r14), $t0, $t1
-	adcx	$t0, $acc3		# guranteed to be zero
+	adcx	$t0, $acc3		# guaranteed to be zero
 	adox	$t1, $acc4
 
 	mulx	8*1+128(%r14), $t0, $t1

+ 3 - 3
libs/openssl/crypto/ec/asm/x25519-ppc64.pl

@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2018-2019 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
@@ -451,7 +451,7 @@ x25519_fe64_tobytes:
 	and	$t0,$t0,$t1
 	sldi	$a3,$a3,1
 	add	$t0,$t0,$t1		# compare to modulus in the same go
-	srdi	$a3,$a3,1		# most signifcant bit cleared
+	srdi	$a3,$a3,1		# most significant bit cleared
 
 	addc	$a0,$a0,$t0
 	addze	$a1,$a1
@@ -462,7 +462,7 @@ x25519_fe64_tobytes:
 	sradi	$t0,$a3,63		# most significant bit -> mask
 	sldi	$a3,$a3,1
 	andc	$t0,$t1,$t0
-	srdi	$a3,$a3,1		# most signifcant bit cleared
+	srdi	$a3,$a3,1		# most significant bit cleared
 
 	subi	$rp,$rp,1
 	subfc	$a0,$t0,$a0

+ 68 - 5
libs/openssl/crypto/ec/ec_asn1.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2002-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -568,10 +568,12 @@ ECPKPARAMETERS *EC_GROUP_get_ecpkparameters(const EC_GROUP *group,
 EC_GROUP *EC_GROUP_new_from_ecparameters(const ECPARAMETERS *params)
 {
     int ok = 0, tmp;
-    EC_GROUP *ret = NULL;
+    EC_GROUP *ret = NULL, *dup = NULL;
     BIGNUM *p = NULL, *a = NULL, *b = NULL;
     EC_POINT *point = NULL;
     long field_bits;
+    int curve_name = NID_undef;
+    BN_CTX *ctx = NULL;
 
     if (!params->fieldID || !params->fieldID->fieldType ||
         !params->fieldID->p.ptr) {
@@ -789,18 +791,79 @@ EC_GROUP *EC_GROUP_new_from_ecparameters(const ECPARAMETERS *params)
         goto err;
     }
 
+    /*
+     * Check if the explicit parameters group just created matches one of the
+     * built-in curves.
+     *
+     * We create a copy of the group just built, so that we can remove optional
+     * fields for the lookup: we do this to avoid the possibility that one of
+     * the optional parameters is used to force the library into using a less
+     * performant and less secure EC_METHOD instead of the specialized one.
+     * In any case, `seed` is not really used in any computation, while a
+     * cofactor different from the one in the built-in table is just
+     * mathematically wrong anyway and should not be used.
+     */
+    if ((ctx = BN_CTX_new()) == NULL) {
+        ECerr(EC_F_EC_GROUP_NEW_FROM_ECPARAMETERS, ERR_R_BN_LIB);
+        goto err;
+    }
+    if ((dup = EC_GROUP_dup(ret)) == NULL
+            || EC_GROUP_set_seed(dup, NULL, 0) != 1
+            || !EC_GROUP_set_generator(dup, point, a, NULL)) {
+        ECerr(EC_F_EC_GROUP_NEW_FROM_ECPARAMETERS, ERR_R_EC_LIB);
+        goto err;
+    }
+    if ((curve_name = ec_curve_nid_from_params(dup, ctx)) != NID_undef) {
+        /*
+         * The input explicit parameters successfully matched one of the
+         * built-in curves: often for built-in curves we have specialized
+         * methods with better performance and hardening.
+         *
+         * In this case we replace the `EC_GROUP` created through explicit
+         * parameters with one created from a named group.
+         */
+        EC_GROUP *named_group = NULL;
+
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+        /*
+         * NID_wap_wsg_idm_ecid_wtls12 and NID_secp224r1 are both aliases for
+         * the same curve, we prefer the SECP nid when matching explicit
+         * parameters as that is associated with a specialized EC_METHOD.
+         */
+        if (curve_name == NID_wap_wsg_idm_ecid_wtls12)
+            curve_name = NID_secp224r1;
+#endif /* !def(OPENSSL_NO_EC_NISTP_64_GCC_128) */
+
+        if ((named_group = EC_GROUP_new_by_curve_name(curve_name)) == NULL) {
+            ECerr(EC_F_EC_GROUP_NEW_FROM_ECPARAMETERS, ERR_R_EC_LIB);
+            goto err;
+        }
+        EC_GROUP_free(ret);
+        ret = named_group;
+
+        /*
+         * Set the flag so that EC_GROUPs created from explicit parameters are
+         * serialized using explicit parameters by default.
+         */
+        EC_GROUP_set_asn1_flag(ret, OPENSSL_EC_EXPLICIT_CURVE);
+    }
+
     ok = 1;
 
  err:
     if (!ok) {
-        EC_GROUP_clear_free(ret);
+        EC_GROUP_free(ret);
         ret = NULL;
     }
+    EC_GROUP_free(dup);
 
     BN_free(p);
     BN_free(a);
     BN_free(b);
     EC_POINT_free(point);
+
+    BN_CTX_free(ctx);
+
     return ret;
 }
 
@@ -861,7 +924,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len)
     }
 
     if (a) {
-        EC_GROUP_clear_free(*a);
+        EC_GROUP_free(*a);
         *a = group;
     }
 
@@ -909,7 +972,7 @@ EC_KEY *d2i_ECPrivateKey(EC_KEY **a, const unsigned char **in, long len)
         ret = *a;
 
     if (priv_key->parameters) {
-        EC_GROUP_clear_free(ret->group);
+        EC_GROUP_free(ret->group);
         ret->group = EC_GROUP_new_from_ecpkparameters(priv_key->parameters);
     }
 

+ 113 - 1
libs/openssl/crypto/ec/ec_curve.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2002-2019 The OpenSSL Project Authors. All Rights Reserved.
  * Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
@@ -3197,3 +3197,115 @@ int EC_curve_nist2nid(const char *name)
     }
     return NID_undef;
 }
+
+#define NUM_BN_FIELDS 6
+/*
+ * Validates EC domain parameter data for known named curves.
+ * This can be used when a curve is loaded explicitly (without a curve
+ * name) or to validate that domain parameters have not been modified.
+ *
+ * Returns: The nid associated with the found named curve, or NID_undef
+ *          if not found. If there was an error it returns -1.
+ */
+int ec_curve_nid_from_params(const EC_GROUP *group, BN_CTX *ctx)
+{
+    int ret = -1, nid, len, field_type, param_len;
+    size_t i, seed_len;
+    const unsigned char *seed, *params_seed, *params;
+    unsigned char *param_bytes = NULL;
+    const EC_CURVE_DATA *data;
+    const EC_POINT *generator = NULL;
+    const EC_METHOD *meth;
+    const BIGNUM *cofactor = NULL;
+    /* An array of BIGNUMs for (p, a, b, x, y, order) */
+    BIGNUM *bn[NUM_BN_FIELDS] = {NULL, NULL, NULL, NULL, NULL, NULL};
+
+    meth = EC_GROUP_method_of(group);
+    if (meth == NULL)
+        return -1;
+    /* Use the optional named curve nid as a search field */
+    nid = EC_GROUP_get_curve_name(group);
+    field_type = EC_METHOD_get_field_type(meth);
+    seed_len = EC_GROUP_get_seed_len(group);
+    seed = EC_GROUP_get0_seed(group);
+    cofactor = EC_GROUP_get0_cofactor(group);
+
+    BN_CTX_start(ctx);
+
+    /*
+     * The built-in curves contain data fields (p, a, b, x, y, order) that are
+     * all zero-padded to be the same size. The size of the padding is
+     * determined by either the number of bytes in the field modulus (p) or the
+     * EC group order, whichever is larger.
+     */
+    param_len = BN_num_bytes(group->order);
+    len = BN_num_bytes(group->field);
+    if (len > param_len)
+        param_len = len;
+
+    /* Allocate space to store the padded data for (p, a, b, x, y, order)  */
+    param_bytes = OPENSSL_malloc(param_len * NUM_BN_FIELDS);
+    if (param_bytes == NULL)
+        goto end;
+
+    /* Create the bignums */
+    for (i = 0; i < NUM_BN_FIELDS; ++i) {
+        if ((bn[i] = BN_CTX_get(ctx)) == NULL)
+            goto end;
+    }
+    /*
+     * Fill in the bn array with the same values as the internal curves
+     * i.e. the values are p, a, b, x, y, order.
+     */
+    /* Get p, a & b */
+    if (!(EC_GROUP_get_curve(group, bn[0], bn[1], bn[2], ctx)
+        && ((generator = EC_GROUP_get0_generator(group)) != NULL)
+        /* Get x & y */
+        && EC_POINT_get_affine_coordinates(group, generator, bn[3], bn[4], ctx)
+        /* Get order */
+        && EC_GROUP_get_order(group, bn[5], ctx)))
+        goto end;
+
+   /*
+     * Convert the bignum array to bytes that are joined together to form
+     * a single buffer that contains data for all fields.
+     * (p, a, b, x, y, order) are all zero padded to be the same size.
+     */
+    for (i = 0; i < NUM_BN_FIELDS; ++i) {
+        if (BN_bn2binpad(bn[i], &param_bytes[i*param_len], param_len) <= 0)
+            goto end;
+    }
+
+    for (i = 0; i < curve_list_length; i++) {
+        const ec_list_element curve = curve_list[i];
+
+        data = curve.data;
+        /* Locate the raw seed and parameter byte data */
+        params_seed = (const unsigned char *)(data + 1); /* skip header */
+        params = params_seed + data->seed_len;
+
+        /* Look for unique fields in the fixed curve data */
+        if (data->field_type == field_type
+            && param_len == data->param_len
+            && (nid <= 0 || nid == curve.nid)
+            /* check the optional cofactor (ignore if it's zero) */
+            && (BN_is_zero(cofactor)
+                || BN_is_word(cofactor, (const BN_ULONG)curve.data->cofactor))
+            /* Check the optional seed (ignore if it's not set) */
+            && (data->seed_len == 0 || seed_len == 0
+                || ((size_t)data->seed_len == seed_len
+                     && memcmp(params_seed, seed, seed_len) == 0))
+            /* Check that the groups params match the built-in curve params */
+            && memcmp(param_bytes, params, param_len * NUM_BN_FIELDS)
+                             == 0) {
+            ret = curve.nid;
+            goto end;
+        }
+    }
+    /* Gets here if the group was not found */
+    ret = NID_undef;
+end:
+    OPENSSL_free(param_bytes);
+    BN_CTX_end(ctx);
+    return ret;
+}

+ 3 - 1
libs/openssl/crypto/ec/ec_lcl.h

@@ -154,7 +154,7 @@ struct ec_method_st {
     int (*field_div) (const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                       const BIGNUM *b, BN_CTX *);
     /*-
-     * 'field_inv' computes the multipicative inverse of a in the field,
+     * 'field_inv' computes the multiplicative inverse of a in the field,
      * storing the result in r.
      *
      * If 'a' is zero (or equivalent), you'll get an EC_R_CANNOT_INVERT error.
@@ -595,6 +595,8 @@ int ec_key_simple_generate_key(EC_KEY *eckey);
 int ec_key_simple_generate_public_key(EC_KEY *eckey);
 int ec_key_simple_check_key(const EC_KEY *eckey);
 
+int ec_curve_nid_from_params(const EC_GROUP *group, BN_CTX *ctx);
+
 /* EC_METHOD definitions */
 
 struct ec_key_method_st {

+ 96 - 7
libs/openssl/crypto/ec/ec_lib.c

@@ -265,6 +265,67 @@ int EC_METHOD_get_field_type(const EC_METHOD *meth)
 
 static int ec_precompute_mont_data(EC_GROUP *);
 
+/*-
+ * Try computing cofactor from the generator order (n) and field cardinality (q).
+ * This works for all curves of cryptographic interest.
+ *
+ * Hasse thm: q + 1 - 2*sqrt(q) <= n*h <= q + 1 + 2*sqrt(q)
+ * h_min = (q + 1 - 2*sqrt(q))/n
+ * h_max = (q + 1 + 2*sqrt(q))/n
+ * h_max - h_min = 4*sqrt(q)/n
+ * So if n > 4*sqrt(q) holds, there is only one possible value for h:
+ * h = \lfloor (h_min + h_max)/2 \rceil = \lfloor (q + 1)/n \rceil
+ *
+ * Otherwise, zero cofactor and return success.
+ */
+static int ec_guess_cofactor(EC_GROUP *group) {
+    int ret = 0;
+    BN_CTX *ctx = NULL;
+    BIGNUM *q = NULL;
+
+    /*-
+     * If the cofactor is too large, we cannot guess it.
+     * The RHS of below is a strict overestimate of lg(4 * sqrt(q))
+     */
+    if (BN_num_bits(group->order) <= (BN_num_bits(group->field) + 1) / 2 + 3) {
+        /* default to 0 */
+        BN_zero(group->cofactor);
+        /* return success */
+        return 1;
+    }
+
+    if ((ctx = BN_CTX_new()) == NULL)
+        return 0;
+
+    BN_CTX_start(ctx);
+    if ((q = BN_CTX_get(ctx)) == NULL)
+        goto err;
+
+    /* set q = 2**m for binary fields; q = p otherwise */
+    if (group->meth->field_type == NID_X9_62_characteristic_two_field) {
+        BN_zero(q);
+        if (!BN_set_bit(q, BN_num_bits(group->field) - 1))
+            goto err;
+    } else {
+        if (!BN_copy(q, group->field))
+            goto err;
+    }
+
+    /* compute h = \lfloor (q + 1)/n \rceil = \lfloor (q + 1 + n/2)/n \rfloor */
+    if (!BN_rshift1(group->cofactor, group->order) /* n/2 */
+        || !BN_add(group->cofactor, group->cofactor, q) /* q + n/2 */
+        /* q + 1 + n/2 */
+        || !BN_add(group->cofactor, group->cofactor, BN_value_one())
+        /* (q + 1 + n/2)/n */
+        || !BN_div(group->cofactor, NULL, group->cofactor, group->order, ctx))
+        goto err;
+    ret = 1;
+ err:
+    BN_CTX_end(ctx);
+    BN_CTX_free(ctx);
+    return ret;
+}
+
 int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
                            const BIGNUM *order, const BIGNUM *cofactor)
 {
@@ -273,6 +334,34 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
         return 0;
     }
 
+    /* require group->field >= 1 */
+    if (group->field == NULL || BN_is_zero(group->field)
+        || BN_is_negative(group->field)) {
+        ECerr(EC_F_EC_GROUP_SET_GENERATOR, EC_R_INVALID_FIELD);
+        return 0;
+    }
+
+    /*-
+     * - require order >= 1
+     * - enforce upper bound due to Hasse thm: order can be no more than one bit
+     *   longer than field cardinality
+     */
+    if (order == NULL || BN_is_zero(order) || BN_is_negative(order)
+        || BN_num_bits(order) > BN_num_bits(group->field) + 1) {
+        ECerr(EC_F_EC_GROUP_SET_GENERATOR, EC_R_INVALID_GROUP_ORDER);
+        return 0;
+    }
+
+    /*-
+     * Unfortunately the cofactor is an optional field in many standards.
+     * Internally, the lib uses 0 cofactor as a marker for "unknown cofactor".
+     * So accept cofactor == NULL or cofactor >= 0.
+     */
+    if (cofactor != NULL && BN_is_negative(cofactor)) {
+        ECerr(EC_F_EC_GROUP_SET_GENERATOR, EC_R_UNKNOWN_COFACTOR);
+        return 0;
+    }
+
     if (group->generator == NULL) {
         group->generator = EC_POINT_new(group);
         if (group->generator == NULL)
@@ -281,17 +370,17 @@ int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator,
     if (!EC_POINT_copy(group->generator, generator))
         return 0;
 
-    if (order != NULL) {
-        if (!BN_copy(group->order, order))
-            return 0;
-    } else
-        BN_zero(group->order);
+    if (!BN_copy(group->order, order))
+        return 0;
 
-    if (cofactor != NULL) {
+    /* Either take the provided positive cofactor, or try to compute it */
+    if (cofactor != NULL && !BN_is_zero(cofactor)) {
         if (!BN_copy(group->cofactor, cofactor))
             return 0;
-    } else
+    } else if (!ec_guess_cofactor(group)) {
         BN_zero(group->cofactor);
+        return 0;
+    }
 
     /*
      * Some groups have an order with

+ 1 - 1
libs/openssl/crypto/ec/ecdh_ossl.c

@@ -58,7 +58,7 @@ int ecdh_simple_compute_key(unsigned char **pout, size_t *poutlen,
 
     priv_key = EC_KEY_get0_private_key(ecdh);
     if (priv_key == NULL) {
-        ECerr(EC_F_ECDH_SIMPLE_COMPUTE_KEY, EC_R_NO_PRIVATE_VALUE);
+        ECerr(EC_F_ECDH_SIMPLE_COMPUTE_KEY, EC_R_MISSING_PRIVATE_KEY);
         goto err;
     }
 

+ 12 - 4
libs/openssl/crypto/ec/ecdsa_ossl.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2002-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -41,11 +41,16 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
     const EC_GROUP *group;
     int ret = 0;
     int order_bits;
+    const BIGNUM *priv_key;
 
     if (eckey == NULL || (group = EC_KEY_get0_group(eckey)) == NULL) {
         ECerr(EC_F_ECDSA_SIGN_SETUP, ERR_R_PASSED_NULL_PARAMETER);
         return 0;
     }
+    if ((priv_key = EC_KEY_get0_private_key(eckey)) == NULL) {
+        ECerr(EC_F_ECDSA_SIGN_SETUP, EC_R_MISSING_PRIVATE_KEY);
+        return 0;
+    }
 
     if (!EC_KEY_can_sign(eckey)) {
         ECerr(EC_F_ECDSA_SIGN_SETUP, EC_R_CURVE_DOES_NOT_SUPPORT_SIGNING);
@@ -83,8 +88,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
         /* get random k */
         do {
             if (dgst != NULL) {
-                if (!BN_generate_dsa_nonce(k, order,
-                                           EC_KEY_get0_private_key(eckey),
+                if (!BN_generate_dsa_nonce(k, order, priv_key,
                                            dgst, dlen, ctx)) {
                     ECerr(EC_F_ECDSA_SIGN_SETUP,
                           EC_R_RANDOM_NUMBER_GENERATION_FAILED);
@@ -162,10 +166,14 @@ ECDSA_SIG *ossl_ecdsa_sign_sig(const unsigned char *dgst, int dgst_len,
     group = EC_KEY_get0_group(eckey);
     priv_key = EC_KEY_get0_private_key(eckey);
 
-    if (group == NULL || priv_key == NULL) {
+    if (group == NULL) {
         ECerr(EC_F_OSSL_ECDSA_SIGN_SIG, ERR_R_PASSED_NULL_PARAMETER);
         return NULL;
     }
+    if (priv_key == NULL) {
+        ECerr(EC_F_OSSL_ECDSA_SIGN_SIG, EC_R_MISSING_PRIVATE_KEY);
+        return NULL;
+    }
 
     if (!EC_KEY_can_sign(eckey)) {
         ECerr(EC_F_OSSL_ECDSA_SIGN_SIG, EC_R_CURVE_DOES_NOT_SUPPORT_SIGNING);

+ 26 - 36
libs/openssl/crypto/ec/ecp_nistp224.c

@@ -324,34 +324,21 @@ static void felem_to_bin28(u8 out[28], const felem in)
     }
 }
 
-/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
-static void flip_endian(u8 *out, const u8 *in, unsigned len)
-{
-    unsigned i;
-    for (i = 0; i < len; ++i)
-        out[i] = in[len - 1 - i];
-}
-
 /* From OpenSSL BIGNUM to internal representation */
 static int BN_to_felem(felem out, const BIGNUM *bn)
 {
-    felem_bytearray b_in;
     felem_bytearray b_out;
-    unsigned num_bytes;
+    int num_bytes;
 
-    /* BN_bn2bin eats leading zeroes */
-    memset(b_out, 0, sizeof(b_out));
-    num_bytes = BN_num_bytes(bn);
-    if (num_bytes > sizeof(b_out)) {
+    if (BN_is_negative(bn)) {
         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
         return 0;
     }
-    if (BN_is_negative(bn)) {
+    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
+    if (num_bytes < 0) {
         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
         return 0;
     }
-    num_bytes = BN_bn2bin(bn, b_in);
-    flip_endian(b_out, b_in, num_bytes);
     bin28_to_felem(out, b_out);
     return 1;
 }
@@ -359,10 +346,9 @@ static int BN_to_felem(felem out, const BIGNUM *bn)
 /* From internal representation to OpenSSL BIGNUM */
 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 {
-    felem_bytearray b_in, b_out;
-    felem_to_bin28(b_in, in);
-    flip_endian(b_out, b_in, sizeof(b_out));
-    return BN_bin2bn(b_out, sizeof(b_out), out);
+    felem_bytearray b_out;
+    felem_to_bin28(b_out, in);
+    return BN_lebin2bn(b_out, sizeof(b_out), out);
 }
 
 /******************************************************************************/
@@ -1402,8 +1388,7 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
     felem_bytearray *secrets = NULL;
     felem (*pre_comp)[17][3] = NULL;
     felem *tmp_felems = NULL;
-    felem_bytearray tmp;
-    unsigned num_bytes;
+    int num_bytes;
     int have_pre_comp = 0;
     size_t num_points = num;
     felem x_in, y_in, z_in, x_out, y_out, z_out;
@@ -1478,14 +1463,12 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
          * i.e., they contribute nothing to the linear combination
          */
         for (i = 0; i < num_points; ++i) {
-            if (i == num)
+            if (i == num) {
                 /* the generator */
-            {
                 p = EC_GROUP_get0_generator(group);
                 p_scalar = scalar;
-            } else
+            } else {
                 /* the i^th point */
-            {
                 p = points[i];
                 p_scalar = scalars[i];
             }
@@ -1501,10 +1484,16 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
                         ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
                         goto err;
                     }
-                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
-                } else
-                    num_bytes = BN_bn2bin(p_scalar, tmp);
-                flip_endian(secrets[i], tmp, num_bytes);
+                    num_bytes = BN_bn2lebinpad(tmp_scalar,
+                                               secrets[i], sizeof(secrets[i]));
+                } else {
+                    num_bytes = BN_bn2lebinpad(p_scalar,
+                                               secrets[i], sizeof(secrets[i]));
+                }
+                if (num_bytes < 0) {
+                    ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
+                    goto err;
+                }
                 /* precompute multiples */
                 if ((!BN_to_felem(x_out, p->X)) ||
                     (!BN_to_felem(y_out, p->Y)) ||
@@ -1547,20 +1536,21 @@ int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
                 ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
                 goto err;
             }
-            num_bytes = BN_bn2bin(tmp_scalar, tmp);
-        } else
-            num_bytes = BN_bn2bin(scalar, tmp);
-        flip_endian(g_secret, tmp, num_bytes);
+            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
+        } else {
+            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
+        }
         /* do the multiplication with generator precomputation */
         batch_mul(x_out, y_out, z_out,
                   (const felem_bytearray(*))secrets, num_points,
                   g_secret,
                   mixed, (const felem(*)[17][3])pre_comp, g_pre_comp);
-    } else
+    } else {
         /* do the multiplication without generator precomputation */
         batch_mul(x_out, y_out, z_out,
                   (const felem_bytearray(*))secrets, num_points,
                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
+    }
     /* reduce the output to its unique minimal representation */
     felem_contract(x_in, x_out);
     felem_contract(y_in, y_out);

+ 27 - 36
libs/openssl/crypto/ec/ecp_nistp256.c

@@ -146,34 +146,21 @@ static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
     *((u64 *)&out[24]) = in[3];
 }
 
-/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
-static void flip_endian(u8 *out, const u8 *in, unsigned len)
-{
-    unsigned i;
-    for (i = 0; i < len; ++i)
-        out[i] = in[len - 1 - i];
-}
-
 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 static int BN_to_felem(felem out, const BIGNUM *bn)
 {
-    felem_bytearray b_in;
     felem_bytearray b_out;
-    unsigned num_bytes;
+    int num_bytes;
 
-    /* BN_bn2bin eats leading zeroes */
-    memset(b_out, 0, sizeof(b_out));
-    num_bytes = BN_num_bytes(bn);
-    if (num_bytes > sizeof(b_out)) {
+    if (BN_is_negative(bn)) {
         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
         return 0;
     }
-    if (BN_is_negative(bn)) {
+    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
+    if (num_bytes < 0) {
         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
         return 0;
     }
-    num_bytes = BN_bn2bin(bn, b_in);
-    flip_endian(b_out, b_in, num_bytes);
     bin32_to_felem(out, b_out);
     return 1;
 }
@@ -181,10 +168,9 @@ static int BN_to_felem(felem out, const BIGNUM *bn)
 /* smallfelem_to_BN converts a smallfelem into an OpenSSL BIGNUM */
 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 {
-    felem_bytearray b_in, b_out;
-    smallfelem_to_bin32(b_in, in);
-    flip_endian(b_out, b_in, sizeof(b_out));
-    return BN_bin2bn(b_out, sizeof(b_out), out);
+    felem_bytearray b_out;
+    smallfelem_to_bin32(b_out, in);
+    return BN_lebin2bn(b_out, sizeof(b_out), out);
 }
 
 /*-
@@ -2024,8 +2010,8 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
     felem_bytearray *secrets = NULL;
     smallfelem (*pre_comp)[17][3] = NULL;
     smallfelem *tmp_smallfelems = NULL;
-    felem_bytearray tmp;
-    unsigned i, num_bytes;
+    unsigned i;
+    int num_bytes;
     int have_pre_comp = 0;
     size_t num_points = num;
     smallfelem x_in, y_in, z_in;
@@ -2102,17 +2088,15 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
         memset(secrets, 0, sizeof(*secrets) * num_points);
         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
         for (i = 0; i < num_points; ++i) {
-            if (i == num)
+            if (i == num) {
                 /*
                  * we didn't have a valid precomputation, so we pick the
                  * generator
                  */
-            {
                 p = EC_GROUP_get0_generator(group);
                 p_scalar = scalar;
-            } else
+            } else {
                 /* the i^th point */
-            {
                 p = points[i];
                 p_scalar = scalars[i];
             }
@@ -2128,10 +2112,16 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
                         goto err;
                     }
-                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
-                } else
-                    num_bytes = BN_bn2bin(p_scalar, tmp);
-                flip_endian(secrets[i], tmp, num_bytes);
+                    num_bytes = BN_bn2lebinpad(tmp_scalar,
+                                               secrets[i], sizeof(secrets[i]));
+                } else {
+                    num_bytes = BN_bn2lebinpad(p_scalar,
+                                               secrets[i], sizeof(secrets[i]));
+                }
+                if (num_bytes < 0) {
+                    ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
+                    goto err;
+                }
                 /* precompute multiples */
                 if ((!BN_to_felem(x_out, p->X)) ||
                     (!BN_to_felem(y_out, p->Y)) ||
@@ -2176,20 +2166,21 @@ int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
                 goto err;
             }
-            num_bytes = BN_bn2bin(tmp_scalar, tmp);
-        } else
-            num_bytes = BN_bn2bin(scalar, tmp);
-        flip_endian(g_secret, tmp, num_bytes);
+            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
+        } else {
+            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
+        }
         /* do the multiplication with generator precomputation */
         batch_mul(x_out, y_out, z_out,
                   (const felem_bytearray(*))secrets, num_points,
                   g_secret,
                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
-    } else
+    } else {
         /* do the multiplication without generator precomputation */
         batch_mul(x_out, y_out, z_out,
                   (const felem_bytearray(*))secrets, num_points,
                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
+    }
     /* reduce the output to its unique minimal representation */
     felem_contract(x_in, x_out);
     felem_contract(y_in, y_out);

+ 28 - 37
libs/openssl/crypto/ec/ecp_nistp521.c

@@ -169,34 +169,21 @@ static void felem_to_bin66(u8 out[66], const felem in)
     (*((limb *) & out[58])) = in[8];
 }
 
-/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
-static void flip_endian(u8 *out, const u8 *in, unsigned len)
-{
-    unsigned i;
-    for (i = 0; i < len; ++i)
-        out[i] = in[len - 1 - i];
-}
-
 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 static int BN_to_felem(felem out, const BIGNUM *bn)
 {
-    felem_bytearray b_in;
     felem_bytearray b_out;
-    unsigned num_bytes;
+    int num_bytes;
 
-    /* BN_bn2bin eats leading zeroes */
-    memset(b_out, 0, sizeof(b_out));
-    num_bytes = BN_num_bytes(bn);
-    if (num_bytes > sizeof(b_out)) {
+    if (BN_is_negative(bn)) {
         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
         return 0;
     }
-    if (BN_is_negative(bn)) {
+    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
+    if (num_bytes < 0) {
         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
         return 0;
     }
-    num_bytes = BN_bn2bin(bn, b_in);
-    flip_endian(b_out, b_in, num_bytes);
     bin66_to_felem(out, b_out);
     return 1;
 }
@@ -204,10 +191,9 @@ static int BN_to_felem(felem out, const BIGNUM *bn)
 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 {
-    felem_bytearray b_in, b_out;
-    felem_to_bin66(b_in, in);
-    flip_endian(b_out, b_in, sizeof(b_out));
-    return BN_bin2bn(b_out, sizeof(b_out), out);
+    felem_bytearray b_out;
+    felem_to_bin66(b_out, in);
+    return BN_lebin2bn(b_out, sizeof(b_out), out);
 }
 
 /*-
@@ -1269,7 +1255,7 @@ static void point_add(felem x3, felem y3, felem z3,
          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
          * 71e913863f7, in that case the penultimate intermediate is -9G and
          * the final digit is also -9G. Since this only happens for a single
-         * scalar, the timing leak is irrelevent. (Any attacker who wanted to
+         * scalar, the timing leak is irrelevant. (Any attacker who wanted to
          * check whether a secret scalar was that exact value, can already do
          * so.)
          */
@@ -1866,8 +1852,8 @@ int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
     felem_bytearray *secrets = NULL;
     felem (*pre_comp)[17][3] = NULL;
     felem *tmp_felems = NULL;
-    felem_bytearray tmp;
-    unsigned i, num_bytes;
+    unsigned i;
+    int num_bytes;
     int have_pre_comp = 0;
     size_t num_points = num;
     felem x_in, y_in, z_in, x_out, y_out, z_out;
@@ -1942,17 +1928,15 @@ int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
          * i.e., they contribute nothing to the linear combination
          */
         for (i = 0; i < num_points; ++i) {
-            if (i == num)
+            if (i == num) {
                 /*
                  * we didn't have a valid precomputation, so we pick the
                  * generator
                  */
-            {
                 p = EC_GROUP_get0_generator(group);
                 p_scalar = scalar;
-            } else
+            } else {
                 /* the i^th point */
-            {
                 p = points[i];
                 p_scalar = scalars[i];
             }
@@ -1968,10 +1952,16 @@ int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
                         goto err;
                     }
-                    num_bytes = BN_bn2bin(tmp_scalar, tmp);
-                } else
-                    num_bytes = BN_bn2bin(p_scalar, tmp);
-                flip_endian(secrets[i], tmp, num_bytes);
+                    num_bytes = BN_bn2lebinpad(tmp_scalar,
+                                               secrets[i], sizeof(secrets[i]));
+                } else {
+                    num_bytes = BN_bn2lebinpad(p_scalar,
+                                               secrets[i], sizeof(secrets[i]));
+                }
+                if (num_bytes < 0) {
+                    ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
+                    goto err;
+                }
                 /* precompute multiples */
                 if ((!BN_to_felem(x_out, p->X)) ||
                     (!BN_to_felem(y_out, p->Y)) ||
@@ -2014,21 +2004,22 @@ int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
                 goto err;
             }
-            num_bytes = BN_bn2bin(tmp_scalar, tmp);
-        } else
-            num_bytes = BN_bn2bin(scalar, tmp);
-        flip_endian(g_secret, tmp, num_bytes);
+            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
+        } else {
+            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
+        }
         /* do the multiplication with generator precomputation */
         batch_mul(x_out, y_out, z_out,
                   (const felem_bytearray(*))secrets, num_points,
                   g_secret,
                   mixed, (const felem(*)[17][3])pre_comp,
                   (const felem(*)[3])g_pre_comp);
-    } else
+    } else {
         /* do the multiplication without generator precomputation */
         batch_mul(x_out, y_out, z_out,
                   (const felem_bytearray(*))secrets, num_points,
                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
+    }
     /* reduce the output to its unique minimal representation */
     felem_contract(x_in, x_out);
     felem_contract(y_in, y_out);

+ 13 - 13
libs/openssl/crypto/ec/ecp_nistputil.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -158,13 +158,13 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
  *     of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
  *     by using bit-wise subtraction as follows:
  *
- *        b_k b_(k-1)  ...  b_2  b_1  b_0
- *      -     b_k      ...  b_3  b_2  b_1  b_0
- *       -------------------------------------
- *        s_k b_(k-1)  ...  s_3  s_2  s_1  s_0
+ *        b_k     b_(k-1)  ...  b_2  b_1  b_0
+ *      -         b_k      ...  b_3  b_2  b_1  b_0
+ *       -----------------------------------------
+ *        s_(k+1) s_k      ...  s_3  s_2  s_1  s_0
  *
  *     A left-shift followed by subtraction of the original value yields a new
- *     representation of the same value, using signed bits s_i = b_(i+1) - b_i.
+ *     representation of the same value, using signed bits s_i = b_(i-1) - b_i.
  *     This representation from Booth's paper has since appeared in the
  *     literature under a variety of different names including "reversed binary
  *     form", "alternating greedy expansion", "mutual opposite form", and
@@ -188,7 +188,7 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
  * (1961), pp. 67-91), in a radix-2^5 setting.  That is, we always combine five
  * signed bits into a signed digit:
  *
- *       s_(4j + 4) s_(4j + 3) s_(4j + 2) s_(4j + 1) s_(4j)
+ *       s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j)
  *
  * The sign-alternating property implies that the resulting digit values are
  * integers from -16 to 16.
@@ -196,14 +196,14 @@ void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
  * Of course, we don't actually need to compute the signed digits s_i as an
  * intermediate step (that's just a nice way to see how this scheme relates
  * to the wNAF): a direct computation obtains the recoded digit from the
- * six bits b_(4j + 4) ... b_(4j - 1).
+ * six bits b_(5j + 4) ... b_(5j - 1).
  *
- * This function takes those five bits as an integer (0 .. 63), writing the
+ * This function takes those six bits as an integer (0 .. 63), writing the
  * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
- * value, in the range 0 .. 8).  Note that this integer essentially provides the
- * input bits "shifted to the left" by one position: for example, the input to
- * compute the least significant recoded digit, given that there's no bit b_-1,
- * has to be b_4 b_3 b_2 b_1 b_0 0.
+ * value, in the range 0 .. 16).  Note that this integer essentially provides
+ * the input bits "shifted to the left" by one position: for example, the input
+ * to compute the least significant recoded digit, given that there's no bit
+ * b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
  *
  */
 void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign,

+ 1 - 1
libs/openssl/crypto/ec/ecx_meth.c

@@ -532,7 +532,7 @@ static int ecd_item_sign25519(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
     X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_ED25519), V_ASN1_UNDEF, NULL);
     if (alg2)
         X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_ED25519), V_ASN1_UNDEF, NULL);
-    /* Algorithm idetifiers set: carry on as normal */
+    /* Algorithm identifiers set: carry on as normal */
     return 3;
 }
 

+ 1 - 1
libs/openssl/crypto/engine/eng_devcrypto.c

@@ -26,7 +26,7 @@
 
 /* #define ENGINE_DEVCRYPTO_DEBUG */
 
-#ifdef CRYPTO_ALGORITHM_MIN
+#if CRYPTO_ALGORITHM_MIN < CRYPTO_ALGORITHM_MAX
 # define CHECK_BSD_STYLE_MACROS
 #endif
 

+ 5 - 3
libs/openssl/crypto/engine/eng_openssl.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2001-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2001-2019 The OpenSSL Project Authors. All Rights Reserved.
  * Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
@@ -29,12 +29,14 @@
  */
 #define TEST_ENG_OPENSSL_RC4
 #ifndef OPENSSL_NO_STDIO
-#define TEST_ENG_OPENSSL_PKEY
+# define TEST_ENG_OPENSSL_PKEY
 #endif
 /* #define TEST_ENG_OPENSSL_HMAC */
 /* #define TEST_ENG_OPENSSL_HMAC_INIT */
 /* #define TEST_ENG_OPENSSL_RC4_OTHERS */
-#define TEST_ENG_OPENSSL_RC4_P_INIT
+#ifndef OPENSSL_NO_STDIO
+# define TEST_ENG_OPENSSL_RC4_P_INIT
+#endif
 /* #define TEST_ENG_OPENSSL_RC4_P_CIPHER */
 #define TEST_ENG_OPENSSL_SHA
 /* #define TEST_ENG_OPENSSL_SHA_OTHERS */

+ 9 - 7
libs/openssl/crypto/err/err.c

@@ -184,8 +184,8 @@ static ERR_STRING_DATA *int_err_get_item(const ERR_STRING_DATA *d)
 }
 
 #ifndef OPENSSL_NO_ERR
-/* A measurement on Linux 2018-11-21 showed about 3.5kib */
-# define SPACE_SYS_STR_REASONS 4 * 1024
+/* 2019-05-21: Russian and Ukrainian locales on Linux require more than 6.5 kB */
+# define SPACE_SYS_STR_REASONS 8 * 1024
 # define NUM_SYS_STR_REASONS 127
 
 static ERR_STRING_DATA SYS_str_reasons[NUM_SYS_STR_REASONS + 1];
@@ -219,21 +219,23 @@ static void build_SYS_str_reasons(void)
         ERR_STRING_DATA *str = &SYS_str_reasons[i - 1];
 
         str->error = ERR_PACK(ERR_LIB_SYS, 0, i);
-        if (str->string == NULL) {
+        /*
+         * If we have used up all the space in strerror_pool,
+         * there's no point in calling openssl_strerror_r()
+         */
+        if (str->string == NULL && cnt < sizeof(strerror_pool)) {
             if (openssl_strerror_r(i, cur, sizeof(strerror_pool) - cnt)) {
                 size_t l = strlen(cur);
 
                 str->string = cur;
                 cnt += l;
-                if (cnt > sizeof(strerror_pool))
-                    cnt = sizeof(strerror_pool);
                 cur += l;
 
                 /*
                  * VMS has an unusual quirk of adding spaces at the end of
-                 * some (most? all?) messages.  Lets trim them off.
+                 * some (most? all?) messages. Lets trim them off.
                  */
-                while (ossl_isspace(cur[-1])) {
+                while (cur > strerror_pool && ossl_isspace(cur[-1])) {
                     cur--;
                     cnt--;
                 }

+ 12 - 0
libs/openssl/crypto/err/openssl.txt

@@ -314,6 +314,7 @@ CMS_F_CMS_SIGNERINFO_VERIFY:152:CMS_SignerInfo_verify
 CMS_F_CMS_SIGNERINFO_VERIFY_CERT:153:cms_signerinfo_verify_cert
 CMS_F_CMS_SIGNERINFO_VERIFY_CONTENT:154:CMS_SignerInfo_verify_content
 CMS_F_CMS_SIGN_RECEIPT:163:CMS_sign_receipt
+CMS_F_CMS_SI_CHECK_ATTRIBUTES:183:CMS_si_check_attributes
 CMS_F_CMS_STREAM:155:CMS_stream
 CMS_F_CMS_UNCOMPRESS:156:CMS_uncompress
 CMS_F_CMS_VERIFY:157:CMS_verify
@@ -713,11 +714,14 @@ ENGINE_F_INT_ENGINE_CONFIGURE:188:int_engine_configure
 ENGINE_F_INT_ENGINE_MODULE_INIT:187:int_engine_module_init
 ENGINE_F_OSSL_HMAC_INIT:200:ossl_hmac_init
 EVP_F_AESNI_INIT_KEY:165:aesni_init_key
+EVP_F_AESNI_XTS_INIT_KEY:207:aesni_xts_init_key
 EVP_F_AES_GCM_CTRL:196:aes_gcm_ctrl
 EVP_F_AES_INIT_KEY:133:aes_init_key
 EVP_F_AES_OCB_CIPHER:169:aes_ocb_cipher
 EVP_F_AES_T4_INIT_KEY:178:aes_t4_init_key
+EVP_F_AES_T4_XTS_INIT_KEY:208:aes_t4_xts_init_key
 EVP_F_AES_WRAP_CIPHER:170:aes_wrap_cipher
+EVP_F_AES_XTS_INIT_KEY:209:aes_xts_init_key
 EVP_F_ALG_MODULE_INIT:177:alg_module_init
 EVP_F_ARIA_CCM_INIT_KEY:175:aria_ccm_init_key
 EVP_F_ARIA_GCM_CTRL:197:aria_gcm_ctrl
@@ -808,6 +812,7 @@ EVP_F_PKCS5_V2_SCRYPT_KEYIVGEN:180:PKCS5_v2_scrypt_keyivgen
 EVP_F_PKEY_SET_TYPE:158:pkey_set_type
 EVP_F_RC2_MAGIC_TO_METH:109:rc2_magic_to_meth
 EVP_F_RC5_CTRL:125:rc5_ctrl
+EVP_F_R_32_12_16_INIT_KEY:242:r_32_12_16_init_key
 EVP_F_S390X_AES_GCM_CTRL:201:s390x_aes_gcm_ctrl
 EVP_F_UPDATE:173:update
 KDF_F_PKEY_HKDF_CTRL_STR:103:pkey_hkdf_ctrl_str
@@ -1020,6 +1025,7 @@ RAND_F_RAND_POOL_ADD_BEGIN:113:rand_pool_add_begin
 RAND_F_RAND_POOL_ADD_END:114:rand_pool_add_end
 RAND_F_RAND_POOL_ATTACH:124:rand_pool_attach
 RAND_F_RAND_POOL_BYTES_NEEDED:115:rand_pool_bytes_needed
+RAND_F_RAND_POOL_GROW:125:rand_pool_grow
 RAND_F_RAND_POOL_NEW:116:rand_pool_new
 RAND_F_RAND_WRITE_FILE:112:RAND_write_file
 RSA_F_CHECK_PADDING_MD:140:check_padding_md
@@ -1930,6 +1936,7 @@ BN_R_P_IS_NOT_PRIME:112:p is not prime
 BN_R_TOO_MANY_ITERATIONS:113:too many iterations
 BN_R_TOO_MANY_TEMPORARY_VARIABLES:109:too many temporary variables
 CMS_R_ADD_SIGNER_ERROR:99:add signer error
+CMS_R_ATTRIBUTE_ERROR:161:attribute error
 CMS_R_CERTIFICATE_ALREADY_PRESENT:175:certificate already present
 CMS_R_CERTIFICATE_HAS_NO_KEYID:160:certificate has no keyid
 CMS_R_CERTIFICATE_VERIFY_ERROR:100:certificate verify error
@@ -2094,6 +2101,7 @@ DSA_R_DECODE_ERROR:104:decode error
 DSA_R_INVALID_DIGEST_TYPE:106:invalid digest type
 DSA_R_INVALID_PARAMETERS:112:invalid parameters
 DSA_R_MISSING_PARAMETERS:101:missing parameters
+DSA_R_MISSING_PRIVATE_KEY:111:missing private key
 DSA_R_MODULUS_TOO_LARGE:103:modulus too large
 DSA_R_NO_PARAMETERS_SET:107:no parameters set
 DSA_R_PARAMETER_ENCODING_ERROR:105:parameter encoding error
@@ -2218,6 +2226,7 @@ ENGINE_R_VERSION_INCOMPATIBILITY:145:version incompatibility
 EVP_R_AES_KEY_SETUP_FAILED:143:aes key setup failed
 EVP_R_ARIA_KEY_SETUP_FAILED:176:aria key setup failed
 EVP_R_BAD_DECRYPT:100:bad decrypt
+EVP_R_BAD_KEY_LENGTH:195:bad key length
 EVP_R_BUFFER_TOO_SMALL:155:buffer too small
 EVP_R_CAMELLIA_KEY_SETUP_FAILED:157:camellia key setup failed
 EVP_R_CIPHER_PARAMETER_ERROR:122:cipher parameter error
@@ -2287,6 +2296,7 @@ EVP_R_UNSUPPORTED_PRIVATE_KEY_ALGORITHM:118:unsupported private key algorithm
 EVP_R_UNSUPPORTED_SALT_TYPE:126:unsupported salt type
 EVP_R_WRAP_MODE_NOT_ALLOWED:170:wrap mode not allowed
 EVP_R_WRONG_FINAL_BLOCK_LENGTH:109:wrong final block length
+EVP_R_XTS_DUPLICATED_KEYS:183:xts duplicated keys
 KDF_R_INVALID_DIGEST:100:invalid digest
 KDF_R_MISSING_ITERATION_COUNT:109:missing iteration count
 KDF_R_MISSING_KEY:104:missing key
@@ -2527,6 +2537,7 @@ RSA_R_KEY_PRIME_NUM_INVALID:165:key prime num invalid
 RSA_R_KEY_SIZE_TOO_SMALL:120:key size too small
 RSA_R_LAST_OCTET_INVALID:134:last octet invalid
 RSA_R_MGF1_DIGEST_NOT_ALLOWED:152:mgf1 digest not allowed
+RSA_R_MISSING_PRIVATE_KEY:179:missing private key
 RSA_R_MODULUS_TOO_LARGE:105:modulus too large
 RSA_R_MP_COEFFICIENT_NOT_INVERSE_OF_R:168:mp coefficient not inverse of r
 RSA_R_MP_EXPONENT_NOT_CONGRUENT_TO_D:169:mp exponent not congruent to d
@@ -3003,6 +3014,7 @@ X509_R_CERT_ALREADY_IN_HASH_TABLE:101:cert already in hash table
 X509_R_CRL_ALREADY_DELTA:127:crl already delta
 X509_R_CRL_VERIFY_FAILURE:131:crl verify failure
 X509_R_IDP_MISMATCH:128:idp mismatch
+X509_R_INVALID_ATTRIBUTES:138:invalid attributes
 X509_R_INVALID_DIRECTORY:113:invalid directory
 X509_R_INVALID_FIELD_NAME:119:invalid field name
 X509_R_INVALID_TRUST:123:invalid trust

+ 3 - 3
libs/openssl/crypto/evp/bio_ok.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -38,9 +38,9 @@
         of memory.
 
         BIO_f_reliable splits data stream into blocks. Each block is prefixed
-        with it's length and suffixed with it's digest. So you need only
+        with its length and suffixed with its digest. So you need only
         several Kbytes of memory to buffer single block before verifying
-        it's digest.
+        its digest.
 
         BIO_f_reliable goes further and adds several important capabilities:
 

+ 83 - 8
libs/openssl/crypto/evp/e_aes.c

@@ -176,7 +176,7 @@ static void ctr64_inc(unsigned char *counter)
 # define HWAES_xts_decrypt aes_p8_xts_decrypt
 #endif
 
-#if     defined(AES_ASM) && !defined(I386_ONLY) &&      (  \
+#if     !defined(OPENSSL_NO_ASM) &&                     (  \
         ((defined(__i386)       || defined(__i386__)    || \
           defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
         defined(__x86_64)       || defined(__x86_64__)  || \
@@ -383,10 +383,25 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                               const unsigned char *iv, int enc)
 {
     EVP_AES_XTS_CTX *xctx = EVP_C_DATA(EVP_AES_XTS_CTX,ctx);
+
     if (!iv && !key)
         return 1;
 
     if (key) {
+        /* The key is two half length keys in reality */
+        const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2;
+
+        /*
+         * Verify that the two keys are different.
+         *
+         * This addresses Rogaway's vulnerability.
+         * See comment in aes_xts_init_key() below.
+         */
+        if (enc && CRYPTO_memcmp(key, key + bytes, bytes) == 0) {
+            EVPerr(EVP_F_AESNI_XTS_INIT_KEY, EVP_R_XTS_DUPLICATED_KEYS);
+            return 0;
+        }
+
         /* key_len is two AES keys */
         if (enc) {
             aesni_set_encrypt_key(key, EVP_CIPHER_CTX_key_length(ctx) * 4,
@@ -787,11 +802,26 @@ static int aes_t4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                                const unsigned char *iv, int enc)
 {
     EVP_AES_XTS_CTX *xctx = EVP_C_DATA(EVP_AES_XTS_CTX,ctx);
+
     if (!iv && !key)
         return 1;
 
     if (key) {
-        int bits = EVP_CIPHER_CTX_key_length(ctx) * 4;
+        /* The key is two half length keys in reality */
+        const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2;
+        const int bits = bytes * 8;
+
+        /*
+         * Verify that the two keys are different.
+         *
+         * This addresses Rogaway's vulnerability.
+         * See comment in aes_xts_init_key() below.
+         */
+        if (enc && CRYPTO_memcmp(key, key + bytes, bytes) == 0) {
+            EVPerr(EVP_F_AES_T4_XTS_INIT_KEY, EVP_R_XTS_DUPLICATED_KEYS);
+            return 0;
+        }
+
         xctx->stream = NULL;
         /* key_len is two AES keys */
         if (enc) {
@@ -1578,7 +1608,7 @@ static int s390x_aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 
     switch (type) {
     case EVP_CTRL_INIT:
-        ivlen = EVP_CIPHER_CTX_iv_length(c);
+        ivlen = EVP_CIPHER_iv_length(c->cipher);
         iv = EVP_CIPHER_CTX_iv_noconst(c);
         gctx->key_set = 0;
         gctx->iv_set = 0;
@@ -1589,6 +1619,10 @@ static int s390x_aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
         gctx->tls_aad_len = -1;
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = gctx->ivlen;
+        return 1;
+
     case EVP_CTRL_AEAD_SET_IVLEN:
         if (arg <= 0)
             return 0;
@@ -2299,6 +2333,10 @@ static int s390x_aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
         cctx->aes.ccm.tls_aad_len = -1;
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = 15 - cctx->aes.ccm.l;
+        return 1;
+
     case EVP_CTRL_AEAD_TLS1_AAD:
         if (arg != EVP_AEAD_TLS1_AAD_LEN)
             return 0;
@@ -2817,13 +2855,17 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
     case EVP_CTRL_INIT:
         gctx->key_set = 0;
         gctx->iv_set = 0;
-        gctx->ivlen = c->cipher->iv_len;
+        gctx->ivlen = EVP_CIPHER_iv_length(c->cipher);
         gctx->iv = c->iv;
         gctx->taglen = -1;
         gctx->iv_gen = 0;
         gctx->tls_aad_len = -1;
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = gctx->ivlen;
+        return 1;
+
     case EVP_CTRL_AEAD_SET_IVLEN:
         if (arg <= 0)
             return 0;
@@ -3273,7 +3315,7 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 #define CUSTOM_FLAGS    (EVP_CIPH_FLAG_DEFAULT_ASN1 \
                 | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
                 | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
-                | EVP_CIPH_CUSTOM_COPY)
+                | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_CUSTOM_IV_LENGTH)
 
 BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, gcm, GCM,
                     EVP_CIPH_FLAG_AEAD_CIPHER | CUSTOM_FLAGS)
@@ -3284,10 +3326,12 @@ BLOCK_CIPHER_custom(NID_aes, 128, 1, 12, gcm, GCM,
 
 static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 {
-    EVP_AES_XTS_CTX *xctx = EVP_C_DATA(EVP_AES_XTS_CTX,c);
+    EVP_AES_XTS_CTX *xctx = EVP_C_DATA(EVP_AES_XTS_CTX, c);
+
     if (type == EVP_CTRL_COPY) {
         EVP_CIPHER_CTX *out = ptr;
         EVP_AES_XTS_CTX *xctx_out = EVP_C_DATA(EVP_AES_XTS_CTX,out);
+
         if (xctx->xts.key1) {
             if (xctx->xts.key1 != &xctx->ks1)
                 return 0;
@@ -3311,11 +3355,36 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                             const unsigned char *iv, int enc)
 {
     EVP_AES_XTS_CTX *xctx = EVP_C_DATA(EVP_AES_XTS_CTX,ctx);
+
     if (!iv && !key)
         return 1;
 
     if (key)
         do {
+            /* The key is two half length keys in reality */
+            const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2;
+
+            /*
+             * Verify that the two keys are different.
+             *
+             * This addresses the vulnerability described in Rogaway's
+             * September 2004 paper:
+             *
+             *      "Efficient Instantiations of Tweakable Blockciphers and
+             *       Refinements to Modes OCB and PMAC".
+             *      (http://web.cs.ucdavis.edu/~rogaway/papers/offsets.pdf)
+             *
+             * FIPS 140-2 IG A.9 XTS-AES Key Generation Requirements states
+             * that:
+             *      "The check for Key_1 != Key_2 shall be done at any place
+             *       BEFORE using the keys in the XTS-AES algorithm to process
+             *       data with them."
+             */
+            if (enc && CRYPTO_memcmp(key, key + bytes, bytes) == 0) {
+                EVPerr(EVP_F_AES_XTS_INIT_KEY, EVP_R_XTS_DUPLICATED_KEYS);
+                return 0;
+            }
+
 #ifdef AES_XTS_ASM
             xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
 #else
@@ -3448,7 +3517,9 @@ static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
         cctx->len_set = 0;
         cctx->tls_aad_len = -1;
         return 1;
-
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = 15 - cctx->L;
+        return 1;
     case EVP_CTRL_AEAD_TLS1_AAD:
         /* Save the AAD for later use */
         if (arg != EVP_AEAD_TLS1_AAD_LEN)
@@ -3897,13 +3968,17 @@ static int aes_ocb_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
     case EVP_CTRL_INIT:
         octx->key_set = 0;
         octx->iv_set = 0;
-        octx->ivlen = EVP_CIPHER_CTX_iv_length(c);
+        octx->ivlen = EVP_CIPHER_iv_length(c->cipher);
         octx->iv = EVP_CIPHER_CTX_iv_noconst(c);
         octx->taglen = 16;
         octx->data_buf_len = 0;
         octx->aad_buf_len = 0;
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = octx->ivlen;
+        return 1;
+
     case EVP_CTRL_AEAD_SET_IVLEN:
         /* IV len must be 1 to 15 */
         if (arg <= 0 || arg > 15)

+ 11 - 2
libs/openssl/crypto/evp/e_aria.c

@@ -252,7 +252,7 @@ static int aria_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
     case EVP_CTRL_INIT:
         gctx->key_set = 0;
         gctx->iv_set = 0;
-        gctx->ivlen = EVP_CIPHER_CTX_iv_length(c);
+        gctx->ivlen = EVP_CIPHER_iv_length(c->cipher);
         gctx->iv = EVP_CIPHER_CTX_iv_noconst(c);
         gctx->taglen = -1;
         gctx->iv_gen = 0;
@@ -274,6 +274,10 @@ static int aria_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
         gctx->ivlen = arg;
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = gctx->ivlen;
+        return 1;
+
     case EVP_CTRL_AEAD_SET_TAG:
         if (arg <= 0 || arg > 16 || EVP_CIPHER_CTX_encrypting(c))
             return 0;
@@ -573,6 +577,10 @@ static int aria_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
         memcpy(EVP_CIPHER_CTX_iv_noconst(c), ptr, arg);
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = 15 - cctx->L;
+        return 1;
+
     case EVP_CTRL_AEAD_SET_IVLEN:
         arg = 15 - arg;
         /* fall thru */
@@ -742,7 +750,8 @@ static int aria_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 #define ARIA_AUTH_FLAGS  (EVP_CIPH_FLAG_DEFAULT_ASN1 \
                           | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
                           | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \
-                          | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_FLAG_AEAD_CIPHER)
+                          | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_FLAG_AEAD_CIPHER \
+                          | EVP_CIPH_CUSTOM_IV_LENGTH)
 
 #define BLOCK_CIPHER_aead(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
 static const EVP_CIPHER aria_##keylen##_##mode = { \

+ 6 - 1
libs/openssl/crypto/evp/e_chacha20_poly1305.c

@@ -534,6 +534,10 @@ static int chacha20_poly1305_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg,
         }
         return 1;
 
+    case EVP_CTRL_GET_IVLEN:
+        *(int *)ptr = actx->nonce_len;
+        return 1;
+
     case EVP_CTRL_AEAD_SET_IVLEN:
         if (arg <= 0 || arg > CHACHA20_POLY1305_MAX_IVLEN)
             return 0;
@@ -613,7 +617,8 @@ static EVP_CIPHER chacha20_poly1305 = {
     12,                 /* iv_len, 96-bit nonce in the context */
     EVP_CIPH_FLAG_AEAD_CIPHER | EVP_CIPH_CUSTOM_IV |
     EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT |
-    EVP_CIPH_CUSTOM_COPY | EVP_CIPH_FLAG_CUSTOM_CIPHER,
+    EVP_CIPH_CUSTOM_COPY | EVP_CIPH_FLAG_CUSTOM_CIPHER |
+    EVP_CIPH_CUSTOM_IV_LENGTH,
     chacha20_poly1305_init_key,
     chacha20_poly1305_cipher,
     chacha20_poly1305_cleanup,

+ 5 - 1
libs/openssl/crypto/evp/e_rc5.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -66,6 +66,10 @@ static int rc5_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
 static int r_32_12_16_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                                const unsigned char *iv, int enc)
 {
+    if (EVP_CIPHER_CTX_key_length(ctx) > 255) {
+        EVPerr(EVP_F_R_32_12_16_INIT_KEY, EVP_R_BAD_KEY_LENGTH);
+        return 0;
+    }
     RC5_32_set_key(&data(ctx)->ks, EVP_CIPHER_CTX_key_length(ctx),
                    key, data(ctx)->rounds);
     return 1;

+ 10 - 1
libs/openssl/crypto/evp/evp_err.c

@@ -1,6 +1,6 @@
 /*
  * Generated by util/mkerr.pl DO NOT EDIT
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -15,11 +15,15 @@
 
 static const ERR_STRING_DATA EVP_str_functs[] = {
     {ERR_PACK(ERR_LIB_EVP, EVP_F_AESNI_INIT_KEY, 0), "aesni_init_key"},
+    {ERR_PACK(ERR_LIB_EVP, EVP_F_AESNI_XTS_INIT_KEY, 0), "aesni_xts_init_key"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_GCM_CTRL, 0), "aes_gcm_ctrl"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_INIT_KEY, 0), "aes_init_key"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_OCB_CIPHER, 0), "aes_ocb_cipher"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_T4_INIT_KEY, 0), "aes_t4_init_key"},
+    {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_T4_XTS_INIT_KEY, 0),
+     "aes_t4_xts_init_key"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_WRAP_CIPHER, 0), "aes_wrap_cipher"},
+    {ERR_PACK(ERR_LIB_EVP, EVP_F_AES_XTS_INIT_KEY, 0), "aes_xts_init_key"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_ALG_MODULE_INIT, 0), "alg_module_init"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_ARIA_CCM_INIT_KEY, 0), "aria_ccm_init_key"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_ARIA_GCM_CTRL, 0), "aria_gcm_ctrl"},
@@ -149,6 +153,8 @@ static const ERR_STRING_DATA EVP_str_functs[] = {
     {ERR_PACK(ERR_LIB_EVP, EVP_F_PKEY_SET_TYPE, 0), "pkey_set_type"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_RC2_MAGIC_TO_METH, 0), "rc2_magic_to_meth"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_RC5_CTRL, 0), "rc5_ctrl"},
+    {ERR_PACK(ERR_LIB_EVP, EVP_F_R_32_12_16_INIT_KEY, 0),
+     "r_32_12_16_init_key"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_S390X_AES_GCM_CTRL, 0), "s390x_aes_gcm_ctrl"},
     {ERR_PACK(ERR_LIB_EVP, EVP_F_UPDATE, 0), "update"},
     {0, NULL}
@@ -160,6 +166,7 @@ static const ERR_STRING_DATA EVP_str_reasons[] = {
     {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_ARIA_KEY_SETUP_FAILED),
     "aria key setup failed"},
     {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_BAD_DECRYPT), "bad decrypt"},
+    {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_BAD_KEY_LENGTH), "bad key length"},
     {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_BUFFER_TOO_SMALL), "buffer too small"},
     {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_CAMELLIA_KEY_SETUP_FAILED),
     "camellia key setup failed"},
@@ -266,6 +273,8 @@ static const ERR_STRING_DATA EVP_str_reasons[] = {
     "wrap mode not allowed"},
     {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_WRONG_FINAL_BLOCK_LENGTH),
     "wrong final block length"},
+    {ERR_PACK(ERR_LIB_EVP, 0, EVP_R_XTS_DUPLICATED_KEYS),
+    "xts duplicated keys"},
     {0, NULL}
 };
 

+ 8 - 1
libs/openssl/crypto/evp/evp_lib.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -242,6 +242,13 @@ int EVP_CIPHER_iv_length(const EVP_CIPHER *cipher)
 
 int EVP_CIPHER_CTX_iv_length(const EVP_CIPHER_CTX *ctx)
 {
+    int i, rv;
+
+    if ((EVP_CIPHER_flags(ctx->cipher) & EVP_CIPH_CUSTOM_IV_LENGTH) != 0) {
+        rv = EVP_CIPHER_CTX_ctrl((EVP_CIPHER_CTX *)ctx, EVP_CTRL_GET_IVLEN,
+                                 0, &i);
+        return (rv == 1) ? i : -1;
+    }
     return ctx->cipher->iv_len;
 }
 

+ 4 - 1
libs/openssl/crypto/evp/m_sha3.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2017-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -108,6 +108,9 @@ static int sha3_final(EVP_MD_CTX *evp_ctx, unsigned char *md)
     size_t bsz = ctx->block_size;
     size_t num = ctx->num;
 
+    if (ctx->md_size == 0)
+        return 1;
+
     /*
      * Pad the data with 10*1. Note that |num| can be |bsz - 1|
      * in which case both byte operations below are performed on

+ 3 - 1
libs/openssl/crypto/include/internal/ctype.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2017-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -57,6 +57,8 @@ int ossl_ctype_check(int c, unsigned int mask);
 int ossl_tolower(int c);
 int ossl_toupper(int c);
 
+int ascii_isdigit(const char inchar);
+
 # define ossl_isalnum(c)        (ossl_ctype_check((c), CTYPE_MASK_alnum))
 # define ossl_isalpha(c)        (ossl_ctype_check((c), CTYPE_MASK_alpha))
 # ifdef CHARSET_EBCDIC

+ 3 - 3
libs/openssl/crypto/include/internal/rand_int.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -26,7 +26,6 @@ typedef struct rand_pool_st RAND_POOL;
 void rand_cleanup_int(void);
 void rand_drbg_cleanup_int(void);
 void drbg_delete_thread_state(void);
-void rand_fork(void);
 
 /* Hardware-based seeding functions. */
 size_t rand_acquire_entropy_from_tsc(RAND_POOL *pool);
@@ -52,7 +51,8 @@ void rand_drbg_cleanup_additional_data(RAND_POOL *pool, unsigned char *out);
 /*
  * RAND_POOL functions
  */
-RAND_POOL *rand_pool_new(int entropy_requested, size_t min_len, size_t max_len);
+RAND_POOL *rand_pool_new(int entropy_requested, int secure,
+                         size_t min_len, size_t max_len);
 RAND_POOL *rand_pool_attach(const unsigned char *buffer, size_t len,
                             size_t entropy);
 void rand_pool_free(RAND_POOL *pool);

+ 5 - 1
libs/openssl/crypto/include/internal/sm2err.h

@@ -1,6 +1,6 @@
 /*
  * Generated by util/mkerr.pl DO NOT EDIT
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -11,6 +11,10 @@
 #ifndef HEADER_SM2ERR_H
 # define HEADER_SM2ERR_H
 
+# ifndef HEADER_SYMHACKS_H
+#  include <openssl/symhacks.h>
+# endif
+
 # include <openssl/opensslconf.h>
 
 # ifndef OPENSSL_NO_SM2

+ 1 - 2
libs/openssl/crypto/init.c

@@ -40,7 +40,7 @@ static int stopped = 0;
  * destructor for threads terminating before libcrypto is initialized or
  * after it's de-initialized. Access to the key doesn't have to be
  * serialized for the said threads, because they didn't use libcrypto
- * and it doesn't matter if they pick "impossible" or derefernce real
+ * and it doesn't matter if they pick "impossible" or dereference real
  * key value and pull NULL past initialization in the first thread that
  * intends to use libcrypto.
  */
@@ -847,6 +847,5 @@ void OPENSSL_fork_parent(void)
 
 void OPENSSL_fork_child(void)
 {
-    rand_fork();
 }
 #endif

+ 4 - 4
libs/openssl/crypto/lhash/lhash.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -19,14 +19,14 @@
 
 /*
  * A hashing implementation that appears to be based on the linear hashing
- * alogrithm:
+ * algorithm:
  * https://en.wikipedia.org/wiki/Linear_hashing
  *
  * Litwin, Witold (1980), "Linear hashing: A new tool for file and table
  * addressing", Proc. 6th Conference on Very Large Databases: 212-223
- * http://hackthology.com/pdfs/Litwin-1980-Linear_Hashing.pdf
+ * https://hackthology.com/pdfs/Litwin-1980-Linear_Hashing.pdf
  *
- * From the wikipedia article "Linear hashing is used in the BDB Berkeley
+ * From the Wikipedia article "Linear hashing is used in the BDB Berkeley
  * database system, which in turn is used by many software systems such as
  * OpenLDAP, using a C implementation derived from the CACM article and first
  * published on the Usenet in 1988 by Esmond Pitt."

+ 1 - 1
libs/openssl/crypto/o_str.c

@@ -231,7 +231,7 @@ int openssl_strerror_r(int errnum, char *buf, size_t buflen)
      * buf is left unused.
      */
     err = strerror_r(errnum, buf, buflen);
-    if (err == NULL)
+    if (err == NULL || buflen == 0)
         return 0;
     /*
      * If err is statically allocated, err != buf and we need to copy the data.

+ 4 - 1
libs/openssl/crypto/pem/pvkfmt.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 2005-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -274,6 +274,9 @@ static EVP_PKEY *b2i_dss(const unsigned char **in,
         if (!read_lebn(&p, 20, &priv_key))
             goto memerr;
 
+        /* Set constant time flag before public key calculation */
+        BN_set_flags(priv_key, BN_FLG_CONSTTIME);
+
         /* Calculate public key */
         pub_key = BN_new();
         if (pub_key == NULL)

+ 9 - 5
libs/openssl/crypto/pkcs7/pk7_doit.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -137,7 +137,8 @@ static int pkcs7_encode_rinfo(PKCS7_RECIP_INFO *ri,
 }
 
 static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
-                               PKCS7_RECIP_INFO *ri, EVP_PKEY *pkey)
+                               PKCS7_RECIP_INFO *ri, EVP_PKEY *pkey,
+                               size_t fixlen)
 {
     EVP_PKEY_CTX *pctx = NULL;
     unsigned char *ek = NULL;
@@ -170,7 +171,9 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
     }
 
     if (EVP_PKEY_decrypt(pctx, ek, &eklen,
-                         ri->enc_key->data, ri->enc_key->length) <= 0) {
+                         ri->enc_key->data, ri->enc_key->length) <= 0
+            || eklen == 0
+            || (fixlen != 0 && eklen != fixlen)) {
         ret = 0;
         PKCS7err(PKCS7_F_PKCS7_DECRYPT_RINFO, ERR_R_EVP_LIB);
         goto err;
@@ -499,13 +502,14 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
             for (i = 0; i < sk_PKCS7_RECIP_INFO_num(rsk); i++) {
                 ri = sk_PKCS7_RECIP_INFO_value(rsk, i);
 
-                if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) < 0)
+                if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey,
+                        EVP_CIPHER_key_length(evp_cipher)) < 0)
                     goto err;
                 ERR_clear_error();
             }
         } else {
             /* Only exit on fatal errors, not decrypt failure */
-            if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) < 0)
+            if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey, 0) < 0)
                 goto err;
             ERR_clear_error();
         }

+ 8 - 5
libs/openssl/crypto/rand/drbg_lib.c

@@ -197,7 +197,7 @@ static RAND_DRBG *rand_drbg_new(int secure,
     }
 
     drbg->secure = secure && CRYPTO_secure_allocated(drbg);
-    drbg->fork_count = rand_fork_count;
+    drbg->fork_id = openssl_get_fork_id();
     drbg->parent = parent;
 
     if (parent == NULL) {
@@ -318,7 +318,7 @@ int RAND_DRBG_instantiate(RAND_DRBG *drbg,
     /*
      * NIST SP800-90Ar1 section 9.1 says you can combine getting the entropy
      * and nonce in 1 call by increasing the entropy with 50% and increasing
-     * the minimum length to accomadate the length of the nonce.
+     * the minimum length to accommodate the length of the nonce.
      * We do this in case a nonce is require and get_nonce is NULL.
      */
     if (drbg->min_noncelen > 0 && drbg->get_nonce == NULL) {
@@ -578,6 +578,7 @@ int RAND_DRBG_generate(RAND_DRBG *drbg, unsigned char *out, size_t outlen,
                        int prediction_resistance,
                        const unsigned char *adin, size_t adinlen)
 {
+    int fork_id;
     int reseed_required = 0;
 
     if (drbg->state != DRBG_READY) {
@@ -603,8 +604,10 @@ int RAND_DRBG_generate(RAND_DRBG *drbg, unsigned char *out, size_t outlen,
         return 0;
     }
 
-    if (drbg->fork_count != rand_fork_count) {
-        drbg->fork_count = rand_fork_count;
+    fork_id = openssl_get_fork_id();
+
+    if (drbg->fork_id != fork_id) {
+        drbg->fork_id = fork_id;
         reseed_required = 1;
     }
 
@@ -664,7 +667,7 @@ int RAND_DRBG_bytes(RAND_DRBG *drbg, unsigned char *out, size_t outlen)
     if (drbg->adin_pool == NULL) {
         if (drbg->type == 0)
             goto err;
-        drbg->adin_pool = rand_pool_new(0, 0, drbg->max_adinlen);
+        drbg->adin_pool = rand_pool_new(0, 0, 0, drbg->max_adinlen);
         if (drbg->adin_pool == NULL)
             goto err;
     }

+ 2 - 1
libs/openssl/crypto/rand/rand_err.c

@@ -1,6 +1,6 @@
 /*
  * Generated by util/mkerr.pl DO NOT EDIT
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -47,6 +47,7 @@ static const ERR_STRING_DATA RAND_str_functs[] = {
     {ERR_PACK(ERR_LIB_RAND, RAND_F_RAND_POOL_ATTACH, 0), "rand_pool_attach"},
     {ERR_PACK(ERR_LIB_RAND, RAND_F_RAND_POOL_BYTES_NEEDED, 0),
      "rand_pool_bytes_needed"},
+    {ERR_PACK(ERR_LIB_RAND, RAND_F_RAND_POOL_GROW, 0), "rand_pool_grow"},
     {ERR_PACK(ERR_LIB_RAND, RAND_F_RAND_POOL_NEW, 0), "rand_pool_new"},
     {ERR_PACK(ERR_LIB_RAND, RAND_F_RAND_WRITE_FILE, 0), "RAND_write_file"},
     {0, NULL}

+ 26 - 20
libs/openssl/crypto/rand/rand_lcl.h

@@ -1,5 +1,5 @@
 /*
- * Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the OpenSSL license (the "License").  You may not use
  * this file except in compliance with the License.  You can obtain a copy
@@ -45,7 +45,6 @@
 # define DRBG_MAX_LENGTH                         INT32_MAX
 
 
-
 /*
  * Maximum allocation size for RANDOM_POOL buffers
  *
@@ -72,6 +71,24 @@
  *                                1.5 * (RAND_DRBG_STRENGTH / 8))
  */
 
+/*
+ * Initial allocation minimum.
+ *
+ * There is a distinction between the secure and normal allocation minimums.
+ * Ideally, the secure allocation size should be a power of two.  The normal
+ * allocation size doesn't have any such restriction.
+ *
+ * The secure value is based on 128 bits of secure material, which is 16 bytes.
+ * Typically, the DRBGs will set a minimum larger than this so optimal
+ * allocation ought to take place (for full quality seed material).
+ *
+ * The normal value has been chosen by noticing that the rand_drbg_get_nonce
+ * function is usually the largest of the built in allocation (twenty four
+ * bytes and then appending another sixteen bytes).  This means the buffer ends
+ * with 40 bytes.  The value of forty eight is comfortably above this which
+ * allows some slack in the platform specific values used.
+ */
+# define RAND_POOL_MIN_ALLOCATION(secure) ((secure) ? 16 : 48)
 
 /* DRBG status values */
 typedef enum drbg_status_e {
@@ -150,9 +167,11 @@ struct rand_pool_st {
     size_t len; /* current number of random bytes contained in the pool */
 
     int attached;  /* true if pool was attached to existing buffer */
+    int secure;    /* 1: allocated on the secure heap, 0: otherwise */
 
     size_t min_len; /* minimum number of random bytes requested */
     size_t max_len; /* maximum number of random bytes (allocated buffer size) */
+    size_t alloc_len; /* current number of bytes allocated */
     size_t entropy; /* current entropy count in bits */
     size_t entropy_requested; /* requested entropy count in bits */
 };
@@ -167,12 +186,12 @@ struct rand_drbg_st {
     int secure; /* 1: allocated on the secure heap, 0: otherwise */
     int type; /* the nid of the underlying algorithm */
     /*
-     * Stores the value of the rand_fork_count global as of when we last
-     * reseeded.  The DRBG reseeds automatically whenever drbg->fork_count !=
-     * rand_fork_count.  Used to provide fork-safety and reseed this DRBG in
-     * the child process.
+     * Stores the return value of openssl_get_fork_id() as of when we last
+     * reseeded.  The DRBG reseeds automatically whenever drbg->fork_id !=
+     * openssl_get_fork_id().  Used to provide fork-safety and reseed this
+     * DRBG in the child process.
      */
-    int fork_count;
+    int fork_id;
     unsigned short flags; /* various external flags */
 
     /*
@@ -264,19 +283,6 @@ struct rand_drbg_st {
 /* The global RAND method, and the global buffer and DRBG instance. */
 extern RAND_METHOD rand_meth;
 
-/*
- * A "generation count" of forks.  Incremented in the child process after a
- * fork.  Since rand_fork_count is increment-only, and only ever written to in
- * the child process of the fork, which is guaranteed to be single-threaded, no
- * locking is needed for normal (read) accesses; the rest of pthread fork
- * processing is assumed to introduce the necessary memory barriers.  Sibling
- * children of a given parent will produce duplicate values, but this is not
- * problematic because the reseeding process pulls input from the system CSPRNG
- * and/or other global sources, so the siblings will end up generating
- * different output streams.
- */
-extern int rand_fork_count;
-
 /* DRBG helpers */
 int rand_drbg_restart(RAND_DRBG *drbg,
                       const unsigned char *buffer, size_t len, size_t entropy);

+ 125 - 23
libs/openssl/crypto/rand/rand_lib.c

@@ -26,8 +26,6 @@ static CRYPTO_RWLOCK *rand_meth_lock;
 static const RAND_METHOD *default_RAND_meth;
 static CRYPTO_ONCE rand_init = CRYPTO_ONCE_STATIC_INIT;
 
-int rand_fork_count;
-
 static CRYPTO_RWLOCK *rand_nonce_lock;
 static int rand_nonce_count;
 
@@ -150,7 +148,7 @@ size_t rand_drbg_get_entropy(RAND_DRBG *drbg,
         pool = drbg->seed_pool;
         pool->entropy_requested = entropy;
     } else {
-        pool = rand_pool_new(entropy, min_len, max_len);
+        pool = rand_pool_new(entropy, drbg->secure, min_len, max_len);
         if (pool == NULL)
             return 0;
     }
@@ -163,7 +161,9 @@ size_t rand_drbg_get_entropy(RAND_DRBG *drbg,
             size_t bytes = 0;
 
             /*
-             * Get random from parent, include our state as additional input.
+             * Get random data from parent. Include our address as additional input,
+             * in order to provide some additional distinction between different
+             * DRBG child instances.
              * Our lock is already held, but we need to lock our parent before
              * generating bits from it. (Note: taking the lock will be a no-op
              * if drbg->parent->lock == NULL.)
@@ -172,7 +172,7 @@ size_t rand_drbg_get_entropy(RAND_DRBG *drbg,
             if (RAND_DRBG_generate(drbg->parent,
                                    buffer, bytes_needed,
                                    prediction_resistance,
-                                   NULL, 0) != 0)
+                                   (unsigned char *)&drbg, sizeof(drbg)) != 0)
                 bytes = bytes_needed;
             drbg->reseed_next_counter
                 = tsan_load(&drbg->parent->reseed_prop_counter);
@@ -216,8 +216,12 @@ size_t rand_drbg_get_entropy(RAND_DRBG *drbg,
 void rand_drbg_cleanup_entropy(RAND_DRBG *drbg,
                                unsigned char *out, size_t outlen)
 {
-    if (drbg->seed_pool == NULL)
-        OPENSSL_secure_clear_free(out, outlen);
+    if (drbg->seed_pool == NULL) {
+        if (drbg->secure)
+            OPENSSL_secure_clear_free(out, outlen);
+        else
+            OPENSSL_clear_free(out, outlen);
+    }
 }
 
 
@@ -235,9 +239,10 @@ size_t rand_drbg_get_nonce(RAND_DRBG *drbg,
     struct {
         void * instance;
         int count;
-    } data = { NULL, 0 };
+    } data;
 
-    pool = rand_pool_new(0, min_len, max_len);
+    memset(&data, 0, sizeof(data));
+    pool = rand_pool_new(0, 0, min_len, max_len);
     if (pool == NULL)
         return 0;
 
@@ -266,7 +271,7 @@ size_t rand_drbg_get_nonce(RAND_DRBG *drbg,
 void rand_drbg_cleanup_nonce(RAND_DRBG *drbg,
                              unsigned char *out, size_t outlen)
 {
-    OPENSSL_secure_clear_free(out, outlen);
+    OPENSSL_clear_free(out, outlen);
 }
 
 /*
@@ -298,11 +303,6 @@ void rand_drbg_cleanup_additional_data(RAND_POOL *pool, unsigned char *out)
     rand_pool_reattach(pool, out);
 }
 
-void rand_fork(void)
-{
-    rand_fork_count++;
-}
-
 DEFINE_RUN_ONCE_STATIC(do_rand_init)
 {
 #ifndef OPENSSL_NO_ENGINE
@@ -362,7 +362,7 @@ void rand_cleanup_int(void)
 }
 
 /*
- * RAND_close_seed_files() ensures that any seed file decriptors are
+ * RAND_close_seed_files() ensures that any seed file descriptors are
  * closed after use.
  */
 void RAND_keep_random_devices_open(int keep)
@@ -401,7 +401,7 @@ int RAND_poll(void)
 
     } else {
         /* fill random pool and seed the current legacy RNG */
-        pool = rand_pool_new(RAND_DRBG_STRENGTH,
+        pool = rand_pool_new(RAND_DRBG_STRENGTH, 1,
                              (RAND_DRBG_STRENGTH + 7) / 8,
                              RAND_POOL_MAX_LENGTH);
         if (pool == NULL)
@@ -428,9 +428,11 @@ err:
  * Allocate memory and initialize a new random pool
  */
 
-RAND_POOL *rand_pool_new(int entropy_requested, size_t min_len, size_t max_len)
+RAND_POOL *rand_pool_new(int entropy_requested, int secure,
+                         size_t min_len, size_t max_len)
 {
     RAND_POOL *pool = OPENSSL_zalloc(sizeof(*pool));
+    size_t min_alloc_size = RAND_POOL_MIN_ALLOCATION(secure);
 
     if (pool == NULL) {
         RANDerr(RAND_F_RAND_POOL_NEW, ERR_R_MALLOC_FAILURE);
@@ -440,14 +442,22 @@ RAND_POOL *rand_pool_new(int entropy_requested, size_t min_len, size_t max_len)
     pool->min_len = min_len;
     pool->max_len = (max_len > RAND_POOL_MAX_LENGTH) ?
         RAND_POOL_MAX_LENGTH : max_len;
+    pool->alloc_len = min_len < min_alloc_size ? min_alloc_size : min_len;
+    if (pool->alloc_len > pool->max_len)
+        pool->alloc_len = pool->max_len;
+
+    if (secure)
+        pool->buffer = OPENSSL_secure_zalloc(pool->alloc_len);
+    else
+        pool->buffer = OPENSSL_zalloc(pool->alloc_len);
 
-    pool->buffer = OPENSSL_secure_zalloc(pool->max_len);
     if (pool->buffer == NULL) {
         RANDerr(RAND_F_RAND_POOL_NEW, ERR_R_MALLOC_FAILURE);
         goto err;
     }
 
     pool->entropy_requested = entropy_requested;
+    pool->secure = secure;
 
     return pool;
 
@@ -482,7 +492,7 @@ RAND_POOL *rand_pool_attach(const unsigned char *buffer, size_t len,
 
     pool->attached = 1;
 
-    pool->min_len = pool->max_len = pool->len;
+    pool->min_len = pool->max_len = pool->alloc_len = pool->len;
     pool->entropy = entropy;
 
     return pool;
@@ -502,8 +512,13 @@ void rand_pool_free(RAND_POOL *pool)
      * to rand_pool_attach() as `const unsigned char*`.
      * (see corresponding comment in rand_pool_attach()).
      */
-    if (!pool->attached)
-        OPENSSL_secure_clear_free(pool->buffer, pool->max_len);
+    if (!pool->attached) {
+        if (pool->secure)
+            OPENSSL_secure_clear_free(pool->buffer, pool->alloc_len);
+        else
+            OPENSSL_clear_free(pool->buffer, pool->alloc_len);
+    }
+
     OPENSSL_free(pool);
 }
 
@@ -596,6 +611,42 @@ size_t rand_pool_entropy_needed(RAND_POOL *pool)
     return 0;
 }
 
+/* Increase the allocation size -- not usable for an attached pool */
+static int rand_pool_grow(RAND_POOL *pool, size_t len)
+{
+    if (len > pool->alloc_len - pool->len) {
+        unsigned char *p;
+        const size_t limit = pool->max_len / 2;
+        size_t newlen = pool->alloc_len;
+
+        if (pool->attached || len > pool->max_len - pool->len) {
+            RANDerr(RAND_F_RAND_POOL_GROW, ERR_R_INTERNAL_ERROR);
+            return 0;
+        }
+
+        do
+            newlen = newlen < limit ? newlen * 2 : pool->max_len;
+        while (len > newlen - pool->len);
+
+        if (pool->secure)
+            p = OPENSSL_secure_zalloc(newlen);
+        else
+            p = OPENSSL_zalloc(newlen);
+        if (p == NULL) {
+            RANDerr(RAND_F_RAND_POOL_GROW, ERR_R_MALLOC_FAILURE);
+            return 0;
+        }
+        memcpy(p, pool->buffer, pool->len);
+        if (pool->secure)
+            OPENSSL_secure_clear_free(pool->buffer, pool->alloc_len);
+        else
+            OPENSSL_clear_free(pool->buffer, pool->alloc_len);
+        pool->buffer = p;
+        pool->alloc_len = newlen;
+    }
+    return 1;
+}
+
 /*
  * Returns the number of bytes needed to fill the pool, assuming
  * the input has 1 / |entropy_factor| entropy bits per data bit.
@@ -625,6 +676,24 @@ size_t rand_pool_bytes_needed(RAND_POOL *pool, unsigned int entropy_factor)
         /* to meet the min_len requirement */
         bytes_needed = pool->min_len - pool->len;
 
+    /*
+     * Make sure the buffer is large enough for the requested amount
+     * of data. This guarantees that existing code patterns where
+     * rand_pool_add_begin, rand_pool_add_end or rand_pool_add
+     * are used to collect entropy data without any error handling
+     * whatsoever, continue to be valid.
+     * Furthermore if the allocation here fails once, make sure that
+     * we don't fall back to a less secure or even blocking random source,
+     * as that could happen by the existing code patterns.
+     * This is not a concern for additional data, therefore that
+     * is not needed if rand_pool_grow fails in other places.
+     */
+    if (!rand_pool_grow(pool, bytes_needed)) {
+        /* persistent error for this pool */
+        pool->max_len = pool->len = 0;
+        return 0;
+    }
+
     return bytes_needed;
 }
 
@@ -657,6 +726,27 @@ int rand_pool_add(RAND_POOL *pool,
     }
 
     if (len > 0) {
+        /*
+         * This is to protect us from accidentally passing the buffer
+         * returned from rand_pool_add_begin.
+         * The check for alloc_len makes sure we do not compare the
+         * address of the end of the allocated memory to something
+         * different, since that comparison would have an
+         * indeterminate result.
+         */
+        if (pool->alloc_len > pool->len && pool->buffer + pool->len == buffer) {
+            RANDerr(RAND_F_RAND_POOL_ADD, ERR_R_INTERNAL_ERROR);
+            return 0;
+        }
+        /*
+         * Growing the buffer here is only needed when a pool is used to
+         * collect additional data.
+         * For entropy data, as long as the allocation request stays within
+         * the limits given by rand_pool_bytes_needed this rand_pool_grow
+         * below is guaranteed to succeed, thus no allocation happens.
+         */
+        if (!rand_pool_grow(pool, len))
+            return 0;
         memcpy(pool->buffer + pool->len, buffer, len);
         pool->len += len;
         pool->entropy += entropy;
@@ -692,6 +782,18 @@ unsigned char *rand_pool_add_begin(RAND_POOL *pool, size_t len)
         return NULL;
     }
 
+    /*
+     * As long as the allocation request stays within the limits given
+     * by rand_pool_bytes_needed this rand_pool_grow below is guaranteed
+     * to succeed, thus no allocation happens.
+     * Growth is only needed when a pool is used to collect
+     * additional data. In that case the buffer might need to grow
+     * here, and the caller is responsible for checking the return
+     * value of this function.
+     */
+    if (!rand_pool_grow(pool, len))
+        return NULL;
+
     return pool->buffer + pool->len;
 }
 
@@ -706,7 +808,7 @@ unsigned char *rand_pool_add_begin(RAND_POOL *pool, size_t len)
  */
 int rand_pool_add_end(RAND_POOL *pool, size_t len, size_t entropy)
 {
-    if (len > pool->max_len - pool->len) {
+    if (len > pool->alloc_len - pool->len) {
         RANDerr(RAND_F_RAND_POOL_ADD_END, RAND_R_RANDOM_POOL_OVERFLOW);
         return 0;
     }

+ 136 - 49
libs/openssl/crypto/rand/rand_unix.c

@@ -14,14 +14,19 @@
 #include <stdio.h>
 #include "internal/cryptlib.h"
 #include <openssl/rand.h>
+#include <openssl/crypto.h>
 #include "rand_lcl.h"
 #include "internal/rand_int.h"
 #include <stdio.h>
 #include "internal/dso.h"
-#if defined(__linux)
-# include <asm/unistd.h>
+#ifdef __linux
+# include <sys/syscall.h>
+# ifdef DEVRANDOM_WAIT
+#  include <sys/shm.h>
+#  include <sys/utsname.h>
+# endif
 #endif
-#if defined(__FreeBSD__)
+#if defined(__FreeBSD__) && !defined(OPENSSL_SYS_UEFI)
 # include <sys/types.h>
 # include <sys/sysctl.h>
 # include <sys/param.h>
@@ -275,6 +280,17 @@ static ssize_t sysctl_random(char *buf, size_t buflen)
 #  endif
 
 #  if defined(OPENSSL_RAND_SEED_GETRANDOM)
+
+#   if defined(__linux) && !defined(__NR_getrandom)
+#    if defined(__arm__) && defined(__NR_SYSCALL_BASE)
+#     define __NR_getrandom    (__NR_SYSCALL_BASE+384)
+#    elif defined(__i386__)
+#     define __NR_getrandom    355
+#    elif defined(__x86_64__) && !defined(__ILP32__)
+#     define __NR_getrandom    318
+#    endif
+#   endif
+
 /*
  * syscall_random(): Try to get random data using a system call
  * returns the number of bytes returned in buf, or < 0 on error.
@@ -346,6 +362,91 @@ static struct random_device {
 } random_devices[OSSL_NELEM(random_device_paths)];
 static int keep_random_devices_open = 1;
 
+#   if defined(__linux) && defined(DEVRANDOM_WAIT)
+static void *shm_addr;
+
+static void cleanup_shm(void)
+{
+    shmdt(shm_addr);
+}
+
+/*
+ * Ensure that the system randomness source has been adequately seeded.
+ * This is done by having the first start of libcrypto wait until the device
+ * /dev/random becomes able to supply a byte of entropy.  Subsequent starts
+ * of the library and later reseedings do not need to do this.
+ */
+static int wait_random_seeded(void)
+{
+    static int seeded = OPENSSL_RAND_SEED_DEVRANDOM_SHM_ID < 0;
+    static const int kernel_version[] = { DEVRANDOM_SAFE_KERNEL };
+    int kernel[2];
+    int shm_id, fd, r;
+    char c, *p;
+    struct utsname un;
+    fd_set fds;
+
+    if (!seeded) {
+        /* See if anything has created the global seeded indication */
+        if ((shm_id = shmget(OPENSSL_RAND_SEED_DEVRANDOM_SHM_ID, 1, 0)) == -1) {
+            /*
+             * Check the kernel's version and fail if it is too recent.
+             *
+             * Linux kernels from 4.8 onwards do not guarantee that
+             * /dev/urandom is properly seeded when /dev/random becomes
+             * readable.  However, such kernels support the getrandom(2)
+             * system call and this should always succeed which renders
+             * this alternative but essentially identical source moot.
+             */
+            if (uname(&un) == 0) {
+                kernel[0] = atoi(un.release);
+                p = strchr(un.release, '.');
+                kernel[1] = p == NULL ? 0 : atoi(p + 1);
+                if (kernel[0] > kernel_version[0]
+                    || (kernel[0] == kernel_version[0]
+                        && kernel[1] >= kernel_version[1])) {
+                    return 0;
+                }
+            }
+            /* Open /dev/random and wait for it to be readable */
+            if ((fd = open(DEVRANDOM_WAIT, O_RDONLY)) != -1) {
+                if (DEVRANDM_WAIT_USE_SELECT && fd < FD_SETSIZE) {
+                    FD_ZERO(&fds);
+                    FD_SET(fd, &fds);
+                    while ((r = select(fd + 1, &fds, NULL, NULL, NULL)) < 0
+                           && errno == EINTR);
+                } else {
+                    while ((r = read(fd, &c, 1)) < 0 && errno == EINTR);
+                }
+                close(fd);
+                if (r == 1) {
+                    seeded = 1;
+                    /* Create the shared memory indicator */
+                    shm_id = shmget(OPENSSL_RAND_SEED_DEVRANDOM_SHM_ID, 1,
+                                    IPC_CREAT | S_IRUSR | S_IRGRP | S_IROTH);
+                }
+            }
+        }
+        if (shm_id != -1) {
+            seeded = 1;
+            /*
+             * Map the shared memory to prevent its premature destruction.
+             * If this call fails, it isn't a big problem.
+             */
+            shm_addr = shmat(shm_id, NULL, SHM_RDONLY);
+            if (shm_addr != (void *)-1)
+                OPENSSL_atexit(&cleanup_shm);
+        }
+    }
+    return seeded;
+}
+#   else /* defined __linux */
+static int wait_random_seeded(void)
+{
+    return 1;
+}
+#   endif
+
 /*
  * Verify that the file descriptor associated with the random source is
  * still valid. The rationale for doing this is the fact that it is not
@@ -472,12 +573,12 @@ size_t rand_pool_acquire_entropy(RAND_POOL *pool)
 #  if defined(OPENSSL_RAND_SEED_NONE)
     return rand_pool_entropy_available(pool);
 #  else
-    size_t bytes_needed;
-    size_t entropy_available = 0;
-    unsigned char *buffer;
+    size_t entropy_available;
 
 #   if defined(OPENSSL_RAND_SEED_GETRANDOM)
     {
+        size_t bytes_needed;
+        unsigned char *buffer;
         ssize_t bytes;
         /* Maximum allowed number of consecutive unsuccessful attempts */
         int attempts = 3;
@@ -507,36 +608,16 @@ size_t rand_pool_acquire_entropy(RAND_POOL *pool)
 #   endif
 
 #   if defined(OPENSSL_RAND_SEED_DEVRANDOM)
-    bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/);
-    {
+    if (wait_random_seeded()) {
+        size_t bytes_needed;
+        unsigned char *buffer;
         size_t i;
-#ifdef DEVRANDOM_WAIT
-        static int wait_done = 0;
 
-        /*
-         * On some implementations reading from /dev/urandom is possible
-         * before it is initialized. Therefore we wait for /dev/random
-         * to be readable to make sure /dev/urandom is initialized.
-         */
-        if (!wait_done && bytes_needed > 0) {
-             int f = open(DEVRANDOM_WAIT, O_RDONLY);
-
-             if (f >= 0) {
-                 fd_set fds;
-
-                 FD_ZERO(&fds);
-                 FD_SET(f, &fds);
-                 while (select(f+1, &fds, NULL, NULL, NULL) < 0
-                        && errno == EINTR);
-                 close(f);
-             }
-             wait_done = 1;
-        }
-#endif
-
-        for (i = 0; bytes_needed > 0 && i < OSSL_NELEM(random_device_paths); i++) {
+        bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/);
+        for (i = 0; bytes_needed > 0 && i < OSSL_NELEM(random_device_paths);
+             i++) {
             ssize_t bytes = 0;
-            /* Maximum allowed number of consecutive unsuccessful attempts */
+            /* Maximum number of consecutive unsuccessful attempts */
             int attempts = 3;
             const int fd = get_random_device(i);
 
@@ -550,7 +631,7 @@ size_t rand_pool_acquire_entropy(RAND_POOL *pool)
                 if (bytes > 0) {
                     rand_pool_add_end(pool, bytes, 8 * bytes);
                     bytes_needed -= bytes;
-                    attempts = 3; /* reset counter after successful attempt */
+                    attempts = 3; /* reset counter on successful attempt */
                 } else if (bytes < 0 && errno != EINTR) {
                     break;
                 }
@@ -558,7 +639,7 @@ size_t rand_pool_acquire_entropy(RAND_POOL *pool)
             if (bytes < 0 || !keep_random_devices_open)
                 close_random_device(i);
 
-            bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/);
+            bytes_needed = rand_pool_bytes_needed(pool, 1);
         }
         entropy_available = rand_pool_entropy_available(pool);
         if (entropy_available > 0)
@@ -579,26 +660,29 @@ size_t rand_pool_acquire_entropy(RAND_POOL *pool)
 #   endif
 
 #   if defined(OPENSSL_RAND_SEED_EGD)
-    bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/);
-    if (bytes_needed > 0) {
+    {
         static const char *paths[] = { DEVRANDOM_EGD, NULL };
+        size_t bytes_needed;
+        unsigned char *buffer;
         int i;
 
-        for (i = 0; paths[i] != NULL; i++) {
+        bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/);
+        for (i = 0; bytes_needed > 0 && paths[i] != NULL; i++) {
+            size_t bytes = 0;
+            int num;
+
             buffer = rand_pool_add_begin(pool, bytes_needed);
-            if (buffer != NULL) {
-                size_t bytes = 0;
-                int num = RAND_query_egd_bytes(paths[i],
-                                               buffer, (int)bytes_needed);
-                if (num == (int)bytes_needed)
-                    bytes = bytes_needed;
+            num = RAND_query_egd_bytes(paths[i],
+                                       buffer, (int)bytes_needed);
+            if (num == (int)bytes_needed)
+                bytes = bytes_needed;
 
-                rand_pool_add_end(pool, bytes, 8 * bytes);
-                entropy_available = rand_pool_entropy_available(pool);
-            }
-            if (entropy_available > 0)
-                return entropy_available;
+            rand_pool_add_end(pool, bytes, 8 * bytes);
+            bytes_needed = rand_pool_bytes_needed(pool, 1);
         }
+        entropy_available = rand_pool_entropy_available(pool);
+        if (entropy_available > 0)
+            return entropy_available;
     }
 #   endif
 
@@ -632,15 +716,18 @@ int rand_pool_add_nonce_data(RAND_POOL *pool)
 int rand_pool_add_additional_data(RAND_POOL *pool)
 {
     struct {
+        int fork_id;
         CRYPTO_THREAD_ID tid;
         uint64_t time;
     } data = { 0 };
 
     /*
      * Add some noise from the thread id and a high resolution timer.
+     * The fork_id adds some extra fork-safety.
      * The thread id adds a little randomness if the drbg is accessed
      * concurrently (which is the case for the <master> drbg).
      */
+    data.fork_id = openssl_get_fork_id();
     data.tid = CRYPTO_THREAD_get_current_id();
     data.time = get_timer_bits();
 

Some files were not shown because too many files changed in this diff