
Only use assembly code

Linus Yang 8 years ago
parent
commit 4d797fb55d
9 changed files with 2108 additions and 792 deletions
  1. lib/aes_acc/aes0.c        + 0 - 600
  2. lib/aes_acc/aes0.h        + 0 - 45
  3. lib/aes_acc/aesacc.c      + 89 - 108
  4. lib/aes_acc/aesacc.h      + 0 - 20
  5. lib/aes_acc/aesni.c       + 5 - 8
  6. lib/aes_acc/aesni.h       + 5 - 7
  7. lib/aes_acc/asm/arm64.S   + 1178 - 0
  8. lib/aes_acc/asm/x64.S     + 827 - 0
  9. makefile                  + 4 - 4

+ 0 - 600
lib/aes_acc/aes0.c

@@ -1,600 +0,0 @@
-
-/*
- *  this file comes from https://github.com/kokke/tiny-AES128-C
- */
-
-/*
-
-This is an implementation of the AES algorithm, specifically ECB and CBC mode.
-Block size can be chosen in aes.h - available choices are AES128, AES192, AES256.
-
-The implementation is verified against the test vectors in:
-  National Institute of Standards and Technology Special Publication 800-38A 2001 ED
-
-ECB-AES128
-----------
-
-  plain-text:
-    6bc1bee22e409f96e93d7e117393172a
-    ae2d8a571e03ac9c9eb76fac45af8e51
-    30c81c46a35ce411e5fbc1191a0a52ef
-    f69f2445df4f9b17ad2b417be66c3710
-
-  key:
-    2b7e151628aed2a6abf7158809cf4f3c
-
-  resulting cipher
-    3ad77bb40d7a3660a89ecaf32466ef97 
-    f5d3d58503b9699de785895a96fdbaaf 
-    43b1cd7f598ece23881b00e3ed030688 
-    7b0c785e27e8ad3f8223207104725dd4 
-
-
-NOTE:   String length must be evenly divisible by 16 bytes (str_len % 16 == 0).
-        You should pad the end of the string with zeros if this is not the case.
-        For AES192/256 the key is larger, but the block size is still 16 bytes.
-
-*/
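
[Editor's note: the quoted NIST SP 800-38A vectors make a convenient self-test for this (now removed) software path. A minimal sketch, assuming aes0.h/aes0.c are still built into the program and AES128 is selected; not part of this commit:]

    #include <stdio.h>
    #include <string.h>
    #include "aes0.h"

    int main(void)
    {
        /* ECB-AES128 vector quoted in the comment above (NIST SP 800-38A) */
        static const uint8_t key[16] = {
            0x2b,0x7e,0x15,0x16,0x28,0xae,0xd2,0xa6,
            0xab,0xf7,0x15,0x88,0x09,0xcf,0x4f,0x3c };
        static const uint8_t plain[16] = {
            0x6b,0xc1,0xbe,0xe2,0x2e,0x40,0x9f,0x96,
            0xe9,0x3d,0x7e,0x11,0x73,0x93,0x17,0x2a };
        static const uint8_t expect[16] = {
            0x3a,0xd7,0x7b,0xb4,0x0d,0x7a,0x36,0x60,
            0xa8,0x9e,0xca,0xf3,0x24,0x66,0xef,0x97 };
        uint8_t out[16];

        AES_ECB_encrypt0(plain, key, out, sizeof(out));
        printf("ECB-AES128 %s\n", memcmp(out, expect, 16) == 0 ? "OK" : "FAIL");
        return 0;
    }
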
-
-
-/*****************************************************************************/
-/* Includes:                                                                 */
-/*****************************************************************************/
-#include <stdint.h>
-#include <string.h> // CBC mode, for memset
-#include "aes0.h"
-
-/*****************************************************************************/
-/* Defines:                                                                  */
-/*****************************************************************************/
-// The number of columns comprising a state in AES. This is a constant in AES. Value=4
-#define Nb 4
-#define BLOCKLEN 16 //Block length in bytes AES is 128b block only
-
-#if defined(AES256) && (AES256 == 1)
-    #define Nk 8
-    #define KEYLEN 32
-    #define Nr 14
-    #define keyExpSize 240
-#elif defined(AES192) && (AES192 == 1)
-    #define Nk 6
-    #define KEYLEN 24
-    #define Nr 12
-    #define keyExpSize 208
-#else
-    #define Nk 4        // The number of 32 bit words in a key.
-    #define KEYLEN 16   // Key length in bytes
-    #define Nr 10       // The number of rounds in AES Cipher.
-    #define keyExpSize 176
-#endif
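
[Editor's note: keyExpSize is just the byte size of the expanded key schedule, Nb * (Nr + 1) round-key words of 4 bytes each. A quick arithmetic check of the three values above, not in the original source:]

    /* keyExpSize = 4 * Nb * (Nr + 1):
     *   AES-128: 4 * 4 * (10 + 1) = 176 bytes
     *   AES-192: 4 * 4 * (12 + 1) = 208 bytes
     *   AES-256: 4 * 4 * (14 + 1) = 240 bytes
     */
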
-
-// jcallan@github points out that declaring Multiply as a function 
-// reduces code size considerably with the Keil ARM compiler.
-// See this link for more information: https://github.com/kokke/tiny-AES128-C/pull/3
-#ifndef MULTIPLY_AS_A_FUNCTION
-  #define MULTIPLY_AS_A_FUNCTION 0
-#endif
-
-
-/*****************************************************************************/
-/* Private variables:                                                        */
-/*****************************************************************************/
-// state - array holding the intermediate results during decryption.
-typedef uint8_t state_t[4][4];
-static state_t* state;
-
-// The array that stores the round keys.
-static uint8_t RoundKey[keyExpSize];
-
-// The Key input to the AES Program
-static const uint8_t* Key;
-
-#if defined(CBC) && CBC
-  // Initial Vector used only for CBC mode
-  static uint8_t* Iv;
-#endif
-
-// The lookup-tables are marked const so they can be placed in read-only storage instead of RAM
-// The numbers below can be computed dynamically trading ROM for RAM - 
-// This can be useful in (embedded) bootloader applications, where ROM is often limited.
-static const uint8_t sbox[256] = {
-  //0     1    2      3     4    5     6     7      8    9     A      B    C     D     E     F
-  0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
-  0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
-  0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-  0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
-  0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-  0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-  0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
-  0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
-  0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-  0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-  0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
-  0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-  0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
-  0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
-  0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-  0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 };
-
-static const uint8_t rsbox[256] = {
-  0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
-  0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
-  0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
-  0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
-  0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
-  0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
-  0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
-  0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
-  0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
-  0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
-  0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
-  0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
-  0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
-  0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
-  0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
-  0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d };
-
-// The round constant word array, Rcon[i], contains the values given by 
-// x to the power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8)
-static const uint8_t Rcon[11] = {
-  0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
-
-/*
- * Jordan Goulder points out in PR #12 (https://github.com/kokke/tiny-AES128-C/pull/12),
- * that you can remove most of the elements in the Rcon array, because they are unused.
- *
- * From Wikipedia's article on the Rijndael key schedule @ https://en.wikipedia.org/wiki/Rijndael_key_schedule#Rcon
- * 
- * "Only the first some of these constants are actually used – up to rcon[10] for AES-128 (as 11 round keys are needed), 
- *  up to rcon[8] for AES-192, up to rcon[7] for AES-256. rcon[0] is not used in AES algorithm."
- *
- * ... which is why the full array below has been 'disabled'.
- */
-#if 0
-static const uint8_t Rcon[256] = {
-  0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a,
-  0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39,
-  0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a,
-  0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8,
-  0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef,
-  0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc,
-  0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b,
-  0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3,
-  0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94,
-  0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20,
-  0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35,
-  0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f,
-  0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04,
-  0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63,
-  0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd,
-  0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d };
-#endif
-
-/*****************************************************************************/
-/* Private functions:                                                        */
-/*****************************************************************************/
-static uint8_t getSBoxValue(uint8_t num)
-{
-  return sbox[num];
-}
-
-static uint8_t getSBoxInvert(uint8_t num)
-{
-  return rsbox[num];
-}
-
-// This function produces Nb(Nr+1) round keys. The round keys are used in each round to decrypt the states. 
-static void KeyExpansion(void)
-{
-  uint32_t i, k;
-  uint8_t tempa[4]; // Used for the column/row operations
-  
-  // The first round key is the key itself.
-  for (i = 0; i < Nk; ++i)
-  {
-    RoundKey[(i * 4) + 0] = Key[(i * 4) + 0];
-    RoundKey[(i * 4) + 1] = Key[(i * 4) + 1];
-    RoundKey[(i * 4) + 2] = Key[(i * 4) + 2];
-    RoundKey[(i * 4) + 3] = Key[(i * 4) + 3];
-  }
-
-  // All other round keys are found from the previous round keys.
-  //i == Nk
-  for (; i < Nb * (Nr + 1); ++i)
-  {
-    {
-      tempa[0]=RoundKey[(i-1) * 4 + 0];
-      tempa[1]=RoundKey[(i-1) * 4 + 1];
-      tempa[2]=RoundKey[(i-1) * 4 + 2];
-      tempa[3]=RoundKey[(i-1) * 4 + 3];
-    }
-
-    if (i % Nk == 0)
-    {
-      // This function shifts the 4 bytes in a word to the left once.
-      // [a0,a1,a2,a3] becomes [a1,a2,a3,a0]
-
-      // Function RotWord()
-      {
-        k = tempa[0];
-        tempa[0] = tempa[1];
-        tempa[1] = tempa[2];
-        tempa[2] = tempa[3];
-        tempa[3] = k;
-      }
-
-      // SubWord() is a function that takes a four-byte input word and 
-      // applies the S-box to each of the four bytes to produce an output word.
-
-      // Function Subword()
-      {
-        tempa[0] = getSBoxValue(tempa[0]);
-        tempa[1] = getSBoxValue(tempa[1]);
-        tempa[2] = getSBoxValue(tempa[2]);
-        tempa[3] = getSBoxValue(tempa[3]);
-      }
-
-      tempa[0] =  tempa[0] ^ Rcon[i/Nk];
-    }
-#if defined(AES256) && (AES256 == 1)
-    if (i % Nk == 4)
-    {
-      // Function Subword()
-      {
-        tempa[0] = getSBoxValue(tempa[0]);
-        tempa[1] = getSBoxValue(tempa[1]);
-        tempa[2] = getSBoxValue(tempa[2]);
-        tempa[3] = getSBoxValue(tempa[3]);
-      }
-    }
-#endif
-    RoundKey[i * 4 + 0] = RoundKey[(i - Nk) * 4 + 0] ^ tempa[0];
-    RoundKey[i * 4 + 1] = RoundKey[(i - Nk) * 4 + 1] ^ tempa[1];
-    RoundKey[i * 4 + 2] = RoundKey[(i - Nk) * 4 + 2] ^ tempa[2];
-    RoundKey[i * 4 + 3] = RoundKey[(i - Nk) * 4 + 3] ^ tempa[3];
-  }
-}
-
-// This function adds the round key to state.
-// The round key is added to the state by an XOR function.
-static void AddRoundKey(uint8_t round)
-{
-  uint8_t i,j;
-  for (i=0;i<4;++i)
-  {
-    for (j = 0; j < 4; ++j)
-    {
-      (*state)[i][j] ^= RoundKey[round * Nb * 4 + i * Nb + j];
-    }
-  }
-}
-
-// The SubBytes Function Substitutes the values in the
-// state matrix with values in an S-box.
-static void SubBytes(void)
-{
-  uint8_t i, j;
-  for (i = 0; i < 4; ++i)
-  {
-    for (j = 0; j < 4; ++j)
-    {
-      (*state)[j][i] = getSBoxValue((*state)[j][i]);
-    }
-  }
-}
-
-// The ShiftRows() function shifts the rows in the state to the left.
-// Each row is shifted with different offset.
-// Offset = Row number. So the first row is not shifted.
-static void ShiftRows(void)
-{
-  uint8_t temp;
-
-  // Rotate first row 1 columns to left  
-  temp           = (*state)[0][1];
-  (*state)[0][1] = (*state)[1][1];
-  (*state)[1][1] = (*state)[2][1];
-  (*state)[2][1] = (*state)[3][1];
-  (*state)[3][1] = temp;
-
-  // Rotate second row 2 columns to left  
-  temp           = (*state)[0][2];
-  (*state)[0][2] = (*state)[2][2];
-  (*state)[2][2] = temp;
-
-  temp           = (*state)[1][2];
-  (*state)[1][2] = (*state)[3][2];
-  (*state)[3][2] = temp;
-
-  // Rotate third row 3 columns to left
-  temp           = (*state)[0][3];
-  (*state)[0][3] = (*state)[3][3];
-  (*state)[3][3] = (*state)[2][3];
-  (*state)[2][3] = (*state)[1][3];
-  (*state)[1][3] = temp;
-}
-
-static uint8_t xtime(uint8_t x)
-{
-  return ((x<<1) ^ (((x>>7) & 1) * 0x1b));
-}
-
-// MixColumns function mixes the columns of the state matrix
-static void MixColumns(void)
-{
-  uint8_t i;
-  uint8_t Tmp,Tm,t;
-  for (i = 0; i < 4; ++i)
-  {  
-    t   = (*state)[i][0];
-    Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3] ;
-    Tm  = (*state)[i][0] ^ (*state)[i][1] ; Tm = xtime(Tm);  (*state)[i][0] ^= Tm ^ Tmp ;
-    Tm  = (*state)[i][1] ^ (*state)[i][2] ; Tm = xtime(Tm);  (*state)[i][1] ^= Tm ^ Tmp ;
-    Tm  = (*state)[i][2] ^ (*state)[i][3] ; Tm = xtime(Tm);  (*state)[i][2] ^= Tm ^ Tmp ;
-    Tm  = (*state)[i][3] ^ t ;              Tm = xtime(Tm);  (*state)[i][3] ^= Tm ^ Tmp ;
-  }
-}
-
-// Multiply is used to multiply numbers in the field GF(2^8)
-#if MULTIPLY_AS_A_FUNCTION
-static uint8_t Multiply(uint8_t x, uint8_t y)
-{
-  return (((y & 1) * x) ^
-       ((y>>1 & 1) * xtime(x)) ^
-       ((y>>2 & 1) * xtime(xtime(x))) ^
-       ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^
-       ((y>>4 & 1) * xtime(xtime(xtime(xtime(x))))));
-  }
-#else
-#define Multiply(x, y)                                \
-      (  ((y & 1) * x) ^                              \
-      ((y>>1 & 1) * xtime(x)) ^                       \
-      ((y>>2 & 1) * xtime(xtime(x))) ^                \
-      ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^         \
-      ((y>>4 & 1) * xtime(xtime(xtime(xtime(x))))))   \
-
-#endif
-
-// MixColumns function mixes the columns of the state matrix.
-// The method used to multiply may be difficult to understand for the inexperienced.
-// Please use the references to gain more information.
-static void InvMixColumns(void)
-{
-  int i;
-  uint8_t a, b, c, d;
-  for (i = 0; i < 4; ++i)
-  { 
-    a = (*state)[i][0];
-    b = (*state)[i][1];
-    c = (*state)[i][2];
-    d = (*state)[i][3];
-
-    (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09);
-    (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d);
-    (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b);
-    (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e);
-  }
-}
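
[Editor's note: the xtime()/Multiply() arithmetic above is ordinary multiplication in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1. The worked example from FIPS-197, {57} * {13} = {fe}, can be reproduced with the same reduction; a standalone sketch, not part of this commit:]

    #include <stdint.h>
    #include <stdio.h>

    /* Same reduction as xtime() above: multiply by x, fold back with 0x1b. */
    static uint8_t xt(uint8_t x)
    {
        return (uint8_t)((x << 1) ^ (((x >> 7) & 1) * 0x1b));
    }

    int main(void)
    {
        /* {57} * {13}: bits 0, 1 and 4 of 0x13 are set, so sum x, x*{02}, x*{02}^4 */
        uint8_t x = 0x57;
        uint8_t r = x ^ xt(x) ^ xt(xt(xt(xt(x))));
        printf("0x57 * 0x13 = 0x%02x (FIPS-197 expects 0xfe)\n", r);
        return 0;
    }
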
-
-
-// The SubBytes Function Substitutes the values in the
-// state matrix with values in an S-box.
-static void InvSubBytes(void)
-{
-  uint8_t i,j;
-  for (i = 0; i < 4; ++i)
-  {
-    for (j = 0; j < 4; ++j)
-    {
-      (*state)[j][i] = getSBoxInvert((*state)[j][i]);
-    }
-  }
-}
-
-static void InvShiftRows(void)
-{
-  uint8_t temp;
-
-  // Rotate first row 1 columns to right  
-  temp = (*state)[3][1];
-  (*state)[3][1] = (*state)[2][1];
-  (*state)[2][1] = (*state)[1][1];
-  (*state)[1][1] = (*state)[0][1];
-  (*state)[0][1] = temp;
-
-  // Rotate second row 2 columns to right 
-  temp = (*state)[0][2];
-  (*state)[0][2] = (*state)[2][2];
-  (*state)[2][2] = temp;
-
-  temp = (*state)[1][2];
-  (*state)[1][2] = (*state)[3][2];
-  (*state)[3][2] = temp;
-
-  // Rotate third row 3 columns to right
-  temp = (*state)[0][3];
-  (*state)[0][3] = (*state)[1][3];
-  (*state)[1][3] = (*state)[2][3];
-  (*state)[2][3] = (*state)[3][3];
-  (*state)[3][3] = temp;
-}
-
-
-// Cipher is the main function that encrypts the PlainText.
-static void Cipher(void)
-{
-  uint8_t round = 0;
-
-  // Add the First round key to the state before starting the rounds.
-  AddRoundKey(0); 
-  
-  // There will be Nr rounds.
-  // The first Nr-1 rounds are identical.
-  // These Nr-1 rounds are executed in the loop below.
-  for (round = 1; round < Nr; ++round)
-  {
-    SubBytes();
-    ShiftRows();
-    MixColumns();
-    AddRoundKey(round);
-  }
-  
-  // The last round is given below.
-  // The MixColumns function is not here in the last round.
-  SubBytes();
-  ShiftRows();
-  AddRoundKey(Nr);
-}
-
-static void InvCipher(void)
-{
-  uint8_t round=0;
-
-  // Add the First round key to the state before starting the rounds.
-  AddRoundKey(Nr); 
-
-  // There will be Nr rounds.
-  // The first Nr-1 rounds are identical.
-  // These Nr-1 rounds are executed in the loop below.
-  for (round = (Nr - 1); round > 0; --round)
-  {
-    InvShiftRows();
-    InvSubBytes();
-    AddRoundKey(round);
-    InvMixColumns();
-  }
-  
-  // The last round is given below.
-  // The MixColumns function is not here in the last round.
-  InvShiftRows();
-  InvSubBytes();
-  AddRoundKey(0);
-}
-
-
-/*****************************************************************************/
-/* Public functions:                                                         */
-/*****************************************************************************/
-#if defined(ECB) && (ECB == 1)
-
-
-void AES_ECB_encrypt0(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length)
-{
-  // Copy input to output, and work in-memory on output
-  memcpy(output, input, length);
-  state = (state_t*)output;
-
-  Key = key;
-  KeyExpansion();
-
-  // The next function call encrypts the PlainText with the Key using AES algorithm.
-  Cipher();
-}
-
-void AES_ECB_decrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length)
-{
-  // Copy input to output, and work in-memory on output
-  memcpy(output, input, length);
-  state = (state_t*)output;
-
-  // The KeyExpansion routine must be called before encryption.
-  Key = key;
-  KeyExpansion();
-
-  InvCipher();
-}
-
-
-#endif // #if defined(ECB) && (ECB == 1)
-
-
-
-
-
-#if defined(CBC) && (CBC == 1)
-
-
-static void XorWithIv(uint8_t* buf)
-{
-  uint8_t i;
-  for (i = 0; i < BLOCKLEN; ++i) //WAS for(i = 0; i < KEYLEN; ++i) but the block in AES is always 128bit so 16 bytes!
-  {
-    buf[i] ^= Iv[i];
-  }
-}
-
-void AES_CBC_encrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv)
-{
-  uintptr_t i;
-  uint8_t extra = length % BLOCKLEN; /* Remaining bytes in the last non-full block */
-
-  // Skip the key expansion if key is passed as 0
-  if (0 != key)
-  {
-    Key = key;
-    KeyExpansion();
-  }
-
-  if (iv != 0)
-  {
-    Iv = (uint8_t*)iv;
-  }
-
-  for (i = 0; i < length; i += BLOCKLEN)
-  {
-    XorWithIv(input);
-    memcpy(output, input, BLOCKLEN);
-    state = (state_t*)output;
-    Cipher();
-    Iv = output;
-    input += BLOCKLEN;
-    output += BLOCKLEN;
-    //printf("Step %d - %d", i/16, i);
-  }
-
-  if (extra)
-  {
-    memcpy(output, input, extra);
-    state = (state_t*)output;
-    Cipher();
-  }
-}
-
-void AES_CBC_decrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv)
-{
-  uintptr_t i;
-  uint8_t extra = length % BLOCKLEN; /* Remaining bytes in the last non-full block */
-
-  // Skip the key expansion if key is passed as 0
-  if (0 != key)
-  {
-    Key = key;
-    KeyExpansion();
-  }
-
-  // If iv is passed as 0, we continue to encrypt without re-setting the Iv
-  if (iv != 0)
-  {
-    Iv = (uint8_t*)iv;
-  }
-
-  for (i = 0; i < length; i += BLOCKLEN)
-  {
-    memcpy(output, input, BLOCKLEN);
-    state = (state_t*)output;
-    InvCipher();
-    XorWithIv(output);
-    Iv = input;
-    input += BLOCKLEN;
-    output += BLOCKLEN;
-  }
-
-  if (extra)
-  {
-    memcpy(output, input, extra);
-    state = (state_t*)output;
-    InvCipher();
-  }
-}
-
-#endif // #if defined(CBC) && (CBC == 1)
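
[Editor's note: before this commit, callers could reach the software CBC path above directly. A minimal usage sketch of the removed API, assuming a block-aligned 32-byte buffer and a caller-supplied key/IV; note that this encrypt routine XORs the IV chain into the input buffer in place:]

    #include <stdint.h>
    #include <string.h>
    #include "aes0.h"

    void cbc_roundtrip_example(void)
    {
        uint8_t key[16] = { 0 };            /* example key (all zeros) */
        uint8_t iv[16]  = { 0 };            /* example IV  (all zeros) */
        uint8_t plain[32], enc[32], dec[32];

        memset(plain, 0xAA, sizeof(plain)); /* arbitrary, length % 16 == 0 */

        AES_CBC_encrypt_buffer0(enc, plain, sizeof(plain), key, iv);
        AES_CBC_decrypt_buffer0(dec, enc, sizeof(enc), key, iv);

        /* dec holds the original 0xAA pattern again; plain itself was
         * modified by the in-place IV XOR inside the encrypt loop. */
    }
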

+ 0 - 45
lib/aes_acc/aes0.h

@@ -1,45 +0,0 @@
-/*
- *  this file comes from https://github.com/kokke/tiny-AES128-C
- */
-
-#ifndef _AES_H_
-#define _AES_H_
-
-#include <stdint.h>
-
-
-// #define the macros below to 1/0 to enable/disable the mode of operation.
-//
-// CBC enables AES encryption in CBC-mode of operation.
-// ECB enables the basic ECB 16-byte block algorithm. Both can be enabled simultaneously.
-
-// The #ifndef-guard allows it to be configured before #include'ing or at compile time.
-#ifndef CBC
-  #define CBC 1
-#endif
-
-#ifndef ECB
-  #define ECB 1
-#endif
-
-#define AES128 1
-//#define AES192 1
-//#define AES256 1
-
-#if defined(ECB) && (ECB == 1)
-
-void AES_ECB_encrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length);
-void AES_ECB_decrypt0(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length);
-
-#endif // #if defined(ECB) && (ECB == !)
-
-
-#if defined(CBC) && (CBC == 1)
-
-void AES_CBC_encrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv);
-void AES_CBC_decrypt_buffer0(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv);
-
-#endif // #if defined(CBC) && (CBC == 1)
-
-
-#endif //_AES_H_

+ 89 - 108
lib/aes_acc/aesacc.c

@@ -2,27 +2,25 @@
  * This file is adapted from PolarSSL 1.3.19 (GPL)
  */
 
-#include "aes0.h"
 #include "aesni.h"
 #include "aesarm.h"
-#include "aesacc.h"
-
+#include <stdint.h>
 #include <string.h>
 
 #if defined(AES256) && (AES256 == 1)
 #define AES_KEYSIZE 256
 #ifdef HAVE_AMD64
-  #define aes_setkey_enc aesni_setkey_enc_256
+  #define aeshw_setkey_enc aesni_setkey_enc_256
 #endif
 #elif defined(AES192) && (AES192 == 1)
 #define AES_KEYSIZE 192
 #ifdef HAVE_AMD64
-  #define aes_setkey_enc aesni_setkey_enc_192
+  #define aeshw_setkey_enc aesni_setkey_enc_192
 #endif
 #else
 #define AES_KEYSIZE 128
 #ifdef HAVE_AMD64
-  #define aes_setkey_enc aesni_setkey_enc_128
+  #define aeshw_setkey_enc aesni_setkey_enc_128
 #endif
 #endif
 
@@ -31,15 +29,15 @@
 
 #ifdef HAVE_AMD64
 #define HAVE_HARDAES 1
-#define aes_supported aesni_supported
-#define aes_crypt_ecb aesni_crypt_ecb
-#define aes_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR)
+#define aeshw_supported aesni_supported
+#define aeshw_crypt_ecb aesni_crypt_ecb
+#define aeshw_inverse_key(a,b) aesni_inverse_key(a,b,AES_NR)
 #endif /* HAVE_AMD64 */
 
 #ifdef HAVE_ARM64
 #define HAVE_HARDAES 1
-#define aes_supported aesarm_supported
-#define aes_crypt_ecb aesarm_crypt_ecb
+#define aeshw_supported aesarm_supported
+#define aeshw_crypt_ecb aesarm_crypt_ecb
 
 #include "aesarm_table.h"
 
@@ -53,7 +51,7 @@
 }
 #endif
 
-static void aes_setkey_enc(uint8_t *rk, const uint8_t *key)
+static void aeshw_setkey_enc(uint8_t *rk, const uint8_t *key)
 {
     unsigned int i;
     uint32_t *RK;
@@ -129,7 +127,7 @@ static void aes_setkey_enc(uint8_t *rk, const uint8_t *key)
     }
 }
 
-static void aes_inverse_key(uint8_t *invkey, const uint8_t *fwdkey)
+static void aeshw_inverse_key(uint8_t *invkey, const uint8_t *fwdkey)
 {
   int i, j;
   uint32_t *RK;
@@ -159,18 +157,32 @@ static void aes_inverse_key(uint8_t *invkey, const uint8_t *fwdkey)
   *RK++ = *SK++;
   *RK++ = *SK++;
 }
-
 #endif /* HAVE_ARM64 */
 
-#ifdef HAVE_ASM
+#ifdef HAVE_HARDAES
+static void aeshw_setkey_dec(uint8_t *rk, const uint8_t *key)
+{
+  uint8_t rk_tmp[AES_RKSIZE];
+  aeshw_setkey_enc(rk_tmp, key);
+  aeshw_inverse_key(rk, rk_tmp);
+}
+#endif /* HAVE_HARDAES */
 
+/* OpenSSL assembly functions */
 #define AES_MAXNR 14
-
 typedef struct {
   uint32_t rd_key[4 * (AES_MAXNR + 1)];
-  int rounds;
+  uint32_t rounds;
 } AES_KEY;
 
+#if defined(__amd64__) || defined(__x86_64__) || \
+    defined(__aarch64__)
+#define AES_set_encrypt_key vpaes_set_encrypt_key
+#define AES_set_decrypt_key vpaes_set_decrypt_key
+#define AES_encrypt vpaes_encrypt
+#define AES_decrypt vpaes_decrypt
+#endif /* VPAES for 64-bit Intel and ARM */
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -189,69 +201,51 @@ void AES_decrypt(const unsigned char *in, unsigned char *out,
 }
 #endif
 
-static int aes_supported(void)
-{
-  return 2;
-}
-
 static void aes_crypt_ecb( int nr,
                            unsigned char *rk,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16] )
 {
-  AES_KEY *ctx;
-  ctx = (AES_KEY *) rk;
-  ctx->rounds = nr;
   if (mode == AES_DECRYPT) {
-    AES_decrypt(input, output, ctx);
+    AES_decrypt(input, output, (AES_KEY *) rk);
   } else {
-    AES_encrypt(input, output, ctx);
+    AES_encrypt(input, output, (AES_KEY *) rk);
   }
 }
 
 static void aes_setkey_enc(uint8_t *rk, const uint8_t *key)
 {
-  AES_KEY *ctx;
-  ctx = (AES_KEY *) rk;
-  ctx->rounds = AES_NR;
-  AES_set_encrypt_key(key, AES_KEYSIZE, ctx);
+  AES_set_encrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk);
 }
 
 static void aes_setkey_dec(uint8_t *rk, const uint8_t *key)
 {
-  AES_KEY *ctx;
-  ctx = (AES_KEY *) rk;
-  ctx->rounds = AES_NR;
-  AES_set_decrypt_key(key, AES_KEYSIZE, ctx);
+  AES_set_decrypt_key(key, AES_KEYSIZE, (AES_KEY *) rk);
 }
 
-#endif
-
-#ifdef HAVE_HARDAES
-
-static void aes_setkey_dec(uint8_t *rk, const uint8_t *key)
-{
-  uint8_t rk_tmp[AES_RKSIZE];
-  aes_setkey_enc(rk_tmp, key);
-  aes_inverse_key(rk, rk_tmp);
-}
-
-#endif
+static void (*crypt_ecb) ( int nr,
+                           unsigned char *rk,
+                           int mode,
+                           const unsigned char input[16],
+                           unsigned char output[16] )
+  = aes_crypt_ecb;
 
-#if defined(HAVE_HARDAES) || defined(HAVE_ASM)
+static void (*setkey_enc) (uint8_t *rk, const uint8_t *key)
+  = aes_setkey_enc;
 
-#define HAVE_ACC 1
+static void (*setkey_dec) (uint8_t *rk, const uint8_t *key)
+  = aes_setkey_dec;
 
 /*
  * AESNI-CBC buffer encryption/decryption
  */
-static void aes_crypt_cbc( int mode,
-                           uint8_t* rk,
-                           uint32_t length,
-                           uint8_t iv[16],
-                           const uint8_t *input,
-                           uint8_t *output )
+static void crypt_cbc( int mode,
+                       uint8_t* rk,
+                       uint32_t length,
+                       uint8_t iv[16],
+                       const uint8_t *input,
+                       uint8_t *output )
 {
     int i;
     uint8_t temp[16];
@@ -261,7 +255,7 @@ static void aes_crypt_cbc( int mode,
         while( length > 0 )
         {
             memcpy( temp, input, 16 );
-            aes_crypt_ecb( AES_NR, rk, mode, input, output );
+            crypt_ecb( AES_NR, rk, mode, input, output );
 
             for( i = 0; i < 16; i++ )
                 output[i] = (uint8_t)( output[i] ^ iv[i] );
@@ -280,7 +274,7 @@ static void aes_crypt_cbc( int mode,
             for( i = 0; i < 16; i++ )
                 output[i] = (uint8_t)( input[i] ^ iv[i] );
 
-            aes_crypt_ecb( AES_NR, rk, mode, output, output );
+            crypt_ecb( AES_NR, rk, mode, output, output );
             memcpy( iv, output, 16 );
 
             input  += 16;
@@ -290,12 +284,26 @@ static void aes_crypt_cbc( int mode,
     }
 }
 
-#endif /* HAVE_HARDAES or HAVE_ASM */
 
-int AESACC_supported(void)
+static void aeshw_init(void)
+{
+#ifdef HAVE_HARDAES
+  static int done = 0;
+  if (!done) {
+    if (aeshw_supported()) {
+      crypt_ecb = aeshw_crypt_ecb;
+      setkey_enc = aeshw_setkey_enc;
+      setkey_dec = aeshw_setkey_dec;
+    }
+    done = 1;
+  }
+#endif
+}
+
+int AES_support_hwaccel(void)
 {
-#if defined(HAVE_ACC)
-  return aes_supported();
+#ifdef HAVE_HARDAES
+  return aeshw_supported();
 #else
   return 0;
 #endif
@@ -303,86 +311,59 @@ int AESACC_supported(void)
 
 void AES_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv)
 {
-#if defined(HAVE_ACC)
   uint8_t iv_tmp[16];
   uint8_t rk[AES_RKSIZE];
 
-  if (aes_supported())
+  if (key == NULL || iv == NULL)
   {
-    if (key == NULL || iv == NULL)
-    {
-      return;
-    }
-    memcpy(iv_tmp, iv, 16);
-    aes_setkey_enc(rk, key);
-    aes_crypt_cbc(AES_ENCRYPT, rk, \
-                  length, iv_tmp, input, output);
     return;
   }
-#endif
-
-  AES_CBC_encrypt_buffer0(output, input, length, key, iv);
+  aeshw_init();
+  memcpy(iv_tmp, iv, 16);
+  setkey_enc(rk, key);
+  crypt_cbc(AES_ENCRYPT, rk, \
+            length, iv_tmp, input, output);
 }
 
 void AES_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv)
 {
-#if defined(HAVE_ACC)
   uint8_t iv_tmp[16];
   uint8_t rk[AES_RKSIZE];
 
-  if (aes_supported())
+  if (key == NULL || iv == NULL)
   {
-    if (key == NULL || iv == NULL)
-    {
-      return;
-    }
-    memcpy(iv_tmp, iv, 16);
-    aes_setkey_dec(rk, key);
-    aes_crypt_cbc(AES_DECRYPT, rk, \
-                  length, iv_tmp, input, output);
     return;
   }
-#endif
+  aeshw_init();
+  memcpy(iv_tmp, iv, 16);
+  setkey_dec(rk, key);
+  crypt_cbc(AES_DECRYPT, rk, \
+            length, iv_tmp, input, output);
 
-  AES_CBC_decrypt_buffer0(output, input, length, key, iv);
 }
 
 void AES_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t* output, const uint32_t length)
 {
-#if defined(HAVE_ACC)
   uint8_t rk[AES_RKSIZE];
 
-  if (aes_supported())
+  if (key == NULL)
   {
-    if (key == NULL)
-    {
-      return;
-    }
-    aes_setkey_enc(rk, key);
-    aes_crypt_ecb(AES_NR, rk, AES_ENCRYPT, input, output);
     return;
   }
-#endif
-
-  AES_ECB_encrypt0(input, key, output, length);
+  aeshw_init();
+  setkey_enc(rk, key);
+  crypt_ecb(AES_NR, rk, AES_ENCRYPT, input, output);
 }
 
 void AES_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length)
 {
-#if defined(HAVE_ACC)
   uint8_t rk[AES_RKSIZE];
 
-  if (aes_supported())
+  if (key == NULL)
   {
-    if (key == NULL)
-    {
-      return;
-    }
-    aes_setkey_dec(rk, key);
-    aes_crypt_ecb(AES_NR, rk, AES_DECRYPT, input, output);
     return;
   }
-#endif
-
-  AES_ECB_decrypt0(input, key, output, length);
+  aeshw_init();
+  setkey_dec(rk, key);
+  crypt_ecb(AES_NR, rk, AES_DECRYPT, input, output);
 }
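
[Editor's note: the net effect of the aesacc.c changes is a one-time runtime dispatch: crypt_ecb/setkey_enc/setkey_dec start out pointing at the assembly (vpaes) routines and are swapped to the hardware AES-NI/ARMv8 paths when aeshw_supported() reports support. A reduced illustration of the same pattern with hypothetical names, not the actual library code:]

    #include <stdint.h>

    /* Two interchangeable implementations of one 16-byte operation. */
    static void op_generic(const uint8_t in[16], uint8_t out[16]) { (void)in; (void)out; /* portable path */ }
    static void op_hw(const uint8_t in[16], uint8_t out[16])      { (void)in; (void)out; /* accelerated path */ }

    static int hw_supported(void) { return 0; /* e.g. CPUID / HWCAP probe */ }

    /* Default to the generic routine; upgrade once, lazily. */
    static void (*op)(const uint8_t[16], uint8_t[16]) = op_generic;

    static void op_init(void)
    {
        static int done = 0;
        if (!done) {
            if (hw_supported())
                op = op_hw;
            done = 1;
        }
    }

    void do_op(const uint8_t in[16], uint8_t out[16])
    {
        op_init();          /* cheap after the first call */
        op(in, out);
    }
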

+ 0 - 20
lib/aes_acc/aesacc.h

@@ -1,20 +0,0 @@
-#ifndef _AESACC_H_
-#define _AESACC_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int AESACC_supported(void);
-void AESACC_ECB_encrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length);
-void AESACC_ECB_decrypt(const uint8_t* input, const uint8_t* key, uint8_t *output, const uint32_t length);
-void AESACC_CBC_encrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv);
-void AESACC_CBC_decrypt_buffer(uint8_t* output, uint8_t* input, uint32_t length, const uint8_t* key, const uint8_t* iv);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _AESACC_H_ */

+ 5 - 8
lib/aes_acc/aesni.c

@@ -86,11 +86,11 @@ int aesni_supported( void )
 /*
  * AES-NI AES-ECB block en(de)cryption
  */
-int aesni_crypt_ecb( int nr,
-                     unsigned char *rk,
-                     int mode,
-                     const unsigned char input[16],
-                     unsigned char output[16] )
+void aesni_crypt_ecb( int nr,
+                      unsigned char *rk,
+                      int mode,
+                      const unsigned char input[16],
+                      unsigned char output[16] )
 {
     asm( "movdqu    (%3), %%xmm0    \n\t" // load input
          "movdqu    (%1), %%xmm1    \n\t" // load round key 0
@@ -124,9 +124,6 @@ int aesni_crypt_ecb( int nr,
          :
          : "r" (nr), "r" (rk), "r" (mode), "r" (input), "r" (output)
          : "memory", "cc", "xmm0", "xmm1" );
-
-
-    return( 0 );
 }
 
 /*

+ 5 - 7
lib/aes_acc/aesni.h

@@ -64,14 +64,12 @@ int aesni_supported( void );
  * \param mode     AES_ENCRYPT or AES_DECRYPT
  * \param input    16-byte input block
  * \param output   16-byte output block
- *
- * \return         0 on success (cannot fail)
  */
-int aesni_crypt_ecb( int nr,
-                     unsigned char *rk,
-                     int mode,
-                     const unsigned char input[16],
-                     unsigned char output[16] );
+void aesni_crypt_ecb( int nr,
+                      unsigned char *rk,
+                      int mode,
+                      const unsigned char input[16],
+                      unsigned char output[16] );
 
 /**
  * \brief           Compute decryption round keys from encryption round keys

+ 1178 - 0
lib/aes_acc/asm/arm64.S

@@ -0,0 +1,1178 @@
+.text
+
+.type	_vpaes_consts,%object
+.align	7	// totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:	//	mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:	//	mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:	//	sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv:	//	inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:	//	input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:	//	sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:	//	sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:	//	sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+.Lk_dipt:	//	decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:	//	decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+.Lk_dksd:	//	decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	//	decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	//	decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:	//	rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:	//	output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+.size	_vpaes_consts,.-_vpaes_consts
+.align	6
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+.type	_vpaes_encrypt_preheat,%function
+.align	4
+_vpaes_encrypt_preheat:
+	adr	x10, .Lk_inv
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
+	ret
+.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,%function
+.align	4
+_vpaes_encrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adr	x11, .Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Lenc_entry
+
+.align	4
+.Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+.Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl	vpaes_encrypt
+.type	vpaes_encrypt,%function
+.align	4
+vpaes_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.type	_vpaes_encrypt_2x,%function
+.align	4
+_vpaes_encrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adr	x11, .Lk_mc_forward+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b,  v17.16b
+	ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	tbl	v9.16b,  {v20.16b}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	tbl	v10.16b, {v21.16b}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,   v10.16b
+	b	.Lenc_2x_entry
+
+.align	4
+.Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	tbl	v12.16b, {v25.16b}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
+	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v24.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	tbl	v13.16b, {v27.16b}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	tbl	v10.16b, {v26.16b}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+.Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b, v17.16b
+	ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	tbl	v13.16b, {v19.16b},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, .Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
+	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v23.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type	_vpaes_decrypt_preheat,%function
+.align	4
+_vpaes_decrypt_preheat:
+	adr	x10, .Lk_inv
+	movi	v17.16b, #0x0f
+	adr	x11, .Lk_dipt
+	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
+	ret
+.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,%function
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adr	x10, .Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adr	x10, .Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	.Ldec_entry
+
+.align	4
+.Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+.Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl	vpaes_decrypt
+.type	vpaes_decrypt,%function
+.align	4
+vpaes_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_decrypt,.-vpaes_decrypt
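
[Editor's note: from the C side (see aesacc.c above) these entry points are reached through the AES_encrypt/AES_decrypt aliases. The prototypes below are an assumption following the usual OpenSSL vpaes conventions, not code from this commit; the key layout must match the asm, which pulls the round count from byte offset 240 (ldr w8, [x2,#240]), i.e. right after the 60 uint32_t round-key words:]

    #include <stdint.h>

    #define AES_MAXNR 14
    typedef struct {
        uint32_t rd_key[4 * (AES_MAXNR + 1)];   /* 60 words = 240 bytes */
        uint32_t rounds;                        /* read by the asm at [key, #240] */
    } AES_KEY;

    /* vpaes entry points provided by arm64.S / x64.S; the key-schedule
     * routines are defined further down in this file. */
    int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
    int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
    void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
    void vpaes_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);
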
+
+// v14-v15 input, v0-v1 output
+.type	_vpaes_decrypt_2x,%function
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adr	x10, .Lk_sr
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adr	x10, .Lk_mc_forward+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b, v17.16b
+	ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	tbl	v10.16b, {v20.16b},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	tbl	v8.16b,  {v21.16b},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,  v10.16b
+	b	.Ldec_2x_entry
+
+.align	4
+.Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v12.16b, {v24.16b}, v10.16b
+	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	tbl	v9.16b,  {v25.16b}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v12.16b, {v26.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	tbl	v9.16b,  {v27.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v12.16b, {v28.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	tbl	v9.16b,  {v29.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v12.16b, {v30.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	tbl	v9.16b,  {v31.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+.Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b,  v17.16b
+	ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	tbl	v10.16b, {v19.16b},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, .Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	tbl	v9.16b,  {v23.16b}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_key_preheat,%function
+.align	4
+_vpaes_key_preheat:
+	adr	x10, .Lk_inv
+	movi	v16.16b, #0x5b			// .Lk_s63
+	adr	x11, .Lk_sb1
+	movi	v17.16b, #0x0f			// .Lk_s0F
+	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
+	adr	x10, .Lk_dksd
+	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
+	adr	x11, .Lk_mc_forward
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
+	ld1	{v8.2d}, [x10]			// .Lk_rcon
+	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
+	ret
+.size	_vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type	_vpaes_schedule_core,%function
+.align	4
+_vpaes_schedule_core:
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
+	add	x8, x8, x10
+	cbnz	w3, .Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
+	b	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	$0x30, %r8
+
+.Lschedule_go:
+	cmp	w1, #192			// cmp	$192,	%esi
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
+	// 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	x0, #10			// mov	$10, %esi
+
+.Loop_schedule_128:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	4
+.Lschedule_192:
+	sub	x0, x0, #8
+	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	x0, #4			// mov	$4,	%esi
+
+.Loop_schedule_192:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	4
+.Lschedule_256:
+	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	x0, #7			// mov	$7, %esi
+
+.Loop_schedule_256:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	x0, .Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	.Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+.Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	cbnz	w3, .Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x2, x2, #32			// add	$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+.Lschedule_mangle_last_dec:
+	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
+	sub	x2, x2, #16			// add	$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,%function
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,%function
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,%function
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,%function
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	w3, .Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
+	add	x2, x2, #16			// add	$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	.Lschedule_mangle_both
+.align	4
+.Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	x2, x2, #16			// add	$-16,	%rdx
+
+.Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #64-16			// add	$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,%function
+.align	4
+vpaes_set_encrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// $5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	w3, #0		// mov	$0,%ecx
+	mov	x8, #0x30		// mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,%function
+.align	4
+vpaes_set_decrypt_key:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// $5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	$4,%eax
+	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
+	add	x2, x2, x9
+
+	mov	w3, #1		// mov	$1,%ecx
+	lsr	w8, w1, #1		// shr	$1,%r8d
+	and	x8, x8, #32		// and	$32,%r8d
+	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,%function
+.align	4
+vpaes_cbc_encrypt:
+	cbz	x2, .Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+
+	ld1	{v0.16b}, [x4]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	.Lcbc_enc_loop
+
+.align	4
+.Lcbc_enc_loop:
+	ld1	{v7.16b}, [x0],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16	// save output
+	subs	x17, x17, #16
+	b.hi	.Lcbc_enc_loop
+
+	st1	{v0.16b}, [x4]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+.Lcbc_abort:
+	ret
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type	vpaes_cbc_decrypt,%function
+.align	4
+vpaes_cbc_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+	ld1	{v6.16b}, [x4]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [x0], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #16
+	b.ls	.Lcbc_dec_done
+
+.align	4
+.Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	.Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+	st1	{v6.16b}, [x4]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+.globl	vpaes_ecb_encrypt
+.type	vpaes_ecb_encrypt,%function
+.align	4
+vpaes_ecb_encrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2
+	mov	x2,  x3
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_enc_loop
+
+	ld1	{v7.16b}, [x0],#16
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_enc_done
+
+.align	4
+.Lecb_enc_loop:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_encrypt_2x
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_enc_loop
+
+.Lecb_enc_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+.globl	vpaes_ecb_decrypt
+.type	vpaes_ecb_decrypt,%function
+.align	4
+vpaes_ecb_decrypt:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2
+	mov	x2,  x3
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	.Lecb_dec_loop
+
+	ld1	{v7.16b}, [x0],#16
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16
+	subs	x17, x17, #16
+	b.ls	.Lecb_dec_done
+
+.align	4
+.Lecb_dec_loop:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	.Lecb_dec_loop
+
+.Lecb_dec_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	ret
+.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
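
Two SIMD idioms recur throughout the key-schedule code above: the nibble-split table lookup used by _vpaes_schedule_transform and the encrypt/decrypt cores (pand/psrld plus two pshufb lookups on x86-64, and/ushr plus two tbl lookups on AArch64, XORed together), and the prefix-XOR "smear" of the round-key words performed in _vpaes_schedule_low_round and _vpaes_schedule_192_smear (two shift-and-XOR steps). The scalar C sketch below is an illustration only, not part of this repository, and shows what those idioms compute:

/* Illustration only: scalar equivalents of two SIMD idioms used above. */
#include <stdint.h>
#include <stdio.h>

/* (1) Nibble-split lookup: split every byte into its low and high nibble,
 * index a 16-entry table with each nibble, and XOR the two results.
 * This is the per-byte effect of pand/psrld + pshufb/pshufb + pxor
 * (or and/ushr + tbl/tbl + eor). */
static void vperm_transform(uint8_t out[16], const uint8_t in[16],
                            const uint8_t lo[16], const uint8_t hi[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = (uint8_t)(lo[in[i] & 0x0F] ^ hi[in[i] >> 4]);
}

/* (2) "Smear": make each 32-bit word of a round-key block the XOR of itself
 * and all lower words, w[i] ^= w[i-1] ^ ... ^ w[0].  The assembly reaches
 * the same result with two shift-and-XOR steps (pslldq $4 then pslldq $8 on
 * x86-64, ext #12 then ext #8 on AArch64). */
static void smear(uint32_t w[4])
{
    w[3] ^= w[2];   /* step 1: XOR in the value shifted up by one word */
    w[2] ^= w[1];
    w[1] ^= w[0];
    w[3] ^= w[1];   /* step 2: XOR in the step-1 value shifted up by two words */
    w[2] ^= w[0];
}

int main(void)
{
    uint8_t lo[16], hi[16], in[16], out[16];
    for (int i = 0; i < 16; i++) {
        lo[i] = (uint8_t)i;           /* with these tables the transform is the identity */
        hi[i] = (uint8_t)(i << 4);
        in[i] = (uint8_t)(i * 17);
    }
    vperm_transform(out, in, lo, hi); /* out[i] == in[i] for all i */

    uint32_t w[4] = { 1, 2, 4, 8 };
    smear(w);
    printf("%x %x %x %x\n", w[0], w[1], w[2], w[3]);  /* prints: 1 3 7 f */
    return 0;
}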

+ 827 - 0
lib/aes_acc/asm/x64.S

@@ -0,0 +1,827 @@
+.text	
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_encrypt_core,@function
+.align	16
+_vpaes_encrypt_core:
+	movq	%rdx,%r9
+	movq	$16,%r11
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_ipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movdqu	(%r9),%xmm5
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_ipt+16(%rip),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm5,%xmm2
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+	leaq	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align	16
+.Lenc_loop:
+
+	movdqa	%xmm13,%xmm4
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,226
+.byte	102,15,56,0,195
+	pxor	%xmm5,%xmm4
+	movdqa	%xmm15,%xmm5
+	pxor	%xmm4,%xmm0
+	movdqa	-64(%r11,%r10,1),%xmm1
+.byte	102,15,56,0,234
+	movdqa	(%r11,%r10,1),%xmm4
+	movdqa	%xmm14,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm0,%xmm3
+	pxor	%xmm5,%xmm2
+.byte	102,15,56,0,193
+	addq	$16,%r9
+	pxor	%xmm2,%xmm0
+.byte	102,15,56,0,220
+	addq	$16,%r11
+	pxor	%xmm0,%xmm3
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	subq	$1,%rax
+	pxor	%xmm3,%xmm0
+
+.Lenc_entry:
+
+	movdqa	%xmm9,%xmm1
+	movdqa	%xmm11,%xmm5
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,232
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm5,%xmm3
+.byte	102,15,56,0,224
+	movdqa	%xmm10,%xmm2
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm5
+	pxor	%xmm1,%xmm3
+	jnz	.Lenc_loop
+
+
+	movdqa	-96(%r10),%xmm4
+	movdqa	-80(%r10),%xmm0
+.byte	102,15,56,0,226
+	pxor	%xmm5,%xmm4
+.byte	102,15,56,0,195
+	movdqa	64(%r11,%r10,1),%xmm1
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,193
+	.byte	0xf3,0xc3
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+.type	_vpaes_decrypt_core,@function
+.align	16
+_vpaes_decrypt_core:
+	movq	%rdx,%r9
+	movl	240(%rdx),%eax
+	movdqa	%xmm9,%xmm1
+	movdqa	.Lk_dipt(%rip),%xmm2
+	pandn	%xmm0,%xmm1
+	movq	%rax,%r11
+	psrld	$4,%xmm1
+	movdqu	(%r9),%xmm5
+	shlq	$4,%r11
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	.Lk_dipt+16(%rip),%xmm0
+	xorq	$0x30,%r11
+	leaq	.Lk_dsbd(%rip),%r10
+.byte	102,15,56,0,193
+	andq	$0x30,%r11
+	pxor	%xmm5,%xmm2
+	movdqa	.Lk_mc_forward+48(%rip),%xmm5
+	pxor	%xmm2,%xmm0
+	addq	$16,%r9
+	addq	%r10,%r11
+	jmp	.Ldec_entry
+
+.align	16
+.Ldec_loop:
+
+
+
+	movdqa	-32(%r10),%xmm4
+	movdqa	-16(%r10),%xmm1
+.byte	102,15,56,0,226
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	0(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	16(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	32(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	48(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	movdqa	64(%r10),%xmm4
+	pxor	%xmm1,%xmm0
+	movdqa	80(%r10),%xmm1
+
+.byte	102,15,56,0,226
+.byte	102,15,56,0,197
+.byte	102,15,56,0,203
+	pxor	%xmm4,%xmm0
+	addq	$16,%r9
+.byte	102,15,58,15,237,12
+	pxor	%xmm1,%xmm0
+	subq	$1,%rax
+
+.Ldec_entry:
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	movdqa	%xmm11,%xmm2
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+.byte	102,15,56,0,208
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm1,%xmm0
+.byte	102,15,56,0,217
+	movdqa	%xmm10,%xmm4
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	movdqa	%xmm10,%xmm3
+	pxor	%xmm0,%xmm2
+.byte	102,15,56,0,220
+	movdqu	(%r9),%xmm0
+	pxor	%xmm1,%xmm3
+	jnz	.Ldec_loop
+
+
+	movdqa	96(%r10),%xmm4
+.byte	102,15,56,0,226
+	pxor	%xmm0,%xmm4
+	movdqa	112(%r10),%xmm0
+	movdqa	-352(%r11),%xmm2
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+.byte	102,15,56,0,194
+	.byte	0xf3,0xc3
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type	_vpaes_schedule_core,@function
+.align	16
+_vpaes_schedule_core:
+
+
+
+
+
+	call	_vpaes_preheat
+	movdqa	.Lk_rcon(%rip),%xmm8
+	movdqu	(%rdi),%xmm0
+
+
+	movdqa	%xmm0,%xmm3
+	leaq	.Lk_ipt(%rip),%r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm7
+
+	leaq	.Lk_sr(%rip),%r10
+	testq	%rcx,%rcx
+	jnz	.Lschedule_am_decrypting
+
+
+	movdqu	%xmm0,(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	movdqu	%xmm3,(%rdx)
+	xorq	$0x30,%r8
+
+.Lschedule_go:
+	cmpl	$192,%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+	movl	$10,%esi
+
+.Loop_schedule_128:
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	jmp	.Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,%xmm6
+	pxor	%xmm4,%xmm4
+	movhlps	%xmm4,%xmm6
+	movl	$4,%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+.byte	102,15,58,15,198,8
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0
+	call	_vpaes_schedule_transform
+	movl	$7,%esi
+
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle
+	movdqa	%xmm0,%xmm6
+
+
+	call	_vpaes_schedule_round
+	decq	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+	movdqa	%xmm7,%xmm5
+	movdqa	%xmm6,%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,%xmm7
+
+	jmp	.Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align	16
+.Lschedule_mangle_last:
+
+	leaq	.Lk_deskew(%rip),%r11
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,193
+	leaq	.Lk_opt(%rip),%r11
+	addq	$32,%rdx
+
+.Lschedule_mangle_last_dec:
+	addq	$-16,%rdx
+	pxor	.Lk_s63(%rip),%xmm0
+	call	_vpaes_schedule_transform
+	movdqu	%xmm0,(%rdx)
+
+
+	pxor	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_192_smear,@function
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	$0x80,%xmm6,%xmm1
+	pshufd	$0xFE,%xmm7,%xmm0
+	pxor	%xmm1,%xmm6
+	pxor	%xmm1,%xmm1
+	pxor	%xmm0,%xmm6
+	movdqa	%xmm6,%xmm0
+	movhlps	%xmm1,%xmm6
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_round,@function
+.align	16
+_vpaes_schedule_round:
+
+	pxor	%xmm1,%xmm1
+.byte	102,65,15,58,15,200,15
+.byte	102,69,15,58,15,192,15
+	pxor	%xmm1,%xmm7
+
+
+	pshufd	$0xFF,%xmm0,%xmm0
+.byte	102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+	movdqa	%xmm7,%xmm1
+	pslldq	$4,%xmm7
+	pxor	%xmm1,%xmm7
+	movdqa	%xmm7,%xmm1
+	pslldq	$8,%xmm7
+	pxor	%xmm1,%xmm7
+	pxor	.Lk_s63(%rip),%xmm7
+
+
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	%xmm11,%xmm2
+.byte	102,15,56,0,208
+	pxor	%xmm1,%xmm0
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+	movdqa	%xmm10,%xmm4
+.byte	102,15,56,0,224
+	pxor	%xmm2,%xmm4
+	movdqa	%xmm10,%xmm2
+.byte	102,15,56,0,211
+	pxor	%xmm0,%xmm2
+	movdqa	%xmm10,%xmm3
+.byte	102,15,56,0,220
+	pxor	%xmm1,%xmm3
+	movdqa	%xmm13,%xmm4
+.byte	102,15,56,0,226
+	movdqa	%xmm12,%xmm0
+.byte	102,15,56,0,195
+	pxor	%xmm4,%xmm0
+
+
+	pxor	%xmm7,%xmm0
+	movdqa	%xmm0,%xmm7
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_transform,@function
+.align	16
+_vpaes_schedule_transform:
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm0,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm0
+	movdqa	(%r11),%xmm2
+.byte	102,15,56,0,208
+	movdqa	16(%r11),%xmm0
+.byte	102,15,56,0,193
+	pxor	%xmm2,%xmm0
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type	_vpaes_schedule_mangle,@function
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,%xmm4
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	testq	%rcx,%rcx
+	jnz	.Lschedule_mangle_dec
+
+
+	addq	$16,%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+.byte	102,15,56,0,229
+	movdqa	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+.byte	102,15,56,0,229
+	pxor	%xmm4,%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+
+	leaq	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,%xmm1
+	pandn	%xmm4,%xmm1
+	psrld	$4,%xmm1
+	pand	%xmm9,%xmm4
+
+	movdqa	0(%r11),%xmm2
+.byte	102,15,56,0,212
+	movdqa	16(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	32(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	48(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	64(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	80(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+.byte	102,15,56,0,221
+
+	movdqa	96(%r11),%xmm2
+.byte	102,15,56,0,212
+	pxor	%xmm3,%xmm2
+	movdqa	112(%r11),%xmm3
+.byte	102,15,56,0,217
+	pxor	%xmm2,%xmm3
+
+	addq	$-16,%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10,1),%xmm1
+.byte	102,15,56,0,217
+	addq	$-16,%r8
+	andq	$0x30,%r8
+	movdqu	%xmm3,(%rdx)
+	.byte	0xf3,0xc3
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl	vpaes_set_encrypt_key
+.type	vpaes_set_encrypt_key,@function
+.align	16
+vpaes_set_encrypt_key:
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+
+	movl	$0,%ecx
+	movl	$0x30,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl	vpaes_set_decrypt_key
+.type	vpaes_set_decrypt_key,@function
+.align	16
+vpaes_set_decrypt_key:
+	movl	%esi,%eax
+	shrl	$5,%eax
+	addl	$5,%eax
+	movl	%eax,240(%rdx)
+	shll	$4,%eax
+	leaq	16(%rdx,%rax,1),%rdx
+
+	movl	$1,%ecx
+	movl	%esi,%r8d
+	shrl	$1,%r8d
+	andl	$32,%r8d
+	xorl	$32,%r8d
+	call	_vpaes_schedule_core
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl	vpaes_encrypt
+.type	vpaes_encrypt,@function
+.align	16
+vpaes_encrypt:
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.size	vpaes_encrypt,.-vpaes_encrypt
+
+.globl	vpaes_decrypt
+.type	vpaes_decrypt,@function
+.align	16
+vpaes_decrypt:
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+	.byte	0xf3,0xc3
+.size	vpaes_decrypt,.-vpaes_decrypt
+.globl	vpaes_cbc_encrypt
+.type	vpaes_cbc_encrypt,@function
+.align	16
+vpaes_cbc_encrypt:
+	xchgq	%rcx,%rdx
+	subq	$16,%rcx
+	jc	.Lcbc_abort
+	movdqu	(%r8),%xmm6
+	subq	%rdi,%rsi
+	call	_vpaes_preheat
+	cmpl	$0,%r9d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	(%rdi),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	(%rdi),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,(%rsi,%rdi,1)
+	leaq	16(%rdi),%rdi
+	subq	$16,%rcx
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,(%r8)
+.Lcbc_abort:
+	.byte	0xf3,0xc3
+.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+
+
+
+
+
+.type	_vpaes_preheat,@function
+.align	16
+_vpaes_preheat:
+	leaq	.Lk_s0F(%rip),%r10
+	movdqa	-32(%r10),%xmm10
+	movdqa	-16(%r10),%xmm11
+	movdqa	0(%r10),%xmm9
+	movdqa	48(%r10),%xmm13
+	movdqa	64(%r10),%xmm12
+	movdqa	80(%r10),%xmm15
+	movdqa	96(%r10),%xmm14
+	.byte	0xf3,0xc3
+.size	_vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type	_vpaes_consts,@object
+.align	64
+_vpaes_consts:
+.Lk_inv:
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
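
Both assembly files export the same public entry points (vpaes_set_encrypt_key, vpaes_set_decrypt_key, vpaes_cbc_encrypt, plus ECB or single-block helpers). The sketch below is an assumed C-side view of that interface, written for illustration: the prototypes follow the usual OpenSSL-style vpaes conventions, and the struct mirrors the layout the assembly itself relies on, 240 bytes of round keys followed by a rounds word at byte offset 240 (see "movl %eax,240(%rdx)" above and "str w9, [x2,#240]" in arm64.S). The actual declarations used by this repository live in its own headers and may differ.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Assumed key-schedule layout; not taken from this repository's headers. */
typedef struct {
    uint32_t rd_key[60];   /* 240 bytes of round-key storage */
    int      rounds;       /* written by vpaes_set_*_key at offset 240 */
} vpaes_key;

/* Prototypes assumed to match the symbols defined in the .S files. */
int  vpaes_set_encrypt_key(const uint8_t *user_key, int bits, vpaes_key *key);
int  vpaes_set_decrypt_key(const uint8_t *user_key, int bits, vpaes_key *key);
void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
                       const vpaes_key *key, uint8_t *ivec, int enc);

/* CBC round-trip over a buffer whose length is a multiple of 16 bytes. */
static void cbc_roundtrip(const uint8_t key128[16], const uint8_t iv[16],
                          uint8_t *buf, size_t len)
{
    vpaes_key ek, dk;
    uint8_t iv_enc[16], iv_dec[16];

    memcpy(iv_enc, iv, 16);    /* the IV buffer is updated in place */
    memcpy(iv_dec, iv, 16);

    vpaes_set_encrypt_key(key128, 128, &ek);
    vpaes_set_decrypt_key(key128, 128, &dk);

    vpaes_cbc_encrypt(buf, buf, len, &ek, iv_enc, 1);   /* enc != 0: encrypt */
    vpaes_cbc_encrypt(buf, buf, len, &dk, iv_dec, 0);   /* enc == 0: decrypt */
}

The makefile change below then links the matching .S file into the hardware/assembly AES targets so these symbols resolve at link time.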

+ 4 - 4
makefile

@@ -6,7 +6,7 @@ cc_arm=/home/wangyu/Desktop/arm-2014.05/bin/arm-none-linux-gnueabi-g++
 FLAGS= -std=c++11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter -Wno-missing-field-initializers
 
 SOURCES=main.cpp lib/aes.c lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp
-SOURCES_AES_ACC=main.cpp $(wildcard lib/aes_acc/aes*.c) lib/md5.c encrypt.cpp log.cpp network.cpp common.cpp
+SOURCES_AES_ACC=$(filter-out lib/aes.c,$(SOURCES)) $(wildcard lib/aes_acc/aes*.c)
 
 NAME=udp2raw
 TAR=${NAME}_binaries.tar.gz ${NAME}_amd64  ${NAME}_x86  ${NAME}_x86_asm_aes ${NAME}_ar71xx ${NAME}_bcm2708 ${NAME}_arm ${NAME}_amd64_hw_aes ${NAME}_arm_asm_aes
@@ -32,16 +32,16 @@ bcm2708:
 amd64:
 	${cc_local}   -o ${NAME}_amd64    -I. ${SOURCES} ${FLAGS} -lrt -static -O3
 amd64_hw_aes:
-	${cc_local}   -o ${NAME}_amd64_hw_aes   -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3
+	${cc_local}   -o ${NAME}_amd64_hw_aes   -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/x64.S
 x86:
 	${cc_local}   -o ${NAME}_x86      -I. ${SOURCES} ${FLAGS} -lrt -static -O3 -m32
 x86_asm_aes:
-	${cc_local}   -o ${NAME}_x86_asm_aes    -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 -DHAVE_ASM lib/aes_acc/asm/x86.S
+	${cc_local}   -o ${NAME}_x86_asm_aes    -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 -m32 lib/aes_acc/asm/x86.S
 arm:
 	${cc_cross}   -o ${NAME}_arm      -I. ${SOURCES} ${FLAGS} -lrt -static -O3
 
 arm_asm_aes:
-	${cc_cross}   -o ${NAME}_arm_asm_aes    -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3  -DHAVE_ASM lib/aes_acc/asm/arm.S
+	${cc_cross}   -o ${NAME}_arm_asm_aes    -I. ${SOURCES_AES_ACC} ${FLAGS} -lrt -static -O3 lib/aes_acc/asm/arm.S
 
 cross:
 	${cc_cross}   -o ${NAME}_cross    -I. ${SOURCES} ${FLAGS} -lrt -O3