Salsa20.cpp
  1. /*
  2. * Based on public domain code available at: http://cr.yp.to/snuffle.html
  3. *
  4. * Modifications and C-native, SSE macro-based implementation by
  5. * Adam Ierymenko <[email protected]>.
  6. *
  7. * Since the original was public domain, this is too.
  8. */
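// This file implements the Salsa20 stream cipher core used by ZeroTier:
// crypt12() runs the reduced-round Salsa20/12 variant (six double rounds) and
// crypt20() the full Salsa20/20 (ten double rounds), each with an SSE path
// (ZT_SALSA20_SSE) and a portable scalar path.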
  9. #include "Constants.hpp"
  10. #include "Salsa20.hpp"
  11. #define ROTATE(v,c) (((v) << (c)) | ((v) >> (32 - (c))))
  12. #define XOR(v,w) ((v) ^ (w))
  13. #define PLUS(v,w) ((uint32_t)((v) + (w)))
  14. // Set up load/store macros with appropriate endianness (we don't use these in SSE mode)
  15. #ifndef ZT_SALSA20_SSE
  16. #if __BYTE_ORDER == __LITTLE_ENDIAN
  17. #ifdef ZT_NO_TYPE_PUNNING
  18. // Slower version that does not use type punning
  19. #define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
  20. static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); }
  21. #else
  22. // Fast version that just does 32-bit load/store
  23. #define U8TO32_LITTLE(p) (*((const uint32_t *)((const void *)(p))))
  24. #define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = (v)
  25. #endif // ZT_NO_TYPE_PUNNING
  26. #else // __BYTE_ORDER == __BIG_ENDIAN (we don't support anything else... does MIDDLE_ENDIAN even still exist?)
  27. #ifdef __GNUC__
  28. // Use GNUC builtin bswap macros on big-endian machines if available
  29. #define U8TO32_LITTLE(p) __builtin_bswap32(*((const uint32_t *)((const void *)(p))))
  30. #define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = __builtin_bswap32((v))
  31. #else // no __GNUC__
  32. // Otherwise do it the slow, manual way on BE machines
  33. #define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
  34. static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); }
  35. #endif // __GNUC__ or not
  36. #endif // __BYTE_ORDER little or big?
  37. #endif // !ZT_SALSA20_SSE
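/*
 * A quick sanity-check sketch for the macros above (illustrative only, for a
 * non-SSE build; assumes <assert.h>):
 *
 *   uint8_t b[4];
 *   U32TO8_LITTLE(b, 0x01020304u);                 // b = {0x04,0x03,0x02,0x01}
 *   assert(U8TO32_LITTLE(b) == 0x01020304u);       // load round-trips the store
 *   assert(ROTATE(0x80000001u, 1) == 0x00000003u); // 32-bit rotate-left
 */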
  38. // Statically compute and define SSE constants
  39. #ifdef ZT_SALSA20_SSE
  40. class _s20sseconsts
  41. {
  42. public:
  43. _s20sseconsts()
  44. {
  45. maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
  46. maskHi32 = _mm_slli_epi64(maskLo32, 32);
  47. }
  48. __m128i maskLo32,maskHi32;
  49. };
  50. static const _s20sseconsts _S20SSECONSTANTS;
  51. #endif
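// The two masks evaluate to per-64-bit-lane constants: maskLo32 keeps the low
// 32 bits of each 64-bit lane (0x00000000FFFFFFFF) and maskHi32 the high 32
// bits (0xFFFFFFFF00000000). An equivalent construction, as a sketch (SSE2):
//
//   __m128i lo = _mm_set_epi32(0, -1, 0, -1);  // == maskLo32
//   __m128i hi = _mm_set_epi32(-1, 0, -1, 0);  // == maskHi32
//
// They are used below to splice 32-bit words from two vectors when the
// keystream is reassembled into standard word order.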
  52. namespace ZeroTier {
  53. void Salsa20::init(const void *key,const void *iv)
  54. {
  55. #ifdef ZT_SALSA20_SSE
  56. const uint32_t *k = (const uint32_t *)key;
  57. _state.i[0] = 0x61707865;
  58. _state.i[3] = 0x6b206574;
  59. _state.i[13] = k[0];
  60. _state.i[10] = k[1];
  61. _state.i[7] = k[2];
  62. _state.i[4] = k[3];
  63. k += 4;
  64. _state.i[1] = 0x3320646e;
  65. _state.i[2] = 0x79622d32;
  66. _state.i[15] = k[0];
  67. _state.i[12] = k[1];
  68. _state.i[9] = k[2];
  69. _state.i[6] = k[3];
  70. _state.i[14] = ((const uint32_t *)iv)[0];
  71. _state.i[11] = ((const uint32_t *)iv)[1];
  72. _state.i[5] = 0;
  73. _state.i[8] = 0;
  74. #else
  75. const char *const constants = "expand 32-byte k";
  76. const uint8_t *k = (const uint8_t *)key;
  77. _state.i[1] = U8TO32_LITTLE(k + 0);
  78. _state.i[2] = U8TO32_LITTLE(k + 4);
  79. _state.i[3] = U8TO32_LITTLE(k + 8);
  80. _state.i[4] = U8TO32_LITTLE(k + 12);
  81. k += 16;
  82. _state.i[5] = U8TO32_LITTLE(constants + 4);
  83. _state.i[6] = U8TO32_LITTLE(((const uint8_t *)iv) + 0);
  84. _state.i[7] = U8TO32_LITTLE(((const uint8_t *)iv) + 4);
  85. _state.i[8] = 0;
  86. _state.i[9] = 0;
  87. _state.i[10] = U8TO32_LITTLE(constants + 8);
  88. _state.i[11] = U8TO32_LITTLE(k + 0);
  89. _state.i[12] = U8TO32_LITTLE(k + 4);
  90. _state.i[13] = U8TO32_LITTLE(k + 8);
  91. _state.i[14] = U8TO32_LITTLE(k + 12);
  92. _state.i[15] = U8TO32_LITTLE(constants + 12);
  93. _state.i[0] = U8TO32_LITTLE(constants + 0);
  94. #endif
  95. }
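/*
 * State layout note: the portable path stores the standard Salsa20 4x4 matrix
 * row-major, with the "expand 32-byte k" constants at words 0, 5, 10 and 15,
 * the 256-bit key at words 1-4 and 11-14, the 64-bit IV at words 6-7 and the
 * 64-bit block counter at words 8-9:
 *
 *   c0 k0 k1 k2
 *   k3 c1 n0 n1
 *   b0 b1 c2 k4
 *   k5 k6 k7 c3
 *
 * The SSE path instead stores the four diagonals of that matrix in
 * _state.v[0..3] (e.g. v[0] = {c0,c1,c2,c3}), which is why the word indices
 * above look scrambled and why the round code below re-rotates lanes with
 * _mm_shuffle_epi32 between half-rounds.
 */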
  96. void Salsa20::crypt12(const void *in,void *out,unsigned int bytes)
  97. {
  98. uint8_t tmp[64];
  99. const uint8_t *m = (const uint8_t *)in;
  100. uint8_t *c = (uint8_t *)out;
  101. uint8_t *ctarget = c;
  102. unsigned int i;
  103. #ifndef ZT_SALSA20_SSE
  104. uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  105. uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
  106. #endif
  107. if (!bytes)
  108. return;
  109. #ifndef ZT_SALSA20_SSE
  110. j0 = _state.i[0];
  111. j1 = _state.i[1];
  112. j2 = _state.i[2];
  113. j3 = _state.i[3];
  114. j4 = _state.i[4];
  115. j5 = _state.i[5];
  116. j6 = _state.i[6];
  117. j7 = _state.i[7];
  118. j8 = _state.i[8];
  119. j9 = _state.i[9];
  120. j10 = _state.i[10];
  121. j11 = _state.i[11];
  122. j12 = _state.i[12];
  123. j13 = _state.i[13];
  124. j14 = _state.i[14];
  125. j15 = _state.i[15];
  126. #endif
  127. for (;;) {
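// One 64-byte block per iteration; a short final block is staged in tmp[]
// here and the encrypted tail is copied back to ctarget before returning.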
  128. if (bytes < 64) {
  129. for (i = 0;i < bytes;++i)
  130. tmp[i] = m[i];
  131. m = tmp;
  132. ctarget = c;
  133. c = tmp;
  134. }
  135. #ifdef ZT_SALSA20_SSE
  136. __m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
  137. __m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
  138. __m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
  139. __m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
  140. __m128i T;
  141. __m128i X0s = X0;
  142. __m128i X1s = X1;
  143. __m128i X2s = X2;
  144. __m128i X3s = X3;
  145. // 2X round -------------------------------------------------------------
  146. T = _mm_add_epi32(X0, X3);
  147. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  148. T = _mm_add_epi32(X1, X0);
  149. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  150. T = _mm_add_epi32(X2, X1);
  151. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  152. T = _mm_add_epi32(X3, X2);
  153. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  154. X1 = _mm_shuffle_epi32(X1, 0x93);
  155. X2 = _mm_shuffle_epi32(X2, 0x4E);
  156. X3 = _mm_shuffle_epi32(X3, 0x39);
  157. T = _mm_add_epi32(X0, X1);
  158. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  159. T = _mm_add_epi32(X3, X0);
  160. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  161. T = _mm_add_epi32(X2, X3);
  162. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  163. T = _mm_add_epi32(X1, X2);
  164. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  165. X1 = _mm_shuffle_epi32(X1, 0x39);
  166. X2 = _mm_shuffle_epi32(X2, 0x4E);
  167. X3 = _mm_shuffle_epi32(X3, 0x93);
  168. // 2X round -------------------------------------------------------------
  169. T = _mm_add_epi32(X0, X3);
  170. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  171. T = _mm_add_epi32(X1, X0);
  172. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  173. T = _mm_add_epi32(X2, X1);
  174. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  175. T = _mm_add_epi32(X3, X2);
  176. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  177. X1 = _mm_shuffle_epi32(X1, 0x93);
  178. X2 = _mm_shuffle_epi32(X2, 0x4E);
  179. X3 = _mm_shuffle_epi32(X3, 0x39);
  180. T = _mm_add_epi32(X0, X1);
  181. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  182. T = _mm_add_epi32(X3, X0);
  183. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  184. T = _mm_add_epi32(X2, X3);
  185. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  186. T = _mm_add_epi32(X1, X2);
  187. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  188. X1 = _mm_shuffle_epi32(X1, 0x39);
  189. X2 = _mm_shuffle_epi32(X2, 0x4E);
  190. X3 = _mm_shuffle_epi32(X3, 0x93);
  191. // 2X round -------------------------------------------------------------
  192. T = _mm_add_epi32(X0, X3);
  193. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  194. T = _mm_add_epi32(X1, X0);
  195. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  196. T = _mm_add_epi32(X2, X1);
  197. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  198. T = _mm_add_epi32(X3, X2);
  199. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  200. X1 = _mm_shuffle_epi32(X1, 0x93);
  201. X2 = _mm_shuffle_epi32(X2, 0x4E);
  202. X3 = _mm_shuffle_epi32(X3, 0x39);
  203. T = _mm_add_epi32(X0, X1);
  204. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  205. T = _mm_add_epi32(X3, X0);
  206. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  207. T = _mm_add_epi32(X2, X3);
  208. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  209. T = _mm_add_epi32(X1, X2);
  210. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  211. X1 = _mm_shuffle_epi32(X1, 0x39);
  212. X2 = _mm_shuffle_epi32(X2, 0x4E);
  213. X3 = _mm_shuffle_epi32(X3, 0x93);
  214. // 2X round -------------------------------------------------------------
  215. T = _mm_add_epi32(X0, X3);
  216. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  217. T = _mm_add_epi32(X1, X0);
  218. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  219. T = _mm_add_epi32(X2, X1);
  220. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  221. T = _mm_add_epi32(X3, X2);
  222. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  223. X1 = _mm_shuffle_epi32(X1, 0x93);
  224. X2 = _mm_shuffle_epi32(X2, 0x4E);
  225. X3 = _mm_shuffle_epi32(X3, 0x39);
  226. T = _mm_add_epi32(X0, X1);
  227. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  228. T = _mm_add_epi32(X3, X0);
  229. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  230. T = _mm_add_epi32(X2, X3);
  231. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  232. T = _mm_add_epi32(X1, X2);
  233. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  234. X1 = _mm_shuffle_epi32(X1, 0x39);
  235. X2 = _mm_shuffle_epi32(X2, 0x4E);
  236. X3 = _mm_shuffle_epi32(X3, 0x93);
  237. // 2X round -------------------------------------------------------------
  238. T = _mm_add_epi32(X0, X3);
  239. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  240. T = _mm_add_epi32(X1, X0);
  241. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  242. T = _mm_add_epi32(X2, X1);
  243. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  244. T = _mm_add_epi32(X3, X2);
  245. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  246. X1 = _mm_shuffle_epi32(X1, 0x93);
  247. X2 = _mm_shuffle_epi32(X2, 0x4E);
  248. X3 = _mm_shuffle_epi32(X3, 0x39);
  249. T = _mm_add_epi32(X0, X1);
  250. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  251. T = _mm_add_epi32(X3, X0);
  252. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  253. T = _mm_add_epi32(X2, X3);
  254. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  255. T = _mm_add_epi32(X1, X2);
  256. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  257. X1 = _mm_shuffle_epi32(X1, 0x39);
  258. X2 = _mm_shuffle_epi32(X2, 0x4E);
  259. X3 = _mm_shuffle_epi32(X3, 0x93);
  260. // 2X round -------------------------------------------------------------
  261. T = _mm_add_epi32(X0, X3);
  262. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  263. T = _mm_add_epi32(X1, X0);
  264. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  265. T = _mm_add_epi32(X2, X1);
  266. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  267. T = _mm_add_epi32(X3, X2);
  268. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  269. X1 = _mm_shuffle_epi32(X1, 0x93);
  270. X2 = _mm_shuffle_epi32(X2, 0x4E);
  271. X3 = _mm_shuffle_epi32(X3, 0x39);
  272. T = _mm_add_epi32(X0, X1);
  273. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  274. T = _mm_add_epi32(X3, X0);
  275. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  276. T = _mm_add_epi32(X2, X3);
  277. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  278. T = _mm_add_epi32(X1, X2);
  279. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  280. X1 = _mm_shuffle_epi32(X1, 0x39);
  281. X2 = _mm_shuffle_epi32(X2, 0x4E);
  282. X3 = _mm_shuffle_epi32(X3, 0x93);
  283. X0 = _mm_add_epi32(X0s,X0);
  284. X1 = _mm_add_epi32(X1s,X1);
  285. X2 = _mm_add_epi32(X2s,X2);
  286. X3 = _mm_add_epi32(X3s,X3);
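// The working vectors hold diagonals of the Salsa20 matrix, so the keystream
// must be spliced back into standard word order before it is XORed with the
// input: k02/k13/k20/k31 gather 32-bit words from pairs of vectors, and the
// four unpack/store lines below then write output words 0-15 in order.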
  287. __m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
  288. __m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
  289. __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
  290. __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
  291. _mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m))))));
  292. _mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 4)))));
  293. _mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 8)))));
  294. _mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 12)))));
  295. if (!(++_state.i[8])) {
  296. ++_state.i[5]; // state reordered for SSE
  297. /* stopping at 2^70 bytes per nonce is user's responsibility */
  298. }
  299. #else
  300. x0 = j0;
  301. x1 = j1;
  302. x2 = j2;
  303. x3 = j3;
  304. x4 = j4;
  305. x5 = j5;
  306. x6 = j6;
  307. x7 = j7;
  308. x8 = j8;
  309. x9 = j9;
  310. x10 = j10;
  311. x11 = j11;
  312. x12 = j12;
  313. x13 = j13;
  314. x14 = j14;
  315. x15 = j15;
  316. // 2X round -------------------------------------------------------------
  317. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  318. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  319. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  320. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  321. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  322. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  323. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  324. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  325. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  326. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  327. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  328. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  329. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  330. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  331. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  332. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  333. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  334. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  335. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  336. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  337. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  338. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  339. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  340. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  341. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  342. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  343. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  344. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  345. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  346. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  347. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  348. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  349. // 2X round -------------------------------------------------------------
  350. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  351. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  352. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  353. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  354. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  355. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  356. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  357. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  358. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  359. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  360. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  361. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  362. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  363. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  364. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  365. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  366. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  367. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  368. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  369. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  370. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  371. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  372. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  373. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  374. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  375. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  376. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  377. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  378. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  379. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  380. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  381. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  382. // 2X round -------------------------------------------------------------
  383. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  384. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  385. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  386. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  387. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  388. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  389. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  390. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  391. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  392. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  393. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  394. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  395. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  396. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  397. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  398. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  399. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  400. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  401. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  402. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  403. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  404. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  405. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  406. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  407. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  408. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  409. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  410. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  411. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  412. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  413. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  414. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  415. // 2X round -------------------------------------------------------------
  416. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  417. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  418. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  419. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  420. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  421. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  422. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  423. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  424. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  425. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  426. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  427. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  428. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  429. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  430. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  431. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  432. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  433. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  434. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  435. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  436. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  437. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  438. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  439. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  440. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  441. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  442. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  443. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  444. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  445. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  446. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  447. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  448. // 2X round -------------------------------------------------------------
  449. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  450. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  451. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  452. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  453. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  454. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  455. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  456. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  457. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  458. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  459. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  460. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  461. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  462. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  463. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  464. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  465. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  466. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  467. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  468. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  469. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  470. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  471. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  472. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  473. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  474. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  475. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  476. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  477. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  478. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  479. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  480. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  481. // 2X round -------------------------------------------------------------
  482. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  483. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  484. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  485. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  486. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  487. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  488. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  489. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  490. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  491. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  492. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  493. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  494. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  495. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  496. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  497. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  498. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  499. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  500. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  501. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  502. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  503. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  504. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  505. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  506. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  507. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  508. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  509. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  510. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  511. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  512. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  513. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  514. x0 = PLUS(x0,j0);
  515. x1 = PLUS(x1,j1);
  516. x2 = PLUS(x2,j2);
  517. x3 = PLUS(x3,j3);
  518. x4 = PLUS(x4,j4);
  519. x5 = PLUS(x5,j5);
  520. x6 = PLUS(x6,j6);
  521. x7 = PLUS(x7,j7);
  522. x8 = PLUS(x8,j8);
  523. x9 = PLUS(x9,j9);
  524. x10 = PLUS(x10,j10);
  525. x11 = PLUS(x11,j11);
  526. x12 = PLUS(x12,j12);
  527. x13 = PLUS(x13,j13);
  528. x14 = PLUS(x14,j14);
  529. x15 = PLUS(x15,j15);
  530. U32TO8_LITTLE(c + 0,XOR(x0,U8TO32_LITTLE(m + 0)));
  531. U32TO8_LITTLE(c + 4,XOR(x1,U8TO32_LITTLE(m + 4)));
  532. U32TO8_LITTLE(c + 8,XOR(x2,U8TO32_LITTLE(m + 8)));
  533. U32TO8_LITTLE(c + 12,XOR(x3,U8TO32_LITTLE(m + 12)));
  534. U32TO8_LITTLE(c + 16,XOR(x4,U8TO32_LITTLE(m + 16)));
  535. U32TO8_LITTLE(c + 20,XOR(x5,U8TO32_LITTLE(m + 20)));
  536. U32TO8_LITTLE(c + 24,XOR(x6,U8TO32_LITTLE(m + 24)));
  537. U32TO8_LITTLE(c + 28,XOR(x7,U8TO32_LITTLE(m + 28)));
  538. U32TO8_LITTLE(c + 32,XOR(x8,U8TO32_LITTLE(m + 32)));
  539. U32TO8_LITTLE(c + 36,XOR(x9,U8TO32_LITTLE(m + 36)));
  540. U32TO8_LITTLE(c + 40,XOR(x10,U8TO32_LITTLE(m + 40)));
  541. U32TO8_LITTLE(c + 44,XOR(x11,U8TO32_LITTLE(m + 44)));
  542. U32TO8_LITTLE(c + 48,XOR(x12,U8TO32_LITTLE(m + 48)));
  543. U32TO8_LITTLE(c + 52,XOR(x13,U8TO32_LITTLE(m + 52)));
  544. U32TO8_LITTLE(c + 56,XOR(x14,U8TO32_LITTLE(m + 56)));
  545. U32TO8_LITTLE(c + 60,XOR(x15,U8TO32_LITTLE(m + 60)));
  546. if (!(++j8)) {
  547. ++j9;
  548. /* stopping at 2^70 bytes per nonce is user's responsibility */
  549. }
  550. #endif
  551. if (bytes <= 64) {
  552. if (bytes < 64) {
  553. for (i = 0;i < bytes;++i)
  554. ctarget[i] = c[i];
  555. }
  556. #ifndef ZT_SALSA20_SSE
  557. _state.i[8] = j8;
  558. _state.i[9] = j9;
  559. #endif
  560. return;
  561. }
  562. bytes -= 64;
  563. c += 64;
  564. m += 64;
  565. }
  566. }
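/*
 * Typical use, as a sketch (the actual constructor and IV-size requirements
 * live in Salsa20.hpp and may differ; this assumes the 256-bit key and 64-bit
 * IV layout set up by init() above):
 *
 *   uint8_t key[32], iv[8], buf[1500];
 *   // ... fill key, iv and buf ...
 *   ZeroTier::Salsa20 s20;
 *   s20.init(key, iv);
 *   s20.crypt12(buf, buf, sizeof(buf)); // in-place: XOR with the keystream
 *   // decrypting is the same call with the same key/IV and counter position
 */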
  567. void Salsa20::crypt20(const void *in,void *out,unsigned int bytes)
  568. {
  569. uint8_t tmp[64];
  570. const uint8_t *m = (const uint8_t *)in;
  571. uint8_t *c = (uint8_t *)out;
  572. uint8_t *ctarget = c;
  573. unsigned int i;
  574. #ifndef ZT_SALSA20_SSE
  575. uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
  576. uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
  577. #endif
  578. if (!bytes)
  579. return;
  580. #ifndef ZT_SALSA20_SSE
  581. j0 = _state.i[0];
  582. j1 = _state.i[1];
  583. j2 = _state.i[2];
  584. j3 = _state.i[3];
  585. j4 = _state.i[4];
  586. j5 = _state.i[5];
  587. j6 = _state.i[6];
  588. j7 = _state.i[7];
  589. j8 = _state.i[8];
  590. j9 = _state.i[9];
  591. j10 = _state.i[10];
  592. j11 = _state.i[11];
  593. j12 = _state.i[12];
  594. j13 = _state.i[13];
  595. j14 = _state.i[14];
  596. j15 = _state.i[15];
  597. #endif
  598. for (;;) {
  599. if (bytes < 64) {
  600. for (i = 0;i < bytes;++i)
  601. tmp[i] = m[i];
  602. m = tmp;
  603. ctarget = c;
  604. c = tmp;
  605. }
  606. #ifdef ZT_SALSA20_SSE
  607. __m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
  608. __m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
  609. __m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
  610. __m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
  611. __m128i T;
  612. __m128i X0s = X0;
  613. __m128i X1s = X1;
  614. __m128i X2s = X2;
  615. __m128i X3s = X3;
  616. // 2X round -------------------------------------------------------------
  617. T = _mm_add_epi32(X0, X3);
  618. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  619. T = _mm_add_epi32(X1, X0);
  620. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  621. T = _mm_add_epi32(X2, X1);
  622. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  623. T = _mm_add_epi32(X3, X2);
  624. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  625. X1 = _mm_shuffle_epi32(X1, 0x93);
  626. X2 = _mm_shuffle_epi32(X2, 0x4E);
  627. X3 = _mm_shuffle_epi32(X3, 0x39);
  628. T = _mm_add_epi32(X0, X1);
  629. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  630. T = _mm_add_epi32(X3, X0);
  631. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  632. T = _mm_add_epi32(X2, X3);
  633. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  634. T = _mm_add_epi32(X1, X2);
  635. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  636. X1 = _mm_shuffle_epi32(X1, 0x39);
  637. X2 = _mm_shuffle_epi32(X2, 0x4E);
  638. X3 = _mm_shuffle_epi32(X3, 0x93);
  639. // 2X round -------------------------------------------------------------
  640. T = _mm_add_epi32(X0, X3);
  641. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  642. T = _mm_add_epi32(X1, X0);
  643. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  644. T = _mm_add_epi32(X2, X1);
  645. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  646. T = _mm_add_epi32(X3, X2);
  647. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  648. X1 = _mm_shuffle_epi32(X1, 0x93);
  649. X2 = _mm_shuffle_epi32(X2, 0x4E);
  650. X3 = _mm_shuffle_epi32(X3, 0x39);
  651. T = _mm_add_epi32(X0, X1);
  652. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  653. T = _mm_add_epi32(X3, X0);
  654. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  655. T = _mm_add_epi32(X2, X3);
  656. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  657. T = _mm_add_epi32(X1, X2);
  658. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  659. X1 = _mm_shuffle_epi32(X1, 0x39);
  660. X2 = _mm_shuffle_epi32(X2, 0x4E);
  661. X3 = _mm_shuffle_epi32(X3, 0x93);
  662. // 2X round -------------------------------------------------------------
  663. T = _mm_add_epi32(X0, X3);
  664. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  665. T = _mm_add_epi32(X1, X0);
  666. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  667. T = _mm_add_epi32(X2, X1);
  668. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  669. T = _mm_add_epi32(X3, X2);
  670. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  671. X1 = _mm_shuffle_epi32(X1, 0x93);
  672. X2 = _mm_shuffle_epi32(X2, 0x4E);
  673. X3 = _mm_shuffle_epi32(X3, 0x39);
  674. T = _mm_add_epi32(X0, X1);
  675. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  676. T = _mm_add_epi32(X3, X0);
  677. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  678. T = _mm_add_epi32(X2, X3);
  679. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  680. T = _mm_add_epi32(X1, X2);
  681. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  682. X1 = _mm_shuffle_epi32(X1, 0x39);
  683. X2 = _mm_shuffle_epi32(X2, 0x4E);
  684. X3 = _mm_shuffle_epi32(X3, 0x93);
  685. // 2X round -------------------------------------------------------------
  686. T = _mm_add_epi32(X0, X3);
  687. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  688. T = _mm_add_epi32(X1, X0);
  689. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  690. T = _mm_add_epi32(X2, X1);
  691. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  692. T = _mm_add_epi32(X3, X2);
  693. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  694. X1 = _mm_shuffle_epi32(X1, 0x93);
  695. X2 = _mm_shuffle_epi32(X2, 0x4E);
  696. X3 = _mm_shuffle_epi32(X3, 0x39);
  697. T = _mm_add_epi32(X0, X1);
  698. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  699. T = _mm_add_epi32(X3, X0);
  700. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  701. T = _mm_add_epi32(X2, X3);
  702. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  703. T = _mm_add_epi32(X1, X2);
  704. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  705. X1 = _mm_shuffle_epi32(X1, 0x39);
  706. X2 = _mm_shuffle_epi32(X2, 0x4E);
  707. X3 = _mm_shuffle_epi32(X3, 0x93);
  708. // 2X round -------------------------------------------------------------
  709. T = _mm_add_epi32(X0, X3);
  710. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  711. T = _mm_add_epi32(X1, X0);
  712. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  713. T = _mm_add_epi32(X2, X1);
  714. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  715. T = _mm_add_epi32(X3, X2);
  716. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  717. X1 = _mm_shuffle_epi32(X1, 0x93);
  718. X2 = _mm_shuffle_epi32(X2, 0x4E);
  719. X3 = _mm_shuffle_epi32(X3, 0x39);
  720. T = _mm_add_epi32(X0, X1);
  721. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  722. T = _mm_add_epi32(X3, X0);
  723. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  724. T = _mm_add_epi32(X2, X3);
  725. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  726. T = _mm_add_epi32(X1, X2);
  727. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  728. X1 = _mm_shuffle_epi32(X1, 0x39);
  729. X2 = _mm_shuffle_epi32(X2, 0x4E);
  730. X3 = _mm_shuffle_epi32(X3, 0x93);
  731. // 2X round -------------------------------------------------------------
  732. T = _mm_add_epi32(X0, X3);
  733. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  734. T = _mm_add_epi32(X1, X0);
  735. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  736. T = _mm_add_epi32(X2, X1);
  737. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  738. T = _mm_add_epi32(X3, X2);
  739. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  740. X1 = _mm_shuffle_epi32(X1, 0x93);
  741. X2 = _mm_shuffle_epi32(X2, 0x4E);
  742. X3 = _mm_shuffle_epi32(X3, 0x39);
  743. T = _mm_add_epi32(X0, X1);
  744. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  745. T = _mm_add_epi32(X3, X0);
  746. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  747. T = _mm_add_epi32(X2, X3);
  748. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  749. T = _mm_add_epi32(X1, X2);
  750. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  751. X1 = _mm_shuffle_epi32(X1, 0x39);
  752. X2 = _mm_shuffle_epi32(X2, 0x4E);
  753. X3 = _mm_shuffle_epi32(X3, 0x93);
  754. // 2X round -------------------------------------------------------------
  755. T = _mm_add_epi32(X0, X3);
  756. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  757. T = _mm_add_epi32(X1, X0);
  758. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  759. T = _mm_add_epi32(X2, X1);
  760. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  761. T = _mm_add_epi32(X3, X2);
  762. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  763. X1 = _mm_shuffle_epi32(X1, 0x93);
  764. X2 = _mm_shuffle_epi32(X2, 0x4E);
  765. X3 = _mm_shuffle_epi32(X3, 0x39);
  766. T = _mm_add_epi32(X0, X1);
  767. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  768. T = _mm_add_epi32(X3, X0);
  769. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  770. T = _mm_add_epi32(X2, X3);
  771. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  772. T = _mm_add_epi32(X1, X2);
  773. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  774. X1 = _mm_shuffle_epi32(X1, 0x39);
  775. X2 = _mm_shuffle_epi32(X2, 0x4E);
  776. X3 = _mm_shuffle_epi32(X3, 0x93);
  777. // 2X round -------------------------------------------------------------
  778. T = _mm_add_epi32(X0, X3);
  779. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  780. T = _mm_add_epi32(X1, X0);
  781. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  782. T = _mm_add_epi32(X2, X1);
  783. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  784. T = _mm_add_epi32(X3, X2);
  785. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  786. X1 = _mm_shuffle_epi32(X1, 0x93);
  787. X2 = _mm_shuffle_epi32(X2, 0x4E);
  788. X3 = _mm_shuffle_epi32(X3, 0x39);
  789. T = _mm_add_epi32(X0, X1);
  790. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  791. T = _mm_add_epi32(X3, X0);
  792. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  793. T = _mm_add_epi32(X2, X3);
  794. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  795. T = _mm_add_epi32(X1, X2);
  796. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  797. X1 = _mm_shuffle_epi32(X1, 0x39);
  798. X2 = _mm_shuffle_epi32(X2, 0x4E);
  799. X3 = _mm_shuffle_epi32(X3, 0x93);
  800. // 2X round -------------------------------------------------------------
  801. T = _mm_add_epi32(X0, X3);
  802. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  803. T = _mm_add_epi32(X1, X0);
  804. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  805. T = _mm_add_epi32(X2, X1);
  806. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  807. T = _mm_add_epi32(X3, X2);
  808. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  809. X1 = _mm_shuffle_epi32(X1, 0x93);
  810. X2 = _mm_shuffle_epi32(X2, 0x4E);
  811. X3 = _mm_shuffle_epi32(X3, 0x39);
  812. T = _mm_add_epi32(X0, X1);
  813. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  814. T = _mm_add_epi32(X3, X0);
  815. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  816. T = _mm_add_epi32(X2, X3);
  817. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  818. T = _mm_add_epi32(X1, X2);
  819. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  820. X1 = _mm_shuffle_epi32(X1, 0x39);
  821. X2 = _mm_shuffle_epi32(X2, 0x4E);
  822. X3 = _mm_shuffle_epi32(X3, 0x93);
  823. // 2X round -------------------------------------------------------------
  824. T = _mm_add_epi32(X0, X3);
  825. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  826. T = _mm_add_epi32(X1, X0);
  827. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  828. T = _mm_add_epi32(X2, X1);
  829. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  830. T = _mm_add_epi32(X3, X2);
  831. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  832. X1 = _mm_shuffle_epi32(X1, 0x93);
  833. X2 = _mm_shuffle_epi32(X2, 0x4E);
  834. X3 = _mm_shuffle_epi32(X3, 0x39);
  835. T = _mm_add_epi32(X0, X1);
  836. X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
  837. T = _mm_add_epi32(X3, X0);
  838. X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
  839. T = _mm_add_epi32(X2, X3);
  840. X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
  841. T = _mm_add_epi32(X1, X2);
  842. X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
  843. X1 = _mm_shuffle_epi32(X1, 0x39);
  844. X2 = _mm_shuffle_epi32(X2, 0x4E);
  845. X3 = _mm_shuffle_epi32(X3, 0x93);
  846. X0 = _mm_add_epi32(X0s,X0);
  847. X1 = _mm_add_epi32(X1s,X1);
  848. X2 = _mm_add_epi32(X2s,X2);
  849. X3 = _mm_add_epi32(X3s,X3);
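// As in crypt12(): splice the diagonal-ordered keystream vectors back into
// standard word order before XORing with the input.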
  850. __m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
  851. __m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
  852. __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
  853. __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
  854. _mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m))))));
  855. _mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 4)))));
  856. _mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 8)))));
  857. _mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 12)))));
  858. if (!(++_state.i[8])) {
  859. ++_state.i[5]; // state reordered for SSE
  860. /* stopping at 2^70 bytes per nonce is user's responsibility */
  861. }
  862. #else
  863. x0 = j0;
  864. x1 = j1;
  865. x2 = j2;
  866. x3 = j3;
  867. x4 = j4;
  868. x5 = j5;
  869. x6 = j6;
  870. x7 = j7;
  871. x8 = j8;
  872. x9 = j9;
  873. x10 = j10;
  874. x11 = j11;
  875. x12 = j12;
  876. x13 = j13;
  877. x14 = j14;
  878. x15 = j15;
  879. // 2X round -------------------------------------------------------------
  880. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  881. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  882. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  883. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  884. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  885. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  886. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  887. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  888. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  889. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  890. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  891. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  892. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  893. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  894. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  895. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  896. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  897. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  898. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  899. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  900. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  901. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  902. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  903. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  904. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  905. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  906. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  907. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  908. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  909. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  910. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  911. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  912. // 2X round -------------------------------------------------------------
  913. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  914. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  915. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  916. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  917. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  918. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  919. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  920. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  921. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  922. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  923. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  924. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  925. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  926. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  927. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  928. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  929. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  930. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  931. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  932. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  933. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  934. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  935. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  936. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  937. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  938. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  939. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  940. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  941. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  942. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  943. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  944. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  945. // 2X round -------------------------------------------------------------
  946. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  947. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  948. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  949. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  950. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  951. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  952. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  953. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  954. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  955. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  956. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  957. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  958. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  959. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  960. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  961. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  962. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  963. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  964. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  965. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  966. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  967. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  968. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  969. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  970. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  971. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  972. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  973. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  974. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  975. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  976. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  977. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
  978. // 2X round -------------------------------------------------------------
  979. x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
  980. x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
  981. x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
  982. x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
  983. x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
  984. x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
  985. x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
  986. x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
  987. x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
  988. x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
  989. x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
  990. x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
  991. x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
  992. x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
  993. x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
  994. x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
  995. x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
  996. x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
  997. x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
  998. x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
  999. x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
  1000. x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
  1001. x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
  1002. x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
  1003. x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
  1004. x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
  1005. x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
  1006. x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
  1007. x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
  1008. x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
  1009. x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
  1010. x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
// 2X round -------------------------------------------------------------
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
// 2X round -------------------------------------------------------------
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
// 2X round -------------------------------------------------------------
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
// 2X round -------------------------------------------------------------
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
// 2X round -------------------------------------------------------------
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
// 2X round -------------------------------------------------------------
x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
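// Feed-forward: add the saved input words (j0..j15) back into the rounds
// output, so the block function cannot be run backwards from the keystream.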
x0 = PLUS(x0,j0);
x1 = PLUS(x1,j1);
x2 = PLUS(x2,j2);
x3 = PLUS(x3,j3);
x4 = PLUS(x4,j4);
x5 = PLUS(x5,j5);
x6 = PLUS(x6,j6);
x7 = PLUS(x7,j7);
x8 = PLUS(x8,j8);
x9 = PLUS(x9,j9);
x10 = PLUS(x10,j10);
x11 = PLUS(x11,j11);
x12 = PLUS(x12,j12);
x13 = PLUS(x13,j13);
x14 = PLUS(x14,j14);
x15 = PLUS(x15,j15);
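// Serialize each keystream word little-endian and XOR it with the
// corresponding 4 bytes of input (m), writing 64 bytes of output (c).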
U32TO8_LITTLE(c + 0,XOR(x0,U8TO32_LITTLE(m + 0)));
U32TO8_LITTLE(c + 4,XOR(x1,U8TO32_LITTLE(m + 4)));
U32TO8_LITTLE(c + 8,XOR(x2,U8TO32_LITTLE(m + 8)));
U32TO8_LITTLE(c + 12,XOR(x3,U8TO32_LITTLE(m + 12)));
U32TO8_LITTLE(c + 16,XOR(x4,U8TO32_LITTLE(m + 16)));
U32TO8_LITTLE(c + 20,XOR(x5,U8TO32_LITTLE(m + 20)));
U32TO8_LITTLE(c + 24,XOR(x6,U8TO32_LITTLE(m + 24)));
U32TO8_LITTLE(c + 28,XOR(x7,U8TO32_LITTLE(m + 28)));
U32TO8_LITTLE(c + 32,XOR(x8,U8TO32_LITTLE(m + 32)));
U32TO8_LITTLE(c + 36,XOR(x9,U8TO32_LITTLE(m + 36)));
U32TO8_LITTLE(c + 40,XOR(x10,U8TO32_LITTLE(m + 40)));
U32TO8_LITTLE(c + 44,XOR(x11,U8TO32_LITTLE(m + 44)));
U32TO8_LITTLE(c + 48,XOR(x12,U8TO32_LITTLE(m + 48)));
U32TO8_LITTLE(c + 52,XOR(x13,U8TO32_LITTLE(m + 52)));
U32TO8_LITTLE(c + 56,XOR(x14,U8TO32_LITTLE(m + 56)));
U32TO8_LITTLE(c + 60,XOR(x15,U8TO32_LITTLE(m + 60)));
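// The 64-bit block counter lives in state words 8 (low) and 9 (high);
// carry into the high word when the low word wraps.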
if (!(++j8)) {
  ++j9;
  /* stopping at 2^70 bytes per nonce is user's responsibility */
}
#endif
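// Last block: if it was partial (< 64 bytes), copy just those bytes to the
// real destination (ctarget); on the portable (non-SSE) path, save the
// updated counter back into _state before returning.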
if (bytes <= 64) {
  if (bytes < 64) {
    for (i = 0;i < bytes;++i)
      ctarget[i] = c[i];
  }
#ifndef ZT_SALSA20_SSE
  _state.i[8] = j8;
  _state.i[9] = j9;
#endif
  return;
}
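// A full 64-byte block was produced; advance to the next block and loop.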
bytes -= 64;
c += 64;
m += 64;
}
}
} // namespace ZeroTier