071-crypto-arm-chacha-neon-optimize-for-non-block-size-m.patch

From 03662fcd41f4b764857f17b95f9a2a63c24bddd4 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <[email protected]>
Date: Tue, 3 Nov 2020 17:28:09 +0100
Subject: [PATCH 1/2] crypto: arm/chacha-neon - optimize for non-block size
 multiples

commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.

The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.

For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.
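Spelled out, that split of a 1420 byte packet is:

    1420 = 5 * 256 + 140    (five chacha_4block_xor_neon() calls)
     140 = 2 * 64  + 12     (two chacha_block_xor_neon() calls, plus a
                             third one through the stack buffer for the
                             final 12 bytes)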
Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.

This results in the following performance improvements for 1420 byte
blocks, without significant impact on power-of-2 input sizes. (Note
that Raspberry Pi is widely used in combination with a 32-bit kernel,
even though the core is 64-bit capable)

   Cortex-A8 (BeagleBone)       :   7%
   Cortex-A15 (Calxeda Midway)  :  21%
   Cortex-A53 (Raspberry Pi 3)  :   3%
   Cortex-A72 (Raspberry Pi 4)  :  19%

Cc: Eric Biggers <[email protected]>
Cc: "Jason A . Donenfeld" <[email protected]>
Signed-off-by: Ard Biesheuvel <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
 arch/arm/crypto/chacha-glue.c      | 34 +++++------
 arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 24 deletions(-)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
 asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
				       int nrounds);
 asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-				       int nrounds);
+				       int nrounds, unsigned int nbytes);
 asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
 asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
 
@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
 {
 	u8 buf[CHACHA_BLOCK_SIZE];
 
-	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
-		chacha_4block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE * 4;
-		src += CHACHA_BLOCK_SIZE * 4;
-		dst += CHACHA_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	while (bytes >= CHACHA_BLOCK_SIZE) {
-		chacha_block_xor_neon(state, dst, src, nrounds);
-		bytes -= CHACHA_BLOCK_SIZE;
-		src += CHACHA_BLOCK_SIZE;
-		dst += CHACHA_BLOCK_SIZE;
-		state[12]++;
+	while (bytes > CHACHA_BLOCK_SIZE) {
+		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
 	}
 	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, buf, buf, nrounds);
-		memcpy(dst, buf, bytes);
+		const u8 *s = src;
+		u8 *d = dst;
+
+		if (bytes != CHACHA_BLOCK_SIZE)
+			s = d = memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, d, s, nrounds);
+		if (d != dst)
+			memcpy(dst, buf, bytes);
 	}
 }
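For illustration, here is a stand-alone C sketch (not part of the patch;
the chacha_4block_xor_neon() call is stubbed out as a comment) of how
the reworked chacha_doneon() loop above carves up a 1420 byte request:

    #include <stdio.h>

    #define CHACHA_BLOCK_SIZE 64
    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned int bytes = 1420, ctr = 0;

        while (bytes > CHACHA_BLOCK_SIZE) {
            unsigned int l = MIN(bytes, CHACHA_BLOCK_SIZE * 4U);

            /* chacha_4block_xor_neon(state, dst, src, nrounds, l); */
            bytes -= l;
            ctr += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
            printf("4block call: %3u bytes, counter -> %u\n", l, ctr);
        }
        /* whatever is left (here: 0) goes to the single-block path */
        printf("single-block path: %u bytes\n", bytes);
        return 0;
    }

This prints five 256 byte calls followed by one 140 byte call: the
entire tail is handled by chacha_4block_xor_neon(), and the
memcpy()-based single-block fallback is never entered.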
--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -47,6 +47,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/cache.h>
 
 	.text
 	.fpu		neon
@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
 
 	.align		5
 ENTRY(chacha_4block_xor_neon)
-	push		{r4-r5}
+	push		{r4, lr}
 	mov		r4, sp			// preserve the stack pointer
 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
 	bic		ip, ip, #0x1f		// aligned to 32 bytes
@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
 	vld1.32		{q0-q1}, [r0]
 	vld1.32		{q2-q3}, [ip]
 
-	adr		r5, .Lctrinc
+	adr		lr, .Lctrinc
 	vdup.32		q15, d7[1]
 	vdup.32		q14, d7[0]
-	vld1.32		{q4}, [r5, :128]
+	vld1.32		{q4}, [lr, :128]
 	vdup.32		q13, d6[1]
 	vdup.32		q12, d6[0]
 	vdup.32		q11, d5[1]
@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
 
 	// Re-interleave the words in the first two rows of each block (x0..7).
 	// Also add the counter values 0-3 to x12[0-3].
-	vld1.32		{q8}, [r5, :128]	// load counter values 0-3
+	vld1.32		{q8}, [lr, :128]	// load counter values 0-3
 	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
 	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
 	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
 
 	// Re-interleave the words in the last two rows of each block (x8..15).
 	vld1.32		{q8-q9}, [sp, :256]
+	mov		sp, r4			// restore original stack pointer
+	ldr		r4, [r4, #8]		// load number of bytes
 	vzip.32		q12, q13		// => (12 13 12 13) (12 13 12 13)
 	vzip.32		q14, q15		// => (14 15 14 15) (14 15 14 15)
 	vzip.32		q8, q9			// => (8 9 8 9) (8 9 8 9)
@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
 	// XOR the rest of the data with the keystream
 	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #96
 	veor		q0, q0, q8
 	veor		q1, q1, q12
+	ble		.Lle96
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
 	veor		q0, q0, q2
 	veor		q1, q1, q6
+	ble		.Lle128
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
 	veor		q0, q0, q10
 	veor		q1, q1, q14
+	ble		.Lle160
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
 	veor		q0, q0, q4
 	veor		q1, q1, q5
+	ble		.Lle192
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
 	veor		q0, q0, q9
 	veor		q1, q1, q13
+	ble		.Lle224
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
 	veor		q0, q0, q3
 	veor		q1, q1, q7
+	blt		.Llt256
+.Lout:
 	vst1.8		{q0-q1}, [r1]!
 
 	vld1.8		{q0-q1}, [r2]
-	mov		sp, r4			// restore original stack pointer
 	veor		q0, q0, q11
 	veor		q1, q1, q15
 	vst1.8		{q0-q1}, [r1]
-	pop		{r4-r5}
-	bx		lr
+	pop		{r4, pc}
+
+.Lle192:
+	vmov		q4, q9
+	vmov		q5, q13
+
+.Lle160:
+	// nothing to do
+
+.Lfinalblock:
+	// Process the final block if processing less than 4 full blocks.
+	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+	// previous 32 byte output block that still needs to be written at
+	// [r1] in q0-q1.
+	beq		.Lfullblock
+
+.Lpartialblock:
+	adr		lr, .Lpermute + 32
+	add		r2, r2, r4
+	add		lr, lr, r4
+	add		r4, r4, r1
+
+	vld1.8		{q2-q3}, [lr]
+	vld1.8		{q6-q7}, [r2]
+
+	add		r4, r4, #32
+
+	vtbl.8		d4, {q4-q5}, d4
+	vtbl.8		d5, {q4-q5}, d5
+	vtbl.8		d6, {q4-q5}, d6
+	vtbl.8		d7, {q4-q5}, d7
+
+	veor		q6, q6, q2
+	veor		q7, q7, q3
+
+	vst1.8		{q6-q7}, [r4]	// overlapping stores
+	vst1.8		{q0-q1}, [r1]
+	pop		{r4, pc}
+
+.Lfullblock:
+	vmov		q11, q4
+	vmov		q15, q5
+	b		.Lout
+.Lle96:
+	vmov		q4, q2
+	vmov		q5, q6
+	b		.Lfinalblock
+.Lle128:
+	vmov		q4, q10
+	vmov		q5, q14
+	b		.Lfinalblock
+.Lle224:
+	vmov		q4, q3
+	vmov		q5, q7
+	b		.Lfinalblock
+.Llt256:
+	vmov		q4, q11
+	vmov		q5, q15
+	b		.Lpartialblock
 ENDPROC(chacha_4block_xor_neon)
+
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
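As a rough C model of the .Lpartialblock path above (the function name
and memory layout here are hypothetical, and this is not a line-for-line
translation of the NEON code): the final 1..31 bytes are handled by
reloading the last full 32 byte window of the input, XORing it with the
keystream rotated left by the tail length (the job of the vtbl.8 lookups
through the doubled .Lpermute table), storing that window so that it
ends exactly at the end of the output, and then letting the store of the
previous, correct 32 byte block overwrite the stale prefix of the window:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* dst/src are pos + 32 + tail bytes long; 'prev' is the XORed
     * 32 byte block at offset 'pos' that has not been stored yet
     * (q0-q1 in the asm); 'ks' is the keystream for the tail block
     * (q4-q5); 'tail' is in 1..31. */
    void chacha_tail_model(uint8_t *dst, const uint8_t *src,
                           const uint8_t prev[32], const uint8_t ks[32],
                           size_t pos, size_t tail)
    {
        uint8_t win[32];
        size_t end = pos + 32 + tail;
        size_t i;

        /* reload the last 32 input bytes; they straddle the previous
         * block and the tail */
        memcpy(win, src + end - 32, 32);

        /* rotate the keystream left by 'tail' so that its first 'tail'
         * bytes line up with the trailing 'tail' window positions */
        for (i = 0; i < 32; i++)
            win[i] ^= ks[(i + tail) % 32];

        memcpy(dst + end - 32, win, 32); /* overlapping store */
        memcpy(dst + pos, prev, 32);     /* rewrites the stale prefix */
    }

    int main(void)
    {
        uint8_t src[44], dst[44], prev[32], ks[32];
        size_t i;

        for (i = 0; i < 44; i++) src[i] = (uint8_t)i;
        for (i = 0; i < 32; i++) ks[i] = (uint8_t)(0xa0 + i);
        for (i = 0; i < 32; i++) prev[i] = src[i] ^ (uint8_t)(0x50 + i);

        chacha_tail_model(dst, src, prev, ks, 0, 12);

        for (i = 0; i < 32; i++) assert(dst[i] == prev[i]);
        for (i = 0; i < 12; i++) assert(dst[32 + i] == (src[32 + i] ^ ks[i]));
        return 0;
    }

The ordering of the two final stores is what makes the overlap safe:
every byte of 'win' that was XORed with the wrong keystream byte falls
inside [pos, pos + 32) and is rewritten by the store of 'prev'.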