0014-riscv-Optimize-memcpy-with-aligned-version.patch 12 KB

  1. From 16fef31de538ce55e286b630d0b33d872707420d Mon Sep 17 00:00:00 2001
  2. From: Mason Huo <[email protected]>
  3. Date: Tue, 20 Jun 2023 13:37:52 +0800
  4. Subject: [PATCH 14/55] riscv: Optimize memcpy with aligned version
  5. Optimizing the 128 byte align case, this will improve the
  6. performance of large block memcpy.
  7. Here we combine the memcpy of glibc and kernel.
  8. Signed-off-by: Mason Huo <[email protected]>
  9. Signed-off-by: Hal Feng <[email protected]>
  10. ---
  11. arch/riscv/lib/Makefile | 3 +-
  12. arch/riscv/lib/{memcpy.S => memcpy_aligned.S} | 37 +--
  13. arch/riscv/lib/string.c | 266 ++++++++++++++++++
  14. 3 files changed, 274 insertions(+), 32 deletions(-)
  15. rename arch/riscv/lib/{memcpy.S => memcpy_aligned.S} (65%)
  16. create mode 100644 arch/riscv/lib/string.c
  17. --- a/arch/riscv/lib/Makefile
  18. +++ b/arch/riscv/lib/Makefile
  19. @@ -1,6 +1,5 @@
  20. # SPDX-License-Identifier: GPL-2.0-only
  21. lib-y += delay.o
  22. -lib-y += memcpy.o
  23. lib-y += memset.o
  24. lib-y += memmove.o
  25. ifeq ($(CONFIG_KASAN_GENERIC)$(CONFIG_KASAN_SW_TAGS),)
  26. @@ -16,6 +15,8 @@ lib-$(CONFIG_MMU) += uaccess.o
  27. lib-$(CONFIG_64BIT) += tishift.o
  28. lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
  29. lib-$(CONFIG_RISCV_ISA_ZBC) += crc32.o
  30. +lib-y += string.o
  31. +lib-y += memcpy_aligned.o
  32. obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
  33. lib-$(CONFIG_RISCV_ISA_V) += xor.o
  34. --- a/arch/riscv/lib/memcpy.S
  35. +++ /dev/null
  36. @@ -1,110 +0,0 @@
  37. -/* SPDX-License-Identifier: GPL-2.0-only */
  38. -/*
  39. - * Copyright (C) 2013 Regents of the University of California
  40. - */
  41. -
  42. -#include <linux/linkage.h>
  43. -#include <asm/asm.h>
  44. -
  45. -/* void *memcpy(void *, const void *, size_t) */
  46. -SYM_FUNC_START(__memcpy)
  47. - move t6, a0 /* Preserve return value */
  48. -
  49. - /* Defer to byte-oriented copy for small sizes */
  50. - sltiu a3, a2, 128
  51. - bnez a3, 4f
  52. - /* Use word-oriented copy only if low-order bits match */
  53. - andi a3, t6, SZREG-1
  54. - andi a4, a1, SZREG-1
  55. - bne a3, a4, 4f
  56. -
  57. - beqz a3, 2f /* Skip if already aligned */
  58. - /*
  59. - * Round to nearest double word-aligned address
  60. - * greater than or equal to start address
  61. - */
  62. - andi a3, a1, ~(SZREG-1)
  63. - addi a3, a3, SZREG
  64. - /* Handle initial misalignment */
  65. - sub a4, a3, a1
  66. -1:
  67. - lb a5, 0(a1)
  68. - addi a1, a1, 1
  69. - sb a5, 0(t6)
  70. - addi t6, t6, 1
  71. - bltu a1, a3, 1b
  72. - sub a2, a2, a4 /* Update count */
  73. -
  74. -2:
  75. - andi a4, a2, ~((16*SZREG)-1)
  76. - beqz a4, 4f
  77. - add a3, a1, a4
  78. -3:
  79. - REG_L a4, 0(a1)
  80. - REG_L a5, SZREG(a1)
  81. - REG_L a6, 2*SZREG(a1)
  82. - REG_L a7, 3*SZREG(a1)
  83. - REG_L t0, 4*SZREG(a1)
  84. - REG_L t1, 5*SZREG(a1)
  85. - REG_L t2, 6*SZREG(a1)
  86. - REG_L t3, 7*SZREG(a1)
  87. - REG_L t4, 8*SZREG(a1)
  88. - REG_L t5, 9*SZREG(a1)
  89. - REG_S a4, 0(t6)
  90. - REG_S a5, SZREG(t6)
  91. - REG_S a6, 2*SZREG(t6)
  92. - REG_S a7, 3*SZREG(t6)
  93. - REG_S t0, 4*SZREG(t6)
  94. - REG_S t1, 5*SZREG(t6)
  95. - REG_S t2, 6*SZREG(t6)
  96. - REG_S t3, 7*SZREG(t6)
  97. - REG_S t4, 8*SZREG(t6)
  98. - REG_S t5, 9*SZREG(t6)
  99. - REG_L a4, 10*SZREG(a1)
  100. - REG_L a5, 11*SZREG(a1)
  101. - REG_L a6, 12*SZREG(a1)
  102. - REG_L a7, 13*SZREG(a1)
  103. - REG_L t0, 14*SZREG(a1)
  104. - REG_L t1, 15*SZREG(a1)
  105. - addi a1, a1, 16*SZREG
  106. - REG_S a4, 10*SZREG(t6)
  107. - REG_S a5, 11*SZREG(t6)
  108. - REG_S a6, 12*SZREG(t6)
  109. - REG_S a7, 13*SZREG(t6)
  110. - REG_S t0, 14*SZREG(t6)
  111. - REG_S t1, 15*SZREG(t6)
  112. - addi t6, t6, 16*SZREG
  113. - bltu a1, a3, 3b
  114. - andi a2, a2, (16*SZREG)-1 /* Update count */
  115. -
  116. -4:
  117. - /* Handle trailing misalignment */
  118. - beqz a2, 6f
  119. - add a3, a1, a2
  120. -
  121. - /* Use word-oriented copy if co-aligned to word boundary */
  122. - or a5, a1, t6
  123. - or a5, a5, a3
  124. - andi a5, a5, 3
  125. - bnez a5, 5f
  126. -7:
  127. - lw a4, 0(a1)
  128. - addi a1, a1, 4
  129. - sw a4, 0(t6)
  130. - addi t6, t6, 4
  131. - bltu a1, a3, 7b
  132. -
  133. - ret
  134. -
  135. -5:
  136. - lb a4, 0(a1)
  137. - addi a1, a1, 1
  138. - sb a4, 0(t6)
  139. - addi t6, t6, 1
  140. - bltu a1, a3, 5b
  141. -6:
  142. - ret
  143. -SYM_FUNC_END(__memcpy)
  144. -SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
  145. -SYM_FUNC_ALIAS(__pi_memcpy, __memcpy)
  146. -SYM_FUNC_ALIAS(__pi___memcpy, __memcpy)
  147. --- /dev/null
  148. +++ b/arch/riscv/lib/memcpy_aligned.S
  149. @@ -0,0 +1,85 @@
  150. +/* SPDX-License-Identifier: GPL-2.0-only */
  151. +/*
  152. + * Copyright (C) 2013 Regents of the University of California
  153. + */
  154. +
  155. +#include <linux/linkage.h>
  156. +#include <asm/asm.h>
  157. +
  158. +/* void *__memcpy_aligned(void *, const void *, size_t) */
  159. +SYM_FUNC_START(__memcpy_aligned)
  160. + move t6, a0 /* Preserve return value */
  161. +
  162. +2:
  163. + andi a4, a2, ~((16*SZREG)-1)
  164. + beqz a4, 4f
  165. + add a3, a1, a4
  166. +3:
  167. + REG_L a4, 0(a1)
  168. + REG_L a5, SZREG(a1)
  169. + REG_L a6, 2*SZREG(a1)
  170. + REG_L a7, 3*SZREG(a1)
  171. + REG_L t0, 4*SZREG(a1)
  172. + REG_L t1, 5*SZREG(a1)
  173. + REG_L t2, 6*SZREG(a1)
  174. + REG_L t3, 7*SZREG(a1)
  175. + REG_L t4, 8*SZREG(a1)
  176. + REG_L t5, 9*SZREG(a1)
  177. + REG_S a4, 0(t6)
  178. + REG_S a5, SZREG(t6)
  179. + REG_S a6, 2*SZREG(t6)
  180. + REG_S a7, 3*SZREG(t6)
  181. + REG_S t0, 4*SZREG(t6)
  182. + REG_S t1, 5*SZREG(t6)
  183. + REG_S t2, 6*SZREG(t6)
  184. + REG_S t3, 7*SZREG(t6)
  185. + REG_S t4, 8*SZREG(t6)
  186. + REG_S t5, 9*SZREG(t6)
  187. + REG_L a4, 10*SZREG(a1)
  188. + REG_L a5, 11*SZREG(a1)
  189. + REG_L a6, 12*SZREG(a1)
  190. + REG_L a7, 13*SZREG(a1)
  191. + REG_L t0, 14*SZREG(a1)
  192. + REG_L t1, 15*SZREG(a1)
  193. + addi a1, a1, 16*SZREG
  194. + REG_S a4, 10*SZREG(t6)
  195. + REG_S a5, 11*SZREG(t6)
  196. + REG_S a6, 12*SZREG(t6)
  197. + REG_S a7, 13*SZREG(t6)
  198. + REG_S t0, 14*SZREG(t6)
  199. + REG_S t1, 15*SZREG(t6)
  200. + addi t6, t6, 16*SZREG
  201. + bltu a1, a3, 3b
  202. + andi a2, a2, (16*SZREG)-1 /* Update count */
  203. +
  204. +4:
  205. + /* Handle trailing misalignment */
  206. + beqz a2, 6f
  207. + add a3, a1, a2
  208. +
  209. + /* Use word-oriented copy if co-aligned to word boundary */
  210. + or a5, a1, t6
  211. + or a5, a5, a3
  212. + andi a5, a5, 3
  213. + bnez a5, 5f
  214. +7:
  215. + lw a4, 0(a1)
  216. + addi a1, a1, 4
  217. + sw a4, 0(t6)
  218. + addi t6, t6, 4
  219. + bltu a1, a3, 7b
  220. +
  221. + ret
  222. +
  223. +5:
  224. + lb a4, 0(a1)
  225. + addi a1, a1, 1
  226. + sb a4, 0(t6)
  227. + addi t6, t6, 1
  228. + bltu a1, a3, 5b
  229. +6:
  230. + ret
  231. +SYM_FUNC_END(__memcpy_aligned)
  232. +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy_aligned)
  233. +SYM_FUNC_ALIAS(__pi_memcpy, __memcpy_aligned)
  234. +SYM_FUNC_ALIAS(__pi___memcpy, __memcpy_aligned)
  235. --- /dev/null
  236. +++ b/arch/riscv/lib/string.c
  237. @@ -0,0 +1,266 @@
  238. +// SPDX-License-Identifier: GPL-2.0-only
  239. +/*
  240. + * Copy memory to memory until the specified number of bytes
  241. + * has been copied. Overlap is NOT handled correctly.
  242. + * Copyright (C) 1991-2020 Free Software Foundation, Inc.
  243. + * This file is part of the GNU C Library.
  244. + * Contributed by Torbjorn Granlund ([email protected]).
  245. + *
  246. + * The GNU C Library is free software; you can redistribute it and/or
  247. + * modify it under the terms of the GNU Lesser General Public
  248. + * License as published by the Free Software Foundation; either
  249. + * version 2.1 of the License, or (at your option) any later version.
  250. + *
  251. + * The GNU C Library is distributed in the hope that it will be useful,
  252. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  253. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  254. + * Lesser General Public License for more details.
  255. + *
  256. + * You should have received a copy of the GNU Lesser General Public
  257. + * License along with the GNU C Library; if not, see
  258. + * <https://www.gnu.org/licenses/>.
  259. + *
  260. + */
  261. +
  262. +#define __NO_FORTIFY
  263. +#include <linux/types.h>
  264. +#include <linux/module.h>
  265. +
  266. +#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2)))
  267. +#define OP_T_THRES 16
  268. +#define op_t unsigned long
  269. +#define OPSIZ (sizeof(op_t))
  270. +#define OPSIZ_MASK (sizeof(op_t) - 1)
  271. +#define FAST_COPY_THRES (128)
  272. +#define byte unsigned char
  273. +
  274. +static void _wordcopy_fwd_aligned(long dstp, long srcp, size_t len)
  275. +{
  276. + op_t a0, a1;
  277. +
  278. + switch (len % 8) {
  279. + case 2:
  280. + a0 = ((op_t *) srcp)[0];
  281. + srcp -= 6 * OPSIZ;
  282. + dstp -= 7 * OPSIZ;
  283. + len += 6;
  284. + goto do1;
  285. + case 3:
  286. + a1 = ((op_t *) srcp)[0];
  287. + srcp -= 5 * OPSIZ;
  288. + dstp -= 6 * OPSIZ;
  289. + len += 5;
  290. + goto do2;
  291. + case 4:
  292. + a0 = ((op_t *) srcp)[0];
  293. + srcp -= 4 * OPSIZ;
  294. + dstp -= 5 * OPSIZ;
  295. + len += 4;
  296. + goto do3;
  297. + case 5:
  298. + a1 = ((op_t *) srcp)[0];
  299. + srcp -= 3 * OPSIZ;
  300. + dstp -= 4 * OPSIZ;
  301. + len += 3;
  302. + goto do4;
  303. + case 6:
  304. + a0 = ((op_t *) srcp)[0];
  305. + srcp -= 2 * OPSIZ;
  306. + dstp -= 3 * OPSIZ;
  307. + len += 2;
  308. + goto do5;
  309. + case 7:
  310. + a1 = ((op_t *) srcp)[0];
  311. + srcp -= 1 * OPSIZ;
  312. + dstp -= 2 * OPSIZ;
  313. + len += 1;
  314. + goto do6;
  315. +
  316. + case 0:
  317. + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
  318. + return;
  319. + a0 = ((op_t *) srcp)[0];
  320. + srcp -= 0 * OPSIZ;
  321. + dstp -= 1 * OPSIZ;
  322. + goto do7;
  323. + case 1:
  324. + a1 = ((op_t *) srcp)[0];
  325. + srcp -= -1 * OPSIZ;
  326. + dstp -= 0 * OPSIZ;
  327. + len -= 1;
  328. + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
  329. + goto do0;
  330. + goto do8; /* No-op. */
  331. + }
  332. +
  333. + do {
  334. +do8:
  335. + a0 = ((op_t *) srcp)[0];
  336. + ((op_t *) dstp)[0] = a1;
  337. +do7:
  338. + a1 = ((op_t *) srcp)[1];
  339. + ((op_t *) dstp)[1] = a0;
  340. +do6:
  341. + a0 = ((op_t *) srcp)[2];
  342. + ((op_t *) dstp)[2] = a1;
  343. +do5:
  344. + a1 = ((op_t *) srcp)[3];
  345. + ((op_t *) dstp)[3] = a0;
  346. +do4:
  347. + a0 = ((op_t *) srcp)[4];
  348. + ((op_t *) dstp)[4] = a1;
  349. +do3:
  350. + a1 = ((op_t *) srcp)[5];
  351. + ((op_t *) dstp)[5] = a0;
  352. +do2:
  353. + a0 = ((op_t *) srcp)[6];
  354. + ((op_t *) dstp)[6] = a1;
  355. +do1:
  356. + a1 = ((op_t *) srcp)[7];
  357. + ((op_t *) dstp)[7] = a0;
  358. +
  359. + srcp += 8 * OPSIZ;
  360. + dstp += 8 * OPSIZ;
  361. + len -= 8;
  362. + } while (len != 0);
  363. +
  364. + /* This is the right position for do0. Please don't move
  365. + * it into the loop.
  366. + */
  367. +do0:
  368. + ((op_t *) dstp)[0] = a1;
  369. +}
  370. +
  371. +static void _wordcopy_fwd_dest_aligned(long dstp, long srcp, size_t len)
  372. +{
  373. + op_t a0, a1, a2, a3;
  374. + int sh_1, sh_2;
  375. +
  376. + /* Calculate how to shift a word read at the memory operation
  377. + * aligned srcp to make it aligned for copy.
  378. + */
  379. +
  380. + sh_1 = 8 * (srcp % OPSIZ);
  381. + sh_2 = 8 * OPSIZ - sh_1;
  382. +
  383. + /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
  384. + * it points in the middle of.
  385. + */
  386. + srcp &= -OPSIZ;
  387. +
  388. + switch (len % 4) {
  389. + case 2:
  390. + a1 = ((op_t *) srcp)[0];
  391. + a2 = ((op_t *) srcp)[1];
  392. + srcp -= 1 * OPSIZ;
  393. + dstp -= 3 * OPSIZ;
  394. + len += 2;
  395. + goto do1;
  396. + case 3:
  397. + a0 = ((op_t *) srcp)[0];
  398. + a1 = ((op_t *) srcp)[1];
  399. + srcp -= 0 * OPSIZ;
  400. + dstp -= 2 * OPSIZ;
  401. + len += 1;
  402. + goto do2;
  403. + case 0:
  404. + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
  405. + return;
  406. + a3 = ((op_t *) srcp)[0];
  407. + a0 = ((op_t *) srcp)[1];
  408. + srcp -= -1 * OPSIZ;
  409. + dstp -= 1 * OPSIZ;
  410. + len += 0;
  411. + goto do3;
  412. + case 1:
  413. + a2 = ((op_t *) srcp)[0];
  414. + a3 = ((op_t *) srcp)[1];
  415. + srcp -= -2 * OPSIZ;
  416. + dstp -= 0 * OPSIZ;
  417. + len -= 1;
  418. + if (OP_T_THRES <= 3 * OPSIZ && len == 0)
  419. + goto do0;
  420. + goto do4; /* No-op. */
  421. + }
  422. +
  423. + do {
  424. +do4:
  425. + a0 = ((op_t *) srcp)[0];
  426. + ((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
  427. +do3:
  428. + a1 = ((op_t *) srcp)[1];
  429. + ((op_t *) dstp)[1] = MERGE(a3, sh_1, a0, sh_2);
  430. +do2:
  431. + a2 = ((op_t *) srcp)[2];
  432. + ((op_t *) dstp)[2] = MERGE(a0, sh_1, a1, sh_2);
  433. +do1:
  434. + a3 = ((op_t *) srcp)[3];
  435. + ((op_t *) dstp)[3] = MERGE(a1, sh_1, a2, sh_2);
  436. +
  437. + srcp += 4 * OPSIZ;
  438. + dstp += 4 * OPSIZ;
  439. + len -= 4;
  440. + } while (len != 0);
  441. +
  442. + /* This is the right position for do0. Please don't move
  443. + * it into the loop.
  444. + */
  445. +do0:
  446. + ((op_t *) dstp)[0] = MERGE(a2, sh_1, a3, sh_2);
  447. +}
  448. +
  449. +#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \
  450. +do { \
  451. + size_t __nbytes = (nbytes); \
  452. + while (__nbytes > 0) { \
  453. + byte __x = ((byte *) src_bp)[0]; \
  454. + src_bp += 1; \
  455. + __nbytes -= 1; \
  456. + ((byte *) dst_bp)[0] = __x; \
  457. + dst_bp += 1; \
  458. + } \
  459. +} while (0)
  460. +
  461. +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \
  462. +do { \
  463. + if (src_bp % OPSIZ == 0) \
  464. + _wordcopy_fwd_aligned(dst_bp, src_bp, (nbytes) / OPSIZ); \
  465. + else \
  466. + _wordcopy_fwd_dest_aligned(dst_bp, src_bp, (nbytes) / OPSIZ); \
  467. + src_bp += (nbytes) & -OPSIZ; \
  468. + dst_bp += (nbytes) & -OPSIZ; \
  469. + (nbytes_left) = (nbytes) % OPSIZ; \
  470. +} while (0)
  471. +
  472. +extern void *__memcpy_aligned(void *dest, const void *src, size_t len);
  473. +void *__memcpy(void *dest, const void *src, size_t len)
  474. +{
  475. + unsigned long dstp = (long) dest;
  476. + unsigned long srcp = (long) src;
  477. +
  478. + /* If there not too few bytes to copy, use word copy. */
  479. + if (len >= OP_T_THRES) {
  480. + if ((len >= FAST_COPY_THRES) && ((dstp & OPSIZ_MASK) == 0) &&
  481. + ((srcp & OPSIZ_MASK) == 0)) {
  482. + __memcpy_aligned(dest, src, len);
  483. + return dest;
  484. + }
  485. + /* Copy just a few bytes to make DSTP aligned. */
  486. + len -= (-dstp) % OPSIZ;
  487. + BYTE_COPY_FWD(dstp, srcp, (-dstp) % OPSIZ);
  488. +
  489. + /* Copy from SRCP to DSTP taking advantage of the known alignment of
  490. + * DSTP. Number of bytes remaining is put in the third argument,
  491. + * i.e. in LEN. This number may vary from machine to machine.
  492. + */
  493. + WORD_COPY_FWD(dstp, srcp, len, len);
  494. + /* Fall out and copy the tail. */
  495. + }
  496. +
  497. + /* There are just a few bytes to copy. Use byte memory operations. */
  498. + BYTE_COPY_FWD(dstp, srcp, len);
  499. +
  500. + return dest;
  501. +}
  502. +
  503. +void *memcpy(void *dest, const void *src, size_t len) __weak __alias(__memcpy);