
generic: copy backport, hack, pending patch and config from 5.15 to 6.1

Copy backport, hack, pending patch and config from 5.15 to 6.1.

Signed-off-by: Christian Marangi <[email protected]>
Christian Marangi, 3 years ago
parent
commit fa79baf4a6
100 files changed, 25539 additions and 0 deletions. For each file below, the two numbers are the lines added and the lines removed:
  1. 73 0
      target/linux/generic/backport-6.1/005-v5.17-01-Kbuild-use-Wdeclaration-after-statement.patch
  2. 60 0
      target/linux/generic/backport-6.1/005-v5.17-02-Kbuild-move-to-std-gnu11.patch
  3. 43 0
      target/linux/generic/backport-6.1/005-v5.17-03-Kbuild-use-std-gnu11-for-KBUILD_USERCFLAGS.patch
  4. 425 0
      target/linux/generic/backport-6.1/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch
  5. 153 0
      target/linux/generic/backport-6.1/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch
  6. 275 0
      target/linux/generic/backport-6.1/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch
  7. 82 0
      target/linux/generic/backport-6.1/020-v6.1-04-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch
  8. 807 0
      target/linux/generic/backport-6.1/020-v6.1-05-mm-multi-gen-LRU-groundwork.patch
  9. 1447 0
      target/linux/generic/backport-6.1/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch
  10. 491 0
      target/linux/generic/backport-6.1/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch
  11. 1687 0
      target/linux/generic/backport-6.1/020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch
  12. 315 0
      target/linux/generic/backport-6.1/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
  13. 498 0
      target/linux/generic/backport-6.1/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch
  14. 226 0
      target/linux/generic/backport-6.1/020-v6.1-11-mm-multi-gen-LRU-thrashing-prevention.patch
  15. 579 0
      target/linux/generic/backport-6.1/020-v6.1-12-mm-multi-gen-LRU-debugfs-interface.patch
  16. 32 0
      target/linux/generic/backport-6.1/020-v6.1-13-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch
  17. 124 0
      target/linux/generic/backport-6.1/020-v6.1-14-mm-multi-gen-LRU-retry-pages-written-back-while-isol.patch
  18. 49 0
      target/linux/generic/backport-6.1/020-v6.1-15-mm-multi-gen-LRU-move-lru_gen_add_mm-out-of-IRQ-off-.patch
  19. 96 0
      target/linux/generic/backport-6.1/020-v6.1-17-mm-add-dummy-pmd_young-for-architectures-not-having-.patch
  20. 113 0
      target/linux/generic/backport-6.1/020-v6.1-18-mm-introduce-arch_has_hw_nonleaf_pmd_young.patch
  21. 56 0
      target/linux/generic/backport-6.1/020-v6.2-16-mm-multi-gen-LRU-fix-crash-during-cgroup-migration.patch
  22. 196 0
      target/linux/generic/backport-6.1/020-v6.3-19-mm-add-vma_has_recency.patch
  23. 125 0
      target/linux/generic/backport-6.1/020-v6.3-20-mm-support-POSIX_FADV_NOREUSE.patch
  24. 348 0
      target/linux/generic/backport-6.1/020-v6.3-21-mm-multi-gen-LRU-rename-lru_gen_struct-to-lru_gen_pa.patch
  25. 162 0
      target/linux/generic/backport-6.1/020-v6.3-22-mm-multi-gen-LRU-rename-lrugen-lists-to-lrugen-pages.patch
  26. 188 0
      target/linux/generic/backport-6.1/020-v6.3-23-mm-multi-gen-LRU-remove-eviction-fairness-safeguard.patch
  27. 287 0
      target/linux/generic/backport-6.1/020-v6.3-24-mm-multi-gen-LRU-remove-aging-fairness-safeguard.patch
  28. 161 0
      target/linux/generic/backport-6.1/020-v6.3-25-mm-multi-gen-LRU-shuffle-should_run_aging.patch
  29. 868 0
      target/linux/generic/backport-6.1/020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch
  30. 196 0
      target/linux/generic/backport-6.1/020-v6.3-27-mm-multi-gen-LRU-clarify-scan_control-flags.patch
  31. 34 0
      target/linux/generic/backport-6.1/020-v6.3-28-mm-multi-gen-LRU-simplify-arch_has_hw_pte_young-chec.patch
  32. 88 0
      target/linux/generic/backport-6.1/020-v6.3-29-mm-multi-gen-LRU-avoid-futile-retries.patch
  33. 65 0
      target/linux/generic/backport-6.1/050-v5.16-00-MIPS-uasm-Enable-muhu-opcode-for-MIPS-R6.patch
  34. 31 0
      target/linux/generic/backport-6.1/050-v5.16-01-mips-uasm-Add-workaround-for-Loongson-2F-nop-CPU-err.patch
  35. 3078 0
      target/linux/generic/backport-6.1/050-v5.16-02-mips-bpf-Add-eBPF-JIT-for-32-bit-MIPS.patch
  36. 1005 0
      target/linux/generic/backport-6.1/050-v5.16-03-mips-bpf-Add-new-eBPF-JIT-for-64-bit-MIPS.patch
  37. 120 0
      target/linux/generic/backport-6.1/050-v5.16-04-mips-bpf-Add-JIT-workarounds-for-CPU-errata.patch
  38. 61 0
      target/linux/generic/backport-6.1/050-v5.16-05-mips-bpf-Enable-eBPF-JITs.patch
  39. 387 0
      target/linux/generic/backport-6.1/050-v5.16-06-mips-bpf-Remove-old-BPF-JIT-implementations.patch
  40. 105 0
      target/linux/generic/backport-6.1/080-v5.17-clk-gate-Add-devm_clk_hw_register_gate.patch
  41. 52 0
      target/linux/generic/backport-6.1/081-v5.17-regmap-allow-to-define-reg_update_bits-for-no-bus.patch
  42. 37 0
      target/linux/generic/backport-6.1/100-v5.18-tty-serial-bcm63xx-use-more-precise-Kconfig-symbol.patch
  43. 49 0
      target/linux/generic/backport-6.1/200-v5.18-tools-resolve_btfids-Build-with-host-flags.patch
  44. 997 0
      target/linux/generic/backport-6.1/201-v5.16-scripts-dtc-Update-to-upstream-version-v1.6.1-19-g0a.patch
  45. 48 0
      target/linux/generic/backport-6.1/300-v5.18-pinctrl-qcom-Return--EINVAL-for-setting-affinity-if-no-IRQ-parent.patch
  46. 166 0
      target/linux/generic/backport-6.1/301-v5.16-soc-qcom-smem-Support-reserved-memory-description.patch
  47. 33 0
      target/linux/generic/backport-6.1/302-v5.16-watchdog-bcm63xx_wdt-fix-fallthrough-warning.patch
  48. 162 0
      target/linux/generic/backport-6.1/330-v5.16-01-MIPS-kernel-proc-add-CPU-option-reporting.patch
  49. 62 0
      target/linux/generic/backport-6.1/330-v5.16-02-MIPS-Fix-using-smp_processor_id-in-preemptible-in-sh.patch
  50. 186 0
      target/linux/generic/backport-6.1/331-v5.19-mtd-spinand-Add-support-for-XTX-XT26G0xA.patch
  51. 219 0
      target/linux/generic/backport-6.1/344-v5.18-01-phy-marvell-phy-mvebu-a3700-comphy-Remove-port-from-.patch
  52. 1552 0
      target/linux/generic/backport-6.1/344-v5.18-02-phy-marvell-phy-mvebu-a3700-comphy-Add-native-kernel.patch
  53. 32 0
      target/linux/generic/backport-6.1/345-v5.17-arm64-dts-marvell-armada-37xx-Add-xtal-clock-to-comp.patch
  54. 64 0
      target/linux/generic/backport-6.1/346-v5.18-01-Revert-ata-ahci-mvebu-Make-SATA-PHY-optional-for-Arm.patch
  55. 166 0
      target/linux/generic/backport-6.1/346-v5.18-02-Revert-usb-host-xhci-mvebu-make-USB-3.0-PHY-optional.patch
  56. 39 0
      target/linux/generic/backport-6.1/346-v5.18-03-Revert-PCI-aardvark-Fix-initialization-with-old-Marv.patch
  57. 194 0
      target/linux/generic/backport-6.1/347-v6.0-phy-marvell-phy-mvebu-a3700-comphy-Remove-broken-res.patch
  58. 90 0
      target/linux/generic/backport-6.1/350-v5.18-regmap-add-configurable-downshift-for-addresses.patch
  59. 95 0
      target/linux/generic/backport-6.1/351-v5.18-regmap-allow-a-defined-reg_base-to-be-added-to-every.patch
  60. 57 0
      target/linux/generic/backport-6.1/352-v6.3-regmap-apply-reg_base-and-reg_downshift-for-single-r.patch
  61. 72 0
      target/linux/generic/backport-6.1/400-v5.19-mtd-call-of_platform_populate-for-MTD-partitions.patch
  62. 302 0
      target/linux/generic/backport-6.1/401-v6.0-mtd-parsers-add-support-for-Sercomm-partitions.patch
  63. 106 0
      target/linux/generic/backport-6.1/402-v6.0-mtd-next-mtd-core-introduce-of-support-for-dynamic-partitions.patch
  64. 72 0
      target/linux/generic/backport-6.1/403-v6.1-mtd-allow-getting-MTD-device-associated-with-a-speci.patch
  65. 30 0
      target/linux/generic/backport-6.1/404-v6.0-mtd-core-check-partition-before-dereference.patch
  66. 101 0
      target/linux/generic/backport-6.1/405-v6.1-mtd-core-add-missing-of_node_get-in-dynamic-partitio.patch
  67. 65 0
      target/linux/generic/backport-6.1/406-v6.2-0001-mtd-core-simplify-a-bit-code-find-partition-matching.patch
  68. 84 0
      target/linux/generic/backport-6.1/406-v6.2-0002-mtd-core-try-to-find-OF-node-for-every-MTD-partition.patch
  69. 32 0
      target/linux/generic/backport-6.1/407-v5.17-mtd-parsers-qcom-Don-t-print-error-message-on-EPROBE.patch
  70. 47 0
      target/linux/generic/backport-6.1/408-v6.2-mtd-core-set-ROOT_DEV-for-partitions-marked-as-rootf.patch
  71. 33 0
      target/linux/generic/backport-6.1/410-v5.18-mtd-parsers-trx-allow-to-use-on-MediaTek-MIPS-SoCs.patch
  72. 58 0
      target/linux/generic/backport-6.1/420-v5.19-02-mtd-spinand-gigadevice-add-support-for-GD5FxGQ4xExxG.patch
  73. 33 0
      target/linux/generic/backport-6.1/420-v5.19-03-mtd-spinand-gigadevice-add-support-for-GD5F1GQ5RExxG.patch
  74. 84 0
      target/linux/generic/backport-6.1/420-v5.19-04-mtd-spinand-gigadevice-add-support-for-GD5F-2-4-GQ5x.patch
  75. 91 0
      target/linux/generic/backport-6.1/420-v5.19-05-mtd-spinand-gigadevice-add-support-for-GD5FxGM7xExxG.patch
  76. 229 0
      target/linux/generic/backport-6.1/421-v6.2-mtd-parsers-add-TP-Link-SafeLoader-partitions-table-.patch
  77. 49 0
      target/linux/generic/backport-6.1/422-v5.19-mtd-spi-nor-support-eon-en25qh256a.patch
  78. 73 0
      target/linux/generic/backport-6.1/423-v6.1-0001-mtd-track-maximum-number-of-bitflips-for-each-read-r.patch
  79. 325 0
      target/linux/generic/backport-6.1/423-v6.1-0002-mtd-always-initialize-stats-in-struct-mtd_oob_ops.patch
  80. 172 0
      target/linux/generic/backport-6.1/423-v6.1-0003-mtd-add-ECC-error-accounting-for-each-read-request.patch
  81. 321 0
      target/linux/generic/backport-6.1/423-v6.1-0004-mtdchar-add-MEMREAD-ioctl.patch
  82. 35 0
      target/linux/generic/backport-6.1/423-v6.3-mtd-spinand-macronix-use-scratch-buffer-for-DMA-oper.patch
  83. 47 0
      target/linux/generic/backport-6.1/424-v6.4-0004-mtd-core-prepare-mtd_otp_nvmem_add-to-handle-EPROBE_.patch
  84. 165 0
      target/linux/generic/backport-6.1/600-v5.18-page_pool-Add-allocation-stats.patch
  85. 140 0
      target/linux/generic/backport-6.1/601-v5.18-page_pool-Add-recycle-stats.patch
  86. 77 0
      target/linux/generic/backport-6.1/602-v5.18-page_pool-Add-function-to-batch-and-return-stats.patch
  87. 55 0
      target/linux/generic/backport-6.1/603-v5.19-page_pool-Add-recycle-stats-to-page_pool_put_page_bu.patch
  88. 147 0
      target/linux/generic/backport-6.1/604-v5.19-net-page_pool-introduce-ethtool-stats.patch
  89. 99 0
      target/linux/generic/backport-6.1/605-v5.18-xdp-introduce-flags-field-in-xdp_buff-xdp_frame.patch
  90. 137 0
      target/linux/generic/backport-6.1/606-v5.18-xdp-add-frags-support-to-xdp_return_-buff-frame.patch
  91. 31 0
      target/linux/generic/backport-6.1/607-v5.18-net-skbuff-add-size-metadata-to-skb_shared_info-for-.patch
  92. 65 0
      target/linux/generic/backport-6.1/608-v5.18-net-veth-Account-total-xdp_frame-len-running-ndo_xdp.patch
  93. 40 0
      target/linux/generic/backport-6.1/609-v5.18-veth-Allow-jumbo-frames-in-xdp-mode.patch
  94. 56 0
      target/linux/generic/backport-6.1/610-v6.3-net-page_pool-use-in_softirq-instead.patch
  95. 41 0
      target/linux/generic/backport-6.1/611-v6.3-net-add-helper-eth_addr_add.patch
  96. 279 0
      target/linux/generic/backport-6.1/700-v5.17-net-dsa-introduce-tagger-owned-storage-for-private.patch
  97. 274 0
      target/linux/generic/backport-6.1/701-v5.17-dsa-make-tagging-protocols-connect-to-individual-switches.patch
  98. 327 0
      target/linux/generic/backport-6.1/702-v5.19-00-net-ethernet-mtk_eth_soc-add-support-for-coherent-DM.patch
  99. 30 0
      target/linux/generic/backport-6.1/702-v5.19-01-arm64-dts-mediatek-mt7622-add-support-for-coherent-D.patch
  100. 1679 0
      target/linux/generic/backport-6.1/702-v5.19-02-net-ethernet-mtk_eth_soc-add-support-for-Wireless-Et.patch

+ 73 - 0
target/linux/generic/backport-6.1/005-v5.17-01-Kbuild-use-Wdeclaration-after-statement.patch

@@ -0,0 +1,73 @@
+From 2fd7e7f9317d3048a14026816d081b08ba98ea8e Mon Sep 17 00:00:00 2001
+From: Mark Rutland <[email protected]>
+Date: Tue, 8 Mar 2022 22:56:13 +0100
+Subject: [PATCH 1/3] Kbuild: use -Wdeclaration-after-statement
+
+The kernel is moving from using `-std=gnu89` to `-std=gnu11`, permitting
+the use of additional C11 features such as for-loop initial declarations.
+
+One contentious aspect of C99 is that it permits mixed declarations and
+code, and for now at least, it seems preferable to enforce that
+declarations must come first.
+
+These warnings were already enabled in the kernel itself, but not
+for KBUILD_USERCFLAGS or the compat VDSO on arch/arm64, which uses
+a separate set of CFLAGS.
+
+This patch fixes an existing violation in modpost.c, which is not
+reported because of the missing flag in KBUILD_USERCFLAGS:
+
+| scripts/mod/modpost.c: In function ‘match’:
+| scripts/mod/modpost.c:837:3: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
+|   837 |   const char *endp = p + strlen(p) - 1;
+|       |   ^~~~~
+
+Signed-off-by: Mark Rutland <[email protected]>
+[arnd: don't add a duplicate flag to the default set, update changelog]
+Signed-off-by: Arnd Bergmann <[email protected]>
+Reviewed-by: Nathan Chancellor <[email protected]>
+Reviewed-by: Nick Desaulniers <[email protected]>
+Tested-by: Sedat Dilek <[email protected]> # LLVM/Clang v13.0.0 (x86-64)
+Signed-off-by: Masahiro Yamada <[email protected]>
+---
+ Makefile                          | 3 ++-
+ arch/arm64/kernel/vdso32/Makefile | 1 +
+ scripts/mod/modpost.c             | 4 +++-
+ 3 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/Makefile
++++ b/Makefile
+@@ -440,7 +440,8 @@ endif
+ HOSTPKG_CONFIG	= pkg-config
+ 
+ export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
+-			      -O2 -fomit-frame-pointer -std=gnu89
++			      -O2 -fomit-frame-pointer -std=gnu89 \
++			      -Wdeclaration-after-statement
+ export KBUILD_USERLDFLAGS :=
+ 
+ KBUILD_HOSTCFLAGS   := $(KBUILD_USERCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
+--- a/arch/arm64/kernel/vdso32/Makefile
++++ b/arch/arm64/kernel/vdso32/Makefile
+@@ -76,6 +76,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-pr
+                -fno-strict-aliasing -fno-common \
+                -Werror-implicit-function-declaration \
+                -Wno-format-security \
++               -Wdeclaration-after-statement \
+                -std=gnu89
+ VDSO_CFLAGS  += -O2
+ # Some useful compiler-dependent flags from top-level Makefile
+--- a/scripts/mod/modpost.c
++++ b/scripts/mod/modpost.c
+@@ -833,8 +833,10 @@ static int match(const char *sym, const
+ {
+ 	const char *p;
+ 	while (*pat) {
++		const char *endp;
++
+ 		p = *pat++;
+-		const char *endp = p + strlen(p) - 1;
++		endp = p + strlen(p) - 1;
+ 
+ 		/* "*foo*" */
+ 		if (*p == '*' && *endp == '*') {

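As a stand-alone illustration of what the new -Wdeclaration-after-statement flag catches (a sketch modeled on the modpost.c hunk above, not code from the patch), the first function below triggers the warning and the second, with the declaration hoisted, does not:

    /* Build with: gcc -c -Wdeclaration-after-statement example.c
     * The first function mixes a declaration into the statement flow and
     * warns ("ISO C90 forbids mixed declarations and code"); the second
     * follows the corrected style with declarations first. */
    #include <string.h>

    int ends_with_star_warns(const char *p)
    {
            while (*p == ' ')
                    p++;
            const char *endp = p + strlen(p) - 1;  /* declaration after a statement */
            return *endp == '*';
    }

    int ends_with_star_clean(const char *p)
    {
            const char *endp;                      /* declared before any statement */

            while (*p == ' ')
                    p++;
            endp = p + strlen(p) - 1;
            return *endp == '*';
    }
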
+ 60 - 0
target/linux/generic/backport-6.1/005-v5.17-02-Kbuild-move-to-std-gnu11.patch

@@ -0,0 +1,60 @@
+From b810c8e719ea082e47c7a8f7cf878bc84fa2455d Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <[email protected]>
+Date: Tue, 8 Mar 2022 22:56:14 +0100
+Subject: [PATCH 2/3] Kbuild: move to -std=gnu11
+
+During a patch discussion, Linus brought up the option of changing
+the C standard version from gnu89 to gnu99, which allows using variable
+declaration inside of a for() loop. While the C99, C11 and later standards
+introduce many other features, most of these are already available in
+gnu89 as GNU extensions as well.
+
+An earlier attempt to do this when gcc-5 started defaulting to
+-std=gnu11 failed because at the time that caused warnings about
+designated initializers with older compilers. Now that gcc-5.1 is
+the minimum compiler version used for building kernels, that is no
+longer a concern. Similarly, the behavior of 'inline' functions changes
+between gnu89 using gnu_inline behavior and gnu11 using standard c99+
+behavior, but this was taken care of by defining 'inline' to include
+__attribute__((gnu_inline)) in order to allow building with clang a
+while ago.
+
+Nathan Chancellor reported a new -Wdeclaration-after-statement
+warning that appears in a system header on arm, this still needs a
+workaround.
+
+The differences between gnu99, gnu11, gnu1x and gnu17 are fairly
+minimal and mainly impact warnings at the -Wpedantic level that the
+kernel never enables. Between these, gnu11 is the newest version
+that is supported by all supported compiler versions, though it is
+only the default on gcc-5, while all other supported versions of
+gcc or clang default to gnu1x/gnu17.
+
+Link: https://lore.kernel.org/lkml/CAHk-=wiyCH7xeHcmiFJ-YgXUy2Jaj7pnkdKpcovt8fYbVFW3TA@mail.gmail.com/
+Link: https://github.com/ClangBuiltLinux/linux/issues/1603
+Suggested-by: Linus Torvalds <[email protected]>
+Acked-by: Marco Elver <[email protected]>
+Acked-by: Jani Nikula <[email protected]>
+Acked-by: David Sterba <[email protected]>
+Tested-by: Sedat Dilek <[email protected]>
+Reviewed-by: Alex Shi <[email protected]>
+Reviewed-by: Nick Desaulniers <[email protected]>
+Reviewed-by: Miguel Ojeda <[email protected]>
+Signed-off-by: Arnd Bergmann <[email protected]>
+Reviewed-by: Nathan Chancellor <[email protected]>
+Signed-off-by: Masahiro Yamada <[email protected]>
+---
+ Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/Makefile
++++ b/Makefile
+@@ -524,7 +524,7 @@ KBUILD_CFLAGS   := -Wall -Wundef -Werror
+ 		   -fno-strict-aliasing -fno-common -fshort-wchar -fno-PIE \
+ 		   -Werror=implicit-function-declaration -Werror=implicit-int \
+ 		   -Werror=return-type -Wno-format-security \
+-		   -std=gnu89
++		   -std=gnu11
+ KBUILD_CPPFLAGS := -D__KERNEL__
+ KBUILD_AFLAGS_KERNEL :=
+ KBUILD_CFLAGS_KERNEL :=

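A small stand-alone sketch, not part of the patch, of the main practical difference the changelog mentions: -std=gnu89 rejects the loop-scoped declaration below, while -std=gnu11 accepts it.

    /* gcc -std=gnu89 -c loop.c fails ("'for' loop initial declarations are
     * only allowed in C99 or C11 mode"); gcc -std=gnu11 -c loop.c builds. */
    int sum_first_n(const int *v, int n)
    {
            int sum = 0;

            for (int i = 0; i < n; i++)    /* loop-scoped declaration, new in C99/C11 */
                    sum += v[i];

            return sum;
    }
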
+ 43 - 0
target/linux/generic/backport-6.1/005-v5.17-03-Kbuild-use-std-gnu11-for-KBUILD_USERCFLAGS.patch

@@ -0,0 +1,43 @@
+From 40337d6f3d677aee7ad3052ae662d3f53dd4d5cb Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <[email protected]>
+Date: Tue, 8 Mar 2022 22:56:15 +0100
+Subject: [PATCH 3/3] Kbuild: use -std=gnu11 for KBUILD_USERCFLAGS
+
+As we change the C language standard for the kernel from gnu89 to
+gnu11, it makes sense to also update the version for user space
+compilation.
+
+Some users have older native compilers than what they use for
+kernel builds, so I considered using gnu99 as the default version
+for wider compatibility with gcc-4.6 and earlier.
+
+However, testing with older compilers showed that we already require
+HOSTCC version 5.1 as well because a lot of host tools include
+linux/compiler.h that uses __has_attribute():
+
+  CC      tools/objtool/exec-cmd.o
+In file included from tools/include/linux/compiler_types.h:36:0,
+                 from tools/include/linux/compiler.h:5,
+                 from exec-cmd.c:2:
+tools/include/linux/compiler-gcc.h:19:5: error: "__has_attribute" is not defined [-Werror=undef]
+
+Signed-off-by: Arnd Bergmann <[email protected]>
+Reviewed-by: Nathan Chancellor <[email protected]>
+Reviewed-by: Nick Desaulniers <[email protected]>
+Tested-by: Sedat Dilek <[email protected]>
+Signed-off-by: Masahiro Yamada <[email protected]>
+---
+ Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/Makefile
++++ b/Makefile
+@@ -440,7 +440,7 @@ endif
+ HOSTPKG_CONFIG	= pkg-config
+ 
+ export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
+-			      -O2 -fomit-frame-pointer -std=gnu89 \
++			      -O2 -fomit-frame-pointer -std=gnu11 \
+ 			      -Wdeclaration-after-statement
+ export KBUILD_USERLDFLAGS :=
+ 

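The changelog's point about HOSTCC 5.1 can be seen with a stand-alone sketch like the one below (EXAMPLE_NORETURN is a hypothetical name, not a kernel macro): a host compiler without the __has_attribute() built-in cannot evaluate the #if line, which is the failure the quoted -Werror=undef error reports.

    /* On a host compiler that lacks the __has_attribute() built-in
     * (gcc < 5) this fails to preprocess; the changelog above quotes the
     * -Werror=undef form of that failure. gcc 5.1+ and clang accept it. */
    #if __has_attribute(__noreturn__)
    # define EXAMPLE_NORETURN __attribute__((__noreturn__))
    #else
    # define EXAMPLE_NORETURN
    #endif

    EXAMPLE_NORETURN void fatal_error(void);

    int main(void)
    {
            return 0;
    }
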
+ 425 - 0
target/linux/generic/backport-6.1/020-v6.1-01-mm-x86-arm64-add-arch_has_hw_pte_young.patch

@@ -0,0 +1,425 @@
+From a4103262b01a1b8704b37c01c7c813df91b7b119 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 01:59:58 -0600
+Subject: [PATCH 01/29] mm: x86, arm64: add arch_has_hw_pte_young()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Patch series "Multi-Gen LRU Framework", v14.
+
+What's new
+==========
+1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS,
+   Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15.
+2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs)
+   machines. The old direct reclaim backoff, which tries to enforce a
+   minimum fairness among all eligible memcgs, over-swapped by about
+   (total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which
+   pulls the plug on swapping once the target is met, trades some
+   fairness for curtailed latency:
+   https://lore.kernel.org/r/[email protected]/
+3. Fixed minior build warnings and conflicts. More comments and nits.
+
+TLDR
+====
+The current page reclaim is too expensive in terms of CPU usage and it
+often makes poor choices about what to evict. This patchset offers an
+alternative solution that is performant, versatile and
+straightforward.
+
+Patchset overview
+=================
+The design and implementation overview is in patch 14:
+https://lore.kernel.org/r/[email protected]/
+
+01. mm: x86, arm64: add arch_has_hw_pte_young()
+02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+Take advantage of hardware features when trying to clear the accessed
+bit in many PTEs.
+
+03. mm/vmscan.c: refactor shrink_node()
+04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into
+    its sole caller"
+Minor refactors to improve readability for the following patches.
+
+05. mm: multi-gen LRU: groundwork
+Adds the basic data structure and the functions that insert pages to
+and remove pages from the multi-gen LRU (MGLRU) lists.
+
+06. mm: multi-gen LRU: minimal implementation
+A minimal implementation without optimizations.
+
+07. mm: multi-gen LRU: exploit locality in rmap
+Exploits spatial locality to improve efficiency when using the rmap.
+
+08. mm: multi-gen LRU: support page table walks
+Further exploits spatial locality by optionally scanning page tables.
+
+09. mm: multi-gen LRU: optimize multiple memcgs
+Optimizes the overall performance for multiple memcgs running mixed
+types of workloads.
+
+10. mm: multi-gen LRU: kill switch
+Adds a kill switch to enable or disable MGLRU at runtime.
+
+11. mm: multi-gen LRU: thrashing prevention
+12. mm: multi-gen LRU: debugfs interface
+Provide userspace with features like thrashing prevention, working set
+estimation and proactive reclaim.
+
+13. mm: multi-gen LRU: admin guide
+14. mm: multi-gen LRU: design doc
+Add an admin guide and a design doc.
+
+Benchmark results
+=================
+Independent lab results
+-----------------------
+Based on the popularity of searches [01] and the memory usage in
+Google's public cloud, the most popular open-source memory-hungry
+applications, in alphabetical order, are:
+      Apache Cassandra      Memcached
+      Apache Hadoop         MongoDB
+      Apache Spark          PostgreSQL
+      MariaDB (MySQL)       Redis
+
+An independent lab evaluated MGLRU with the most widely used benchmark
+suites for the above applications. They posted 960 data points along
+with kernel metrics and perf profiles collected over more than 500
+hours of total benchmark time. Their final reports show that, with 95%
+confidence intervals (CIs), the above applications all performed
+significantly better for at least part of their benchmark matrices.
+
+On 5.14:
+1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]%
+   less wall time to sort three billion random integers, respectively,
+   under the medium- and the high-concurrency conditions, when
+   overcommitting memory. There were no statistically significant
+   changes in wall time for the rest of the benchmark matrix.
+2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]%
+   more transactions per minute (TPM), respectively, under the medium-
+   and the high-concurrency conditions, when overcommitting memory.
+   There were no statistically significant changes in TPM for the rest
+   of the benchmark matrix.
+3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]%
+   and [21.59, 30.02]% more operations per second (OPS), respectively,
+   for sequential access, random access and Gaussian (distribution)
+   access, when THP=always; 95% CIs [13.85, 15.97]% and
+   [23.94, 29.92]% more OPS, respectively, for random access and
+   Gaussian access, when THP=never. There were no statistically
+   significant changes in OPS for the rest of the benchmark matrix.
+4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and
+   [2.16, 3.55]% more operations per second (OPS), respectively, for
+   exponential (distribution) access, random access and Zipfian
+   (distribution) access, when underutilizing memory; 95% CIs
+   [8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS,
+   respectively, for exponential access, random access and Zipfian
+   access, when overcommitting memory.
+
+On 5.15:
+5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]%
+   and [4.11, 7.50]% more operations per second (OPS), respectively,
+   for exponential (distribution) access, random access and Zipfian
+   (distribution) access, when swap was off; 95% CIs [0.50, 2.60]%,
+   [6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for
+   exponential access, random access and Zipfian access, when swap was
+   on.
+6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]%
+   less average wall time to finish twelve parallel TeraSort jobs,
+   respectively, under the medium- and the high-concurrency
+   conditions, when swap was on. There were no statistically
+   significant changes in average wall time for the rest of the
+   benchmark matrix.
+7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per
+   minute (TPM) under the high-concurrency condition, when swap was
+   off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM,
+   respectively, under the medium- and the high-concurrency
+   conditions, when swap was on. There were no statistically
+   significant changes in TPM for the rest of the benchmark matrix.
+8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and
+   [11.47, 19.36]% more total operations per second (OPS),
+   respectively, for sequential access, random access and Gaussian
+   (distribution) access, when THP=always; 95% CIs [1.27, 3.54]%,
+   [10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively,
+   for sequential access, random access and Gaussian access, when
+   THP=never.
+
+Our lab results
+---------------
+To supplement the above results, we ran the following benchmark suites
+on 5.16-rc7 and found no regressions [10].
+      fs_fio_bench_hdd_mq      pft
+      fs_lmbench               pgsql-hammerdb
+      fs_parallelio            redis
+      fs_postmark              stream
+      hackbench                sysbenchthread
+      kernbench                tpcc_spark
+      memcached                unixbench
+      multichase               vm-scalability
+      mutilate                 will-it-scale
+      nginx
+
+[01] https://trends.google.com
+[02] https://lore.kernel.org/r/[email protected]/
+[03] https://lore.kernel.org/r/[email protected]/
+[04] https://lore.kernel.org/r/[email protected]/
+[05] https://lore.kernel.org/r/[email protected]/
+[06] https://lore.kernel.org/r/[email protected]/
+[07] https://lore.kernel.org/r/[email protected]/
+[08] https://lore.kernel.org/r/[email protected]/
+[09] https://lore.kernel.org/r/[email protected]/
+[10] https://lore.kernel.org/r/[email protected]/
+
+Read-world applications
+=======================
+Third-party testimonials
+------------------------
+Konstantin reported [11]:
+   I have Archlinux with 8G RAM + zswap + swap. While developing, I
+   have lots of apps opened such as multiple LSP-servers for different
+   langs, chats, two browsers, etc... Usually, my system gets quickly
+   to a point of SWAP-storms, where I have to kill LSP-servers,
+   restart browsers to free memory, etc, otherwise the system lags
+   heavily and is barely usable.
+
+   1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU
+   patchset, and I started up by opening lots of apps to create memory
+   pressure, and worked for a day like this. Till now I had not a
+   single SWAP-storm, and mind you I got 3.4G in SWAP. I was never
+   getting to the point of 3G in SWAP before without a single
+   SWAP-storm.
+
+Vaibhav from IBM reported [12]:
+   In a synthetic MongoDB Benchmark, seeing an average of ~19%
+   throughput improvement on POWER10(Radix MMU + 64K Page Size) with
+   MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across
+   three different request distributions, namely, Exponential, Uniform
+   and Zipfan.
+
+Shuang from U of Rochester reported [13]:
+   With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]%
+   and [9.26, 10.36]% higher throughput, respectively, for random
+   access, Zipfian (distribution) access and Gaussian (distribution)
+   access, when the average number of jobs per CPU is 1; 95% CIs
+   [42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher
+   throughput, respectively, for random access, Zipfian access and
+   Gaussian access, when the average number of jobs per CPU is 2.
+
+Daniel from Michigan Tech reported [14]:
+   With Memcached allocating ~100GB of byte-addressable Optante,
+   performance improvement in terms of throughput (measured as queries
+   per second) was about 10% for a series of workloads.
+
+Large-scale deployments
+-----------------------
+We've rolled out MGLRU to tens of millions of ChromeOS users and
+about a million Android users. Google's fleetwide profiling [15] shows
+an overall 40% decrease in kswapd CPU usage, in addition to
+improvements in other UX metrics, e.g., an 85% decrease in the number
+of low-memory kills at the 75th percentile and an 18% decrease in
+app launch time at the 50th percentile.
+
+The downstream kernels that have been using MGLRU include:
+1. Android [16]
+2. Arch Linux Zen [17]
+3. Armbian [18]
+4. ChromeOS [19]
+5. Liquorix [20]
+6. OpenWrt [21]
+7. post-factum [22]
+8. XanMod [23]
+
+[11] https://lore.kernel.org/r/[email protected]/
+[12] https://lore.kernel.org/r/[email protected]/
+[13] https://lore.kernel.org/r/[email protected]/
+[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/
+[15] https://dl.acm.org/doi/10.1145/2749469.2750392
+[16] https://android.com
+[17] https://archlinux.org
+[18] https://armbian.com
+[19] https://chromium.org
+[20] https://liquorix.net
+[21] https://openwrt.org
+[22] https://codeberg.org/pf-kernel
+[23] https://xanmod.org
+
+Summary
+=======
+The facts are:
+1. The independent lab results and the real-world applications
+   indicate substantial improvements; there are no known regressions.
+2. Thrashing prevention, working set estimation and proactive reclaim
+   work out of the box; there are no equivalent solutions.
+3. There is a lot of new code; no smaller changes have been
+   demonstrated similar effects.
+
+Our options, accordingly, are:
+1. Given the amount of evidence, the reported improvements will likely
+   materialize for a wide range of workloads.
+2. Gauging the interest from the past discussions, the new features
+   will likely be put to use for both personal computers and data
+   centers.
+3. Based on Google's track record, the new code will likely be well
+   maintained in the long term. It'd be more difficult if not
+   impossible to achieve similar effects with other approaches.
+
+This patch (of 14):
+
+Some architectures automatically set the accessed bit in PTEs, e.g., x86
+and arm64 v8.2.  On architectures that do not have this capability,
+clearing the accessed bit in a PTE usually triggers a page fault following
+the TLB miss of this PTE (to emulate the accessed bit).
+
+Being aware of this capability can help make better decisions, e.g.,
+whether to spread the work out over a period of time to reduce bursty page
+faults when trying to clear the accessed bit in many PTEs.
+
+Note that theoretically this capability can be unreliable, e.g.,
+hotplugged CPUs might be different from builtin ones.  Therefore it should
+not be used in architecture-independent code that involves correctness,
+e.g., to determine whether TLB flushes are required (in combination with
+the accessed bit).
+
+Link: https://lkml.kernel.org/r/[email protected]
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Reviewed-by: Barry Song <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Acked-by: Will Deacon <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: [email protected]
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ arch/arm64/include/asm/pgtable.h | 14 ++------------
+ arch/x86/include/asm/pgtable.h   |  6 +++---
+ include/linux/pgtable.h          | 13 +++++++++++++
+ mm/memory.c                      | 14 +-------------
+ 4 files changed, 19 insertions(+), 28 deletions(-)
+
+--- a/arch/arm64/include/asm/pgtable.h
++++ b/arch/arm64/include/asm/pgtable.h
+@@ -999,23 +999,13 @@ static inline void update_mmu_cache(stru
+  * page after fork() + CoW for pfn mappings. We don't always have a
+  * hardware-managed access flag on arm64.
+  */
+-static inline bool arch_faults_on_old_pte(void)
+-{
+-	WARN_ON(preemptible());
+-
+-	return !cpu_has_hw_af();
+-}
+-#define arch_faults_on_old_pte		arch_faults_on_old_pte
++#define arch_has_hw_pte_young		cpu_has_hw_af
+ 
+ /*
+  * Experimentally, it's cheap to set the access flag in hardware and we
+  * benefit from prefaulting mappings as 'old' to start with.
+  */
+-static inline bool arch_wants_old_prefaulted_pte(void)
+-{
+-	return !arch_faults_on_old_pte();
+-}
+-#define arch_wants_old_prefaulted_pte	arch_wants_old_prefaulted_pte
++#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
+ 
+ #endif /* !__ASSEMBLY__ */
+ 
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
+ 	return boot_cpu_has_bug(X86_BUG_L1TF);
+ }
+ 
+-#define arch_faults_on_old_pte arch_faults_on_old_pte
+-static inline bool arch_faults_on_old_pte(void)
++#define arch_has_hw_pte_young arch_has_hw_pte_young
++static inline bool arch_has_hw_pte_young(void)
+ {
+-	return false;
++	return true;
+ }
+ 
+ #endif	/* __ASSEMBLY__ */
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
+ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ #endif
+ 
++#ifndef arch_has_hw_pte_young
++/*
++ * Return whether the accessed bit is supported on the local CPU.
++ *
++ * This stub assumes accessing through an old PTE triggers a page fault.
++ * Architectures that automatically set the access bit should overwrite it.
++ */
++static inline bool arch_has_hw_pte_young(void)
++{
++	return false;
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
+ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ 				       unsigned long address,
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
+ 					2;
+ #endif
+ 
+-#ifndef arch_faults_on_old_pte
+-static inline bool arch_faults_on_old_pte(void)
+-{
+-	/*
+-	 * Those arches which don't have hw access flag feature need to
+-	 * implement their own helper. By default, "true" means pagefault
+-	 * will be hit on old pte.
+-	 */
+-	return true;
+-}
+-#endif
+-
+ #ifndef arch_wants_old_prefaulted_pte
+ static inline bool arch_wants_old_prefaulted_pte(void)
+ {
+@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
+ 	 * On architectures with software "accessed" bits, we would
+ 	 * take a double page fault, so mark it accessed here.
+ 	 */
+-	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
++	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
+ 		pte_t entry;
+ 
+ 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);

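To make the intent concrete, the following is a stand-alone user-space sketch, with a stubbed helper and illustrative batch sizes rather than kernel code, of the kind of decision arch_has_hw_pte_young() enables: clear the accessed bit in large batches when hardware maintains it, and spread the work out otherwise to avoid bursty page faults, as the changelog describes.

    /* Stand-alone sketch; arch_has_hw_pte_young() is stubbed here and the
     * batch sizes are illustrative, not values used by the patchset. */
    #include <stdbool.h>
    #include <stddef.h>

    bool arch_has_hw_pte_young(void)
    {
            return false;            /* e.g. a CPU with no hardware-managed access flag */
    }

    /* How many PTEs to age (clear the accessed bit on) per pass. */
    size_t aging_batch(size_t nr_ptes)
    {
            if (arch_has_hw_pte_young())
                    return nr_ptes;          /* cheap: hardware re-sets the bit on access */
            return nr_ptes / 8 + 1;          /* costly: each cleared PTE means a later fault */
    }
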
+ 153 - 0
target/linux/generic/backport-6.1/020-v6.1-02-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch

@@ -0,0 +1,153 @@
+From 493de1c4b0f2cd909169401da8c445f6c8a7e29d Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 01:59:59 -0600
+Subject: [PATCH 02/29] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Some architectures support the accessed bit in non-leaf PMD entries, e.g.,
+x86 sets the accessed bit in a non-leaf PMD entry when using it as part of
+linear address translation [1].  Page table walkers that clear the
+accessed bit may use this capability to reduce their search space.
+
+Note that:
+1. Although an inline function is preferable, this capability is added
+   as a configuration option for consistency with the existing macros.
+2. Due to the little interest in other varieties, this capability was
+   only tested on Intel and AMD CPUs.
+
+Thanks to the following developers for their efforts [2][3].
+  Randy Dunlap <[email protected]>
+  Stephen Rothwell <[email protected]>
+
+[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
+     Volume 3 (June 2021), section 4.8
+[2] https://lore.kernel.org/r/[email protected]/
+[3] https://lore.kernel.org/r/[email protected]/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Reviewed-by: Barry Song <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ arch/Kconfig                   | 8 ++++++++
+ arch/x86/Kconfig               | 1 +
+ arch/x86/include/asm/pgtable.h | 3 ++-
+ arch/x86/mm/pgtable.c          | 5 ++++-
+ include/linux/pgtable.h        | 4 ++--
+ 5 files changed, 17 insertions(+), 4 deletions(-)
+
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -1295,6 +1295,14 @@ config ARCH_HAS_ELFCORE_COMPAT
+ config ARCH_HAS_PARANOID_L1D_FLUSH
+ 	bool
+ 
++config ARCH_HAS_NONLEAF_PMD_YOUNG
++	bool
++	help
++	  Architectures that select this option are capable of setting the
++	  accessed bit in non-leaf PMD entries when using them as part of linear
++	  address translations. Page table walkers that clear the accessed bit
++	  may use this capability to reduce their search space.
++
+ source "kernel/gcov/Kconfig"
+ 
+ source "scripts/gcc-plugins/Kconfig"
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -84,6 +84,7 @@ config X86
+ 	select ARCH_HAS_PMEM_API		if X86_64
+ 	select ARCH_HAS_PTE_DEVMAP		if X86_64
+ 	select ARCH_HAS_PTE_SPECIAL
++	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
+ 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
+ 	select ARCH_HAS_COPY_MC			if X86_64
+ 	select ARCH_HAS_SET_MEMORY
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad
+ 
+ static inline int pmd_bad(pmd_t pmd)
+ {
+-	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
++	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
++	       (_KERNPG_TABLE & ~_PAGE_ACCESSED);
+ }
+ 
+ static inline unsigned long pages_to_mb(unsigned long npg)
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
+ 	return ret;
+ }
+ 
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ 			      unsigned long addr, pmd_t *pmdp)
+ {
+@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
+ 
+ 	return ret;
+ }
++#endif
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ 			      unsigned long addr, pud_t *pudp)
+ {
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
+ #endif
+ 
+ #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ 					    unsigned long address,
+ 					    pmd_t *pmdp)
+@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
+ 	BUILD_BUG();
+ 	return 0;
+ }
+-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
+ #endif
+ 
+ #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

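The changelog's use case, page table walkers pruning their search space, can be sketched in stand-alone form as below; pmd_entry_t, PMD_ACCESSED, pmd_young() and clear_young() are simplified stand-ins for the real kernel types and helpers, not the patchset's implementation.

    /* Sketch only: descend into a page table only if its non-leaf PMD has
     * the accessed bit set, since the CPU sets that bit when translating
     * through the table (see the changelog above). */
    #include <stdbool.h>
    #include <stddef.h>

    typedef struct { unsigned long flags; } pmd_entry_t;
    #define PMD_ACCESSED 0x20UL              /* illustrative bit value */

    bool pmd_young(const pmd_entry_t *pmd)   { return pmd->flags & PMD_ACCESSED; }
    void clear_young(pmd_entry_t *pmd)       { pmd->flags &= ~PMD_ACCESSED; }

    size_t walk_pmds(pmd_entry_t *pmds, size_t n, void (*scan_ptes)(size_t idx))
    {
            size_t scanned = 0;

            for (size_t i = 0; i < n; i++) {
                    if (!pmd_young(&pmds[i]))
                            continue;        /* no access through this table since last clear */
                    clear_young(&pmds[i]);
                    scan_ptes(i);            /* only now scan the PTEs underneath */
                    scanned++;
            }
            return scanned;
    }
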
+ 275 - 0
target/linux/generic/backport-6.1/020-v6.1-03-mm-vmscan.c-refactor-shrink_node.patch

@@ -0,0 +1,275 @@
+From 9e17efd11450d3d2069adaa3c58db9ac8ebd1c66 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:00 -0600
+Subject: [PATCH 03/29] mm/vmscan.c: refactor shrink_node()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This patch refactors shrink_node() to improve readability for the upcoming
+changes to mm/vmscan.c.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Reviewed-by: Barry Song <[email protected]>
+Reviewed-by: Miaohe Lin <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
+ 1 file changed, 104 insertions(+), 94 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2497,6 +2497,109 @@ enum scan_balance {
+ 	SCAN_FILE,
+ };
+ 
++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
++{
++	unsigned long file;
++	struct lruvec *target_lruvec;
++
++	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
++
++	/*
++	 * Flush the memory cgroup stats, so that we read accurate per-memcg
++	 * lruvec stats for heuristics.
++	 */
++	mem_cgroup_flush_stats();
++
++	/*
++	 * Determine the scan balance between anon and file LRUs.
++	 */
++	spin_lock_irq(&target_lruvec->lru_lock);
++	sc->anon_cost = target_lruvec->anon_cost;
++	sc->file_cost = target_lruvec->file_cost;
++	spin_unlock_irq(&target_lruvec->lru_lock);
++
++	/*
++	 * Target desirable inactive:active list ratios for the anon
++	 * and file LRU lists.
++	 */
++	if (!sc->force_deactivate) {
++		unsigned long refaults;
++
++		refaults = lruvec_page_state(target_lruvec,
++				WORKINGSET_ACTIVATE_ANON);
++		if (refaults != target_lruvec->refaults[0] ||
++			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
++			sc->may_deactivate |= DEACTIVATE_ANON;
++		else
++			sc->may_deactivate &= ~DEACTIVATE_ANON;
++
++		/*
++		 * When refaults are being observed, it means a new
++		 * workingset is being established. Deactivate to get
++		 * rid of any stale active pages quickly.
++		 */
++		refaults = lruvec_page_state(target_lruvec,
++				WORKINGSET_ACTIVATE_FILE);
++		if (refaults != target_lruvec->refaults[1] ||
++		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
++			sc->may_deactivate |= DEACTIVATE_FILE;
++		else
++			sc->may_deactivate &= ~DEACTIVATE_FILE;
++	} else
++		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
++
++	/*
++	 * If we have plenty of inactive file pages that aren't
++	 * thrashing, try to reclaim those first before touching
++	 * anonymous pages.
++	 */
++	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
++	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
++		sc->cache_trim_mode = 1;
++	else
++		sc->cache_trim_mode = 0;
++
++	/*
++	 * Prevent the reclaimer from falling into the cache trap: as
++	 * cache pages start out inactive, every cache fault will tip
++	 * the scan balance towards the file LRU.  And as the file LRU
++	 * shrinks, so does the window for rotation from references.
++	 * This means we have a runaway feedback loop where a tiny
++	 * thrashing file LRU becomes infinitely more attractive than
++	 * anon pages.  Try to detect this based on file LRU size.
++	 */
++	if (!cgroup_reclaim(sc)) {
++		unsigned long total_high_wmark = 0;
++		unsigned long free, anon;
++		int z;
++
++		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
++		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
++			   node_page_state(pgdat, NR_INACTIVE_FILE);
++
++		for (z = 0; z < MAX_NR_ZONES; z++) {
++			struct zone *zone = &pgdat->node_zones[z];
++
++			if (!managed_zone(zone))
++				continue;
++
++			total_high_wmark += high_wmark_pages(zone);
++		}
++
++		/*
++		 * Consider anon: if that's low too, this isn't a
++		 * runaway file reclaim problem, but rather just
++		 * extreme pressure. Reclaim as per usual then.
++		 */
++		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
++
++		sc->file_is_tiny =
++			file + free <= total_high_wmark &&
++			!(sc->may_deactivate & DEACTIVATE_ANON) &&
++			anon >> sc->priority;
++	}
++}
++
+ /*
+  * Determine how aggressively the anon and file LRU lists should be
+  * scanned.  The relative value of each set of LRU lists is determined
+@@ -2965,109 +3068,16 @@ static void shrink_node(pg_data_t *pgdat
+ 	unsigned long nr_reclaimed, nr_scanned;
+ 	struct lruvec *target_lruvec;
+ 	bool reclaimable = false;
+-	unsigned long file;
+ 
+ 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+ 
+ again:
+-	/*
+-	 * Flush the memory cgroup stats, so that we read accurate per-memcg
+-	 * lruvec stats for heuristics.
+-	 */
+-	mem_cgroup_flush_stats();
+-
+ 	memset(&sc->nr, 0, sizeof(sc->nr));
+ 
+ 	nr_reclaimed = sc->nr_reclaimed;
+ 	nr_scanned = sc->nr_scanned;
+ 
+-	/*
+-	 * Determine the scan balance between anon and file LRUs.
+-	 */
+-	spin_lock_irq(&target_lruvec->lru_lock);
+-	sc->anon_cost = target_lruvec->anon_cost;
+-	sc->file_cost = target_lruvec->file_cost;
+-	spin_unlock_irq(&target_lruvec->lru_lock);
+-
+-	/*
+-	 * Target desirable inactive:active list ratios for the anon
+-	 * and file LRU lists.
+-	 */
+-	if (!sc->force_deactivate) {
+-		unsigned long refaults;
+-
+-		refaults = lruvec_page_state(target_lruvec,
+-				WORKINGSET_ACTIVATE_ANON);
+-		if (refaults != target_lruvec->refaults[0] ||
+-			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+-			sc->may_deactivate |= DEACTIVATE_ANON;
+-		else
+-			sc->may_deactivate &= ~DEACTIVATE_ANON;
+-
+-		/*
+-		 * When refaults are being observed, it means a new
+-		 * workingset is being established. Deactivate to get
+-		 * rid of any stale active pages quickly.
+-		 */
+-		refaults = lruvec_page_state(target_lruvec,
+-				WORKINGSET_ACTIVATE_FILE);
+-		if (refaults != target_lruvec->refaults[1] ||
+-		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+-			sc->may_deactivate |= DEACTIVATE_FILE;
+-		else
+-			sc->may_deactivate &= ~DEACTIVATE_FILE;
+-	} else
+-		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+-
+-	/*
+-	 * If we have plenty of inactive file pages that aren't
+-	 * thrashing, try to reclaim those first before touching
+-	 * anonymous pages.
+-	 */
+-	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+-	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+-		sc->cache_trim_mode = 1;
+-	else
+-		sc->cache_trim_mode = 0;
+-
+-	/*
+-	 * Prevent the reclaimer from falling into the cache trap: as
+-	 * cache pages start out inactive, every cache fault will tip
+-	 * the scan balance towards the file LRU.  And as the file LRU
+-	 * shrinks, so does the window for rotation from references.
+-	 * This means we have a runaway feedback loop where a tiny
+-	 * thrashing file LRU becomes infinitely more attractive than
+-	 * anon pages.  Try to detect this based on file LRU size.
+-	 */
+-	if (!cgroup_reclaim(sc)) {
+-		unsigned long total_high_wmark = 0;
+-		unsigned long free, anon;
+-		int z;
+-
+-		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+-		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+-			   node_page_state(pgdat, NR_INACTIVE_FILE);
+-
+-		for (z = 0; z < MAX_NR_ZONES; z++) {
+-			struct zone *zone = &pgdat->node_zones[z];
+-			if (!managed_zone(zone))
+-				continue;
+-
+-			total_high_wmark += high_wmark_pages(zone);
+-		}
+-
+-		/*
+-		 * Consider anon: if that's low too, this isn't a
+-		 * runaway file reclaim problem, but rather just
+-		 * extreme pressure. Reclaim as per usual then.
+-		 */
+-		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+-
+-		sc->file_is_tiny =
+-			file + free <= total_high_wmark &&
+-			!(sc->may_deactivate & DEACTIVATE_ANON) &&
+-			anon >> sc->priority;
+-	}
++	prepare_scan_count(pgdat, sc);
+ 
+ 	shrink_node_memcgs(pgdat, sc);
+ 

+ 82 - 0
target/linux/generic/backport-6.1/020-v6.1-04-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch

@@ -0,0 +1,82 @@
+From 03705be42114db7cc5bd6eb7bf7e8703c94d4880 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:01 -0600
+Subject: [PATCH 04/29] Revert "include/linux/mm_inline.h: fold
+ __update_lru_size() into its sole caller"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This patch undoes the following refactor: commit 289ccba18af4
+("include/linux/mm_inline.h: fold __update_lru_size() into its sole
+caller")
+
+The upcoming changes to include/linux/mm_inline.h will reuse
+__update_lru_size().
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Reviewed-by: Miaohe Lin <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/mm_inline.h | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struc
+ 	return !PageSwapBacked(page);
+ }
+ 
+-static __always_inline void update_lru_size(struct lruvec *lruvec,
++static __always_inline void __update_lru_size(struct lruvec *lruvec,
+ 				enum lru_list lru, enum zone_type zid,
+ 				int nr_pages)
+ {
+@@ -33,6 +33,13 @@ static __always_inline void update_lru_s
+ 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
+ 	__mod_zone_page_state(&pgdat->node_zones[zid],
+ 				NR_ZONE_LRU_BASE + lru, nr_pages);
++}
++
++static __always_inline void update_lru_size(struct lruvec *lruvec,
++				enum lru_list lru, enum zone_type zid,
++				long nr_pages)
++{
++	__update_lru_size(lruvec, lru, zid, nr_pages);
+ #ifdef CONFIG_MEMCG
+ 	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
+ #endif

+ 807 - 0
target/linux/generic/backport-6.1/020-v6.1-05-mm-multi-gen-LRU-groundwork.patch

@@ -0,0 +1,807 @@
+From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:02 -0600
+Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Evictable pages are divided into multiple generations for each lruvec.
+The youngest generation number is stored in lrugen->max_seq for both
+anon and file types as they are aged on an equal footing. The oldest
+generation numbers are stored in lrugen->min_seq[] separately for anon
+and file types as clean file pages can be evicted regardless of swap
+constraints. These three variables are monotonically increasing.
+
+Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
+in order to fit into the gen counter in page->flags. Each truncated
+generation number is an index to lrugen->lists[]. The sliding window
+technique is used to track at least MIN_NR_GENS and at most
+MAX_NR_GENS generations. The gen counter stores a value within [1,
+MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
+stores 0.
+
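The seq-to-gen arithmetic above is small enough to model in standalone C. A minimal sketch, assuming an illustrative MAX_NR_GENS of 4 and a placeholder bit offset (the kernel derives the real offset from its page-flags layout; the toy names below are not the patch's code):

    #include <assert.h>
    #include <stdio.h>

    #define MAX_NR_GENS   4UL                 /* as in the patch */
    #define LRU_GEN_WIDTH 3                   /* order_base_2(MAX_NR_GENS + 1) */
    #define LRU_GEN_PGOFF 8UL                 /* placeholder offset, not the kernel's */
    #define LRU_GEN_MASK  (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

    /* truncate a monotonically increasing seq into an index into lists[] */
    static unsigned long lru_gen_from_seq(unsigned long seq)
    {
        return seq % MAX_NR_GENS;
    }

    int main(void)
    {
        unsigned long flags = 0, min_seq = 4, max_seq = 7;
        unsigned long gen = lru_gen_from_seq(max_seq);

        /* the gen counter stores gen+1 so that 0 can mean "not on any list" */
        flags |= (gen + 1UL) << LRU_GEN_PGOFF;
        assert(((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1 == gen);

        printf("window [%lu..%lu]: %lu generations, max_seq maps to gen %lu\n",
               min_seq, max_seq, max_seq - min_seq + 1, gen);
        return 0;
    }
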
+There are two conceptually independent procedures: "the aging", which
+produces young generations, and "the eviction", which consumes old
+generations.  They form a closed-loop system, i.e., "the page reclaim".
+Both procedures can be invoked from userspace for the purposes of working
+set estimation and proactive reclaim.  These techniques are commonly used
+to optimize job scheduling (bin packing) in data centers [1][2].
+
+To avoid confusion, the terms "hot" and "cold" will be applied to the
+multi-gen LRU, as a new convention; the terms "active" and "inactive" will
+be applied to the active/inactive LRU, as usual.
+
+The protection of hot pages and the selection of cold pages are based
+on page access channels and patterns. There are two access channels:
+one through page tables and the other through file descriptors. The
+protection of the former channel is by design stronger because:
+1. The uncertainty in determining the access patterns of the former
+   channel is higher due to the approximation of the accessed bit.
+2. The cost of evicting the former channel is higher due to the TLB
+   flushes required and the likelihood of encountering the dirty bit.
+3. The penalty of underprotecting the former channel is higher because
+   applications usually do not prepare themselves for major page
+   faults like they do for blocked I/O. E.g., GUI applications
+   commonly use dedicated I/O threads to avoid blocking rendering
+   threads.
+
+There are also two access patterns: one with temporal locality and the
+other without.  For the reasons listed above, the former channel is
+assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is
+present; the latter channel is assumed to follow the latter pattern unless
+outlying refaults have been observed [3][4].
+
+The next patch will address the "outlying refaults".  Three macros, i.e.,
+LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
+this patch to make the entire patchset less diffy.
+
+A page is added to the youngest generation on faulting.  The aging needs
+to check the accessed bit at least twice before handing this page over to
+the eviction.  The first check takes care of the accessed bit set on the
+initial fault; the second check makes sure this page has not been used
+since then.  This protocol, AKA second chance, requires a minimum of two
+generations, hence MIN_NR_GENS.
+
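The second-chance protocol itself fits in a toy model; a rough sketch (the struct and helper below are invented for illustration, and the real checks act on the hardware accessed bit):

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_page { bool accessed; };

    /* one accessed-bit check, as one aging pass would perform it */
    static bool can_evict(struct toy_page *p)
    {
        if (p->accessed) {
            p->accessed = false;    /* clear it and give the page another chance */
            return false;
        }
        return true;                /* still clear on a later check: unused since */
    }

    int main(void)
    {
        struct toy_page p = { .accessed = true };           /* set by the initial fault */

        printf("check 1: evictable=%d\n", can_evict(&p));   /* 0: bit was set */
        printf("check 2: evictable=%d\n", can_evict(&p));   /* 1: not used since */
        return 0;
    }

A page therefore has to survive at least two passes, which is exactly why two generations are the floor.
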
+[1] https://dl.acm.org/doi/10.1145/3297858.3304053
+[2] https://dl.acm.org/doi/10.1145/3503222.3507731
+[3] https://lwn.net/Articles/495543/
+[4] https://lwn.net/Articles/815342/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ fs/fuse/dev.c                     |   3 +-
+ include/linux/mm.h                |   2 +
+ include/linux/mm_inline.h         | 177 +++++++++++++++++++++++++++++-
+ include/linux/mmzone.h            | 100 +++++++++++++++++
+ include/linux/page-flags-layout.h |  13 ++-
+ include/linux/page-flags.h        |   4 +-
+ include/linux/sched.h             |   4 +
+ kernel/bounds.c                   |   5 +
+ mm/Kconfig                        |   8 ++
+ mm/huge_memory.c                  |   3 +-
+ mm/memcontrol.c                   |   2 +
+ mm/memory.c                       |  25 +++++
+ mm/mm_init.c                      |   6 +-
+ mm/mmzone.c                       |   2 +
+ mm/swap.c                         |  10 +-
+ mm/vmscan.c                       |  75 +++++++++++++
+ 16 files changed, 425 insertions(+), 14 deletions(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
+ 	       1 << PG_active |
+ 	       1 << PG_workingset |
+ 	       1 << PG_reclaim |
+-	       1 << PG_waiters))) {
++	       1 << PG_waiters |
++	       LRU_GEN_MASK | LRU_REFS_MASK))) {
+ 		dump_page(page, "fuse: trying to steal weird page");
+ 		return 1;
+ 	}
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
+ #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+ #define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
+ #define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
++#define LRU_GEN_PGOFF		(KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
++#define LRU_REFS_PGOFF		(LRU_GEN_PGOFF - LRU_REFS_WIDTH)
+ 
+ /*
+  * Define the bit shifts to access each section.  For non-existent
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc
+ 
+ static __always_inline void __update_lru_size(struct lruvec *lruvec,
+ 				enum lru_list lru, enum zone_type zid,
+-				int nr_pages)
++				long nr_pages)
+ {
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ 
++	lockdep_assert_held(&lruvec->lru_lock);
++	WARN_ON_ONCE(nr_pages != (int)nr_pages);
++
+ 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
+ 	__mod_zone_page_state(&pgdat->node_zones[zid],
+ 				NR_ZONE_LRU_BASE + lru, nr_pages);
+@@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
+ 	return lru;
+ }
+ 
++#ifdef CONFIG_LRU_GEN
++
++static inline bool lru_gen_enabled(void)
++{
++	return true;
++}
++
++static inline bool lru_gen_in_fault(void)
++{
++	return current->in_lru_fault;
++}
++
++static inline int lru_gen_from_seq(unsigned long seq)
++{
++	return seq % MAX_NR_GENS;
++}
++
++static inline int page_lru_gen(struct page *page)
++{
++	unsigned long flags = READ_ONCE(page->flags);
++
++	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++}
++
++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
++{
++	unsigned long max_seq = lruvec->lrugen.max_seq;
++
++	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
++
++	/* see the comment on MIN_NR_GENS */
++	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
++}
++
++static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
++				       int old_gen, int new_gen)
++{
++	int type = page_is_file_lru(page);
++	int zone = page_zonenum(page);
++	int delta = thp_nr_pages(page);
++	enum lru_list lru = type * LRU_INACTIVE_FILE;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
++	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
++	VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
++
++	if (old_gen >= 0)
++		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
++			   lrugen->nr_pages[old_gen][type][zone] - delta);
++	if (new_gen >= 0)
++		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
++			   lrugen->nr_pages[new_gen][type][zone] + delta);
++
++	/* addition */
++	if (old_gen < 0) {
++		if (lru_gen_is_active(lruvec, new_gen))
++			lru += LRU_ACTIVE;
++		__update_lru_size(lruvec, lru, zone, delta);
++		return;
++	}
++
++	/* deletion */
++	if (new_gen < 0) {
++		if (lru_gen_is_active(lruvec, old_gen))
++			lru += LRU_ACTIVE;
++		__update_lru_size(lruvec, lru, zone, -delta);
++		return;
++	}
++}
++
++static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
++{
++	unsigned long seq;
++	unsigned long flags;
++	int gen = page_lru_gen(page);
++	int type = page_is_file_lru(page);
++	int zone = page_zonenum(page);
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	VM_WARN_ON_ONCE_PAGE(gen != -1, page);
++
++	if (PageUnevictable(page))
++		return false;
++	/*
++	 * There are three common cases for this page:
++	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
++	 *    migrated, add it to the youngest generation.
++	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
++	 *    not in swapcache or a dirty page pending writeback, add it to the
++	 *    second oldest generation.
++	 * 3. Everything else (clean, cold) is added to the oldest generation.
++	 */
++	if (PageActive(page))
++		seq = lrugen->max_seq;
++	else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
++		 (PageReclaim(page) &&
++		  (PageDirty(page) || PageWriteback(page))))
++		seq = lrugen->min_seq[type] + 1;
++	else
++		seq = lrugen->min_seq[type];
++
++	gen = lru_gen_from_seq(seq);
++	flags = (gen + 1UL) << LRU_GEN_PGOFF;
++	/* see the comment on MIN_NR_GENS about PG_active */
++	set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
++
++	lru_gen_update_size(lruvec, page, -1, gen);
++	/* for rotate_reclaimable_page() */
++	if (reclaiming)
++		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
++	else
++		list_add(&page->lru, &lrugen->lists[gen][type][zone]);
++
++	return true;
++}
++
++static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
++{
++	unsigned long flags;
++	int gen = page_lru_gen(page);
++
++	if (gen < 0)
++		return false;
++
++	VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
++	VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
++
++	/* for migrate_page_states() */
++	flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
++	flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
++	gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++
++	lru_gen_update_size(lruvec, page, gen, -1);
++	list_del(&page->lru);
++
++	return true;
++}
++
++#else /* !CONFIG_LRU_GEN */
++
++static inline bool lru_gen_enabled(void)
++{
++	return false;
++}
++
++static inline bool lru_gen_in_fault(void)
++{
++	return false;
++}
++
++static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
++{
++	return false;
++}
++
++static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
++{
++	return false;
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ static __always_inline void add_page_to_lru_list(struct page *page,
+ 				struct lruvec *lruvec)
+ {
+ 	enum lru_list lru = page_lru(page);
+ 
++	if (lru_gen_add_page(lruvec, page, false))
++		return;
++
+ 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+ 	list_add(&page->lru, &lruvec->lists[lru]);
+ }
+@@ -100,6 +269,9 @@ static __always_inline void add_page_to_
+ {
+ 	enum lru_list lru = page_lru(page);
+ 
++	if (lru_gen_add_page(lruvec, page, true))
++		return;
++
+ 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
+ 	list_add_tail(&page->lru, &lruvec->lists[lru]);
+ }
+@@ -107,6 +279,9 @@ static __always_inline void add_page_to_
+ static __always_inline void del_page_from_lru_list(struct page *page,
+ 				struct lruvec *lruvec)
+ {
++	if (lru_gen_del_page(lruvec, page, false))
++		return;
++
+ 	list_del(&page->lru);
+ 	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
+ 			-thp_nr_pages(page));
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -294,6 +294,102 @@ enum lruvec_flags {
+ 					 */
+ };
+ 
++#endif /* !__GENERATING_BOUNDS_H */
++
++/*
++ * Evictable pages are divided into multiple generations. The youngest and the
++ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
++ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
++ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
++ * corresponding generation. The gen counter in page->flags stores gen+1 while
++ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
++ *
++ * A page is added to the youngest generation on faulting. The aging needs to
++ * check the accessed bit at least twice before handing this page over to the
++ * eviction. The first check takes care of the accessed bit set on the initial
++ * fault; the second check makes sure this page hasn't been used since then.
++ * This process, AKA second chance, requires a minimum of two generations,
++ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
++ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
++ * rest of generations, if they exist, are considered inactive. See
++ * lru_gen_is_active().
++ *
++ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
++ * the aging needs not to worry about it. And it's set again when a page
++ * considered active is isolated for non-reclaiming purposes, e.g., migration.
++ * See lru_gen_add_page() and lru_gen_del_page().
++ *
++ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
++ * number of categories of the active/inactive LRU when keeping track of
++ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
++ * in page->flags.
++ */
++#define MIN_NR_GENS		2U
++#define MAX_NR_GENS		4U
++
++#ifndef __GENERATING_BOUNDS_H
++
++struct lruvec;
++
++#define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
++#define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
++
++#ifdef CONFIG_LRU_GEN
++
++enum {
++	LRU_GEN_ANON,
++	LRU_GEN_FILE,
++};
++
++/*
++ * The youngest generation number is stored in max_seq for both anon and file
++ * types as they are aged on an equal footing. The oldest generation numbers are
++ * stored in min_seq[] separately for anon and file types as clean file pages
++ * can be evicted regardless of swap constraints.
++ *
++ * Normally anon and file min_seq are in sync. But if swapping is constrained,
++ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
++ * min_seq behind.
++ *
++ * The number of pages in each generation is eventually consistent and therefore
++ * can be transiently negative.
++ */
++struct lru_gen_struct {
++	/* the aging increments the youngest generation number */
++	unsigned long max_seq;
++	/* the eviction increments the oldest generation numbers */
++	unsigned long min_seq[ANON_AND_FILE];
++	/* the multi-gen LRU lists, lazily sorted on eviction */
++	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++	/* the multi-gen LRU sizes, eventually consistent */
++	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++};
++
++void lru_gen_init_lruvec(struct lruvec *lruvec);
++
++#ifdef CONFIG_MEMCG
++void lru_gen_init_memcg(struct mem_cgroup *memcg);
++void lru_gen_exit_memcg(struct mem_cgroup *memcg);
++#endif
++
++#else /* !CONFIG_LRU_GEN */
++
++static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
++{
++}
++
++#ifdef CONFIG_MEMCG
++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
++{
++}
++#endif
++
++#endif /* CONFIG_LRU_GEN */
++
+ struct lruvec {
+ 	struct list_head		lists[NR_LRU_LISTS];
+ 	/* per lruvec lru_lock for memcg */
+@@ -311,6 +407,10 @@ struct lruvec {
+ 	unsigned long			refaults[ANON_AND_FILE];
+ 	/* Various lruvec state flags (enum lruvec_flags) */
+ 	unsigned long			flags;
++#ifdef CONFIG_LRU_GEN
++	/* evictable pages divided into generations */
++	struct lru_gen_struct		lrugen;
++#endif
+ #ifdef CONFIG_MEMCG
+ 	struct pglist_data *pgdat;
+ #endif
+--- a/include/linux/page-flags-layout.h
++++ b/include/linux/page-flags-layout.h
+@@ -55,7 +55,8 @@
+ #define SECTIONS_WIDTH		0
+ #endif
+ 
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
++	<= BITS_PER_LONG - NR_PAGEFLAGS
+ #define NODES_WIDTH		NODES_SHIFT
+ #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
+ #error "Vmemmap: No space for nodes field in page flags"
+@@ -89,8 +90,8 @@
+ #define LAST_CPUPID_SHIFT 0
+ #endif
+ 
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
+-	<= BITS_PER_LONG - NR_PAGEFLAGS
++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
++	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+ #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
+ #else
+ #define LAST_CPUPID_WIDTH 0
+@@ -100,10 +101,12 @@
+ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
+ #endif
+ 
+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
+-	> BITS_PER_LONG - NR_PAGEFLAGS
++#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
++	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
+ #error "Not enough bits in page flags"
+ #endif
+ 
++#define LRU_REFS_WIDTH	0
++
+ #endif
+ #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
+ 	 1UL << PG_private	| 1UL << PG_private_2	|	\
+ 	 1UL << PG_writeback	| 1UL << PG_reserved	|	\
+ 	 1UL << PG_slab		| 1UL << PG_active 	|	\
+-	 1UL << PG_unevictable	| __PG_MLOCKED)
++	 1UL << PG_unevictable	| __PG_MLOCKED | LRU_GEN_MASK)
+ 
+ /*
+  * Flags checked when a page is prepped for return by the page allocator.
+@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
+  * alloc-free cycle to prevent from reusing the page.
+  */
+ #define PAGE_FLAGS_CHECK_AT_PREP	\
+-	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
++	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
+ 
+ #define PAGE_FLAGS_PRIVATE				\
+ 	(1UL << PG_private | 1UL << PG_private_2)
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -911,6 +911,10 @@ struct task_struct {
+ #ifdef CONFIG_MEMCG
+ 	unsigned			in_user_fault:1;
+ #endif
++#ifdef CONFIG_LRU_GEN
++	/* whether the LRU algorithm may apply to this access */
++	unsigned			in_lru_fault:1;
++#endif
+ #ifdef CONFIG_COMPAT_BRK
+ 	unsigned			brk_randomized:1;
+ #endif
+--- a/kernel/bounds.c
++++ b/kernel/bounds.c
+@@ -22,6 +22,11 @@ int main(void)
+ 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+ #endif
+ 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
++#ifdef CONFIG_LRU_GEN
++	DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
++#else
++	DEFINE(LRU_GEN_WIDTH, 0);
++#endif
+ 	/* End of constants */
+ 
+ 	return 0;
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -897,6 +897,14 @@ config IO_MAPPING
+ config SECRETMEM
+ 	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+ 
++config LRU_GEN
++	bool "Multi-Gen LRU"
++	depends on MMU
++	# make sure page->flags has enough spare bits
++	depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
++	help
++	  A high performance LRU implementation to overcommit memory.
++
+ source "mm/damon/Kconfig"
+ 
+ endmenu
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
+ #ifdef CONFIG_64BIT
+ 			 (1L << PG_arch_2) |
+ #endif
+-			 (1L << PG_dirty)));
++			 (1L << PG_dirty) |
++			 LRU_GEN_MASK | LRU_REFS_MASK));
+ 
+ 	/* ->mapping in first tail page is compound_mapcount */
+ 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
+ 
+ static void mem_cgroup_free(struct mem_cgroup *memcg)
+ {
++	lru_gen_exit_memcg(memcg);
+ 	memcg_wb_domain_exit(memcg);
+ 	__mem_cgroup_free(memcg);
+ }
+@@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_all
+ 	memcg->deferred_split_queue.split_queue_len = 0;
+ #endif
+ 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
++	lru_gen_init_memcg(memcg);
+ 	return memcg;
+ fail:
+ 	mem_cgroup_id_remove(memcg);
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -4792,6 +4792,27 @@ static inline void mm_account_fault(stru
+ 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+ }
+ 
++#ifdef CONFIG_LRU_GEN
++static void lru_gen_enter_fault(struct vm_area_struct *vma)
++{
++	/* the LRU algorithm doesn't apply to sequential or random reads */
++	current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
++}
++
++static void lru_gen_exit_fault(void)
++{
++	current->in_lru_fault = false;
++}
++#else
++static void lru_gen_enter_fault(struct vm_area_struct *vma)
++{
++}
++
++static void lru_gen_exit_fault(void)
++{
++}
++#endif /* CONFIG_LRU_GEN */
++
+ /*
+  * By the time we get here, we already hold the mm semaphore
+  *
+@@ -4823,11 +4844,15 @@ vm_fault_t handle_mm_fault(struct vm_are
+ 	if (flags & FAULT_FLAG_USER)
+ 		mem_cgroup_enter_user_fault();
+ 
++	lru_gen_enter_fault(vma);
++
+ 	if (unlikely(is_vm_hugetlb_page(vma)))
+ 		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
+ 	else
+ 		ret = __handle_mm_fault(vma, address, flags);
+ 
++	lru_gen_exit_fault();
++
+ 	if (flags & FAULT_FLAG_USER) {
+ 		mem_cgroup_exit_user_fault();
+ 		/*
+--- a/mm/mm_init.c
++++ b/mm/mm_init.c
+@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
+ 
+ 	shift = 8 * sizeof(unsigned long);
+ 	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+-		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
++		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
+ 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
+-		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
++		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
+ 		SECTIONS_WIDTH,
+ 		NODES_WIDTH,
+ 		ZONES_WIDTH,
+ 		LAST_CPUPID_WIDTH,
+ 		KASAN_TAG_WIDTH,
++		LRU_GEN_WIDTH,
++		LRU_REFS_WIDTH,
+ 		NR_PAGEFLAGS);
+ 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
+ 		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
+--- a/mm/mmzone.c
++++ b/mm/mmzone.c
+@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
+ 
+ 	for_each_lru(lru)
+ 		INIT_LIST_HEAD(&lruvec->lists[lru]);
++
++	lru_gen_init_lruvec(lruvec);
+ }
+ 
+ #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
+ 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ 	VM_BUG_ON_PAGE(PageLRU(page), page);
+ 
++	/* see the comment in lru_gen_add_page() */
++	if (lru_gen_enabled() && !PageUnevictable(page) &&
++	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
++		SetPageActive(page);
++
+ 	get_page(page);
+ 	local_lock(&lru_pvecs.lock);
+ 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
+ 
+ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
+ {
+-	if (PageActive(page) && !PageUnevictable(page)) {
++	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
+ 		int nr_pages = thp_nr_pages(page);
+ 
+ 		del_page_from_lru_list(page, lruvec);
+@@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
+  */
+ void deactivate_page(struct page *page)
+ {
+-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
++	if (PageLRU(page) && !PageUnevictable(page) &&
++	    (PageActive(page) || lru_gen_enabled())) {
+ 		struct pagevec *pvec;
+ 
+ 		local_lock(&lru_pvecs.lock);
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
+ 	return can_demote(pgdat->node_id, sc);
+ }
+ 
++#ifdef CONFIG_LRU_GEN
++
++/******************************************************************************
++ *                          shorthand helpers
++ ******************************************************************************/
++
++#define for_each_gen_type_zone(gen, type, zone)				\
++	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
++		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
++			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
++
++static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
++{
++	struct pglist_data *pgdat = NODE_DATA(nid);
++
++#ifdef CONFIG_MEMCG
++	if (memcg) {
++		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
++
++		/* for hotadd_new_pgdat() */
++		if (!lruvec->pgdat)
++			lruvec->pgdat = pgdat;
++
++		return lruvec;
++	}
++#endif
++	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
++
++	return pgdat ? &pgdat->__lruvec : NULL;
++}
++
++/******************************************************************************
++ *                          initialization
++ ******************************************************************************/
++
++void lru_gen_init_lruvec(struct lruvec *lruvec)
++{
++	int gen, type, zone;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	lrugen->max_seq = MIN_NR_GENS + 1;
++
++	for_each_gen_type_zone(gen, type, zone)
++		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
++}
++
++#ifdef CONFIG_MEMCG
++void lru_gen_init_memcg(struct mem_cgroup *memcg)
++{
++}
++
++void lru_gen_exit_memcg(struct mem_cgroup *memcg)
++{
++	int nid;
++
++	for_each_node(nid) {
++		struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
++					   sizeof(lruvec->lrugen.nr_pages)));
++	}
++}
++#endif
++
++static int __init init_lru_gen(void)
++{
++	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
++	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
++
++	return 0;
++};
++late_initcall(init_lru_gen);
++
++#endif /* CONFIG_LRU_GEN */
++
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ 	unsigned long nr[NR_LRU_LISTS];

1447 additions, 0 deletions
target/linux/generic/backport-6.1/020-v6.1-06-mm-multi-gen-LRU-minimal-implementation.patch

@@ -0,0 +1,1447 @@
+From b564b9471cd60ef1ee3961a224898ce4a9620d84 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:03 -0600
+Subject: [PATCH 06/29] mm: multi-gen LRU: minimal implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+To avoid confusion, the terms "promotion" and "demotion" will be applied
+to the multi-gen LRU, as a new convention; the terms "activation" and
+"deactivation" will be applied to the active/inactive LRU, as usual.
+
+The aging produces young generations.  Given an lruvec, it increments
+max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS.  The aging promotes
+hot pages to the youngest generation when it finds them accessed through
+page tables; the demotion of cold pages happens consequently when it
+increments max_seq.  Promotion in the aging path does not involve any LRU
+list operations, only the updates of the gen counter and
+lrugen->nr_pages[]; demotion, unless as the result of the increment of
+max_seq, requires LRU list operations, e.g., lru_deactivate_fn().  The
+aging has the complexity O(nr_hot_pages), since it is only interested in
+hot pages.
+
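Stripped of locking, per-zone counts and memcg handling, the lazy-aging rule reduces to a window check; the sketch below mirrors only the first test in should_run_aging() and the seq bookkeeping of inc_max_seq() from the patch body, using toy names:

    #include <stdbool.h>
    #include <stdio.h>

    #define MIN_NR_GENS 2UL
    #define MAX_NR_GENS 4UL

    struct toy_lruvec { unsigned long max_seq, min_seq; };

    /* age only when the window is about to shrink to MIN_NR_GENS */
    static bool need_aging(const struct toy_lruvec *l)
    {
        return l->min_seq + MIN_NR_GENS > l->max_seq;
    }

    static void inc_max_seq(struct toy_lruvec *l)
    {
        /* keep at most MAX_NR_GENS generations, as inc_min_seq() would */
        if (l->max_seq - l->min_seq + 1 == MAX_NR_GENS)
            l->min_seq++;
        l->max_seq++;               /* everything not promoted just became older */
    }

    int main(void)
    {
        struct toy_lruvec l = { .max_seq = 3, .min_seq = 2 };    /* two generations */

        if (need_aging(&l))
            inc_max_seq(&l);
        printf("now %lu generations (max_seq=%lu, min_seq=%lu)\n",
               l.max_seq - l.min_seq + 1, l.max_seq, l.min_seq);
        return 0;
    }
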
+The eviction consumes old generations.  Given an lruvec, it increments
+min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty.
+A feedback loop modeled after the PID controller monitors refaults over
+anon and file types and decides which type to evict when both types are
+available from the same generation.
+
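The type-selection half of that feedback loop can be sketched as a weighted comparison of refault ratios; a simplified cousin of read_ctrl_pos() and positive_ctrl_err() from the patch body, with made-up numbers and without the MIN_LRU_BATCH short-circuit:

    #include <stdio.h>

    struct toy_ctrl { unsigned long refaulted, total, gain; };

    /* true if the PV refaults no more, proportionally, than the setpoint */
    static int positive_ctrl_err(const struct toy_ctrl *sp, const struct toy_ctrl *pv)
    {
        return pv->refaulted * (sp->total + 1) * sp->gain <=
               (sp->refaulted + 1) * pv->total * pv->gain;
    }

    int main(void)
    {
        unsigned long swappiness = 60;
        /* first-tier stats of each type; anon is the setpoint, file the PV */
        struct toy_ctrl anon = { .refaulted = 50,  .total = 1000, .gain = swappiness };
        struct toy_ctrl file = { .refaulted = 400, .total = 1000, .gain = 200 - swappiness };

        /* file refaults heavily here, so it is protected and anon is evicted */
        printf("evict %s first\n", positive_ctrl_err(&anon, &file) ? "file" : "anon");
        return 0;
    }
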
+The protection of pages accessed multiple times through file descriptors
+takes place in the eviction path.  Each generation is divided into
+multiple tiers.  A page accessed N times through file descriptors is in
+tier order_base_2(N).  Tiers do not have dedicated lrugen->lists[], only
+bits in page->flags.  The aforementioned feedback loop also monitors
+refaults over all tiers and decides when to protect pages in which tiers
+(N>1), using the first tier (N=0,1) as a baseline.  The first tier
+contains single-use unmapped clean pages, which are most likely the best
+choices.  In contrast to promotion in the aging path, the protection of a
+page in the eviction path is achieved by moving this page to the next
+generation, i.e., min_seq+1, if the feedback loop decides so.  This
+approach has the following advantages:
+
+1. It removes the cost of activation in the buffered access path by
+   inferring whether pages accessed multiple times through file
+   descriptors are statistically hot and thus worth protecting in the
+   eviction path.
+2. It takes pages accessed through page tables into account and avoids
+   overprotecting pages accessed multiple times through file
+   descriptors. (Pages accessed through page tables are in the first
+   tier, since N=0.)
+3. More tiers provide better protection for pages accessed more than
+   twice through file descriptors, when under heavy buffered I/O
+   workloads.
+
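The tier mapping called out above, tier = order_base_2(N), is easy to check directly; a small sketch computed from the access count N rather than from the refs counter packed into page->flags (order_base_2() is reimplemented here for userspace):

    #include <stdio.h>

    #define MAX_NR_TIERS 4

    /* order_base_2(n): 0 for n <= 1, otherwise ceil(log2(n)) */
    static int order_base_2(unsigned int n)
    {
        int order = 0;

        while ((1U << order) < n)
            order++;
        return order;
    }

    static int tier_from_accesses(unsigned int n)
    {
        int tier = order_base_2(n);     /* N=0,1 -> 0; N=2 -> 1; N=3,4 -> 2; ... */

        /* the refs counter saturates, which caps the tier in practice */
        return tier < MAX_NR_TIERS ? tier : MAX_NR_TIERS - 1;
    }

    int main(void)
    {
        for (unsigned int n = 0; n <= 8; n++)
            printf("N=%u -> tier %d\n", n, tier_from_accesses(n));
        return 0;
    }
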
+Server benchmark results:
+  Single workload:
+    fio (buffered I/O): +[30, 32]%
+                IOPS         BW
+      5.19-rc1: 2673k        10.2GiB/s
+      patch1-6: 3491k        13.3GiB/s
+
+  Single workload:
+    memcached (anon): -[4, 6]%
+                Ops/sec      KB/sec
+      5.19-rc1: 1161501.04   45177.25
+      patch1-6: 1106168.46   43025.04
+
+  Configurations:
+    CPU: two Xeon 6154
+    Mem: total 256G
+
+    Node 1 was only used as a ram disk to reduce the variance in the
+    results.
+
+    patch drivers/block/brd.c <<EOF
+    99,100c99,100
+    < 	gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
+    < 	page = alloc_page(gfp_flags);
+    ---
+    > 	gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE;
+    > 	page = alloc_pages_node(1, gfp_flags, 0);
+    EOF
+
+    cat >>/etc/systemd/system.conf <<EOF
+    CPUAffinity=numa
+    NUMAPolicy=bind
+    NUMAMask=0
+    EOF
+
+    cat >>/etc/memcached.conf <<EOF
+    -m 184320
+    -s /var/run/memcached/memcached.sock
+    -a 0766
+    -t 36
+    -B binary
+    EOF
+
+    cat fio.sh
+    modprobe brd rd_nr=1 rd_size=113246208
+    swapoff -a
+    mkfs.ext4 /dev/ram0
+    mount -t ext4 /dev/ram0 /mnt
+
+    mkdir /sys/fs/cgroup/user.slice/test
+    echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max
+    echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
+    fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
+      --buffered=1 --ioengine=io_uring --iodepth=128 \
+      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
+      --rw=randread --random_distribution=random --norandommap \
+      --time_based --ramp_time=10m --runtime=5m --group_reporting
+
+    cat memcached.sh
+    modprobe brd rd_nr=1 rd_size=113246208
+    swapoff -a
+    mkswap /dev/ram0
+    swapon /dev/ram0
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
+      --ratio 1:0 --pipeline 8 -d 2000
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
+      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
+
+Client benchmark results:
+  kswapd profiles:
+    5.19-rc1
+      40.33%  page_vma_mapped_walk (overhead)
+      21.80%  lzo1x_1_do_compress (real work)
+       7.53%  do_raw_spin_lock
+       3.95%  _raw_spin_unlock_irq
+       2.52%  vma_interval_tree_iter_next
+       2.37%  page_referenced_one
+       2.28%  vma_interval_tree_subtree_search
+       1.97%  anon_vma_interval_tree_iter_first
+       1.60%  ptep_clear_flush
+       1.06%  __zram_bvec_write
+
+    patch1-6
+      39.03%  lzo1x_1_do_compress (real work)
+      18.47%  page_vma_mapped_walk (overhead)
+       6.74%  _raw_spin_unlock_irq
+       3.97%  do_raw_spin_lock
+       2.49%  ptep_clear_flush
+       2.48%  anon_vma_interval_tree_iter_first
+       1.92%  page_referenced_one
+       1.88%  __zram_bvec_write
+       1.48%  memmove
+       1.31%  vma_interval_tree_iter_next
+
+  Configurations:
+    CPU: single Snapdragon 7c
+    Mem: total 4G
+
+    ChromeOS MemoryPressure [1]
+
+[1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/mm_inline.h         |  36 ++
+ include/linux/mmzone.h            |  41 ++
+ include/linux/page-flags-layout.h |   5 +-
+ kernel/bounds.c                   |   2 +
+ mm/Kconfig                        |  11 +
+ mm/swap.c                         |  39 ++
+ mm/vmscan.c                       | 792 +++++++++++++++++++++++++++++-
+ mm/workingset.c                   | 110 ++++-
+ 8 files changed, 1025 insertions(+), 11 deletions(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -106,6 +106,33 @@ static inline int lru_gen_from_seq(unsig
+ 	return seq % MAX_NR_GENS;
+ }
+ 
++static inline int lru_hist_from_seq(unsigned long seq)
++{
++	return seq % NR_HIST_GENS;
++}
++
++static inline int lru_tier_from_refs(int refs)
++{
++	VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
++
++	/* see the comment in page_lru_refs() */
++	return order_base_2(refs + 1);
++}
++
++static inline int page_lru_refs(struct page *page)
++{
++	unsigned long flags = READ_ONCE(page->flags);
++	bool workingset = flags & BIT(PG_workingset);
++
++	/*
++	 * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
++	 * total number of accesses is N>1, since N=0,1 both map to the first
++	 * tier. lru_tier_from_refs() will account for this off-by-one. Also see
++	 * the comment on MAX_NR_TIERS.
++	 */
++	return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
++}
++
+ static inline int page_lru_gen(struct page *page)
+ {
+ 	unsigned long flags = READ_ONCE(page->flags);
+@@ -158,6 +185,15 @@ static inline void lru_gen_update_size(s
+ 		__update_lru_size(lruvec, lru, zone, -delta);
+ 		return;
+ 	}
++
++	/* promotion */
++	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
++		__update_lru_size(lruvec, lru, zone, -delta);
++		__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
++	}
++
++	/* demotion requires isolation, e.g., lru_deactivate_fn() */
++	VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+ }
+ 
+ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -327,6 +327,28 @@ enum lruvec_flags {
+ #define MIN_NR_GENS		2U
+ #define MAX_NR_GENS		4U
+ 
++/*
++ * Each generation is divided into multiple tiers. A page accessed N times
++ * through file descriptors is in tier order_base_2(N). A page in the first tier
++ * (N=0,1) is marked by PG_referenced unless it was faulted in through page
++ * tables or read ahead. A page in any other tier (N>1) is marked by
++ * PG_referenced and PG_workingset. This implies a minimum of two tiers is
++ * supported without using additional bits in page->flags.
++ *
++ * In contrast to moving across generations which requires the LRU lock, moving
++ * across tiers only involves atomic operations on page->flags and therefore
++ * has a negligible cost in the buffered access path. In the eviction path,
++ * comparisons of refaulted/(evicted+protected) from the first tier and the
++ * rest infer whether pages accessed multiple times through file descriptors
++ * are statistically hot and thus worth protecting.
++ *
++ * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
++ * number of categories of the active/inactive LRU when keeping track of
++ * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
++ * page->flags.
++ */
++#define MAX_NR_TIERS		4U
++
+ #ifndef __GENERATING_BOUNDS_H
+ 
+ struct lruvec;
+@@ -341,6 +363,16 @@ enum {
+ 	LRU_GEN_FILE,
+ };
+ 
++#define MIN_LRU_BATCH		BITS_PER_LONG
++#define MAX_LRU_BATCH		(MIN_LRU_BATCH * 64)
++
++/* whether to keep historical stats from evicted generations */
++#ifdef CONFIG_LRU_GEN_STATS
++#define NR_HIST_GENS		MAX_NR_GENS
++#else
++#define NR_HIST_GENS		1U
++#endif
++
+ /*
+  * The youngest generation number is stored in max_seq for both anon and file
+  * types as they are aged on an equal footing. The oldest generation numbers are
+@@ -363,6 +395,15 @@ struct lru_gen_struct {
+ 	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ 	/* the multi-gen LRU sizes, eventually consistent */
+ 	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++	/* the exponential moving average of refaulted */
++	unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
++	/* the exponential moving average of evicted+protected */
++	unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
++	/* the first tier doesn't need protection, hence the minus one */
++	unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
++	/* can be modified without holding the LRU lock */
++	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
++	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ };
+ 
+ void lru_gen_init_lruvec(struct lruvec *lruvec);
+--- a/include/linux/page-flags-layout.h
++++ b/include/linux/page-flags-layout.h
+@@ -106,7 +106,10 @@
+ #error "Not enough bits in page flags"
+ #endif
+ 
+-#define LRU_REFS_WIDTH	0
++/* see the comment on MAX_NR_TIERS */
++#define LRU_REFS_WIDTH	min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
++			    ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
++			    NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
+ 
+ #endif
+ #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
+--- a/kernel/bounds.c
++++ b/kernel/bounds.c
+@@ -24,8 +24,10 @@ int main(void)
+ 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+ #ifdef CONFIG_LRU_GEN
+ 	DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
++	DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
+ #else
+ 	DEFINE(LRU_GEN_WIDTH, 0);
++	DEFINE(__LRU_REFS_WIDTH, 0);
+ #endif
+ 	/* End of constants */
+ 
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -897,6 +897,7 @@ config IO_MAPPING
+ config SECRETMEM
+ 	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+ 
++# multi-gen LRU {
+ config LRU_GEN
+ 	bool "Multi-Gen LRU"
+ 	depends on MMU
+@@ -905,6 +906,16 @@ config LRU_GEN
+ 	help
+ 	  A high performance LRU implementation to overcommit memory.
+ 
++config LRU_GEN_STATS
++	bool "Full stats for debugging"
++	depends on LRU_GEN
++	help
++	  Do not enable this option unless you plan to look at historical stats
++	  from evicted generations for debugging purpose.
++
++	  This option has a per-memcg and per-node memory overhead.
++# }
++
+ source "mm/damon/Kconfig"
+ 
+ endmenu
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -389,6 +389,40 @@ static void __lru_cache_activate_page(st
+ 	local_unlock(&lru_pvecs.lock);
+ }
+ 
++#ifdef CONFIG_LRU_GEN
++static void page_inc_refs(struct page *page)
++{
++	unsigned long new_flags, old_flags = READ_ONCE(page->flags);
++
++	if (PageUnevictable(page))
++		return;
++
++	if (!PageReferenced(page)) {
++		SetPageReferenced(page);
++		return;
++	}
++
++	if (!PageWorkingset(page)) {
++		SetPageWorkingset(page);
++		return;
++	}
++
++	/* see the comment on MAX_NR_TIERS */
++	do {
++		new_flags = old_flags & LRU_REFS_MASK;
++		if (new_flags == LRU_REFS_MASK)
++			break;
++
++		new_flags += BIT(LRU_REFS_PGOFF);
++		new_flags |= old_flags & ~LRU_REFS_MASK;
++	} while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
++}
++#else
++static void page_inc_refs(struct page *page)
++{
++}
++#endif /* CONFIG_LRU_GEN */
++
+ /*
+  * Mark a page as having seen activity.
+  *
+@@ -403,6 +437,11 @@ void mark_page_accessed(struct page *pag
+ {
+ 	page = compound_head(page);
+ 
++	if (lru_gen_enabled()) {
++		page_inc_refs(page);
++		return;
++	}
++
+ 	if (!PageReferenced(page)) {
+ 		SetPageReferenced(page);
+ 	} else if (PageUnevictable(page)) {
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1142,9 +1142,11 @@ static int __remove_mapping(struct addre
+ 
+ 	if (PageSwapCache(page)) {
+ 		swp_entry_t swap = { .val = page_private(page) };
+-		mem_cgroup_swapout(page, swap);
++
++		/* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */
+ 		if (reclaimed && !mapping_exiting(mapping))
+ 			shadow = workingset_eviction(page, target_memcg);
++		mem_cgroup_swapout(page, swap);
+ 		__delete_from_swap_cache(page, swap, shadow);
+ 		xa_unlock_irq(&mapping->i_pages);
+ 		put_swap_page(page, swap);
+@@ -2502,6 +2504,9 @@ static void prepare_scan_count(pg_data_t
+ 	unsigned long file;
+ 	struct lruvec *target_lruvec;
+ 
++	if (lru_gen_enabled())
++		return;
++
+ 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+ 
+ 	/*
+@@ -2827,6 +2832,17 @@ static bool can_age_anon_pages(struct pg
+  *                          shorthand helpers
+  ******************************************************************************/
+ 
++#define LRU_REFS_FLAGS	(BIT(PG_referenced) | BIT(PG_workingset))
++
++#define DEFINE_MAX_SEQ(lruvec)						\
++	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
++
++#define DEFINE_MIN_SEQ(lruvec)						\
++	unsigned long min_seq[ANON_AND_FILE] = {			\
++		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),	\
++		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),	\
++	}
++
+ #define for_each_gen_type_zone(gen, type, zone)				\
+ 	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
+ 		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
+@@ -2852,6 +2868,745 @@ static struct lruvec __maybe_unused *get
+ 	return pgdat ? &pgdat->__lruvec : NULL;
+ }
+ 
++static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
++{
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++	if (!can_demote(pgdat->node_id, sc) &&
++	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
++		return 0;
++
++	return mem_cgroup_swappiness(memcg);
++}
++
++static int get_nr_gens(struct lruvec *lruvec, int type)
++{
++	return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
++}
++
++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
++{
++	/* see the comment on lru_gen_struct */
++	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
++	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
++	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
++}
++
++/******************************************************************************
++ *                          refault feedback loop
++ ******************************************************************************/
++
++/*
++ * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
++ *
++ * The P term is refaulted/(evicted+protected) from a tier in the generation
++ * currently being evicted; the I term is the exponential moving average of the
++ * P term over the generations previously evicted, using the smoothing factor
++ * 1/2; the D term isn't supported.
++ *
++ * The setpoint (SP) is always the first tier of one type; the process variable
++ * (PV) is either any tier of the other type or any other tier of the same
++ * type.
++ *
++ * The error is the difference between the SP and the PV; the correction is to
++ * turn off protection when SP>PV or turn on protection when SP<PV.
++ *
++ * For future optimizations:
++ * 1. The D term may discount the other two terms over time so that long-lived
++ *    generations can resist stale information.
++ */
++struct ctrl_pos {
++	unsigned long refaulted;
++	unsigned long total;
++	int gain;
++};
++
++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
++			  struct ctrl_pos *pos)
++{
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
++
++	pos->refaulted = lrugen->avg_refaulted[type][tier] +
++			 atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++	pos->total = lrugen->avg_total[type][tier] +
++		     atomic_long_read(&lrugen->evicted[hist][type][tier]);
++	if (tier)
++		pos->total += lrugen->protected[hist][type][tier - 1];
++	pos->gain = gain;
++}
++
++static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
++{
++	int hist, tier;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
++	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
++
++	lockdep_assert_held(&lruvec->lru_lock);
++
++	if (!carryover && !clear)
++		return;
++
++	hist = lru_hist_from_seq(seq);
++
++	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
++		if (carryover) {
++			unsigned long sum;
++
++			sum = lrugen->avg_refaulted[type][tier] +
++			      atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++			WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
++
++			sum = lrugen->avg_total[type][tier] +
++			      atomic_long_read(&lrugen->evicted[hist][type][tier]);
++			if (tier)
++				sum += lrugen->protected[hist][type][tier - 1];
++			WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
++		}
++
++		if (clear) {
++			atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
++			atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
++			if (tier)
++				WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
++		}
++	}
++}
++
++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
++{
++	/*
++	 * Return true if the PV has a limited number of refaults or a lower
++	 * refaulted/total than the SP.
++	 */
++	return pv->refaulted < MIN_LRU_BATCH ||
++	       pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
++	       (sp->refaulted + 1) * pv->total * pv->gain;
++}
++
++/******************************************************************************
++ *                          the aging
++ ******************************************************************************/
++
++/* protect pages accessed multiple times through file descriptors */
++static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
++{
++	int type = page_is_file_lru(page);
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
++	unsigned long new_flags, old_flags = READ_ONCE(page->flags);
++
++	VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
++
++	do {
++		new_gen = (old_gen + 1) % MAX_NR_GENS;
++
++		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
++		new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
++		/* for end_page_writeback() */
++		if (reclaiming)
++			new_flags |= BIT(PG_reclaim);
++	} while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
++
++	lru_gen_update_size(lruvec, page, old_gen, new_gen);
++
++	return new_gen;
++}
++
++static void inc_min_seq(struct lruvec *lruvec, int type)
++{
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	reset_ctrl_pos(lruvec, type, true);
++	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
++}
++
++static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
++{
++	int gen, type, zone;
++	bool success = false;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	DEFINE_MIN_SEQ(lruvec);
++
++	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
++
++	/* find the oldest populated generation */
++	for (type = !can_swap; type < ANON_AND_FILE; type++) {
++		while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
++			gen = lru_gen_from_seq(min_seq[type]);
++
++			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++				if (!list_empty(&lrugen->lists[gen][type][zone]))
++					goto next;
++			}
++
++			min_seq[type]++;
++		}
++next:
++		;
++	}
++
++	/* see the comment on lru_gen_struct */
++	if (can_swap) {
++		min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
++		min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
++	}
++
++	for (type = !can_swap; type < ANON_AND_FILE; type++) {
++		if (min_seq[type] == lrugen->min_seq[type])
++			continue;
++
++		reset_ctrl_pos(lruvec, type, true);
++		WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
++		success = true;
++	}
++
++	return success;
++}
++
++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
++{
++	int prev, next;
++	int type, zone;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	spin_lock_irq(&lruvec->lru_lock);
++
++	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
++
++	if (max_seq != lrugen->max_seq)
++		goto unlock;
++
++	for (type = ANON_AND_FILE - 1; type >= 0; type--) {
++		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
++			continue;
++
++		VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
++
++		inc_min_seq(lruvec, type);
++	}
++
++	/*
++	 * Update the active/inactive LRU sizes for compatibility. Both sides of
++	 * the current max_seq need to be covered, since max_seq+1 can overlap
++	 * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
++	 * overlap, cold/hot inversion happens.
++	 */
++	prev = lru_gen_from_seq(lrugen->max_seq - 1);
++	next = lru_gen_from_seq(lrugen->max_seq + 1);
++
++	for (type = 0; type < ANON_AND_FILE; type++) {
++		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++			enum lru_list lru = type * LRU_INACTIVE_FILE;
++			long delta = lrugen->nr_pages[prev][type][zone] -
++				     lrugen->nr_pages[next][type][zone];
++
++			if (!delta)
++				continue;
++
++			__update_lru_size(lruvec, lru, zone, delta);
++			__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
++		}
++	}
++
++	for (type = 0; type < ANON_AND_FILE; type++)
++		reset_ctrl_pos(lruvec, type, false);
++
++	/* make sure preceding modifications appear */
++	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
++unlock:
++	spin_unlock_irq(&lruvec->lru_lock);
++}
++
++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
++			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
++{
++	int gen, type, zone;
++	unsigned long old = 0;
++	unsigned long young = 0;
++	unsigned long total = 0;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++
++	for (type = !can_swap; type < ANON_AND_FILE; type++) {
++		unsigned long seq;
++
++		for (seq = min_seq[type]; seq <= max_seq; seq++) {
++			unsigned long size = 0;
++
++			gen = lru_gen_from_seq(seq);
++
++			for (zone = 0; zone < MAX_NR_ZONES; zone++)
++				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
++
++			total += size;
++			if (seq == max_seq)
++				young += size;
++			else if (seq + MIN_NR_GENS == max_seq)
++				old += size;
++		}
++	}
++
++	/* try to scrape all its memory if this memcg was deleted */
++	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
++
++	/*
++	 * The aging tries to be lazy to reduce the overhead, while the eviction
++	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
++	 * ideal number of generations is MIN_NR_GENS+1.
++	 */
++	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
++		return true;
++	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
++		return false;
++
++	/*
++	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
++	 * of the total number of pages for each generation. A reasonable range
++	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
++	 * aging cares about the upper bound of hot pages, while the eviction
++	 * cares about the lower bound of cold pages.
++	 */
++	if (young * MIN_NR_GENS > total)
++		return true;
++	if (old * (MIN_NR_GENS + 2) < total)
++		return true;
++
++	return false;
++}
++
++static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++	bool need_aging;
++	unsigned long nr_to_scan;
++	int swappiness = get_swappiness(lruvec, sc);
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	DEFINE_MAX_SEQ(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
++
++	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
++
++	mem_cgroup_calculate_protection(NULL, memcg);
++
++	if (mem_cgroup_below_min(memcg))
++		return;
++
++	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
++	if (need_aging)
++		inc_max_seq(lruvec, max_seq, swappiness);
++}
++
++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++	struct mem_cgroup *memcg;
++
++	VM_WARN_ON_ONCE(!current_is_kswapd());
++
++	memcg = mem_cgroup_iter(NULL, NULL, NULL);
++	do {
++		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
++
++		age_lruvec(lruvec, sc);
++
++		cond_resched();
++	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++}
++
++/******************************************************************************
++ *                          the eviction
++ ******************************************************************************/
++
++static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
++{
++	bool success;
++	int gen = page_lru_gen(page);
++	int type = page_is_file_lru(page);
++	int zone = page_zonenum(page);
++	int delta = thp_nr_pages(page);
++	int refs = page_lru_refs(page);
++	int tier = lru_tier_from_refs(refs);
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
++
++	/* unevictable */
++	if (!page_evictable(page)) {
++		success = lru_gen_del_page(lruvec, page, true);
++		VM_WARN_ON_ONCE_PAGE(!success, page);
++		SetPageUnevictable(page);
++		add_page_to_lru_list(page, lruvec);
++		__count_vm_events(UNEVICTABLE_PGCULLED, delta);
++		return true;
++	}
++
++	/* dirty lazyfree */
++	if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) {
++		success = lru_gen_del_page(lruvec, page, true);
++		VM_WARN_ON_ONCE_PAGE(!success, page);
++		SetPageSwapBacked(page);
++		add_page_to_lru_list_tail(page, lruvec);
++		return true;
++	}
++
++	/* protected */
++	if (tier > tier_idx) {
++		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
++
++		gen = page_inc_gen(lruvec, page, false);
++		list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
++
++		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
++			   lrugen->protected[hist][type][tier - 1] + delta);
++		__mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
++		return true;
++	}
++
++	/* waiting for writeback */
++	if (PageLocked(page) || PageWriteback(page) ||
++	    (type == LRU_GEN_FILE && PageDirty(page))) {
++		gen = page_inc_gen(lruvec, page, true);
++		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
++		return true;
++	}
++
++	return false;
++}
++
++static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc)
++{
++	bool success;
++
++	/* unmapping inhibited */
++	if (!sc->may_unmap && page_mapped(page))
++		return false;
++
++	/* swapping inhibited */
++	if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
++	    (PageDirty(page) ||
++	     (PageAnon(page) && !PageSwapCache(page))))
++		return false;
++
++	/* raced with release_pages() */
++	if (!get_page_unless_zero(page))
++		return false;
++
++	/* raced with another isolation */
++	if (!TestClearPageLRU(page)) {
++		put_page(page);
++		return false;
++	}
++
++	/* see the comment on MAX_NR_TIERS */
++	if (!PageReferenced(page))
++		set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
++
++	/* for shrink_page_list() */
++	ClearPageReclaim(page);
++	ClearPageReferenced(page);
++
++	success = lru_gen_del_page(lruvec, page, true);
++	VM_WARN_ON_ONCE_PAGE(!success, page);
++
++	return true;
++}
++
++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
++		      int type, int tier, struct list_head *list)
++{
++	int gen, zone;
++	enum vm_event_item item;
++	int sorted = 0;
++	int scanned = 0;
++	int isolated = 0;
++	int remaining = MAX_LRU_BATCH;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++
++	VM_WARN_ON_ONCE(!list_empty(list));
++
++	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
++		return 0;
++
++	gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++	for (zone = sc->reclaim_idx; zone >= 0; zone--) {
++		LIST_HEAD(moved);
++		int skipped = 0;
++		struct list_head *head = &lrugen->lists[gen][type][zone];
++
++		while (!list_empty(head)) {
++			struct page *page = lru_to_page(head);
++			int delta = thp_nr_pages(page);
++
++			VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
++			VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
++			VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
++			VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
++
++			scanned += delta;
++
++			if (sort_page(lruvec, page, tier))
++				sorted += delta;
++			else if (isolate_page(lruvec, page, sc)) {
++				list_add(&page->lru, list);
++				isolated += delta;
++			} else {
++				list_move(&page->lru, &moved);
++				skipped += delta;
++			}
++
++			if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
++				break;
++		}
++
++		if (skipped) {
++			list_splice(&moved, head);
++			__count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
++		}
++
++		if (!remaining || isolated >= MIN_LRU_BATCH)
++			break;
++	}
++
++	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
++	if (!cgroup_reclaim(sc)) {
++		__count_vm_events(item, isolated);
++		__count_vm_events(PGREFILL, sorted);
++	}
++	__count_memcg_events(memcg, item, isolated);
++	__count_memcg_events(memcg, PGREFILL, sorted);
++	__count_vm_events(PGSCAN_ANON + type, isolated);
++
++	/*
++	 * There might not be eligible pages due to reclaim_idx, may_unmap and
++	 * may_writepage. Check the remaining to prevent livelock if it's not
++	 * making progress.
++	 */
++	return isolated || !remaining ? scanned : 0;
++}
++
++static int get_tier_idx(struct lruvec *lruvec, int type)
++{
++	int tier;
++	struct ctrl_pos sp, pv;
++
++	/*
++	 * To leave a margin for fluctuations, use a larger gain factor (1:2).
++	 * This value is chosen because any other tier would have at least twice
++	 * as many refaults as the first tier.
++	 */
++	read_ctrl_pos(lruvec, type, 0, 1, &sp);
++	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
++		read_ctrl_pos(lruvec, type, tier, 2, &pv);
++		if (!positive_ctrl_err(&sp, &pv))
++			break;
++	}
++
++	return tier - 1;
++}
++
++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
++{
++	int type, tier;
++	struct ctrl_pos sp, pv;
++	int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
++
++	/*
++	 * Compare the first tier of anon with that of file to determine which
++	 * type to scan. Also need to compare other tiers of the selected type
++	 * with the first tier of the other type to determine the last tier (of
++	 * the selected type) to evict.
++	 */
++	read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
++	read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
++	type = positive_ctrl_err(&sp, &pv);
++
++	read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
++	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
++		read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
++		if (!positive_ctrl_err(&sp, &pv))
++			break;
++	}
++
++	*tier_idx = tier - 1;
++
++	return type;
++}
++
++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++			 int *type_scanned, struct list_head *list)
++{
++	int i;
++	int type;
++	int scanned;
++	int tier = -1;
++	DEFINE_MIN_SEQ(lruvec);
++
++	/*
++	 * Try to make the obvious choice first. When anon and file are both
++	 * available from the same generation, interpret swappiness 1 as file
++	 * first and 200 as anon first.
++	 */
++	if (!swappiness)
++		type = LRU_GEN_FILE;
++	else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
++		type = LRU_GEN_ANON;
++	else if (swappiness == 1)
++		type = LRU_GEN_FILE;
++	else if (swappiness == 200)
++		type = LRU_GEN_ANON;
++	else
++		type = get_type_to_scan(lruvec, swappiness, &tier);
++
++	for (i = !swappiness; i < ANON_AND_FILE; i++) {
++		if (tier < 0)
++			tier = get_tier_idx(lruvec, type);
++
++		scanned = scan_pages(lruvec, sc, type, tier, list);
++		if (scanned)
++			break;
++
++		type = !type;
++		tier = -1;
++	}
++
++	*type_scanned = type;
++
++	return scanned;
++}
++
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++{
++	int type;
++	int scanned;
++	int reclaimed;
++	LIST_HEAD(list);
++	struct page *page;
++	enum vm_event_item item;
++	struct reclaim_stat stat;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++	spin_lock_irq(&lruvec->lru_lock);
++
++	scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
++
++	scanned += try_to_inc_min_seq(lruvec, swappiness);
++
++	if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
++		scanned = 0;
++
++	spin_unlock_irq(&lruvec->lru_lock);
++
++	if (list_empty(&list))
++		return scanned;
++
++	reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
++
++	list_for_each_entry(page, &list, lru) {
++		/* restore LRU_REFS_FLAGS cleared by isolate_page() */
++		if (PageWorkingset(page))
++			SetPageReferenced(page);
++
++		/* don't add rejected pages to the oldest generation */
++		if (PageReclaim(page) &&
++		    (PageDirty(page) || PageWriteback(page)))
++			ClearPageActive(page);
++		else
++			SetPageActive(page);
++	}
++
++	spin_lock_irq(&lruvec->lru_lock);
++
++	move_pages_to_lru(lruvec, &list);
++
++	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
++	if (!cgroup_reclaim(sc))
++		__count_vm_events(item, reclaimed);
++	__count_memcg_events(memcg, item, reclaimed);
++	__count_vm_events(PGSTEAL_ANON + type, reclaimed);
++
++	spin_unlock_irq(&lruvec->lru_lock);
++
++	mem_cgroup_uncharge_list(&list);
++	free_unref_page_list(&list);
++
++	sc->nr_reclaimed += reclaimed;
++
++	return scanned;
++}
++
++static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
++				    bool can_swap)
++{
++	bool need_aging;
++	unsigned long nr_to_scan;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	DEFINE_MAX_SEQ(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
++
++	if (mem_cgroup_below_min(memcg) ||
++	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
++		return 0;
++
++	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
++	if (!need_aging)
++		return nr_to_scan;
++
++	/* skip the aging path at the default priority */
++	if (sc->priority == DEF_PRIORITY)
++		goto done;
++
++	/* leave the work to lru_gen_age_node() */
++	if (current_is_kswapd())
++		return 0;
++
++	inc_max_seq(lruvec, max_seq, can_swap);
++done:
++	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++	struct blk_plug plug;
++	unsigned long scanned = 0;
++
++	lru_add_drain();
++
++	blk_start_plug(&plug);
++
++	while (true) {
++		int delta;
++		int swappiness;
++		unsigned long nr_to_scan;
++
++		if (sc->may_swap)
++			swappiness = get_swappiness(lruvec, sc);
++		else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
++			swappiness = 1;
++		else
++			swappiness = 0;
++
++		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++		if (!nr_to_scan)
++			break;
++
++		delta = evict_pages(lruvec, sc, swappiness);
++		if (!delta)
++			break;
++
++		scanned += delta;
++		if (scanned >= nr_to_scan)
++			break;
++
++		cond_resched();
++	}
++
++	blk_finish_plug(&plug);
++}
++
+ /******************************************************************************
+  *                          initialization
+  ******************************************************************************/
+@@ -2894,6 +3649,16 @@ static int __init init_lru_gen(void)
+ };
+ late_initcall(init_lru_gen);
+ 
++#else /* !CONFIG_LRU_GEN */
++
++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++}
++
+ #endif /* CONFIG_LRU_GEN */
+ 
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+@@ -2907,6 +3672,11 @@ static void shrink_lruvec(struct lruvec
+ 	bool proportional_reclaim;
+ 	struct blk_plug plug;
+ 
++	if (lru_gen_enabled()) {
++		lru_gen_shrink_lruvec(lruvec, sc);
++		return;
++	}
++
+ 	get_scan_count(lruvec, sc, nr);
+ 
+ 	/* Record the original scan target for proportional adjustments later */
+@@ -3372,6 +4142,9 @@ static void snapshot_refaults(struct mem
+ 	struct lruvec *target_lruvec;
+ 	unsigned long refaults;
+ 
++	if (lru_gen_enabled())
++		return;
++
+ 	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ 	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ 	target_lruvec->refaults[0] = refaults;
+@@ -3736,12 +4509,16 @@ unsigned long try_to_free_mem_cgroup_pag
+ }
+ #endif
+ 
+-static void age_active_anon(struct pglist_data *pgdat,
+-				struct scan_control *sc)
++static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ {
+ 	struct mem_cgroup *memcg;
+ 	struct lruvec *lruvec;
+ 
++	if (lru_gen_enabled()) {
++		lru_gen_age_node(pgdat, sc);
++		return;
++	}
++
+ 	if (!can_age_anon_pages(pgdat, sc))
+ 		return;
+ 
+@@ -4058,12 +4835,11 @@ restart:
+ 		sc.may_swap = !nr_boost_reclaim;
+ 
+ 		/*
+-		 * Do some background aging of the anon list, to give
+-		 * pages a chance to be referenced before reclaiming. All
+-		 * pages are rotated regardless of classzone as this is
+-		 * about consistent aging.
++		 * Do some background aging, to give pages a chance to be
++		 * referenced before reclaiming. All pages are rotated
++		 * regardless of classzone as this is about consistent aging.
+ 		 */
+-		age_active_anon(pgdat, &sc);
++		kswapd_age_node(pgdat, &sc);
+ 
+ 		/*
+ 		 * If we're getting trouble reclaiming, start doing writepage
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
+ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ 			 bool workingset)
+ {
+-	eviction >>= bucket_order;
+ 	eviction &= EVICTION_MASK;
+ 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
+ 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow,
+ 
+ 	*memcgidp = memcgid;
+ 	*pgdat = NODE_DATA(nid);
+-	*evictionp = entry << bucket_order;
++	*evictionp = entry;
+ 	*workingsetp = workingset;
+ }
+ 
++#ifdef CONFIG_LRU_GEN
++
++static void *lru_gen_eviction(struct page *page)
++{
++	int hist;
++	unsigned long token;
++	unsigned long min_seq;
++	struct lruvec *lruvec;
++	struct lru_gen_struct *lrugen;
++	int type = page_is_file_lru(page);
++	int delta = thp_nr_pages(page);
++	int refs = page_lru_refs(page);
++	int tier = lru_tier_from_refs(refs);
++	struct mem_cgroup *memcg = page_memcg(page);
++	struct pglist_data *pgdat = page_pgdat(page);
++
++	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
++
++	lruvec = mem_cgroup_lruvec(memcg, pgdat);
++	lrugen = &lruvec->lrugen;
++	min_seq = READ_ONCE(lrugen->min_seq[type]);
++	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
++
++	hist = lru_hist_from_seq(min_seq);
++	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
++
++	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
++}
++
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++	int hist, tier, refs;
++	int memcg_id;
++	bool workingset;
++	unsigned long token;
++	unsigned long min_seq;
++	struct lruvec *lruvec;
++	struct lru_gen_struct *lrugen;
++	struct mem_cgroup *memcg;
++	struct pglist_data *pgdat;
++	int type = page_is_file_lru(page);
++	int delta = thp_nr_pages(page);
++
++	unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
++
++	if (pgdat != page_pgdat(page))
++		return;
++
++	rcu_read_lock();
++
++	memcg = page_memcg_rcu(page);
++	if (memcg_id != mem_cgroup_id(memcg))
++		goto unlock;
++
++	lruvec = mem_cgroup_lruvec(memcg, pgdat);
++	lrugen = &lruvec->lrugen;
++
++	min_seq = READ_ONCE(lrugen->min_seq[type]);
++	if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
++		goto unlock;
++
++	hist = lru_hist_from_seq(min_seq);
++	/* see the comment in page_lru_refs() */
++	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
++	tier = lru_tier_from_refs(refs);
++
++	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
++	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
++
++	/*
++	 * Count the following two cases as stalls:
++	 * 1. For pages accessed through page tables, hotter pages pushed out
++	 *    hot pages which refaulted immediately.
++	 * 2. For pages accessed multiple times through file descriptors,
++	 *    the number of accesses might have exceeded the trackable range.
++	 */
++	if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
++		SetPageWorkingset(page);
++		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
++	}
++unlock:
++	rcu_read_unlock();
++}
++
++#else /* !CONFIG_LRU_GEN */
++
++static void *lru_gen_eviction(struct page *page)
++{
++	return NULL;
++}
++
++static void lru_gen_refault(struct page *page, void *shadow)
++{
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ /**
+  * workingset_age_nonresident - age non-resident entries as LRU ages
+  * @lruvec: the lruvec that was aged
+@@ -264,10 +360,14 @@ void *workingset_eviction(struct page *p
+ 	VM_BUG_ON_PAGE(page_count(page), page);
+ 	VM_BUG_ON_PAGE(!PageLocked(page), page);
+ 
++	if (lru_gen_enabled())
++		return lru_gen_eviction(page);
++
+ 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ 	/* XXX: target_memcg can be NULL, go through lruvec */
+ 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
+ 	eviction = atomic_long_read(&lruvec->nonresident_age);
++	eviction >>= bucket_order;
+ 	workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ 	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ }
+@@ -296,7 +396,13 @@ void workingset_refault(struct page *pag
+ 	bool workingset;
+ 	int memcgid;
+ 
++	if (lru_gen_enabled()) {
++		lru_gen_refault(page, shadow);
++		return;
++	}
++
+ 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
++	eviction <<= bucket_order;
+ 
+ 	rcu_read_lock();
+ 	/*
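The workingset hunks above pack the MGLRU eviction state (the oldest sequence number and the reference count) into the shadow entry instead of the classic eviction counter. A stand-alone sketch of that token arithmetic, with made-up field widths standing in for LRU_GEN_WIDTH/LRU_REFS_WIDTH and the real EVICTION_MASK:

#include <stdio.h>

#define LRU_REFS_WIDTH	2UL			/* illustrative width */
#define EVICTION_BITS	24UL			/* illustrative shadow width */
#define EVICTION_MASK	((1UL << EVICTION_BITS) - 1)

/* mirrors lru_gen_eviction(): token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0) */
static unsigned long pack_token(unsigned long min_seq, unsigned long refs)
{
	return (min_seq << LRU_REFS_WIDTH) | (refs ? refs - 1 : 0);
}

/* mirrors the check in lru_gen_refault(): a stale min_seq invalidates the token */
static int token_matches(unsigned long token, unsigned long min_seq)
{
	return (token >> LRU_REFS_WIDTH) ==
	       (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
}

int main(void)
{
	unsigned long token = pack_token(4, 2);

	printf("token=%#lx valid_at_seq_4=%d valid_at_seq_7=%d\n",
	       token, token_matches(token, 4), token_matches(token, 7));
	return 0;
}

A refault whose recorded min_seq has since moved on simply fails the comparison, so stale shadow entries are discarded without further decoding.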

+ 491 - 0
target/linux/generic/backport-6.1/020-v6.1-07-mm-multi-gen-LRU-exploit-locality-in-rmap.patch

@@ -0,0 +1,491 @@
+From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:04 -0600
+Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Searching the rmap for PTEs mapping each page on an LRU list (to test and
+clear the accessed bit) can be expensive because pages from different VMAs
+(PA space) are not cache friendly to the rmap (VA space).  For workloads
+mostly using mapped pages, searching the rmap can incur the highest CPU
+cost in the reclaim path.
+
+This patch exploits spatial locality to reduce the trips into the rmap.
+When shrink_page_list() walks the rmap and finds a young PTE, a new
+function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent
+PTEs.  On finding another young PTE, it clears the accessed bit and
+updates the gen counter of the page mapped by this PTE to
+(max_seq%MAX_NR_GENS)+1.
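In page->flags terms the stored value is the generation number plus one, so zero can mean "not on a multi-gen LRU list" (see page_update_gen() in the hunk below). A minimal stand-alone model of that arithmetic, with the bit offset and width invented for the example:

#include <stdio.h>

/*
 * Simplified model of the generation counter kept in page->flags. The bit
 * offset and width are invented for the example; the stored value is
 * gen + 1 so that 0 means "not on a multi-gen LRU list".
 */
#define LRU_GEN_PGOFF	8
#define LRU_GEN_WIDTH	3
#define LRU_GEN_MASK	(((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define MAX_NR_GENS	4

static int flags_to_gen(unsigned long flags)
{
	return (int)((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

/* what look-around does to a page with a young PTE: move it to the youngest gen */
static unsigned long promote(unsigned long flags, unsigned long max_seq)
{
	unsigned long gen = max_seq % MAX_NR_GENS;

	return (flags & ~LRU_GEN_MASK) | ((gen + 1) << LRU_GEN_PGOFF);
}

int main(void)
{
	unsigned long flags = promote(0, 9);	/* max_seq = 9 maps to gen 1 */

	printf("gen=%d\n", flags_to_gen(flags));
	return 0;
}

Built on its own, the example prints gen=1 for max_seq 9 and MAX_NR_GENS 4.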
+
+Server benchmark results:
+  Single workload:
+    fio (buffered I/O): no change
+
+  Single workload:
+    memcached (anon): +[3, 5]%
+                Ops/sec      KB/sec
+      patch1-6: 1106168.46   43025.04
+      patch1-7: 1147696.57   44640.29
+
+  Configurations:
+    no change
+
+Client benchmark results:
+  kswapd profiles:
+    patch1-6
+      39.03%  lzo1x_1_do_compress (real work)
+      18.47%  page_vma_mapped_walk (overhead)
+       6.74%  _raw_spin_unlock_irq
+       3.97%  do_raw_spin_lock
+       2.49%  ptep_clear_flush
+       2.48%  anon_vma_interval_tree_iter_first
+       1.92%  page_referenced_one
+       1.88%  __zram_bvec_write
+       1.48%  memmove
+       1.31%  vma_interval_tree_iter_next
+
+    patch1-7
+      48.16%  lzo1x_1_do_compress (real work)
+       8.20%  page_vma_mapped_walk (overhead)
+       7.06%  _raw_spin_unlock_irq
+       2.92%  ptep_clear_flush
+       2.53%  __zram_bvec_write
+       2.11%  do_raw_spin_lock
+       2.02%  memmove
+       1.93%  lru_gen_look_around
+       1.56%  free_unref_page_list
+       1.40%  memset
+
+  Configurations:
+    no change
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Barry Song <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/memcontrol.h |  31 +++++++
+ include/linux/mmzone.h     |   6 ++
+ mm/internal.h              |   1 +
+ mm/memcontrol.c            |   1 +
+ mm/rmap.c                  |   7 ++
+ mm/swap.c                  |   4 +-
+ mm/vmscan.c                | 184 +++++++++++++++++++++++++++++++++++++
+ 7 files changed, 232 insertions(+), 2 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_
+  * - LRU isolation
+  * - lock_page_memcg()
+  * - exclusive reference
++ * - mem_cgroup_trylock_pages()
+  *
+  * For a kmem page a caller should hold an rcu read lock to protect memcg
+  * associated with a kmem page from being released.
+@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_me
+  * - LRU isolation
+  * - lock_page_memcg()
+  * - exclusive reference
++ * - mem_cgroup_trylock_pages()
+  *
+  * For a kmem page a caller should hold an rcu read lock to protect memcg
+  * associated with a kmem page from being released.
+@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page
+ 
+ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+ 
++/* try to stabilize page_memcg() for all the pages in a memcg */
++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
++{
++	rcu_read_lock();
++
++	if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
++		return true;
++
++	rcu_read_unlock();
++	return false;
++}
++
++static inline void mem_cgroup_unlock_pages(void)
++{
++	rcu_read_unlock();
++}
++
+ /* idx can be of type enum memcg_stat_item or node_stat_item */
+ static inline void mod_memcg_state(struct mem_cgroup *memcg,
+ 				   int idx, int val)
+@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(str
+ {
+ }
+ 
++static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
++{
++	/* to match page_memcg_rcu() */
++	rcu_read_lock();
++	return true;
++}
++
++static inline void mem_cgroup_unlock_pages(void)
++{
++	rcu_read_unlock();
++}
++
+ static inline void mem_cgroup_handle_over_high(void)
+ {
+ }
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -352,6 +352,7 @@ enum lruvec_flags {
+ #ifndef __GENERATING_BOUNDS_H
+ 
+ struct lruvec;
++struct page_vma_mapped_walk;
+ 
+ #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+ #define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+@@ -407,6 +408,7 @@ struct lru_gen_struct {
+ };
+ 
+ void lru_gen_init_lruvec(struct lruvec *lruvec);
++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+ 
+ #ifdef CONFIG_MEMCG
+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
+@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(s
+ {
+ }
+ 
++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
++{
++}
++
+ #ifdef CONFIG_MEMCG
+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -35,6 +35,7 @@
+ void page_writeback_init(void);
+ 
+ vm_fault_t do_swap_page(struct vm_fault *vmf);
++void activate_page(struct page *page);
+ 
+ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ 		unsigned long floor, unsigned long ceiling);
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *p
+ 	 * - LRU isolation
+ 	 * - lock_page_memcg()
+ 	 * - exclusive reference
++	 * - mem_cgroup_trylock_pages()
+ 	 */
+ 	page->memcg_data = (unsigned long)memcg;
+ }
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -73,6 +73,7 @@
+ #include <linux/page_idle.h>
+ #include <linux/memremap.h>
+ #include <linux/userfaultfd_k.h>
++#include <linux/mm_inline.h>
+ 
+ #include <asm/tlbflush.h>
+ 
+@@ -793,6 +794,12 @@ static bool page_referenced_one(struct p
+ 		}
+ 
+ 		if (pvmw.pte) {
++			if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
++			    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
++				lru_gen_look_around(&pvmw);
++				referenced++;
++			}
++
+ 			if (ptep_clear_flush_young_notify(vma, address,
+ 						pvmw.pte)) {
+ 				/*
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int
+ 	return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
+ }
+ 
+-static void activate_page(struct page *page)
++void activate_page(struct page *page)
+ {
+ 	page = compound_head(page);
+ 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+@@ -345,7 +345,7 @@ static inline void activate_page_drain(i
+ {
+ }
+ 
+-static void activate_page(struct page *page)
++void activate_page(struct page *page)
+ {
+ 	struct lruvec *lruvec;
+ 
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1409,6 +1409,11 @@ retry:
+ 		if (!sc->may_unmap && page_mapped(page))
+ 			goto keep_locked;
+ 
++		/* page_update_gen() tried to promote this page? */
++		if (lru_gen_enabled() && !ignore_references &&
++		    page_mapped(page) && PageReferenced(page))
++			goto keep_locked;
++
+ 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+ 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+ 
+@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctr
+  *                          the aging
+  ******************************************************************************/
+ 
++/* promote pages accessed through page tables */
++static int page_update_gen(struct page *page, int gen)
++{
++	unsigned long new_flags, old_flags = READ_ONCE(page->flags);
++
++	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
++	VM_WARN_ON_ONCE(!rcu_read_lock_held());
++
++	do {
++		/* lru_gen_del_page() has isolated this page? */
++		if (!(old_flags & LRU_GEN_MASK)) {
++			/* for shrink_page_list() */
++			new_flags = old_flags | BIT(PG_referenced);
++			continue;
++		}
++
++		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
++		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
++	} while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
++
++	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++}
++
+ /* protect pages accessed multiple times through file descriptors */
+ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
+ {
+@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *l
+ 	VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
+ 
+ 	do {
++		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
++		/* page_update_gen() has promoted this page? */
++		if (new_gen >= 0 && new_gen != old_gen)
++			return new_gen;
++
+ 		new_gen = (old_gen + 1) % MAX_NR_GENS;
+ 
+ 		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *l
+ 	return new_gen;
+ }
+ 
++static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
++{
++	unsigned long pfn = pte_pfn(pte);
++
++	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
++
++	if (!pte_present(pte) || is_zero_pfn(pfn))
++		return -1;
++
++	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
++		return -1;
++
++	if (WARN_ON_ONCE(!pfn_valid(pfn)))
++		return -1;
++
++	return pfn;
++}
++
++static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
++				 struct pglist_data *pgdat)
++{
++	struct page *page;
++
++	/* try to avoid unnecessary memory loads */
++	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
++		return NULL;
++
++	page = compound_head(pfn_to_page(pfn));
++	if (page_to_nid(page) != pgdat->node_id)
++		return NULL;
++
++	if (page_memcg_rcu(page) != memcg)
++		return NULL;
++
++	return page;
++}
++
+ static void inc_min_seq(struct lruvec *lruvec, int type)
+ {
+ 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pgli
+ 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+ }
+ 
++/*
++ * This function exploits spatial locality when shrink_page_list() walks the
++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
++ */
++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
++{
++	int i;
++	pte_t *pte;
++	unsigned long start;
++	unsigned long end;
++	unsigned long addr;
++	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
++	struct page *page = pvmw->page;
++	struct mem_cgroup *memcg = page_memcg(page);
++	struct pglist_data *pgdat = page_pgdat(page);
++	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
++	DEFINE_MAX_SEQ(lruvec);
++	int old_gen, new_gen = lru_gen_from_seq(max_seq);
++
++	lockdep_assert_held(pvmw->ptl);
++	VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
++
++	if (spin_is_contended(pvmw->ptl))
++		return;
++
++	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
++	end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
++
++	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
++		if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
++			end = start + MIN_LRU_BATCH * PAGE_SIZE;
++		else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
++			start = end - MIN_LRU_BATCH * PAGE_SIZE;
++		else {
++			start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
++			end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
++		}
++	}
++
++	pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
++
++	rcu_read_lock();
++	arch_enter_lazy_mmu_mode();
++
++	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
++		unsigned long pfn;
++
++		pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
++		if (pfn == -1)
++			continue;
++
++		if (!pte_young(pte[i]))
++			continue;
++
++		page = get_pfn_page(pfn, memcg, pgdat);
++		if (!page)
++			continue;
++
++		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
++			VM_WARN_ON_ONCE(true);
++
++		if (pte_dirty(pte[i]) && !PageDirty(page) &&
++		    !(PageAnon(page) && PageSwapBacked(page) &&
++		      !PageSwapCache(page)))
++			set_page_dirty(page);
++
++		old_gen = page_lru_gen(page);
++		if (old_gen < 0)
++			SetPageReferenced(page);
++		else if (old_gen != new_gen)
++			__set_bit(i, bitmap);
++	}
++
++	arch_leave_lazy_mmu_mode();
++	rcu_read_unlock();
++
++	if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
++		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
++			page = pte_page(pte[i]);
++			activate_page(page);
++		}
++		return;
++	}
++
++	/* page_update_gen() requires stable page_memcg() */
++	if (!mem_cgroup_trylock_pages(memcg))
++		return;
++
++	spin_lock_irq(&lruvec->lru_lock);
++	new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
++
++	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
++		page = compound_head(pte_page(pte[i]));
++		if (page_memcg_rcu(page) != memcg)
++			continue;
++
++		old_gen = page_update_gen(page, new_gen);
++		if (old_gen < 0 || old_gen == new_gen)
++			continue;
++
++		lru_gen_update_size(lruvec, page, old_gen, new_gen);
++	}
++
++	spin_unlock_irq(&lruvec->lru_lock);
++
++	mem_cgroup_unlock_pages();
++}
++
+ /******************************************************************************
+  *                          the eviction
+  ******************************************************************************/
+@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lru
+ 		return true;
+ 	}
+ 
++	/* promoted */
++	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
++		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
++		return true;
++	}
++
+ 	/* protected */
+ 	if (tier > tier_idx) {
+ 		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
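The window that lru_gen_look_around() scans in the hunk above can be modelled outside the kernel as follows; page and PMD sizes are assumed to be the common x86-64 values and MIN_LRU_BATCH is assumed to equal BITS_PER_LONG, so none of the constants below are authoritative:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(512 * PAGE_SIZE)	/* assumed: x86-64 with 4 KiB pages */
#define PMD_MASK	(~(PMD_SIZE - 1))
#define MIN_LRU_BATCH	64UL			/* assumed: BITS_PER_LONG */

/*
 * Clamp the look-around window to the PMD table and the VMA, then trim it to
 * MIN_LRU_BATCH pages around the faulting address, following the window
 * computation in lru_gen_look_around() above.
 */
static void look_around_window(unsigned long addr, unsigned long vm_start,
			       unsigned long vm_end, unsigned long *start,
			       unsigned long *end)
{
	*start = addr & PMD_MASK;
	if (*start < vm_start)
		*start = vm_start;

	*end = (addr | ~PMD_MASK) + 1;
	if (*end > vm_end)
		*end = vm_end;

	if (*end - *start > MIN_LRU_BATCH * PAGE_SIZE) {
		if (addr - *start < MIN_LRU_BATCH * PAGE_SIZE / 2)
			*end = *start + MIN_LRU_BATCH * PAGE_SIZE;
		else if (*end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
			*start = *end - MIN_LRU_BATCH * PAGE_SIZE;
		else {
			*start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
			*end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
		}
	}
}

int main(void)
{
	unsigned long start, end;

	/* a fault at 64 KiB into a 4 MiB VMA */
	look_around_window(0x40010000UL, 0x40000000UL, 0x40400000UL, &start, &end);
	printf("scan %lu PTEs around the faulting address\n",
	       (end - start) / PAGE_SIZE);
	return 0;
}

Under these assumptions the window never exceeds 64 PTEs, which lines up with the single-long on-stack bitmap the function uses to batch promotions.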

+ 1687 - 0
target/linux/generic/backport-6.1/020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch

@@ -0,0 +1,1687 @@
+From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:05 -0600
+Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+To further exploit spatial locality, the aging prefers to walk page tables
+to search for young PTEs and promote hot pages.  A kill switch will be
+added in the next patch to disable this behavior.  When disabled, the
+aging relies on the rmap only.
+
+NB: this behavior bears no resemblance to the page table scanning in the
+2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
+to swapcache and unmaps them.
+
+To avoid confusion, the term "iteration" specifically means the traversal
+of an entire mm_struct list; the term "walk" will be applied to page
+tables and the rmap, as usual.
+
+An mm_struct list is maintained for each memcg, and an mm_struct follows
+its owner task to the new memcg when this task is migrated.  Given an
+lruvec, the aging iterates lruvec_memcg()->mm_list and calls
+walk_page_range() with each mm_struct on this list to promote hot pages
+before it increments max_seq.
+
+When multiple page table walkers iterate the same list, each of them gets
+a unique mm_struct; therefore they can run concurrently.  Page table
+walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
+pages it left in the previous memcg will not be promoted when its current
+memcg is under reclaim.  Similarly, page table walkers will not promote
+pages from nodes other than the one under reclaim.
+
+This patch uses the following optimizations when walking page tables:
+1. It tracks the usage of mm_structs between context switches so that
+   page table walkers can skip processes that have been sleeping since
+   the last iteration.
+2. It uses generational Bloom filters to record populated branches so
+   that page table walkers can reduce their search space based on the
+   query results, e.g., to skip page tables containing mostly holes or
+   misplaced pages.
+3. It takes advantage of the accessed bit in non-leaf PMD entries when
+   CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
+4. It does not zigzag between a PGD table and the same PMD table
+   spanning multiple VMAs. IOW, it finishes all the VMAs within the
+   range of the same PMD table before it returns to a PGD table. This
+   improves the cache performance for workloads that have large
+   numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
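A user-space sketch of the two-key Bloom filter behind optimization 2 above; the hash below is an arbitrary mixer rather than the kernel's hash_ptr(), so only the two-bit set/test shape matches the implementation added later in this patch:

#include <stdio.h>
#include <stdint.h>

#define BLOOM_SHIFT	15
#define BLOOM_BITS	(1U << BLOOM_SHIFT)

static unsigned char bits[BLOOM_BITS / 8];

static void item_keys(const void *item, unsigned *k0, unsigned *k1)
{
	uint64_t h = (uint64_t)(uintptr_t)item * 0x9e3779b97f4a7c15ULL;

	*k0 = (h >> 16) & (BLOOM_BITS - 1);			/* first key */
	*k1 = (h >> (16 + BLOOM_SHIFT)) & (BLOOM_BITS - 1);	/* second key */
}

static void bloom_add(const void *item)
{
	unsigned k0, k1;

	item_keys(item, &k0, &k1);
	bits[k0 / 8] |= 1 << (k0 % 8);
	bits[k1 / 8] |= 1 << (k1 % 8);
}

/* may return 1 for items never added (false positive), never 0 for added ones */
static int bloom_maybe(const void *item)
{
	unsigned k0, k1;

	item_keys(item, &k0, &k1);
	return (bits[k0 / 8] >> (k0 % 8) & 1) && (bits[k1 / 8] >> (k1 % 8) & 1);
}

int main(void)
{
	int a, b;

	bloom_add(&a);
	printf("a: %d  b: %d\n", bloom_maybe(&a), bloom_maybe(&b));
	return 0;
}

A false positive only costs an unnecessary walk of that page-table branch, never a correctness problem, so a small fixed-size bitmap is an acceptable trade-off.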
+
+Server benchmark results:
+  Single workload:
+    fio (buffered I/O): no change
+
+  Single workload:
+    memcached (anon): +[8, 10]%
+                Ops/sec      KB/sec
+      patch1-7: 1147696.57   44640.29
+      patch1-8: 1245274.91   48435.66
+
+  Configurations:
+    no change
+
+Client benchmark results:
+  kswapd profiles:
+    patch1-7
+      48.16%  lzo1x_1_do_compress (real work)
+       8.20%  page_vma_mapped_walk (overhead)
+       7.06%  _raw_spin_unlock_irq
+       2.92%  ptep_clear_flush
+       2.53%  __zram_bvec_write
+       2.11%  do_raw_spin_lock
+       2.02%  memmove
+       1.93%  lru_gen_look_around
+       1.56%  free_unref_page_list
+       1.40%  memset
+
+    patch1-8
+      49.44%  lzo1x_1_do_compress (real work)
+       6.19%  page_vma_mapped_walk (overhead)
+       5.97%  _raw_spin_unlock_irq
+       3.13%  get_pfn_page
+       2.85%  ptep_clear_flush
+       2.42%  __zram_bvec_write
+       2.08%  do_raw_spin_lock
+       1.92%  memmove
+       1.44%  alloc_zspage
+       1.36%  memset
+
+  Configurations:
+    no change
+
+Thanks to the following developers for their efforts [3].
+  kernel test robot <[email protected]>
+
+[1] https://lwn.net/Articles/23732/
+[2] https://llvm.org/docs/ScudoHardenedAllocator.html
+[3] https://lore.kernel.org/r/[email protected]/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ fs/exec.c                  |    2 +
+ include/linux/memcontrol.h |    5 +
+ include/linux/mm_types.h   |   76 +++
+ include/linux/mmzone.h     |   56 +-
+ include/linux/swap.h       |    4 +
+ kernel/exit.c              |    1 +
+ kernel/fork.c              |    9 +
+ kernel/sched/core.c        |    1 +
+ mm/memcontrol.c            |   25 +
+ mm/vmscan.c                | 1010 +++++++++++++++++++++++++++++++++++-
+ 10 files changed, 1172 insertions(+), 17 deletions(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
+ 	active_mm = tsk->active_mm;
+ 	tsk->active_mm = mm;
+ 	tsk->mm = mm;
++	lru_gen_add_mm(mm);
+ 	/*
+ 	 * This prevents preemption while active_mm is being loaded and
+ 	 * it and mm are being updated, which could cause problems for
+@@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *m
+ 	tsk->mm->vmacache_seqnum = 0;
+ 	vmacache_flush(tsk);
+ 	task_unlock(tsk);
++	lru_gen_use_mm(mm);
+ 	if (old_mm) {
+ 		mmap_read_unlock(old_mm);
+ 		BUG_ON(active_mm != old_mm);
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -348,6 +348,11 @@ struct mem_cgroup {
+ 	struct deferred_split deferred_split_queue;
+ #endif
+ 
++#ifdef CONFIG_LRU_GEN
++	/* per-memcg mm_struct list */
++	struct lru_gen_mm_list mm_list;
++#endif
++
+ 	struct mem_cgroup_per_node *nodeinfo[];
+ };
+ 
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -580,6 +580,22 @@ struct mm_struct {
+ #ifdef CONFIG_IOMMU_SUPPORT
+ 		u32 pasid;
+ #endif
++#ifdef CONFIG_LRU_GEN
++		struct {
++			/* this mm_struct is on lru_gen_mm_list */
++			struct list_head list;
++			/*
++			 * Set when switching to this mm_struct, as a hint of
++			 * whether it has been used since the last time per-node
++			 * page table walkers cleared the corresponding bits.
++			 */
++			unsigned long bitmap;
++#ifdef CONFIG_MEMCG
++			/* points to the memcg of "owner" above */
++			struct mem_cgroup *memcg;
++#endif
++		} lru_gen;
++#endif /* CONFIG_LRU_GEN */
+ 	} __randomize_layout;
+ 
+ 	/*
+@@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(stru
+ 	return (struct cpumask *)&mm->cpu_bitmap;
+ }
+ 
++#ifdef CONFIG_LRU_GEN
++
++struct lru_gen_mm_list {
++	/* mm_struct list for page table walkers */
++	struct list_head fifo;
++	/* protects the list above */
++	spinlock_t lock;
++};
++
++void lru_gen_add_mm(struct mm_struct *mm);
++void lru_gen_del_mm(struct mm_struct *mm);
++#ifdef CONFIG_MEMCG
++void lru_gen_migrate_mm(struct mm_struct *mm);
++#endif
++
++static inline void lru_gen_init_mm(struct mm_struct *mm)
++{
++	INIT_LIST_HEAD(&mm->lru_gen.list);
++	mm->lru_gen.bitmap = 0;
++#ifdef CONFIG_MEMCG
++	mm->lru_gen.memcg = NULL;
++#endif
++}
++
++static inline void lru_gen_use_mm(struct mm_struct *mm)
++{
++	/*
++	 * When the bitmap is set, page reclaim knows this mm_struct has been
++	 * used since the last time it cleared the bitmap. So it might be worth
++	 * walking the page tables of this mm_struct to clear the accessed bit.
++	 */
++	WRITE_ONCE(mm->lru_gen.bitmap, -1);
++}
++
++#else /* !CONFIG_LRU_GEN */
++
++static inline void lru_gen_add_mm(struct mm_struct *mm)
++{
++}
++
++static inline void lru_gen_del_mm(struct mm_struct *mm)
++{
++}
++
++#ifdef CONFIG_MEMCG
++static inline void lru_gen_migrate_mm(struct mm_struct *mm)
++{
++}
++#endif
++
++static inline void lru_gen_init_mm(struct mm_struct *mm)
++{
++}
++
++static inline void lru_gen_use_mm(struct mm_struct *mm)
++{
++}
++
++#endif /* CONFIG_LRU_GEN */
++
+ struct mmu_gather;
+ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
+ extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -385,7 +385,7 @@ enum {
+  * min_seq behind.
+  *
+  * The number of pages in each generation is eventually consistent and therefore
+- * can be transiently negative.
++ * can be transiently negative when reset_batch_size() is pending.
+  */
+ struct lru_gen_struct {
+ 	/* the aging increments the youngest generation number */
+@@ -407,6 +407,53 @@ struct lru_gen_struct {
+ 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ };
+ 
++enum {
++	MM_LEAF_TOTAL,		/* total leaf entries */
++	MM_LEAF_OLD,		/* old leaf entries */
++	MM_LEAF_YOUNG,		/* young leaf entries */
++	MM_NONLEAF_TOTAL,	/* total non-leaf entries */
++	MM_NONLEAF_FOUND,	/* non-leaf entries found in Bloom filters */
++	MM_NONLEAF_ADDED,	/* non-leaf entries added to Bloom filters */
++	NR_MM_STATS
++};
++
++/* double-buffering Bloom filters */
++#define NR_BLOOM_FILTERS	2
++
++struct lru_gen_mm_state {
++	/* set to max_seq after each iteration */
++	unsigned long seq;
++	/* where the current iteration continues (inclusive) */
++	struct list_head *head;
++	/* where the last iteration ended (exclusive) */
++	struct list_head *tail;
++	/* to wait for the last page table walker to finish */
++	struct wait_queue_head wait;
++	/* Bloom filters flip after each iteration */
++	unsigned long *filters[NR_BLOOM_FILTERS];
++	/* the mm stats for debugging */
++	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
++	/* the number of concurrent page table walkers */
++	int nr_walkers;
++};
++
++struct lru_gen_mm_walk {
++	/* the lruvec under reclaim */
++	struct lruvec *lruvec;
++	/* unstable max_seq from lru_gen_struct */
++	unsigned long max_seq;
++	/* the next address within an mm to scan */
++	unsigned long next_addr;
++	/* to batch promoted pages */
++	int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++	/* to batch the mm stats */
++	int mm_stats[NR_MM_STATS];
++	/* total batched items */
++	int batched;
++	bool can_swap;
++	bool force_scan;
++};
++
+ void lru_gen_init_lruvec(struct lruvec *lruvec);
+ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+ 
+@@ -457,6 +504,8 @@ struct lruvec {
+ #ifdef CONFIG_LRU_GEN
+ 	/* evictable pages divided into generations */
+ 	struct lru_gen_struct		lrugen;
++	/* to concurrently iterate lru_gen_mm_list */
++	struct lru_gen_mm_state		mm_state;
+ #endif
+ #ifdef CONFIG_MEMCG
+ 	struct pglist_data *pgdat;
+@@ -1042,6 +1091,11 @@ typedef struct pglist_data {
+ 
+ 	unsigned long		flags;
+ 
++#ifdef CONFIG_LRU_GEN
++	/* kswap mm walk data */
++	struct lru_gen_mm_walk	mm_walk;
++#endif
++
+ 	ZONE_PADDING(_pad2_)
+ 
+ 	/* Per-node vmstats */
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -137,6 +137,10 @@ union swap_header {
+  */
+ struct reclaim_state {
+ 	unsigned long reclaimed_slab;
++#ifdef CONFIG_LRU_GEN
++	/* per-thread mm walk data */
++	struct lru_gen_mm_walk *mm_walk;
++#endif
+ };
+ 
+ #ifdef __KERNEL__
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -469,6 +469,7 @@ assign_new_owner:
+ 		goto retry;
+ 	}
+ 	WRITE_ONCE(mm->owner, c);
++	lru_gen_migrate_mm(mm);
+ 	task_unlock(c);
+ 	put_task_struct(c);
+ }
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct
+ 		goto fail_nocontext;
+ 
+ 	mm->user_ns = get_user_ns(user_ns);
++	lru_gen_init_mm(mm);
+ 	return mm;
+ 
+ fail_nocontext:
+@@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str
+ 	}
+ 	if (mm->binfmt)
+ 		module_put(mm->binfmt->module);
++	lru_gen_del_mm(mm);
+ 	mmdrop(mm);
+ }
+ 
+@@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_a
+ 		get_task_struct(p);
+ 	}
+ 
++	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
++		/* lock the task to synchronize with memcg migration */
++		task_lock(p);
++		lru_gen_add_mm(p->mm);
++		task_unlock(p);
++	}
++
+ 	wake_up_new_task(p);
+ 
+ 	/* forking complete and child started to run, tell ptracer */
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -5010,6 +5010,7 @@ context_switch(struct rq *rq, struct tas
+ 		 * finish_task_switch()'s mmdrop().
+ 		 */
+ 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
++		lru_gen_use_mm(next->mm);
+ 
+ 		if (!prev->mm) {                        // from kernel
+ 			/* will mmdrop() in finish_task_switch(). */
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void)
+ }
+ #endif
+ 
++#ifdef CONFIG_LRU_GEN
++static void mem_cgroup_attach(struct cgroup_taskset *tset)
++{
++	struct task_struct *task;
++	struct cgroup_subsys_state *css;
++
++	/* find the first leader if there is any */
++	cgroup_taskset_for_each_leader(task, css, tset)
++		break;
++
++	if (!task)
++		return;
++
++	task_lock(task);
++	if (task->mm && READ_ONCE(task->mm->owner) == task)
++		lru_gen_migrate_mm(task->mm);
++	task_unlock(task);
++}
++#else
++static void mem_cgroup_attach(struct cgroup_taskset *tset)
++{
++}
++#endif /* CONFIG_LRU_GEN */
++
+ static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+ {
+ 	if (value == PAGE_COUNTER_MAX)
+@@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys
+ 	.css_reset = mem_cgroup_css_reset,
+ 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
+ 	.can_attach = mem_cgroup_can_attach,
++	.attach = mem_cgroup_attach,
+ 	.cancel_attach = mem_cgroup_cancel_attach,
+ 	.post_attach = mem_cgroup_move_task,
+ 	.dfl_cftypes = memory_files,
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -50,6 +50,8 @@
+ #include <linux/printk.h>
+ #include <linux/dax.h>
+ #include <linux/psi.h>
++#include <linux/pagewalk.h>
++#include <linux/shmem_fs.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pg
+ 		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
+ 			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+ 
+-static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
++static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
+ {
+ 	struct pglist_data *pgdat = NODE_DATA(nid);
+ 
+@@ -2899,6 +2901,371 @@ static bool __maybe_unused seq_is_valid(
+ }
+ 
+ /******************************************************************************
++ *                          mm_struct list
++ ******************************************************************************/
++
++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
++{
++	static struct lru_gen_mm_list mm_list = {
++		.fifo = LIST_HEAD_INIT(mm_list.fifo),
++		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
++	};
++
++#ifdef CONFIG_MEMCG
++	if (memcg)
++		return &memcg->mm_list;
++#endif
++	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
++
++	return &mm_list;
++}
++
++void lru_gen_add_mm(struct mm_struct *mm)
++{
++	int nid;
++	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
++	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
++
++	VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
++#ifdef CONFIG_MEMCG
++	VM_WARN_ON_ONCE(mm->lru_gen.memcg);
++	mm->lru_gen.memcg = memcg;
++#endif
++	spin_lock(&mm_list->lock);
++
++	for_each_node_state(nid, N_MEMORY) {
++		struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++		if (!lruvec)
++			continue;
++
++		/* the first addition since the last iteration */
++		if (lruvec->mm_state.tail == &mm_list->fifo)
++			lruvec->mm_state.tail = &mm->lru_gen.list;
++	}
++
++	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
++
++	spin_unlock(&mm_list->lock);
++}
++
++void lru_gen_del_mm(struct mm_struct *mm)
++{
++	int nid;
++	struct lru_gen_mm_list *mm_list;
++	struct mem_cgroup *memcg = NULL;
++
++	if (list_empty(&mm->lru_gen.list))
++		return;
++
++#ifdef CONFIG_MEMCG
++	memcg = mm->lru_gen.memcg;
++#endif
++	mm_list = get_mm_list(memcg);
++
++	spin_lock(&mm_list->lock);
++
++	for_each_node(nid) {
++		struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++		if (!lruvec)
++			continue;
++
++		/* where the last iteration ended (exclusive) */
++		if (lruvec->mm_state.tail == &mm->lru_gen.list)
++			lruvec->mm_state.tail = lruvec->mm_state.tail->next;
++
++		/* where the current iteration continues (inclusive) */
++		if (lruvec->mm_state.head != &mm->lru_gen.list)
++			continue;
++
++		lruvec->mm_state.head = lruvec->mm_state.head->next;
++		/* the deletion ends the current iteration */
++		if (lruvec->mm_state.head == &mm_list->fifo)
++			WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
++	}
++
++	list_del_init(&mm->lru_gen.list);
++
++	spin_unlock(&mm_list->lock);
++
++#ifdef CONFIG_MEMCG
++	mem_cgroup_put(mm->lru_gen.memcg);
++	mm->lru_gen.memcg = NULL;
++#endif
++}
++
++#ifdef CONFIG_MEMCG
++void lru_gen_migrate_mm(struct mm_struct *mm)
++{
++	struct mem_cgroup *memcg;
++	struct task_struct *task = rcu_dereference_protected(mm->owner, true);
++
++	VM_WARN_ON_ONCE(task->mm != mm);
++	lockdep_assert_held(&task->alloc_lock);
++
++	/* for mm_update_next_owner() */
++	if (mem_cgroup_disabled())
++		return;
++
++	rcu_read_lock();
++	memcg = mem_cgroup_from_task(task);
++	rcu_read_unlock();
++	if (memcg == mm->lru_gen.memcg)
++		return;
++
++	VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
++	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
++
++	lru_gen_del_mm(mm);
++	lru_gen_add_mm(mm);
++}
++#endif
++
++/*
++ * Bloom filters with m=1<<15, k=2 and false positive rates of ~1/5 when
++ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
++ * bits in a bitmap, k is the number of hash functions and n is the number of
++ * inserted items.
++ *
++ * Page table walkers use one of the two filters to reduce their search space.
++ * To get rid of non-leaf entries that no longer have enough leaf entries, the
++ * aging uses the double-buffering technique to flip to the other filter each
++ * time it produces a new generation. For non-leaf entries that have enough
++ * leaf entries, the aging carries them over to the next generation in
++ * walk_pmd_range(); the eviction also reports them when walking the rmap
++ * in lru_gen_look_around().
++ *
++ * For future optimizations:
++ * 1. It's not necessary to keep both filters all the time. The spare one can be
++ *    freed after the RCU grace period and reallocated if needed again.
++ * 2. When reallocating, it's worth scaling its size according to the number
++ *    of inserted entries in the other filter, to reduce the memory overhead on
++ *    small systems and false positives on large systems.
++ * 3. Jenkins' hash function is an alternative to Knuth's.
++ */
++#define BLOOM_FILTER_SHIFT	15
++
++static inline int filter_gen_from_seq(unsigned long seq)
++{
++	return seq % NR_BLOOM_FILTERS;
++}
++
++static void get_item_key(void *item, int *key)
++{
++	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
++
++	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
++
++	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
++	key[1] = hash >> BLOOM_FILTER_SHIFT;
++}
++
++static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
++{
++	unsigned long *filter;
++	int gen = filter_gen_from_seq(seq);
++
++	filter = lruvec->mm_state.filters[gen];
++	if (filter) {
++		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
++		return;
++	}
++
++	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
++			       __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
++	WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
++}
++
++static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
++{
++	int key[2];
++	unsigned long *filter;
++	int gen = filter_gen_from_seq(seq);
++
++	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
++	if (!filter)
++		return;
++
++	get_item_key(item, key);
++
++	if (!test_bit(key[0], filter))
++		set_bit(key[0], filter);
++	if (!test_bit(key[1], filter))
++		set_bit(key[1], filter);
++}
++
++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
++{
++	int key[2];
++	unsigned long *filter;
++	int gen = filter_gen_from_seq(seq);
++
++	filter = READ_ONCE(lruvec->mm_state.filters[gen]);
++	if (!filter)
++		return true;
++
++	get_item_key(item, key);
++
++	return test_bit(key[0], filter) && test_bit(key[1], filter);
++}
++
++static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
++{
++	int i;
++	int hist;
++
++	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
++
++	if (walk) {
++		hist = lru_hist_from_seq(walk->max_seq);
++
++		for (i = 0; i < NR_MM_STATS; i++) {
++			WRITE_ONCE(lruvec->mm_state.stats[hist][i],
++				   lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
++			walk->mm_stats[i] = 0;
++		}
++	}
++
++	if (NR_HIST_GENS > 1 && last) {
++		hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
++
++		for (i = 0; i < NR_MM_STATS; i++)
++			WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
++	}
++}
++
++static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
++{
++	int type;
++	unsigned long size = 0;
++	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
++	int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
++
++	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
++		return true;
++
++	clear_bit(key, &mm->lru_gen.bitmap);
++
++	for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
++		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
++			       get_mm_counter(mm, MM_ANONPAGES) +
++			       get_mm_counter(mm, MM_SHMEMPAGES);
++	}
++
++	if (size < MIN_LRU_BATCH)
++		return true;
++
++	return !mmget_not_zero(mm);
++}
++
++static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
++			    struct mm_struct **iter)
++{
++	bool first = false;
++	bool last = true;
++	struct mm_struct *mm = NULL;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
++	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
++
++	/*
++	 * There are four interesting cases for this page table walker:
++	 * 1. It tries to start a new iteration of mm_list with a stale max_seq;
++	 *    there is nothing left to do.
++	 * 2. It's the first of the current generation, and it needs to reset
++	 *    the Bloom filter for the next generation.
++	 * 3. It reaches the end of mm_list, and it needs to increment
++	 *    mm_state->seq; the iteration is done.
++	 * 4. It's the last of the current generation, and it needs to reset the
++	 *    mm stats counters for the next generation.
++	 */
++	spin_lock(&mm_list->lock);
++
++	VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
++	VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
++	VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
++
++	if (walk->max_seq <= mm_state->seq) {
++		if (!*iter)
++			last = false;
++		goto done;
++	}
++
++	if (!mm_state->nr_walkers) {
++		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
++
++		mm_state->head = mm_list->fifo.next;
++		first = true;
++	}
++
++	while (!mm && mm_state->head != &mm_list->fifo) {
++		mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
++
++		mm_state->head = mm_state->head->next;
++
++		/* force scan for those added after the last iteration */
++		if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
++			mm_state->tail = mm_state->head;
++			walk->force_scan = true;
++		}
++
++		if (should_skip_mm(mm, walk))
++			mm = NULL;
++	}
++
++	if (mm_state->head == &mm_list->fifo)
++		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
++done:
++	if (*iter && !mm)
++		mm_state->nr_walkers--;
++	if (!*iter && mm)
++		mm_state->nr_walkers++;
++
++	if (mm_state->nr_walkers)
++		last = false;
++
++	if (*iter || last)
++		reset_mm_stats(lruvec, walk, last);
++
++	spin_unlock(&mm_list->lock);
++
++	if (mm && first)
++		reset_bloom_filter(lruvec, walk->max_seq + 1);
++
++	if (*iter)
++		mmput_async(*iter);
++
++	*iter = mm;
++
++	return last;
++}
++
++static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
++{
++	bool success = false;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
++	struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
++
++	spin_lock(&mm_list->lock);
++
++	VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
++
++	if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
++		VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
++
++		WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
++		reset_mm_stats(lruvec, NULL, true);
++		success = true;
++	}
++
++	spin_unlock(&mm_list->lock);
++
++	return success;
++}
++
++/******************************************************************************
+  *                          refault feedback loop
+  ******************************************************************************/
+ 
+@@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *l
+ 	return new_gen;
+ }
+ 
++static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
++			      int old_gen, int new_gen)
++{
++	int type = page_is_file_lru(page);
++	int zone = page_zonenum(page);
++	int delta = thp_nr_pages(page);
++
++	VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
++	VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
++
++	walk->batched++;
++
++	walk->nr_pages[old_gen][type][zone] -= delta;
++	walk->nr_pages[new_gen][type][zone] += delta;
++}
++
++static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
++{
++	int gen, type, zone;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	walk->batched = 0;
++
++	for_each_gen_type_zone(gen, type, zone) {
++		enum lru_list lru = type * LRU_INACTIVE_FILE;
++		int delta = walk->nr_pages[gen][type][zone];
++
++		if (!delta)
++			continue;
++
++		walk->nr_pages[gen][type][zone] = 0;
++		WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
++			   lrugen->nr_pages[gen][type][zone] + delta);
++
++		if (lru_gen_is_active(lruvec, gen))
++			lru += LRU_ACTIVE;
++		__update_lru_size(lruvec, lru, zone, delta);
++	}
++}
++
++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
++{
++	struct address_space *mapping;
++	struct vm_area_struct *vma = args->vma;
++	struct lru_gen_mm_walk *walk = args->private;
++
++	if (!vma_is_accessible(vma))
++		return true;
++
++	if (is_vm_hugetlb_page(vma))
++		return true;
++
++	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
++		return true;
++
++	if (vma == get_gate_vma(vma->vm_mm))
++		return true;
++
++	if (vma_is_anonymous(vma))
++		return !walk->can_swap;
++
++	if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
++		return true;
++
++	mapping = vma->vm_file->f_mapping;
++	if (mapping_unevictable(mapping))
++		return true;
++
++	if (shmem_mapping(mapping))
++		return !walk->can_swap;
++
++	/* to exclude special mappings like dax, etc. */
++	return !mapping->a_ops->readpage;
++}
++
++/*
++ * Some userspace memory allocators map many single-page VMAs. Instead of
++ * returning back to the PGD table for each of such VMAs, finish an entire PMD
++ * table to reduce zigzags and improve cache performance.
++ */
++static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
++			 unsigned long *vm_start, unsigned long *vm_end)
++{
++	unsigned long start = round_up(*vm_end, size);
++	unsigned long end = (start | ~mask) + 1;
++
++	VM_WARN_ON_ONCE(mask & size);
++	VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
++
++	while (args->vma) {
++		if (start >= args->vma->vm_end) {
++			args->vma = args->vma->vm_next;
++			continue;
++		}
++
++		if (end && end <= args->vma->vm_start)
++			return false;
++
++		if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
++			args->vma = args->vma->vm_next;
++			continue;
++		}
++
++		*vm_start = max(start, args->vma->vm_start);
++		*vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
++
++		return true;
++	}
++
++	return false;
++}
++
+ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+ {
+ 	unsigned long pfn = pte_pfn(pte);
+@@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t p
+ 	return pfn;
+ }
+ 
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
++static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
++{
++	unsigned long pfn = pmd_pfn(pmd);
++
++	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
++
++	if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
++		return -1;
++
++	if (WARN_ON_ONCE(pmd_devmap(pmd)))
++		return -1;
++
++	if (WARN_ON_ONCE(!pfn_valid(pfn)))
++		return -1;
++
++	return pfn;
++}
++#endif
++
+ static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
+-				 struct pglist_data *pgdat)
++				 struct pglist_data *pgdat, bool can_swap)
+ {
+ 	struct page *page;
+ 
+@@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigne
+ 	if (page_memcg_rcu(page) != memcg)
+ 		return NULL;
+ 
++	/* file VMAs can contain anon pages from COW */
++	if (!page_is_file_lru(page) && !can_swap)
++		return NULL;
++
+ 	return page;
+ }
+ 
++static bool suitable_to_scan(int total, int young)
++{
++	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
++
++	/* suitable if the average number of young PTEs per cacheline is >=1 */
++	return young * n >= total;
++}
++
++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
++			   struct mm_walk *args)
++{
++	int i;
++	pte_t *pte;
++	spinlock_t *ptl;
++	unsigned long addr;
++	int total = 0;
++	int young = 0;
++	struct lru_gen_mm_walk *walk = args->private;
++	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
++	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
++	int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
++
++	VM_WARN_ON_ONCE(pmd_leaf(*pmd));
++
++	ptl = pte_lockptr(args->mm, pmd);
++	if (!spin_trylock(ptl))
++		return false;
++
++	arch_enter_lazy_mmu_mode();
++
++	pte = pte_offset_map(pmd, start & PMD_MASK);
++restart:
++	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
++		unsigned long pfn;
++		struct page *page;
++
++		total++;
++		walk->mm_stats[MM_LEAF_TOTAL]++;
++
++		pfn = get_pte_pfn(pte[i], args->vma, addr);
++		if (pfn == -1)
++			continue;
++
++		if (!pte_young(pte[i])) {
++			walk->mm_stats[MM_LEAF_OLD]++;
++			continue;
++		}
++
++		page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
++		if (!page)
++			continue;
++
++		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
++			VM_WARN_ON_ONCE(true);
++
++		young++;
++		walk->mm_stats[MM_LEAF_YOUNG]++;
++
++		if (pte_dirty(pte[i]) && !PageDirty(page) &&
++		    !(PageAnon(page) && PageSwapBacked(page) &&
++		      !PageSwapCache(page)))
++			set_page_dirty(page);
++
++		old_gen = page_update_gen(page, new_gen);
++		if (old_gen >= 0 && old_gen != new_gen)
++			update_batch_size(walk, page, old_gen, new_gen);
++	}
++
++	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
++		goto restart;
++
++	pte_unmap(pte);
++
++	arch_leave_lazy_mmu_mode();
++	spin_unlock(ptl);
++
++	return suitable_to_scan(total, young);
++}
++
++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
++static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
++				  struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
++{
++	int i;
++	pmd_t *pmd;
++	spinlock_t *ptl;
++	struct lru_gen_mm_walk *walk = args->private;
++	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
++	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
++	int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
++
++	VM_WARN_ON_ONCE(pud_leaf(*pud));
++
++	/* try to batch at most 1+MIN_LRU_BATCH+1 entries */
++	if (*start == -1) {
++		*start = next;
++		return;
++	}
++
++	i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
++	if (i && i <= MIN_LRU_BATCH) {
++		__set_bit(i - 1, bitmap);
++		return;
++	}
++
++	pmd = pmd_offset(pud, *start);
++
++	ptl = pmd_lockptr(args->mm, pmd);
++	if (!spin_trylock(ptl))
++		goto done;
++
++	arch_enter_lazy_mmu_mode();
++
++	do {
++		unsigned long pfn;
++		struct page *page;
++		unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
++
++		pfn = get_pmd_pfn(pmd[i], vma, addr);
++		if (pfn == -1)
++			goto next;
++
++		if (!pmd_trans_huge(pmd[i])) {
++			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
++				pmdp_test_and_clear_young(vma, addr, pmd + i);
++			goto next;
++		}
++
++		page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
++		if (!page)
++			goto next;
++
++		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
++			goto next;
++
++		walk->mm_stats[MM_LEAF_YOUNG]++;
++
++		if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
++		    !(PageAnon(page) && PageSwapBacked(page) &&
++		      !PageSwapCache(page)))
++			set_page_dirty(page);
++
++		old_gen = page_update_gen(page, new_gen);
++		if (old_gen >= 0 && old_gen != new_gen)
++			update_batch_size(walk, page, old_gen, new_gen);
++next:
++		i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
++	} while (i <= MIN_LRU_BATCH);
++
++	arch_leave_lazy_mmu_mode();
++	spin_unlock(ptl);
++done:
++	*start = -1;
++	bitmap_zero(bitmap, MIN_LRU_BATCH);
++}
++#else
++static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
++				  struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
++{
++}
++#endif
++
++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
++			   struct mm_walk *args)
++{
++	int i;
++	pmd_t *pmd;
++	unsigned long next;
++	unsigned long addr;
++	struct vm_area_struct *vma;
++	unsigned long pos = -1;
++	struct lru_gen_mm_walk *walk = args->private;
++	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
++
++	VM_WARN_ON_ONCE(pud_leaf(*pud));
++
++	/*
++	 * Finish an entire PMD in two passes: the first only reaches to PTE
++	 * tables to avoid taking the PMD lock; the second, if necessary, takes
++	 * the PMD lock to clear the accessed bit in PMD entries.
++	 */
++	pmd = pmd_offset(pud, start & PUD_MASK);
++restart:
++	/* walk_pte_range() may call get_next_vma() */
++	vma = args->vma;
++	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
++		pmd_t val = pmd_read_atomic(pmd + i);
++
++		/* for pmd_read_atomic() */
++		barrier();
++
++		next = pmd_addr_end(addr, end);
++
++		if (!pmd_present(val) || is_huge_zero_pmd(val)) {
++			walk->mm_stats[MM_LEAF_TOTAL]++;
++			continue;
++		}
++
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++		if (pmd_trans_huge(val)) {
++			unsigned long pfn = pmd_pfn(val);
++			struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
++
++			walk->mm_stats[MM_LEAF_TOTAL]++;
++
++			if (!pmd_young(val)) {
++				walk->mm_stats[MM_LEAF_OLD]++;
++				continue;
++			}
++
++			/* try to avoid unnecessary memory loads */
++			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
++				continue;
++
++			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
++			continue;
++		}
++#endif
++		walk->mm_stats[MM_NONLEAF_TOTAL]++;
++
++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
++		if (!pmd_young(val))
++			continue;
++
++		walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
++#endif
++		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
++			continue;
++
++		walk->mm_stats[MM_NONLEAF_FOUND]++;
++
++		if (!walk_pte_range(&val, addr, next, args))
++			continue;
++
++		walk->mm_stats[MM_NONLEAF_ADDED]++;
++
++		/* carry over to the next generation */
++		update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
++	}
++
++	walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
++
++	if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
++		goto restart;
++}
++
++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
++			  struct mm_walk *args)
++{
++	int i;
++	pud_t *pud;
++	unsigned long addr;
++	unsigned long next;
++	struct lru_gen_mm_walk *walk = args->private;
++
++	VM_WARN_ON_ONCE(p4d_leaf(*p4d));
++
++	pud = pud_offset(p4d, start & P4D_MASK);
++restart:
++	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
++		pud_t val = READ_ONCE(pud[i]);
++
++		next = pud_addr_end(addr, end);
++
++		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
++			continue;
++
++		walk_pmd_range(&val, addr, next, args);
++
++		/* a racy check to curtail the waiting time */
++		if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
++			return 1;
++
++		if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
++			end = (addr | ~PUD_MASK) + 1;
++			goto done;
++		}
++	}
++
++	if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
++		goto restart;
++
++	end = round_up(end, P4D_SIZE);
++done:
++	if (!end || !args->vma)
++		return 1;
++
++	walk->next_addr = max(end, args->vma->vm_start);
++
++	return -EAGAIN;
++}
++
++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
++{
++	static const struct mm_walk_ops mm_walk_ops = {
++		.test_walk = should_skip_vma,
++		.p4d_entry = walk_pud_range,
++	};
++
++	int err;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++
++	walk->next_addr = FIRST_USER_ADDRESS;
++
++	do {
++		err = -EBUSY;
++
++		/* page_update_gen() requires stable page_memcg() */
++		if (!mem_cgroup_trylock_pages(memcg))
++			break;
++
++		/* the caller might be holding the lock for write */
++		if (mmap_read_trylock(mm)) {
++			err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
++
++			mmap_read_unlock(mm);
++		}
++
++		mem_cgroup_unlock_pages();
++
++		if (walk->batched) {
++			spin_lock_irq(&lruvec->lru_lock);
++			reset_batch_size(lruvec, walk);
++			spin_unlock_irq(&lruvec->lru_lock);
++		}
++
++		cond_resched();
++	} while (err == -EAGAIN);
++}
++
++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
++{
++	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
++
++	if (pgdat && current_is_kswapd()) {
++		VM_WARN_ON_ONCE(walk);
++
++		walk = &pgdat->mm_walk;
++	} else if (!pgdat && !walk) {
++		VM_WARN_ON_ONCE(current_is_kswapd());
++
++		walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
++	}
++
++	current->reclaim_state->mm_walk = walk;
++
++	return walk;
++}
++
++static void clear_mm_walk(void)
++{
++	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
++
++	VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
++	VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
++
++	current->reclaim_state->mm_walk = NULL;
++
++	if (!current_is_kswapd())
++		kfree(walk);
++}
++
+ static void inc_min_seq(struct lruvec *lruvec, int type)
+ {
+ 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+@@ -3136,7 +4001,7 @@ next:
+ 	return success;
+ }
+ 
+-static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
++static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+ {
+ 	int prev, next;
+ 	int type, zone;
+@@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *l
+ 
+ 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ 
+-	if (max_seq != lrugen->max_seq)
+-		goto unlock;
+-
+ 	for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ 		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ 			continue;
+@@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *l
+ 
+ 	/* make sure preceding modifications appear */
+ 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+-unlock:
++
+ 	spin_unlock_irq(&lruvec->lru_lock);
+ }
+ 
++static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
++			       struct scan_control *sc, bool can_swap)
++{
++	bool success;
++	struct lru_gen_mm_walk *walk;
++	struct mm_struct *mm = NULL;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
++
++	/* see the comment in iterate_mm_list() */
++	if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
++		success = false;
++		goto done;
++	}
++
++	/*
++	 * If the hardware doesn't automatically set the accessed bit, fallback
++	 * to lru_gen_look_around(), which only clears the accessed bit in a
++	 * handful of PTEs. Spreading the work out over a period of time usually
++	 * is less efficient, but it avoids bursty page faults.
++	 */
++	if (!arch_has_hw_pte_young()) {
++		success = iterate_mm_list_nowalk(lruvec, max_seq);
++		goto done;
++	}
++
++	walk = set_mm_walk(NULL);
++	if (!walk) {
++		success = iterate_mm_list_nowalk(lruvec, max_seq);
++		goto done;
++	}
++
++	walk->lruvec = lruvec;
++	walk->max_seq = max_seq;
++	walk->can_swap = can_swap;
++	walk->force_scan = false;
++
++	do {
++		success = iterate_mm_list(lruvec, walk, &mm);
++		if (mm)
++			walk_mm(lruvec, mm, walk);
++
++		cond_resched();
++	} while (mm);
++done:
++	if (!success) {
++		if (sc->priority <= DEF_PRIORITY - 2)
++			wait_event_killable(lruvec->mm_state.wait,
++					    max_seq < READ_ONCE(lrugen->max_seq));
++
++		return max_seq < READ_ONCE(lrugen->max_seq);
++	}
++
++	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
++
++	inc_max_seq(lruvec, can_swap);
++	/* either this sees any waiters or they will see updated max_seq */
++	if (wq_has_sleeper(&lruvec->mm_state.wait))
++		wake_up_all(&lruvec->mm_state.wait);
++
++	wakeup_flusher_threads(WB_REASON_VMSCAN);
++
++	return true;
++}
++
+ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+ 			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+ {
+@@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lr
+ 
+ 	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+ 	if (need_aging)
+-		inc_max_seq(lruvec, max_seq, swappiness);
++		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ }
+ 
+ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+@@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pgli
+ 
+ 	VM_WARN_ON_ONCE(!current_is_kswapd());
+ 
++	set_mm_walk(pgdat);
++
+ 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ 	do {
+ 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+@@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pgli
+ 
+ 		cond_resched();
+ 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++
++	clear_mm_walk();
+ }
+ 
+ /*
+  * This function exploits spatial locality when shrink_page_list() walks the
+- * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
++ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
++ * the scan was done cacheline efficiently, it adds the PMD entry pointing to
++ * the PTE table to the Bloom filter. This forms a feedback loop between the
++ * eviction and the aging.
+  */
+ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+ {
+@@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma
+ 	unsigned long start;
+ 	unsigned long end;
+ 	unsigned long addr;
++	struct lru_gen_mm_walk *walk;
++	int young = 0;
+ 	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ 	struct page *page = pvmw->page;
+ 	struct mem_cgroup *memcg = page_memcg(page);
+@@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma
+ 	if (spin_is_contended(pvmw->ptl))
+ 		return;
+ 
++	/* avoid taking the LRU lock under the PTL when possible */
++	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
++
+ 	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ 	end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+ 
+@@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma
+ 		if (!pte_young(pte[i]))
+ 			continue;
+ 
+-		page = get_pfn_page(pfn, memcg, pgdat);
++		page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
+ 		if (!page)
+ 			continue;
+ 
+ 		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ 			VM_WARN_ON_ONCE(true);
+ 
++		young++;
++
+ 		if (pte_dirty(pte[i]) && !PageDirty(page) &&
+ 		    !(PageAnon(page) && PageSwapBacked(page) &&
+ 		      !PageSwapCache(page)))
+@@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma
+ 	arch_leave_lazy_mmu_mode();
+ 	rcu_read_unlock();
+ 
+-	if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
++	/* feedback from rmap walkers to page table walkers */
++	if (suitable_to_scan(i, young))
++		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
++
++	if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ 		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ 			page = pte_page(pte[i]);
+ 			activate_page(page);
+@@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma
+ 	if (!mem_cgroup_trylock_pages(memcg))
+ 		return;
+ 
+-	spin_lock_irq(&lruvec->lru_lock);
+-	new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
++	if (!walk) {
++		spin_lock_irq(&lruvec->lru_lock);
++		new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
++	}
+ 
+ 	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ 		page = compound_head(pte_page(pte[i]));
+@@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma
+ 		if (old_gen < 0 || old_gen == new_gen)
+ 			continue;
+ 
+-		lru_gen_update_size(lruvec, page, old_gen, new_gen);
++		if (walk)
++			update_batch_size(walk, page, old_gen, new_gen);
++		else
++			lru_gen_update_size(lruvec, page, old_gen, new_gen);
+ 	}
+ 
+-	spin_unlock_irq(&lruvec->lru_lock);
++	if (!walk)
++		spin_unlock_irq(&lruvec->lru_lock);
+ 
+ 	mem_cgroup_unlock_pages();
+ }
+@@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lr
+ 	struct page *page;
+ 	enum vm_event_item item;
+ 	struct reclaim_stat stat;
++	struct lru_gen_mm_walk *walk;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ 
+@@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lr
+ 
+ 	move_pages_to_lru(lruvec, &list);
+ 
++	walk = current->reclaim_state->mm_walk;
++	if (walk && walk->batched)
++		reset_batch_size(lruvec, walk);
++
+ 	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ 	if (!cgroup_reclaim(sc))
+ 		__count_vm_events(item, reclaimed);
+@@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lr
+ 	return scanned;
+ }
+ 
++/*
++ * For future optimizations:
++ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
++ *    reclaim.
++ */
+ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+ 				    bool can_swap)
+ {
+@@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(stru
+ 	if (current_is_kswapd())
+ 		return 0;
+ 
+-	inc_max_seq(lruvec, max_seq, can_swap);
++	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
++		return nr_to_scan;
+ done:
+ 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ }
+@@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct
+ 
+ 	blk_start_plug(&plug);
+ 
++	set_mm_walk(lruvec_pgdat(lruvec));
++
+ 	while (true) {
+ 		int delta;
+ 		int swappiness;
+@@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct
+ 		cond_resched();
+ 	}
+ 
++	clear_mm_walk();
++
+ 	blk_finish_plug(&plug);
+ }
+ 
+@@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *
+ 
+ 	for_each_gen_type_zone(gen, type, zone)
+ 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
++
++	lruvec->mm_state.seq = MIN_NR_GENS;
++	init_waitqueue_head(&lruvec->mm_state.wait);
+ }
+ 
+ #ifdef CONFIG_MEMCG
+ void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
++	INIT_LIST_HEAD(&memcg->mm_list.fifo);
++	spin_lock_init(&memcg->mm_list.lock);
+ }
+ 
+ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+ {
++	int i;
+ 	int nid;
+ 
+ 	for_each_node(nid) {
+@@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgrou
+ 
+ 		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ 					   sizeof(lruvec->lrugen.nr_pages)));
++
++		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
++			bitmap_free(lruvec->mm_state.filters[i]);
++			lruvec->mm_state.filters[i] = NULL;
++		}
+ 	}
+ }
+ #endif
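
As a quick reference for the heuristic the page table walk above relies on, here is a standalone C sketch of suitable_to_scan() as used by walk_pte_range(). It is illustrative only and not part of the patch: the kernel derives n from cache_line_size()/sizeof(pte_t); the 64-byte cache line and 8-byte PTE are assumptions chosen so the snippet builds outside the kernel tree.

#include <stdbool.h>
#include <stdio.h>

#define ASSUMED_PTES_PER_CACHELINE 8 /* assumption: 64-byte lines, 8-byte PTEs */

static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* suitable if the average number of young PTEs per cacheline is >= 1 */
static bool suitable_to_scan(int total, int young)
{
	int n = clamp_int(ASSUMED_PTES_PER_CACHELINE, 2, 8);

	return young * n >= total;
}

int main(void)
{
	/* 512 PTEs scanned, 64 young: 64 * 8 >= 512, worth rescanning this PMD */
	printf("512 total, 64 young  -> %d\n", suitable_to_scan(512, 64));
	/* 512 PTEs scanned, 10 young: not worth adding to the Bloom filter */
	printf("512 total, 10 young  -> %d\n", suitable_to_scan(512, 10));
	return 0;
}

A PMD entry whose PTE table passes this test is fed into the Bloom filter for the next generation, which is the feedback loop the walk_pmd_range() comments above describe.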

+ 315 - 0
target/linux/generic/backport-6.1/020-v6.1-09-mm-multi-gen-LRU-optimize-multiple-memcgs.patch

@@ -0,0 +1,315 @@
+From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:06 -0600
+Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When multiple memcgs are available, it is possible to use generations as a
+frame of reference to make better choices and improve overall performance
+under global memory pressure.  This patch adds a basic optimization to
+select memcgs that can drop single-use unmapped clean pages first.  Doing
+so reduces the chance of going into the aging path or swapping, which can
+be costly.
+
+A typical example that benefits from this optimization is a server running
+mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
+buffered I/O workload in the other.
+
+Though this optimization can be applied to both kswapd and direct reclaim,
+it is only added to kswapd to keep the patchset manageable.  Later
+improvements may cover the direct reclaim path.
+
+While ensuring certain fairness to all eligible memcgs, proportional scans
+of individual memcgs also require proper backoff to avoid overshooting
+their aggregate reclaim target by too much.  Otherwise it can cause high
+direct reclaim latency.  The conditions for backoff are:
+
+1. At low priorities, for direct reclaim, if aging fairness or direct
+   reclaim latency is at risk, i.e., aging one memcg multiple times or
+   swapping after the target is met.
+2. At high priorities, for global reclaim, if per-zone free pages are
+   above respective watermarks.
+
+Server benchmark results:
+  Mixed workloads:
+    fio (buffered I/O): +[19, 21]%
+                IOPS         BW
+      patch1-8: 1880k        7343MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[119, 123]%
+                Ops/sec      KB/sec
+      patch1-8: 862768.65    33514.68
+      patch1-9: 1911022.12   74234.54
+
+  Mixed workloads:
+    fio (buffered I/O): +[75, 77]%
+                IOPS         BW
+      5.19-rc1: 1279k        4996MiB/s
+      patch1-9: 2252k        8796MiB/s
+
+    memcached (anon): +[13, 15]%
+                Ops/sec      KB/sec
+      5.19-rc1: 1673524.04   65008.87
+      patch1-9: 1911022.12   74234.54
+
+  Configurations:
+    (changes since patch 6)
+
+    cat mixed.sh
+    modprobe brd rd_nr=2 rd_size=56623104
+
+    swapoff -a
+    mkswap /dev/ram0
+    swapon /dev/ram0
+
+    mkfs.ext4 /dev/ram1
+    mount -t ext4 /dev/ram1 /mnt
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
+      --ratio 1:0 --pipeline 8 -d 2000
+
+    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
+      --buffered=1 --ioengine=io_uring --iodepth=128 \
+      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
+      --rw=randread --random_distribution=random --norandommap \
+      --time_based --ramp_time=10m --runtime=90m --group_reporting &
+    pid=$!
+
+    sleep 200
+
+    memtier_benchmark -S /var/run/memcached/memcached.sock \
+      -P memcache_binary -n allkeys --key-minimum=1 \
+      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
+      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
+
+    kill -INT $pid
+    wait
+
+Client benchmark results:
+  no change (CONFIG_MEMCG=n)
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 96 insertions(+), 9 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -127,6 +127,12 @@ struct scan_control {
+ 	/* Always discard instead of demoting to lower tier memory */
+ 	unsigned int no_demotion:1;
+ 
++#ifdef CONFIG_LRU_GEN
++	/* help kswapd make better choices among multiple memcgs */
++	unsigned int memcgs_need_aging:1;
++	unsigned long last_reclaimed;
++#endif
++
+ 	/* Allocation order */
+ 	s8 order;
+ 
+@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli
+ 
+ 	VM_WARN_ON_ONCE(!current_is_kswapd());
+ 
++	sc->last_reclaimed = sc->nr_reclaimed;
++
++	/*
++	 * To reduce the chance of going into the aging path, which can be
++	 * costly, optimistically skip it if the flag below was cleared in the
++	 * eviction path. This improves the overall performance when multiple
++	 * memcgs are available.
++	 */
++	if (!sc->memcgs_need_aging) {
++		sc->memcgs_need_aging = true;
++		return;
++	}
++
+ 	set_mm_walk(pgdat);
+ 
+ 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
+ 	return scanned;
+ }
+ 
+-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
++		       bool *need_swapping)
+ {
+ 	int type;
+ 	int scanned;
+@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr
+ 
+ 	sc->nr_reclaimed += reclaimed;
+ 
++	if (need_swapping && type == LRU_GEN_ANON)
++		*need_swapping = true;
++
+ 	return scanned;
+ }
+ 
+@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
+  *    reclaim.
+  */
+ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+-				    bool can_swap)
++				    bool can_swap, bool *need_aging)
+ {
+-	bool need_aging;
+ 	unsigned long nr_to_scan;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MAX_SEQ(lruvec);
+@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
+ 	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ 		return 0;
+ 
+-	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+-	if (!need_aging)
++	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
++	if (!*need_aging)
+ 		return nr_to_scan;
+ 
+ 	/* skip the aging path at the default priority */
+@@ -4715,10 +4737,68 @@ done:
+ 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ }
+ 
++static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
++			      struct scan_control *sc, bool need_swapping)
++{
++	int i;
++	DEFINE_MAX_SEQ(lruvec);
++
++	if (!current_is_kswapd()) {
++		/* age each memcg once to ensure fairness */
++		if (max_seq - seq > 1)
++			return true;
++
++		/* over-swapping can increase allocation latency */
++		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
++			return true;
++
++		/* give this thread a chance to exit and free its memory */
++		if (fatal_signal_pending(current)) {
++			sc->nr_reclaimed += MIN_LRU_BATCH;
++			return true;
++		}
++
++		if (cgroup_reclaim(sc))
++			return false;
++	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
++		return false;
++
++	/* keep scanning at low priorities to ensure fairness */
++	if (sc->priority > DEF_PRIORITY - 2)
++		return false;
++
++	/*
++	 * A minimum amount of work was done under global memory pressure. For
++	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
++	 * met, and yet the allocation may still succeed, since kswapd may have
++	 * caught up. In either case, it's better to stop now, and restart if
++	 * necessary.
++	 */
++	for (i = 0; i <= sc->reclaim_idx; i++) {
++		unsigned long wmark;
++		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
++
++		if (!managed_zone(zone))
++			continue;
++
++		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
++		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
++			return false;
++	}
++
++	sc->nr_reclaimed += MIN_LRU_BATCH;
++
++	return true;
++}
++
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ 	struct blk_plug plug;
++	bool need_aging = false;
++	bool need_swapping = false;
+ 	unsigned long scanned = 0;
++	unsigned long reclaimed = sc->nr_reclaimed;
++	DEFINE_MAX_SEQ(lruvec);
+ 
+ 	lru_add_drain();
+ 
+@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
+ 		else
+ 			swappiness = 0;
+ 
+-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
++		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ 		if (!nr_to_scan)
+-			break;
++			goto done;
+ 
+-		delta = evict_pages(lruvec, sc, swappiness);
++		delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
+ 		if (!delta)
+-			break;
++			goto done;
+ 
+ 		scanned += delta;
+ 		if (scanned >= nr_to_scan)
+ 			break;
+ 
++		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
++			break;
++
+ 		cond_resched();
+ 	}
+ 
++	/* see the comment in lru_gen_age_node() */
++	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
++		sc->memcgs_need_aging = false;
++done:
+ 	clear_mm_walk();
+ 
+ 	blk_finish_plug(&plug);
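
The backoff rules that should_abort_scan() adds above can be summarized with a small user-space model. This is a simplified sketch, not the patch code: the fatal-signal and cgroup-reclaim cases are dropped, all zone watermarks are collapsed into one flag, and DEF_PRIORITY/MIN_LRU_BATCH are assumptions matching the usual kernel defaults (12 and 64).

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY  12
#define MIN_LRU_BATCH 64

struct model_sc {
	bool kswapd;
	int priority;
	unsigned long nr_reclaimed;
	unsigned long last_reclaimed;
	unsigned long nr_to_reclaim;
	bool need_swapping;
};

static bool model_should_abort_scan(struct model_sc *sc, unsigned long seq,
				    unsigned long max_seq, bool zones_above_wmark)
{
	if (!sc->kswapd) {
		/* age each memcg once to ensure fairness */
		if (max_seq - seq > 1)
			return true;
		/* over-swapping can increase allocation latency */
		if (sc->nr_reclaimed >= sc->nr_to_reclaim && sc->need_swapping)
			return true;
	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) {
		return false;
	}

	/* keep scanning at low priorities to ensure fairness */
	if (sc->priority > DEF_PRIORITY - 2)
		return false;

	/* stop early only once every eligible zone is above its watermark */
	if (!zones_above_wmark)
		return false;

	sc->nr_reclaimed += MIN_LRU_BATCH;
	return true;
}

int main(void)
{
	struct model_sc sc = {
		.kswapd = false, .priority = DEF_PRIORITY - 3,
		.nr_reclaimed = 32, .nr_to_reclaim = 32, .need_swapping = true,
	};

	/* direct reclaim met its target and started swapping: abort (1) */
	printf("abort: %d\n", model_should_abort_scan(&sc, 4, 5, true));
	return 0;
}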

+ 498 - 0
target/linux/generic/backport-6.1/020-v6.1-10-mm-multi-gen-LRU-kill-switch.patch

@@ -0,0 +1,498 @@
+From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:07 -0600
+Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
+can be disabled include:
+  0x0001: the multi-gen LRU core
+  0x0002: walking page table, when arch_has_hw_pte_young() returns
+          true
+  0x0004: clearing the accessed bit in non-leaf PMD entries, when
+          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
+  [yYnN]: apply to all the components above
+E.g.,
+  echo y >/sys/kernel/mm/lru_gen/enabled
+  cat /sys/kernel/mm/lru_gen/enabled
+  0x0007
+  echo 5 >/sys/kernel/mm/lru_gen/enabled
+  cat /sys/kernel/mm/lru_gen/enabled
+  0x0005
+
+NB: the page table walks happen on the scale of seconds under heavy memory
+pressure, in which case the mmap_lock contention is a lesser concern,
+compared with the LRU lock contention and the I/O congestion.  So far the
+only well-known case of the mmap_lock contention happens on Android, due
+to Scudo [1] which allocates several thousand VMAs for merely a few
+hundred MBs.  The SPF and the Maple Tree also have provided their own
+assessments [2][3].  However, if walking page tables does worsen the
+mmap_lock contention, the kill switch can be used to disable it.  In this
+case the multi-gen LRU will suffer a minor performance degradation, as
+shown previously.
+
+Clearing the accessed bit in non-leaf PMD entries can also be disabled,
+since this behavior was not tested on x86 varieties other than Intel and
+AMD.
+
+[1] https://source.android.com/devices/tech/debug/scudo
+[2] https://lore.kernel.org/r/[email protected]/
+[3] https://lore.kernel.org/r/[email protected]/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/cgroup.h          |  15 ++-
+ include/linux/mm_inline.h       |  15 ++-
+ include/linux/mmzone.h          |   9 ++
+ kernel/cgroup/cgroup-internal.h |   1 -
+ mm/Kconfig                      |   6 +
+ mm/vmscan.c                     | 228 +++++++++++++++++++++++++++++++-
+ 6 files changed, 265 insertions(+), 9 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
+ 	css_put(&cgrp->self);
+ }
+ 
++extern struct mutex cgroup_mutex;
++
++static inline void cgroup_lock(void)
++{
++	mutex_lock(&cgroup_mutex);
++}
++
++static inline void cgroup_unlock(void)
++{
++	mutex_unlock(&cgroup_mutex);
++}
++
+ /**
+  * task_css_set_check - obtain a task's css_set with extra access conditions
+  * @task: the task to obtain css_set for
+@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
+  * as locks used during the cgroup_subsys::attach() methods.
+  */
+ #ifdef CONFIG_PROVE_RCU
+-extern struct mutex cgroup_mutex;
+ extern spinlock_t css_set_lock;
+ #define task_css_set_check(task, __c)					\
+ 	rcu_dereference_check((task)->cgroups,				\
+@@ -708,6 +719,8 @@ struct cgroup;
+ static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
+ static inline void css_get(struct cgroup_subsys_state *css) {}
+ static inline void css_put(struct cgroup_subsys_state *css) {}
++static inline void cgroup_lock(void) {}
++static inline void cgroup_unlock(void) {}
+ static inline int cgroup_attach_task_all(struct task_struct *from,
+ 					 struct task_struct *t) { return 0; }
+ static inline int cgroupstats_build(struct cgroupstats *stats,
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -91,10 +91,21 @@ static __always_inline enum lru_list pag
+ 
+ #ifdef CONFIG_LRU_GEN
+ 
++#ifdef CONFIG_LRU_GEN_ENABLED
+ static inline bool lru_gen_enabled(void)
+ {
+-	return true;
++	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
++
++	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
++}
++#else
++static inline bool lru_gen_enabled(void)
++{
++	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
++
++	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
+ }
++#endif
+ 
+ static inline bool lru_gen_in_fault(void)
+ {
+@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(stru
+ 
+ 	VM_WARN_ON_ONCE_PAGE(gen != -1, page);
+ 
+-	if (PageUnevictable(page))
++	if (PageUnevictable(page) || !lrugen->enabled)
+ 		return false;
+ 	/*
+ 	 * There are three common cases for this page:
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -364,6 +364,13 @@ enum {
+ 	LRU_GEN_FILE,
+ };
+ 
++enum {
++	LRU_GEN_CORE,
++	LRU_GEN_MM_WALK,
++	LRU_GEN_NONLEAF_YOUNG,
++	NR_LRU_GEN_CAPS
++};
++
+ #define MIN_LRU_BATCH		BITS_PER_LONG
+ #define MAX_LRU_BATCH		(MIN_LRU_BATCH * 64)
+ 
+@@ -405,6 +412,8 @@ struct lru_gen_struct {
+ 	/* can be modified without holding the LRU lock */
+ 	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
++	/* whether the multi-gen LRU is enabled */
++	bool enabled;
+ };
+ 
+ enum {
+--- a/kernel/cgroup/cgroup-internal.h
++++ b/kernel/cgroup/cgroup-internal.h
+@@ -165,7 +165,6 @@ struct cgroup_mgctx {
+ #define DEFINE_CGROUP_MGCTX(name)						\
+ 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+ 
+-extern struct mutex cgroup_mutex;
+ extern spinlock_t css_set_lock;
+ extern struct cgroup_subsys *cgroup_subsys[];
+ extern struct list_head cgroup_roots;
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -906,6 +906,12 @@ config LRU_GEN
+ 	help
+ 	  A high performance LRU implementation to overcommit memory.
+ 
++config LRU_GEN_ENABLED
++	bool "Enable by default"
++	depends on LRU_GEN
++	help
++	  This option enables the multi-gen LRU by default.
++
+ config LRU_GEN_STATS
+ 	bool "Full stats for debugging"
+ 	depends on LRU_GEN
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -52,6 +52,7 @@
+ #include <linux/psi.h>
+ #include <linux/pagewalk.h>
+ #include <linux/shmem_fs.h>
++#include <linux/ctype.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pg
+ 
+ #ifdef CONFIG_LRU_GEN
+ 
++#ifdef CONFIG_LRU_GEN_ENABLED
++DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
++#define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
++#else
++DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
++#define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
++#endif
++
+ /******************************************************************************
+  *                          shorthand helpers
+  ******************************************************************************/
+@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t
+ 			goto next;
+ 
+ 		if (!pmd_trans_huge(pmd[i])) {
+-			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
++			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
++			    get_cap(LRU_GEN_NONLEAF_YOUNG))
+ 				pmdp_test_and_clear_young(vma, addr, pmd + i);
+ 			goto next;
+ 		}
+@@ -3815,10 +3825,12 @@ restart:
+ 		walk->mm_stats[MM_NONLEAF_TOTAL]++;
+ 
+ #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+-		if (!pmd_young(val))
+-			continue;
++		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
++			if (!pmd_young(val))
++				continue;
+ 
+-		walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
++			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
++		}
+ #endif
+ 		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ 			continue;
+@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lr
+ 	 * handful of PTEs. Spreading the work out over a period of time usually
+ 	 * is less efficient, but it avoids bursty page faults.
+ 	 */
+-	if (!arch_has_hw_pte_young()) {
++	if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ 		success = iterate_mm_list_nowalk(lruvec, max_seq);
+ 		goto done;
+ 	}
+@@ -4846,6 +4858,208 @@ done:
+ }
+ 
+ /******************************************************************************
++ *                          state change
++ ******************************************************************************/
++
++static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
++{
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	if (lrugen->enabled) {
++		enum lru_list lru;
++
++		for_each_evictable_lru(lru) {
++			if (!list_empty(&lruvec->lists[lru]))
++				return false;
++		}
++	} else {
++		int gen, type, zone;
++
++		for_each_gen_type_zone(gen, type, zone) {
++			if (!list_empty(&lrugen->lists[gen][type][zone]))
++				return false;
++		}
++	}
++
++	return true;
++}
++
++static bool fill_evictable(struct lruvec *lruvec)
++{
++	enum lru_list lru;
++	int remaining = MAX_LRU_BATCH;
++
++	for_each_evictable_lru(lru) {
++		int type = is_file_lru(lru);
++		bool active = is_active_lru(lru);
++		struct list_head *head = &lruvec->lists[lru];
++
++		while (!list_empty(head)) {
++			bool success;
++			struct page *page = lru_to_page(head);
++
++			VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
++			VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
++			VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
++			VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
++
++			del_page_from_lru_list(page, lruvec);
++			success = lru_gen_add_page(lruvec, page, false);
++			VM_WARN_ON_ONCE(!success);
++
++			if (!--remaining)
++				return false;
++		}
++	}
++
++	return true;
++}
++
++static bool drain_evictable(struct lruvec *lruvec)
++{
++	int gen, type, zone;
++	int remaining = MAX_LRU_BATCH;
++
++	for_each_gen_type_zone(gen, type, zone) {
++		struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
++
++		while (!list_empty(head)) {
++			bool success;
++			struct page *page = lru_to_page(head);
++
++			VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
++			VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
++			VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
++			VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
++
++			success = lru_gen_del_page(lruvec, page, false);
++			VM_WARN_ON_ONCE(!success);
++			add_page_to_lru_list(page, lruvec);
++
++			if (!--remaining)
++				return false;
++		}
++	}
++
++	return true;
++}
++
++static void lru_gen_change_state(bool enabled)
++{
++	static DEFINE_MUTEX(state_mutex);
++
++	struct mem_cgroup *memcg;
++
++	cgroup_lock();
++	cpus_read_lock();
++	get_online_mems();
++	mutex_lock(&state_mutex);
++
++	if (enabled == lru_gen_enabled())
++		goto unlock;
++
++	if (enabled)
++		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
++	else
++		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
++
++	memcg = mem_cgroup_iter(NULL, NULL, NULL);
++	do {
++		int nid;
++
++		for_each_node(nid) {
++			struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++			if (!lruvec)
++				continue;
++
++			spin_lock_irq(&lruvec->lru_lock);
++
++			VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
++			VM_WARN_ON_ONCE(!state_is_valid(lruvec));
++
++			lruvec->lrugen.enabled = enabled;
++
++			while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
++				spin_unlock_irq(&lruvec->lru_lock);
++				cond_resched();
++				spin_lock_irq(&lruvec->lru_lock);
++			}
++
++			spin_unlock_irq(&lruvec->lru_lock);
++		}
++
++		cond_resched();
++	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++unlock:
++	mutex_unlock(&state_mutex);
++	put_online_mems();
++	cpus_read_unlock();
++	cgroup_unlock();
++}
++
++/******************************************************************************
++ *                          sysfs interface
++ ******************************************************************************/
++
++static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
++{
++	unsigned int caps = 0;
++
++	if (get_cap(LRU_GEN_CORE))
++		caps |= BIT(LRU_GEN_CORE);
++
++	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
++		caps |= BIT(LRU_GEN_MM_WALK);
++
++	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
++		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
++
++	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
++}
++
++static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
++			     const char *buf, size_t len)
++{
++	int i;
++	unsigned int caps;
++
++	if (tolower(*buf) == 'n')
++		caps = 0;
++	else if (tolower(*buf) == 'y')
++		caps = -1;
++	else if (kstrtouint(buf, 0, &caps))
++		return -EINVAL;
++
++	for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
++		bool enabled = caps & BIT(i);
++
++		if (i == LRU_GEN_CORE)
++			lru_gen_change_state(enabled);
++		else if (enabled)
++			static_branch_enable(&lru_gen_caps[i]);
++		else
++			static_branch_disable(&lru_gen_caps[i]);
++	}
++
++	return len;
++}
++
++static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
++	enabled, 0644, show_enabled, store_enabled
++);
++
++static struct attribute *lru_gen_attrs[] = {
++	&lru_gen_enabled_attr.attr,
++	NULL
++};
++
++static struct attribute_group lru_gen_attr_group = {
++	.name = "lru_gen",
++	.attrs = lru_gen_attrs,
++};
++
++/******************************************************************************
+  *                          initialization
+  ******************************************************************************/
+ 
+@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *
+ 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ 
+ 	lrugen->max_seq = MIN_NR_GENS + 1;
++	lrugen->enabled = lru_gen_enabled();
+ 
+ 	for_each_gen_type_zone(gen, type, zone)
+ 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void)
+ 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ 
++	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
++		pr_err("lru_gen: failed to create sysfs group\n");
++
+ 	return 0;
+ };
+ late_initcall(init_lru_gen);
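
The enabled attribute added above reports a capability bitmask rather than a plain boolean. A small user-space reader, assuming only the sysfs path and bit values given in the commit message, might decode it like this; it reads the file and never toggles anything.

#include <stdio.h>
#include <stdlib.h>

#define LRU_GEN_CORE		0x0001
#define LRU_GEN_MM_WALK		0x0002
#define LRU_GEN_NONLEAF_YOUNG	0x0004

int main(void)
{
	unsigned int caps;
	FILE *f = fopen("/sys/kernel/mm/lru_gen/enabled", "r");

	if (!f) {
		perror("open /sys/kernel/mm/lru_gen/enabled");
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%x", &caps) != 1) {
		fclose(f);
		fprintf(stderr, "unexpected format\n");
		return EXIT_FAILURE;
	}
	fclose(f);

	printf("multi-gen LRU core:        %s\n", caps & LRU_GEN_CORE ? "on" : "off");
	printf("page table walks:          %s\n", caps & LRU_GEN_MM_WALK ? "on" : "off");
	printf("non-leaf PMD accessed bit: %s\n", caps & LRU_GEN_NONLEAF_YOUNG ? "on" : "off");
	return EXIT_SUCCESS;
}

Writing back works the same way in reverse: echoing y, n, or a hex mask, as shown in the commit message, maps onto store_enabled() above.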

+ 226 - 0
target/linux/generic/backport-6.1/020-v6.1-11-mm-multi-gen-LRU-thrashing-prevention.patch

@@ -0,0 +1,226 @@
+From 73d1ff551760f0c79c47ab70faa4c2ca91413f5c Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:08 -0600
+Subject: [PATCH 11/29] mm: multi-gen LRU: thrashing prevention
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
+requested by many desktop users [1].
+
+When set to value N, it prevents the working set of N milliseconds from
+getting evicted.  The OOM killer is triggered if this working set cannot
+be kept in memory.  Based on the average human detectable lag (~100ms),
+N=1000 usually eliminates intolerable lags due to thrashing.  Larger
+values like N=3000 make lags less noticeable at the risk of premature OOM
+kills.
+
+Compared with the size-based approach [2], this time-based approach
+has the following advantages:
+
+1. It is easier to configure because it is agnostic to applications
+   and memory sizes.
+2. It is more reliable because it is directly wired to the OOM killer.
+
+[1] https://lore.kernel.org/r/Ydza%[email protected]/
+[2] https://lore.kernel.org/r/[email protected]/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Qi Zheng <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/mmzone.h |  2 ++
+ mm/vmscan.c            | 74 ++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 73 insertions(+), 3 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -399,6 +399,8 @@ struct lru_gen_struct {
+ 	unsigned long max_seq;
+ 	/* the eviction increments the oldest generation numbers */
+ 	unsigned long min_seq[ANON_AND_FILE];
++	/* the birth time of each generation in jiffies */
++	unsigned long timestamps[MAX_NR_GENS];
+ 	/* the multi-gen LRU lists, lazily sorted on eviction */
+ 	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ 	/* the multi-gen LRU sizes, eventually consistent */
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4064,6 +4064,7 @@ static void inc_max_seq(struct lruvec *l
+ 	for (type = 0; type < ANON_AND_FILE; type++)
+ 		reset_ctrl_pos(lruvec, type, false);
+ 
++	WRITE_ONCE(lrugen->timestamps[next], jiffies);
+ 	/* make sure preceding modifications appear */
+ 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+ 
+@@ -4193,7 +4194,7 @@ static bool should_run_aging(struct lruv
+ 	return false;
+ }
+ 
+-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+ {
+ 	bool need_aging;
+ 	unsigned long nr_to_scan;
+@@ -4207,16 +4208,36 @@ static void age_lruvec(struct lruvec *lr
+ 	mem_cgroup_calculate_protection(NULL, memcg);
+ 
+ 	if (mem_cgroup_below_min(memcg))
+-		return;
++		return false;
+ 
+ 	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
++
++	if (min_ttl) {
++		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
++		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
++
++		if (time_is_after_jiffies(birth + min_ttl))
++			return false;
++
++		/* the size is likely too small to be helpful */
++		if (!nr_to_scan && sc->priority != DEF_PRIORITY)
++			return false;
++	}
++
+ 	if (need_aging)
+ 		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
++
++	return true;
+ }
+ 
++/* to protect the working set of the last N jiffies */
++static unsigned long lru_gen_min_ttl __read_mostly;
++
+ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ {
+ 	struct mem_cgroup *memcg;
++	bool success = false;
++	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ 
+ 	VM_WARN_ON_ONCE(!current_is_kswapd());
+ 
+@@ -4239,12 +4260,32 @@ static void lru_gen_age_node(struct pgli
+ 	do {
+ 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ 
+-		age_lruvec(lruvec, sc);
++		if (age_lruvec(lruvec, sc, min_ttl))
++			success = true;
+ 
+ 		cond_resched();
+ 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+ 
+ 	clear_mm_walk();
++
++	/* check the order to exclude compaction-induced reclaim */
++	if (success || !min_ttl || sc->order)
++		return;
++
++	/*
++	 * The main goal is to OOM kill if every generation from all memcgs is
++	 * younger than min_ttl. However, another possibility is all memcgs are
++	 * either below min or empty.
++	 */
++	if (mutex_trylock(&oom_lock)) {
++		struct oom_control oc = {
++			.gfp_mask = sc->gfp_mask,
++		};
++
++		out_of_memory(&oc);
++
++		mutex_unlock(&oom_lock);
++	}
+ }
+ 
+ /*
+@@ -5002,6 +5043,28 @@ unlock:
+  *                          sysfs interface
+  ******************************************************************************/
+ 
++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
++}
++
++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
++			     const char *buf, size_t len)
++{
++	unsigned int msecs;
++
++	if (kstrtouint(buf, 0, &msecs))
++		return -EINVAL;
++
++	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
++
++	return len;
++}
++
++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
++	min_ttl_ms, 0644, show_min_ttl, store_min_ttl
++);
++
+ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+ {
+ 	unsigned int caps = 0;
+@@ -5050,6 +5113,7 @@ static struct kobj_attribute lru_gen_ena
+ );
+ 
+ static struct attribute *lru_gen_attrs[] = {
++	&lru_gen_min_ttl_attr.attr,
+ 	&lru_gen_enabled_attr.attr,
+ 	NULL
+ };
+@@ -5065,12 +5129,16 @@ static struct attribute_group lru_gen_at
+ 
+ void lru_gen_init_lruvec(struct lruvec *lruvec)
+ {
++	int i;
+ 	int gen, type, zone;
+ 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ 
+ 	lrugen->max_seq = MIN_NR_GENS + 1;
+ 	lrugen->enabled = lru_gen_enabled();
+ 
++	for (i = 0; i <= MIN_NR_GENS + 1; i++)
++		lrugen->timestamps[i] = jiffies;
++
+ 	for_each_gen_type_zone(gen, type, zone)
+ 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+ 
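
age_lruvec() above skips an lruvec whose oldest file generation is younger than min_ttl. A user-space model of just that timestamp check follows; it is a sketch, with jiffies replaced by plain milliseconds and the nr_to_scan fallback omitted.

#include <stdbool.h>
#include <stdio.h>

static bool min_ttl_allows_aging(unsigned long now_ms, unsigned long birth_ms,
				 unsigned long min_ttl_ms)
{
	if (!min_ttl_ms)
		return true;	/* thrashing prevention disabled */

	/* working set younger than min_ttl: protect it, skip this lruvec */
	return now_ms - birth_ms >= min_ttl_ms;
}

int main(void)
{
	/* min_ttl_ms=1000 protects roughly the last second of the working set */
	printf("%d\n", min_ttl_allows_aging(5000, 4500, 1000)); /* 0: protected */
	printf("%d\n", min_ttl_allows_aging(5000, 3500, 1000)); /* 1: old enough */
	return 0;
}

When no lruvec passes this check under global pressure, lru_gen_age_node() above falls back to the OOM killer, which is what ties min_ttl_ms to the thrashing-prevention guarantee described in the commit message.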

+ 579 - 0
target/linux/generic/backport-6.1/020-v6.1-12-mm-multi-gen-LRU-debugfs-interface.patch

@@ -0,0 +1,579 @@
+From 530716d008ca26315f246cd70dc1cefc636beaa4 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 18 Sep 2022 02:00:09 -0600
+Subject: [PATCH 12/29] mm: multi-gen LRU: debugfs interface
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add /sys/kernel/debug/lru_gen for working set estimation and proactive
+reclaim.  These techniques are commonly used to optimize job scheduling
+(bin packing) in data centers [1][2].
+
+Compared with the page table-based approach and the PFN-based
+approach, this lruvec-based approach has the following advantages:
+1. It offers better choices because it is aware of memcgs, NUMA nodes,
+   shared mappings and unmapped page cache.
+2. It is more scalable because it is O(nr_hot_pages), whereas the
+   PFN-based approach is O(nr_total_pages).
+
+Add /sys/kernel/debug/lru_gen_full for debugging.
+
+[1] https://dl.acm.org/doi/10.1145/3297858.3304053
+[2] https://dl.acm.org/doi/10.1145/3503222.3507731
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Reviewed-by: Qi Zheng <[email protected]>
+Acked-by: Brian Geffon <[email protected]>
+Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
+Acked-by: Oleksandr Natalenko <[email protected]>
+Acked-by: Steven Barrett <[email protected]>
+Acked-by: Suleiman Souhlal <[email protected]>
+Tested-by: Daniel Byrne <[email protected]>
+Tested-by: Donald Carr <[email protected]>
+Tested-by: Holger Hoffstätte <[email protected]>
+Tested-by: Konstantin Kharlamov <[email protected]>
+Tested-by: Shuang Zhai <[email protected]>
+Tested-by: Sofia Trinh <[email protected]>
+Tested-by: Vaibhav Jain <[email protected]>
+Cc: Andi Kleen <[email protected]>
+Cc: Aneesh Kumar K.V <[email protected]>
+Cc: Barry Song <[email protected]>
+Cc: Catalin Marinas <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Hillf Danton <[email protected]>
+Cc: Jens Axboe <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Linus Torvalds <[email protected]>
+Cc: Matthew Wilcox <[email protected]>
+Cc: Mel Gorman <[email protected]>
+Cc: Miaohe Lin <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Peter Zijlstra <[email protected]>
+Cc: Tejun Heo <[email protected]>
+Cc: Vlastimil Babka <[email protected]>
+Cc: Will Deacon <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/nodemask.h |   1 +
+ mm/vmscan.c              | 411 ++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 402 insertions(+), 10 deletions(-)
+
+--- a/include/linux/nodemask.h
++++ b/include/linux/nodemask.h
+@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
+ #define first_online_node	0
+ #define first_memory_node	0
+ #define next_online_node(nid)	(MAX_NUMNODES)
++#define next_memory_node(nid)	(MAX_NUMNODES)
+ #define nr_node_ids		1U
+ #define nr_online_nodes		1U
+ 
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -53,6 +53,7 @@
+ #include <linux/pagewalk.h>
+ #include <linux/shmem_fs.h>
+ #include <linux/ctype.h>
++#include <linux/debugfs.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -3968,12 +3969,40 @@ static void clear_mm_walk(void)
+ 		kfree(walk);
+ }
+ 
+-static void inc_min_seq(struct lruvec *lruvec, int type)
++static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+ {
++	int zone;
++	int remaining = MAX_LRU_BATCH;
+ 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
++
++	if (type == LRU_GEN_ANON && !can_swap)
++		goto done;
++
++	/* prevent cold/hot inversion if force_scan is true */
++	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
++		struct list_head *head = &lrugen->lists[old_gen][type][zone];
++
++		while (!list_empty(head)) {
++			struct page *page = lru_to_page(head);
++
++			VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
++			VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
++			VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
++			VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
+ 
++			new_gen = page_inc_gen(lruvec, page, false);
++			list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
++
++			if (!--remaining)
++				return false;
++		}
++	}
++done:
+ 	reset_ctrl_pos(lruvec, type, true);
+ 	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
++
++	return true;
+ }
+ 
+ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+@@ -4019,7 +4048,7 @@ next:
+ 	return success;
+ }
+ 
+-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
++static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+ {
+ 	int prev, next;
+ 	int type, zone;
+@@ -4033,9 +4062,13 @@ static void inc_max_seq(struct lruvec *l
+ 		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ 			continue;
+ 
+-		VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
++		VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
+ 
+-		inc_min_seq(lruvec, type);
++		while (!inc_min_seq(lruvec, type, can_swap)) {
++			spin_unlock_irq(&lruvec->lru_lock);
++			cond_resched();
++			spin_lock_irq(&lruvec->lru_lock);
++		}
+ 	}
+ 
+ 	/*
+@@ -4072,7 +4105,7 @@ static void inc_max_seq(struct lruvec *l
+ }
+ 
+ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+-			       struct scan_control *sc, bool can_swap)
++			       struct scan_control *sc, bool can_swap, bool force_scan)
+ {
+ 	bool success;
+ 	struct lru_gen_mm_walk *walk;
+@@ -4093,7 +4126,7 @@ static bool try_to_inc_max_seq(struct lr
+ 	 * handful of PTEs. Spreading the work out over a period of time usually
+ 	 * is less efficient, but it avoids bursty page faults.
+ 	 */
+-	if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
++	if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ 		success = iterate_mm_list_nowalk(lruvec, max_seq);
+ 		goto done;
+ 	}
+@@ -4107,7 +4140,7 @@ static bool try_to_inc_max_seq(struct lr
+ 	walk->lruvec = lruvec;
+ 	walk->max_seq = max_seq;
+ 	walk->can_swap = can_swap;
+-	walk->force_scan = false;
++	walk->force_scan = force_scan;
+ 
+ 	do {
+ 		success = iterate_mm_list(lruvec, walk, &mm);
+@@ -4127,7 +4160,7 @@ done:
+ 
+ 	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+ 
+-	inc_max_seq(lruvec, can_swap);
++	inc_max_seq(lruvec, can_swap, force_scan);
+ 	/* either this sees any waiters or they will see updated max_seq */
+ 	if (wq_has_sleeper(&lruvec->mm_state.wait))
+ 		wake_up_all(&lruvec->mm_state.wait);
+@@ -4225,7 +4258,7 @@ static bool age_lruvec(struct lruvec *lr
+ 	}
+ 
+ 	if (need_aging)
+-		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
++		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+ 
+ 	return true;
+ }
+@@ -4784,7 +4817,7 @@ static unsigned long get_nr_to_scan(stru
+ 	if (current_is_kswapd())
+ 		return 0;
+ 
+-	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
++	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
+ 		return nr_to_scan;
+ done:
+ 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+@@ -5124,6 +5157,361 @@ static struct attribute_group lru_gen_at
+ };
+ 
+ /******************************************************************************
++ *                          debugfs interface
++ ******************************************************************************/
++
++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
++{
++	struct mem_cgroup *memcg;
++	loff_t nr_to_skip = *pos;
++
++	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
++	if (!m->private)
++		return ERR_PTR(-ENOMEM);
++
++	memcg = mem_cgroup_iter(NULL, NULL, NULL);
++	do {
++		int nid;
++
++		for_each_node_state(nid, N_MEMORY) {
++			if (!nr_to_skip--)
++				return get_lruvec(memcg, nid);
++		}
++	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
++
++	return NULL;
++}
++
++static void lru_gen_seq_stop(struct seq_file *m, void *v)
++{
++	if (!IS_ERR_OR_NULL(v))
++		mem_cgroup_iter_break(NULL, lruvec_memcg(v));
++
++	kvfree(m->private);
++	m->private = NULL;
++}
++
++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	int nid = lruvec_pgdat(v)->node_id;
++	struct mem_cgroup *memcg = lruvec_memcg(v);
++
++	++*pos;
++
++	nid = next_memory_node(nid);
++	if (nid == MAX_NUMNODES) {
++		memcg = mem_cgroup_iter(NULL, memcg, NULL);
++		if (!memcg)
++			return NULL;
++
++		nid = first_memory_node;
++	}
++
++	return get_lruvec(memcg, nid);
++}
++
++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
++				  unsigned long max_seq, unsigned long *min_seq,
++				  unsigned long seq)
++{
++	int i;
++	int type, tier;
++	int hist = lru_hist_from_seq(seq);
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++
++	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
++		seq_printf(m, "            %10d", tier);
++		for (type = 0; type < ANON_AND_FILE; type++) {
++			const char *s = "   ";
++			unsigned long n[3] = {};
++
++			if (seq == max_seq) {
++				s = "RT ";
++				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
++				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
++			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
++				s = "rep";
++				n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
++				n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
++				if (tier)
++					n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
++			}
++
++			for (i = 0; i < 3; i++)
++				seq_printf(m, " %10lu%c", n[i], s[i]);
++		}
++		seq_putc(m, '\n');
++	}
++
++	seq_puts(m, "                      ");
++	for (i = 0; i < NR_MM_STATS; i++) {
++		const char *s = "      ";
++		unsigned long n = 0;
++
++		if (seq == max_seq && NR_HIST_GENS == 1) {
++			s = "LOYNFA";
++			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
++		} else if (seq != max_seq && NR_HIST_GENS > 1) {
++			s = "loynfa";
++			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
++		}
++
++		seq_printf(m, " %10lu%c", n, s[i]);
++	}
++	seq_putc(m, '\n');
++}
++
++static int lru_gen_seq_show(struct seq_file *m, void *v)
++{
++	unsigned long seq;
++	bool full = !debugfs_real_fops(m->file)->write;
++	struct lruvec *lruvec = v;
++	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	int nid = lruvec_pgdat(lruvec)->node_id;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	DEFINE_MAX_SEQ(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
++
++	if (nid == first_memory_node) {
++		const char *path = memcg ? m->private : "";
++
++#ifdef CONFIG_MEMCG
++		if (memcg)
++			cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
++#endif
++		seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
++	}
++
++	seq_printf(m, " node %5d\n", nid);
++
++	if (!full)
++		seq = min_seq[LRU_GEN_ANON];
++	else if (max_seq >= MAX_NR_GENS)
++		seq = max_seq - MAX_NR_GENS + 1;
++	else
++		seq = 0;
++
++	for (; seq <= max_seq; seq++) {
++		int type, zone;
++		int gen = lru_gen_from_seq(seq);
++		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
++
++		seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
++
++		for (type = 0; type < ANON_AND_FILE; type++) {
++			unsigned long size = 0;
++			char mark = full && seq < min_seq[type] ? 'x' : ' ';
++
++			for (zone = 0; zone < MAX_NR_ZONES; zone++)
++				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
++
++			seq_printf(m, " %10lu%c", size, mark);
++		}
++
++		seq_putc(m, '\n');
++
++		if (full)
++			lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
++	}
++
++	return 0;
++}
++
++static const struct seq_operations lru_gen_seq_ops = {
++	.start = lru_gen_seq_start,
++	.stop = lru_gen_seq_stop,
++	.next = lru_gen_seq_next,
++	.show = lru_gen_seq_show,
++};
++
++static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
++		     bool can_swap, bool force_scan)
++{
++	DEFINE_MAX_SEQ(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
++
++	if (seq < max_seq)
++		return 0;
++
++	if (seq > max_seq)
++		return -EINVAL;
++
++	if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
++		return -ERANGE;
++
++	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
++
++	return 0;
++}
++
++static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
++			int swappiness, unsigned long nr_to_reclaim)
++{
++	DEFINE_MAX_SEQ(lruvec);
++
++	if (seq + MIN_NR_GENS > max_seq)
++		return -EINVAL;
++
++	sc->nr_reclaimed = 0;
++
++	while (!signal_pending(current)) {
++		DEFINE_MIN_SEQ(lruvec);
++
++		if (seq < min_seq[!swappiness])
++			return 0;
++
++		if (sc->nr_reclaimed >= nr_to_reclaim)
++			return 0;
++
++		if (!evict_pages(lruvec, sc, swappiness, NULL))
++			return 0;
++
++		cond_resched();
++	}
++
++	return -EINTR;
++}
++
++static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
++		   struct scan_control *sc, int swappiness, unsigned long opt)
++{
++	struct lruvec *lruvec;
++	int err = -EINVAL;
++	struct mem_cgroup *memcg = NULL;
++
++	if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
++		return -EINVAL;
++
++	if (!mem_cgroup_disabled()) {
++		rcu_read_lock();
++		memcg = mem_cgroup_from_id(memcg_id);
++#ifdef CONFIG_MEMCG
++		if (memcg && !css_tryget(&memcg->css))
++			memcg = NULL;
++#endif
++		rcu_read_unlock();
++
++		if (!memcg)
++			return -EINVAL;
++	}
++
++	if (memcg_id != mem_cgroup_id(memcg))
++		goto done;
++
++	lruvec = get_lruvec(memcg, nid);
++
++	if (swappiness < 0)
++		swappiness = get_swappiness(lruvec, sc);
++	else if (swappiness > 200)
++		goto done;
++
++	switch (cmd) {
++	case '+':
++		err = run_aging(lruvec, seq, sc, swappiness, opt);
++		break;
++	case '-':
++		err = run_eviction(lruvec, seq, sc, swappiness, opt);
++		break;
++	}
++done:
++	mem_cgroup_put(memcg);
++
++	return err;
++}
++
++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
++				 size_t len, loff_t *pos)
++{
++	void *buf;
++	char *cur, *next;
++	unsigned int flags;
++	struct blk_plug plug;
++	int err = -EINVAL;
++	struct scan_control sc = {
++		.may_writepage = true,
++		.may_unmap = true,
++		.may_swap = true,
++		.reclaim_idx = MAX_NR_ZONES - 1,
++		.gfp_mask = GFP_KERNEL,
++	};
++
++	buf = kvmalloc(len + 1, GFP_KERNEL);
++	if (!buf)
++		return -ENOMEM;
++
++	if (copy_from_user(buf, src, len)) {
++		kvfree(buf);
++		return -EFAULT;
++	}
++
++	set_task_reclaim_state(current, &sc.reclaim_state);
++	flags = memalloc_noreclaim_save();
++	blk_start_plug(&plug);
++	if (!set_mm_walk(NULL)) {
++		err = -ENOMEM;
++		goto done;
++	}
++
++	next = buf;
++	next[len] = '\0';
++
++	while ((cur = strsep(&next, ",;\n"))) {
++		int n;
++		int end;
++		char cmd;
++		unsigned int memcg_id;
++		unsigned int nid;
++		unsigned long seq;
++		unsigned int swappiness = -1;
++		unsigned long opt = -1;
++
++		cur = skip_spaces(cur);
++		if (!*cur)
++			continue;
++
++		n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
++			   &seq, &end, &swappiness, &end, &opt, &end);
++		if (n < 4 || cur[end]) {
++			err = -EINVAL;
++			break;
++		}
++
++		err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
++		if (err)
++			break;
++	}
++done:
++	clear_mm_walk();
++	blk_finish_plug(&plug);
++	memalloc_noreclaim_restore(flags);
++	set_task_reclaim_state(current, NULL);
++
++	kvfree(buf);
++
++	return err ? : len;
++}
++
++static int lru_gen_seq_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &lru_gen_seq_ops);
++}
++
++static const struct file_operations lru_gen_rw_fops = {
++	.open = lru_gen_seq_open,
++	.read = seq_read,
++	.write = lru_gen_seq_write,
++	.llseek = seq_lseek,
++	.release = seq_release,
++};
++
++static const struct file_operations lru_gen_ro_fops = {
++	.open = lru_gen_seq_open,
++	.read = seq_read,
++	.llseek = seq_lseek,
++	.release = seq_release,
++};
++
++/******************************************************************************
+  *                          initialization
+  ******************************************************************************/
+ 
+@@ -5180,6 +5568,9 @@ static int __init init_lru_gen(void)
+ 	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ 		pr_err("lru_gen: failed to create sysfs group\n");
+ 
++	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
++	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
++
+ 	return 0;
+ };
+ late_initcall(init_lru_gen);
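The write format accepted by lru_gen_seq_write() above is
"cmd memcg_id node_id seq [swappiness [opt]]", where '+' runs aging
(run_aging(), opt = force_scan) and '-' runs eviction (run_eviction(),
opt = nr_to_reclaim). A rough usage sketch; the memcg ID, node and seq
values below are placeholders and should be read from the dump first:

  # per-memcg, per-node generations with their ages in ms
  cat /sys/kernel/debug/lru_gen

  # same, plus refault/eviction counters per tier and mm walk stats
  cat /sys/kernel/debug/lru_gen_full

  # working set estimation: trigger aging for memcg 1 on node 0; seq must
  # equal the current max_seq (4 here), swappiness 0, force_scan 1
  echo '+ 1 0 4 0 1' > /sys/kernel/debug/lru_gen

  # proactive reclaim: evict generations up to seq 2 in memcg 1 on node 0,
  # swappiness 0, at most 4096 pages; seq must trail max_seq by MIN_NR_GENS
  echo '- 1 0 2 0 4096' > /sys/kernel/debug/lru_gen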

+ 32 - 0
target/linux/generic/backport-6.1/020-v6.1-13-mm-mglru-don-t-sync-disk-for-each-aging-cycle.patch

@@ -0,0 +1,32 @@
+From 92d430e8955c976eacb7cc91d7ff849c0dd009af Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 28 Sep 2022 13:36:58 -0600
+Subject: [PATCH 13/29] mm/mglru: don't sync disk for each aging cycle
+
+wakeup_flusher_threads() was added under the assumption that if a system
+runs out of clean cold pages, it might want to write back dirty pages more
+aggressively so that they can become clean and be dropped.
+
+However, doing so can breach the rate limit a system wants to impose on
+writeback, resulting in early SSD wearout.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
+Signed-off-by: Yu Zhao <[email protected]>
+Reported-by: Axel Rasmussen <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4165,8 +4165,6 @@ done:
+ 	if (wq_has_sleeper(&lruvec->mm_state.wait))
+ 		wake_up_all(&lruvec->mm_state.wait);
+ 
+-	wakeup_flusher_threads(WB_REASON_VMSCAN);
+-
+ 	return true;
+ }
+ 

+ 124 - 0
target/linux/generic/backport-6.1/020-v6.1-14-mm-multi-gen-LRU-retry-pages-written-back-while-isol.patch

@@ -0,0 +1,124 @@
+From 6f315879ad750391a0b1fab8c9170bc054a5f5d7 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Tue, 15 Nov 2022 18:38:07 -0700
+Subject: [PATCH 14/29] mm: multi-gen LRU: retry pages written back while
+ isolated
+
+The page reclaim isolates a batch of pages from the tail of one of the
+LRU lists and works on those pages one by one.  For a suitable
+swap-backed page, if the swap device is async, it queues that page for
+writeback.  After the page reclaim finishes an entire batch, it puts back
+the pages it queued for writeback to the head of the original LRU list.
+
+In the meantime, the page writeback flushes the queued pages also by
+batches.  Its batching logic is independent from that of the page reclaim.
+For each of the pages it writes back, the page writeback calls
+rotate_reclaimable_page() which tries to rotate a page to the tail.
+
+rotate_reclaimable_page() only works for a page after the page reclaim
+has put it back.  If an async swap device is fast enough, the page
+writeback can finish with that page while the page reclaim is still
+working on the rest of the batch containing it.  In this case, that page
+will remain at the head and the page reclaim will not retry it before
+reaching there.
+
+This patch adds a retry to evict_pages().  After evict_pages() has
+finished an entire batch and before it puts back pages it cannot free
+immediately, it retries those that may have missed the rotation.
+
+Before this patch, ~60% of pages swapped to an Intel Optane missed
+rotate_reclaimable_page().  After this patch, ~99% of missed pages were
+reclaimed upon retry.
+
+This problem affects relatively slow async swap devices like Samsung 980
+Pro much less and does not affect sync swap devices like zram or zswap at
+all.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: "Yin, Fengwei" <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
+ 1 file changed, 37 insertions(+), 11 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4723,10 +4723,13 @@ static int evict_pages(struct lruvec *lr
+ 	int scanned;
+ 	int reclaimed;
+ 	LIST_HEAD(list);
++	LIST_HEAD(clean);
+ 	struct page *page;
++	struct page *next;
+ 	enum vm_event_item item;
+ 	struct reclaim_stat stat;
+ 	struct lru_gen_mm_walk *walk;
++	bool skip_retry = false;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ 
+@@ -4743,20 +4746,37 @@ static int evict_pages(struct lruvec *lr
+ 
+ 	if (list_empty(&list))
+ 		return scanned;
+-
++retry:
+ 	reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
++	sc->nr_reclaimed += reclaimed;
+ 
+-	list_for_each_entry(page, &list, lru) {
+-		/* restore LRU_REFS_FLAGS cleared by isolate_page() */
+-		if (PageWorkingset(page))
+-			SetPageReferenced(page);
++	list_for_each_entry_safe_reverse(page, next, &list, lru) {
++		if (!page_evictable(page)) {
++			list_del(&page->lru);
++			putback_lru_page(page);
++			continue;
++		}
+ 
+-		/* don't add rejected pages to the oldest generation */
+ 		if (PageReclaim(page) &&
+-		    (PageDirty(page) || PageWriteback(page)))
+-			ClearPageActive(page);
+-		else
+-			SetPageActive(page);
++		    (PageDirty(page) || PageWriteback(page))) {
++			/* restore LRU_REFS_FLAGS cleared by isolate_page() */
++			if (PageWorkingset(page))
++				SetPageReferenced(page);
++			continue;
++		}
++
++		if (skip_retry || PageActive(page) || PageReferenced(page) ||
++		    page_mapped(page) || PageLocked(page) ||
++		    PageDirty(page) || PageWriteback(page)) {
++			/* don't add rejected pages to the oldest generation */
++			set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
++				      BIT(PG_active));
++			continue;
++		}
++
++		/* retry pages that may have missed rotate_reclaimable_page() */
++		list_move(&page->lru, &clean);
++		sc->nr_scanned -= thp_nr_pages(page);
+ 	}
+ 
+ 	spin_lock_irq(&lruvec->lru_lock);
+@@ -4778,7 +4798,13 @@ static int evict_pages(struct lruvec *lr
+ 	mem_cgroup_uncharge_list(&list);
+ 	free_unref_page_list(&list);
+ 
+-	sc->nr_reclaimed += reclaimed;
++	INIT_LIST_HEAD(&list);
++	list_splice_init(&clean, &list);
++
++	if (!list_empty(&list)) {
++		skip_retry = true;
++		goto retry;
++	}
+ 
+ 	if (need_swapping && type == LRU_GEN_ANON)
+ 		*need_swapping = true;

+ 49 - 0
target/linux/generic/backport-6.1/020-v6.1-15-mm-multi-gen-LRU-move-lru_gen_add_mm-out-of-IRQ-off-.patch

@@ -0,0 +1,49 @@
+From 255bb0ac393f1c2818cd75af45a9226300ab3daf Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <[email protected]>
+Date: Wed, 26 Oct 2022 15:48:30 +0200
+Subject: [PATCH 15/29] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off
+ region
+
+lru_gen_add_mm() has been added within an IRQ-off region in the commit
+mentioned below.  The other invocations of lru_gen_add_mm() are not within
+an IRQ-off region.
+
+The invocation within IRQ-off region is problematic on PREEMPT_RT because
+the function is using a spin_lock_t which must not be used within
+IRQ-disabled regions.
+
+The other invocations of lru_gen_add_mm() occur while
+task_struct::alloc_lock is acquired.  Move lru_gen_add_mm() after
+interrupts are enabled and before task_unlock().
+
+Link: https://lkml.kernel.org/r/[email protected]
+Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks")
+Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
+Acked-by: Yu Zhao <[email protected]>
+Cc: Al Viro <[email protected]>
+Cc: "Eric W . Biederman" <[email protected]>
+Cc: Kees Cook <[email protected]>
+Cc: Thomas Gleixner <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ fs/exec.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1013,7 +1013,6 @@ static int exec_mmap(struct mm_struct *m
+ 	active_mm = tsk->active_mm;
+ 	tsk->active_mm = mm;
+ 	tsk->mm = mm;
+-	lru_gen_add_mm(mm);
+ 	/*
+ 	 * This prevents preemption while active_mm is being loaded and
+ 	 * it and mm are being updated, which could cause problems for
+@@ -1028,6 +1027,7 @@ static int exec_mmap(struct mm_struct *m
+ 		local_irq_enable();
+ 	tsk->mm->vmacache_seqnum = 0;
+ 	vmacache_flush(tsk);
++	lru_gen_add_mm(mm);
+ 	task_unlock(tsk);
+ 	lru_gen_use_mm(mm);
+ 	if (old_mm) {

+ 96 - 0
target/linux/generic/backport-6.1/020-v6.1-17-mm-add-dummy-pmd_young-for-architectures-not-having-.patch

@@ -0,0 +1,96 @@
+From c5ec455ebd2b488d91de9d8915a0c8036a2a04dd Mon Sep 17 00:00:00 2001
+From: Juergen Gross <[email protected]>
+Date: Wed, 30 Nov 2022 14:49:41 -0800
+Subject: [PATCH 17/29] mm: add dummy pmd_young() for architectures not having
+ it
+
+In order to avoid #ifdeffery add a dummy pmd_young() implementation as a
+fallback.  This is required for the later patch "mm: introduce
+arch_has_hw_nonleaf_pmd_young()".
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Juergen Gross <[email protected]>
+Acked-by: Yu Zhao <[email protected]>
+Cc: Borislav Petkov <[email protected]>
+Cc: Dave Hansen <[email protected]>
+Cc: Geert Uytterhoeven <[email protected]>
+Cc: "H. Peter Anvin" <[email protected]>
+Cc: Ingo Molnar <[email protected]>
+Cc: Sander Eikelenboom <[email protected]>
+Cc: Thomas Gleixner <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ arch/mips/include/asm/pgtable.h     | 1 +
+ arch/riscv/include/asm/pgtable.h    | 1 +
+ arch/s390/include/asm/pgtable.h     | 1 +
+ arch/sparc/include/asm/pgtable_64.h | 1 +
+ arch/x86/include/asm/pgtable.h      | 1 +
+ include/linux/pgtable.h             | 7 +++++++
+ 6 files changed, 12 insertions(+)
+
+--- a/arch/mips/include/asm/pgtable.h
++++ b/arch/mips/include/asm/pgtable.h
+@@ -632,6 +632,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pm
+ 	return pmd;
+ }
+ 
++#define pmd_young pmd_young
+ static inline int pmd_young(pmd_t pmd)
+ {
+ 	return !!(pmd_val(pmd) & _PAGE_ACCESSED);
+--- a/arch/riscv/include/asm/pgtable.h
++++ b/arch/riscv/include/asm/pgtable.h
+@@ -535,6 +535,7 @@ static inline int pmd_dirty(pmd_t pmd)
+ 	return pte_dirty(pmd_pte(pmd));
+ }
+ 
++#define pmd_young pmd_young
+ static inline int pmd_young(pmd_t pmd)
+ {
+ 	return pte_young(pmd_pte(pmd));
+--- a/arch/s390/include/asm/pgtable.h
++++ b/arch/s390/include/asm/pgtable.h
+@@ -748,6 +748,7 @@ static inline int pmd_dirty(pmd_t pmd)
+ 	return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
+ }
+ 
++#define pmd_young pmd_young
+ static inline int pmd_young(pmd_t pmd)
+ {
+ 	return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
+--- a/arch/sparc/include/asm/pgtable_64.h
++++ b/arch/sparc/include/asm/pgtable_64.h
+@@ -712,6 +712,7 @@ static inline unsigned long pmd_dirty(pm
+ 	return pte_dirty(pte);
+ }
+ 
++#define pmd_young pmd_young
+ static inline unsigned long pmd_young(pmd_t pmd)
+ {
+ 	pte_t pte = __pte(pmd_val(pmd));
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -136,6 +136,7 @@ static inline int pmd_dirty(pmd_t pmd)
+ 	return pmd_flags(pmd) & _PAGE_DIRTY;
+ }
+ 
++#define pmd_young pmd_young
+ static inline int pmd_young(pmd_t pmd)
+ {
+ 	return pmd_flags(pmd) & _PAGE_ACCESSED;
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -164,6 +164,13 @@ static inline pte_t *virt_to_kpte(unsign
+ 	return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
+ }
+ 
++#ifndef pmd_young
++static inline int pmd_young(pmd_t pmd)
++{
++	return 0;
++}
++#endif
++
+ #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+ extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ 				 unsigned long address, pte_t *ptep,

+ 113 - 0
target/linux/generic/backport-6.1/020-v6.1-18-mm-introduce-arch_has_hw_nonleaf_pmd_young.patch

@@ -0,0 +1,113 @@
+From 46cbda7b65998a5af4493f745d94417af697bd68 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <[email protected]>
+Date: Wed, 23 Nov 2022 07:45:10 +0100
+Subject: [PATCH 18/29] mm: introduce arch_has_hw_nonleaf_pmd_young()
+
+When running as a Xen PV guest, commit eed9a328aa1a ("mm: x86: add
+CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation in
+pmdp_test_and_clear_young():
+
+ BUG: unable to handle page fault for address: ffff8880083374d0
+ #PF: supervisor write access in kernel mode
+ #PF: error_code(0x0003) - permissions violation
+ PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
+ Oops: 0003 [#1] PREEMPT SMP NOPTI
+ CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
+ RIP: e030:pmdp_test_and_clear_young+0x25/0x40
+
+This happens because the Xen hypervisor can't emulate direct writes to
+page table entries other than PTEs.
+
+This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
+similar to arch_has_hw_pte_young() and testing that instead of
+CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
+Signed-off-by: Juergen Gross <[email protected]>
+Reported-by: Sander Eikelenboom <[email protected]>
+Acked-by: Yu Zhao <[email protected]>
+Tested-by: Sander Eikelenboom <[email protected]>
+Acked-by: David Hildenbrand <[email protected]>	[core changes]
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ arch/x86/include/asm/pgtable.h |  8 ++++++++
+ include/linux/pgtable.h        | 11 +++++++++++
+ mm/vmscan.c                    | 10 +++++-----
+ 3 files changed, 24 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -1405,6 +1405,14 @@ static inline bool arch_has_hw_pte_young
+ 	return true;
+ }
+ 
++#ifdef CONFIG_XEN_PV
++#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
++static inline bool arch_has_hw_nonleaf_pmd_young(void)
++{
++	return !cpu_feature_enabled(X86_FEATURE_XENPV);
++}
++#endif
++
+ #endif	/* __ASSEMBLY__ */
+ 
+ #endif /* _ASM_X86_PGTABLE_H */
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -266,6 +266,17 @@ static inline int pmdp_clear_flush_young
+ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ #endif
+ 
++#ifndef arch_has_hw_nonleaf_pmd_young
++/*
++ * Return whether the accessed bit in non-leaf PMD entries is supported on the
++ * local CPU.
++ */
++static inline bool arch_has_hw_nonleaf_pmd_young(void)
++{
++	return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
++}
++#endif
++
+ #ifndef arch_has_hw_pte_young
+ /*
+  * Return whether the accessed bit is supported on the local CPU.
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3727,7 +3727,7 @@ static void walk_pmd_range_locked(pud_t
+ 			goto next;
+ 
+ 		if (!pmd_trans_huge(pmd[i])) {
+-			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
++			if (arch_has_hw_nonleaf_pmd_young() &&
+ 			    get_cap(LRU_GEN_NONLEAF_YOUNG))
+ 				pmdp_test_and_clear_young(vma, addr, pmd + i);
+ 			goto next;
+@@ -3825,14 +3825,14 @@ restart:
+ #endif
+ 		walk->mm_stats[MM_NONLEAF_TOTAL]++;
+ 
+-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+-		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
++		if (arch_has_hw_nonleaf_pmd_young() &&
++		    get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ 			if (!pmd_young(val))
+ 				continue;
+ 
+ 			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ 		}
+-#endif
++
+ 		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ 			continue;
+ 
+@@ -5132,7 +5132,7 @@ static ssize_t show_enabled(struct kobje
+ 	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ 		caps |= BIT(LRU_GEN_MM_WALK);
+ 
+-	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
++	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+ 
+ 	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
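Whether the non-leaf PMD capability is actually in effect can be observed
through the kill-switch attribute touched by the hunk above; show_enabled()
prints the caps as a hex bitmap. A quick check, assuming the mainline sysfs
location and bit assignments (0x0004 corresponding to LRU_GEN_NONLEAF_YOUNG):

  # on a Xen PV guest this should no longer report the 0x0004 bit
  cat /sys/kernel/mm/lru_gen/enabled

  # components can still be toggled explicitly, e.g. keep only the main
  # switch and the page table walk
  echo 0x0003 > /sys/kernel/mm/lru_gen/enabled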

+ 56 - 0
target/linux/generic/backport-6.1/020-v6.2-16-mm-multi-gen-LRU-fix-crash-during-cgroup-migration.patch

@@ -0,0 +1,56 @@
+From c7dfefd4bdfba3d5171038d1cc2d4160288e6ee4 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Sun, 15 Jan 2023 20:44:05 -0700
+Subject: [PATCH 16/29] mm: multi-gen LRU: fix crash during cgroup migration
+
+lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself.  This
+isn't true for the following scenario:
+
+    CPU 1                         CPU 2
+
+  clone()
+    cgroup_can_fork()
+                                cgroup_procs_write()
+    cgroup_post_fork()
+                                  task_lock()
+                                  lru_gen_migrate_mm()
+                                  task_unlock()
+    task_lock()
+    lru_gen_add_mm()
+    task_unlock()
+
+And when the above happens, the kernel crashes because of linked list
+corruption (mm_struct->lru_gen.list).
+
+Link: https://lore.kernel.org/r/[email protected]/
+Link: https://lkml.kernel.org/r/[email protected]
+Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
+Signed-off-by: Yu Zhao <[email protected]>
+Reported-by: msizanoen <[email protected]>
+Tested-by: msizanoen <[email protected]>
+Cc: <[email protected]>	[6.1+]
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3024,13 +3024,16 @@ void lru_gen_migrate_mm(struct mm_struct
+ 	if (mem_cgroup_disabled())
+ 		return;
+ 
++	/* migration can happen before addition */
++	if (!mm->lru_gen.memcg)
++		return;
++
+ 	rcu_read_lock();
+ 	memcg = mem_cgroup_from_task(task);
+ 	rcu_read_unlock();
+ 	if (memcg == mm->lru_gen.memcg)
+ 		return;
+ 
+-	VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
+ 	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
+ 
+ 	lru_gen_del_mm(mm);

+ 196 - 0
target/linux/generic/backport-6.1/020-v6.3-19-mm-add-vma_has_recency.patch

@@ -0,0 +1,196 @@
+From 6c7f552a48b49a8612786a28a2239fbc24fac289 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Fri, 30 Dec 2022 14:52:51 -0700
+Subject: [PATCH 19/29] mm: add vma_has_recency()
+
+Add vma_has_recency() to indicate whether a VMA may exhibit temporal
+locality that the LRU algorithm relies on.
+
+This function returns false for VMAs marked by VM_SEQ_READ or
+VM_RAND_READ.  While the former flag indicates linear access, i.e., a
+special case of spatial locality, both flags indicate a lack of temporal
+locality, i.e., the reuse of an area within a relatively small duration.
+
+"Recency" is chosen over "locality" to avoid confusion between temporal
+and spatial localities.
+
+Before this patch, the active/inactive LRU only ignored the accessed bit
+from VMAs marked by VM_SEQ_READ.  After this patch, the active/inactive
+LRU and MGLRU share the same logic: they both ignore the accessed bit if
+vma_has_recency() returns false.
+
+For the active/inactive LRU, the following fio test showed a [6, 8]%
+increase in IOPS when randomly accessing mapped files under memory
+pressure.
+
+  kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
+  kb=$((kb - 8*1024*1024))
+
+  modprobe brd rd_nr=1 rd_size=$kb
+  dd if=/dev/zero of=/dev/ram0 bs=1M
+
+  mkfs.ext4 /dev/ram0
+  mount /dev/ram0 /mnt/
+  swapoff -a
+
+  fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \
+      --size=8G --rw=randrw --time_based --runtime=10m \
+      --group_reporting
+
+The discussion that led to this patch is here [1].  Additional test
+results are available in that thread.
+
+[1] https://lore.kernel.org/r/Y31s%[email protected]/
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Alexander Viro <[email protected]>
+Cc: Andrea Righi <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/mm_inline.h |  9 +++++++++
+ mm/memory.c               |  8 ++++----
+ mm/rmap.c                 | 42 +++++++++++++++++----------------------
+ mm/vmscan.c               |  5 ++++-
+ 4 files changed, 35 insertions(+), 29 deletions(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -333,4 +333,13 @@ static __always_inline void del_page_fro
+ 	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
+ 			-thp_nr_pages(page));
+ }
++
++static inline bool vma_has_recency(struct vm_area_struct *vma)
++{
++	if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
++		return false;
++
++	return true;
++}
++
+ #endif
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -41,6 +41,7 @@
+ 
+ #include <linux/kernel_stat.h>
+ #include <linux/mm.h>
++#include <linux/mm_inline.h>
+ #include <linux/sched/mm.h>
+ #include <linux/sched/coredump.h>
+ #include <linux/sched/numa_balancing.h>
+@@ -1353,8 +1354,7 @@ again:
+ 					force_flush = 1;
+ 					set_page_dirty(page);
+ 				}
+-				if (pte_young(ptent) &&
+-				    likely(!(vma->vm_flags & VM_SEQ_READ)))
++				if (pte_young(ptent) && likely(vma_has_recency(vma)))
+ 					mark_page_accessed(page);
+ 			}
+ 			rss[mm_counter(page)]--;
+@@ -4795,8 +4795,8 @@ static inline void mm_account_fault(stru
+ #ifdef CONFIG_LRU_GEN
+ static void lru_gen_enter_fault(struct vm_area_struct *vma)
+ {
+-	/* the LRU algorithm doesn't apply to sequential or random reads */
+-	current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
++	/* the LRU algorithm only applies to accesses with recency */
++	current->in_lru_fault = vma_has_recency(vma);
+ }
+ 
+ static void lru_gen_exit_fault(void)
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -794,25 +794,14 @@ static bool page_referenced_one(struct p
+ 		}
+ 
+ 		if (pvmw.pte) {
+-			if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+-			    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
++			if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
+ 				lru_gen_look_around(&pvmw);
+ 				referenced++;
+ 			}
+ 
+ 			if (ptep_clear_flush_young_notify(vma, address,
+-						pvmw.pte)) {
+-				/*
+-				 * Don't treat a reference through
+-				 * a sequentially read mapping as such.
+-				 * If the page has been used in another mapping,
+-				 * we will catch it; if this other mapping is
+-				 * already gone, the unmap path will have set
+-				 * PG_referenced or activated the page.
+-				 */
+-				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
+-					referenced++;
+-			}
++						pvmw.pte))
++				referenced++;
+ 		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ 			if (pmdp_clear_flush_young_notify(vma, address,
+ 						pvmw.pmd))
+@@ -846,7 +835,20 @@ static bool invalid_page_referenced_vma(
+ 	struct page_referenced_arg *pra = arg;
+ 	struct mem_cgroup *memcg = pra->memcg;
+ 
+-	if (!mm_match_cgroup(vma->vm_mm, memcg))
++	/*
++	 * Ignore references from this mapping if it has no recency. If the
++	 * page has been used in another mapping, we will catch it; if this
++	 * other mapping is already gone, the unmap path will have set the
++	 * referenced flag or activated the page in zap_pte_range().
++	 */
++	if (!vma_has_recency(vma))
++		return true;
++
++	/*
++	 * If we are reclaiming on behalf of a cgroup, skip counting on behalf
++	 * of references from different cgroups.
++	 */
++	if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+ 		return true;
+ 
+ 	return false;
+@@ -876,6 +878,7 @@ int page_referenced(struct page *page,
+ 		.rmap_one = page_referenced_one,
+ 		.arg = (void *)&pra,
+ 		.anon_lock = page_lock_anon_vma_read,
++		.invalid_vma = invalid_page_referenced_vma,
+ 	};
+ 
+ 	*vm_flags = 0;
+@@ -891,15 +894,6 @@ int page_referenced(struct page *page,
+ 			return 1;
+ 	}
+ 
+-	/*
+-	 * If we are reclaiming on behalf of a cgroup, skip
+-	 * counting on behalf of references from different
+-	 * cgroups
+-	 */
+-	if (memcg) {
+-		rwc.invalid_vma = invalid_page_referenced_vma;
+-	}
+-
+ 	rmap_walk(page, &rwc);
+ 	*vm_flags = pra.vm_flags;
+ 
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3486,7 +3486,10 @@ static int should_skip_vma(unsigned long
+ 	if (is_vm_hugetlb_page(vma))
+ 		return true;
+ 
+-	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
++	if (!vma_has_recency(vma))
++		return true;
++
++	if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
+ 		return true;
+ 
+ 	if (vma == get_gate_vma(vma->vm_mm))

+ 125 - 0
target/linux/generic/backport-6.1/020-v6.3-20-mm-support-POSIX_FADV_NOREUSE.patch

@@ -0,0 +1,125 @@
+From 686c3d4f71de9e0e7a27f03a5617a712385f90cd Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Fri, 30 Dec 2022 14:52:52 -0700
+Subject: [PATCH 20/29] mm: support POSIX_FADV_NOREUSE
+
+This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU
+algorithm can ignore access to mapped files marked by this flag.
+
+The advantages of POSIX_FADV_NOREUSE are:
+1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the
+   default readahead behavior.
+2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and
+   therefore does not take mmap_lock.
+3. Unlike MADV_COLD, setting it has a negligible cost, regardless of
+   how many pages it affects.
+
+Its limitations are:
+1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does
+   not support range. IOW, its scope is the entire file.
+2. It currently does not ignore access through file descriptors.
+   Specifically, for the active/inactive LRU, given a file page shared
+   by two users and one of them having set POSIX_FADV_NOREUSE on the
+   file, this page will be activated upon the second user accessing
+   it. This corner case can be covered by checking POSIX_FADV_NOREUSE
+   before calling mark_page_accessed() on the read path. But it is
+   considered not worth the effort.
+
+There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1].
+This time the goal is to fill a niche: a few desktop applications, e.g.,
+large file transferring and video encoding/decoding, want fast file
+streaming with mmap() rather than direct IO.  Among those applications, an
+SVT-AV1 regression was reported when running with MGLRU [2].  The
+following test can reproduce that regression.
+
+  kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
+  kb=$((kb - 8*1024*1024))
+
+  modprobe brd rd_nr=1 rd_size=$kb
+  dd if=/dev/zero of=/dev/ram0 bs=1M
+
+  mkfs.ext4 /dev/ram0
+  mount /dev/ram0 /mnt/
+  swapoff -a
+
+  fallocate -l 8G /mnt/swapfile
+  mkswap /mnt/swapfile
+  swapon /mnt/swapfile
+
+  wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
+  7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
+  SvtAv1EncApp --preset 12 -w 3840 -h 2160 \
+               -i /mnt/Bosphorus_3840x2160.y4m
+
+For MGLRU, the following change showed a [9-11]% increase in FPS,
+which makes it on par with the active/inactive LRU.
+
+  patch Source/App/EncApp/EbAppMain.c <<EOF
+  31a32
+  > #include <fcntl.h>
+  35d35
+  < #include <fcntl.h> /* _O_BINARY */
+  117a118
+  >             posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE);
+  EOF
+
+[1] https://lore.kernel.org/r/[email protected]/
+[2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Alexander Viro <[email protected]>
+Cc: Andrea Righi <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/fs.h        | 2 ++
+ include/linux/mm_inline.h | 3 +++
+ mm/fadvise.c              | 5 ++++-
+ 3 files changed, 9 insertions(+), 1 deletion(-)
+
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -167,6 +167,8 @@ typedef int (dio_iodone_t)(struct kiocb
+ /* File is stream-like */
+ #define FMODE_STREAM		((__force fmode_t)0x200000)
+ 
++#define	FMODE_NOREUSE		((__force fmode_t)0x400000)
++
+ /* File was opened by fanotify and shouldn't generate fanotify events */
+ #define FMODE_NONOTIFY		((__force fmode_t)0x4000000)
+ 
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -339,6 +339,9 @@ static inline bool vma_has_recency(struc
+ 	if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
+ 		return false;
+ 
++	if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
++		return false;
++
+ 	return true;
+ }
+ 
+--- a/mm/fadvise.c
++++ b/mm/fadvise.c
+@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, l
+ 	case POSIX_FADV_NORMAL:
+ 		file->f_ra.ra_pages = bdi->ra_pages;
+ 		spin_lock(&file->f_lock);
+-		file->f_mode &= ~FMODE_RANDOM;
++		file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
+ 		spin_unlock(&file->f_lock);
+ 		break;
+ 	case POSIX_FADV_RANDOM:
+@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, l
+ 		force_page_cache_readahead(mapping, file, start_index, nrpages);
+ 		break;
+ 	case POSIX_FADV_NOREUSE:
++		spin_lock(&file->f_lock);
++		file->f_mode |= FMODE_NOREUSE;
++		spin_unlock(&file->f_lock);
+ 		break;
+ 	case POSIX_FADV_DONTNEED:
+ 		if (!inode_write_congested(mapping->host))

+ 348 - 0
target/linux/generic/backport-6.1/020-v6.3-21-mm-multi-gen-LRU-rename-lru_gen_struct-to-lru_gen_pa.patch

@@ -0,0 +1,348 @@
+From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:18:59 -0700
+Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to
+ lru_gen_page
+
+Patch series "mm: multi-gen LRU: memcg LRU", v3.
+
+Overview
+========
+
+An memcg LRU is a per-node LRU of memcgs.  It is also an LRU of LRUs,
+since each node and memcg combination has an LRU of pages (see
+mem_cgroup_lruvec()).
+
+Its goal is to improve the scalability of global reclaim, which is
+critical to system-wide memory overcommit in data centers.  Note that
+memcg reclaim is currently out of scope.
+
+Its memory bloat is a pointer to each lruvec and negligible to each
+pglist_data.  In terms of traversing memcgs during global reclaim, it
+improves the best-case complexity from O(n) to O(1) and does not affect
+the worst-case complexity O(n).  Therefore, on average, it has a sublinear
+complexity in contrast to the current linear complexity.
+
+The basic structure of an memcg LRU can be understood by an analogy to
+the active/inactive LRU (of pages):
+1. It has the young and the old (generations), i.e., the counterparts
+   to the active and the inactive;
+2. The increment of max_seq triggers promotion, i.e., the counterpart
+   to activation;
+3. Other events trigger similar operations, e.g., offlining an memcg
+   triggers demotion, i.e., the counterpart to deactivation.
+
+In terms of global reclaim, it has two distinct features:
+1. Sharding, which allows each thread to start at a random memcg (in
+   the old generation) and improves parallelism;
+2. Eventual fairness, which allows direct reclaim to bail out at will
+   and reduces latency without affecting fairness over some time.
+
+The commit message in patch 6 details the workflow:
+https://lore.kernel.org/r/[email protected]/
+
+The following is a simple test to quickly verify its effectiveness.
+
+  Test design:
+  1. Create multiple memcgs.
+  2. Each memcg contains a job (fio).
+  3. All jobs access the same amount of memory randomly.
+  4. The system does not experience global memory pressure.
+  5. Periodically write to the root memory.reclaim.
+
+  Desired outcome:
+  1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
+     over mean(pgsteal) is close to 0%.
+  2. The total pgsteal is close to the total requested through
+     memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
+     to 100%.
+
+  Actual outcome [1]:
+                                     MGLRU off    MGLRU on
+  stddev(pgsteal) / mean(pgsteal)    75%          20%
+  sum(pgsteal) / sum(requested)      425%         95%
+
+  ####################################################################
+  MEMCGS=128
+
+  for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
+      mkdir /sys/fs/cgroup/memcg$memcg
+  done
+
+  start() {
+      echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs
+
+      fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
+          --filename=/dev/zero --size=1920M --rw=randrw \
+          --rate=64m,64m --random_distribution=random \
+          --fadvise_hint=0 --time_based --runtime=10h \
+          --group_reporting --minimal
+  }
+
+  for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
+      start &
+  done
+
+  sleep 600
+
+  for ((i = 0; i < 600; i++)); do
+      echo 256m >/sys/fs/cgroup/memory.reclaim
+      sleep 6
+  done
+
+  for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
+      grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
+  done
+  ####################################################################
+
+[1]: This was obtained from running the above script (touches less
+     than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
+     hour.
+
+This patch (of 8):
+
+The new name lru_gen_page will be more distinct from the coming
+lru_gen_memcg.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/mm_inline.h |  4 ++--
+ include/linux/mmzone.h    |  6 +++---
+ mm/vmscan.c               | 34 +++++++++++++++++-----------------
+ mm/workingset.c           |  4 ++--
+ 4 files changed, 24 insertions(+), 24 deletions(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -168,7 +168,7 @@ static inline void lru_gen_update_size(s
+ 	int zone = page_zonenum(page);
+ 	int delta = thp_nr_pages(page);
+ 	enum lru_list lru = type * LRU_INACTIVE_FILE;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ 	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+@@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(stru
+ 	int gen = page_lru_gen(page);
+ 	int type = page_is_file_lru(page);
+ 	int zone = page_zonenum(page);
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	VM_WARN_ON_ONCE_PAGE(gen != -1, page);
+ 
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -394,7 +394,7 @@ enum {
+  * The number of pages in each generation is eventually consistent and therefore
+  * can be transiently negative when reset_batch_size() is pending.
+  */
+-struct lru_gen_struct {
++struct lru_gen_page {
+ 	/* the aging increments the youngest generation number */
+ 	unsigned long max_seq;
+ 	/* the eviction increments the oldest generation numbers */
+@@ -451,7 +451,7 @@ struct lru_gen_mm_state {
+ struct lru_gen_mm_walk {
+ 	/* the lruvec under reclaim */
+ 	struct lruvec *lruvec;
+-	/* unstable max_seq from lru_gen_struct */
++	/* unstable max_seq from lru_gen_page */
+ 	unsigned long max_seq;
+ 	/* the next address within an mm to scan */
+ 	unsigned long next_addr;
+@@ -514,7 +514,7 @@ struct lruvec {
+ 	unsigned long			flags;
+ #ifdef CONFIG_LRU_GEN
+ 	/* evictable pages divided into generations */
+-	struct lru_gen_struct		lrugen;
++	struct lru_gen_page		lrugen;
+ 	/* to concurrently iterate lru_gen_mm_list */
+ 	struct lru_gen_mm_state		mm_state;
+ #endif
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lr
+ 
+ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+ {
+-	/* see the comment on lru_gen_struct */
++	/* see the comment on lru_gen_page */
+ 	return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
+ 	       get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
+ 	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+@@ -3316,7 +3316,7 @@ struct ctrl_pos {
+ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
+ 			  struct ctrl_pos *pos)
+ {
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+ 
+ 	pos->refaulted = lrugen->avg_refaulted[type][tier] +
+@@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec
+ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
+ {
+ 	int hist, tier;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
+ 	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
+ 
+@@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *
+ static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
+ {
+ 	int type = page_is_file_lru(page);
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+ 	unsigned long new_flags, old_flags = READ_ONCE(page->flags);
+ 
+@@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru
+ static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+ {
+ 	int gen, type, zone;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	walk->batched = 0;
+ 
+@@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *l
+ {
+ 	int zone;
+ 	int remaining = MAX_LRU_BATCH;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+ 
+ 	if (type == LRU_GEN_ANON && !can_swap)
+@@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lr
+ {
+ 	int gen, type, zone;
+ 	bool success = false;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	DEFINE_MIN_SEQ(lruvec);
+ 
+ 	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+@@ -4036,7 +4036,7 @@ next:
+ 		;
+ 	}
+ 
+-	/* see the comment on lru_gen_struct */
++	/* see the comment on lru_gen_page */
+ 	if (can_swap) {
+ 		min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
+ 		min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+@@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *l
+ {
+ 	int prev, next;
+ 	int type, zone;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	spin_lock_irq(&lruvec->lru_lock);
+ 
+@@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lr
+ 	bool success;
+ 	struct lru_gen_mm_walk *walk;
+ 	struct mm_struct *mm = NULL;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+ 
+@@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruv
+ 	unsigned long old = 0;
+ 	unsigned long young = 0;
+ 	unsigned long total = 0;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 
+ 	for (type = !can_swap; type < ANON_AND_FILE; type++) {
+@@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lru
+ 	int delta = thp_nr_pages(page);
+ 	int refs = page_lru_refs(page);
+ 	int tier = lru_tier_from_refs(refs);
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
+ 
+@@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lru
+ 	int scanned = 0;
+ 	int isolated = 0;
+ 	int remaining = MAX_LRU_BATCH;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 
+ 	VM_WARN_ON_ONCE(!list_empty(list));
+@@ -4967,7 +4967,7 @@ done:
+ 
+ static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+ {
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	if (lrugen->enabled) {
+ 		enum lru_list lru;
+@@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct
+ 	int i;
+ 	int type, tier;
+ 	int hist = lru_hist_from_seq(seq);
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ 		seq_printf(m, "            %10d", tier);
+@@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_f
+ 	unsigned long seq;
+ 	bool full = !debugfs_real_fops(m->file)->write;
+ 	struct lruvec *lruvec = v;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	int nid = lruvec_pgdat(lruvec)->node_id;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MAX_SEQ(lruvec);
+@@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *
+ {
+ 	int i;
+ 	int gen, type, zone;
+-	struct lru_gen_struct *lrugen = &lruvec->lrugen;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 
+ 	lrugen->max_seq = MIN_NR_GENS + 1;
+ 	lrugen->enabled = lru_gen_enabled();
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct pag
+ 	unsigned long token;
+ 	unsigned long min_seq;
+ 	struct lruvec *lruvec;
+-	struct lru_gen_struct *lrugen;
++	struct lru_gen_page *lrugen;
+ 	int type = page_is_file_lru(page);
+ 	int delta = thp_nr_pages(page);
+ 	int refs = page_lru_refs(page);
+@@ -252,7 +252,7 @@ static void lru_gen_refault(struct page
+ 	unsigned long token;
+ 	unsigned long min_seq;
+ 	struct lruvec *lruvec;
+-	struct lru_gen_struct *lrugen;
++	struct lru_gen_page *lrugen;
+ 	struct mem_cgroup *memcg;
+ 	struct pglist_data *pgdat;
+ 	int type = page_is_file_lru(page);

+ 162 - 0
target/linux/generic/backport-6.1/020-v6.3-22-mm-multi-gen-LRU-rename-lrugen-lists-to-lrugen-pages.patch

@@ -0,0 +1,162 @@
+From afd37e73db04c7e6b47411120ac5f6a7eca51fec Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:00 -0700
+Subject: [PATCH 22/29] mm: multi-gen LRU: rename lrugen->lists[] to
+ lrugen->pages[]
+
+lru_gen_page will be chained into per-node lists by the coming
+lrugen->list.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/mm_inline.h |  4 ++--
+ include/linux/mmzone.h    |  8 ++++----
+ mm/vmscan.c               | 20 ++++++++++----------
+ 3 files changed, 16 insertions(+), 16 deletions(-)
+
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -246,9 +246,9 @@ static inline bool lru_gen_add_page(stru
+ 	lru_gen_update_size(lruvec, page, -1, gen);
+ 	/* for rotate_reclaimable_page() */
+ 	if (reclaiming)
+-		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
++		list_add_tail(&page->lru, &lrugen->pages[gen][type][zone]);
+ 	else
+-		list_add(&page->lru, &lrugen->lists[gen][type][zone]);
++		list_add(&page->lru, &lrugen->pages[gen][type][zone]);
+ 
+ 	return true;
+ }
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -302,7 +302,7 @@ enum lruvec_flags {
+  * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+  * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+  * corresponding generation. The gen counter in page->flags stores gen+1 while
+- * a page is on one of lrugen->lists[]. Otherwise it stores 0.
++ * a page is on one of lrugen->pages[]. Otherwise it stores 0.
+  *
+  * A page is added to the youngest generation on faulting. The aging needs to
+  * check the accessed bit at least twice before handing this page over to the
+@@ -314,8 +314,8 @@ enum lruvec_flags {
+  * rest of generations, if they exist, are considered inactive. See
+  * lru_gen_is_active().
+  *
+- * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+- * the aging needs not to worry about it. And it's set again when a page
++ * PG_active is always cleared while a page is on one of lrugen->pages[] so
++ * that the aging needs not to worry about it. And it's set again when a page
+  * considered active is isolated for non-reclaiming purposes, e.g., migration.
+  * See lru_gen_add_page() and lru_gen_del_page().
+  *
+@@ -402,7 +402,7 @@ struct lru_gen_page {
+ 	/* the birth time of each generation in jiffies */
+ 	unsigned long timestamps[MAX_NR_GENS];
+ 	/* the multi-gen LRU lists, lazily sorted on eviction */
+-	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
++	struct list_head pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ 	/* the multi-gen LRU sizes, eventually consistent */
+ 	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ 	/* the exponential moving average of refaulted */
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -3987,7 +3987,7 @@ static bool inc_min_seq(struct lruvec *l
+ 
+ 	/* prevent cold/hot inversion if force_scan is true */
+ 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+-		struct list_head *head = &lrugen->lists[old_gen][type][zone];
++		struct list_head *head = &lrugen->pages[old_gen][type][zone];
+ 
+ 		while (!list_empty(head)) {
+ 			struct page *page = lru_to_page(head);
+@@ -3998,7 +3998,7 @@ static bool inc_min_seq(struct lruvec *l
+ 			VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
+ 
+ 			new_gen = page_inc_gen(lruvec, page, false);
+-			list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
++			list_move_tail(&page->lru, &lrugen->pages[new_gen][type][zone]);
+ 
+ 			if (!--remaining)
+ 				return false;
+@@ -4026,7 +4026,7 @@ static bool try_to_inc_min_seq(struct lr
+ 			gen = lru_gen_from_seq(min_seq[type]);
+ 
+ 			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+-				if (!list_empty(&lrugen->lists[gen][type][zone]))
++				if (!list_empty(&lrugen->pages[gen][type][zone]))
+ 					goto next;
+ 			}
+ 
+@@ -4491,7 +4491,7 @@ static bool sort_page(struct lruvec *lru
+ 
+ 	/* promoted */
+ 	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+-		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
++		list_move(&page->lru, &lrugen->pages[gen][type][zone]);
+ 		return true;
+ 	}
+ 
+@@ -4500,7 +4500,7 @@ static bool sort_page(struct lruvec *lru
+ 		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+ 
+ 		gen = page_inc_gen(lruvec, page, false);
+-		list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
++		list_move_tail(&page->lru, &lrugen->pages[gen][type][zone]);
+ 
+ 		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
+ 			   lrugen->protected[hist][type][tier - 1] + delta);
+@@ -4512,7 +4512,7 @@ static bool sort_page(struct lruvec *lru
+ 	if (PageLocked(page) || PageWriteback(page) ||
+ 	    (type == LRU_GEN_FILE && PageDirty(page))) {
+ 		gen = page_inc_gen(lruvec, page, true);
+-		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
++		list_move(&page->lru, &lrugen->pages[gen][type][zone]);
+ 		return true;
+ 	}
+ 
+@@ -4579,7 +4579,7 @@ static int scan_pages(struct lruvec *lru
+ 	for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ 		LIST_HEAD(moved);
+ 		int skipped = 0;
+-		struct list_head *head = &lrugen->lists[gen][type][zone];
++		struct list_head *head = &lrugen->pages[gen][type][zone];
+ 
+ 		while (!list_empty(head)) {
+ 			struct page *page = lru_to_page(head);
+@@ -4980,7 +4980,7 @@ static bool __maybe_unused state_is_vali
+ 		int gen, type, zone;
+ 
+ 		for_each_gen_type_zone(gen, type, zone) {
+-			if (!list_empty(&lrugen->lists[gen][type][zone]))
++			if (!list_empty(&lrugen->pages[gen][type][zone]))
+ 				return false;
+ 		}
+ 	}
+@@ -5025,7 +5025,7 @@ static bool drain_evictable(struct lruve
+ 	int remaining = MAX_LRU_BATCH;
+ 
+ 	for_each_gen_type_zone(gen, type, zone) {
+-		struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
++		struct list_head *head = &lruvec->lrugen.pages[gen][type][zone];
+ 
+ 		while (!list_empty(head)) {
+ 			bool success;
+@@ -5558,7 +5558,7 @@ void lru_gen_init_lruvec(struct lruvec *
+ 		lrugen->timestamps[i] = jiffies;
+ 
+ 	for_each_gen_type_zone(gen, type, zone)
+-		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
++		INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);
+ 
+ 	lruvec->mm_state.seq = MIN_NR_GENS;
+ 	init_waitqueue_head(&lruvec->mm_state.wait);
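
The data-structure comment updated by this patch describes how sequence counters index lrugen->pages[]: a generation is just a seq value folded into a fixed number of slots, and [min_seq, max_seq] forms the sliding window. A minimal standalone C sketch of that indexing, assuming the kernel defaults MAX_NR_GENS = 4 and MIN_NR_GENS = 2 (the window values below are invented):

#include <stdio.h>

#define MAX_NR_GENS 4    /* assumed kernel default */
#define MIN_NR_GENS 2    /* assumed kernel default */

/* same folding as lru_gen_from_seq(): a sequence number picks a slot */
static int gen_from_seq(unsigned long seq)
{
    return seq % MAX_NR_GENS;
}

int main(void)
{
    unsigned long min_seq = 5, max_seq = 7;    /* arbitrary sliding window */
    unsigned long seq;

    /* the window [min_seq, max_seq] never spans more than MAX_NR_GENS slots */
    for (seq = min_seq; seq <= max_seq; seq++)
        printf("seq %lu -> lrugen->pages[%d][type][zone]\n",
               seq, gen_from_seq(seq));

    /* the aging increments max_seq; the slot it wraps into is reused only
     * after the eviction has emptied it and bumped min_seq */
    printf("the next aging cycle would open gen %d\n",
           gen_from_seq(max_seq + 1));
    return 0;
}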

+ 188 - 0
target/linux/generic/backport-6.1/020-v6.3-23-mm-multi-gen-LRU-remove-eviction-fairness-safeguard.patch

@@ -0,0 +1,188 @@
+From ce45f1c4b32cf69b166f56ef5bc6c761e06ed4e5 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:01 -0700
+Subject: [PATCH 23/29] mm: multi-gen LRU: remove eviction fairness safeguard
+
+Recall that the eviction consumes the oldest generation: first it
+bucket-sorts pages whose gen counters were updated by the aging and
+reclaims the rest; then it increments lrugen->min_seq.
+
+The current eviction fairness safeguard for global reclaim has a
+dilemma: when there are multiple eligible memcgs, should it continue
+or stop upon meeting the reclaim goal? If it continues, it overshoots
+and increases direct reclaim latency; if it stops, it loses fairness
+between memcgs it has taken memory away from and those it has yet to.
+
+With memcg LRU, the eviction, while ensuring eventual fairness, will
+stop upon meeting its goal. Therefore the current eviction fairness
+safeguard for global reclaim will not be needed.
+
+Note that memcg LRU only applies to global reclaim. For memcg reclaim,
+the eviction will continue, even if it is overshooting. This becomes
+unconditional due to code simplification.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 82 +++++++++++++++--------------------------------------
+ 1 file changed, 23 insertions(+), 59 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -443,6 +443,11 @@ static bool cgroup_reclaim(struct scan_c
+ 	return sc->target_mem_cgroup;
+ }
+ 
++static bool global_reclaim(struct scan_control *sc)
++{
++	return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
++}
++
+ /**
+  * writeback_throttling_sane - is the usual dirty throttling mechanism available?
+  * @sc: scan_control in question
+@@ -493,6 +498,11 @@ static bool cgroup_reclaim(struct scan_c
+ 	return false;
+ }
+ 
++static bool global_reclaim(struct scan_control *sc)
++{
++	return true;
++}
++
+ static bool writeback_throttling_sane(struct scan_control *sc)
+ {
+ 	return true;
+@@ -4722,8 +4732,7 @@ static int isolate_pages(struct lruvec *
+ 	return scanned;
+ }
+ 
+-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+-		       bool *need_swapping)
++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+ {
+ 	int type;
+ 	int scanned;
+@@ -4812,9 +4821,6 @@ retry:
+ 		goto retry;
+ 	}
+ 
+-	if (need_swapping && type == LRU_GEN_ANON)
+-		*need_swapping = true;
+-
+ 	return scanned;
+ }
+ 
+@@ -4853,68 +4859,26 @@ done:
+ 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ }
+ 
+-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+-			      struct scan_control *sc, bool need_swapping)
++static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+ {
+-	int i;
+-	DEFINE_MAX_SEQ(lruvec);
+-
+-	if (!current_is_kswapd()) {
+-		/* age each memcg once to ensure fairness */
+-		if (max_seq - seq > 1)
+-			return true;
+-
+-		/* over-swapping can increase allocation latency */
+-		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+-			return true;
+-
+-		/* give this thread a chance to exit and free its memory */
+-		if (fatal_signal_pending(current)) {
+-			sc->nr_reclaimed += MIN_LRU_BATCH;
+-			return true;
+-		}
+-
+-		if (cgroup_reclaim(sc))
+-			return false;
+-	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+-		return false;
+-
+-	/* keep scanning at low priorities to ensure fairness */
+-	if (sc->priority > DEF_PRIORITY - 2)
+-		return false;
+-
+-	/*
+-	 * A minimum amount of work was done under global memory pressure. For
+-	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
+-	 * met, and yet the allocation may still succeed, since kswapd may have
+-	 * caught up. In either case, it's better to stop now, and restart if
+-	 * necessary.
+-	 */
+-	for (i = 0; i <= sc->reclaim_idx; i++) {
+-		unsigned long wmark;
+-		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+-
+-		if (!managed_zone(zone))
+-			continue;
+-
+-		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+-		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+-			return false;
+-	}
++	/* don't abort memcg reclaim to ensure fairness */
++	if (!global_reclaim(sc))
++		return -1;
+ 
+-	sc->nr_reclaimed += MIN_LRU_BATCH;
++	/* discount the previous progress for kswapd */
++	if (current_is_kswapd())
++		return sc->nr_to_reclaim + sc->last_reclaimed;
+ 
+-	return true;
++	return max(sc->nr_to_reclaim, compact_gap(sc->order));
+ }
+ 
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ 	struct blk_plug plug;
+ 	bool need_aging = false;
+-	bool need_swapping = false;
+ 	unsigned long scanned = 0;
+ 	unsigned long reclaimed = sc->nr_reclaimed;
+-	DEFINE_MAX_SEQ(lruvec);
++	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ 
+ 	lru_add_drain();
+ 
+@@ -4938,7 +4902,7 @@ static void lru_gen_shrink_lruvec(struct
+ 		if (!nr_to_scan)
+ 			goto done;
+ 
+-		delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
++		delta = evict_pages(lruvec, sc, swappiness);
+ 		if (!delta)
+ 			goto done;
+ 
+@@ -4946,7 +4910,7 @@ static void lru_gen_shrink_lruvec(struct
+ 		if (scanned >= nr_to_scan)
+ 			break;
+ 
+-		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
++		if (sc->nr_reclaimed >= nr_to_reclaim)
+ 			break;
+ 
+ 		cond_resched();
+@@ -5393,7 +5357,7 @@ static int run_eviction(struct lruvec *l
+ 		if (sc->nr_reclaimed >= nr_to_reclaim)
+ 			return 0;
+ 
+-		if (!evict_pages(lruvec, sc, swappiness, NULL))
++		if (!evict_pages(lruvec, sc, swappiness))
+ 			return 0;
+ 
+ 		cond_resched();
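
With should_abort_scan() removed, the loop in lru_gen_shrink_lruvec() stops on the single target computed by get_nr_to_reclaim(). A standalone C sketch of its three cases; the scan_control below is a cut-down stand-in, and compact_gap() is approximated as 2UL << order, which is an assumption rather than something this patch defines:

#include <stdio.h>
#include <limits.h>

struct scan_control {                /* only the fields the sketch needs */
    unsigned long nr_to_reclaim;
    unsigned long last_reclaimed;
    int global;                      /* stand-in for global_reclaim(sc) */
    int kswapd;                      /* stand-in for current_is_kswapd() */
    int order;
};

/* assumed approximation of the kernel's compact_gap() */
static unsigned long compact_gap(int order)
{
    return 2UL << order;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
{
    /* memcg reclaim: never abort, so the target wraps around to ULONG_MAX */
    if (!sc->global)
        return -1;

    /* kswapd: discount the progress made before this lruvec */
    if (sc->kswapd)
        return sc->nr_to_reclaim + sc->last_reclaimed;

    /* direct reclaim: leave enough room for compaction */
    return sc->nr_to_reclaim > compact_gap(sc->order) ?
           sc->nr_to_reclaim : compact_gap(sc->order);
}

int main(void)
{
    struct scan_control memcg  = { 32, 0,   0, 0, 0 };
    struct scan_control kswapd = { 32, 100, 1, 1, 0 };
    struct scan_control direct = { 32, 0,   1, 0, 9 };

    printf("memcg reclaim target : %lu (ULONG_MAX is %lu)\n",
           get_nr_to_reclaim(&memcg), ULONG_MAX);
    printf("kswapd target        : %lu\n", get_nr_to_reclaim(&kswapd));
    printf("direct reclaim target: %lu\n", get_nr_to_reclaim(&direct));
    return 0;
}

The -1 returned for memcg reclaim wraps to ULONG_MAX, so the new nr_reclaimed >= nr_to_reclaim check can never abort memcg reclaim, matching the commit message.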

+ 287 - 0
target/linux/generic/backport-6.1/020-v6.3-24-mm-multi-gen-LRU-remove-aging-fairness-safeguard.patch

@@ -0,0 +1,287 @@
+From e20b7386fccc18c791796eb1dc1a91eee3ccf801 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:02 -0700
+Subject: [PATCH 24/29] mm: multi-gen LRU: remove aging fairness safeguard
+
+Recall that the aging produces the youngest generation: first it scans
+for accessed pages and updates their gen counters; then it increments
+lrugen->max_seq.
+
+The current aging fairness safeguard for kswapd uses two passes to
+ensure the fairness to multiple eligible memcgs. On the first pass,
+which is shared with the eviction, it checks whether all eligible
+memcgs are low on cold pages. If so, it requires a second pass, on
+which it ages all those memcgs at the same time.
+
+With memcg LRU, the aging, while ensuring eventual fairness, will run
+when necessary. Therefore the current aging fairness safeguard for
+kswapd will not be needed.
+
+Note that memcg LRU only applies to global reclaim. For memcg reclaim,
+the aging can be unfair to different memcgs, i.e., their
+lrugen->max_seq can be incremented at different paces.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 126 ++++++++++++++++++++++++----------------------------
+ 1 file changed, 59 insertions(+), 67 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -131,7 +131,6 @@ struct scan_control {
+ 
+ #ifdef CONFIG_LRU_GEN
+ 	/* help kswapd make better choices among multiple memcgs */
+-	unsigned int memcgs_need_aging:1;
+ 	unsigned long last_reclaimed;
+ #endif
+ 
+@@ -4184,7 +4183,7 @@ done:
+ 	return true;
+ }
+ 
+-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ 			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+ {
+ 	int gen, type, zone;
+@@ -4193,6 +4192,13 @@ static bool should_run_aging(struct lruv
+ 	unsigned long total = 0;
+ 	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
++
++	/* whether this lruvec is completely out of cold pages */
++	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
++		*nr_to_scan = 0;
++		return true;
++	}
+ 
+ 	for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ 		unsigned long seq;
+@@ -4221,8 +4227,6 @@ static bool should_run_aging(struct lruv
+ 	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ 	 * ideal number of generations is MIN_NR_GENS+1.
+ 	 */
+-	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
+-		return true;
+ 	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ 		return false;
+ 
+@@ -4241,40 +4245,54 @@ static bool should_run_aging(struct lruv
+ 	return false;
+ }
+ 
+-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
++static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+ {
+-	bool need_aging;
+-	unsigned long nr_to_scan;
+-	int swappiness = get_swappiness(lruvec, sc);
++	int gen, type, zone;
++	unsigned long total = 0;
++	bool can_swap = get_swappiness(lruvec, sc);
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MAX_SEQ(lruvec);
+ 	DEFINE_MIN_SEQ(lruvec);
+ 
+-	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
++	for (type = !can_swap; type < ANON_AND_FILE; type++) {
++		unsigned long seq;
+ 
+-	mem_cgroup_calculate_protection(NULL, memcg);
++		for (seq = min_seq[type]; seq <= max_seq; seq++) {
++			gen = lru_gen_from_seq(seq);
+ 
+-	if (mem_cgroup_below_min(memcg))
+-		return false;
++			for (zone = 0; zone < MAX_NR_ZONES; zone++)
++				total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
++		}
++	}
+ 
+-	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
++	/* whether the size is big enough to be helpful */
++	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
++}
+ 
+-	if (min_ttl) {
+-		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+-		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
++static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
++				  unsigned long min_ttl)
++{
++	int gen;
++	unsigned long birth;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
+ 
+-		if (time_is_after_jiffies(birth + min_ttl))
+-			return false;
++	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+ 
+-		/* the size is likely too small to be helpful */
+-		if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+-			return false;
+-	}
++	/* see the comment on lru_gen_page */
++	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
++	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+ 
+-	if (need_aging)
+-		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
++	if (time_is_after_jiffies(birth + min_ttl))
++		return false;
+ 
+-	return true;
++	if (!lruvec_is_sizable(lruvec, sc))
++		return false;
++
++	mem_cgroup_calculate_protection(NULL, memcg);
++
++	return !mem_cgroup_below_min(memcg);
+ }
+ 
+ /* to protect the working set of the last N jiffies */
+@@ -4283,46 +4301,32 @@ static unsigned long lru_gen_min_ttl __r
+ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ {
+ 	struct mem_cgroup *memcg;
+-	bool success = false;
+ 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+ 
+ 	VM_WARN_ON_ONCE(!current_is_kswapd());
+ 
+ 	sc->last_reclaimed = sc->nr_reclaimed;
+ 
+-	/*
+-	 * To reduce the chance of going into the aging path, which can be
+-	 * costly, optimistically skip it if the flag below was cleared in the
+-	 * eviction path. This improves the overall performance when multiple
+-	 * memcgs are available.
+-	 */
+-	if (!sc->memcgs_need_aging) {
+-		sc->memcgs_need_aging = true;
++	/* check the order to exclude compaction-induced reclaim */
++	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
+ 		return;
+-	}
+-
+-	set_mm_walk(pgdat);
+ 
+ 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ 	do {
+ 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ 
+-		if (age_lruvec(lruvec, sc, min_ttl))
+-			success = true;
++		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
++			mem_cgroup_iter_break(NULL, memcg);
++			return;
++		}
+ 
+ 		cond_resched();
+ 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+ 
+-	clear_mm_walk();
+-
+-	/* check the order to exclude compaction-induced reclaim */
+-	if (success || !min_ttl || sc->order)
+-		return;
+-
+ 	/*
+ 	 * The main goal is to OOM kill if every generation from all memcgs is
+ 	 * younger than min_ttl. However, another possibility is all memcgs are
+-	 * either below min or empty.
++	 * either too small or below min.
+ 	 */
+ 	if (mutex_trylock(&oom_lock)) {
+ 		struct oom_control oc = {
+@@ -4830,33 +4834,27 @@ retry:
+  *    reclaim.
+  */
+ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+-				    bool can_swap, bool *need_aging)
++				    bool can_swap)
+ {
+ 	unsigned long nr_to_scan;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MAX_SEQ(lruvec);
+-	DEFINE_MIN_SEQ(lruvec);
+ 
+ 	if (mem_cgroup_below_min(memcg) ||
+ 	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ 		return 0;
+ 
+-	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+-	if (!*need_aging)
++	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
+ 		return nr_to_scan;
+ 
+ 	/* skip the aging path at the default priority */
+ 	if (sc->priority == DEF_PRIORITY)
+-		goto done;
++		return nr_to_scan;
+ 
+-	/* leave the work to lru_gen_age_node() */
+-	if (current_is_kswapd())
+-		return 0;
++	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
+ 
+-	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
+-		return nr_to_scan;
+-done:
+-	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
++	/* skip this lruvec as it's low on cold pages */
++	return 0;
+ }
+ 
+ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+@@ -4875,9 +4873,7 @@ static unsigned long get_nr_to_reclaim(s
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ 	struct blk_plug plug;
+-	bool need_aging = false;
+ 	unsigned long scanned = 0;
+-	unsigned long reclaimed = sc->nr_reclaimed;
+ 	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ 
+ 	lru_add_drain();
+@@ -4898,13 +4894,13 @@ static void lru_gen_shrink_lruvec(struct
+ 		else
+ 			swappiness = 0;
+ 
+-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
++		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ 		if (!nr_to_scan)
+-			goto done;
++			break;
+ 
+ 		delta = evict_pages(lruvec, sc, swappiness);
+ 		if (!delta)
+-			goto done;
++			break;
+ 
+ 		scanned += delta;
+ 		if (scanned >= nr_to_scan)
+@@ -4916,10 +4912,6 @@ static void lru_gen_shrink_lruvec(struct
+ 		cond_resched();
+ 	}
+ 
+-	/* see the comment in lru_gen_age_node() */
+-	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+-		sc->memcgs_need_aging = false;
+-done:
+ 	clear_mm_walk();
+ 
+ 	blk_finish_plug(&plug);
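
The "reclaimable size threshold" mentioned above is what the new lruvec_is_sizable() implements: the lruvec's total page count shifted right by the scan priority. A small standalone C illustration of that cutoff; the page counts are invented and DEF_PRIORITY = 12 is assumed from the kernel:

#include <stdio.h>

/* an online memcg is worth aging only if (total >> priority) is non-zero;
 * an offline memcg only has to be non-empty, since its memory gets scraped */
static int is_sizable(unsigned long total_pages, int priority, int online)
{
    return online ? (total_pages >> priority) != 0 : total_pages != 0;
}

int main(void)
{
    int priorities[] = { 12, 6, 1 };          /* 12 is the assumed DEF_PRIORITY */
    unsigned long totals[] = { 1000, 10000 };

    for (int p = 0; p < 3; p++)
        for (int t = 0; t < 2; t++)
            printf("total=%5lu priority=%2d -> sizable=%d\n",
                   totals[t], priorities[p],
                   is_sizable(totals[t], priorities[p], 1));
    return 0;
}

At the default priority an online memcg therefore needs at least 4096 pages before the aging considers it, and every priority drop halves that bar.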

+ 161 - 0
target/linux/generic/backport-6.1/020-v6.3-25-mm-multi-gen-LRU-shuffle-should_run_aging.patch

@@ -0,0 +1,161 @@
+From 107d54931df3c28d81648122e219bf0034ef4e99 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:03 -0700
+Subject: [PATCH 25/29] mm: multi-gen LRU: shuffle should_run_aging()
+
+Move should_run_aging() next to its only caller left.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 124 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 62 insertions(+), 62 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4183,68 +4183,6 @@ done:
+ 	return true;
+ }
+ 
+-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+-			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+-{
+-	int gen, type, zone;
+-	unsigned long old = 0;
+-	unsigned long young = 0;
+-	unsigned long total = 0;
+-	struct lru_gen_page *lrugen = &lruvec->lrugen;
+-	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+-	DEFINE_MIN_SEQ(lruvec);
+-
+-	/* whether this lruvec is completely out of cold pages */
+-	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+-		*nr_to_scan = 0;
+-		return true;
+-	}
+-
+-	for (type = !can_swap; type < ANON_AND_FILE; type++) {
+-		unsigned long seq;
+-
+-		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+-			unsigned long size = 0;
+-
+-			gen = lru_gen_from_seq(seq);
+-
+-			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+-				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+-
+-			total += size;
+-			if (seq == max_seq)
+-				young += size;
+-			else if (seq + MIN_NR_GENS == max_seq)
+-				old += size;
+-		}
+-	}
+-
+-	/* try to scrape all its memory if this memcg was deleted */
+-	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+-
+-	/*
+-	 * The aging tries to be lazy to reduce the overhead, while the eviction
+-	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+-	 * ideal number of generations is MIN_NR_GENS+1.
+-	 */
+-	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+-		return false;
+-
+-	/*
+-	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+-	 * of the total number of pages for each generation. A reasonable range
+-	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+-	 * aging cares about the upper bound of hot pages, while the eviction
+-	 * cares about the lower bound of cold pages.
+-	 */
+-	if (young * MIN_NR_GENS > total)
+-		return true;
+-	if (old * (MIN_NR_GENS + 2) < total)
+-		return true;
+-
+-	return false;
+-}
+-
+ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
+ {
+ 	int gen, type, zone;
+@@ -4828,6 +4766,68 @@ retry:
+ 	return scanned;
+ }
+ 
++static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
++			     struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
++{
++	int gen, type, zone;
++	unsigned long old = 0;
++	unsigned long young = 0;
++	unsigned long total = 0;
++	struct lru_gen_page *lrugen = &lruvec->lrugen;
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	DEFINE_MIN_SEQ(lruvec);
++
++	/* whether this lruvec is completely out of cold pages */
++	if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
++		*nr_to_scan = 0;
++		return true;
++	}
++
++	for (type = !can_swap; type < ANON_AND_FILE; type++) {
++		unsigned long seq;
++
++		for (seq = min_seq[type]; seq <= max_seq; seq++) {
++			unsigned long size = 0;
++
++			gen = lru_gen_from_seq(seq);
++
++			for (zone = 0; zone < MAX_NR_ZONES; zone++)
++				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
++
++			total += size;
++			if (seq == max_seq)
++				young += size;
++			else if (seq + MIN_NR_GENS == max_seq)
++				old += size;
++		}
++	}
++
++	/* try to scrape all its memory if this memcg was deleted */
++	*nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
++
++	/*
++	 * The aging tries to be lazy to reduce the overhead, while the eviction
++	 * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
++	 * ideal number of generations is MIN_NR_GENS+1.
++	 */
++	if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
++		return false;
++
++	/*
++	 * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
++	 * of the total number of pages for each generation. A reasonable range
++	 * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
++	 * aging cares about the upper bound of hot pages, while the eviction
++	 * cares about the lower bound of cold pages.
++	 */
++	if (young * MIN_NR_GENS > total)
++		return true;
++	if (old * (MIN_NR_GENS + 2) < total)
++		return true;
++
++	return false;
++}
++
+ /*
+  * For future optimizations:
+  * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
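
should_run_aging() is only relocated by this patch, but its balance checks read more easily with numbers plugged in. A standalone C sketch of the two bounds it applies, assuming MIN_NR_GENS = 2 and using made-up generation sizes, where "young" is the size of the max_seq generation and "old" the size of the generation at max_seq - MIN_NR_GENS:

#include <stdio.h>

#define MIN_NR_GENS 2    /* assumed kernel default */

/* age when the youngest generation exceeds 1/MIN_NR_GENS of all pages, or
 * the generation at max_seq - MIN_NR_GENS falls below 1/(MIN_NR_GENS + 2) */
static int should_run_aging(unsigned long young, unsigned long old,
                            unsigned long total)
{
    if (young * MIN_NR_GENS > total)
        return 1;
    if (old * (MIN_NR_GENS + 2) < total)
        return 1;
    return 0;
}

int main(void)
{
    /* { young, old, total } triples */
    unsigned long cases[][3] = {
        { 300, 300, 1000 },    /* balanced: no aging needed */
        { 600, 200, 1000 },    /* hot pages piling up: age */
        { 200, 100, 1000 },    /* cold pages running out: age */
    };

    for (int i = 0; i < 3; i++)
        printf("young=%lu old=%lu total=%lu -> aging=%d\n",
               cases[i][0], cases[i][1], cases[i][2],
               should_run_aging(cases[i][0], cases[i][1], cases[i][2]));
    return 0;
}

With MIN_NR_GENS = 2 the aging runs when the youngest generation holds more than half of the pages, or that boundary generation holds less than a quarter.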

+ 868 - 0
target/linux/generic/backport-6.1/020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch

@@ -0,0 +1,868 @@
+From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:04 -0700
+Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists
+
+For each node, memcgs are divided into two generations: the old and
+the young. For each generation, memcgs are randomly sharded into
+multiple bins to improve scalability. For each bin, an RCU hlist_nulls
+is virtually divided into three segments: the head, the tail and the
+default.
+
+An onlining memcg is added to the tail of a random bin in the old
+generation. The eviction starts at the head of a random bin in the old
+generation. The per-node memcg generation counter, whose remainder (mod
+2) indexes the old generation, is incremented when all its bins become
+empty.
+
+There are four operations:
+1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
+   its current generation (old or young) and updates its "seg" to
+   "head";
+2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
+   its current generation (old or young) and updates its "seg" to
+   "tail";
+3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
+   the old generation, updates its "gen" to "old" and resets its "seg"
+   to "default";
+4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
+   in the young generation, updates its "gen" to "young" and resets
+   its "seg" to "default".
+
+The events that trigger the above operations are:
+1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+2. The first attempt to reclaim an memcg below low, which triggers
+   MEMCG_LRU_TAIL;
+3. The first attempt to reclaim an memcg below reclaimable size
+   threshold, which triggers MEMCG_LRU_TAIL;
+4. The second attempt to reclaim an memcg below reclaimable size
+   threshold, which triggers MEMCG_LRU_YOUNG;
+5. Attempting to reclaim an memcg below min, which triggers
+   MEMCG_LRU_YOUNG;
+6. Finishing the aging on the eviction path, which triggers
+   MEMCG_LRU_YOUNG;
+7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+
+Note that memcg LRU only applies to global reclaim, and the
+round-robin incrementing of their max_seq counters ensures the
+eventual fairness to all eligible memcgs. For memcg reclaim, it still
+relies on mem_cgroup_iter().
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ include/linux/memcontrol.h |  10 +
+ include/linux/mm_inline.h  |  17 ++
+ include/linux/mmzone.h     | 117 +++++++++++-
+ mm/memcontrol.c            |  16 ++
+ mm/page_alloc.c            |   1 +
+ mm/vmscan.c                | 373 +++++++++++++++++++++++++++++++++----
+ 6 files changed, 499 insertions(+), 35 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct
+ 	percpu_ref_put(&objcg->refcnt);
+ }
+ 
++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
++{
++	return !memcg || css_tryget(&memcg->css);
++}
++
+ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+ {
+ 	if (memcg)
+@@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
+ 	return NULL;
+ }
+ 
++static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
++{
++	return true;
++}
++
+ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+ {
+ }
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void
+ 	return current->in_lru_fault;
+ }
+ 
++#ifdef CONFIG_MEMCG
++static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
++{
++	return READ_ONCE(lruvec->lrugen.seg);
++}
++#else
++static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
++{
++	return 0;
++}
++#endif
++
+ static inline int lru_gen_from_seq(unsigned long seq)
+ {
+ 	return seq % MAX_NR_GENS;
+@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void
+ 	return false;
+ }
+ 
++static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
++{
++	return 0;
++}
++
+ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+ {
+ 	return false;
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -7,6 +7,7 @@
+ 
+ #include <linux/spinlock.h>
+ #include <linux/list.h>
++#include <linux/list_nulls.h>
+ #include <linux/wait.h>
+ #include <linux/bitops.h>
+ #include <linux/cache.h>
+@@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
+ #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+ #define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+ 
++/* see the comment on MEMCG_NR_GENS */
++enum {
++	MEMCG_LRU_NOP,
++	MEMCG_LRU_HEAD,
++	MEMCG_LRU_TAIL,
++	MEMCG_LRU_OLD,
++	MEMCG_LRU_YOUNG,
++};
++
+ #ifdef CONFIG_LRU_GEN
+ 
+ enum {
+@@ -416,6 +426,14 @@ struct lru_gen_page {
+ 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ 	/* whether the multi-gen LRU is enabled */
+ 	bool enabled;
++#ifdef CONFIG_MEMCG
++	/* the memcg generation this lru_gen_page belongs to */
++	u8 gen;
++	/* the list segment this lru_gen_page belongs to */
++	u8 seg;
++	/* per-node lru_gen_page list for global reclaim */
++	struct hlist_nulls_node list;
++#endif
+ };
+ 
+ enum {
+@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *
+ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+ 
+ #ifdef CONFIG_MEMCG
++
++/*
++ * For each node, memcgs are divided into two generations: the old and the
++ * young. For each generation, memcgs are randomly sharded into multiple bins
++ * to improve scalability. For each bin, the hlist_nulls is virtually divided
++ * into three segments: the head, the tail and the default.
++ *
++ * An onlining memcg is added to the tail of a random bin in the old generation.
++ * The eviction starts at the head of a random bin in the old generation. The
++ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
++ * the old generation, is incremented when all its bins become empty.
++ *
++ * There are four operations:
++ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
++ *    current generation (old or young) and updates its "seg" to "head";
++ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
++ *    current generation (old or young) and updates its "seg" to "tail";
++ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
++ *    generation, updates its "gen" to "old" and resets its "seg" to "default";
++ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
++ *    young generation, updates its "gen" to "young" and resets its "seg" to
++ *    "default".
++ *
++ * The events that trigger the above operations are:
++ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
++ * 2. The first attempt to reclaim an memcg below low, which triggers
++ *    MEMCG_LRU_TAIL;
++ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
++ *    which triggers MEMCG_LRU_TAIL;
++ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
++ *    which triggers MEMCG_LRU_YOUNG;
++ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
++ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
++ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
++ *
++ * Note that memcg LRU only applies to global reclaim, and the round-robin
++ * incrementing of their max_seq counters ensures the eventual fairness to all
++ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
++ */
++#define MEMCG_NR_GENS	2
++#define MEMCG_NR_BINS	8
++
++struct lru_gen_memcg {
++	/* the per-node memcg generation counter */
++	unsigned long seq;
++	/* each memcg has one lru_gen_page per node */
++	unsigned long nr_memcgs[MEMCG_NR_GENS];
++	/* per-node lru_gen_page list for global reclaim */
++	struct hlist_nulls_head	fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
++	/* protects the above */
++	spinlock_t lock;
++};
++
++void lru_gen_init_pgdat(struct pglist_data *pgdat);
++
+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
+ void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+-#endif
++void lru_gen_online_memcg(struct mem_cgroup *memcg);
++void lru_gen_offline_memcg(struct mem_cgroup *memcg);
++void lru_gen_release_memcg(struct mem_cgroup *memcg);
++void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
++
++#else /* !CONFIG_MEMCG */
++
++#define MEMCG_NR_GENS	1
++
++struct lru_gen_memcg {
++};
++
++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
++{
++}
++
++#endif /* CONFIG_MEMCG */
+ 
+ #else /* !CONFIG_LRU_GEN */
+ 
++static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
++{
++}
++
+ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+ {
+ }
+@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(s
+ }
+ 
+ #ifdef CONFIG_MEMCG
++
+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+ }
+@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(st
+ static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+ {
+ }
+-#endif
++
++static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
++{
++}
++
++static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
++{
++}
++
++#endif /* CONFIG_MEMCG */
+ 
+ #endif /* CONFIG_LRU_GEN */
+ 
+@@ -1105,6 +1216,8 @@ typedef struct pglist_data {
+ #ifdef CONFIG_LRU_GEN
+ 	/* kswap mm walk data */
+ 	struct lru_gen_mm_walk	mm_walk;
++	/* lru_gen_page list */
++	struct lru_gen_memcg memcg_lru;
+ #endif
+ 
+ 	ZONE_PADDING(_pad2_)
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struc
+ 	struct mem_cgroup_per_node *mz;
+ 	struct mem_cgroup_tree_per_node *mctz;
+ 
++	if (lru_gen_enabled()) {
++		struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
++
++		/* see the comment on MEMCG_NR_GENS */
++		if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
++			lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
++
++		return;
++	}
++
+ 	mctz = soft_limit_tree_from_page(page);
+ 	if (!mctz)
+ 		return;
+@@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_recl
+ 	unsigned long excess;
+ 	unsigned long nr_scanned;
+ 
++	if (lru_gen_enabled())
++		return 0;
++
+ 	if (order > 0)
+ 		return 0;
+ 
+@@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct
+ 	if (unlikely(mem_cgroup_is_root(memcg)))
+ 		queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+ 				   2UL*HZ);
++	lru_gen_online_memcg(memcg);
+ 	return 0;
+ }
+ 
+@@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struc
+ 	memcg_offline_kmem(memcg);
+ 	reparent_shrinker_deferred(memcg);
+ 	wb_memcg_offline(memcg);
++	lru_gen_offline_memcg(memcg);
+ 
+ 	drain_all_stock(memcg);
+ 
+@@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(stru
+ 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ 
+ 	invalidate_reclaim_iterators(memcg);
++	lru_gen_release_memcg(memcg);
+ }
+ 
+ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -7661,6 +7661,7 @@ static void __init free_area_init_node(i
+ 	pgdat_set_deferred_range(pgdat);
+ 
+ 	free_area_init_core(pgdat);
++	lru_gen_init_pgdat(pgdat);
+ }
+ 
+ void __init free_area_init_memoryless_node(int nid)
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -54,6 +54,8 @@
+ #include <linux/shmem_fs.h>
+ #include <linux/ctype.h>
+ #include <linux/debugfs.h>
++#include <linux/rculist_nulls.h>
++#include <linux/random.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -129,11 +131,6 @@ struct scan_control {
+ 	/* Always discard instead of demoting to lower tier memory */
+ 	unsigned int no_demotion:1;
+ 
+-#ifdef CONFIG_LRU_GEN
+-	/* help kswapd make better choices among multiple memcgs */
+-	unsigned long last_reclaimed;
+-#endif
+-
+ 	/* Allocation order */
+ 	s8 order;
+ 
+@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
+ 		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
+ 			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+ 
++#define get_memcg_gen(seq)	((seq) % MEMCG_NR_GENS)
++#define get_memcg_bin(bin)	((bin) % MEMCG_NR_BINS)
++
+ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
+ {
+ 	struct pglist_data *pgdat = NODE_DATA(nid);
+@@ -4169,8 +4169,7 @@ done:
+ 		if (sc->priority <= DEF_PRIORITY - 2)
+ 			wait_event_killable(lruvec->mm_state.wait,
+ 					    max_seq < READ_ONCE(lrugen->max_seq));
+-
+-		return max_seq < READ_ONCE(lrugen->max_seq);
++		return false;
+ 	}
+ 
+ 	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pgli
+ 
+ 	VM_WARN_ON_ONCE(!current_is_kswapd());
+ 
+-	sc->last_reclaimed = sc->nr_reclaimed;
+-
+ 	/* check the order to exclude compaction-induced reclaim */
+ 	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
+ 		return;
+@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruv
+  * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+  *    reclaim.
+  */
+-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+-				    bool can_swap)
++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
+ {
+ 	unsigned long nr_to_scan;
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(stru
+ 	if (sc->priority == DEF_PRIORITY)
+ 		return nr_to_scan;
+ 
+-	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
+-
+ 	/* skip this lruvec as it's low on cold pages */
+-	return 0;
++	return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+ }
+ 
+ static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(s
+ 	if (!global_reclaim(sc))
+ 		return -1;
+ 
+-	/* discount the previous progress for kswapd */
+-	if (current_is_kswapd())
+-		return sc->nr_to_reclaim + sc->last_reclaimed;
+-
+ 	return max(sc->nr_to_reclaim, compact_gap(sc->order));
+ }
+ 
+-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+ {
+-	struct blk_plug plug;
++	long nr_to_scan;
+ 	unsigned long scanned = 0;
+ 	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ 
+-	lru_add_drain();
+-
+-	blk_start_plug(&plug);
+-
+-	set_mm_walk(lruvec_pgdat(lruvec));
+-
+ 	while (true) {
+ 		int delta;
+ 		int swappiness;
+-		unsigned long nr_to_scan;
+ 
+ 		if (sc->may_swap)
+ 			swappiness = get_swappiness(lruvec, sc);
+@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct
+ 			swappiness = 0;
+ 
+ 		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+-		if (!nr_to_scan)
++		if (nr_to_scan <= 0)
+ 			break;
+ 
+ 		delta = evict_pages(lruvec, sc, swappiness);
+@@ -4912,10 +4895,250 @@ static void lru_gen_shrink_lruvec(struct
+ 		cond_resched();
+ 	}
+ 
++	/* whether try_to_inc_max_seq() was successful */
++	return nr_to_scan < 0;
++}
++
++static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
++{
++	bool success;
++	unsigned long scanned = sc->nr_scanned;
++	unsigned long reclaimed = sc->nr_reclaimed;
++	int seg = lru_gen_memcg_seg(lruvec);
++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++	/* see the comment on MEMCG_NR_GENS */
++	if (!lruvec_is_sizable(lruvec, sc))
++		return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
++
++	mem_cgroup_calculate_protection(NULL, memcg);
++
++	if (mem_cgroup_below_min(memcg))
++		return MEMCG_LRU_YOUNG;
++
++	if (mem_cgroup_below_low(memcg)) {
++		/* see the comment on MEMCG_NR_GENS */
++		if (seg != MEMCG_LRU_TAIL)
++			return MEMCG_LRU_TAIL;
++
++		memcg_memory_event(memcg, MEMCG_LOW);
++	}
++
++	success = try_to_shrink_lruvec(lruvec, sc);
++
++	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
++
++	vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
++		   sc->nr_reclaimed - reclaimed);
++
++	sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
++	current->reclaim_state->reclaimed_slab = 0;
++
++	return success ? MEMCG_LRU_YOUNG : 0;
++}
++
++#ifdef CONFIG_MEMCG
++
++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
++{
++	int gen;
++	int bin;
++	int first_bin;
++	struct lruvec *lruvec;
++	struct lru_gen_page *lrugen;
++	const struct hlist_nulls_node *pos;
++	int op = 0;
++	struct mem_cgroup *memcg = NULL;
++	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
++
++	bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
++restart:
++	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
++
++	rcu_read_lock();
++
++	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
++		if (op)
++			lru_gen_rotate_memcg(lruvec, op);
++
++		mem_cgroup_put(memcg);
++
++		lruvec = container_of(lrugen, struct lruvec, lrugen);
++		memcg = lruvec_memcg(lruvec);
++
++		if (!mem_cgroup_tryget(memcg)) {
++			op = 0;
++			memcg = NULL;
++			continue;
++		}
++
++		rcu_read_unlock();
++
++		op = shrink_one(lruvec, sc);
++
++		if (sc->nr_reclaimed >= nr_to_reclaim)
++			goto success;
++
++		rcu_read_lock();
++	}
++
++	rcu_read_unlock();
++
++	/* restart if raced with lru_gen_rotate_memcg() */
++	if (gen != get_nulls_value(pos))
++		goto restart;
++
++	/* try the rest of the bins of the current generation */
++	bin = get_memcg_bin(bin + 1);
++	if (bin != first_bin)
++		goto restart;
++success:
++	if (op)
++		lru_gen_rotate_memcg(lruvec, op);
++
++	mem_cgroup_put(memcg);
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++	struct blk_plug plug;
++
++	VM_WARN_ON_ONCE(global_reclaim(sc));
++
++	lru_add_drain();
++
++	blk_start_plug(&plug);
++
++	set_mm_walk(lruvec_pgdat(lruvec));
++
++	if (try_to_shrink_lruvec(lruvec, sc))
++		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
++
++	clear_mm_walk();
++
++	blk_finish_plug(&plug);
++}
++
++#else /* !CONFIG_MEMCG */
++
++static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
++{
++	BUILD_BUG();
++}
++
++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
++{
++	BUILD_BUG();
++}
++
++#endif
++
++static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
++{
++	int priority;
++	unsigned long reclaimable;
++	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
++
++	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
++		return;
++	/*
++	 * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
++	 * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
++	 * estimated reclaimed_to_scanned_ratio = inactive / total.
++	 */
++	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
++	if (get_swappiness(lruvec, sc))
++		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
++
++	reclaimable /= MEMCG_NR_GENS;
++
++	/* round down reclaimable and round up sc->nr_to_reclaim */
++	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
++
++	sc->priority = clamp(priority, 0, DEF_PRIORITY);
++}
++
++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++	struct blk_plug plug;
++	unsigned long reclaimed = sc->nr_reclaimed;
++
++	VM_WARN_ON_ONCE(!global_reclaim(sc));
++
++	lru_add_drain();
++
++	blk_start_plug(&plug);
++
++	set_mm_walk(pgdat);
++
++	set_initial_priority(pgdat, sc);
++
++	if (current_is_kswapd())
++		sc->nr_reclaimed = 0;
++
++	if (mem_cgroup_disabled())
++		shrink_one(&pgdat->__lruvec, sc);
++	else
++		shrink_many(pgdat, sc);
++
++	if (current_is_kswapd())
++		sc->nr_reclaimed += reclaimed;
++
+ 	clear_mm_walk();
+ 
+ 	blk_finish_plug(&plug);
++
++	/* kswapd should never fail */
++	pgdat->kswapd_failures = 0;
++}
++
++#ifdef CONFIG_MEMCG
++void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
++{
++	int seg;
++	int old, new;
++	int bin = prandom_u32_max(MEMCG_NR_BINS);
++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
++
++	spin_lock(&pgdat->memcg_lru.lock);
++
++	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
++
++	seg = 0;
++	new = old = lruvec->lrugen.gen;
++
++	/* see the comment on MEMCG_NR_GENS */
++	if (op == MEMCG_LRU_HEAD)
++		seg = MEMCG_LRU_HEAD;
++	else if (op == MEMCG_LRU_TAIL)
++		seg = MEMCG_LRU_TAIL;
++	else if (op == MEMCG_LRU_OLD)
++		new = get_memcg_gen(pgdat->memcg_lru.seq);
++	else if (op == MEMCG_LRU_YOUNG)
++		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
++	else
++		VM_WARN_ON_ONCE(true);
++
++	hlist_nulls_del_rcu(&lruvec->lrugen.list);
++
++	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
++		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
++	else
++		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
++
++	pgdat->memcg_lru.nr_memcgs[old]--;
++	pgdat->memcg_lru.nr_memcgs[new]++;
++
++	lruvec->lrugen.gen = new;
++	WRITE_ONCE(lruvec->lrugen.seg, seg);
++
++	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
++		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
++
++	spin_unlock(&pgdat->memcg_lru.lock);
+ }
++#endif
+ 
+ /******************************************************************************
+  *                          state change
+@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_i
+ 
+ 	if (!mem_cgroup_disabled()) {
+ 		rcu_read_lock();
++
+ 		memcg = mem_cgroup_from_id(memcg_id);
+-#ifdef CONFIG_MEMCG
+-		if (memcg && !css_tryget(&memcg->css))
++		if (!mem_cgroup_tryget(memcg))
+ 			memcg = NULL;
+-#endif
++
+ 		rcu_read_unlock();
+ 
+ 		if (!memcg)
+@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *
+ }
+ 
+ #ifdef CONFIG_MEMCG
++
++void lru_gen_init_pgdat(struct pglist_data *pgdat)
++{
++	int i, j;
++
++	spin_lock_init(&pgdat->memcg_lru.lock);
++
++	for (i = 0; i < MEMCG_NR_GENS; i++) {
++		for (j = 0; j < MEMCG_NR_BINS; j++)
++			INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
++	}
++}
++
+ void lru_gen_init_memcg(struct mem_cgroup *memcg)
+ {
+ 	INIT_LIST_HEAD(&memcg->mm_list.fifo);
+@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
+ 		}
+ 	}
+ }
+-#endif
++
++void lru_gen_online_memcg(struct mem_cgroup *memcg)
++{
++	int gen;
++	int nid;
++	int bin = prandom_u32_max(MEMCG_NR_BINS);
++
++	for_each_node(nid) {
++		struct pglist_data *pgdat = NODE_DATA(nid);
++		struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++		spin_lock(&pgdat->memcg_lru.lock);
++
++		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
++
++		gen = get_memcg_gen(pgdat->memcg_lru.seq);
++
++		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
++		pgdat->memcg_lru.nr_memcgs[gen]++;
++
++		lruvec->lrugen.gen = gen;
++
++		spin_unlock(&pgdat->memcg_lru.lock);
++	}
++}
++
++void lru_gen_offline_memcg(struct mem_cgroup *memcg)
++{
++	int nid;
++
++	for_each_node(nid) {
++		struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
++	}
++}
++
++void lru_gen_release_memcg(struct mem_cgroup *memcg)
++{
++	int gen;
++	int nid;
++
++	for_each_node(nid) {
++		struct pglist_data *pgdat = NODE_DATA(nid);
++		struct lruvec *lruvec = get_lruvec(memcg, nid);
++
++		spin_lock(&pgdat->memcg_lru.lock);
++
++		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
++
++		gen = lruvec->lrugen.gen;
++
++		hlist_nulls_del_rcu(&lruvec->lrugen.list);
++		pgdat->memcg_lru.nr_memcgs[gen]--;
++
++		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
++			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
++
++		spin_unlock(&pgdat->memcg_lru.lock);
++	}
++}
++
++#endif /* CONFIG_MEMCG */
+ 
+ static int __init init_lru_gen(void)
+ {
+@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct
+ {
+ }
+ 
++static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
++{
++}
++
+ #endif /* CONFIG_LRU_GEN */
+ 
+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec
+ 	bool proportional_reclaim;
+ 	struct blk_plug plug;
+ 
+-	if (lru_gen_enabled()) {
++	if (lru_gen_enabled() && !global_reclaim(sc)) {
+ 		lru_gen_shrink_lruvec(lruvec, sc);
+ 		return;
+ 	}
+@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat
+ 	struct lruvec *target_lruvec;
+ 	bool reclaimable = false;
+ 
++	if (lru_gen_enabled() && global_reclaim(sc)) {
++		lru_gen_shrink_node(pgdat, sc);
++		return;
++	}
++
+ 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+ 
+ again:
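
Most of the memcg LRU bookkeeping added above is modular arithmetic on the per-node seq counter plus a random bin choice. A toy standalone C model of how the four operations pick a target generation; it substitutes rand() and plain integers for the kernel's prandom_u32_max() and the RCU hlist_nulls protected by memcg_lru.lock, and the node_seq value is invented:

#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS 2
#define MEMCG_NR_BINS 8

enum { MEMCG_LRU_NOP, MEMCG_LRU_HEAD, MEMCG_LRU_TAIL,
       MEMCG_LRU_OLD, MEMCG_LRU_YOUNG };

static int get_memcg_gen(unsigned long seq) { return seq % MEMCG_NR_GENS; }
static int get_memcg_bin(int bin)           { return bin % MEMCG_NR_BINS; }

/* which per-node generation a rotated memcg lands in, given the node's seq */
static int target_gen(int op, int cur_gen, unsigned long node_seq)
{
    switch (op) {
    case MEMCG_LRU_OLD:                       /* back to the old generation */
        return get_memcg_gen(node_seq);
    case MEMCG_LRU_YOUNG:                     /* on to the young generation */
        return get_memcg_gen(node_seq + 1);
    default:                                  /* HEAD/TAIL only change seg */
        return cur_gen;
    }
}

int main(void)
{
    unsigned long node_seq = 41;              /* invented per-node counter */
    int cur_gen = get_memcg_gen(node_seq);
    int bin = get_memcg_bin(rand());          /* kernel: prandom_u32_max() */

    printf("old gen = %d, young gen = %d, random bin = %d\n",
           get_memcg_gen(node_seq), get_memcg_gen(node_seq + 1), bin);
    printf("MEMCG_LRU_YOUNG would move this memcg to gen %d\n",
           target_gen(MEMCG_LRU_YOUNG, cur_gen, node_seq));
    printf("MEMCG_LRU_OLD would move this memcg to gen %d\n",
           target_gen(MEMCG_LRU_OLD, cur_gen, node_seq));
    printf("MEMCG_LRU_HEAD/TAIL keep it in gen %d\n",
           target_gen(MEMCG_LRU_HEAD, cur_gen, node_seq));
    return 0;
}

The per-node seq is only incremented once the old generation's bins are all empty, so get_memcg_gen(seq) always indexes the generation the eviction is currently draining.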

+ 196 - 0
target/linux/generic/backport-6.1/020-v6.3-27-mm-multi-gen-LRU-clarify-scan_control-flags.patch

@@ -0,0 +1,196 @@
+From 93147736b5b3a21bea24313bfc7a696829932009 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:05 -0700
+Subject: [PATCH 27/29] mm: multi-gen LRU: clarify scan_control flags
+
+Among the flags in scan_control:
+1. sc->may_swap, which indicates swap constraint due to memsw.max, is
+   supported as usual.
+2. sc->proactive, which indicates reclaim by memory.reclaim, may not
+   opportunistically skip the aging path, since it is considered less
+   latency sensitive.
+3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers
+   swappiness to prioritize file LRU, since clean file pages are more
+   likely to exist.
+4. sc->may_writepage and sc->may_unmap, which indicates opportunistic
+   reclaim, are rejected, since unmapped clean pages are already
+   prioritized. Scanning for more of them is likely futile and can
+   cause high reclaim latency when there is a large number of memcgs.
+
+The rest are handled by the existing code.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 55 +++++++++++++++++++++++++++--------------------------
+ 1 file changed, 28 insertions(+), 27 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2905,6 +2905,9 @@ static int get_swappiness(struct lruvec
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ 
++	if (!sc->may_swap)
++		return 0;
++
+ 	if (!can_demote(pgdat->node_id, sc) &&
+ 	    mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
+ 		return 0;
+@@ -3952,7 +3955,7 @@ static void walk_mm(struct lruvec *lruve
+ 	} while (err == -EAGAIN);
+ }
+ 
+-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
++static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
+ {
+ 	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+ 
+@@ -3960,7 +3963,7 @@ static struct lru_gen_mm_walk *set_mm_wa
+ 		VM_WARN_ON_ONCE(walk);
+ 
+ 		walk = &pgdat->mm_walk;
+-	} else if (!pgdat && !walk) {
++	} else if (!walk && force_alloc) {
+ 		VM_WARN_ON_ONCE(current_is_kswapd());
+ 
+ 		walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+@@ -4146,7 +4149,7 @@ static bool try_to_inc_max_seq(struct lr
+ 		goto done;
+ 	}
+ 
+-	walk = set_mm_walk(NULL);
++	walk = set_mm_walk(NULL, true);
+ 	if (!walk) {
+ 		success = iterate_mm_list_nowalk(lruvec, max_seq);
+ 		goto done;
+@@ -4215,8 +4218,6 @@ static bool lruvec_is_reclaimable(struct
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MIN_SEQ(lruvec);
+ 
+-	VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+-
+ 	/* see the comment on lru_gen_page */
+ 	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ 	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+@@ -4472,12 +4473,8 @@ static bool isolate_page(struct lruvec *
+ {
+ 	bool success;
+ 
+-	/* unmapping inhibited */
+-	if (!sc->may_unmap && page_mapped(page))
+-		return false;
+-
+ 	/* swapping inhibited */
+-	if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
++	if (!(sc->gfp_mask & __GFP_IO) &&
+ 	    (PageDirty(page) ||
+ 	     (PageAnon(page) && !PageSwapCache(page))))
+ 		return false;
+@@ -4574,9 +4571,8 @@ static int scan_pages(struct lruvec *lru
+ 	__count_vm_events(PGSCAN_ANON + type, isolated);
+ 
+ 	/*
+-	 * There might not be eligible pages due to reclaim_idx, may_unmap and
+-	 * may_writepage. Check the remaining to prevent livelock if it's not
+-	 * making progress.
++	 * There might not be eligible pages due to reclaim_idx. Check the
++	 * remaining to prevent livelock if it's not making progress.
+ 	 */
+ 	return isolated || !remaining ? scanned : 0;
+ }
+@@ -4836,8 +4832,7 @@ static long get_nr_to_scan(struct lruvec
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	DEFINE_MAX_SEQ(lruvec);
+ 
+-	if (mem_cgroup_below_min(memcg) ||
+-	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
++	if (mem_cgroup_below_min(memcg))
+ 		return 0;
+ 
+ 	if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
+@@ -4865,17 +4860,14 @@ static bool try_to_shrink_lruvec(struct
+ 	long nr_to_scan;
+ 	unsigned long scanned = 0;
+ 	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
++	int swappiness = get_swappiness(lruvec, sc);
++
++	/* clean file pages are more likely to exist */
++	if (swappiness && !(sc->gfp_mask & __GFP_IO))
++		swappiness = 1;
+ 
+ 	while (true) {
+ 		int delta;
+-		int swappiness;
+-
+-		if (sc->may_swap)
+-			swappiness = get_swappiness(lruvec, sc);
+-		else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
+-			swappiness = 1;
+-		else
+-			swappiness = 0;
+ 
+ 		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ 		if (nr_to_scan <= 0)
+@@ -5005,12 +4997,13 @@ static void lru_gen_shrink_lruvec(struct
+ 	struct blk_plug plug;
+ 
+ 	VM_WARN_ON_ONCE(global_reclaim(sc));
++	VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
+ 
+ 	lru_add_drain();
+ 
+ 	blk_start_plug(&plug);
+ 
+-	set_mm_walk(lruvec_pgdat(lruvec));
++	set_mm_walk(NULL, false);
+ 
+ 	if (try_to_shrink_lruvec(lruvec, sc))
+ 		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+@@ -5066,11 +5059,19 @@ static void lru_gen_shrink_node(struct p
+ 
+ 	VM_WARN_ON_ONCE(!global_reclaim(sc));
+ 
++	/*
++	 * Unmapped clean pages are already prioritized. Scanning for more of
++	 * them is likely futile and can cause high reclaim latency when there
++	 * is a large number of memcgs.
++	 */
++	if (!sc->may_writepage || !sc->may_unmap)
++		goto done;
++
+ 	lru_add_drain();
+ 
+ 	blk_start_plug(&plug);
+ 
+-	set_mm_walk(pgdat);
++	set_mm_walk(pgdat, false);
+ 
+ 	set_initial_priority(pgdat, sc);
+ 
+@@ -5088,7 +5089,7 @@ static void lru_gen_shrink_node(struct p
+ 	clear_mm_walk();
+ 
+ 	blk_finish_plug(&plug);
+-
++done:
+ 	/* kswapd should never fail */
+ 	pgdat->kswapd_failures = 0;
+ }
+@@ -5656,7 +5657,7 @@ static ssize_t lru_gen_seq_write(struct
+ 	set_task_reclaim_state(current, &sc.reclaim_state);
+ 	flags = memalloc_noreclaim_save();
+ 	blk_start_plug(&plug);
+-	if (!set_mm_walk(NULL)) {
++	if (!set_mm_walk(NULL, true)) {
+ 		err = -ENOMEM;
+ 		goto done;
+ 	}
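
Note: the flag handling described in the patch above amounts to a small decision on swappiness. The following stand-alone C sketch models it in userspace; the function name effective_swappiness() and its parameters are illustrative stand-ins for the scan_control fields touched by the hunks, not kernel API.

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the MGLRU swappiness decision after this patch:
 * - !sc->may_swap forces swappiness to 0 (the anon LRU is off limits),
 * - an IO-constrained context (no __GFP_IO) lowers a non-zero swappiness
 *   to 1, preferring clean file pages, which are more likely to exist.
 */
static int effective_swappiness(bool may_swap, bool gfp_io, int base_swappiness)
{
	int swappiness = may_swap ? base_swappiness : 0;

	if (swappiness && !gfp_io)
		swappiness = 1;

	return swappiness;
}

int main(void)
{
	printf("%d\n", effective_swappiness(true, true, 60));  /* 60: unconstrained */
	printf("%d\n", effective_swappiness(true, false, 60)); /* 1: IO-constrained */
	printf("%d\n", effective_swappiness(false, true, 60)); /* 0: swap inhibited */
	return 0;
}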

+ 34 - 0
target/linux/generic/backport-6.1/020-v6.3-28-mm-multi-gen-LRU-simplify-arch_has_hw_pte_young-chec.patch

@@ -0,0 +1,34 @@
+From cf3297e4c7a928da8b2b2f0baff2f9c69ea57952 Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Wed, 21 Dec 2022 21:19:06 -0700
+Subject: [PATCH 28/29] mm: multi-gen LRU: simplify arch_has_hw_pte_young()
+ check
+
+Scanning page tables when hardware does not set the accessed bit has
+no real use cases.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Signed-off-by: Yu Zhao <[email protected]>
+Cc: Johannes Weiner <[email protected]>
+Cc: Jonathan Corbet <[email protected]>
+Cc: Michael Larabel <[email protected]>
+Cc: Michal Hocko <[email protected]>
+Cc: Mike Rapoport <[email protected]>
+Cc: Roman Gushchin <[email protected]>
+Cc: Suren Baghdasaryan <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4144,7 +4144,7 @@ static bool try_to_inc_max_seq(struct lr
+ 	 * handful of PTEs. Spreading the work out over a period of time usually
+ 	 * is less efficient, but it avoids bursty page faults.
+ 	 */
+-	if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
++	if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
+ 		success = iterate_mm_list_nowalk(lruvec, max_seq);
+ 		goto done;
+ 	}

+ 88 - 0
target/linux/generic/backport-6.1/020-v6.3-29-mm-multi-gen-LRU-avoid-futile-retries.patch

@@ -0,0 +1,88 @@
+From cc67f962cc53f6e1dfa92eb85b7b26fe83a3c66f Mon Sep 17 00:00:00 2001
+From: Yu Zhao <[email protected]>
+Date: Mon, 13 Feb 2023 00:53:22 -0700
+Subject: [PATCH 29/29] mm: multi-gen LRU: avoid futile retries
+
+Recall that the per-node memcg LRU has two generations and they alternate
+when the last memcg (of a given node) is moved from one to the other.
+Each generation is also sharded into multiple bins to improve scalability.
+A reclaimer starts with a random bin (in the old generation) and, if it
+fails, it will retry, i.e., try the rest of the bins.
+
+If a reclaimer fails with the last memcg, it should move this memcg to the
+young generation first, which causes the generations to alternate, and
+then retry.  Otherwise, the retries will be futile because all other bins
+are empty.
+
+Link: https://lkml.kernel.org/r/[email protected]
+Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
+Signed-off-by: Yu Zhao <[email protected]>
+Reported-by: T.J. Mercier <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+---
+ mm/vmscan.c | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4934,18 +4934,20 @@ static int shrink_one(struct lruvec *lru
+ 
+ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+ {
++	int op;
+ 	int gen;
+ 	int bin;
+ 	int first_bin;
+ 	struct lruvec *lruvec;
+ 	struct lru_gen_page *lrugen;
++	struct mem_cgroup *memcg;
+ 	const struct hlist_nulls_node *pos;
+-	int op = 0;
+-	struct mem_cgroup *memcg = NULL;
+ 	unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ 
+ 	bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
+ restart:
++	op = 0;
++	memcg = NULL;
+ 	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+ 
+ 	rcu_read_lock();
+@@ -4969,14 +4971,22 @@ restart:
+ 
+ 		op = shrink_one(lruvec, sc);
+ 
+-		if (sc->nr_reclaimed >= nr_to_reclaim)
+-			goto success;
+-
+ 		rcu_read_lock();
++
++		if (sc->nr_reclaimed >= nr_to_reclaim)
++			break;
+ 	}
+ 
+ 	rcu_read_unlock();
+ 
++	if (op)
++		lru_gen_rotate_memcg(lruvec, op);
++
++	mem_cgroup_put(memcg);
++
++	if (sc->nr_reclaimed >= nr_to_reclaim)
++		return;
++
+ 	/* restart if raced with lru_gen_rotate_memcg() */
+ 	if (gen != get_nulls_value(pos))
+ 		goto restart;
+@@ -4985,11 +4995,6 @@ restart:
+ 	bin = get_memcg_bin(bin + 1);
+ 	if (bin != first_bin)
+ 		goto restart;
+-success:
+-	if (op)
+-		lru_gen_rotate_memcg(lruvec, op);
+-
+-	mem_cgroup_put(memcg);
+ }
+ 
+ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
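
Note: the retry pattern above can be pictured with a stand-alone model. The sketch below is a hypothetical userspace approximation (fixed bin count, no RCU, no memcg objects); shrink_bin(), NR_BINS and the pass counter are illustrative names, not kernel symbols. It shows why the last memcg must be rotated before retrying: otherwise the retry walks the same empty bins again.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NR_BINS 4

/* Toy stand-in for shrink_one(): only the other generation yields pages here */
static long shrink_bin(int gen, int bin)
{
	(void)bin;
	return gen == 1 ? 8 : 0;
}

int main(void)
{
	long reclaimed = 0, target = 16;
	int gen = 0;     /* generation currently being reclaimed from */
	int passes = 0;
	int first_bin, bin;

	srand((unsigned)time(NULL));
	first_bin = bin = rand() % NR_BINS;  /* reclaimers start at a random bin */

	while (reclaimed < target && passes < 2) {
		reclaimed += shrink_bin(gen, bin);
		bin = (bin + 1) % NR_BINS;
		if (bin == first_bin) {
			/*
			 * A full pass over this generation came up short:
			 * rotate so the generations alternate before the
			 * retry, which is the point of this fix.
			 */
			gen ^= 1;
			passes++;
		}
	}

	printf("reclaimed %ld of %ld page(s)\n", reclaimed, target);
	return 0;
}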

+ 65 - 0
target/linux/generic/backport-6.1/050-v5.16-00-MIPS-uasm-Enable-muhu-opcode-for-MIPS-R6.patch

@@ -0,0 +1,65 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:02 +0200
+Subject: [PATCH] MIPS: uasm: Enable muhu opcode for MIPS R6
+
+Enable the 'muhu' instruction, complementing the existing 'mulu', needed
+to implement a MIPS32 BPF JIT.
+
+Also fix a typo in the existing definition of 'dmulu'.
+
+Signed-off-by: Tony Ambardar <[email protected]>
+
+This patch is a dependency for my 32-bit MIPS eBPF JIT.
+
+Signed-off-by: Johan Almbladh <[email protected]>
+---
+
+--- a/arch/mips/include/asm/uasm.h
++++ b/arch/mips/include/asm/uasm.h
+@@ -145,6 +145,7 @@ Ip_u1(_mtlo);
+ Ip_u3u1u2(_mul);
+ Ip_u1u2(_multu);
+ Ip_u3u1u2(_mulu);
++Ip_u3u1u2(_muhu);
+ Ip_u3u1u2(_nor);
+ Ip_u3u1u2(_or);
+ Ip_u2u1u3(_ori);
+--- a/arch/mips/mm/uasm-mips.c
++++ b/arch/mips/mm/uasm-mips.c
+@@ -90,7 +90,7 @@ static const struct insn insn_table[insn
+ 				RS | RT | RD},
+ 	[insn_dmtc0]	= {M(cop0_op, dmtc_op, 0, 0, 0, 0), RT | RD | SET},
+ 	[insn_dmultu]	= {M(spec_op, 0, 0, 0, 0, dmultu_op), RS | RT},
+-	[insn_dmulu]	= {M(spec_op, 0, 0, 0, dmult_dmul_op, dmultu_op),
++	[insn_dmulu]	= {M(spec_op, 0, 0, 0, dmultu_dmulu_op, dmultu_op),
+ 				RS | RT | RD},
+ 	[insn_drotr]	= {M(spec_op, 1, 0, 0, 0, dsrl_op), RT | RD | RE},
+ 	[insn_drotr32]	= {M(spec_op, 1, 0, 0, 0, dsrl32_op), RT | RD | RE},
+@@ -150,6 +150,8 @@ static const struct insn insn_table[insn
+ 	[insn_mtlo]	= {M(spec_op, 0, 0, 0, 0, mtlo_op), RS},
+ 	[insn_mulu]	= {M(spec_op, 0, 0, 0, multu_mulu_op, multu_op),
+ 				RS | RT | RD},
++	[insn_muhu]	= {M(spec_op, 0, 0, 0, multu_muhu_op, multu_op),
++				RS | RT | RD},
+ #ifndef CONFIG_CPU_MIPSR6
+ 	[insn_mul]	= {M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
+ #else
+--- a/arch/mips/mm/uasm.c
++++ b/arch/mips/mm/uasm.c
+@@ -59,7 +59,7 @@ enum opcode {
+ 	insn_lddir, insn_ldpte, insn_ldx, insn_lh, insn_lhu, insn_ll, insn_lld,
+ 	insn_lui, insn_lw, insn_lwu, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi,
+ 	insn_mflo, insn_modu, insn_movn, insn_movz, insn_mtc0, insn_mthc0,
+-	insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_nor,
++	insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_muhu, insn_nor,
+ 	insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb, insn_sc,
+ 	insn_scd, insn_seleqz, insn_selnez, insn_sd, insn_sh, insn_sll,
+ 	insn_sllv, insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra,
+@@ -344,6 +344,7 @@ I_u1(_mtlo)
+ I_u3u1u2(_mul)
+ I_u1u2(_multu)
+ I_u3u1u2(_mulu)
++I_u3u1u2(_muhu)
+ I_u3u1u2(_nor)
+ I_u3u1u2(_or)
+ I_u2u1u3(_ori)
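
Note: for readers unfamiliar with the R6 pair, 'mulu' yields the low 32 bits and 'muhu' the high 32 bits of an unsigned 32x32-bit product, which is what a 32-bit BPF JIT needs for 64-bit multiplies. A minimal userspace illustration of the two halves (not part of the patch):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint32_t a = 0xdeadbeef, b = 0x12345678;
	uint64_t prod = (uint64_t)a * b;

	/* mulu rd, rs, rt: low 32 bits of the unsigned product */
	printf("mulu: 0x%08" PRIx32 "\n", (uint32_t)prod);
	/* muhu rd, rs, rt: high 32 bits of the unsigned product */
	printf("muhu: 0x%08" PRIx32 "\n", (uint32_t)(prod >> 32));
	return 0;
}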

+ 31 - 0
target/linux/generic/backport-6.1/050-v5.16-01-mips-uasm-Add-workaround-for-Loongson-2F-nop-CPU-err.patch

@@ -0,0 +1,31 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:03 +0200
+Subject: [PATCH] mips: uasm: Add workaround for Loongson-2F nop CPU errata
+
+This patch implements a workaround for the Loongson-2F nop in generated
+code, if the existing option CONFIG_CPU_NOP_WORKAROUNDS is set. Before,
+the binutils option -mfix-loongson2f-nop was enabled, but no workaround
+was done when emitting MIPS code. Now, the nop pseudo instruction is
+emitted as "or at,at,zero" instead of the default "sll zero,zero,0". This
+is consistent with the workaround implemented by binutils.
+
+Link: https://sourceware.org/legacy-ml/binutils/2009-11/msg00387.html
+
+Signed-off-by: Johan Almbladh <[email protected]>
+Reviewed-by: Jiaxun Yang <[email protected]>
+---
+
+--- a/arch/mips/include/asm/uasm.h
++++ b/arch/mips/include/asm/uasm.h
+@@ -249,7 +249,11 @@ static inline void uasm_l##lb(struct uas
+ #define uasm_i_bnezl(buf, rs, off) uasm_i_bnel(buf, rs, 0, off)
+ #define uasm_i_ehb(buf) uasm_i_sll(buf, 0, 0, 3)
+ #define uasm_i_move(buf, a, b) UASM_i_ADDU(buf, a, 0, b)
++#ifdef CONFIG_CPU_NOP_WORKAROUNDS
++#define uasm_i_nop(buf) uasm_i_or(buf, 1, 1, 0)
++#else
+ #define uasm_i_nop(buf) uasm_i_sll(buf, 0, 0, 0)
++#endif
+ #define uasm_i_ssnop(buf) uasm_i_sll(buf, 0, 0, 1)
+ 
+ static inline void uasm_i_drotr_safe(u32 **p, unsigned int a1,
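
Note: with this change, uasm_i_nop() expands to uasm_i_or(buf, 1, 1, 0) on affected configurations, i.e. "or $at, $at, $zero" instead of the all-zero "sll $zero, $zero, 0". The little program below is only a sanity check of the two encodings, assuming the standard MIPS R-type layout; rtype() is an ad-hoc helper, not a kernel function.

#include <inttypes.h>
#include <stdio.h>

/* SPECIAL-class R-type word: opcode 0 | rs | rt | rd | sa | funct */
static uint32_t rtype(unsigned rs, unsigned rt, unsigned rd,
		      unsigned sa, unsigned funct)
{
	return ((uint32_t)rs << 21) | (rt << 16) | (rd << 11) | (sa << 6) | funct;
}

int main(void)
{
	/* default nop: sll $zero, $zero, 0 */
	printf("sll $0,$0,0 : 0x%08" PRIx32 "\n", rtype(0, 0, 0, 0, 0x00));
	/* Loongson-2F safe nop: or $at, $at, $zero */
	printf("or  $1,$1,$0: 0x%08" PRIx32 "\n", rtype(1, 0, 1, 0, 0x25));
	return 0;
}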

+ 3078 - 0
target/linux/generic/backport-6.1/050-v5.16-02-mips-bpf-Add-eBPF-JIT-for-32-bit-MIPS.patch

@@ -0,0 +1,3078 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:04 +0200
+Subject: [PATCH] mips: bpf: Add eBPF JIT for 32-bit MIPS
+
+This is an implementation of an eBPF JIT for 32-bit MIPS I-V and MIPS32.
+The implementation supports all 32-bit and 64-bit ALU and JMP operations,
+including the recently-added atomics. 64-bit div/mod and 64-bit atomics
+are implemented using function calls to math64 and atomic64 functions,
+respectively. All 32-bit operations are implemented natively by the JIT,
+except if the CPU lacks ll/sc instructions.
+
+Register mapping
+================
+All 64-bit eBPF registers are mapped to native 32-bit MIPS register pairs,
+with no stack scratch space used for register swapping. This means
+that all eBPF register data is kept in CPU registers all the time, and
+this simplifies the register management a lot. It also reduces the JIT's
+pressure on temporary registers since we do not have to move data around.
+
+Native register pairs are ordered according to CPU endianness, following
+the O32 calling convention for passing 64-bit arguments and return values.
+The eBPF return value, arguments and callee-saved registers are mapped to
+their native MIPS equivalents.
+
+Since the 32 highest bits in the eBPF FP (frame pointer) register are
+always zero, only one general-purpose register is actually needed for the
+mapping. The MIPS fp register is used for this purpose. The high bits are
+mapped to MIPS register r0. This saves us one CPU register, which is much
+needed for temporaries, while still allowing us to treat the R10 (FP)
+register just like any other eBPF register in the JIT.
+
+The MIPS gp (global pointer) and at (assembler temporary) registers are
+used as internal temporary registers for constant blinding. CPU registers
+t6-t9 are used internally by the JIT when constructing more complex 64-bit
+operations. This is precisely what is needed - two registers to store an
+operand value, and two more as scratch registers when performing the
+operation.
+
+The register mapping is shown below.
+
+    R0 - $v1, $v0   return value
+    R1 - $a1, $a0   argument 1, passed in registers
+    R2 - $a3, $a2   argument 2, passed in registers
+    R3 - $t1, $t0   argument 3, passed on stack
+    R4 - $t3, $t2   argument 4, passed on stack
+    R5 - $t4, $t3   argument 5, passed on stack
+    R6 - $s1, $s0   callee-saved
+    R7 - $s3, $s2   callee-saved
+    R8 - $s5, $s4   callee-saved
+    R9 - $s7, $s6   callee-saved
+    FP - $r0, $fp   32-bit frame pointer
+    AX - $gp, $at   constant-blinding
+         $t6 - $t9  unallocated, JIT temporaries
+
+Jump offsets
+============
+The JIT tries to map all conditional JMP operations to MIPS conditional
+PC-relative branches. The MIPS branch offset field is 18 bits, in bytes,
+which is equivalent to the eBPF 16-bit instruction offset. However, since
+the JIT may emit more than one CPU instruction per eBPF instruction, the
+field width may overflow. If that happens, the JIT converts the long
+conditional jump to a short PC-relative branch with the condition
+inverted, jumping over a long unconditional absolute jmp (j).
+
+This conversion will change the instruction offset mapping used for jumps,
+and may in turn result in more branch offset overflows. The JIT therefore
+dry-runs the translation until no more branches are converted and the
+offsets do not change anymore. There is an upper bound on this of course,
+and if the JIT hits that limit, the last two iterations are run with all
+branches being converted.
+
+Tail call count
+===============
+The current tail call count is stored in the 16-byte area of the caller's
+stack frame that is reserved for the callee in the o32 ABI. The value is
+initialized in the prologue, and propagated to the tail-callee by skipping
+the initialization instructions when emitting the tail call.
+
+Signed-off-by: Johan Almbladh <[email protected]>
+---
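
Note (annotation, not patch content): the "Jump offsets" rule above reduces to a signed 18-bit range check on the byte offset, matching the -0x20000..0x1ffff test used by setup_jmp() in bpf_jit_comp.c below. A stand-alone sketch of that decision; branch_offset_fits() and the sample offsets are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* MIPS conditional branches encode a signed 18-bit byte offset */
static bool branch_offset_fits(long off_bytes)
{
	return off_bytes >= -0x20000 && off_bytes <= 0x1ffff;
}

int main(void)
{
	const long offs[] = { 64, -0x20000, 0x1ffff, 0x20000 };
	unsigned i;

	for (i = 0; i < sizeof(offs) / sizeof(offs[0]); i++)
		printf("%8ld: %s\n", offs[i],
		       branch_offset_fits(offs[i]) ?
		       "short conditional branch" :
		       "inverted branch over 'j' (3 extra words, offsets recomputed)");
	return 0;
}
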
+ create mode 100644 arch/mips/net/bpf_jit_comp.c
+ create mode 100644 arch/mips/net/bpf_jit_comp.h
+ create mode 100644 arch/mips/net/bpf_jit_comp32.c
+
+--- a/arch/mips/net/Makefile
++++ b/arch/mips/net/Makefile
+@@ -2,4 +2,9 @@
+ # MIPS networking code
+ 
+ obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o
+-obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o
++
++ifeq ($(CONFIG_32BIT),y)
++        obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o bpf_jit_comp32.o
++else
++        obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o
++endif
+--- /dev/null
++++ b/arch/mips/net/bpf_jit_comp.c
+@@ -0,0 +1,1020 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * Just-In-Time compiler for eBPF bytecode on MIPS.
++ * Implementation of JIT functions common to 32-bit and 64-bit CPUs.
++ *
++ * Copyright (c) 2021 Anyfi Networks AB.
++ * Author: Johan Almbladh <[email protected]>
++ *
++ * Based on code and ideas from
++ * Copyright (c) 2017 Cavium, Inc.
++ * Copyright (c) 2017 Shubham Bansal <[email protected]>
++ * Copyright (c) 2011 Mircea Gherzan <[email protected]>
++ */
++
++/*
++ * Code overview
++ * =============
++ *
++ * - bpf_jit_comp.h
++ *   Common definitions and utilities.
++ *
++ * - bpf_jit_comp.c
++ *   Implementation of JIT top-level logic and exported JIT API functions.
++ *   Implementation of internal operations shared by 32-bit and 64-bit code.
++ *   JMP and ALU JIT control code, register control code, shared ALU and
++ *   JMP/JMP32 JIT operations.
++ *
++ * - bpf_jit_comp32.c
++ *   Implementation of functions to JIT prologue, epilogue and a single eBPF
++ *   instruction for 32-bit MIPS CPUs. The functions use shared operations
++ *   where possible, and implement the rest for 32-bit MIPS such as ALU64
++ *   operations.
++ *
++ * - bpf_jit_comp64.c
++ *   Ditto, for 64-bit MIPS CPUs.
++ *
++ * Zero and sign extension
++ * ========================
++ * 32-bit MIPS instructions on 64-bit MIPS registers use sign extension,
++ * but the eBPF instruction set mandates zero extension. We let the verifier
++ * insert explicit zero-extensions after 32-bit ALU operations, both for
++ * 32-bit and 64-bit MIPS JITs. Conditional JMP32 operations on 64-bit MIPs
++ * are JITed with sign extensions inserted when so expected.
++ *
++ * ALU operations
++ * ==============
++ * ALU operations on 32/64-bit MIPS and ALU64 operations on 64-bit MIPS are
++ * JITed in the following steps. ALU64 operations on 32-bit MIPS are more
++ * complicated and therefore only processed by special implementations in
++ * step (3).
++ *
++ * 1) valid_alu_i:
++ *    Determine if an immediate operation can be emitted as such, or if
++ *    we must fall back to the register version.
++ *
++ * 2) rewrite_alu_i:
++ *    Convert BPF operation and immediate value to a canonical form for
++ *    JITing. In some degenerate cases this form may be a no-op.
++ *
++ * 3) emit_alu_{i,i64,r,64}:
++ *    Emit instructions for an ALU or ALU64 immediate or register operation.
++ *
++ * JMP operations
++ * ==============
++ * JMP and JMP32 operations require a JIT instruction offset table for
++ * translating the jump offset. This table is computed by dry-running the
++ * JIT without actually emitting anything. However, the computed PC-relative
++ * offset may overflow the 18-bit offset field width of the native MIPS
++ * branch instruction. In such cases, the long jump is converted into the
++ * following sequence.
++ *
++ *    <branch> !<cond> +2    Inverted PC-relative branch
++ *    nop                    Delay slot
++ *    j <offset>             Unconditional absolute long jump
++ *    nop                    Delay slot
++ *
++ * Since this converted sequence alters the offset table, all offsets must
++ * be re-calculated. This may in turn trigger new branch conversions, so
++ * the process is repeated until no further changes are made. Normally it
++ * completes in 1-2 iterations. If JIT_MAX_ITERATIONS should be reached, we
++ * fall back to converting every remaining jump operation. The branch
++ * conversion is independent of how the JMP or JMP32 condition is JITed.
++ *
++ * JMP32 and JMP operations are JITed as follows.
++ *
++ * 1) setup_jmp_{i,r}:
++ *    Convert jump conditional and offset into a form that can be JITed.
++ *    This form may be a no-op, a canonical form, or an inverted PC-relative
++ *    jump if branch conversion is necessary.
++ *
++ * 2) valid_jmp_i:
++ *    Determine if an immediate operation can be emitted as such, or if
++ *    we must fall back to the register version. Applies to JMP32 for 32-bit
++ *    MIPS, and both JMP and JMP32 for 64-bit MIPS.
++ *
++ * 3) emit_jmp_{i,i64,r,r64}:
++ *    Emit instructions for an JMP or JMP32 immediate or register operation.
++ *
++ * 4) finish_jmp_{i,r}:
++ *    Emit any instructions needed to finish the jump. This includes a nop
++ *    for the delay slot if a branch was emitted, and a long absolute jump
++ *    if the branch was converted.
++ */
++
++#include <linux/limits.h>
++#include <linux/bitops.h>
++#include <linux/errno.h>
++#include <linux/filter.h>
++#include <linux/bpf.h>
++#include <linux/slab.h>
++#include <asm/bitops.h>
++#include <asm/cacheflush.h>
++#include <asm/cpu-features.h>
++#include <asm/isa-rev.h>
++#include <asm/uasm.h>
++
++#include "bpf_jit_comp.h"
++
++/* Convenience macros for descriptor access */
++#define CONVERTED(desc)	((desc) & JIT_DESC_CONVERT)
++#define INDEX(desc)	((desc) & ~JIT_DESC_CONVERT)
++
++/*
++ * Push registers on the stack, starting at a given depth from the stack
++ * pointer and increasing. The next depth to be written is returned.
++ */
++int push_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth)
++{
++	int reg;
++
++	for (reg = 0; reg < BITS_PER_BYTE * sizeof(mask); reg++)
++		if (mask & BIT(reg)) {
++			if ((excl & BIT(reg)) == 0) {
++				if (sizeof(long) == 4)
++					emit(ctx, sw, reg, depth, MIPS_R_SP);
++				else /* sizeof(long) == 8 */
++					emit(ctx, sd, reg, depth, MIPS_R_SP);
++			}
++			depth += sizeof(long);
++		}
++
++	ctx->stack_used = max((int)ctx->stack_used, depth);
++	return depth;
++}
++
++/*
++ * Pop registers from the stack, starting at a given depth from the stack
++ * pointer and increasing. The next depth to be read is returned.
++ */
++int pop_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth)
++{
++	int reg;
++
++	for (reg = 0; reg < BITS_PER_BYTE * sizeof(mask); reg++)
++		if (mask & BIT(reg)) {
++			if ((excl & BIT(reg)) == 0) {
++				if (sizeof(long) == 4)
++					emit(ctx, lw, reg, depth, MIPS_R_SP);
++				else /* sizeof(long) == 8 */
++					emit(ctx, ld, reg, depth, MIPS_R_SP);
++			}
++			depth += sizeof(long);
++		}
++
++	return depth;
++}
++
++/* Compute the 28-bit jump target address from a BPF program location */
++int get_target(struct jit_context *ctx, u32 loc)
++{
++	u32 index = INDEX(ctx->descriptors[loc]);
++	unsigned long pc = (unsigned long)&ctx->target[ctx->jit_index];
++	unsigned long addr = (unsigned long)&ctx->target[index];
++
++	if (!ctx->target)
++		return 0;
++
++	if ((addr ^ pc) & ~MIPS_JMP_MASK)
++		return -1;
++
++	return addr & MIPS_JMP_MASK;
++}
++
++/* Compute the PC-relative offset to relative BPF program offset */
++int get_offset(const struct jit_context *ctx, int off)
++{
++	return (INDEX(ctx->descriptors[ctx->bpf_index + off]) -
++		ctx->jit_index - 1) * sizeof(u32);
++}
++
++/* dst = imm (register width) */
++void emit_mov_i(struct jit_context *ctx, u8 dst, s32 imm)
++{
++	if (imm >= -0x8000 && imm <= 0x7fff) {
++		emit(ctx, addiu, dst, MIPS_R_ZERO, imm);
++	} else {
++		emit(ctx, lui, dst, (s16)((u32)imm >> 16));
++		emit(ctx, ori, dst, dst, (u16)(imm & 0xffff));
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* dst = src (register width) */
++void emit_mov_r(struct jit_context *ctx, u8 dst, u8 src)
++{
++	emit(ctx, ori, dst, src, 0);
++	clobber_reg(ctx, dst);
++}
++
++/* Validate ALU immediate range */
++bool valid_alu_i(u8 op, s32 imm)
++{
++	switch (BPF_OP(op)) {
++	case BPF_NEG:
++	case BPF_LSH:
++	case BPF_RSH:
++	case BPF_ARSH:
++		/* All legal eBPF values are valid */
++		return true;
++	case BPF_ADD:
++		/* imm must be 16 bits */
++		return imm >= -0x8000 && imm <= 0x7fff;
++	case BPF_SUB:
++		/* -imm must be 16 bits */
++		return imm >= -0x7fff && imm <= 0x8000;
++	case BPF_AND:
++	case BPF_OR:
++	case BPF_XOR:
++		/* imm must be 16 bits unsigned */
++		return imm >= 0 && imm <= 0xffff;
++	case BPF_MUL:
++		/* imm must be zero or a positive power of two */
++		return imm == 0 || (imm > 0 && is_power_of_2(imm));
++	case BPF_DIV:
++	case BPF_MOD:
++		/* imm must be a 17-bit power of two */
++		return (u32)imm <= 0x10000 && is_power_of_2((u32)imm);
++	}
++	return false;
++}
++
++/* Rewrite ALU immediate operation */
++bool rewrite_alu_i(u8 op, s32 imm, u8 *alu, s32 *val)
++{
++	bool act = true;
++
++	switch (BPF_OP(op)) {
++	case BPF_LSH:
++	case BPF_RSH:
++	case BPF_ARSH:
++	case BPF_ADD:
++	case BPF_SUB:
++	case BPF_OR:
++	case BPF_XOR:
++		/* imm == 0 is a no-op */
++		act = imm != 0;
++		break;
++	case BPF_MUL:
++		if (imm == 1) {
++			/* dst * 1 is a no-op */
++			act = false;
++		} else if (imm == 0) {
++			/* dst * 0 is dst & 0 */
++			op = BPF_AND;
++		} else {
++			/* dst * (1 << n) is dst << n */
++			op = BPF_LSH;
++			imm = ilog2(abs(imm));
++		}
++		break;
++	case BPF_DIV:
++		if (imm == 1) {
++			/* dst / 1 is a no-op */
++			act = false;
++		} else {
++			/* dst / (1 << n) is dst >> n */
++			op = BPF_RSH;
++			imm = ilog2(imm);
++		}
++		break;
++	case BPF_MOD:
++		/* dst % (1 << n) is dst & ((1 << n) - 1) */
++		op = BPF_AND;
++		imm--;
++		break;
++	}
++
++	*alu = op;
++	*val = imm;
++	return act;
++}
++
++/* ALU immediate operation (32-bit) */
++void emit_alu_i(struct jit_context *ctx, u8 dst, s32 imm, u8 op)
++{
++	switch (BPF_OP(op)) {
++	/* dst = -dst */
++	case BPF_NEG:
++		emit(ctx, subu, dst, MIPS_R_ZERO, dst);
++		break;
++	/* dst = dst & imm */
++	case BPF_AND:
++		emit(ctx, andi, dst, dst, (u16)imm);
++		break;
++	/* dst = dst | imm */
++	case BPF_OR:
++		emit(ctx, ori, dst, dst, (u16)imm);
++		break;
++	/* dst = dst ^ imm */
++	case BPF_XOR:
++		emit(ctx, xori, dst, dst, (u16)imm);
++		break;
++	/* dst = dst << imm */
++	case BPF_LSH:
++		emit(ctx, sll, dst, dst, imm);
++		break;
++	/* dst = dst >> imm */
++	case BPF_RSH:
++		emit(ctx, srl, dst, dst, imm);
++		break;
++	/* dst = dst >> imm (arithmetic) */
++	case BPF_ARSH:
++		emit(ctx, sra, dst, dst, imm);
++		break;
++	/* dst = dst + imm */
++	case BPF_ADD:
++		emit(ctx, addiu, dst, dst, imm);
++		break;
++	/* dst = dst - imm */
++	case BPF_SUB:
++		emit(ctx, addiu, dst, dst, -imm);
++		break;
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* ALU register operation (32-bit) */
++void emit_alu_r(struct jit_context *ctx, u8 dst, u8 src, u8 op)
++{
++	switch (BPF_OP(op)) {
++	/* dst = dst & src */
++	case BPF_AND:
++		emit(ctx, and, dst, dst, src);
++		break;
++	/* dst = dst | src */
++	case BPF_OR:
++		emit(ctx, or, dst, dst, src);
++		break;
++	/* dst = dst ^ src */
++	case BPF_XOR:
++		emit(ctx, xor, dst, dst, src);
++		break;
++	/* dst = dst << src */
++	case BPF_LSH:
++		emit(ctx, sllv, dst, dst, src);
++		break;
++	/* dst = dst >> src */
++	case BPF_RSH:
++		emit(ctx, srlv, dst, dst, src);
++		break;
++	/* dst = dst >> src (arithmetic) */
++	case BPF_ARSH:
++		emit(ctx, srav, dst, dst, src);
++		break;
++	/* dst = dst + src */
++	case BPF_ADD:
++		emit(ctx, addu, dst, dst, src);
++		break;
++	/* dst = dst - src */
++	case BPF_SUB:
++		emit(ctx, subu, dst, dst, src);
++		break;
++	/* dst = dst * src */
++	case BPF_MUL:
++		if (cpu_has_mips32r1 || cpu_has_mips32r6) {
++			emit(ctx, mul, dst, dst, src);
++		} else {
++			emit(ctx, multu, dst, src);
++			emit(ctx, mflo, dst);
++		}
++		break;
++	/* dst = dst / src */
++	case BPF_DIV:
++		if (cpu_has_mips32r6) {
++			emit(ctx, divu_r6, dst, dst, src);
++		} else {
++			emit(ctx, divu, dst, src);
++			emit(ctx, mflo, dst);
++		}
++		break;
++	/* dst = dst % src */
++	case BPF_MOD:
++		if (cpu_has_mips32r6) {
++			emit(ctx, modu, dst, dst, src);
++		} else {
++			emit(ctx, divu, dst, src);
++			emit(ctx, mfhi, dst);
++		}
++		break;
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Atomic read-modify-write (32-bit) */
++void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code)
++{
++	emit(ctx, ll, MIPS_R_T9, off, dst);
++	switch (code) {
++	case BPF_ADD:
++		emit(ctx, addu, MIPS_R_T8, MIPS_R_T9, src);
++		break;
++	case BPF_AND:
++		emit(ctx, and, MIPS_R_T8, MIPS_R_T9, src);
++		break;
++	case BPF_OR:
++		emit(ctx, or, MIPS_R_T8, MIPS_R_T9, src);
++		break;
++	case BPF_XOR:
++		emit(ctx, xor, MIPS_R_T8, MIPS_R_T9, src);
++		break;
++	}
++	emit(ctx, sc, MIPS_R_T8, off, dst);
++	emit(ctx, beqz, MIPS_R_T8, -16);
++	emit(ctx, nop); /* Delay slot */
++}
++
++/* Atomic compare-and-exchange (32-bit) */
++void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off)
++{
++	emit(ctx, ll, MIPS_R_T9, off, dst);
++	emit(ctx, bne, MIPS_R_T9, res, 12);
++	emit(ctx, move, MIPS_R_T8, src);     /* Delay slot */
++	emit(ctx, sc, MIPS_R_T8, off, dst);
++	emit(ctx, beqz, MIPS_R_T8, -20);
++	emit(ctx, move, res, MIPS_R_T9);     /* Delay slot */
++	clobber_reg(ctx, res);
++}
++
++/* Swap bytes and truncate a register word or half word */
++void emit_bswap_r(struct jit_context *ctx, u8 dst, u32 width)
++{
++	u8 tmp = MIPS_R_T8;
++	u8 msk = MIPS_R_T9;
++
++	switch (width) {
++	/* Swap bytes in a word */
++	case 32:
++		if (cpu_has_mips32r2 || cpu_has_mips32r6) {
++			emit(ctx, wsbh, dst, dst);
++			emit(ctx, rotr, dst, dst, 16);
++		} else {
++			emit(ctx, sll, tmp, dst, 16);    /* tmp  = dst << 16 */
++			emit(ctx, srl, dst, dst, 16);    /* dst = dst >> 16  */
++			emit(ctx, or, dst, dst, tmp);    /* dst = dst | tmp  */
++
++			emit(ctx, lui, msk, 0xff);       /* msk = 0x00ff0000 */
++			emit(ctx, ori, msk, msk, 0xff);  /* msk = msk | 0xff */
++
++			emit(ctx, and, tmp, dst, msk);   /* tmp = dst & msk  */
++			emit(ctx, sll, tmp, tmp, 8);     /* tmp = tmp << 8   */
++			emit(ctx, srl, dst, dst, 8);     /* dst = dst >> 8   */
++			emit(ctx, and, dst, dst, msk);   /* dst = dst & msk  */
++			emit(ctx, or, dst, dst, tmp);    /* reg = dst | tmp  */
++		}
++		break;
++	/* Swap bytes in a half word */
++	case 16:
++		if (cpu_has_mips32r2 || cpu_has_mips32r6) {
++			emit(ctx, wsbh, dst, dst);
++			emit(ctx, andi, dst, dst, 0xffff);
++		} else {
++			emit(ctx, andi, tmp, dst, 0xff00); /* t = d & 0xff00 */
++			emit(ctx, srl, tmp, tmp, 8);       /* t = t >> 8     */
++			emit(ctx, andi, dst, dst, 0x00ff); /* d = d & 0x00ff */
++			emit(ctx, sll, dst, dst, 8);       /* d = d << 8     */
++			emit(ctx, or,  dst, dst, tmp);     /* d = d | t      */
++		}
++		break;
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Validate jump immediate range */
++bool valid_jmp_i(u8 op, s32 imm)
++{
++	switch (op) {
++	case JIT_JNOP:
++		/* Immediate value not used */
++		return true;
++	case BPF_JEQ:
++	case BPF_JNE:
++		/* No immediate operation */
++		return false;
++	case BPF_JSET:
++	case JIT_JNSET:
++		/* imm must be 16 bits unsigned */
++		return imm >= 0 && imm <= 0xffff;
++	case BPF_JGE:
++	case BPF_JLT:
++	case BPF_JSGE:
++	case BPF_JSLT:
++		/* imm must be 16 bits */
++		return imm >= -0x8000 && imm <= 0x7fff;
++	case BPF_JGT:
++	case BPF_JLE:
++	case BPF_JSGT:
++	case BPF_JSLE:
++		/* imm + 1 must be 16 bits */
++		return imm >= -0x8001 && imm <= 0x7ffe;
++	}
++	return false;
++}
++
++/* Invert a conditional jump operation */
++static u8 invert_jmp(u8 op)
++{
++	switch (op) {
++	case BPF_JA: return JIT_JNOP;
++	case BPF_JEQ: return BPF_JNE;
++	case BPF_JNE: return BPF_JEQ;
++	case BPF_JSET: return JIT_JNSET;
++	case BPF_JGT: return BPF_JLE;
++	case BPF_JGE: return BPF_JLT;
++	case BPF_JLT: return BPF_JGE;
++	case BPF_JLE: return BPF_JGT;
++	case BPF_JSGT: return BPF_JSLE;
++	case BPF_JSGE: return BPF_JSLT;
++	case BPF_JSLT: return BPF_JSGE;
++	case BPF_JSLE: return BPF_JSGT;
++	}
++	return 0;
++}
++
++/* Prepare a PC-relative jump operation */
++static void setup_jmp(struct jit_context *ctx, u8 bpf_op,
++		      s16 bpf_off, u8 *jit_op, s32 *jit_off)
++{
++	u32 *descp = &ctx->descriptors[ctx->bpf_index];
++	int op = bpf_op;
++	int offset = 0;
++
++	/* Do not compute offsets on the first pass */
++	if (INDEX(*descp) == 0)
++		goto done;
++
++	/* Skip jumps never taken */
++	if (bpf_op == JIT_JNOP)
++		goto done;
++
++	/* Convert jumps always taken */
++	if (bpf_op == BPF_JA)
++		*descp |= JIT_DESC_CONVERT;
++
++	/*
++	 * Current ctx->jit_index points to the start of the branch preamble.
++	 * Since the preamble differs among different branch conditionals,
++	 * the current index cannot be used to compute the branch offset.
++	 * Instead, we use the offset table value for the next instruction,
++	 * which gives the index immediately after the branch delay slot.
++	 */
++	if (!CONVERTED(*descp)) {
++		int target = ctx->bpf_index + bpf_off + 1;
++		int origin = ctx->bpf_index + 1;
++
++		offset = (INDEX(ctx->descriptors[target]) -
++			  INDEX(ctx->descriptors[origin]) + 1) * sizeof(u32);
++	}
++
++	/*
++	 * The PC-relative branch offset field on MIPS is 18 bits signed,
++	 * so if the computed offset is larger than this we generate an
++	 * absolute jump that we skip with an inverted conditional branch.
++	 */
++	if (CONVERTED(*descp) || offset < -0x20000 || offset > 0x1ffff) {
++		offset = 3 * sizeof(u32);
++		op = invert_jmp(bpf_op);
++		ctx->changes += !CONVERTED(*descp);
++		*descp |= JIT_DESC_CONVERT;
++	}
++
++done:
++	*jit_off = offset;
++	*jit_op = op;
++}
++
++/* Prepare a PC-relative jump operation with immediate conditional */
++void setup_jmp_i(struct jit_context *ctx, s32 imm, u8 width,
++		 u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off)
++{
++	bool always = false;
++	bool never = false;
++
++	switch (bpf_op) {
++	case BPF_JEQ:
++	case BPF_JNE:
++		break;
++	case BPF_JSET:
++	case BPF_JLT:
++		never = imm == 0;
++		break;
++	case BPF_JGE:
++		always = imm == 0;
++		break;
++	case BPF_JGT:
++		never = (u32)imm == U32_MAX;
++		break;
++	case BPF_JLE:
++		always = (u32)imm == U32_MAX;
++		break;
++	case BPF_JSGT:
++		never = imm == S32_MAX && width == 32;
++		break;
++	case BPF_JSGE:
++		always = imm == S32_MIN && width == 32;
++		break;
++	case BPF_JSLT:
++		never = imm == S32_MIN && width == 32;
++		break;
++	case BPF_JSLE:
++		always = imm == S32_MAX && width == 32;
++		break;
++	}
++
++	if (never)
++		bpf_op = JIT_JNOP;
++	if (always)
++		bpf_op = BPF_JA;
++	setup_jmp(ctx, bpf_op, bpf_off, jit_op, jit_off);
++}
++
++/* Prepare a PC-relative jump operation with register conditional */
++void setup_jmp_r(struct jit_context *ctx, bool same_reg,
++		 u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off)
++{
++	switch (bpf_op) {
++	case BPF_JSET:
++		break;
++	case BPF_JEQ:
++	case BPF_JGE:
++	case BPF_JLE:
++	case BPF_JSGE:
++	case BPF_JSLE:
++		if (same_reg)
++			bpf_op = BPF_JA;
++		break;
++	case BPF_JNE:
++	case BPF_JLT:
++	case BPF_JGT:
++	case BPF_JSGT:
++	case BPF_JSLT:
++		if (same_reg)
++			bpf_op = JIT_JNOP;
++		break;
++	}
++	setup_jmp(ctx, bpf_op, bpf_off, jit_op, jit_off);
++}
++
++/* Finish a PC-relative jump operation */
++int finish_jmp(struct jit_context *ctx, u8 jit_op, s16 bpf_off)
++{
++	/* Emit conditional branch delay slot */
++	if (jit_op != JIT_JNOP)
++		emit(ctx, nop);
++	/*
++	 * Emit an absolute long jump with delay slot,
++	 * if the PC-relative branch was converted.
++	 */
++	if (CONVERTED(ctx->descriptors[ctx->bpf_index])) {
++		int target = get_target(ctx, ctx->bpf_index + bpf_off + 1);
++
++		if (target < 0)
++			return -1;
++		emit(ctx, j, target);
++		emit(ctx, nop);
++	}
++	return 0;
++}
++
++/* Jump immediate (32-bit) */
++void emit_jmp_i(struct jit_context *ctx, u8 dst, s32 imm, s32 off, u8 op)
++{
++	switch (op) {
++	/* No-op, used internally for branch optimization */
++	case JIT_JNOP:
++		break;
++	/* PC += off if dst & imm */
++	case BPF_JSET:
++		emit(ctx, andi, MIPS_R_T9, dst, (u16)imm);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */
++	case JIT_JNSET:
++		emit(ctx, andi, MIPS_R_T9, dst, (u16)imm);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst > imm */
++	case BPF_JGT:
++		emit(ctx, sltiu, MIPS_R_T9, dst, imm + 1);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst >= imm */
++	case BPF_JGE:
++		emit(ctx, sltiu, MIPS_R_T9, dst, imm);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst < imm */
++	case BPF_JLT:
++		emit(ctx, sltiu, MIPS_R_T9, dst, imm);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst <= imm */
++	case BPF_JLE:
++		emit(ctx, sltiu, MIPS_R_T9, dst, imm + 1);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst > imm (signed) */
++	case BPF_JSGT:
++		emit(ctx, slti, MIPS_R_T9, dst, imm + 1);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst >= imm (signed) */
++	case BPF_JSGE:
++		emit(ctx, slti, MIPS_R_T9, dst, imm);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst < imm (signed) */
++	case BPF_JSLT:
++		emit(ctx, slti, MIPS_R_T9, dst, imm);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst <= imm (signed) */
++	case BPF_JSLE:
++		emit(ctx, slti, MIPS_R_T9, dst, imm + 1);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	}
++}
++
++/* Jump register (32-bit) */
++void emit_jmp_r(struct jit_context *ctx, u8 dst, u8 src, s32 off, u8 op)
++{
++	switch (op) {
++	/* No-op, used internally for branch optimization */
++	case JIT_JNOP:
++		break;
++	/* PC += off if dst == src */
++	case BPF_JEQ:
++		emit(ctx, beq, dst, src, off);
++		break;
++	/* PC += off if dst != src */
++	case BPF_JNE:
++		emit(ctx, bne, dst, src, off);
++		break;
++	/* PC += off if dst & src */
++	case BPF_JSET:
++		emit(ctx, and, MIPS_R_T9, dst, src);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if (dst & src) == 0 (not in BPF, used for long jumps) */
++	case JIT_JNSET:
++		emit(ctx, and, MIPS_R_T9, dst, src);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst > src */
++	case BPF_JGT:
++		emit(ctx, sltu, MIPS_R_T9, src, dst);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst >= src */
++	case BPF_JGE:
++		emit(ctx, sltu, MIPS_R_T9, dst, src);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst < src */
++	case BPF_JLT:
++		emit(ctx, sltu, MIPS_R_T9, dst, src);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst <= src */
++	case BPF_JLE:
++		emit(ctx, sltu, MIPS_R_T9, src, dst);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst > src (signed) */
++	case BPF_JSGT:
++		emit(ctx, slt, MIPS_R_T9, src, dst);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst >= src (signed) */
++	case BPF_JSGE:
++		emit(ctx, slt, MIPS_R_T9, dst, src);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst < src (signed) */
++	case BPF_JSLT:
++		emit(ctx, slt, MIPS_R_T9, dst, src);
++		emit(ctx, bnez, MIPS_R_T9, off);
++		break;
++	/* PC += off if dst <= src (signed) */
++	case BPF_JSLE:
++		emit(ctx, slt, MIPS_R_T9, src, dst);
++		emit(ctx, beqz, MIPS_R_T9, off);
++		break;
++	}
++}
++
++/* Jump always */
++int emit_ja(struct jit_context *ctx, s16 off)
++{
++	int target = get_target(ctx, ctx->bpf_index + off + 1);
++
++	if (target < 0)
++		return -1;
++	emit(ctx, j, target);
++	emit(ctx, nop);
++	return 0;
++}
++
++/* Jump to epilogue */
++int emit_exit(struct jit_context *ctx)
++{
++	int target = get_target(ctx, ctx->program->len);
++
++	if (target < 0)
++		return -1;
++	emit(ctx, j, target);
++	emit(ctx, nop);
++	return 0;
++}
++
++/* Build the program body from eBPF bytecode */
++static int build_body(struct jit_context *ctx)
++{
++	const struct bpf_prog *prog = ctx->program;
++	unsigned int i;
++
++	ctx->stack_used = 0;
++	for (i = 0; i < prog->len; i++) {
++		const struct bpf_insn *insn = &prog->insnsi[i];
++		u32 *descp = &ctx->descriptors[i];
++		int ret;
++
++		access_reg(ctx, insn->src_reg);
++		access_reg(ctx, insn->dst_reg);
++
++		ctx->bpf_index = i;
++		if (ctx->target == NULL) {
++			ctx->changes += INDEX(*descp) != ctx->jit_index;
++			*descp &= JIT_DESC_CONVERT;
++			*descp |= ctx->jit_index;
++		}
++
++		ret = build_insn(insn, ctx);
++		if (ret < 0)
++			return ret;
++
++		if (ret > 0) {
++			i++;
++			if (ctx->target == NULL)
++				descp[1] = ctx->jit_index;
++		}
++	}
++
++	/* Store the end offset, where the epilogue begins */
++	ctx->descriptors[prog->len] = ctx->jit_index;
++	return 0;
++}
++
++/* Set the branch conversion flag on all instructions */
++static void set_convert_flag(struct jit_context *ctx, bool enable)
++{
++	const struct bpf_prog *prog = ctx->program;
++	u32 flag = enable ? JIT_DESC_CONVERT : 0;
++	unsigned int i;
++
++	for (i = 0; i <= prog->len; i++)
++		ctx->descriptors[i] = INDEX(ctx->descriptors[i]) | flag;
++}
++
++static void jit_fill_hole(void *area, unsigned int size)
++{
++	u32 *p;
++
++	/* We are guaranteed to have aligned memory. */
++	for (p = area; size >= sizeof(u32); size -= sizeof(u32))
++		uasm_i_break(&p, BRK_BUG); /* Increments p */
++}
++
++bool bpf_jit_needs_zext(void)
++{
++	return true;
++}
++
++struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
++{
++	struct bpf_prog *tmp, *orig_prog = prog;
++	struct bpf_binary_header *header = NULL;
++	struct jit_context ctx;
++	bool tmp_blinded = false;
++	unsigned int tmp_idx;
++	unsigned int image_size;
++	u8 *image_ptr;
++	int tries;
++
++	/*
++	 * If BPF JIT was not enabled then we must fall back to
++	 * the interpreter.
++	 */
++	if (!prog->jit_requested)
++		return orig_prog;
++	/*
++	 * If constant blinding was enabled and we failed during blinding
++	 * then we must fall back to the interpreter. Otherwise, we save
++	 * the new JITed code.
++	 */
++	tmp = bpf_jit_blind_constants(prog);
++	if (IS_ERR(tmp))
++		return orig_prog;
++	if (tmp != prog) {
++		tmp_blinded = true;
++		prog = tmp;
++	}
++
++	memset(&ctx, 0, sizeof(ctx));
++	ctx.program = prog;
++
++	/*
++	 * If we are not able to allocate memory for descriptors[],
++	 * we must fall back to the interpreter.
++	 */
++	ctx.descriptors = kcalloc(prog->len + 1, sizeof(*ctx.descriptors),
++				  GFP_KERNEL);
++	if (ctx.descriptors == NULL)
++		goto out_err;
++
++	/* First pass discovers used resources */
++	if (build_body(&ctx) < 0)
++		goto out_err;
++	/*
++	 * Second pass computes instruction offsets.
++	 * If any PC-relative branches are out of range, a sequence of
++	 * a PC-relative branch + a jump is generated, and we have to
++	 * try again from the beginning to generate the new offsets.
++	 * This is done until no additional conversions are necessary.
++	 * The last two iterations are done with all branches being
++	 * converted, to guarantee offset table convergence within a
++	 * fixed number of iterations.
++	 */
++	ctx.jit_index = 0;
++	build_prologue(&ctx);
++	tmp_idx = ctx.jit_index;
++
++	tries = JIT_MAX_ITERATIONS;
++	do {
++		ctx.jit_index = tmp_idx;
++		ctx.changes = 0;
++		if (tries == 2)
++			set_convert_flag(&ctx, true);
++		if (build_body(&ctx) < 0)
++			goto out_err;
++	} while (ctx.changes > 0 && --tries > 0);
++
++	if (WARN_ONCE(ctx.changes > 0, "JIT offsets failed to converge"))
++		goto out_err;
++
++	build_epilogue(&ctx, MIPS_R_RA);
++
++	/* Now we know the size of the structure to make */
++	image_size = sizeof(u32) * ctx.jit_index;
++	header = bpf_jit_binary_alloc(image_size, &image_ptr,
++				      sizeof(u32), jit_fill_hole);
++	/*
++	 * If we are not able to allocate memory for the structure,
++	 * we must fall back to the interpreter.
++	 */
++	if (header == NULL)
++		goto out_err;
++
++	/* Actual pass to generate final JIT code */
++	ctx.target = (u32 *)image_ptr;
++	ctx.jit_index = 0;
++
++	/*
++	 * If building the JITed code fails somehow,
++	 * we fall back to the interpreter.
++	 */
++	build_prologue(&ctx);
++	if (build_body(&ctx) < 0)
++		goto out_err;
++	build_epilogue(&ctx, MIPS_R_RA);
++
++	/* Populate line info meta data */
++	set_convert_flag(&ctx, false);
++	bpf_prog_fill_jited_linfo(prog, &ctx.descriptors[1]);
++
++	/* Set as read-only exec and flush instruction cache */
++	bpf_jit_binary_lock_ro(header);
++	flush_icache_range((unsigned long)header,
++			   (unsigned long)&ctx.target[ctx.jit_index]);
++
++	if (bpf_jit_enable > 1)
++		bpf_jit_dump(prog->len, image_size, 2, ctx.target);
++
++	prog->bpf_func = (void *)ctx.target;
++	prog->jited = 1;
++	prog->jited_len = image_size;
++
++out:
++	if (tmp_blinded)
++		bpf_jit_prog_release_other(prog, prog == orig_prog ?
++					   tmp : orig_prog);
++	kfree(ctx.descriptors);
++	return prog;
++
++out_err:
++	prog = orig_prog;
++	if (header)
++		bpf_jit_binary_free(header);
++	goto out;
++}
+--- /dev/null
++++ b/arch/mips/net/bpf_jit_comp.h
+@@ -0,0 +1,211 @@
++/* SPDX-License-Identifier: GPL-2.0-only */
++/*
++ * Just-In-Time compiler for eBPF bytecode on 32-bit and 64-bit MIPS.
++ *
++ * Copyright (c) 2021 Anyfi Networks AB.
++ * Author: Johan Almbladh <[email protected]>
++ *
++ * Based on code and ideas from
++ * Copyright (c) 2017 Cavium, Inc.
++ * Copyright (c) 2017 Shubham Bansal <[email protected]>
++ * Copyright (c) 2011 Mircea Gherzan <[email protected]>
++ */
++
++#ifndef _BPF_JIT_COMP_H
++#define _BPF_JIT_COMP_H
++
++/* MIPS registers */
++#define MIPS_R_ZERO	0   /* Const zero */
++#define MIPS_R_AT	1   /* Asm temp   */
++#define MIPS_R_V0	2   /* Result     */
++#define MIPS_R_V1	3   /* Result     */
++#define MIPS_R_A0	4   /* Argument   */
++#define MIPS_R_A1	5   /* Argument   */
++#define MIPS_R_A2	6   /* Argument   */
++#define MIPS_R_A3	7   /* Argument   */
++#define MIPS_R_A4	8   /* Arg (n64)  */
++#define MIPS_R_A5	9   /* Arg (n64)  */
++#define MIPS_R_A6	10  /* Arg (n64)  */
++#define MIPS_R_A7	11  /* Arg (n64)  */
++#define MIPS_R_T0	8   /* Temp (o32) */
++#define MIPS_R_T1	9   /* Temp (o32) */
++#define MIPS_R_T2	10  /* Temp (o32) */
++#define MIPS_R_T3	11  /* Temp (o32) */
++#define MIPS_R_T4	12  /* Temporary  */
++#define MIPS_R_T5	13  /* Temporary  */
++#define MIPS_R_T6	14  /* Temporary  */
++#define MIPS_R_T7	15  /* Temporary  */
++#define MIPS_R_S0	16  /* Saved      */
++#define MIPS_R_S1	17  /* Saved      */
++#define MIPS_R_S2	18  /* Saved      */
++#define MIPS_R_S3	19  /* Saved      */
++#define MIPS_R_S4	20  /* Saved      */
++#define MIPS_R_S5	21  /* Saved      */
++#define MIPS_R_S6	22  /* Saved      */
++#define MIPS_R_S7	23  /* Saved      */
++#define MIPS_R_T8	24  /* Temporary  */
++#define MIPS_R_T9	25  /* Temporary  */
++/*      MIPS_R_K0	26     Reserved   */
++/*      MIPS_R_K1	27     Reserved   */
++#define MIPS_R_GP	28  /* Global ptr */
++#define MIPS_R_SP	29  /* Stack ptr  */
++#define MIPS_R_FP	30  /* Frame ptr  */
++#define MIPS_R_RA	31  /* Return     */
++
++/*
++ * Jump address mask for immediate jumps. The four most significant bits
++ * must be equal to PC.
++ */
++#define MIPS_JMP_MASK	0x0fffffffUL
++
++/* Maximum number of iterations in offset table computation */
++#define JIT_MAX_ITERATIONS	8
++
++/*
++ * Jump pseudo-instructions used internally
++ * for branch conversion and branch optimization.
++ */
++#define JIT_JNSET	0xe0
++#define JIT_JNOP	0xf0
++
++/* Descriptor flag for PC-relative branch conversion */
++#define JIT_DESC_CONVERT	BIT(31)
++
++/* JIT context for an eBPF program */
++struct jit_context {
++	struct bpf_prog *program;     /* The eBPF program being JITed        */
++	u32 *descriptors;             /* eBPF to JITed CPU insn descriptors  */
++	u32 *target;                  /* JITed code buffer                   */
++	u32 bpf_index;                /* Index of current BPF program insn   */
++	u32 jit_index;                /* Index of current JIT target insn    */
++	u32 changes;                  /* Number of PC-relative branch conv   */
++	u32 accessed;                 /* Bit mask of read eBPF registers     */
++	u32 clobbered;                /* Bit mask of modified CPU registers  */
++	u32 stack_size;               /* Total allocated stack size in bytes */
++	u32 saved_size;               /* Size of callee-saved registers      */
++	u32 stack_used;               /* Stack size used for function calls  */
++};
++
++/* Emit the instruction if the JIT memory space has been allocated */
++#define emit(ctx, func, ...)					\
++do {								\
++	if ((ctx)->target != NULL) {				\
++		u32 *p = &(ctx)->target[ctx->jit_index];	\
++		uasm_i_##func(&p, ##__VA_ARGS__);		\
++	}							\
++	(ctx)->jit_index++;					\
++} while (0)
++
++/*
++ * Mark a BPF register as accessed, it needs to be
++ * initialized by the program if expected, e.g. FP.
++ */
++static inline void access_reg(struct jit_context *ctx, u8 reg)
++{
++	ctx->accessed |= BIT(reg);
++}
++
++/*
++ * Mark a CPU register as clobbered, it needs to be
++ * saved/restored by the program if callee-saved.
++ */
++static inline void clobber_reg(struct jit_context *ctx, u8 reg)
++{
++	ctx->clobbered |= BIT(reg);
++}
++
++/*
++ * Push registers on the stack, starting at a given depth from the stack
++ * pointer and increasing. The next depth to be written is returned.
++ */
++int push_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth);
++
++/*
++ * Pop registers from the stack, starting at a given depth from the stack
++ * pointer and increasing. The next depth to be read is returned.
++ */
++int pop_regs(struct jit_context *ctx, u32 mask, u32 excl, int depth);
++
++/* Compute the 28-bit jump target address from a BPF program location */
++int get_target(struct jit_context *ctx, u32 loc);
++
++/* Compute the PC-relative offset to relative BPF program offset */
++int get_offset(const struct jit_context *ctx, int off);
++
++/* dst = imm (32-bit) */
++void emit_mov_i(struct jit_context *ctx, u8 dst, s32 imm);
++
++/* dst = src (32-bit) */
++void emit_mov_r(struct jit_context *ctx, u8 dst, u8 src);
++
++/* Validate ALU/ALU64 immediate range */
++bool valid_alu_i(u8 op, s32 imm);
++
++/* Rewrite ALU/ALU64 immediate operation */
++bool rewrite_alu_i(u8 op, s32 imm, u8 *alu, s32 *val);
++
++/* ALU immediate operation (32-bit) */
++void emit_alu_i(struct jit_context *ctx, u8 dst, s32 imm, u8 op);
++
++/* ALU register operation (32-bit) */
++void emit_alu_r(struct jit_context *ctx, u8 dst, u8 src, u8 op);
++
++/* Atomic read-modify-write (32-bit) */
++void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code);
++
++/* Atomic compare-and-exchange (32-bit) */
++void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off);
++
++/* Swap bytes and truncate a register word or half word */
++void emit_bswap_r(struct jit_context *ctx, u8 dst, u32 width);
++
++/* Validate JMP/JMP32 immediate range */
++bool valid_jmp_i(u8 op, s32 imm);
++
++/* Prepare a PC-relative jump operation with immediate conditional */
++void setup_jmp_i(struct jit_context *ctx, s32 imm, u8 width,
++		 u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off);
++
++/* Prepare a PC-relative jump operation with register conditional */
++void setup_jmp_r(struct jit_context *ctx, bool same_reg,
++		 u8 bpf_op, s16 bpf_off, u8 *jit_op, s32 *jit_off);
++
++/* Finish a PC-relative jump operation */
++int finish_jmp(struct jit_context *ctx, u8 jit_op, s16 bpf_off);
++
++/* Conditional JMP/JMP32 immediate */
++void emit_jmp_i(struct jit_context *ctx, u8 dst, s32 imm, s32 off, u8 op);
++
++/* Conditional JMP/JMP32 register */
++void emit_jmp_r(struct jit_context *ctx, u8 dst, u8 src, s32 off, u8 op);
++
++/* Jump always */
++int emit_ja(struct jit_context *ctx, s16 off);
++
++/* Jump to epilogue */
++int emit_exit(struct jit_context *ctx);
++
++/*
++ * Build program prologue to set up the stack and registers.
++ * This function is implemented separately for 32-bit and 64-bit JITs.
++ */
++void build_prologue(struct jit_context *ctx);
++
++/*
++ * Build the program epilogue to restore the stack and registers.
++ * This function is implemented separately for 32-bit and 64-bit JITs.
++ */
++void build_epilogue(struct jit_context *ctx, int dest_reg);
++
++/*
++ * Convert an eBPF instruction to a native instruction, i.e.
++ * JITs an eBPF instruction.
++ * Returns :
++ *	0  - Successfully JITed an 8-byte eBPF instruction
++ *	>0 - Successfully JITed a 16-byte eBPF instruction
++ *	<0 - Failed to JIT.
++ * This function is implemented separately for 32-bit and 64-bit JITs.
++ */
++int build_insn(const struct bpf_insn *insn, struct jit_context *ctx);
++
++#endif /* _BPF_JIT_COMP_H */
+--- /dev/null
++++ b/arch/mips/net/bpf_jit_comp32.c
+@@ -0,0 +1,1741 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * Just-In-Time compiler for eBPF bytecode on MIPS.
++ * Implementation of JIT functions for 32-bit CPUs.
++ *
++ * Copyright (c) 2021 Anyfi Networks AB.
++ * Author: Johan Almbladh <[email protected]>
++ *
++ * Based on code and ideas from
++ * Copyright (c) 2017 Cavium, Inc.
++ * Copyright (c) 2017 Shubham Bansal <[email protected]>
++ * Copyright (c) 2011 Mircea Gherzan <[email protected]>
++ */
++
++#include <linux/math64.h>
++#include <linux/errno.h>
++#include <linux/filter.h>
++#include <linux/bpf.h>
++#include <asm/cpu-features.h>
++#include <asm/isa-rev.h>
++#include <asm/uasm.h>
++
++#include "bpf_jit_comp.h"
++
++/* MIPS a4-a7 are not available in the o32 ABI */
++#undef MIPS_R_A4
++#undef MIPS_R_A5
++#undef MIPS_R_A6
++#undef MIPS_R_A7
++
++/* Stack is 8-byte aligned in o32 ABI */
++#define MIPS_STACK_ALIGNMENT 8
++
++/*
++ * The top 16 bytes of a stack frame are reserved for the callee in the O32 ABI.
++ * This corresponds to stack space for register arguments a0-a3.
++ */
++#define JIT_RESERVED_STACK 16
++
++/* Temporary 64-bit register used by JIT */
++#define JIT_REG_TMP MAX_BPF_JIT_REG
++
++/*
++ * Number of prologue bytes to skip when doing a tail call.
++ * Tail call count (TCC) initialization (8 bytes) always, plus
++ * R0-to-v0 assignment (4 bytes) if big endian.
++ */
++#ifdef __BIG_ENDIAN
++#define JIT_TCALL_SKIP 12
++#else
++#define JIT_TCALL_SKIP 8
++#endif
++
++/* CPU registers holding the callee return value */
++#define JIT_RETURN_REGS	  \
++	(BIT(MIPS_R_V0) | \
++	 BIT(MIPS_R_V1))
++
++/* CPU registers arguments passed to callee directly */
++#define JIT_ARG_REGS      \
++	(BIT(MIPS_R_A0) | \
++	 BIT(MIPS_R_A1) | \
++	 BIT(MIPS_R_A2) | \
++	 BIT(MIPS_R_A3))
++
++/* CPU register arguments passed to callee on stack */
++#define JIT_STACK_REGS    \
++	(BIT(MIPS_R_T0) | \
++	 BIT(MIPS_R_T1) | \
++	 BIT(MIPS_R_T2) | \
++	 BIT(MIPS_R_T3) | \
++	 BIT(MIPS_R_T4) | \
++	 BIT(MIPS_R_T5))
++
++/* Caller-saved CPU registers */
++#define JIT_CALLER_REGS    \
++	(JIT_RETURN_REGS | \
++	 JIT_ARG_REGS    | \
++	 JIT_STACK_REGS)
++
++/* Callee-saved CPU registers */
++#define JIT_CALLEE_REGS   \
++	(BIT(MIPS_R_S0) | \
++	 BIT(MIPS_R_S1) | \
++	 BIT(MIPS_R_S2) | \
++	 BIT(MIPS_R_S3) | \
++	 BIT(MIPS_R_S4) | \
++	 BIT(MIPS_R_S5) | \
++	 BIT(MIPS_R_S6) | \
++	 BIT(MIPS_R_S7) | \
++	 BIT(MIPS_R_GP) | \
++	 BIT(MIPS_R_FP) | \
++	 BIT(MIPS_R_RA))
++
++/*
++ * Mapping of 64-bit eBPF registers to 32-bit native MIPS registers.
++ *
++ * 1) Native register pairs are ordered according to CPU endianness, following
++ *    the MIPS convention for passing 64-bit arguments and return values.
++ * 2) The eBPF return value, arguments and callee-saved registers are mapped
++ *    to their native MIPS equivalents.
++ * 3) Since the 32 highest bits in the eBPF FP register are always zero,
++ *    only one general-purpose register is actually needed for the mapping.
++ *    We use the fp register for this purpose, and map the highest bits to
++ *    the MIPS register r0 (zero).
++ * 4) We use the MIPS gp and at registers as internal temporary registers
++ *    for constant blinding. The gp register is callee-saved.
++ * 5) One 64-bit temporary register is mapped for use when sign-extending
++ *    immediate operands. MIPS registers t6-t9 are available to the JIT
++ *    as temporaries when implementing complex 64-bit operations.
++ *
++ * With this scheme all eBPF registers are being mapped to native MIPS
++ * registers without having to use any stack scratch space. The direct
++ * register mapping (2) simplifies the handling of function calls.
++ */
++static const u8 bpf2mips32[][2] = {
++	/* Return value from in-kernel function, and exit value from eBPF */
++	[BPF_REG_0] = {MIPS_R_V1, MIPS_R_V0},
++	/* Arguments from eBPF program to in-kernel function */
++	[BPF_REG_1] = {MIPS_R_A1, MIPS_R_A0},
++	[BPF_REG_2] = {MIPS_R_A3, MIPS_R_A2},
++	/* Remaining arguments, to be passed on the stack per O32 ABI */
++	[BPF_REG_3] = {MIPS_R_T1, MIPS_R_T0},
++	[BPF_REG_4] = {MIPS_R_T3, MIPS_R_T2},
++	[BPF_REG_5] = {MIPS_R_T5, MIPS_R_T4},
++	/* Callee-saved registers that in-kernel function will preserve */
++	[BPF_REG_6] = {MIPS_R_S1, MIPS_R_S0},
++	[BPF_REG_7] = {MIPS_R_S3, MIPS_R_S2},
++	[BPF_REG_8] = {MIPS_R_S5, MIPS_R_S4},
++	[BPF_REG_9] = {MIPS_R_S7, MIPS_R_S6},
++	/* Read-only frame pointer to access the eBPF stack */
++#ifdef __BIG_ENDIAN
++	[BPF_REG_FP] = {MIPS_R_FP, MIPS_R_ZERO},
++#else
++	[BPF_REG_FP] = {MIPS_R_ZERO, MIPS_R_FP},
++#endif
++	/* Temporary register for blinding constants */
++	[BPF_REG_AX] = {MIPS_R_GP, MIPS_R_AT},
++	/* Temporary register for internal JIT use */
++	[JIT_REG_TMP] = {MIPS_R_T7, MIPS_R_T6},
++};
++
++/* Get low CPU register for a 64-bit eBPF register mapping */
++static inline u8 lo(const u8 reg[])
++{
++#ifdef __BIG_ENDIAN
++	return reg[0];
++#else
++	return reg[1];
++#endif
++}
++
++/* Get high CPU register for a 64-bit eBPF register mapping */
++static inline u8 hi(const u8 reg[])
++{
++#ifdef __BIG_ENDIAN
++	return reg[1];
++#else
++	return reg[0];
++#endif
++}
++
++/*
++ * Mark a 64-bit CPU register pair as clobbered, it needs to be
++ * saved/restored by the program if callee-saved.
++ */
++static void clobber_reg64(struct jit_context *ctx, const u8 reg[])
++{
++	clobber_reg(ctx, reg[0]);
++	clobber_reg(ctx, reg[1]);
++}
++
++/* dst = imm (sign-extended) */
++static void emit_mov_se_i64(struct jit_context *ctx, const u8 dst[], s32 imm)
++{
++	emit_mov_i(ctx, lo(dst), imm);
++	if (imm < 0)
++		emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1);
++	else
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++	clobber_reg64(ctx, dst);
++}
++
++/* Zero extension, if verifier does not do it for us  */
++static void emit_zext_ver(struct jit_context *ctx, const u8 dst[])
++{
++	if (!ctx->program->aux->verifier_zext) {
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		clobber_reg(ctx, hi(dst));
++	}
++}
++
++/* Load delay slot, if ISA mandates it */
++static void emit_load_delay(struct jit_context *ctx)
++{
++	if (!cpu_has_mips_2_3_4_5_r)
++		emit(ctx, nop);
++}
++
++/* ALU immediate operation (64-bit) */
++static void emit_alu_i64(struct jit_context *ctx,
++			 const u8 dst[], s32 imm, u8 op)
++{
++	u8 src = MIPS_R_T6;
++
++	/*
++	 * ADD/SUB with all but the max negative imm can be handled by
++	 * inverting the operation and the imm value, saving one insn.
++	 */
++	if (imm > S32_MIN && imm < 0)
++		switch (op) {
++		case BPF_ADD:
++			op = BPF_SUB;
++			imm = -imm;
++			break;
++		case BPF_SUB:
++			op = BPF_ADD;
++			imm = -imm;
++			break;
++		}
++
++	/* Move immediate to temporary register */
++	emit_mov_i(ctx, src, imm);
++
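++	/*
++	 * The eBPF immediate is sign-extended to 64 bits, so its implicit
++	 * high word is 0 for imm >= 0 and ~0 for imm < 0. The low word sits
++	 * in the temporary register; each case below applies it and then
++	 * folds the implicit high word into hi(dst).
++	 */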
++	switch (op) {
++	/* dst = dst + imm */
++	case BPF_ADD:
++		emit(ctx, addu, lo(dst), lo(dst), src);
++		emit(ctx, sltu, MIPS_R_T9, lo(dst), src);
++		emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9);
++		if (imm < 0)
++			emit(ctx, addiu, hi(dst), hi(dst), -1);
++		break;
++	/* dst = dst - imm */
++	case BPF_SUB:
++		emit(ctx, sltu, MIPS_R_T9, lo(dst), src);
++		emit(ctx, subu, lo(dst), lo(dst), src);
++		emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
++		if (imm < 0)
++			emit(ctx, addiu, hi(dst), hi(dst), 1);
++		break;
++	/* dst = dst | imm */
++	case BPF_OR:
++		emit(ctx, or, lo(dst), lo(dst), src);
++		if (imm < 0)
++			emit(ctx, addiu, hi(dst), MIPS_R_ZERO, -1);
++		break;
++	/* dst = dst & imm */
++	case BPF_AND:
++		emit(ctx, and, lo(dst), lo(dst), src);
++		if (imm >= 0)
++			emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		break;
++	/* dst = dst ^ imm */
++	case BPF_XOR:
++		emit(ctx, xor, lo(dst), lo(dst), src);
++		if (imm < 0) {
++			emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst));
++			emit(ctx, addiu, hi(dst), hi(dst), -1);
++		}
++		break;
++	}
++	clobber_reg64(ctx, dst);
++}
++
++/* ALU register operation (64-bit) */
++static void emit_alu_r64(struct jit_context *ctx,
++			 const u8 dst[], const u8 src[], u8 op)
++{
++	switch (BPF_OP(op)) {
++	/* dst = dst + src */
++	case BPF_ADD:
++		if (src == dst) {
++			emit(ctx, srl, MIPS_R_T9, lo(dst), 31);
++			emit(ctx, addu, lo(dst), lo(dst), lo(dst));
++		} else {
++			emit(ctx, addu, lo(dst), lo(dst), lo(src));
++			emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src));
++		}
++		emit(ctx, addu, hi(dst), hi(dst), hi(src));
++		emit(ctx, addu, hi(dst), hi(dst), MIPS_R_T9);
++		break;
++	/* dst = dst - src */
++	case BPF_SUB:
++		emit(ctx, sltu, MIPS_R_T9, lo(dst), lo(src));
++		emit(ctx, subu, lo(dst), lo(dst), lo(src));
++		emit(ctx, subu, hi(dst), hi(dst), hi(src));
++		emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
++		break;
++	/* dst = dst | src */
++	case BPF_OR:
++		emit(ctx, or, lo(dst), lo(dst), lo(src));
++		emit(ctx, or, hi(dst), hi(dst), hi(src));
++		break;
++	/* dst = dst & src */
++	case BPF_AND:
++		emit(ctx, and, lo(dst), lo(dst), lo(src));
++		emit(ctx, and, hi(dst), hi(dst), hi(src));
++		break;
++	/* dst = dst ^ src */
++	case BPF_XOR:
++		emit(ctx, xor, lo(dst), lo(dst), lo(src));
++		emit(ctx, xor, hi(dst), hi(dst), hi(src));
++		break;
++	}
++	clobber_reg64(ctx, dst);
++}
++
++/* ALU invert (64-bit) */
++static void emit_neg_i64(struct jit_context *ctx, const u8 dst[])
++{
++	emit(ctx, sltu, MIPS_R_T9, MIPS_R_ZERO, lo(dst));
++	emit(ctx, subu, lo(dst), MIPS_R_ZERO, lo(dst));
++	emit(ctx, subu, hi(dst), MIPS_R_ZERO, hi(dst));
++	emit(ctx, subu, hi(dst), hi(dst), MIPS_R_T9);
++
++	clobber_reg64(ctx, dst);
++}
++
++/* ALU shift immediate (64-bit) */
++static void emit_shift_i64(struct jit_context *ctx,
++			   const u8 dst[], u32 imm, u8 op)
++{
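++	/*
++	 * Shifts of less than 32 bits move the bits that cross the word
++	 * boundary through T9 with a complementary shift before they are
++	 * OR-ed into the other half. Shifts of 32 bits or more reduce to
++	 * a single shift of the opposite half, with zero or sign fill.
++	 */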
++	switch (BPF_OP(op)) {
++	/* dst = dst << imm */
++	case BPF_LSH:
++		if (imm < 32) {
++			emit(ctx, srl, MIPS_R_T9, lo(dst), 32 - imm);
++			emit(ctx, sll, lo(dst), lo(dst), imm);
++			emit(ctx, sll, hi(dst), hi(dst), imm);
++			emit(ctx, or, hi(dst), hi(dst), MIPS_R_T9);
++		} else {
++			emit(ctx, sll, hi(dst), lo(dst), imm - 32);
++			emit(ctx, move, lo(dst), MIPS_R_ZERO);
++		}
++		break;
++	/* dst = dst >> imm */
++	case BPF_RSH:
++		if (imm < 32) {
++			emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm);
++			emit(ctx, srl, lo(dst), lo(dst), imm);
++			emit(ctx, srl, hi(dst), hi(dst), imm);
++			emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9);
++		} else {
++			emit(ctx, srl, lo(dst), hi(dst), imm - 32);
++			emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		}
++		break;
++	/* dst = dst >> imm (arithmetic) */
++	case BPF_ARSH:
++		if (imm < 32) {
++			emit(ctx, sll, MIPS_R_T9, hi(dst), 32 - imm);
++			emit(ctx, srl, lo(dst), lo(dst), imm);
++			emit(ctx, sra, hi(dst), hi(dst), imm);
++			emit(ctx, or, lo(dst), lo(dst), MIPS_R_T9);
++		} else {
++			emit(ctx, sra, lo(dst), hi(dst), imm - 32);
++			emit(ctx, sra, hi(dst), hi(dst), 31);
++		}
++		break;
++	}
++	clobber_reg64(ctx, dst);
++}
++
++/* ALU shift register (64-bit) */
++static void emit_shift_r64(struct jit_context *ctx,
++			   const u8 dst[], u8 src, u8 op)
++{
++	u8 t1 = MIPS_R_T8;
++	u8 t2 = MIPS_R_T9;
++
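++	/*
++	 * Dispatch on bit 5 of the shift amount: the beqz skips the
++	 * "shift >= 32" sequence when (src & 32) is zero. t2 = ~src is
++	 * used by the "shift < 32" sequences to form the complementary
++	 * shift that extracts the bits crossing the word boundary.
++	 */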
++	emit(ctx, andi, t1, src, 32);              /* t1 = src & 32          */
++	emit(ctx, beqz, t1, 16);                   /* PC += 16 if t1 == 0    */
++	emit(ctx, nor, t2, src, MIPS_R_ZERO);      /* t2 = ~src (delay slot) */
++
++	switch (BPF_OP(op)) {
++	/* dst = dst << src */
++	case BPF_LSH:
++		/* Next: shift >= 32 */
++		emit(ctx, sllv, hi(dst), lo(dst), src);    /* dh = dl << src */
++		emit(ctx, move, lo(dst), MIPS_R_ZERO);     /* dl = 0         */
++		emit(ctx, b, 20);                          /* PC += 20       */
++		/* +16: shift < 32 */
++		emit(ctx, srl, t1, lo(dst), 1);            /* t1 = dl >> 1   */
++		emit(ctx, srlv, t1, t1, t2);               /* t1 = t1 >> t2  */
++		emit(ctx, sllv, lo(dst), lo(dst), src);    /* dl = dl << src */
++		emit(ctx, sllv, hi(dst), hi(dst), src);    /* dh = dh << src */
++		emit(ctx, or, hi(dst), hi(dst), t1);       /* dh = dh | t1   */
++		break;
++	/* dst = dst >> src */
++	case BPF_RSH:
++		/* Next: shift >= 32 */
++		emit(ctx, srlv, lo(dst), hi(dst), src);    /* dl = dh >> src */
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);     /* dh = 0         */
++		emit(ctx, b, 20);                          /* PC += 20       */
++		/* +16: shift < 32 */
++		emit(ctx, sll, t1, hi(dst), 1);            /* t1 = dh << 1   */
++		emit(ctx, sllv, t1, t1, t2);               /* t1 = t1 << t2  */
++		emit(ctx, srlv, lo(dst), lo(dst), src);    /* dl = dl >> src */
++		emit(ctx, srlv, hi(dst), hi(dst), src);    /* dh = dh >> src */
++		emit(ctx, or, lo(dst), lo(dst), t1);       /* dl = dl | t1   */
++		break;
++	/* dst = dst >> src (arithmetic) */
++	case BPF_ARSH:
++		/* Next: shift >= 32 */
++		emit(ctx, srav, lo(dst), hi(dst), src);   /* dl = dh >>a src */
++		emit(ctx, sra, hi(dst), hi(dst), 31);     /* dh = dh >>a 31  */
++		emit(ctx, b, 20);                         /* PC += 20        */
++		/* +16: shift < 32 */
++		emit(ctx, sll, t1, hi(dst), 1);           /* t1 = dh << 1    */
++		emit(ctx, sllv, t1, t1, t2);              /* t1 = t1 << t2   */
++		emit(ctx, srlv, lo(dst), lo(dst), src);   /* dl = dl >> src  */
++		emit(ctx, srav, hi(dst), hi(dst), src);   /* dh = dh >>a src */
++		emit(ctx, or, lo(dst), lo(dst), t1);      /* dl = dl | t1    */
++		break;
++	}
++
++	/* +20: Done */
++	clobber_reg64(ctx, dst);
++}
++
++/* ALU mul immediate (64x32-bit) */
++static void emit_mul_i64(struct jit_context *ctx, const u8 dst[], s32 imm)
++{
++	u8 src = MIPS_R_T6;
++	u8 tmp = MIPS_R_T9;
++
++	switch (imm) {
++	/* dst = dst * 1 is a no-op */
++	case 1:
++		break;
++	/* dst = dst * -1 */
++	case -1:
++		emit_neg_i64(ctx, dst);
++		break;
++	case 0:
++		emit_mov_r(ctx, lo(dst), MIPS_R_ZERO);
++		emit_mov_r(ctx, hi(dst), MIPS_R_ZERO);
++		break;
++	/* Full 64x32 multiply */
++	default:
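++		/*
++		 * 64x32 multiply with the sign-extended immediate:
++		 * the new hi(dst) is hi(dst) * imm plus the high word of
++		 * the unsigned product lo(dst) * imm; when imm < 0, the
++		 * 0xffffffff high word of the sign-extended immediate
++		 * also contributes -lo(dst) (mod 2^32).
++		 */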
++		/* hi(dst) = hi(dst) * src(imm) */
++		emit_mov_i(ctx, src, imm);
++		if (cpu_has_mips32r1 || cpu_has_mips32r6) {
++			emit(ctx, mul, hi(dst), hi(dst), src);
++		} else {
++			emit(ctx, multu, hi(dst), src);
++			emit(ctx, mflo, hi(dst));
++		}
++
++		/* hi(dst) = hi(dst) - lo(dst) */
++		if (imm < 0)
++			emit(ctx, subu, hi(dst), hi(dst), lo(dst));
++
++		/* tmp = lo(dst) * src(imm) >> 32 */
++		/* lo(dst) = lo(dst) * src(imm) */
++		if (cpu_has_mips32r6) {
++			emit(ctx, muhu, tmp, lo(dst), src);
++			emit(ctx, mulu, lo(dst), lo(dst), src);
++		} else {
++			emit(ctx, multu, lo(dst), src);
++			emit(ctx, mflo, lo(dst));
++			emit(ctx, mfhi, tmp);
++		}
++
++		/* hi(dst) += tmp */
++		emit(ctx, addu, hi(dst), hi(dst), tmp);
++		clobber_reg64(ctx, dst);
++		break;
++	}
++}
++
++/* ALU mul register (64x64-bit) */
++static void emit_mul_r64(struct jit_context *ctx,
++			 const u8 dst[], const u8 src[])
++{
++	u8 acc = MIPS_R_T8;
++	u8 tmp = MIPS_R_T9;
++
++	/* acc = hi(dst) * lo(src) */
++	if (cpu_has_mips32r1 || cpu_has_mips32r6) {
++		emit(ctx, mul, acc, hi(dst), lo(src));
++	} else {
++		emit(ctx, multu, hi(dst), lo(src));
++		emit(ctx, mflo, acc);
++	}
++
++	/* tmp = lo(dst) * hi(src) */
++	if (cpu_has_mips32r1 || cpu_has_mips32r6) {
++		emit(ctx, mul, tmp, lo(dst), hi(src));
++	} else {
++		emit(ctx, multu, lo(dst), hi(src));
++		emit(ctx, mflo, tmp);
++	}
++
++	/* acc += tmp */
++	emit(ctx, addu, acc, acc, tmp);
++
++	/* tmp = lo(dst) * lo(src) >> 32 */
++	/* lo(dst) = lo(dst) * lo(src) */
++	if (cpu_has_mips32r6) {
++		emit(ctx, muhu, tmp, lo(dst), lo(src));
++		emit(ctx, mulu, lo(dst), lo(dst), lo(src));
++	} else {
++		emit(ctx, multu, lo(dst), lo(src));
++		emit(ctx, mflo, lo(dst));
++		emit(ctx, mfhi, tmp);
++	}
++
++	/* hi(dst) = acc + tmp */
++	emit(ctx, addu, hi(dst), acc, tmp);
++	clobber_reg64(ctx, dst);
++}
++
++/* Helper function for 64-bit modulo */
++static u64 jit_mod64(u64 a, u64 b)
++{
++	u64 rem;
++
++	div64_u64_rem(a, b, &rem);
++	return rem;
++}
++
++/* ALU div/mod register (64-bit) */
++static void emit_divmod_r64(struct jit_context *ctx,
++			    const u8 dst[], const u8 src[], u8 op)
++{
++	const u8 *r0 = bpf2mips32[BPF_REG_0]; /* Mapped to v0-v1 */
++	const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */
++	const u8 *r2 = bpf2mips32[BPF_REG_2]; /* Mapped to a2-a3 */
++	int exclude, k;
++	u32 addr = 0;
++
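++	/*
++	 * 64-bit division is delegated to a helper: both operands are moved
++	 * into the o32 argument registers a0-a3, div64_u64() or jit_mod64()
++	 * is called through t9, and the 64-bit result is taken from v0-v1.
++	 */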
++	/* Push caller-saved registers on stack */
++	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
++		  0, JIT_RESERVED_STACK);
++
++	/* Put 64-bit arguments 1 and 2 in registers a0-a3 */
++	for (k = 0; k < 2; k++) {
++		emit(ctx, move, MIPS_R_T9, src[k]);
++		emit(ctx, move, r1[k], dst[k]);
++		emit(ctx, move, r2[k], MIPS_R_T9);
++	}
++
++	/* Emit function call */
++	switch (BPF_OP(op)) {
++	/* dst = dst / src */
++	case BPF_DIV:
++		addr = (u32)&div64_u64;
++		break;
++	/* dst = dst % src */
++	case BPF_MOD:
++		addr = (u32)&jit_mod64;
++		break;
++	}
++	emit_mov_i(ctx, MIPS_R_T9, addr);
++	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
++	emit(ctx, nop); /* Delay slot */
++
++	/* Store the 64-bit result in dst */
++	emit(ctx, move, dst[0], r0[0]);
++	emit(ctx, move, dst[1], r0[1]);
++
++	/* Restore caller-saved registers, excluding the computed result */
++	exclude = BIT(lo(dst)) | BIT(hi(dst));
++	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
++		 exclude, JIT_RESERVED_STACK);
++	emit_load_delay(ctx);
++
++	clobber_reg64(ctx, dst);
++	clobber_reg(ctx, MIPS_R_V0);
++	clobber_reg(ctx, MIPS_R_V1);
++	clobber_reg(ctx, MIPS_R_RA);
++}
++
++/* Swap bytes in a register word */
++static void emit_swap8_r(struct jit_context *ctx, u8 dst, u8 src, u8 mask)
++{
++	u8 tmp = MIPS_R_T9;
++
++	emit(ctx, and, tmp, src, mask); /* tmp = src & 0x00ff00ff */
++	emit(ctx, sll, tmp, tmp, 8);    /* tmp = tmp << 8         */
++	emit(ctx, srl, dst, src, 8);    /* dst = src >> 8         */
++	emit(ctx, and, dst, dst, mask); /* dst = dst & 0x00ff00ff */
++	emit(ctx, or,  dst, dst, tmp);  /* dst = dst | tmp        */
++}
++
++/* Swap half words in a register word */
++static void emit_swap16_r(struct jit_context *ctx, u8 dst, u8 src)
++{
++	u8 tmp = MIPS_R_T9;
++
++	emit(ctx, sll, tmp, src, 16);  /* tmp = src << 16 */
++	emit(ctx, srl, dst, src, 16);  /* dst = src >> 16 */
++	emit(ctx, or,  dst, dst, tmp); /* dst = dst | tmp */
++}
++
++/* Swap bytes and truncate a register double word, word or half word */
++static void emit_bswap_r64(struct jit_context *ctx, const u8 dst[], u32 width)
++{
++	u8 tmp = MIPS_R_T8;
++
++	switch (width) {
++	/* Swap bytes in a double word */
++	case 64:
++		if (cpu_has_mips32r2 || cpu_has_mips32r6) {
++			emit(ctx, rotr, tmp, hi(dst), 16);
++			emit(ctx, rotr, hi(dst), lo(dst), 16);
++			emit(ctx, wsbh, lo(dst), tmp);
++			emit(ctx, wsbh, hi(dst), hi(dst));
++		} else {
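++			/*
++			 * Generic byte swap: exchange the two words while
++			 * swapping the half-words within each, then swap
++			 * the bytes within each half-word using the
++			 * 0x00ff00ff mask.
++			 */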
++			emit_swap16_r(ctx, tmp, lo(dst));
++			emit_swap16_r(ctx, lo(dst), hi(dst));
++			emit(ctx, move, hi(dst), tmp);
++
++			emit(ctx, lui, tmp, 0xff);      /* tmp = 0x00ff0000 */
++			emit(ctx, ori, tmp, tmp, 0xff); /* tmp = 0x00ff00ff */
++			emit_swap8_r(ctx, lo(dst), lo(dst), tmp);
++			emit_swap8_r(ctx, hi(dst), hi(dst), tmp);
++		}
++		break;
++	/* Swap bytes in a word */
++	/* Swap bytes in a half word */
++	case 32:
++	case 16:
++		emit_bswap_r(ctx, lo(dst), width);
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		break;
++	}
++	clobber_reg64(ctx, dst);
++}
++
++/* Truncate a register double word, word or half word */
++static void emit_trunc_r64(struct jit_context *ctx, const u8 dst[], u32 width)
++{
++	switch (width) {
++	case 64:
++		break;
++	/* Zero-extend a word */
++	case 32:
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		clobber_reg(ctx, hi(dst));
++		break;
++	/* Zero-extend a half word */
++	case 16:
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		emit(ctx, andi, lo(dst), lo(dst), 0xffff);
++		clobber_reg64(ctx, dst);
++		break;
++	}
++}
++
++/* Load operation: dst = *(size*)(src + off) */
++static void emit_ldx(struct jit_context *ctx,
++		     const u8 dst[], u8 src, s16 off, u8 size)
++{
++	switch (size) {
++	/* Load a byte */
++	case BPF_B:
++		emit(ctx, lbu, lo(dst), off, src);
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		break;
++	/* Load a half word */
++	case BPF_H:
++		emit(ctx, lhu, lo(dst), off, src);
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		break;
++	/* Load a word */
++	case BPF_W:
++		emit(ctx, lw, lo(dst), off, src);
++		emit(ctx, move, hi(dst), MIPS_R_ZERO);
++		break;
++	/* Load a double word */
++	case BPF_DW:
++		if (dst[1] == src) {
++			emit(ctx, lw, dst[0], off + 4, src);
++			emit(ctx, lw, dst[1], off, src);
++		} else {
++			emit(ctx, lw, dst[1], off, src);
++			emit(ctx, lw, dst[0], off + 4, src);
++		}
++		emit_load_delay(ctx);
++		break;
++	}
++	clobber_reg64(ctx, dst);
++}
++
++/* Store operation: *(size *)(dst + off) = src */
++static void emit_stx(struct jit_context *ctx,
++		     const u8 dst, const u8 src[], s16 off, u8 size)
++{
++	switch (size) {
++	/* Store a byte */
++	case BPF_B:
++		emit(ctx, sb, lo(src), off, dst);
++		break;
++	/* Store a half word */
++	case BPF_H:
++		emit(ctx, sh, lo(src), off, dst);
++		break;
++	/* Store a word */
++	case BPF_W:
++		emit(ctx, sw, lo(src), off, dst);
++		break;
++	/* Store a double word */
++	case BPF_DW:
++		emit(ctx, sw, src[1], off, dst);
++		emit(ctx, sw, src[0], off + 4, dst);
++		break;
++	}
++}
++
++/* Atomic read-modify-write (32-bit, non-ll/sc fallback) */
++static void emit_atomic_r32(struct jit_context *ctx,
++			    u8 dst, u8 src, s16 off, u8 code)
++{
++	u32 exclude = 0;
++	u32 addr = 0;
++
++	/* Push caller-saved registers on stack */
++	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
++		  0, JIT_RESERVED_STACK);
++	/*
++	 * Argument 1: dst+off if xchg, otherwise src, passed in register a0
++	 * Argument 2: src if xchg, otherwise dst+off, passed in register a1
++	 */
++	emit(ctx, move, MIPS_R_T9, dst);
++	emit(ctx, move, MIPS_R_A0, src);
++	emit(ctx, addiu, MIPS_R_A1, MIPS_R_T9, off);
++
++	/* Emit function call */
++	switch (code) {
++	case BPF_ADD:
++		addr = (u32)&atomic_add;
++		break;
++	case BPF_SUB:
++		addr = (u32)&atomic_sub;
++		break;
++	case BPF_OR:
++		addr = (u32)&atomic_or;
++		break;
++	case BPF_AND:
++		addr = (u32)&atomic_and;
++		break;
++	case BPF_XOR:
++		addr = (u32)&atomic_xor;
++		break;
++	}
++	emit_mov_i(ctx, MIPS_R_T9, addr);
++	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
++	emit(ctx, nop); /* Delay slot */
++
++	/* Restore caller-saved registers, except any fetched value */
++	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
++		 exclude, JIT_RESERVED_STACK);
++	emit_load_delay(ctx);
++	clobber_reg(ctx, MIPS_R_RA);
++}
++
++/* Atomic read-modify-write (64-bit) */
++static void emit_atomic_r64(struct jit_context *ctx,
++			    u8 dst, const u8 src[], s16 off, u8 code)
++{
++	const u8 *r1 = bpf2mips32[BPF_REG_1]; /* Mapped to a0-a1 */
++	u32 exclude = 0;
++	u32 addr = 0;
++
++	/* Push caller-saved registers on stack */
++	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
++		  0, JIT_RESERVED_STACK);
++	/*
++	 * Argument 1: 64-bit src, passed in registers a0-a1
++	 * Argument 2: 32-bit dst+off, passed in register a2
++	 */
++	emit(ctx, move, MIPS_R_T9, dst);
++	emit(ctx, move, r1[0], src[0]);
++	emit(ctx, move, r1[1], src[1]);
++	emit(ctx, addiu, MIPS_R_A2, MIPS_R_T9, off);
++
++	/* Emit function call */
++	switch (code) {
++	case BPF_ADD:
++		addr = (u32)&atomic64_add;
++		break;
++	case BPF_SUB:
++		addr = (u32)&atomic64_sub;
++		break;
++	case BPF_OR:
++		addr = (u32)&atomic64_or;
++		break;
++	case BPF_AND:
++		addr = (u32)&atomic64_and;
++		break;
++	case BPF_XOR:
++		addr = (u32)&atomic64_xor;
++		break;
++	}
++	emit_mov_i(ctx, MIPS_R_T9, addr);
++	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
++	emit(ctx, nop); /* Delay slot */
++
++	/* Restore caller-saved registers, except any fetched value */
++	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS,
++		 exclude, JIT_RESERVED_STACK);
++	emit_load_delay(ctx);
++	clobber_reg(ctx, MIPS_R_RA);
++}
++
++/*
++ * Conditional movz or an emulated equivalent.
++ * Note that the rs register may be modified.
++ */
++static void emit_movz_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt)
++{
++	if (cpu_has_mips_2) {
++		emit(ctx, movz, rd, rs, rt);           /* rd = rt ? rd : rs  */
++	} else if (cpu_has_mips32r6) {
++		if (rs != MIPS_R_ZERO)
++			emit(ctx, seleqz, rs, rs, rt); /* rs = 0 if rt != 0  */
++		emit(ctx, selnez, rd, rd, rt);         /* rd = 0 if rt == 0  */
++		if (rs != MIPS_R_ZERO)
++			emit(ctx, or, rd, rd, rs);     /* rd = rd | rs       */
++	} else {
++		emit(ctx, bnez, rt, 8);                /* PC += 8 if rt != 0 */
++		emit(ctx, nop);                        /* +0: delay slot     */
++		emit(ctx, or, rd, rs, MIPS_R_ZERO);    /* +4: rd = rs        */
++	}
++	clobber_reg(ctx, rd);
++	clobber_reg(ctx, rs);
++}
++
++/*
++ * Conditional movn or an emulated equivalent.
++ * Note that the rs register may be modified.
++ */
++static void emit_movn_r(struct jit_context *ctx, u8 rd, u8 rs, u8 rt)
++{
++	if (cpu_has_mips_2) {
++		emit(ctx, movn, rd, rs, rt);           /* rd = rt ? rs : rd  */
++	} else if (cpu_has_mips32r6) {
++		if (rs != MIPS_R_ZERO)
++			emit(ctx, selnez, rs, rs, rt); /* rs = 0 if rt == 0  */
++		emit(ctx, seleqz, rd, rd, rt);         /* rd = 0 if rt != 0  */
++		if (rs != MIPS_R_ZERO)
++			emit(ctx, or, rd, rd, rs);     /* rd = rd | rs       */
++	} else {
++		emit(ctx, beqz, rt, 8);                /* PC += 8 if rt == 0 */
++		emit(ctx, nop);                        /* +0: delay slot     */
++		emit(ctx, or, rd, rs, MIPS_R_ZERO);    /* +4: rd = rs        */
++	}
++	clobber_reg(ctx, rd);
++	clobber_reg(ctx, rs);
++}
++
++/* Emulation of 64-bit sltiu rd, rs, imm, where imm may be S32_MAX + 1 */
++static void emit_sltiu_r64(struct jit_context *ctx, u8 rd,
++			   const u8 rs[], s64 imm)
++{
++	u8 tmp = MIPS_R_T9;
++
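++	/*
++	 * Unsigned compare against the sign-extended 64-bit immediate.
++	 * For imm < 0 the implicit high word is ~0, so rs < imm iff
++	 * hi(rs) < ~0 or lo(rs) < lo(imm). For imm >= 0 the high word is
++	 * 0, so the low-word comparison only counts when hi(rs) is zero.
++	 */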
++	if (imm < 0) {
++		emit_mov_i(ctx, rd, imm);                 /* rd = imm        */
++		emit(ctx, sltu, rd, lo(rs), rd);          /* rd = rsl < rd   */
++		emit(ctx, sltiu, tmp, hi(rs), -1);        /* tmp = rsh < ~0U */
++		emit(ctx, or, rd, rd, tmp);               /* rd = rd | tmp   */
++	} else { /* imm >= 0 */
++		if (imm > 0x7fff) {
++			emit_mov_i(ctx, rd, (s32)imm);     /* rd = imm       */
++			emit(ctx, sltu, rd, lo(rs), rd);   /* rd = rsl < rd  */
++		} else {
++			emit(ctx, sltiu, rd, lo(rs), imm); /* rd = rsl < imm */
++		}
++		emit_movn_r(ctx, rd, MIPS_R_ZERO, hi(rs)); /* rd = 0 if rsh  */
++	}
++}
++
++/* Emulation of 64-bit sltu rd, rs, rt */
++static void emit_sltu_r64(struct jit_context *ctx, u8 rd,
++			  const u8 rs[], const u8 rt[])
++{
++	u8 tmp = MIPS_R_T9;
++
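++	/*
++	 * The result is the high-word comparison, except when the high
++	 * words are equal, in which case the low-word comparison decides.
++	 */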
++	emit(ctx, sltu, rd, lo(rs), lo(rt));           /* rd = rsl < rtl     */
++	emit(ctx, subu, tmp, hi(rs), hi(rt));          /* tmp = rsh - rth    */
++	emit_movn_r(ctx, rd, MIPS_R_ZERO, tmp);        /* rd = 0 if tmp != 0 */
++	emit(ctx, sltu, tmp, hi(rs), hi(rt));          /* tmp = rsh < rth    */
++	emit(ctx, or, rd, rd, tmp);                    /* rd = rd | tmp      */
++}
++
++/* Emulation of 64-bit slti rd, rs, imm, where imm may be S32_MAX + 1 */
++static void emit_slti_r64(struct jit_context *ctx, u8 rd,
++			  const u8 rs[], s64 imm)
++{
++	u8 t1 = MIPS_R_T8;
++	u8 t2 = MIPS_R_T9;
++	u8 cmp;
++
++	/*
++	 * if ((rs < 0) ^ (imm < 0)) t1 = imm >u rsl
++	 * else                      t1 = rsl <u imm
++	 */
++	emit_mov_i(ctx, rd, (s32)imm);
++	emit(ctx, sltu, t1, lo(rs), rd);               /* t1 = rsl <u imm   */
++	emit(ctx, sltu, t2, rd, lo(rs));               /* t2 = imm <u rsl   */
++	emit(ctx, srl, rd, hi(rs), 31);                /* rd = rsh >> 31    */
++	if (imm < 0)
++		emit_movz_r(ctx, t1, t2, rd);          /* t1 = rd ? t1 : t2 */
++	else
++		emit_movn_r(ctx, t1, t2, rd);          /* t1 = rd ? t2 : t1 */
++	/*
++	 * if ((imm < 0 && rsh != 0xffffffff) ||
++	 *     (imm >= 0 && rsh != 0))
++	 *      t1 = 0
++	 */
++	if (imm < 0) {
++		emit(ctx, addiu, rd, hi(rs), 1);       /* rd = rsh + 1 */
++		cmp = rd;
++	} else { /* imm >= 0 */
++		cmp = hi(rs);
++	}
++	emit_movn_r(ctx, t1, MIPS_R_ZERO, cmp);        /* t1 = 0 if cmp != 0 */
++
++	/*
++	 * if (imm < 0) rd = rsh < -1
++	 * else         rd = rsh != 0
++	 * rd = rd | t1
++	 */
++	emit(ctx, slti, rd, hi(rs), imm < 0 ? -1 : 0); /* rd = rsh < hi(imm) */
++	emit(ctx, or, rd, rd, t1);                     /* rd = rd | t1       */
++}
++
++/* Emulation of 64-bit slt rd, rs, rt */
++static void emit_slt_r64(struct jit_context *ctx, u8 rd,
++			 const u8 rs[], const u8 rt[])
++{
++	u8 t1 = MIPS_R_T7;
++	u8 t2 = MIPS_R_T8;
++	u8 t3 = MIPS_R_T9;
++
++	/*
++	 * if ((rs < 0) ^ (rt < 0)) t1 = rtl <u rsl
++	 * else                     t1 = rsl <u rtl
++	 * if (rsh == rth)          t1 = 0
++	 */
++	emit(ctx, sltu, t1, lo(rs), lo(rt));           /* t1 = rsl <u rtl   */
++	emit(ctx, sltu, t2, lo(rt), lo(rs));           /* t2 = rtl <u rsl   */
++	emit(ctx, xor, t3, hi(rs), hi(rt));            /* t3 = rsh ^ rth    */
++	emit(ctx, srl, rd, t3, 31);                    /* rd = t3 >> 31     */
++	emit_movn_r(ctx, t1, t2, rd);                  /* t1 = rd ? t2 : t1 */
++	emit_movn_r(ctx, t1, MIPS_R_ZERO, t3);         /* t1 = 0 if t3 != 0 */
++
++	/* rd = (rsh < rth) | t1 */
++	emit(ctx, slt, rd, hi(rs), hi(rt));            /* rd = rsh <s rth   */
++	emit(ctx, or, rd, rd, t1);                     /* rd = rd | t1      */
++}
++
++/* Jump immediate (64-bit) */
++static void emit_jmp_i64(struct jit_context *ctx,
++			 const u8 dst[], s32 imm, s32 off, u8 op)
++{
++	u8 tmp = MIPS_R_T6;
++
++	switch (op) {
++	/* No-op, used internally for branch optimization */
++	case JIT_JNOP:
++		break;
++	/* PC += off if dst == imm */
++	/* PC += off if dst != imm */
++	case BPF_JEQ:
++	case BPF_JNE:
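++		/*
++		 * Equality test against the sign-extended immediate: tmp is
++		 * made zero iff the low words match, then the high-word test
++		 * is OR-ed in. The implicit high word of the immediate is 0
++		 * or ~0, so the high words match iff hi(dst) == 0 (imm >= 0)
++		 * or hi(dst) + 1 == 0 (imm < 0).
++		 */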
++		if (imm >= -0x7fff && imm <= 0x8000) {
++			emit(ctx, addiu, tmp, lo(dst), -imm);
++		} else if ((u32)imm <= 0xffff) {
++			emit(ctx, xori, tmp, lo(dst), imm);
++		} else {       /* Register fallback */
++			emit_mov_i(ctx, tmp, imm);
++			emit(ctx, xor, tmp, lo(dst), tmp);
++		}
++		if (imm < 0) { /* Compare sign extension */
++			emit(ctx, addu, MIPS_R_T9, hi(dst), 1);
++			emit(ctx, or, tmp, tmp, MIPS_R_T9);
++		} else {       /* Compare zero extension */
++			emit(ctx, or, tmp, tmp, hi(dst));
++		}
++		if (op == BPF_JEQ)
++			emit(ctx, beqz, tmp, off);
++		else   /* BPF_JNE */
++			emit(ctx, bnez, tmp, off);
++		break;
++	/* PC += off if dst & imm */
++	/* PC += off if (dst & imm) == 0 (not in BPF, used for long jumps) */
++	case BPF_JSET:
++	case JIT_JNSET:
++		if ((u32)imm <= 0xffff) {
++			emit(ctx, andi, tmp, lo(dst), imm);
++		} else {     /* Register fallback */
++			emit_mov_i(ctx, tmp, imm);
++			emit(ctx, and, tmp, lo(dst), tmp);
++		}
++		if (imm < 0) /* Sign-extension pulls in high word */
++			emit(ctx, or, tmp, tmp, hi(dst));
++		if (op == BPF_JSET)
++			emit(ctx, bnez, tmp, off);
++		else   /* JIT_JNSET */
++			emit(ctx, beqz, tmp, off);
++		break;
++	/* PC += off if dst > imm */
++	case BPF_JGT:
++		emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1);
++		emit(ctx, beqz, tmp, off);
++		break;
++	/* PC += off if dst >= imm */
++	case BPF_JGE:
++		emit_sltiu_r64(ctx, tmp, dst, imm);
++		emit(ctx, beqz, tmp, off);
++		break;
++	/* PC += off if dst < imm */
++	case BPF_JLT:
++		emit_sltiu_r64(ctx, tmp, dst, imm);
++		emit(ctx, bnez, tmp, off);
++		break;
++	/* PC += off if dst <= imm */
++	case BPF_JLE:
++		emit_sltiu_r64(ctx, tmp, dst, (s64)imm + 1);
++		emit(ctx, bnez, tmp, off);
++		break;
++	/* PC += off if dst > imm (signed) */
++	case BPF_JSGT:
++		emit_slti_r64(ctx, tmp, dst, (s64)imm + 1);
++		emit(ctx, beqz, tmp, off);
++		break;
++	/* PC += off if dst >= imm (signed) */
++	case BPF_JSGE:
++		emit_slti_r64(ctx, tmp, dst, imm);
++		emit(ctx, beqz, tmp, off);
++		break;
++	/* PC += off if dst < imm (signed) */
++	case BPF_JSLT:
++		emit_slti_r64(ctx, tmp, dst, imm);
++		emit(ctx, bnez, tmp, off);
++		break;
++	/* PC += off if dst <= imm (signed) */
++	case BPF_JSLE:
++		emit_slti_r64(ctx, tmp, dst, (s64)imm + 1);
++		emit(ctx, bnez, tmp, off);
++		break;
++	}
++}
++
++/* Jump register (64-bit) */
++static void emit_jmp_r64(struct jit_context *ctx,
++			 const u8 dst[], const u8 src[], s32 off, u8 op)
++{
++	u8 t1 = MIPS_R_T6;
++	u8 t2 = MIPS_R_T7;
++
++	switch (op) {
++	/* No-op, used internally for branch optimization */
++	case JIT_JNOP:
++		break;
++	/* PC += off if dst == src */
++	/* PC += off if dst != src */
++	case BPF_JEQ:
++	case BPF_JNE:
++		emit(ctx, subu, t1, lo(dst), lo(src));
++		emit(ctx, subu, t2, hi(dst), hi(src));
++		emit(ctx, or, t1, t1, t2);
++		if (op == BPF_JEQ)
++			emit(ctx, beqz, t1, off);
++		else   /* BPF_JNE */
++			emit(ctx, bnez, t1, off);
++		break;
++	/* PC += off if dst & src */
++	/* PC += off if (dst & src) == 0 (not in BPF, used for long jumps) */
++	case BPF_JSET:
++	case JIT_JNSET:
++		emit(ctx, and, t1, lo(dst), lo(src));
++		emit(ctx, and, t2, hi(dst), hi(src));
++		emit(ctx, or, t1, t1, t2);
++		if (op == BPF_JSET)
++			emit(ctx, bnez, t1, off);
++		else   /* JIT_JNSET */
++			emit(ctx, beqz, t1, off);
++		break;
++	/* PC += off if dst > src */
++	case BPF_JGT:
++		emit_sltu_r64(ctx, t1, src, dst);
++		emit(ctx, bnez, t1, off);
++		break;
++	/* PC += off if dst >= src */
++	case BPF_JGE:
++		emit_sltu_r64(ctx, t1, dst, src);
++		emit(ctx, beqz, t1, off);
++		break;
++	/* PC += off if dst < src */
++	case BPF_JLT:
++		emit_sltu_r64(ctx, t1, dst, src);
++		emit(ctx, bnez, t1, off);
++		break;
++	/* PC += off if dst <= src */
++	case BPF_JLE:
++		emit_sltu_r64(ctx, t1, src, dst);
++		emit(ctx, beqz, t1, off);
++		break;
++	/* PC += off if dst > src (signed) */
++	case BPF_JSGT:
++		emit_slt_r64(ctx, t1, src, dst);
++		emit(ctx, bnez, t1, off);
++		break;
++	/* PC += off if dst >= src (signed) */
++	case BPF_JSGE:
++		emit_slt_r64(ctx, t1, dst, src);
++		emit(ctx, beqz, t1, off);
++		break;
++	/* PC += off if dst < src (signed) */
++	case BPF_JSLT:
++		emit_slt_r64(ctx, t1, dst, src);
++		emit(ctx, bnez, t1, off);
++		break;
++	/* PC += off if dst <= src (signed) */
++	case BPF_JSLE:
++		emit_slt_r64(ctx, t1, src, dst);
++		emit(ctx, beqz, t1, off);
++		break;
++	}
++}
++
++/* Function call */
++static int emit_call(struct jit_context *ctx, const struct bpf_insn *insn)
++{
++	bool fixed;
++	u64 addr;
++
++	/* Decode the call address */
++	if (bpf_jit_get_func_addr(ctx->program, insn, false,
++				  &addr, &fixed) < 0)
++		return -1;
++	if (!fixed)
++		return -1;
++
++	/* Push stack arguments */
++	push_regs(ctx, JIT_STACK_REGS, 0, JIT_RESERVED_STACK);
++
++	/* Emit function call */
++	emit_mov_i(ctx, MIPS_R_T9, addr);
++	emit(ctx, jalr, MIPS_R_RA, MIPS_R_T9);
++	emit(ctx, nop); /* Delay slot */
++
++	clobber_reg(ctx, MIPS_R_RA);
++	clobber_reg(ctx, MIPS_R_V0);
++	clobber_reg(ctx, MIPS_R_V1);
++	return 0;
++}
++
++/* Function tail call */
++static int emit_tail_call(struct jit_context *ctx)
++{
++	u8 ary = lo(bpf2mips32[BPF_REG_2]);
++	u8 ind = lo(bpf2mips32[BPF_REG_3]);
++	u8 t1 = MIPS_R_T8;
++	u8 t2 = MIPS_R_T9;
++	int off;
++
++	/*
++	 * Tail call:
++	 * eBPF R1   - function argument (context ptr), passed in a0-a1
++	 * eBPF R2   - ptr to object with array of function entry points
++	 * eBPF R3   - array index of function to be called
++	 * stack[sz] - remaining tail call count, initialized in prologue
++	 */
++
++	/* if (ind >= ary->map.max_entries) goto out */
++	off = offsetof(struct bpf_array, map.max_entries);
++	if (off > 0x7fff)
++		return -1;
++	emit(ctx, lw, t1, off, ary);             /* t1 = ary->map.max_entries*/
++	emit_load_delay(ctx);                    /* Load delay slot          */
++	emit(ctx, sltu, t1, ind, t1);            /* t1 = ind < t1            */
++	emit(ctx, beqz, t1, get_offset(ctx, 1)); /* PC += off(1) if t1 == 0  */
++						 /* (next insn delay slot)   */
++	/* if (TCC-- <= 0) goto out */
++	emit(ctx, lw, t2, ctx->stack_size, MIPS_R_SP);  /* t2 = *(SP + size) */
++	emit_load_delay(ctx);                     /* Load delay slot         */
++	emit(ctx, blez, t2, get_offset(ctx, 1));  /* PC += off(1) if t2 <= 0 */
++	emit(ctx, addiu, t2, t2, -1);             /* t2-- (delay slot)       */
++	emit(ctx, sw, t2, ctx->stack_size, MIPS_R_SP);  /* *(SP + size) = t2 */
++
++	/* prog = ary->ptrs[ind] */
++	off = offsetof(struct bpf_array, ptrs);
++	if (off > 0x7fff)
++		return -1;
++	emit(ctx, sll, t1, ind, 2);               /* t1 = ind << 2           */
++	emit(ctx, addu, t1, t1, ary);             /* t1 += ary               */
++	emit(ctx, lw, t2, off, t1);               /* t2 = *(t1 + off)        */
++	emit_load_delay(ctx);                     /* Load delay slot         */
++
++	/* if (prog == 0) goto out */
++	emit(ctx, beqz, t2, get_offset(ctx, 1));  /* PC += off(1) if t2 == 0 */
++	emit(ctx, nop);                           /* Delay slot              */
++
++	/* func = prog->bpf_func + skip (prologue skip offset) */
++	off = offsetof(struct bpf_prog, bpf_func);
++	if (off > 0x7fff)
++		return -1;
++	emit(ctx, lw, t1, off, t2);                /* t1 = *(t2 + off)       */
++	emit_load_delay(ctx);                      /* Load delay slot        */
++	emit(ctx, addiu, t1, t1, JIT_TCALL_SKIP);  /* t1 += skip (8 or 12)   */
++
++	/* goto func */
++	build_epilogue(ctx, t1);
++	return 0;
++}
++
++/*
++ * Stack frame layout for a JITed program (stack grows down).
++ *
++ * Higher address  : Caller's stack frame       :
++ *                 :----------------------------:
++ *                 : 64-bit eBPF args r3-r5     :
++ *                 :----------------------------:
++ *                 : Reserved / tail call count :
++ *                 +============================+  <--- MIPS sp before call
++ *                 | Callee-saved registers,    |
++ *                 | including RA and FP        |
++ *                 +----------------------------+  <--- eBPF FP (MIPS zero,fp)
++ *                 | Local eBPF variables       |
++ *                 | allocated by program       |
++ *                 +----------------------------+
++ *                 | Reserved for caller-saved  |
++ *                 | registers                  |
++ *                 +----------------------------+
++ *                 | Reserved for 64-bit eBPF   |
++ *                 | args r3-r5 & args passed   |
++ *                 | on stack in kernel calls   |
++ * Lower address   +============================+  <--- MIPS sp
++ */
++
++/* Build program prologue to set up the stack and registers */
++void build_prologue(struct jit_context *ctx)
++{
++	const u8 *r1 = bpf2mips32[BPF_REG_1];
++	const u8 *fp = bpf2mips32[BPF_REG_FP];
++	int stack, saved, locals, reserved;
++
++	/*
++	 * The first two instructions initialize TCC in the reserved (for us)
++	 * 16-byte area in the parent's stack frame. On a tail call, the
++	 * calling function jumps into the prologue after these instructions.
++	 */
++	emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO,
++	     min(MAX_TAIL_CALL_CNT + 1, 0xffff));
++	emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP);
++
++	/*
++	 * Register eBPF R1 contains the 32-bit context pointer argument.
++	 * A 32-bit argument is always passed in MIPS register a0, regardless
++	 * of CPU endianness. Initialize R1 accordingly and zero-extend.
++	 */
++#ifdef __BIG_ENDIAN
++	emit(ctx, move, lo(r1), MIPS_R_A0);
++#endif
++
++	/* === Entry-point for tail calls === */
++
++	/* Zero-extend the 32-bit argument */
++	emit(ctx, move, hi(r1), MIPS_R_ZERO);
++
++	/* If the eBPF frame pointer was accessed it must be saved */
++	if (ctx->accessed & BIT(BPF_REG_FP))
++		clobber_reg64(ctx, fp);
++
++	/* Compute the stack space needed for callee-saved registers */
++	saved = hweight32(ctx->clobbered & JIT_CALLEE_REGS) * sizeof(u32);
++	saved = ALIGN(saved, MIPS_STACK_ALIGNMENT);
++
++	/* Stack space used by eBPF program local data */
++	locals = ALIGN(ctx->program->aux->stack_depth, MIPS_STACK_ALIGNMENT);
++
++	/*
++	 * If we are emitting function calls, reserve extra stack space for
++	 * caller-saved registers and function arguments passed on the stack.
++	 * The required space is computed automatically during resource
++	 * usage discovery (pass 1).
++	 */
++	reserved = ctx->stack_used;
++
++	/* Allocate the stack frame */
++	stack = ALIGN(saved + locals + reserved, MIPS_STACK_ALIGNMENT);
++	emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, -stack);
++
++	/* Store callee-saved registers on stack */
++	push_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, stack - saved);
++
++	/* Initialize the eBPF frame pointer if accessed */
++	if (ctx->accessed & BIT(BPF_REG_FP))
++		emit(ctx, addiu, lo(fp), MIPS_R_SP, stack - saved);
++
++	ctx->saved_size = saved;
++	ctx->stack_size = stack;
++}
++
++/* Build the program epilogue to restore the stack and registers */
++void build_epilogue(struct jit_context *ctx, int dest_reg)
++{
++	/* Restore callee-saved registers from stack */
++	pop_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0,
++		 ctx->stack_size - ctx->saved_size);
++	/*
++	 * A 32-bit return value is always passed in MIPS register v0,
++	 * but on big-endian targets the low part of R0 is mapped to v1.
++	 */
++#ifdef __BIG_ENDIAN
++	emit(ctx, move, MIPS_R_V0, MIPS_R_V1);
++#endif
++
++	/* Jump to the return address and adjust the stack pointer */
++	emit(ctx, jr, dest_reg);
++	emit(ctx, addiu, MIPS_R_SP, MIPS_R_SP, ctx->stack_size);
++}
++
++/* Build one eBPF instruction */
++int build_insn(const struct bpf_insn *insn, struct jit_context *ctx)
++{
++	const u8 *dst = bpf2mips32[insn->dst_reg];
++	const u8 *src = bpf2mips32[insn->src_reg];
++	const u8 *tmp = bpf2mips32[JIT_REG_TMP];
++	u8 code = insn->code;
++	s16 off = insn->off;
++	s32 imm = insn->imm;
++	s32 val, rel;
++	u8 alu, jmp;
++
++	switch (code) {
++	/* ALU operations */
++	/* dst = imm */
++	case BPF_ALU | BPF_MOV | BPF_K:
++		emit_mov_i(ctx, lo(dst), imm);
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = src */
++	case BPF_ALU | BPF_MOV | BPF_X:
++		if (imm == 1) {
++			/* Special mov32 for zext */
++			emit_mov_i(ctx, hi(dst), 0);
++		} else {
++			emit_mov_r(ctx, lo(dst), lo(src));
++			emit_zext_ver(ctx, dst);
++		}
++		break;
++	/* dst = -dst */
++	case BPF_ALU | BPF_NEG:
++		emit_alu_i(ctx, lo(dst), 0, BPF_NEG);
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = dst & imm */
++	/* dst = dst | imm */
++	/* dst = dst ^ imm */
++	/* dst = dst << imm */
++	/* dst = dst >> imm */
++	/* dst = dst >> imm (arithmetic) */
++	/* dst = dst + imm */
++	/* dst = dst - imm */
++	/* dst = dst * imm */
++	/* dst = dst / imm */
++	/* dst = dst % imm */
++	case BPF_ALU | BPF_OR | BPF_K:
++	case BPF_ALU | BPF_AND | BPF_K:
++	case BPF_ALU | BPF_XOR | BPF_K:
++	case BPF_ALU | BPF_LSH | BPF_K:
++	case BPF_ALU | BPF_RSH | BPF_K:
++	case BPF_ALU | BPF_ARSH | BPF_K:
++	case BPF_ALU | BPF_ADD | BPF_K:
++	case BPF_ALU | BPF_SUB | BPF_K:
++	case BPF_ALU | BPF_MUL | BPF_K:
++	case BPF_ALU | BPF_DIV | BPF_K:
++	case BPF_ALU | BPF_MOD | BPF_K:
++		if (!valid_alu_i(BPF_OP(code), imm)) {
++			emit_mov_i(ctx, MIPS_R_T6, imm);
++			emit_alu_r(ctx, lo(dst), MIPS_R_T6, BPF_OP(code));
++		} else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) {
++			emit_alu_i(ctx, lo(dst), val, alu);
++		}
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = dst & src */
++	/* dst = dst | src */
++	/* dst = dst ^ src */
++	/* dst = dst << src */
++	/* dst = dst >> src */
++	/* dst = dst >> src (arithmetic) */
++	/* dst = dst + src */
++	/* dst = dst - src */
++	/* dst = dst * src */
++	/* dst = dst / src */
++	/* dst = dst % src */
++	case BPF_ALU | BPF_AND | BPF_X:
++	case BPF_ALU | BPF_OR | BPF_X:
++	case BPF_ALU | BPF_XOR | BPF_X:
++	case BPF_ALU | BPF_LSH | BPF_X:
++	case BPF_ALU | BPF_RSH | BPF_X:
++	case BPF_ALU | BPF_ARSH | BPF_X:
++	case BPF_ALU | BPF_ADD | BPF_X:
++	case BPF_ALU | BPF_SUB | BPF_X:
++	case BPF_ALU | BPF_MUL | BPF_X:
++	case BPF_ALU | BPF_DIV | BPF_X:
++	case BPF_ALU | BPF_MOD | BPF_X:
++		emit_alu_r(ctx, lo(dst), lo(src), BPF_OP(code));
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = imm (64-bit) */
++	case BPF_ALU64 | BPF_MOV | BPF_K:
++		emit_mov_se_i64(ctx, dst, imm);
++		break;
++	/* dst = src (64-bit) */
++	case BPF_ALU64 | BPF_MOV | BPF_X:
++		emit_mov_r(ctx, lo(dst), lo(src));
++		emit_mov_r(ctx, hi(dst), hi(src));
++		break;
++	/* dst = -dst (64-bit) */
++	case BPF_ALU64 | BPF_NEG:
++		emit_neg_i64(ctx, dst);
++		break;
++	/* dst = dst & imm (64-bit) */
++	case BPF_ALU64 | BPF_AND | BPF_K:
++		emit_alu_i64(ctx, dst, imm, BPF_OP(code));
++		break;
++	/* dst = dst | imm (64-bit) */
++	/* dst = dst ^ imm (64-bit) */
++	/* dst = dst + imm (64-bit) */
++	/* dst = dst - imm (64-bit) */
++	case BPF_ALU64 | BPF_OR | BPF_K:
++	case BPF_ALU64 | BPF_XOR | BPF_K:
++	case BPF_ALU64 | BPF_ADD | BPF_K:
++	case BPF_ALU64 | BPF_SUB | BPF_K:
++		if (imm)
++			emit_alu_i64(ctx, dst, imm, BPF_OP(code));
++		break;
++	/* dst = dst << imm (64-bit) */
++	/* dst = dst >> imm (64-bit) */
++	/* dst = dst >> imm (64-bit, arithmetic) */
++	case BPF_ALU64 | BPF_LSH | BPF_K:
++	case BPF_ALU64 | BPF_RSH | BPF_K:
++	case BPF_ALU64 | BPF_ARSH | BPF_K:
++		if (imm)
++			emit_shift_i64(ctx, dst, imm, BPF_OP(code));
++		break;
++	/* dst = dst * imm (64-bit) */
++	case BPF_ALU64 | BPF_MUL | BPF_K:
++		emit_mul_i64(ctx, dst, imm);
++		break;
++	/* dst = dst / imm (64-bit) */
++	/* dst = dst % imm (64-bit) */
++	case BPF_ALU64 | BPF_DIV | BPF_K:
++	case BPF_ALU64 | BPF_MOD | BPF_K:
++		/*
++		 * Sign-extend the immediate value into a temporary register,
++		 * and then do the operation on this register.
++		 */
++		emit_mov_se_i64(ctx, tmp, imm);
++		emit_divmod_r64(ctx, dst, tmp, BPF_OP(code));
++		break;
++	/* dst = dst & src (64-bit) */
++	/* dst = dst | src (64-bit) */
++	/* dst = dst ^ src (64-bit) */
++	/* dst = dst + src (64-bit) */
++	/* dst = dst - src (64-bit) */
++	case BPF_ALU64 | BPF_AND | BPF_X:
++	case BPF_ALU64 | BPF_OR | BPF_X:
++	case BPF_ALU64 | BPF_XOR | BPF_X:
++	case BPF_ALU64 | BPF_ADD | BPF_X:
++	case BPF_ALU64 | BPF_SUB | BPF_X:
++		emit_alu_r64(ctx, dst, src, BPF_OP(code));
++		break;
++	/* dst = dst << src (64-bit) */
++	/* dst = dst >> src (64-bit) */
++	/* dst = dst >> src (64-bit, arithmetic) */
++	case BPF_ALU64 | BPF_LSH | BPF_X:
++	case BPF_ALU64 | BPF_RSH | BPF_X:
++	case BPF_ALU64 | BPF_ARSH | BPF_X:
++		emit_shift_r64(ctx, dst, lo(src), BPF_OP(code));
++		break;
++	/* dst = dst * src (64-bit) */
++	case BPF_ALU64 | BPF_MUL | BPF_X:
++		emit_mul_r64(ctx, dst, src);
++		break;
++	/* dst = dst / src (64-bit) */
++	/* dst = dst % src (64-bit) */
++	case BPF_ALU64 | BPF_DIV | BPF_X:
++	case BPF_ALU64 | BPF_MOD | BPF_X:
++		emit_divmod_r64(ctx, dst, src, BPF_OP(code));
++		break;
++	/* dst = htole(dst) */
++	/* dst = htobe(dst) */
++	case BPF_ALU | BPF_END | BPF_FROM_LE:
++	case BPF_ALU | BPF_END | BPF_FROM_BE:
++		if (BPF_SRC(code) ==
++#ifdef __BIG_ENDIAN
++		    BPF_FROM_LE
++#else
++		    BPF_FROM_BE
++#endif
++		    )
++			emit_bswap_r64(ctx, dst, imm);
++		else
++			emit_trunc_r64(ctx, dst, imm);
++		break;
++	/* dst = imm64 */
++	case BPF_LD | BPF_IMM | BPF_DW:
++		emit_mov_i(ctx, lo(dst), imm);
++		emit_mov_i(ctx, hi(dst), insn[1].imm);
++		return 1;
++	/* LDX: dst = *(size *)(src + off) */
++	case BPF_LDX | BPF_MEM | BPF_W:
++	case BPF_LDX | BPF_MEM | BPF_H:
++	case BPF_LDX | BPF_MEM | BPF_B:
++	case BPF_LDX | BPF_MEM | BPF_DW:
++		emit_ldx(ctx, dst, lo(src), off, BPF_SIZE(code));
++		break;
++	/* ST: *(size *)(dst + off) = imm */
++	case BPF_ST | BPF_MEM | BPF_W:
++	case BPF_ST | BPF_MEM | BPF_H:
++	case BPF_ST | BPF_MEM | BPF_B:
++	case BPF_ST | BPF_MEM | BPF_DW:
++		switch (BPF_SIZE(code)) {
++		case BPF_DW:
++			/* Sign-extend immediate value into temporary reg */
++			emit_mov_se_i64(ctx, tmp, imm);
++			break;
++		case BPF_W:
++		case BPF_H:
++		case BPF_B:
++			emit_mov_i(ctx, lo(tmp), imm);
++			break;
++		}
++		emit_stx(ctx, lo(dst), tmp, off, BPF_SIZE(code));
++		break;
++	/* STX: *(size *)(dst + off) = src */
++	case BPF_STX | BPF_MEM | BPF_W:
++	case BPF_STX | BPF_MEM | BPF_H:
++	case BPF_STX | BPF_MEM | BPF_B:
++	case BPF_STX | BPF_MEM | BPF_DW:
++		emit_stx(ctx, lo(dst), src, off, BPF_SIZE(code));
++		break;
++	/* Speculation barrier */
++	case BPF_ST | BPF_NOSPEC:
++		break;
++	/* Atomics */
++	case BPF_STX | BPF_XADD | BPF_W:
++		switch (imm) {
++		case BPF_ADD:
++		case BPF_AND:
++		case BPF_OR:
++		case BPF_XOR:
++			if (cpu_has_llsc)
++				emit_atomic_r(ctx, lo(dst), lo(src), off, imm);
++			else /* Non-ll/sc fallback */
++				emit_atomic_r32(ctx, lo(dst), lo(src),
++						off, imm);
++			break;
++		default:
++			goto notyet;
++		}
++		break;
++	/* Atomics (64-bit) */
++	case BPF_STX | BPF_XADD | BPF_DW:
++		switch (imm) {
++		case BPF_ADD:
++		case BPF_AND:
++		case BPF_OR:
++		case BPF_XOR:
++			emit_atomic_r64(ctx, lo(dst), src, off, imm);
++			break;
++		default:
++			goto notyet;
++		}
++		break;
++	/* PC += off if dst == src */
++	/* PC += off if dst != src */
++	/* PC += off if dst & src */
++	/* PC += off if dst > src */
++	/* PC += off if dst >= src */
++	/* PC += off if dst < src */
++	/* PC += off if dst <= src */
++	/* PC += off if dst > src (signed) */
++	/* PC += off if dst >= src (signed) */
++	/* PC += off if dst < src (signed) */
++	/* PC += off if dst <= src (signed) */
++	case BPF_JMP32 | BPF_JEQ | BPF_X:
++	case BPF_JMP32 | BPF_JNE | BPF_X:
++	case BPF_JMP32 | BPF_JSET | BPF_X:
++	case BPF_JMP32 | BPF_JGT | BPF_X:
++	case BPF_JMP32 | BPF_JGE | BPF_X:
++	case BPF_JMP32 | BPF_JLT | BPF_X:
++	case BPF_JMP32 | BPF_JLE | BPF_X:
++	case BPF_JMP32 | BPF_JSGT | BPF_X:
++	case BPF_JMP32 | BPF_JSGE | BPF_X:
++	case BPF_JMP32 | BPF_JSLT | BPF_X:
++	case BPF_JMP32 | BPF_JSLE | BPF_X:
++		if (off == 0)
++			break;
++		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
++		emit_jmp_r(ctx, lo(dst), lo(src), rel, jmp);
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off if dst == imm */
++	/* PC += off if dst != imm */
++	/* PC += off if dst & imm */
++	/* PC += off if dst > imm */
++	/* PC += off if dst >= imm */
++	/* PC += off if dst < imm */
++	/* PC += off if dst <= imm */
++	/* PC += off if dst > imm (signed) */
++	/* PC += off if dst >= imm (signed) */
++	/* PC += off if dst < imm (signed) */
++	/* PC += off if dst <= imm (signed) */
++	case BPF_JMP32 | BPF_JEQ | BPF_K:
++	case BPF_JMP32 | BPF_JNE | BPF_K:
++	case BPF_JMP32 | BPF_JSET | BPF_K:
++	case BPF_JMP32 | BPF_JGT | BPF_K:
++	case BPF_JMP32 | BPF_JGE | BPF_K:
++	case BPF_JMP32 | BPF_JLT | BPF_K:
++	case BPF_JMP32 | BPF_JLE | BPF_K:
++	case BPF_JMP32 | BPF_JSGT | BPF_K:
++	case BPF_JMP32 | BPF_JSGE | BPF_K:
++	case BPF_JMP32 | BPF_JSLT | BPF_K:
++	case BPF_JMP32 | BPF_JSLE | BPF_K:
++		if (off == 0)
++			break;
++		setup_jmp_i(ctx, imm, 32, BPF_OP(code), off, &jmp, &rel);
++		if (valid_jmp_i(jmp, imm)) {
++			emit_jmp_i(ctx, lo(dst), imm, rel, jmp);
++		} else {
++			/* Move large immediate to register */
++			emit_mov_i(ctx, MIPS_R_T6, imm);
++			emit_jmp_r(ctx, lo(dst), MIPS_R_T6, rel, jmp);
++		}
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off if dst == src */
++	/* PC += off if dst != src */
++	/* PC += off if dst & src */
++	/* PC += off if dst > src */
++	/* PC += off if dst >= src */
++	/* PC += off if dst < src */
++	/* PC += off if dst <= src */
++	/* PC += off if dst > src (signed) */
++	/* PC += off if dst >= src (signed) */
++	/* PC += off if dst < src (signed) */
++	/* PC += off if dst <= src (signed) */
++	case BPF_JMP | BPF_JEQ | BPF_X:
++	case BPF_JMP | BPF_JNE | BPF_X:
++	case BPF_JMP | BPF_JSET | BPF_X:
++	case BPF_JMP | BPF_JGT | BPF_X:
++	case BPF_JMP | BPF_JGE | BPF_X:
++	case BPF_JMP | BPF_JLT | BPF_X:
++	case BPF_JMP | BPF_JLE | BPF_X:
++	case BPF_JMP | BPF_JSGT | BPF_X:
++	case BPF_JMP | BPF_JSGE | BPF_X:
++	case BPF_JMP | BPF_JSLT | BPF_X:
++	case BPF_JMP | BPF_JSLE | BPF_X:
++		if (off == 0)
++			break;
++		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
++		emit_jmp_r64(ctx, dst, src, rel, jmp);
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off if dst == imm */
++	/* PC += off if dst != imm */
++	/* PC += off if dst & imm */
++	/* PC += off if dst > imm */
++	/* PC += off if dst >= imm */
++	/* PC += off if dst < imm */
++	/* PC += off if dst <= imm */
++	/* PC += off if dst > imm (signed) */
++	/* PC += off if dst >= imm (signed) */
++	/* PC += off if dst < imm (signed) */
++	/* PC += off if dst <= imm (signed) */
++	case BPF_JMP | BPF_JEQ | BPF_K:
++	case BPF_JMP | BPF_JNE | BPF_K:
++	case BPF_JMP | BPF_JSET | BPF_K:
++	case BPF_JMP | BPF_JGT | BPF_K:
++	case BPF_JMP | BPF_JGE | BPF_K:
++	case BPF_JMP | BPF_JLT | BPF_K:
++	case BPF_JMP | BPF_JLE | BPF_K:
++	case BPF_JMP | BPF_JSGT | BPF_K:
++	case BPF_JMP | BPF_JSGE | BPF_K:
++	case BPF_JMP | BPF_JSLT | BPF_K:
++	case BPF_JMP | BPF_JSLE | BPF_K:
++		if (off == 0)
++			break;
++		setup_jmp_i(ctx, imm, 64, BPF_OP(code), off, &jmp, &rel);
++		emit_jmp_i64(ctx, dst, imm, rel, jmp);
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off */
++	case BPF_JMP | BPF_JA:
++		if (off == 0)
++			break;
++		if (emit_ja(ctx, off) < 0)
++			goto toofar;
++		break;
++	/* Tail call */
++	case BPF_JMP | BPF_TAIL_CALL:
++		if (emit_tail_call(ctx) < 0)
++			goto invalid;
++		break;
++	/* Function call */
++	case BPF_JMP | BPF_CALL:
++		if (emit_call(ctx, insn) < 0)
++			goto invalid;
++		break;
++	/* Function return */
++	case BPF_JMP | BPF_EXIT:
++		/*
++		 * Optimization: when last instruction is EXIT
++		 * simply continue to epilogue.
++		 */
++		if (ctx->bpf_index == ctx->program->len - 1)
++			break;
++		if (emit_exit(ctx) < 0)
++			goto toofar;
++		break;
++
++	default:
++invalid:
++		pr_err_once("unknown opcode %02x\n", code);
++		return -EINVAL;
++notyet:
++		pr_info_once("*** NOT YET: opcode %02x ***\n", code);
++		return -EFAULT;
++toofar:
++		pr_info_once("*** TOO FAR: jump at %u opcode %02x ***\n",
++			     ctx->bpf_index, code);
++		return -E2BIG;
++	}
++	return 0;
++}

+ 1005 - 0
target/linux/generic/backport-6.1/050-v5.16-03-mips-bpf-Add-new-eBPF-JIT-for-64-bit-MIPS.patch

@@ -0,0 +1,1005 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:05 +0200
+Subject: [PATCH] mips: bpf: Add new eBPF JIT for 64-bit MIPS
+
+This is an implementation of an eBPF JIT for 64-bit MIPS III-V and
+MIPS64r1-r6. It uses the same framework introduced by the 32-bit JIT.
+
+Signed-off-by: Johan Almbladh <[email protected]>
+---
+ create mode 100644 arch/mips/net/bpf_jit_comp64.c
+
+--- /dev/null
++++ b/arch/mips/net/bpf_jit_comp64.c
+@@ -0,0 +1,991 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * Just-In-Time compiler for eBPF bytecode on MIPS.
++ * Implementation of JIT functions for 64-bit CPUs.
++ *
++ * Copyright (c) 2021 Anyfi Networks AB.
++ * Author: Johan Almbladh <[email protected]>
++ *
++ * Based on code and ideas from
++ * Copyright (c) 2017 Cavium, Inc.
++ * Copyright (c) 2017 Shubham Bansal <[email protected]>
++ * Copyright (c) 2011 Mircea Gherzan <[email protected]>
++ */
++
++#include <linux/errno.h>
++#include <linux/filter.h>
++#include <linux/bpf.h>
++#include <asm/cpu-features.h>
++#include <asm/isa-rev.h>
++#include <asm/uasm.h>
++
++#include "bpf_jit_comp.h"
++
++/* MIPS t0-t3 are not available in the n64 ABI */
++#undef MIPS_R_T0
++#undef MIPS_R_T1
++#undef MIPS_R_T2
++#undef MIPS_R_T3
++
++/* Stack is 16-byte aligned in n64 ABI */
++#define MIPS_STACK_ALIGNMENT 16
++
++/* Extra 64-bit eBPF registers used by JIT */
++#define JIT_REG_TC (MAX_BPF_JIT_REG + 0)
++#define JIT_REG_ZX (MAX_BPF_JIT_REG + 1)
++
++/* Number of prologue bytes to skip when doing a tail call */
++#define JIT_TCALL_SKIP 4
++
++/* Callee-saved CPU registers that the JIT must preserve */
++#define JIT_CALLEE_REGS   \
++	(BIT(MIPS_R_S0) | \
++	 BIT(MIPS_R_S1) | \
++	 BIT(MIPS_R_S2) | \
++	 BIT(MIPS_R_S3) | \
++	 BIT(MIPS_R_S4) | \
++	 BIT(MIPS_R_S5) | \
++	 BIT(MIPS_R_S6) | \
++	 BIT(MIPS_R_S7) | \
++	 BIT(MIPS_R_GP) | \
++	 BIT(MIPS_R_FP) | \
++	 BIT(MIPS_R_RA))
++
++/* Caller-saved CPU registers available for JIT use */
++#define JIT_CALLER_REGS	  \
++	(BIT(MIPS_R_A5) | \
++	 BIT(MIPS_R_A6) | \
++	 BIT(MIPS_R_A7))
++
++/*
++ * Mapping of 64-bit eBPF registers to 64-bit native MIPS registers.
++ * MIPS registers t4 - t7 may be used by the JIT as temporary registers.
++ * MIPS registers t8 - t9 are reserved for single-register common functions.
++ */
++static const u8 bpf2mips64[] = {
++	/* Return value from in-kernel function, and exit value from eBPF */
++	[BPF_REG_0] = MIPS_R_V0,
++	/* Arguments from eBPF program to in-kernel function */
++	[BPF_REG_1] = MIPS_R_A0,
++	[BPF_REG_2] = MIPS_R_A1,
++	[BPF_REG_3] = MIPS_R_A2,
++	[BPF_REG_4] = MIPS_R_A3,
++	[BPF_REG_5] = MIPS_R_A4,
++	/* Callee-saved registers that in-kernel function will preserve */
++	[BPF_REG_6] = MIPS_R_S0,
++	[BPF_REG_7] = MIPS_R_S1,
++	[BPF_REG_8] = MIPS_R_S2,
++	[BPF_REG_9] = MIPS_R_S3,
++	/* Read-only frame pointer to access the eBPF stack */
++	[BPF_REG_FP] = MIPS_R_FP,
++	/* Temporary register for blinding constants */
++	[BPF_REG_AX] = MIPS_R_AT,
++	/* Tail call count register, caller-saved */
++	[JIT_REG_TC] = MIPS_R_A5,
++	/* Constant for register zero-extension */
++	[JIT_REG_ZX] = MIPS_R_V1,
++};
++
++/*
++ * MIPS 32-bit operations on 64-bit registers generate a sign-extended
++ * result. However, the eBPF ISA mandates zero-extension, so we rely on the
++ * verifier to add that for us (emit_zext_ver). In addition, ALU arithmetic
++ * operations, right shift and byte swap require properly sign-extended
++ * operands or the result is unpredictable. We emit explicit sign-extensions
++ * in those cases.
++ */
++
++/* Sign extension */
++static void emit_sext(struct jit_context *ctx, u8 dst, u8 src)
++{
++	emit(ctx, sll, dst, src, 0);
++	clobber_reg(ctx, dst);
++}
++
++/* Zero extension */
++static void emit_zext(struct jit_context *ctx, u8 dst)
++{
++	if (cpu_has_mips64r2 || cpu_has_mips64r6) {
++		emit(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32);
++	} else {
++		emit(ctx, and, dst, dst, bpf2mips64[JIT_REG_ZX]);
++		access_reg(ctx, JIT_REG_ZX); /* We need the ZX register */
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Zero extension, if verifier does not do it for us  */
++static void emit_zext_ver(struct jit_context *ctx, u8 dst)
++{
++	if (!ctx->program->aux->verifier_zext)
++		emit_zext(ctx, dst);
++}
++
++/* dst = imm (64-bit) */
++static void emit_mov_i64(struct jit_context *ctx, u8 dst, u64 imm64)
++{
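++	/*
++	 * Three strategies: immediates that fit a signed 16-bit value use
++	 * a single daddiu; immediates that fit a sign-extended 32-bit
++	 * value use lui/ori; anything else is assembled 16 bits at a time
++	 * with dsll and ori, omitting the ori for all-zero half-words.
++	 */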
++	if (imm64 >= 0xffffffffffff8000ULL || imm64 < 0x8000ULL) {
++		emit(ctx, daddiu, dst, MIPS_R_ZERO, (s16)imm64);
++	} else if (imm64 >= 0xffffffff80000000ULL ||
++		   (imm64 < 0x80000000 && imm64 > 0xffff)) {
++		emit(ctx, lui, dst, (s16)(imm64 >> 16));
++		emit(ctx, ori, dst, dst, (u16)imm64 & 0xffff);
++	} else {
++		u8 acc = MIPS_R_ZERO;
++		int k;
++
++		for (k = 0; k < 4; k++) {
++			u16 half = imm64 >> (48 - 16 * k);
++
++			if (acc == dst)
++				emit(ctx, dsll, dst, dst, 16);
++
++			if (half) {
++				emit(ctx, ori, dst, acc, half);
++				acc = dst;
++			}
++		}
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* ALU immediate operation (64-bit) */
++static void emit_alu_i64(struct jit_context *ctx, u8 dst, s32 imm, u8 op)
++{
++	switch (BPF_OP(op)) {
++	/* dst = dst | imm */
++	case BPF_OR:
++		emit(ctx, ori, dst, dst, (u16)imm);
++		break;
++	/* dst = dst ^ imm */
++	case BPF_XOR:
++		emit(ctx, xori, dst, dst, (u16)imm);
++		break;
++	/* dst = -dst */
++	case BPF_NEG:
++		emit(ctx, dsubu, dst, MIPS_R_ZERO, dst);
++		break;
++	/* dst = dst << imm */
++	case BPF_LSH:
++		emit(ctx, dsll_safe, dst, dst, imm);
++		break;
++	/* dst = dst >> imm */
++	case BPF_RSH:
++		emit(ctx, dsrl_safe, dst, dst, imm);
++		break;
++	/* dst = dst >> imm (arithmetic) */
++	case BPF_ARSH:
++		emit(ctx, dsra_safe, dst, dst, imm);
++		break;
++	/* dst = dst + imm */
++	case BPF_ADD:
++		emit(ctx, daddiu, dst, dst, imm);
++		break;
++	/* dst = dst - imm */
++	case BPF_SUB:
++		emit(ctx, daddiu, dst, dst, -imm);
++		break;
++	default:
++		/* Width-generic operations */
++		emit_alu_i(ctx, dst, imm, op);
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* ALU register operation (64-bit) */
++static void emit_alu_r64(struct jit_context *ctx, u8 dst, u8 src, u8 op)
++{
++	switch (BPF_OP(op)) {
++	/* dst = dst << src */
++	case BPF_LSH:
++		emit(ctx, dsllv, dst, dst, src);
++		break;
++	/* dst = dst >> src */
++	case BPF_RSH:
++		emit(ctx, dsrlv, dst, dst, src);
++		break;
++	/* dst = dst >> src (arithmetic) */
++	case BPF_ARSH:
++		emit(ctx, dsrav, dst, dst, src);
++		break;
++	/* dst = dst + src */
++	case BPF_ADD:
++		emit(ctx, daddu, dst, dst, src);
++		break;
++	/* dst = dst - src */
++	case BPF_SUB:
++		emit(ctx, dsubu, dst, dst, src);
++		break;
++	/* dst = dst * src */
++	case BPF_MUL:
++		if (cpu_has_mips64r6) {
++			emit(ctx, dmulu, dst, dst, src);
++		} else {
++			emit(ctx, dmultu, dst, src);
++			emit(ctx, mflo, dst);
++		}
++		break;
++	/* dst = dst / src */
++	case BPF_DIV:
++		if (cpu_has_mips64r6) {
++			emit(ctx, ddivu_r6, dst, dst, src);
++		} else {
++			emit(ctx, ddivu, dst, src);
++			emit(ctx, mflo, dst);
++		}
++		break;
++	/* dst = dst % src */
++	case BPF_MOD:
++		if (cpu_has_mips64r6) {
++			emit(ctx, dmodu, dst, dst, src);
++		} else {
++			emit(ctx, ddivu, dst, src);
++			emit(ctx, mfhi, dst);
++		}
++		break;
++	default:
++		/* Width-generic operations */
++		emit_alu_r(ctx, dst, src, op);
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Swap sub words in a register double word */
++static void emit_swap_r64(struct jit_context *ctx, u8 dst, u8 mask, u32 bits)
++{
++	u8 tmp = MIPS_R_T9;
++
++	emit(ctx, and, tmp, dst, mask);  /* tmp = dst & mask  */
++	emit(ctx, dsll, tmp, tmp, bits); /* tmp = tmp << bits */
++	emit(ctx, dsrl, dst, dst, bits); /* dst = dst >> bits */
++	emit(ctx, and, dst, dst, mask);  /* dst = dst & mask  */
++	emit(ctx, or, dst, dst, tmp);    /* dst = dst | tmp   */
++}
++
++/* Swap bytes and truncate a register double word, word or half word */
++static void emit_bswap_r64(struct jit_context *ctx, u8 dst, u32 width)
++{
++	switch (width) {
++	/* Swap bytes in a double word */
++	case 64:
++		if (cpu_has_mips64r2 || cpu_has_mips64r6) {
++			emit(ctx, dsbh, dst, dst);
++			emit(ctx, dshd, dst, dst);
++		} else {
++			u8 t1 = MIPS_R_T6;
++			u8 t2 = MIPS_R_T7;
++
++			emit(ctx, dsll32, t2, dst, 0);  /* t2 = dst << 32    */
++			emit(ctx, dsrl32, dst, dst, 0); /* dst = dst >> 32   */
++			emit(ctx, or, dst, dst, t2);    /* dst = dst | t2    */
++
++			emit(ctx, ori, t2, MIPS_R_ZERO, 0xffff);
++			emit(ctx, dsll32, t1, t2, 0);   /* t1 = t2 << 32     */
++			emit(ctx, or, t1, t1, t2);      /* t1 = t1 | t2      */
++			emit_swap_r64(ctx, dst, t1, 16);/* dst = swap16(dst) */
++
++			emit(ctx, lui, t2, 0xff);       /* t2 = 0x00ff0000   */
++			emit(ctx, ori, t2, t2, 0xff);   /* t2 = t2 | 0x00ff  */
++			emit(ctx, dsll32, t1, t2, 0);   /* t1 = t2 << 32     */
++			emit(ctx, or, t1, t1, t2);      /* t1 = t1 | t2      */
++			emit_swap_r64(ctx, dst, t1, 8); /* dst = swap8(dst)  */
++		}
++		break;
++	/* Swap bytes in a half word */
++	/* Swap bytes in a word */
++	case 32:
++	case 16:
++		emit_sext(ctx, dst, dst);
++		emit_bswap_r(ctx, dst, width);
++		if (cpu_has_mips64r2 || cpu_has_mips64r6)
++			emit_zext(ctx, dst);
++		break;
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Truncate a register double word, word or half word */
++static void emit_trunc_r64(struct jit_context *ctx, u8 dst, u32 width)
++{
++	switch (width) {
++	case 64:
++		break;
++	/* Zero-extend a word */
++	case 32:
++		emit_zext(ctx, dst);
++		break;
++	/* Zero-extend a half word */
++	case 16:
++		emit(ctx, andi, dst, dst, 0xffff);
++		break;
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Load operation: dst = *(size*)(src + off) */
++static void emit_ldx(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 size)
++{
++	switch (size) {
++	/* Load a byte */
++	case BPF_B:
++		emit(ctx, lbu, dst, off, src);
++		break;
++	/* Load a half word */
++	case BPF_H:
++		emit(ctx, lhu, dst, off, src);
++		break;
++	/* Load a word */
++	case BPF_W:
++		emit(ctx, lwu, dst, off, src);
++		break;
++	/* Load a double word */
++	case BPF_DW:
++		emit(ctx, ld, dst, off, src);
++		break;
++	}
++	clobber_reg(ctx, dst);
++}
++
++/* Store operation: *(size *)(dst + off) = src */
++static void emit_stx(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 size)
++{
++	switch (size) {
++	/* Store a byte */
++	case BPF_B:
++		emit(ctx, sb, src, off, dst);
++		break;
++	/* Store a half word */
++	case BPF_H:
++		emit(ctx, sh, src, off, dst);
++		break;
++	/* Store a word */
++	case BPF_W:
++		emit(ctx, sw, src, off, dst);
++		break;
++	/* Store a double word */
++	case BPF_DW:
++		emit(ctx, sd, src, off, dst);
++		break;
++	}
++}
++
++/* Atomic read-modify-write */
++static void emit_atomic_r64(struct jit_context *ctx,
++			    u8 dst, u8 src, s16 off, u8 code)
++{
++	u8 t1 = MIPS_R_T6;
++	u8 t2 = MIPS_R_T7;
++
++	emit(ctx, lld, t1, off, dst);
++	switch (code) {
++	case BPF_ADD:
++		emit(ctx, daddu, t2, t1, src);
++		break;
++	case BPF_AND:
++		emit(ctx, and, t2, t1, src);
++		break;
++	case BPF_OR:
++		emit(ctx, or, t2, t1, src);
++		break;
++	case BPF_XOR:
++		emit(ctx, xor, t2, t1, src);
++		break;
++	}
++	emit(ctx, scd, t2, off, dst);
++	emit(ctx, beqz, t2, -16);
++	emit(ctx, nop); /* Delay slot */
++}
++
++/* Function call */
++static int emit_call(struct jit_context *ctx, const struct bpf_insn *insn)
++{
++	u8 zx = bpf2mips64[JIT_REG_ZX];
++	u8 tmp = MIPS_R_T6;
++	bool fixed;
++	u64 addr;
++
++	/* Decode the call address */
++	if (bpf_jit_get_func_addr(ctx->program, insn, false,
++				  &addr, &fixed) < 0)
++		return -1;
++	if (!fixed)
++		return -1;
++
++	/* Push caller-saved registers on stack */
++	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0);
++
++	/* Emit function call */
++	emit_mov_i64(ctx, tmp, addr);
++	emit(ctx, jalr, MIPS_R_RA, tmp);
++	emit(ctx, nop); /* Delay slot */
++
++	/* Restore caller-saved registers */
++	pop_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0);
++
++	/* Re-initialize the JIT zero-extension register if accessed */
++	if (ctx->accessed & BIT(JIT_REG_ZX)) {
++		emit(ctx, daddiu, zx, MIPS_R_ZERO, -1);
++		emit(ctx, dsrl32, zx, zx, 0);
++	}
++
++	clobber_reg(ctx, MIPS_R_RA);
++	clobber_reg(ctx, MIPS_R_V0);
++	clobber_reg(ctx, MIPS_R_V1);
++	return 0;
++}
++
++/* Function tail call */
++static int emit_tail_call(struct jit_context *ctx)
++{
++	u8 ary = bpf2mips64[BPF_REG_2];
++	u8 ind = bpf2mips64[BPF_REG_3];
++	u8 tcc = bpf2mips64[JIT_REG_TC];
++	u8 tmp = MIPS_R_T6;
++	int off;
++
++	/*
++	 * Tail call:
++	 * eBPF R1 - function argument (context ptr), passed in a0-a1
++	 * eBPF R2 - ptr to object with array of function entry points
++	 * eBPF R3 - array index of function to be called
++	 */
++
++	/* if (ind >= ary->map.max_entries) goto out */
++	off = offsetof(struct bpf_array, map.max_entries);
++	if (off > 0x7fff)
++		return -1;
++	emit(ctx, lwu, tmp, off, ary);            /* tmp = ary->map.max_entrs*/
++	emit(ctx, sltu, tmp, ind, tmp);           /* tmp = ind < t1          */
++	emit(ctx, beqz, tmp, get_offset(ctx, 1)); /* PC += off(1) if tmp == 0*/
++
++	/* if (--TCC < 0) goto out */
++	emit(ctx, daddiu, tcc, tcc, -1);          /* tcc-- (delay slot)      */
++	emit(ctx, bltz, tcc, get_offset(ctx, 1)); /* PC += off(1) if tcc < 0 */
++						  /* (next insn delay slot)  */
++	/* prog = ary->ptrs[ind] */
++	off = offsetof(struct bpf_array, ptrs);
++	if (off > 0x7fff)
++		return -1;
++	emit(ctx, dsll, tmp, ind, 3);             /* tmp = ind << 3          */
++	emit(ctx, daddu, tmp, tmp, ary);          /* tmp += ary              */
++	emit(ctx, ld, tmp, off, tmp);             /* tmp = *(tmp + off)      */
++
++	/* if (prog == 0) goto out */
++	emit(ctx, beqz, tmp, get_offset(ctx, 1)); /* PC += off(1) if tmp == 0*/
++	emit(ctx, nop);                           /* Delay slot              */
++
++	/* func = prog->bpf_func + 8 (prologue skip offset) */
++	off = offsetof(struct bpf_prog, bpf_func);
++	if (off > 0x7fff)
++		return -1;
++	emit(ctx, ld, tmp, off, tmp);                /* tmp = *(tmp + off)   */
++	emit(ctx, daddiu, tmp, tmp, JIT_TCALL_SKIP); /* tmp += skip (4)      */
++
++	/* goto func */
++	build_epilogue(ctx, tmp);
++	access_reg(ctx, JIT_REG_TC);
++	return 0;
++}
++
++/*
++ * Stack frame layout for a JITed program (stack grows down).
++ *
++ * Higher address  : Previous stack frame      :
++ *                 +===========================+  <--- MIPS sp before call
++ *                 | Callee-saved registers,   |
++ *                 | including RA and FP       |
++ *                 +---------------------------+  <--- eBPF FP (MIPS fp)
++ *                 | Local eBPF variables      |
++ *                 | allocated by program      |
++ *                 +---------------------------+
++ *                 | Reserved for caller-saved |
++ *                 | registers                 |
++ * Lower address   +===========================+  <--- MIPS sp
++ */
++
++/* Build program prologue to set up the stack and registers */
++void build_prologue(struct jit_context *ctx)
++{
++	u8 fp = bpf2mips64[BPF_REG_FP];
++	u8 tc = bpf2mips64[JIT_REG_TC];
++	u8 zx = bpf2mips64[JIT_REG_ZX];
++	int stack, saved, locals, reserved;
++
++	/*
++	 * The first instruction initializes the tail call count register.
++	 * On a tail call, the calling function jumps into the prologue
++	 * after this instruction.
++	 */
++	emit(ctx, addiu, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT + 1, 0xffff));
++
++	/* === Entry-point for tail calls === */
++
++	/*
++	 * If the eBPF frame pointer and tail call count registers were
++	 * accessed they must be preserved. Mark them as clobbered here
++	 * to save and restore them on the stack as needed.
++	 */
++	if (ctx->accessed & BIT(BPF_REG_FP))
++		clobber_reg(ctx, fp);
++	if (ctx->accessed & BIT(JIT_REG_TC))
++		clobber_reg(ctx, tc);
++	if (ctx->accessed & BIT(JIT_REG_ZX))
++		clobber_reg(ctx, zx);
++
++	/* Compute the stack space needed for callee-saved registers */
++	saved = hweight32(ctx->clobbered & JIT_CALLEE_REGS) * sizeof(u64);
++	saved = ALIGN(saved, MIPS_STACK_ALIGNMENT);
++
++	/* Stack space used by eBPF program local data */
++	locals = ALIGN(ctx->program->aux->stack_depth, MIPS_STACK_ALIGNMENT);
++
++	/*
++	 * If we are emitting function calls, reserve extra stack space for
++	 * caller-saved registers needed by the JIT. The required space is
++	 * computed automatically during resource usage discovery (pass 1).
++	 */
++	reserved = ctx->stack_used;
++
++	/* Allocate the stack frame */
++	stack = ALIGN(saved + locals + reserved, MIPS_STACK_ALIGNMENT);
++	if (stack)
++		emit(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, -stack);
++
++	/* Store callee-saved registers on stack */
++	push_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0, stack - saved);
++
++	/* Initialize the eBPF frame pointer if accessed */
++	if (ctx->accessed & BIT(BPF_REG_FP))
++		emit(ctx, daddiu, fp, MIPS_R_SP, stack - saved);
++
++	/* Initialize the eBPF JIT zero-extension register if accessed */
++	if (ctx->accessed & BIT(JIT_REG_ZX)) {
++		emit(ctx, daddiu, zx, MIPS_R_ZERO, -1);
++		emit(ctx, dsrl32, zx, zx, 0);
++	}
++
++	ctx->saved_size = saved;
++	ctx->stack_size = stack;
++}
++
++/* Build the program epilogue to restore the stack and registers */
++void build_epilogue(struct jit_context *ctx, int dest_reg)
++{
++	/* Restore callee-saved registers from stack */
++	pop_regs(ctx, ctx->clobbered & JIT_CALLEE_REGS, 0,
++		 ctx->stack_size - ctx->saved_size);
++
++	/* Release the stack frame */
++	if (ctx->stack_size)
++		emit(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, ctx->stack_size);
++
++	/* Jump to return address and sign-extend the 32-bit return value */
++	emit(ctx, jr, dest_reg);
++	emit(ctx, sll, MIPS_R_V0, MIPS_R_V0, 0); /* Delay slot */
++}
++
++/* Build one eBPF instruction */
++int build_insn(const struct bpf_insn *insn, struct jit_context *ctx)
++{
++	u8 dst = bpf2mips64[insn->dst_reg];
++	u8 src = bpf2mips64[insn->src_reg];
++	u8 code = insn->code;
++	s16 off = insn->off;
++	s32 imm = insn->imm;
++	s32 val, rel;
++	u8 alu, jmp;
++
++	switch (code) {
++	/* ALU operations */
++	/* dst = imm */
++	case BPF_ALU | BPF_MOV | BPF_K:
++		emit_mov_i(ctx, dst, imm);
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = src */
++	case BPF_ALU | BPF_MOV | BPF_X:
++		if (imm == 1) {
++			/* Special mov32 for zext */
++			emit_zext(ctx, dst);
++		} else {
++			emit_mov_r(ctx, dst, src);
++			emit_zext_ver(ctx, dst);
++		}
++		break;
++	/* dst = -dst */
++	case BPF_ALU | BPF_NEG:
++		emit_sext(ctx, dst, dst);
++		emit_alu_i(ctx, dst, 0, BPF_NEG);
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = dst & imm */
++	/* dst = dst | imm */
++	/* dst = dst ^ imm */
++	/* dst = dst << imm */
++	case BPF_ALU | BPF_OR | BPF_K:
++	case BPF_ALU | BPF_AND | BPF_K:
++	case BPF_ALU | BPF_XOR | BPF_K:
++	case BPF_ALU | BPF_LSH | BPF_K:
++		if (!valid_alu_i(BPF_OP(code), imm)) {
++			emit_mov_i(ctx, MIPS_R_T4, imm);
++			emit_alu_r(ctx, dst, MIPS_R_T4, BPF_OP(code));
++		} else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) {
++			emit_alu_i(ctx, dst, val, alu);
++		}
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = dst >> imm */
++	/* dst = dst >> imm (arithmetic) */
++	/* dst = dst + imm */
++	/* dst = dst - imm */
++	/* dst = dst * imm */
++	/* dst = dst / imm */
++	/* dst = dst % imm */
++	case BPF_ALU | BPF_RSH | BPF_K:
++	case BPF_ALU | BPF_ARSH | BPF_K:
++	case BPF_ALU | BPF_ADD | BPF_K:
++	case BPF_ALU | BPF_SUB | BPF_K:
++	case BPF_ALU | BPF_MUL | BPF_K:
++	case BPF_ALU | BPF_DIV | BPF_K:
++	case BPF_ALU | BPF_MOD | BPF_K:
++		if (!valid_alu_i(BPF_OP(code), imm)) {
++			emit_sext(ctx, dst, dst);
++			emit_mov_i(ctx, MIPS_R_T4, imm);
++			emit_alu_r(ctx, dst, MIPS_R_T4, BPF_OP(code));
++		} else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) {
++			emit_sext(ctx, dst, dst);
++			emit_alu_i(ctx, dst, val, alu);
++		}
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = dst & src */
++	/* dst = dst | src */
++	/* dst = dst ^ src */
++	/* dst = dst << src */
++	case BPF_ALU | BPF_AND | BPF_X:
++	case BPF_ALU | BPF_OR | BPF_X:
++	case BPF_ALU | BPF_XOR | BPF_X:
++	case BPF_ALU | BPF_LSH | BPF_X:
++		emit_alu_r(ctx, dst, src, BPF_OP(code));
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = dst >> src */
++	/* dst = dst >> src (arithmetic) */
++	/* dst = dst + src */
++	/* dst = dst - src */
++	/* dst = dst * src */
++	/* dst = dst / src */
++	/* dst = dst % src */
++	case BPF_ALU | BPF_RSH | BPF_X:
++	case BPF_ALU | BPF_ARSH | BPF_X:
++	case BPF_ALU | BPF_ADD | BPF_X:
++	case BPF_ALU | BPF_SUB | BPF_X:
++	case BPF_ALU | BPF_MUL | BPF_X:
++	case BPF_ALU | BPF_DIV | BPF_X:
++	case BPF_ALU | BPF_MOD | BPF_X:
++		emit_sext(ctx, dst, dst);
++		emit_sext(ctx, MIPS_R_T4, src);
++		emit_alu_r(ctx, dst, MIPS_R_T4, BPF_OP(code));
++		emit_zext_ver(ctx, dst);
++		break;
++	/* dst = imm (64-bit) */
++	case BPF_ALU64 | BPF_MOV | BPF_K:
++		emit_mov_i(ctx, dst, imm);
++		break;
++	/* dst = src (64-bit) */
++	case BPF_ALU64 | BPF_MOV | BPF_X:
++		emit_mov_r(ctx, dst, src);
++		break;
++	/* dst = -dst (64-bit) */
++	case BPF_ALU64 | BPF_NEG:
++		emit_alu_i64(ctx, dst, 0, BPF_NEG);
++		break;
++	/* dst = dst & imm (64-bit) */
++	/* dst = dst | imm (64-bit) */
++	/* dst = dst ^ imm (64-bit) */
++	/* dst = dst << imm (64-bit) */
++	/* dst = dst >> imm (64-bit) */
++	/* dst = dst >> imm (64-bit, arithmetic) */
++	/* dst = dst + imm (64-bit) */
++	/* dst = dst - imm (64-bit) */
++	/* dst = dst * imm (64-bit) */
++	/* dst = dst / imm (64-bit) */
++	/* dst = dst % imm (64-bit) */
++	case BPF_ALU64 | BPF_AND | BPF_K:
++	case BPF_ALU64 | BPF_OR | BPF_K:
++	case BPF_ALU64 | BPF_XOR | BPF_K:
++	case BPF_ALU64 | BPF_LSH | BPF_K:
++	case BPF_ALU64 | BPF_RSH | BPF_K:
++	case BPF_ALU64 | BPF_ARSH | BPF_K:
++	case BPF_ALU64 | BPF_ADD | BPF_K:
++	case BPF_ALU64 | BPF_SUB | BPF_K:
++	case BPF_ALU64 | BPF_MUL | BPF_K:
++	case BPF_ALU64 | BPF_DIV | BPF_K:
++	case BPF_ALU64 | BPF_MOD | BPF_K:
++		if (!valid_alu_i(BPF_OP(code), imm)) {
++			emit_mov_i(ctx, MIPS_R_T4, imm);
++			emit_alu_r64(ctx, dst, MIPS_R_T4, BPF_OP(code));
++		} else if (rewrite_alu_i(BPF_OP(code), imm, &alu, &val)) {
++			emit_alu_i64(ctx, dst, val, alu);
++		}
++		break;
++	/* dst = dst & src (64-bit) */
++	/* dst = dst | src (64-bit) */
++	/* dst = dst ^ src (64-bit) */
++	/* dst = dst << src (64-bit) */
++	/* dst = dst >> src (64-bit) */
++	/* dst = dst >> src (64-bit, arithmetic) */
++	/* dst = dst + src (64-bit) */
++	/* dst = dst - src (64-bit) */
++	/* dst = dst * src (64-bit) */
++	/* dst = dst / src (64-bit) */
++	/* dst = dst % src (64-bit) */
++	case BPF_ALU64 | BPF_AND | BPF_X:
++	case BPF_ALU64 | BPF_OR | BPF_X:
++	case BPF_ALU64 | BPF_XOR | BPF_X:
++	case BPF_ALU64 | BPF_LSH | BPF_X:
++	case BPF_ALU64 | BPF_RSH | BPF_X:
++	case BPF_ALU64 | BPF_ARSH | BPF_X:
++	case BPF_ALU64 | BPF_ADD | BPF_X:
++	case BPF_ALU64 | BPF_SUB | BPF_X:
++	case BPF_ALU64 | BPF_MUL | BPF_X:
++	case BPF_ALU64 | BPF_DIV | BPF_X:
++	case BPF_ALU64 | BPF_MOD | BPF_X:
++		emit_alu_r64(ctx, dst, src, BPF_OP(code));
++		break;
++	/* dst = htole(dst) */
++	/* dst = htobe(dst) */
++	case BPF_ALU | BPF_END | BPF_FROM_LE:
++	case BPF_ALU | BPF_END | BPF_FROM_BE:
++		if (BPF_SRC(code) ==
++#ifdef __BIG_ENDIAN
++		    BPF_FROM_LE
++#else
++		    BPF_FROM_BE
++#endif
++		    )
++			emit_bswap_r64(ctx, dst, imm);
++		else
++			emit_trunc_r64(ctx, dst, imm);
++		break;
++	/* dst = imm64 */
++	case BPF_LD | BPF_IMM | BPF_DW:
++		emit_mov_i64(ctx, dst, (u32)imm | ((u64)insn[1].imm << 32));
++		return 1;
++	/* LDX: dst = *(size *)(src + off) */
++	case BPF_LDX | BPF_MEM | BPF_W:
++	case BPF_LDX | BPF_MEM | BPF_H:
++	case BPF_LDX | BPF_MEM | BPF_B:
++	case BPF_LDX | BPF_MEM | BPF_DW:
++		emit_ldx(ctx, dst, src, off, BPF_SIZE(code));
++		break;
++	/* ST: *(size *)(dst + off) = imm */
++	case BPF_ST | BPF_MEM | BPF_W:
++	case BPF_ST | BPF_MEM | BPF_H:
++	case BPF_ST | BPF_MEM | BPF_B:
++	case BPF_ST | BPF_MEM | BPF_DW:
++		emit_mov_i(ctx, MIPS_R_T4, imm);
++		emit_stx(ctx, dst, MIPS_R_T4, off, BPF_SIZE(code));
++		break;
++	/* STX: *(size *)(dst + off) = src */
++	case BPF_STX | BPF_MEM | BPF_W:
++	case BPF_STX | BPF_MEM | BPF_H:
++	case BPF_STX | BPF_MEM | BPF_B:
++	case BPF_STX | BPF_MEM | BPF_DW:
++		emit_stx(ctx, dst, src, off, BPF_SIZE(code));
++		break;
++	/* Speculation barrier */
++	case BPF_ST | BPF_NOSPEC:
++		break;
++	/* Atomics */
++	case BPF_STX | BPF_XADD | BPF_W:
++	case BPF_STX | BPF_XADD | BPF_DW:
++		switch (imm) {
++		case BPF_ADD:
++		case BPF_AND:
++		case BPF_OR:
++		case BPF_XOR:
++			if (BPF_SIZE(code) == BPF_DW) {
++				emit_atomic_r64(ctx, dst, src, off, imm);
++			} else { /* 32-bit, no fetch */
++				emit_sext(ctx, MIPS_R_T4, src);
++				emit_atomic_r(ctx, dst, MIPS_R_T4, off, imm);
++			}
++			break;
++		default:
++			goto notyet;
++		}
++		break;
++	/* PC += off if dst == src */
++	/* PC += off if dst != src */
++	/* PC += off if dst & src */
++	/* PC += off if dst > src */
++	/* PC += off if dst >= src */
++	/* PC += off if dst < src */
++	/* PC += off if dst <= src */
++	/* PC += off if dst > src (signed) */
++	/* PC += off if dst >= src (signed) */
++	/* PC += off if dst < src (signed) */
++	/* PC += off if dst <= src (signed) */
++	case BPF_JMP32 | BPF_JEQ | BPF_X:
++	case BPF_JMP32 | BPF_JNE | BPF_X:
++	case BPF_JMP32 | BPF_JSET | BPF_X:
++	case BPF_JMP32 | BPF_JGT | BPF_X:
++	case BPF_JMP32 | BPF_JGE | BPF_X:
++	case BPF_JMP32 | BPF_JLT | BPF_X:
++	case BPF_JMP32 | BPF_JLE | BPF_X:
++	case BPF_JMP32 | BPF_JSGT | BPF_X:
++	case BPF_JMP32 | BPF_JSGE | BPF_X:
++	case BPF_JMP32 | BPF_JSLT | BPF_X:
++	case BPF_JMP32 | BPF_JSLE | BPF_X:
++		if (off == 0)
++			break;
++		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
++		emit_sext(ctx, MIPS_R_T4, dst); /* Sign-extended dst */
++		emit_sext(ctx, MIPS_R_T5, src); /* Sign-extended src */
++		emit_jmp_r(ctx, MIPS_R_T4, MIPS_R_T5, rel, jmp);
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off if dst == imm */
++	/* PC += off if dst != imm */
++	/* PC += off if dst & imm */
++	/* PC += off if dst > imm */
++	/* PC += off if dst >= imm */
++	/* PC += off if dst < imm */
++	/* PC += off if dst <= imm */
++	/* PC += off if dst > imm (signed) */
++	/* PC += off if dst >= imm (signed) */
++	/* PC += off if dst < imm (signed) */
++	/* PC += off if dst <= imm (signed) */
++	case BPF_JMP32 | BPF_JEQ | BPF_K:
++	case BPF_JMP32 | BPF_JNE | BPF_K:
++	case BPF_JMP32 | BPF_JSET | BPF_K:
++	case BPF_JMP32 | BPF_JGT | BPF_K:
++	case BPF_JMP32 | BPF_JGE | BPF_K:
++	case BPF_JMP32 | BPF_JLT | BPF_K:
++	case BPF_JMP32 | BPF_JLE | BPF_K:
++	case BPF_JMP32 | BPF_JSGT | BPF_K:
++	case BPF_JMP32 | BPF_JSGE | BPF_K:
++	case BPF_JMP32 | BPF_JSLT | BPF_K:
++	case BPF_JMP32 | BPF_JSLE | BPF_K:
++		if (off == 0)
++			break;
++		setup_jmp_i(ctx, imm, 32, BPF_OP(code), off, &jmp, &rel);
++		emit_sext(ctx, MIPS_R_T4, dst); /* Sign-extended dst */
++		if (valid_jmp_i(jmp, imm)) {
++			emit_jmp_i(ctx, MIPS_R_T4, imm, rel, jmp);
++		} else {
++			/* Move large immediate to register, sign-extended */
++			emit_mov_i(ctx, MIPS_R_T5, imm);
++			emit_jmp_r(ctx, MIPS_R_T4, MIPS_R_T5, rel, jmp);
++		}
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off if dst == src */
++	/* PC += off if dst != src */
++	/* PC += off if dst & src */
++	/* PC += off if dst > src */
++	/* PC += off if dst >= src */
++	/* PC += off if dst < src */
++	/* PC += off if dst <= src */
++	/* PC += off if dst > src (signed) */
++	/* PC += off if dst >= src (signed) */
++	/* PC += off if dst < src (signed) */
++	/* PC += off if dst <= src (signed) */
++	case BPF_JMP | BPF_JEQ | BPF_X:
++	case BPF_JMP | BPF_JNE | BPF_X:
++	case BPF_JMP | BPF_JSET | BPF_X:
++	case BPF_JMP | BPF_JGT | BPF_X:
++	case BPF_JMP | BPF_JGE | BPF_X:
++	case BPF_JMP | BPF_JLT | BPF_X:
++	case BPF_JMP | BPF_JLE | BPF_X:
++	case BPF_JMP | BPF_JSGT | BPF_X:
++	case BPF_JMP | BPF_JSGE | BPF_X:
++	case BPF_JMP | BPF_JSLT | BPF_X:
++	case BPF_JMP | BPF_JSLE | BPF_X:
++		if (off == 0)
++			break;
++		setup_jmp_r(ctx, dst == src, BPF_OP(code), off, &jmp, &rel);
++		emit_jmp_r(ctx, dst, src, rel, jmp);
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off if dst == imm */
++	/* PC += off if dst != imm */
++	/* PC += off if dst & imm */
++	/* PC += off if dst > imm */
++	/* PC += off if dst >= imm */
++	/* PC += off if dst < imm */
++	/* PC += off if dst <= imm */
++	/* PC += off if dst > imm (signed) */
++	/* PC += off if dst >= imm (signed) */
++	/* PC += off if dst < imm (signed) */
++	/* PC += off if dst <= imm (signed) */
++	case BPF_JMP | BPF_JEQ | BPF_K:
++	case BPF_JMP | BPF_JNE | BPF_K:
++	case BPF_JMP | BPF_JSET | BPF_K:
++	case BPF_JMP | BPF_JGT | BPF_K:
++	case BPF_JMP | BPF_JGE | BPF_K:
++	case BPF_JMP | BPF_JLT | BPF_K:
++	case BPF_JMP | BPF_JLE | BPF_K:
++	case BPF_JMP | BPF_JSGT | BPF_K:
++	case BPF_JMP | BPF_JSGE | BPF_K:
++	case BPF_JMP | BPF_JSLT | BPF_K:
++	case BPF_JMP | BPF_JSLE | BPF_K:
++		if (off == 0)
++			break;
++		setup_jmp_i(ctx, imm, 64, BPF_OP(code), off, &jmp, &rel);
++		if (valid_jmp_i(jmp, imm)) {
++			emit_jmp_i(ctx, dst, imm, rel, jmp);
++		} else {
++			/* Move large immediate to register */
++			emit_mov_i(ctx, MIPS_R_T4, imm);
++			emit_jmp_r(ctx, dst, MIPS_R_T4, rel, jmp);
++		}
++		if (finish_jmp(ctx, jmp, off) < 0)
++			goto toofar;
++		break;
++	/* PC += off */
++	case BPF_JMP | BPF_JA:
++		if (off == 0)
++			break;
++		if (emit_ja(ctx, off) < 0)
++			goto toofar;
++		break;
++	/* Tail call */
++	case BPF_JMP | BPF_TAIL_CALL:
++		if (emit_tail_call(ctx) < 0)
++			goto invalid;
++		break;
++	/* Function call */
++	case BPF_JMP | BPF_CALL:
++		if (emit_call(ctx, insn) < 0)
++			goto invalid;
++		break;
++	/* Function return */
++	case BPF_JMP | BPF_EXIT:
++		/*
++		 * Optimization: when last instruction is EXIT
++		 * simply continue to epilogue.
++		 */
++		if (ctx->bpf_index == ctx->program->len - 1)
++			break;
++		if (emit_exit(ctx) < 0)
++			goto toofar;
++		break;
++
++	default:
++invalid:
++		pr_err_once("unknown opcode %02x\n", code);
++		return -EINVAL;
++notyet:
++		pr_info_once("*** NOT YET: opcode %02x ***\n", code);
++		return -EFAULT;
++toofar:
++		pr_info_once("*** TOO FAR: jump at %u opcode %02x ***\n",
++			     ctx->bpf_index, code);
++		return -E2BIG;
++	}
++	return 0;
++}
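
As a worked illustration of the immediate-load strategy in emit_mov_i64() above (hand-derived from its three branches rather than taken from actual JIT output; "dst" stands for whatever destination register the mapping picked), three sample constants would expand roughly as follows:

	/* imm64 = 0xfffffffffffffff0 -- fits a sign-extended 16-bit value */
	daddiu  dst, zero, -16

	/* imm64 = 0x0000000012345678 -- fits a sign-extended 32-bit value */
	lui     dst, 0x1234
	ori     dst, dst, 0x5678

	/* imm64 = 0x123456789abcdef0 -- general case, built 16 bits at a time */
	ori     dst, zero, 0x1234
	dsll    dst, dst, 16
	ori     dst, dst, 0x5678
	dsll    dst, dst, 16
	ori     dst, dst, 0x9abc
	dsll    dst, dst, 16
	ori     dst, dst, 0xdef0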

+ 120 - 0
target/linux/generic/backport-6.1/050-v5.16-04-mips-bpf-Add-JIT-workarounds-for-CPU-errata.patch

@@ -0,0 +1,120 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:06 +0200
+Subject: [PATCH] mips: bpf: Add JIT workarounds for CPU errata
+
+This patch adds workarounds for the following CPU errata to the MIPS
+eBPF JIT, if enabled in the kernel configuration.
+
+  - R10000 ll/sc weak ordering
+  - Loongson-3 ll/sc weak ordering
+  - Loongson-2F jump hang
+
+The Loongson-2F nop errata is implemented in uasm, which the JIT uses,
+so no additional mitigations are needed for that.
+
+Signed-off-by: Johan Almbladh <[email protected]>
+Reviewed-by: Jiaxun Yang <[email protected]>
+---
+
+--- a/arch/mips/net/bpf_jit_comp.c
++++ b/arch/mips/net/bpf_jit_comp.c
+@@ -404,6 +404,7 @@ void emit_alu_r(struct jit_context *ctx,
+ /* Atomic read-modify-write (32-bit) */
+ void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code)
+ {
++	LLSC_sync(ctx);
+ 	emit(ctx, ll, MIPS_R_T9, off, dst);
+ 	switch (code) {
+ 	case BPF_ADD:
+@@ -420,18 +421,19 @@ void emit_atomic_r(struct jit_context *c
+ 		break;
+ 	}
+ 	emit(ctx, sc, MIPS_R_T8, off, dst);
+-	emit(ctx, beqz, MIPS_R_T8, -16);
++	emit(ctx, LLSC_beqz, MIPS_R_T8, -16 - LLSC_offset);
+ 	emit(ctx, nop); /* Delay slot */
+ }
+ 
+ /* Atomic compare-and-exchange (32-bit) */
+ void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off)
+ {
++	LLSC_sync(ctx);
+ 	emit(ctx, ll, MIPS_R_T9, off, dst);
+ 	emit(ctx, bne, MIPS_R_T9, res, 12);
+ 	emit(ctx, move, MIPS_R_T8, src);     /* Delay slot */
+ 	emit(ctx, sc, MIPS_R_T8, off, dst);
+-	emit(ctx, beqz, MIPS_R_T8, -20);
++	emit(ctx, LLSC_beqz, MIPS_R_T8, -20 - LLSC_offset);
+ 	emit(ctx, move, res, MIPS_R_T9);     /* Delay slot */
+ 	clobber_reg(ctx, res);
+ }
+--- a/arch/mips/net/bpf_jit_comp.h
++++ b/arch/mips/net/bpf_jit_comp.h
+@@ -87,7 +87,7 @@ struct jit_context {
+ };
+ 
+ /* Emit the instruction if the JIT memory space has been allocated */
+-#define emit(ctx, func, ...)					\
++#define __emit(ctx, func, ...)					\
+ do {								\
+ 	if ((ctx)->target != NULL) {				\
+ 		u32 *p = &(ctx)->target[ctx->jit_index];	\
+@@ -95,6 +95,30 @@ do {								\
+ 	}							\
+ 	(ctx)->jit_index++;					\
+ } while (0)
++#define emit(...) __emit(__VA_ARGS__)
++
++/* Workaround for R10000 ll/sc errata */
++#ifdef CONFIG_WAR_R10000
++#define LLSC_beqz	beqzl
++#else
++#define LLSC_beqz	beqz
++#endif
++
++/* Workaround for Loongson-3 ll/sc errata */
++#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
++#define LLSC_sync(ctx)	emit(ctx, sync, 0)
++#define LLSC_offset	4
++#else
++#define LLSC_sync(ctx)
++#define LLSC_offset	0
++#endif
++
++/* Workaround for Loongson-2F jump errata */
++#ifdef CONFIG_CPU_JUMP_WORKAROUNDS
++#define JALR_MASK	0xffffffffcfffffffULL
++#else
++#define JALR_MASK	(~0ULL)
++#endif
+ 
+ /*
+  * Mark a BPF register as accessed, it needs to be
+--- a/arch/mips/net/bpf_jit_comp64.c
++++ b/arch/mips/net/bpf_jit_comp64.c
+@@ -375,6 +375,7 @@ static void emit_atomic_r64(struct jit_c
+ 	u8 t1 = MIPS_R_T6;
+ 	u8 t2 = MIPS_R_T7;
+ 
++	LLSC_sync(ctx);
+ 	emit(ctx, lld, t1, off, dst);
+ 	switch (code) {
+ 	case BPF_ADD:
+@@ -391,7 +392,7 @@ static void emit_atomic_r64(struct jit_c
+ 		break;
+ 	}
+ 	emit(ctx, scd, t2, off, dst);
+-	emit(ctx, beqz, t2, -16);
++	emit(ctx, LLSC_beqz, t2, -16 - LLSC_offset);
+ 	emit(ctx, nop); /* Delay slot */
+ }
+ 
+@@ -414,7 +415,7 @@ static int emit_call(struct jit_context
+ 	push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0);
+ 
+ 	/* Emit function call */
+-	emit_mov_i64(ctx, tmp, addr);
++	emit_mov_i64(ctx, tmp, addr & JALR_MASK);
+ 	emit(ctx, jalr, MIPS_R_RA, tmp);
+ 	emit(ctx, nop); /* Delay slot */
+ 

+ 61 - 0
target/linux/generic/backport-6.1/050-v5.16-05-mips-bpf-Enable-eBPF-JITs.patch

@@ -0,0 +1,61 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:07 +0200
+Subject: [PATCH] mips: bpf: Enable eBPF JITs
+
+This patch enables the new eBPF JITs for 32-bit and 64-bit MIPS. It also
+disables the old cBPF JIT so that cBPF programs are converted to use the
+new JIT.
+
+Workarounds for R4000 CPU errata are not implemented by the JIT, so the
+JIT is disabled if any of those workarounds are configured.
+
+Signed-off-by: Johan Almbladh <[email protected]>
+---
+
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -3431,6 +3431,7 @@ S:	Supported
+ F:	arch/arm64/net/
+ 
+ BPF JIT for MIPS (32-BIT AND 64-BIT)
++M:	Johan Almbladh <[email protected]>
+ M:	Paul Burton <[email protected]>
+ L:	[email protected]
+ L:	[email protected]
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -57,7 +57,6 @@ config MIPS
+ 	select HAVE_ARCH_TRACEHOOK
+ 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES
+ 	select HAVE_ASM_MODVERSIONS
+-	select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS
+ 	select HAVE_CONTEXT_TRACKING
+ 	select HAVE_TIF_NOHZ
+ 	select HAVE_C_RECORDMCOUNT
+@@ -65,7 +64,10 @@ config MIPS
+ 	select HAVE_DEBUG_STACKOVERFLOW
+ 	select HAVE_DMA_CONTIGUOUS
+ 	select HAVE_DYNAMIC_FTRACE
+-	select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2
++	select HAVE_EBPF_JIT if !CPU_MICROMIPS && \
++				!CPU_DADDI_WORKAROUNDS && \
++				!CPU_R4000_WORKAROUNDS && \
++				!CPU_R4400_WORKAROUNDS
+ 	select HAVE_EXIT_THREAD
+ 	select HAVE_FAST_GUP
+ 	select HAVE_FTRACE_MCOUNT_RECORD
+--- a/arch/mips/net/Makefile
++++ b/arch/mips/net/Makefile
+@@ -2,9 +2,10 @@
+ # MIPS networking code
+ 
+ obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o
++obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o
+ 
+ ifeq ($(CONFIG_32BIT),y)
+-        obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o bpf_jit_comp32.o
++        obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp32.o
+ else
+-        obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o
++        obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp64.o
+ endif

+ 387 - 0
target/linux/generic/backport-6.1/050-v5.16-06-mips-bpf-Remove-old-BPF-JIT-implementations.patch

@@ -0,0 +1,387 @@
+From: Johan Almbladh <[email protected]>
+Date: Tue, 5 Oct 2021 18:54:08 +0200
+Subject: [PATCH] mips: bpf: Remove old BPF JIT implementations
+
+This patch removes the old 32-bit cBPF and 64-bit eBPF JIT implementations.
+They are replaced by a new eBPF implementation that supports both 32-bit
+and 64-bit MIPS CPUs.
+
+Signed-off-by: Johan Almbladh <[email protected]>
+---
+ delete mode 100644 arch/mips/net/bpf_jit.c
+ delete mode 100644 arch/mips/net/bpf_jit.h
+ delete mode 100644 arch/mips/net/bpf_jit_asm.S
+ delete mode 100644 arch/mips/net/ebpf_jit.c
+
+--- a/arch/mips/net/bpf_jit.h
++++ /dev/null
+@@ -1,81 +0,0 @@
+-/* SPDX-License-Identifier: GPL-2.0-only */
+-/*
+- * Just-In-Time compiler for BPF filters on MIPS
+- *
+- * Copyright (c) 2014 Imagination Technologies Ltd.
+- * Author: Markos Chandras <[email protected]>
+- */
+-
+-#ifndef BPF_JIT_MIPS_OP_H
+-#define BPF_JIT_MIPS_OP_H
+-
+-/* Registers used by JIT */
+-#define MIPS_R_ZERO	0
+-#define MIPS_R_V0	2
+-#define MIPS_R_A0	4
+-#define MIPS_R_A1	5
+-#define MIPS_R_T4	12
+-#define MIPS_R_T5	13
+-#define MIPS_R_T6	14
+-#define MIPS_R_T7	15
+-#define MIPS_R_S0	16
+-#define MIPS_R_S1	17
+-#define MIPS_R_S2	18
+-#define MIPS_R_S3	19
+-#define MIPS_R_S4	20
+-#define MIPS_R_S5	21
+-#define MIPS_R_S6	22
+-#define MIPS_R_S7	23
+-#define MIPS_R_SP	29
+-#define MIPS_R_RA	31
+-
+-/* Conditional codes */
+-#define MIPS_COND_EQ	0x1
+-#define MIPS_COND_GE	(0x1 << 1)
+-#define MIPS_COND_GT	(0x1 << 2)
+-#define MIPS_COND_NE	(0x1 << 3)
+-#define MIPS_COND_ALL	(0x1 << 4)
+-/* Conditionals on X register or K immediate */
+-#define MIPS_COND_X	(0x1 << 5)
+-#define MIPS_COND_K	(0x1 << 6)
+-
+-#define r_ret	MIPS_R_V0
+-
+-/*
+- * Use 2 scratch registers to avoid pipeline interlocks.
+- * There is no overhead during epilogue and prologue since
+- * any of the $s0-$s6 registers will only be preserved if
+- * they are going to actually be used.
+- */
+-#define r_skb_hl	MIPS_R_S0 /* skb header length */
+-#define r_skb_data	MIPS_R_S1 /* skb actual data */
+-#define r_off		MIPS_R_S2
+-#define r_A		MIPS_R_S3
+-#define r_X		MIPS_R_S4
+-#define r_skb		MIPS_R_S5
+-#define r_M		MIPS_R_S6
+-#define r_skb_len	MIPS_R_S7
+-#define r_s0		MIPS_R_T4 /* scratch reg 1 */
+-#define r_s1		MIPS_R_T5 /* scratch reg 2 */
+-#define r_tmp_imm	MIPS_R_T6 /* No need to preserve this */
+-#define r_tmp		MIPS_R_T7 /* No need to preserve this */
+-#define r_zero		MIPS_R_ZERO
+-#define r_sp		MIPS_R_SP
+-#define r_ra		MIPS_R_RA
+-
+-#ifndef __ASSEMBLY__
+-
+-/* Declare ASM helpers */
+-
+-#define DECLARE_LOAD_FUNC(func) \
+-	extern u8 func(unsigned long *skb, int offset); \
+-	extern u8 func##_negative(unsigned long *skb, int offset); \
+-	extern u8 func##_positive(unsigned long *skb, int offset)
+-
+-DECLARE_LOAD_FUNC(sk_load_word);
+-DECLARE_LOAD_FUNC(sk_load_half);
+-DECLARE_LOAD_FUNC(sk_load_byte);
+-
+-#endif
+-
+-#endif /* BPF_JIT_MIPS_OP_H */
+--- a/arch/mips/net/bpf_jit_asm.S
++++ /dev/null
+@@ -1,285 +0,0 @@
+-/*
+- * bpf_jib_asm.S: Packet/header access helper functions for MIPS/MIPS64 BPF
+- * compiler.
+- *
+- * Copyright (C) 2015 Imagination Technologies Ltd.
+- * Author: Markos Chandras <[email protected]>
+- *
+- * This program is free software; you can redistribute it and/or modify it
+- * under the terms of the GNU General Public License as published by the
+- * Free Software Foundation; version 2 of the License.
+- */
+-
+-#include <asm/asm.h>
+-#include <asm/isa-rev.h>
+-#include <asm/regdef.h>
+-#include "bpf_jit.h"
+-
+-/* ABI
+- *
+- * r_skb_hl	skb header length
+- * r_skb_data	skb data
+- * r_off(a1)	offset register
+- * r_A		BPF register A
+- * r_X		PF register X
+- * r_skb(a0)	*skb
+- * r_M		*scratch memory
+- * r_skb_le	skb length
+- * r_s0		Scratch register 0
+- * r_s1		Scratch register 1
+- *
+- * On entry:
+- * a0: *skb
+- * a1: offset (imm or imm + X)
+- *
+- * All non-BPF-ABI registers are free for use. On return, we only
+- * care about r_ret. The BPF-ABI registers are assumed to remain
+- * unmodified during the entire filter operation.
+- */
+-
+-#define skb	a0
+-#define offset	a1
+-#define SKF_LL_OFF  (-0x200000) /* Can't include linux/filter.h in assembly */
+-
+-	/* We know better :) so prevent assembler reordering etc */
+-	.set 	noreorder
+-
+-#define is_offset_negative(TYPE)				\
+-	/* If offset is negative we have more work to do */	\
+-	slti	t0, offset, 0;					\
+-	bgtz	t0, bpf_slow_path_##TYPE##_neg;			\
+-	/* Be careful what follows in DS. */
+-
+-#define is_offset_in_header(SIZE, TYPE)				\
+-	/* Reading from header? */				\
+-	addiu	$r_s0, $r_skb_hl, -SIZE;			\
+-	slt	t0, $r_s0, offset;				\
+-	bgtz	t0, bpf_slow_path_##TYPE;			\
+-
+-LEAF(sk_load_word)
+-	is_offset_negative(word)
+-FEXPORT(sk_load_word_positive)
+-	is_offset_in_header(4, word)
+-	/* Offset within header boundaries */
+-	PTR_ADDU t1, $r_skb_data, offset
+-	.set	reorder
+-	lw	$r_A, 0(t1)
+-	.set	noreorder
+-#ifdef CONFIG_CPU_LITTLE_ENDIAN
+-# if MIPS_ISA_REV >= 2
+-	wsbh	t0, $r_A
+-	rotr	$r_A, t0, 16
+-# else
+-	sll	t0, $r_A, 24
+-	srl	t1, $r_A, 24
+-	srl	t2, $r_A, 8
+-	or	t0, t0, t1
+-	andi	t2, t2, 0xff00
+-	andi	t1, $r_A, 0xff00
+-	or	t0, t0, t2
+-	sll	t1, t1, 8
+-	or	$r_A, t0, t1
+-# endif
+-#endif
+-	jr	$r_ra
+-	 move	$r_ret, zero
+-	END(sk_load_word)
+-
+-LEAF(sk_load_half)
+-	is_offset_negative(half)
+-FEXPORT(sk_load_half_positive)
+-	is_offset_in_header(2, half)
+-	/* Offset within header boundaries */
+-	PTR_ADDU t1, $r_skb_data, offset
+-	lhu	$r_A, 0(t1)
+-#ifdef CONFIG_CPU_LITTLE_ENDIAN
+-# if MIPS_ISA_REV >= 2
+-	wsbh	$r_A, $r_A
+-# else
+-	sll	t0, $r_A, 8
+-	srl	t1, $r_A, 8
+-	andi	t0, t0, 0xff00
+-	or	$r_A, t0, t1
+-# endif
+-#endif
+-	jr	$r_ra
+-	 move	$r_ret, zero
+-	END(sk_load_half)
+-
+-LEAF(sk_load_byte)
+-	is_offset_negative(byte)
+-FEXPORT(sk_load_byte_positive)
+-	is_offset_in_header(1, byte)
+-	/* Offset within header boundaries */
+-	PTR_ADDU t1, $r_skb_data, offset
+-	lbu	$r_A, 0(t1)
+-	jr	$r_ra
+-	 move	$r_ret, zero
+-	END(sk_load_byte)
+-
+-/*
+- * call skb_copy_bits:
+- * (prototype in linux/skbuff.h)
+- *
+- * int skb_copy_bits(sk_buff *skb, int offset, void *to, int len)
+- *
+- * o32 mandates we leave 4 spaces for argument registers in case
+- * the callee needs to use them. Even though we don't care about
+- * the argument registers ourselves, we need to allocate that space
+- * to remain ABI compliant since the callee may want to use that space.
+- * We also allocate 2 more spaces for $r_ra and our return register (*to).
+- *
+- * n64 is a bit different. The *caller* will allocate the space to preserve
+- * the arguments. So in 64-bit kernels, we allocate the 4-arg space for no
+- * good reason but it does not matter that much really.
+- *
+- * (void *to) is returned in r_s0
+- *
+- */
+-#ifdef CONFIG_CPU_LITTLE_ENDIAN
+-#define DS_OFFSET(SIZE) (4 * SZREG)
+-#else
+-#define DS_OFFSET(SIZE) ((4 * SZREG) + (4 - SIZE))
+-#endif
+-#define bpf_slow_path_common(SIZE)				\
+-	/* Quick check. Are we within reasonable boundaries? */ \
+-	LONG_ADDIU	$r_s1, $r_skb_len, -SIZE;		\
+-	sltu		$r_s0, offset, $r_s1;			\
+-	beqz		$r_s0, fault;				\
+-	/* Load 4th argument in DS */				\
+-	 LONG_ADDIU	a3, zero, SIZE;				\
+-	PTR_ADDIU	$r_sp, $r_sp, -(6 * SZREG);		\
+-	PTR_LA		t0, skb_copy_bits;			\
+-	PTR_S		$r_ra, (5 * SZREG)($r_sp);		\
+-	/* Assign low slot to a2 */				\
+-	PTR_ADDIU	a2, $r_sp, DS_OFFSET(SIZE);		\
+-	jalr		t0;					\
+-	/* Reset our destination slot (DS but it's ok) */	\
+-	 INT_S		zero, (4 * SZREG)($r_sp);		\
+-	/*							\
+-	 * skb_copy_bits returns 0 on success and -EFAULT	\
+-	 * on error. Our data live in a2. Do not bother with	\
+-	 * our data if an error has been returned.		\
+-	 */							\
+-	/* Restore our frame */					\
+-	PTR_L		$r_ra, (5 * SZREG)($r_sp);		\
+-	INT_L		$r_s0, (4 * SZREG)($r_sp);		\
+-	bltz		v0, fault;				\
+-	 PTR_ADDIU	$r_sp, $r_sp, 6 * SZREG;		\
+-	move		$r_ret, zero;				\
+-
+-NESTED(bpf_slow_path_word, (6 * SZREG), $r_sp)
+-	bpf_slow_path_common(4)
+-#ifdef CONFIG_CPU_LITTLE_ENDIAN
+-# if MIPS_ISA_REV >= 2
+-	wsbh	t0, $r_s0
+-	jr	$r_ra
+-	 rotr	$r_A, t0, 16
+-# else
+-	sll	t0, $r_s0, 24
+-	srl	t1, $r_s0, 24
+-	srl	t2, $r_s0, 8
+-	or	t0, t0, t1
+-	andi	t2, t2, 0xff00
+-	andi	t1, $r_s0, 0xff00
+-	or	t0, t0, t2
+-	sll	t1, t1, 8
+-	jr	$r_ra
+-	 or	$r_A, t0, t1
+-# endif
+-#else
+-	jr	$r_ra
+-	 move	$r_A, $r_s0
+-#endif
+-
+-	END(bpf_slow_path_word)
+-
+-NESTED(bpf_slow_path_half, (6 * SZREG), $r_sp)
+-	bpf_slow_path_common(2)
+-#ifdef CONFIG_CPU_LITTLE_ENDIAN
+-# if MIPS_ISA_REV >= 2
+-	jr	$r_ra
+-	 wsbh	$r_A, $r_s0
+-# else
+-	sll	t0, $r_s0, 8
+-	andi	t1, $r_s0, 0xff00
+-	andi	t0, t0, 0xff00
+-	srl	t1, t1, 8
+-	jr	$r_ra
+-	 or	$r_A, t0, t1
+-# endif
+-#else
+-	jr	$r_ra
+-	 move	$r_A, $r_s0
+-#endif
+-
+-	END(bpf_slow_path_half)
+-
+-NESTED(bpf_slow_path_byte, (6 * SZREG), $r_sp)
+-	bpf_slow_path_common(1)
+-	jr	$r_ra
+-	 move	$r_A, $r_s0
+-
+-	END(bpf_slow_path_byte)
+-
+-/*
+- * Negative entry points
+- */
+-	.macro bpf_is_end_of_data
+-	li	t0, SKF_LL_OFF
+-	/* Reading link layer data? */
+-	slt	t1, offset, t0
+-	bgtz	t1, fault
+-	/* Be careful what follows in DS. */
+-	.endm
+-/*
+- * call skb_copy_bits:
+- * (prototype in linux/filter.h)
+- *
+- * void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
+- *                                            int k, unsigned int size)
+- *
+- * see above (bpf_slow_path_common) for ABI restrictions
+- */
+-#define bpf_negative_common(SIZE)					\
+-	PTR_ADDIU	$r_sp, $r_sp, -(6 * SZREG);			\
+-	PTR_LA		t0, bpf_internal_load_pointer_neg_helper;	\
+-	PTR_S		$r_ra, (5 * SZREG)($r_sp);			\
+-	jalr		t0;						\
+-	 li		a2, SIZE;					\
+-	PTR_L		$r_ra, (5 * SZREG)($r_sp);			\
+-	/* Check return pointer */					\
+-	beqz		v0, fault;					\
+-	 PTR_ADDIU	$r_sp, $r_sp, 6 * SZREG;			\
+-	/* Preserve our pointer */					\
+-	move		$r_s0, v0;					\
+-	/* Set return value */						\
+-	move		$r_ret, zero;					\
+-
+-bpf_slow_path_word_neg:
+-	bpf_is_end_of_data
+-NESTED(sk_load_word_negative, (6 * SZREG), $r_sp)
+-	bpf_negative_common(4)
+-	jr	$r_ra
+-	 lw	$r_A, 0($r_s0)
+-	END(sk_load_word_negative)
+-
+-bpf_slow_path_half_neg:
+-	bpf_is_end_of_data
+-NESTED(sk_load_half_negative, (6 * SZREG), $r_sp)
+-	bpf_negative_common(2)
+-	jr	$r_ra
+-	 lhu	$r_A, 0($r_s0)
+-	END(sk_load_half_negative)
+-
+-bpf_slow_path_byte_neg:
+-	bpf_is_end_of_data
+-NESTED(sk_load_byte_negative, (6 * SZREG), $r_sp)
+-	bpf_negative_common(1)
+-	jr	$r_ra
+-	 lbu	$r_A, 0($r_s0)
+-	END(sk_load_byte_negative)
+-
+-fault:
+-	jr	$r_ra
+-	 addiu $r_ret, zero, 1

+ 105 - 0
target/linux/generic/backport-6.1/080-v5.17-clk-gate-Add-devm_clk_hw_register_gate.patch

@@ -0,0 +1,105 @@
+From 815f0e738a8d5663a02350e2580706829144a722 Mon Sep 17 00:00:00 2001
+From: Horatiu Vultur <[email protected]>
+Date: Wed, 3 Nov 2021 09:50:59 +0100
+Subject: [PATCH] clk: gate: Add devm_clk_hw_register_gate()
+
+Add devm_clk_hw_register_gate() - devres-managed version of
+clk_hw_register_gate()
+
+Suggested-by: Stephen Boyd <[email protected]>
+Signed-off-by: Horatiu Vultur <[email protected]>
+Acked-by: Nicolas Ferre <[email protected]>
+Signed-off-by: Nicolas Ferre <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+---
+ drivers/clk/clk-gate.c       | 35 +++++++++++++++++++++++++++++++++++
+ include/linux/clk-provider.h | 23 +++++++++++++++++++++++
+ 2 files changed, 58 insertions(+)
+
+--- a/drivers/clk/clk-gate.c
++++ b/drivers/clk/clk-gate.c
+@@ -7,6 +7,7 @@
+  */
+ 
+ #include <linux/clk-provider.h>
++#include <linux/device.h>
+ #include <linux/module.h>
+ #include <linux/slab.h>
+ #include <linux/io.h>
+@@ -222,3 +223,37 @@ void clk_hw_unregister_gate(struct clk_h
+ 	kfree(gate);
+ }
+ EXPORT_SYMBOL_GPL(clk_hw_unregister_gate);
++
++static void devm_clk_hw_release_gate(struct device *dev, void *res)
++{
++	clk_hw_unregister_gate(*(struct clk_hw **)res);
++}
++
++struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
++		struct device_node *np, const char *name,
++		const char *parent_name, const struct clk_hw *parent_hw,
++		const struct clk_parent_data *parent_data,
++		unsigned long flags,
++		void __iomem *reg, u8 bit_idx,
++		u8 clk_gate_flags, spinlock_t *lock)
++{
++	struct clk_hw **ptr, *hw;
++
++	ptr = devres_alloc(devm_clk_hw_release_gate, sizeof(*ptr), GFP_KERNEL);
++	if (!ptr)
++		return ERR_PTR(-ENOMEM);
++
++	hw = __clk_hw_register_gate(dev, np, name, parent_name, parent_hw,
++				    parent_data, flags, reg, bit_idx,
++				    clk_gate_flags, lock);
++
++	if (!IS_ERR(hw)) {
++		*ptr = hw;
++		devres_add(dev, ptr);
++	} else {
++		devres_free(ptr);
++	}
++
++	return hw;
++}
++EXPORT_SYMBOL_GPL(__devm_clk_hw_register_gate);
+--- a/include/linux/clk-provider.h
++++ b/include/linux/clk-provider.h
+@@ -490,6 +490,13 @@ struct clk_hw *__clk_hw_register_gate(st
+ 		unsigned long flags,
+ 		void __iomem *reg, u8 bit_idx,
+ 		u8 clk_gate_flags, spinlock_t *lock);
++struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
++		struct device_node *np, const char *name,
++		const char *parent_name, const struct clk_hw *parent_hw,
++		const struct clk_parent_data *parent_data,
++		unsigned long flags,
++		void __iomem *reg, u8 bit_idx,
++		u8 clk_gate_flags, spinlock_t *lock);
+ struct clk *clk_register_gate(struct device *dev, const char *name,
+ 		const char *parent_name, unsigned long flags,
+ 		void __iomem *reg, u8 bit_idx,
+@@ -544,6 +551,22 @@ struct clk *clk_register_gate(struct dev
+ 	__clk_hw_register_gate((dev), NULL, (name), NULL, NULL, (parent_data), \
+ 			       (flags), (reg), (bit_idx),		      \
+ 			       (clk_gate_flags), (lock))
++/**
++ * devm_clk_hw_register_gate - register a gate clock with the clock framework
++ * @dev: device that is registering this clock
++ * @name: name of this clock
++ * @parent_name: name of this clock's parent
++ * @flags: framework-specific flags for this clock
++ * @reg: register address to control gating of this clock
++ * @bit_idx: which bit in the register controls gating of this clock
++ * @clk_gate_flags: gate-specific flags for this clock
++ * @lock: shared register lock for this clock
++ */
++#define devm_clk_hw_register_gate(dev, name, parent_name, flags, reg, bit_idx,\
++				  clk_gate_flags, lock)			      \
++	__devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
++			       NULL, (flags), (reg), (bit_idx),		      \
++			       (clk_gate_flags), (lock))
+ void clk_unregister_gate(struct clk *clk);
+ void clk_hw_unregister_gate(struct clk_hw *hw);
+ int clk_gate_is_enabled(struct clk_hw *hw);

+ 52 - 0
target/linux/generic/backport-6.1/081-v5.17-regmap-allow-to-define-reg_update_bits-for-no-bus.patch

@@ -0,0 +1,52 @@
+From 02d6fdecb9c38de19065f6bed8d5214556fd061d Mon Sep 17 00:00:00 2001
+From: Ansuel Smith <[email protected]>
+Date: Thu, 4 Nov 2021 16:00:40 +0100
+Subject: regmap: allow to define reg_update_bits for no bus configuration
+
+Some devices require special handling for reg_update_bits and can't use
+the normal regmap read/write logic. An example is when locking is
+handled by the device and rmw operations require atomic operations.
+Allow declaring a dedicated function in regmap_config for
+reg_update_bits in the no-bus configuration.
+
+Signed-off-by: Ansuel Smith <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Mark Brown <[email protected]>
+---
+ drivers/base/regmap/regmap.c | 1 +
+ include/linux/regmap.h       | 7 +++++++
+ 2 files changed, 8 insertions(+)
+
+--- a/drivers/base/regmap/regmap.c
++++ b/drivers/base/regmap/regmap.c
+@@ -877,6 +877,7 @@ struct regmap *__regmap_init(struct devi
+ 	if (!bus) {
+ 		map->reg_read  = config->reg_read;
+ 		map->reg_write = config->reg_write;
++		map->reg_update_bits = config->reg_update_bits;
+ 
+ 		map->defer_caching = false;
+ 		goto skip_format_initialization;
+--- a/include/linux/regmap.h
++++ b/include/linux/regmap.h
+@@ -290,6 +290,11 @@ typedef void (*regmap_unlock)(void *);
+  *		  read operation on a bus such as SPI, I2C, etc. Most of the
+  *		  devices do not need this.
+  * @reg_write:	  Same as above for writing.
++ * @reg_update_bits: Optional callback that if filled will be used to perform
++ *		     all the update_bits(rmw) operations. Should only be provided
++ *		     if the function requires special handling with lock and reg
++ *		     handling and the operation cannot be represented as a simple
++ *		     update_bits operation on a bus such as SPI, I2C, etc.
+  * @fast_io:	  Register IO is fast. Use a spinlock instead of a mutex
+  *	     	  to perform locking. This field is ignored if custom lock/unlock
+  *	     	  functions are used (see fields lock/unlock of struct regmap_config).
+@@ -372,6 +377,8 @@ struct regmap_config {
+ 
+ 	int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
+ 	int (*reg_write)(void *context, unsigned int reg, unsigned int val);
++	int (*reg_update_bits)(void *context, unsigned int reg,
++			       unsigned int mask, unsigned int val);
+ 
+ 	bool fast_io;
+ 

+ 37 - 0
target/linux/generic/backport-6.1/100-v5.18-tty-serial-bcm63xx-use-more-precise-Kconfig-symbol.patch

@@ -0,0 +1,37 @@
+From 0dc0da881b4574d1e04a079ab2ea75da61f5ad2e Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Fri, 11 Mar 2022 10:32:33 +0100
+Subject: [PATCH] tty: serial: bcm63xx: use more precise Kconfig symbol
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Patches lowering SERIAL_BCM63XX dependencies led to a discussion and
+documentation change regarding "depends" usage. Adjust Kconfig entry to
+match current guidelines. Make this symbol available for relevant
+architectures only.
+
+Cc: Geert Uytterhoeven <[email protected]>
+Reviewed-by: Geert Uytterhoeven <[email protected]>
+Acked-by: Florian Fainelli <[email protected]>
+Signed-off-by: Rafał Miłecki <[email protected]>
+Ref: f35a07f92616 ("tty: serial: bcm63xx: lower driver dependencies")
+Ref: 18084e435ff6 ("Documentation/kbuild: Document platform dependency practises")
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Greg Kroah-Hartman <[email protected]>
+---
+ drivers/tty/serial/Kconfig | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/tty/serial/Kconfig
++++ b/drivers/tty/serial/Kconfig
+@@ -1098,7 +1098,8 @@ config SERIAL_TIMBERDALE
+ config SERIAL_BCM63XX
+ 	tristate "Broadcom BCM63xx/BCM33xx UART support"
+ 	select SERIAL_CORE
+-	depends on COMMON_CLK
++	depends on ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC || COMPILE_TEST
++	default ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC
+ 	help
+ 	  This enables the driver for the onchip UART core found on
+ 	  the following chipsets:

+ 49 - 0
target/linux/generic/backport-6.1/200-v5.18-tools-resolve_btfids-Build-with-host-flags.patch

@@ -0,0 +1,49 @@
+From cdbc4e3399ed8cdcf234a85f7a2482b622379e82 Mon Sep 17 00:00:00 2001
+From: Connor O'Brien <[email protected]>
+Date: Wed, 12 Jan 2022 00:25:03 +0000
+Subject: [PATCH] tools/resolve_btfids: Build with host flags
+
+resolve_btfids is built using $(HOSTCC) and $(HOSTLD) but does not
+pick up the corresponding flags. As a result, host-specific settings
+(such as a sysroot specified via HOSTCFLAGS=--sysroot=..., or a linker
+specified via HOSTLDFLAGS=-fuse-ld=...) will not be respected.
+
+Fix this by setting CFLAGS to KBUILD_HOSTCFLAGS and LDFLAGS to
+KBUILD_HOSTLDFLAGS.
+
+Also pass the cflags through to libbpf via EXTRA_CFLAGS to ensure that
+the host libbpf is built with flags consistent with resolve_btfids.
+
+Signed-off-by: Connor O'Brien <[email protected]>
+Signed-off-by: Andrii Nakryiko <[email protected]>
+Acked-by: Song Liu <[email protected]>
+Link: https://lore.kernel.org/bpf/[email protected]
+(cherry picked from commit 0e3a1c902ffb56e9fe4416f0cd382c97b09ecbf6)
+Signed-off-by: Stijn Tintel <[email protected]>
+---
+ tools/bpf/resolve_btfids/Makefile | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/tools/bpf/resolve_btfids/Makefile
++++ b/tools/bpf/resolve_btfids/Makefile
+@@ -23,6 +23,8 @@ CC       = $(HOSTCC)
+ LD       = $(HOSTLD)
+ ARCH     = $(HOSTARCH)
+ RM      ?= rm
++CFLAGS  := $(KBUILD_HOSTCFLAGS)
++LDFLAGS := $(KBUILD_HOSTLDFLAGS)
+ 
+ OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/
+ 
+@@ -45,9 +47,9 @@ $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/l
+ 	$(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+ 
+ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+-	$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC)  OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
++	$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC)  OUTPUT=$(abspath $(dir $@))/ EXTRA_CFLAGS="$(CFLAGS)" $(abspath $@)
+ 
+-CFLAGS := -g \
++CFLAGS += -g \
+           -I$(srctree)/tools/include \
+           -I$(srctree)/tools/include/uapi \
+           -I$(LIBBPF_SRC) \

+ 997 - 0
target/linux/generic/backport-6.1/201-v5.16-scripts-dtc-Update-to-upstream-version-v1.6.1-19-g0a.patch

@@ -0,0 +1,997 @@
+From a77725a9a3c5924e2fd4cd5b3557dd92a8e46f87 Mon Sep 17 00:00:00 2001
+From: Rob Herring <[email protected]>
+Date: Mon, 25 Oct 2021 11:05:45 -0500
+Subject: [PATCH 1/1] scripts/dtc: Update to upstream version
+ v1.6.1-19-g0a3a9d3449c8
+
+This adds the following commits from upstream:
+
+0a3a9d3449c8 checks: Add an interrupt-map check
+8fd24744e361 checks: Ensure '#interrupt-cells' only exists in interrupt providers
+d8d1a9a77863 checks: Drop interrupt provider '#address-cells' check
+52a16fd72824 checks: Make interrupt_provider check dependent on interrupts_extended_is_cell
+37fd700685da treesource: Maintain phandle label/path on output
+e33ce1d6a8c7 flattree: Use '\n', not ';' to separate asm pseudo-ops
+d24cc189dca6 asm: Use assembler macros instead of cpp macros
+ff3a30c115ad asm: Use .asciz and .ascii instead of .string
+5eb5927d81ee fdtdump: fix -Werror=int-to-pointer-cast
+0869f8269161 libfdt: Add ALIGNMENT error string
+69595a167f06 checks: Fix bus-range check
+72d09e2682a4 Makefile: add -Wsign-compare to warning options
+b587787ef388 checks: Fix signedness comparisons warnings
+69bed6c2418f dtc: Wrap phandle validity check
+910221185560 fdtget: Fix signedness comparisons warnings
+d966f08fcd21 tests: Fix signedness comparisons warnings
+ecfb438c07fa dtc: Fix signedness comparisons warnings: pointer diff
+5bec74a6d135 dtc: Fix signedness comparisons warnings: reservednum
+24e7f511fd4a fdtdump: Fix signedness comparisons warnings
+b6910bec1161 Bump version to v1.6.1
+21d61d18f968 Fix CID 1461557
+4c2ef8f4d14c checks: Introduce is_multiple_of()
+e59ca36fb70e Make handling of cpp line information more tolerant
+0c3fd9b6aceb checks: Drop interrupt_cells_is_cell check
+6b3081abc4ac checks: Add check_is_cell() for all phandle+arg properties
+2dffc192a77f yamltree: Remove marker ordering dependency
+61e513439e40 pylibfdt: Rework "avoid unused variable warning" lines
+c8bddd106095 tests: add a positive gpio test case
+ad4abfadb687 checks: replace strstr and strrchr with strends
+09c6a6e88718 dtc.h: add strends for suffix matching
+9bb9b8d0b4a0 checks: tigthen up nr-gpios prop exception
+b07b62ee3342 libfdt: Add FDT alignment check to fdt_check_header()
+a2def5479950 libfdt: Check that the root-node name is empty
+4ca61f84dc21 libfdt: Check that there is only one root node
+34d708249a91 dtc: Remove -O dtbo support
+8e7ff260f755 libfdt: Fix a possible "unchecked return value" warning
+88875268c05c checks: Warn on node-name and property name being the same
+9d2279e7e6ee checks: Change node-name check to match devicetree spec
+f527c867a8c6 util: limit gnu_printf format attribute to gcc >= 4.4.0
+
+Reviewed-by: Frank Rowand <[email protected]>
+Tested-by: Frank Rowand <[email protected]>
+Signed-off-by: Rob Herring <[email protected]>
+---
+ scripts/dtc/checks.c              | 222 ++++++++++++++++++++++--------
+ scripts/dtc/dtc-lexer.l           |   2 +-
+ scripts/dtc/dtc.c                 |   6 +-
+ scripts/dtc/dtc.h                 |  40 +++++-
+ scripts/dtc/flattree.c            |  11 +-
+ scripts/dtc/libfdt/fdt.c          |   4 +
+ scripts/dtc/libfdt/fdt_rw.c       |  18 ++-
+ scripts/dtc/libfdt/fdt_strerror.c |   1 +
+ scripts/dtc/libfdt/libfdt.h       |   7 +
+ scripts/dtc/livetree.c            |   6 +-
+ scripts/dtc/treesource.c          |  48 +++----
+ scripts/dtc/util.h                |   6 +-
+ scripts/dtc/version_gen.h         |   2 +-
+ scripts/dtc/yamltree.c            |  16 ++-
+ 14 files changed, 275 insertions(+), 114 deletions(-)
+
+--- a/scripts/dtc/checks.c
++++ b/scripts/dtc/checks.c
+@@ -143,6 +143,14 @@ static void check_nodes_props(struct che
+ 		check_nodes_props(c, dti, child);
+ }
+ 
++static bool is_multiple_of(int multiple, int divisor)
++{
++	if (divisor == 0)
++		return multiple == 0;
++	else
++		return (multiple % divisor) == 0;
++}
++
+ static bool run_check(struct check *c, struct dt_info *dti)
+ {
+ 	struct node *dt = dti->dt;
+@@ -297,19 +305,20 @@ ERROR(duplicate_property_names, check_du
+ #define LOWERCASE	"abcdefghijklmnopqrstuvwxyz"
+ #define UPPERCASE	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ #define DIGITS		"0123456789"
+-#define PROPNODECHARS	LOWERCASE UPPERCASE DIGITS ",._+*#?-"
++#define NODECHARS	LOWERCASE UPPERCASE DIGITS ",._+-@"
++#define PROPCHARS	LOWERCASE UPPERCASE DIGITS ",._+*#?-"
+ #define PROPNODECHARSSTRICT	LOWERCASE UPPERCASE DIGITS ",-"
+ 
+ static void check_node_name_chars(struct check *c, struct dt_info *dti,
+ 				  struct node *node)
+ {
+-	int n = strspn(node->name, c->data);
++	size_t n = strspn(node->name, c->data);
+ 
+ 	if (n < strlen(node->name))
+ 		FAIL(c, dti, node, "Bad character '%c' in node name",
+ 		     node->name[n]);
+ }
+-ERROR(node_name_chars, check_node_name_chars, PROPNODECHARS "@");
++ERROR(node_name_chars, check_node_name_chars, NODECHARS);
+ 
+ static void check_node_name_chars_strict(struct check *c, struct dt_info *dti,
+ 					 struct node *node)
+@@ -330,6 +339,20 @@ static void check_node_name_format(struc
+ }
+ ERROR(node_name_format, check_node_name_format, NULL, &node_name_chars);
+ 
++static void check_node_name_vs_property_name(struct check *c,
++					     struct dt_info *dti,
++					     struct node *node)
++{
++	if (!node->parent)
++		return;
++
++	if (get_property(node->parent, node->name)) {
++		FAIL(c, dti, node, "node name and property name conflict");
++	}
++}
++WARNING(node_name_vs_property_name, check_node_name_vs_property_name,
++	NULL, &node_name_chars);
++
+ static void check_unit_address_vs_reg(struct check *c, struct dt_info *dti,
+ 				      struct node *node)
+ {
+@@ -363,14 +386,14 @@ static void check_property_name_chars(st
+ 	struct property *prop;
+ 
+ 	for_each_property(node, prop) {
+-		int n = strspn(prop->name, c->data);
++		size_t n = strspn(prop->name, c->data);
+ 
+ 		if (n < strlen(prop->name))
+ 			FAIL_PROP(c, dti, node, prop, "Bad character '%c' in property name",
+ 				  prop->name[n]);
+ 	}
+ }
+-ERROR(property_name_chars, check_property_name_chars, PROPNODECHARS);
++ERROR(property_name_chars, check_property_name_chars, PROPCHARS);
+ 
+ static void check_property_name_chars_strict(struct check *c,
+ 					     struct dt_info *dti,
+@@ -380,7 +403,7 @@ static void check_property_name_chars_st
+ 
+ 	for_each_property(node, prop) {
+ 		const char *name = prop->name;
+-		int n = strspn(name, c->data);
++		size_t n = strspn(name, c->data);
+ 
+ 		if (n == strlen(prop->name))
+ 			continue;
+@@ -497,7 +520,7 @@ static cell_t check_phandle_prop(struct
+ 
+ 	phandle = propval_cell(prop);
+ 
+-	if ((phandle == 0) || (phandle == -1)) {
++	if (!phandle_is_valid(phandle)) {
+ 		FAIL_PROP(c, dti, node, prop, "bad value (0x%x) in %s property",
+ 		     phandle, prop->name);
+ 		return 0;
+@@ -556,7 +579,7 @@ static void check_name_properties(struct
+ 	if (!prop)
+ 		return; /* No name property, that's fine */
+ 
+-	if ((prop->val.len != node->basenamelen+1)
++	if ((prop->val.len != node->basenamelen + 1U)
+ 	    || (memcmp(prop->val.val, node->name, node->basenamelen) != 0)) {
+ 		FAIL(c, dti, node, "\"name\" property is incorrect (\"%s\" instead"
+ 		     " of base node name)", prop->val.val);
+@@ -657,7 +680,6 @@ ERROR(omit_unused_nodes, fixup_omit_unus
+  */
+ WARNING_IF_NOT_CELL(address_cells_is_cell, "#address-cells");
+ WARNING_IF_NOT_CELL(size_cells_is_cell, "#size-cells");
+-WARNING_IF_NOT_CELL(interrupt_cells_is_cell, "#interrupt-cells");
+ 
+ WARNING_IF_NOT_STRING(device_type_is_string, "device_type");
+ WARNING_IF_NOT_STRING(model_is_string, "model");
+@@ -672,8 +694,7 @@ static void check_names_is_string_list(s
+ 	struct property *prop;
+ 
+ 	for_each_property(node, prop) {
+-		const char *s = strrchr(prop->name, '-');
+-		if (!s || !streq(s, "-names"))
++		if (!strends(prop->name, "-names"))
+ 			continue;
+ 
+ 		c->data = prop->name;
+@@ -753,7 +774,7 @@ static void check_reg_format(struct chec
+ 	size_cells = node_size_cells(node->parent);
+ 	entrylen = (addr_cells + size_cells) * sizeof(cell_t);
+ 
+-	if (!entrylen || (prop->val.len % entrylen) != 0)
++	if (!is_multiple_of(prop->val.len, entrylen))
+ 		FAIL_PROP(c, dti, node, prop, "property has invalid length (%d bytes) "
+ 			  "(#address-cells == %d, #size-cells == %d)",
+ 			  prop->val.len, addr_cells, size_cells);
+@@ -794,7 +815,7 @@ static void check_ranges_format(struct c
+ 				  "#size-cells (%d) differs from %s (%d)",
+ 				  ranges, c_size_cells, node->parent->fullpath,
+ 				  p_size_cells);
+-	} else if ((prop->val.len % entrylen) != 0) {
++	} else if (!is_multiple_of(prop->val.len, entrylen)) {
+ 		FAIL_PROP(c, dti, node, prop, "\"%s\" property has invalid length (%d bytes) "
+ 			  "(parent #address-cells == %d, child #address-cells == %d, "
+ 			  "#size-cells == %d)", ranges, prop->val.len,
+@@ -871,7 +892,7 @@ static void check_pci_device_bus_num(str
+ 	} else {
+ 		cells = (cell_t *)prop->val.val;
+ 		min_bus = fdt32_to_cpu(cells[0]);
+-		max_bus = fdt32_to_cpu(cells[0]);
++		max_bus = fdt32_to_cpu(cells[1]);
+ 	}
+ 	if ((bus_num < min_bus) || (bus_num > max_bus))
+ 		FAIL_PROP(c, dti, node, prop, "PCI bus number %d out of range, expected (%d - %d)",
+@@ -1367,9 +1388,9 @@ static void check_property_phandle_args(
+ 				          const struct provider *provider)
+ {
+ 	struct node *root = dti->dt;
+-	int cell, cellsize = 0;
++	unsigned int cell, cellsize = 0;
+ 
+-	if (prop->val.len % sizeof(cell_t)) {
++	if (!is_multiple_of(prop->val.len, sizeof(cell_t))) {
+ 		FAIL_PROP(c, dti, node, prop,
+ 			  "property size (%d) is invalid, expected multiple of %zu",
+ 			  prop->val.len, sizeof(cell_t));
+@@ -1379,14 +1400,14 @@ static void check_property_phandle_args(
+ 	for (cell = 0; cell < prop->val.len / sizeof(cell_t); cell += cellsize + 1) {
+ 		struct node *provider_node;
+ 		struct property *cellprop;
+-		int phandle;
++		cell_t phandle;
+ 
+ 		phandle = propval_cell_n(prop, cell);
+ 		/*
+ 		 * Some bindings use a cell value 0 or -1 to skip over optional
+ 		 * entries when each index position has a specific definition.
+ 		 */
+-		if (phandle == 0 || phandle == -1) {
++		if (!phandle_is_valid(phandle)) {
+ 			/* Give up if this is an overlay with external references */
+ 			if (dti->dtsflags & DTSF_PLUGIN)
+ 				break;
+@@ -1452,7 +1473,8 @@ static void check_provider_cells_propert
+ }
+ #define WARNING_PROPERTY_PHANDLE_CELLS(nm, propname, cells_name, ...) \
+ 	static struct provider nm##_provider = { (propname), (cells_name), __VA_ARGS__ }; \
+-	WARNING(nm##_property, check_provider_cells_property, &nm##_provider, &phandle_references);
++	WARNING_IF_NOT_CELL(nm##_is_cell, cells_name); \
++	WARNING(nm##_property, check_provider_cells_property, &nm##_provider, &nm##_is_cell, &phandle_references);
+ 
+ WARNING_PROPERTY_PHANDLE_CELLS(clocks, "clocks", "#clock-cells");
+ WARNING_PROPERTY_PHANDLE_CELLS(cooling_device, "cooling-device", "#cooling-cells");
+@@ -1473,24 +1495,17 @@ WARNING_PROPERTY_PHANDLE_CELLS(thermal_s
+ 
+ static bool prop_is_gpio(struct property *prop)
+ {
+-	char *str;
+-
+ 	/*
+ 	 * *-gpios and *-gpio can appear in property names,
+ 	 * so skip over any false matches (only one known ATM)
+ 	 */
+-	if (strstr(prop->name, "nr-gpio"))
++	if (strends(prop->name, ",nr-gpios"))
+ 		return false;
+ 
+-	str = strrchr(prop->name, '-');
+-	if (str)
+-		str++;
+-	else
+-		str = prop->name;
+-	if (!(streq(str, "gpios") || streq(str, "gpio")))
+-		return false;
+-
+-	return true;
++	return strends(prop->name, "-gpios") ||
++		streq(prop->name, "gpios") ||
++		strends(prop->name, "-gpio") ||
++		streq(prop->name, "gpio");
+ }
+ 
+ static void check_gpios_property(struct check *c,
+@@ -1525,13 +1540,10 @@ static void check_deprecated_gpio_proper
+ 	struct property *prop;
+ 
+ 	for_each_property(node, prop) {
+-		char *str;
+-
+ 		if (!prop_is_gpio(prop))
+ 			continue;
+ 
+-		str = strstr(prop->name, "gpio");
+-		if (!streq(str, "gpio"))
++		if (!strends(prop->name, "gpio"))
+ 			continue;
+ 
+ 		FAIL_PROP(c, dti, node, prop,
+@@ -1561,21 +1573,106 @@ static void check_interrupt_provider(str
+ 				     struct node *node)
+ {
+ 	struct property *prop;
++	bool irq_provider = node_is_interrupt_provider(node);
+ 
+-	if (!node_is_interrupt_provider(node))
++	prop = get_property(node, "#interrupt-cells");
++	if (irq_provider && !prop) {
++		FAIL(c, dti, node,
++		     "Missing '#interrupt-cells' in interrupt provider");
+ 		return;
++	}
+ 
+-	prop = get_property(node, "#interrupt-cells");
+-	if (!prop)
++	if (!irq_provider && prop) {
+ 		FAIL(c, dti, node,
+-		     "Missing #interrupt-cells in interrupt provider");
++		     "'#interrupt-cells' found, but node is not an interrupt provider");
++		return;
++	}
++}
++WARNING(interrupt_provider, check_interrupt_provider, NULL, &interrupts_extended_is_cell);
+ 
+-	prop = get_property(node, "#address-cells");
+-	if (!prop)
++static void check_interrupt_map(struct check *c,
++				struct dt_info *dti,
++				struct node *node)
++{
++	struct node *root = dti->dt;
++	struct property *prop, *irq_map_prop;
++	size_t cellsize, cell, map_cells;
++
++	irq_map_prop = get_property(node, "interrupt-map");
++	if (!irq_map_prop)
++		return;
++
++	if (node->addr_cells < 0) {
+ 		FAIL(c, dti, node,
+-		     "Missing #address-cells in interrupt provider");
++		     "Missing '#address-cells' in interrupt-map provider");
++		return;
++	}
++	cellsize = node_addr_cells(node);
++	cellsize += propval_cell(get_property(node, "#interrupt-cells"));
++
++	prop = get_property(node, "interrupt-map-mask");
++	if (prop && (prop->val.len != (cellsize * sizeof(cell_t))))
++		FAIL_PROP(c, dti, node, prop,
++			  "property size (%d) is invalid, expected %zu",
++			  prop->val.len, cellsize * sizeof(cell_t));
++
++	if (!is_multiple_of(irq_map_prop->val.len, sizeof(cell_t))) {
++		FAIL_PROP(c, dti, node, irq_map_prop,
++			  "property size (%d) is invalid, expected multiple of %zu",
++			  irq_map_prop->val.len, sizeof(cell_t));
++		return;
++	}
++
++	map_cells = irq_map_prop->val.len / sizeof(cell_t);
++	for (cell = 0; cell < map_cells; ) {
++		struct node *provider_node;
++		struct property *cellprop;
++		int phandle;
++		size_t parent_cellsize;
++
++		if ((cell + cellsize) >= map_cells) {
++			FAIL_PROP(c, dti, node, irq_map_prop,
++				  "property size (%d) too small, expected > %zu",
++				  irq_map_prop->val.len, (cell + cellsize) * sizeof(cell_t));
++			break;
++		}
++		cell += cellsize;
++
++		phandle = propval_cell_n(irq_map_prop, cell);
++		if (!phandle_is_valid(phandle)) {
++			/* Give up if this is an overlay with external references */
++			if (!(dti->dtsflags & DTSF_PLUGIN))
++				FAIL_PROP(c, dti, node, irq_map_prop,
++					  "Cell %zu is not a phandle(%d)",
++					  cell, phandle);
++			break;
++		}
++
++		provider_node = get_node_by_phandle(root, phandle);
++		if (!provider_node) {
++			FAIL_PROP(c, dti, node, irq_map_prop,
++				  "Could not get phandle(%d) node for (cell %zu)",
++				  phandle, cell);
++			break;
++		}
++
++		cellprop = get_property(provider_node, "#interrupt-cells");
++		if (cellprop) {
++			parent_cellsize = propval_cell(cellprop);
++		} else {
++			FAIL(c, dti, node, "Missing property '#interrupt-cells' in node %s or bad phandle (referred from interrupt-map[%zu])",
++			     provider_node->fullpath, cell);
++			break;
++		}
++
++		cellprop = get_property(provider_node, "#address-cells");
++		if (cellprop)
++			parent_cellsize += propval_cell(cellprop);
++
++		cell += 1 + parent_cellsize;
++	}
+ }
+-WARNING(interrupt_provider, check_interrupt_provider, NULL);
++WARNING(interrupt_map, check_interrupt_map, NULL, &phandle_references, &addr_size_cells, &interrupt_provider);
+ 
+ static void check_interrupts_property(struct check *c,
+ 				      struct dt_info *dti,
+@@ -1584,13 +1681,13 @@ static void check_interrupts_property(st
+ 	struct node *root = dti->dt;
+ 	struct node *irq_node = NULL, *parent = node;
+ 	struct property *irq_prop, *prop = NULL;
+-	int irq_cells, phandle;
++	cell_t irq_cells, phandle;
+ 
+ 	irq_prop = get_property(node, "interrupts");
+ 	if (!irq_prop)
+ 		return;
+ 
+-	if (irq_prop->val.len % sizeof(cell_t))
++	if (!is_multiple_of(irq_prop->val.len, sizeof(cell_t)))
+ 		FAIL_PROP(c, dti, node, irq_prop, "size (%d) is invalid, expected multiple of %zu",
+ 		     irq_prop->val.len, sizeof(cell_t));
+ 
+@@ -1603,7 +1700,7 @@ static void check_interrupts_property(st
+ 		prop = get_property(parent, "interrupt-parent");
+ 		if (prop) {
+ 			phandle = propval_cell(prop);
+-			if ((phandle == 0) || (phandle == -1)) {
++			if (!phandle_is_valid(phandle)) {
+ 				/* Give up if this is an overlay with
+ 				 * external references */
+ 				if (dti->dtsflags & DTSF_PLUGIN)
+@@ -1639,7 +1736,7 @@ static void check_interrupts_property(st
+ 	}
+ 
+ 	irq_cells = propval_cell(prop);
+-	if (irq_prop->val.len % (irq_cells * sizeof(cell_t))) {
++	if (!is_multiple_of(irq_prop->val.len, irq_cells * sizeof(cell_t))) {
+ 		FAIL_PROP(c, dti, node, prop,
+ 			  "size is (%d), expected multiple of %d",
+ 			  irq_prop->val.len, (int)(irq_cells * sizeof(cell_t)));
+@@ -1750,7 +1847,7 @@ WARNING(graph_port, check_graph_port, NU
+ static struct node *get_remote_endpoint(struct check *c, struct dt_info *dti,
+ 					struct node *endpoint)
+ {
+-	int phandle;
++	cell_t phandle;
+ 	struct node *node;
+ 	struct property *prop;
+ 
+@@ -1760,7 +1857,7 @@ static struct node *get_remote_endpoint(
+ 
+ 	phandle = propval_cell(prop);
+ 	/* Give up if this is an overlay with external references */
+-	if (phandle == 0 || phandle == -1)
++	if (!phandle_is_valid(phandle))
+ 		return NULL;
+ 
+ 	node = get_node_by_phandle(dti->dt, phandle);
+@@ -1796,7 +1893,7 @@ WARNING(graph_endpoint, check_graph_endp
+ static struct check *check_table[] = {
+ 	&duplicate_node_names, &duplicate_property_names,
+ 	&node_name_chars, &node_name_format, &property_name_chars,
+-	&name_is_string, &name_properties,
++	&name_is_string, &name_properties, &node_name_vs_property_name,
+ 
+ 	&duplicate_label,
+ 
+@@ -1804,7 +1901,7 @@ static struct check *check_table[] = {
+ 	&phandle_references, &path_references,
+ 	&omit_unused_nodes,
+ 
+-	&address_cells_is_cell, &size_cells_is_cell, &interrupt_cells_is_cell,
++	&address_cells_is_cell, &size_cells_is_cell,
+ 	&device_type_is_string, &model_is_string, &status_is_string,
+ 	&label_is_string,
+ 
+@@ -1839,26 +1936,43 @@ static struct check *check_table[] = {
+ 	&chosen_node_is_root, &chosen_node_bootargs, &chosen_node_stdout_path,
+ 
+ 	&clocks_property,
++	&clocks_is_cell,
+ 	&cooling_device_property,
++	&cooling_device_is_cell,
+ 	&dmas_property,
++	&dmas_is_cell,
+ 	&hwlocks_property,
++	&hwlocks_is_cell,
+ 	&interrupts_extended_property,
++	&interrupts_extended_is_cell,
+ 	&io_channels_property,
++	&io_channels_is_cell,
+ 	&iommus_property,
++	&iommus_is_cell,
+ 	&mboxes_property,
++	&mboxes_is_cell,
+ 	&msi_parent_property,
++	&msi_parent_is_cell,
+ 	&mux_controls_property,
++	&mux_controls_is_cell,
+ 	&phys_property,
++	&phys_is_cell,
+ 	&power_domains_property,
++	&power_domains_is_cell,
+ 	&pwms_property,
++	&pwms_is_cell,
+ 	&resets_property,
++	&resets_is_cell,
+ 	&sound_dai_property,
++	&sound_dai_is_cell,
+ 	&thermal_sensors_property,
++	&thermal_sensors_is_cell,
+ 
+ 	&deprecated_gpio_property,
+ 	&gpios_property,
+ 	&interrupts_property,
+ 	&interrupt_provider,
++	&interrupt_map,
+ 
+ 	&alias_paths,
+ 
+@@ -1882,7 +1996,7 @@ static void enable_warning_error(struct
+ 
+ static void disable_warning_error(struct check *c, bool warn, bool error)
+ {
+-	int i;
++	unsigned int i;
+ 
+ 	/* Lowering level, also lower it for things this is the prereq
+ 	 * for */
+@@ -1903,7 +2017,7 @@ static void disable_warning_error(struct
+ 
+ void parse_checks_option(bool warn, bool error, const char *arg)
+ {
+-	int i;
++	unsigned int i;
+ 	const char *name = arg;
+ 	bool enable = true;
+ 
+@@ -1930,7 +2044,7 @@ void parse_checks_option(bool warn, bool
+ 
+ void process_checks(bool force, struct dt_info *dti)
+ {
+-	int i;
++	unsigned int i;
+ 	int error = 0;
+ 
+ 	for (i = 0; i < ARRAY_SIZE(check_table); i++) {
+--- a/scripts/dtc/dtc-lexer.l
++++ b/scripts/dtc/dtc-lexer.l
+@@ -57,7 +57,7 @@ static void PRINTF(1, 2) lexical_error(c
+ 			push_input_file(name);
+ 		}
+ 
+-<*>^"#"(line)?[ \t]+[0-9]+[ \t]+{STRING}([ \t]+[0-9]+)? {
++<*>^"#"(line)?[ \t]+[0-9]+[ \t]+{STRING}([ \t]+[0-9]+)* {
+ 			char *line, *fnstart, *fnend;
+ 			struct data fn;
+ 			/* skip text before line # */
+--- a/scripts/dtc/dtc.c
++++ b/scripts/dtc/dtc.c
+@@ -12,7 +12,7 @@
+  * Command line options
+  */
+ int quiet;		/* Level of quietness */
+-int reservenum;		/* Number of memory reservation slots */
++unsigned int reservenum;/* Number of memory reservation slots */
+ int minsize;		/* Minimum blob size */
+ int padsize;		/* Additional padding to blob */
+ int alignsize;		/* Additional padding to blob accroding to the alignsize */
+@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
+ 			depname = optarg;
+ 			break;
+ 		case 'R':
+-			reservenum = strtol(optarg, NULL, 0);
++			reservenum = strtoul(optarg, NULL, 0);
+ 			break;
+ 		case 'S':
+ 			minsize = strtol(optarg, NULL, 0);
+@@ -359,8 +359,6 @@ int main(int argc, char *argv[])
+ #endif
+ 	} else if (streq(outform, "dtb")) {
+ 		dt_to_blob(outf, dti, outversion);
+-	} else if (streq(outform, "dtbo")) {
+-		dt_to_blob(outf, dti, outversion);
+ 	} else if (streq(outform, "asm")) {
+ 		dt_to_asm(outf, dti, outversion);
+ 	} else if (streq(outform, "null")) {
+--- a/scripts/dtc/dtc.h
++++ b/scripts/dtc/dtc.h
+@@ -35,7 +35,7 @@
+  * Command line options
+  */
+ extern int quiet;		/* Level of quietness */
+-extern int reservenum;		/* Number of memory reservation slots */
++extern unsigned int reservenum;	/* Number of memory reservation slots */
+ extern int minsize;		/* Minimum blob size */
+ extern int padsize;		/* Additional padding to blob */
+ extern int alignsize;		/* Additional padding to blob accroding to the alignsize */
+@@ -51,6 +51,11 @@ extern int annotate;		/* annotate .dts w
+ 
+ typedef uint32_t cell_t;
+ 
++static inline bool phandle_is_valid(cell_t phandle)
++{
++	return phandle != 0 && phandle != ~0U;
++}
++
+ static inline uint16_t dtb_ld16(const void *p)
+ {
+ 	const uint8_t *bp = (const uint8_t *)p;
+@@ -86,6 +91,16 @@ static inline uint64_t dtb_ld64(const vo
+ #define streq(a, b)	(strcmp((a), (b)) == 0)
+ #define strstarts(s, prefix)	(strncmp((s), (prefix), strlen(prefix)) == 0)
+ #define strprefixeq(a, n, b)	(strlen(b) == (n) && (memcmp(a, b, n) == 0))
++static inline bool strends(const char *str, const char *suffix)
++{
++	unsigned int len, suffix_len;
++
++	len = strlen(str);
++	suffix_len = strlen(suffix);
++	if (len < suffix_len)
++		return false;
++	return streq(str + len - suffix_len, suffix);
++}
+ 
+ #define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
+ 
+@@ -101,6 +116,12 @@ enum markertype {
+ 	TYPE_UINT64,
+ 	TYPE_STRING,
+ };
++
++static inline bool is_type_marker(enum markertype type)
++{
++	return type >= TYPE_UINT8;
++}
++
+ extern const char *markername(enum markertype markertype);
+ 
+ struct  marker {
+@@ -125,7 +146,22 @@ struct data {
+ 	for_each_marker(m) \
+ 		if ((m)->type == (t))
+ 
+-size_t type_marker_length(struct marker *m);
++static inline struct marker *next_type_marker(struct marker *m)
++{
++	for_each_marker(m)
++		if (is_type_marker(m->type))
++			break;
++	return m;
++}
++
++static inline size_t type_marker_length(struct marker *m)
++{
++	struct marker *next = next_type_marker(m->next);
++
++	if (next)
++		return next->offset - m->offset;
++	return 0;
++}
+ 
+ void data_free(struct data d);
+ 
+--- a/scripts/dtc/flattree.c
++++ b/scripts/dtc/flattree.c
+@@ -124,7 +124,8 @@ static void asm_emit_cell(void *e, cell_
+ {
+ 	FILE *f = e;
+ 
+-	fprintf(f, "\t.byte 0x%02x; .byte 0x%02x; .byte 0x%02x; .byte 0x%02x\n",
++	fprintf(f, "\t.byte\t0x%02x\n" "\t.byte\t0x%02x\n"
++		"\t.byte\t0x%02x\n" "\t.byte\t0x%02x\n",
+ 		(val >> 24) & 0xff, (val >> 16) & 0xff,
+ 		(val >> 8) & 0xff, val & 0xff);
+ }
+@@ -134,9 +135,9 @@ static void asm_emit_string(void *e, con
+ 	FILE *f = e;
+ 
+ 	if (len != 0)
+-		fprintf(f, "\t.string\t\"%.*s\"\n", len, str);
++		fprintf(f, "\t.asciz\t\"%.*s\"\n", len, str);
+ 	else
+-		fprintf(f, "\t.string\t\"%s\"\n", str);
++		fprintf(f, "\t.asciz\t\"%s\"\n", str);
+ }
+ 
+ static void asm_emit_align(void *e, int a)
+@@ -295,7 +296,7 @@ static struct data flatten_reserve_list(
+ {
+ 	struct reserve_info *re;
+ 	struct data d = empty_data;
+-	int    j;
++	unsigned int j;
+ 
+ 	for (re = reservelist; re; re = re->next) {
+ 		d = data_append_re(d, re->address, re->size);
+@@ -438,7 +439,7 @@ static void dump_stringtable_asm(FILE *f
+ 
+ 	while (p < (strbuf.val + strbuf.len)) {
+ 		len = strlen(p);
+-		fprintf(f, "\t.string \"%s\"\n", p);
++		fprintf(f, "\t.asciz \"%s\"\n", p);
+ 		p += len+1;
+ 	}
+ }
+--- a/scripts/dtc/libfdt/fdt.c
++++ b/scripts/dtc/libfdt/fdt.c
+@@ -90,6 +90,10 @@ int fdt_check_header(const void *fdt)
+ {
+ 	size_t hdrsize;
+ 
++	/* The device tree must be at an 8-byte aligned address */
++	if ((uintptr_t)fdt & 7)
++		return -FDT_ERR_ALIGNMENT;
++
+ 	if (fdt_magic(fdt) != FDT_MAGIC)
+ 		return -FDT_ERR_BADMAGIC;
+ 	if (!can_assume(LATEST)) {
+--- a/scripts/dtc/libfdt/fdt_rw.c
++++ b/scripts/dtc/libfdt/fdt_rw.c
+@@ -349,7 +349,10 @@ int fdt_add_subnode_namelen(void *fdt, i
+ 		return offset;
+ 
+ 	/* Try to place the new node after the parent's properties */
+-	fdt_next_tag(fdt, parentoffset, &nextoffset); /* skip the BEGIN_NODE */
++	tag = fdt_next_tag(fdt, parentoffset, &nextoffset);
++	/* the fdt_subnode_offset_namelen() should ensure this never hits */
++	if (!can_assume(LIBFDT_FLAWLESS) && (tag != FDT_BEGIN_NODE))
++		return -FDT_ERR_INTERNAL;
+ 	do {
+ 		offset = nextoffset;
+ 		tag = fdt_next_tag(fdt, offset, &nextoffset);
+@@ -391,7 +394,9 @@ int fdt_del_node(void *fdt, int nodeoffs
+ }
+ 
+ static void fdt_packblocks_(const char *old, char *new,
+-			    int mem_rsv_size, int struct_size)
++			    int mem_rsv_size,
++			    int struct_size,
++			    int strings_size)
+ {
+ 	int mem_rsv_off, struct_off, strings_off;
+ 
+@@ -406,8 +411,7 @@ static void fdt_packblocks_(const char *
+ 	fdt_set_off_dt_struct(new, struct_off);
+ 	fdt_set_size_dt_struct(new, struct_size);
+ 
+-	memmove(new + strings_off, old + fdt_off_dt_strings(old),
+-		fdt_size_dt_strings(old));
++	memmove(new + strings_off, old + fdt_off_dt_strings(old), strings_size);
+ 	fdt_set_off_dt_strings(new, strings_off);
+ 	fdt_set_size_dt_strings(new, fdt_size_dt_strings(old));
+ }
+@@ -467,7 +471,8 @@ int fdt_open_into(const void *fdt, void
+ 			return -FDT_ERR_NOSPACE;
+ 	}
+ 
+-	fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size);
++	fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size,
++			fdt_size_dt_strings(fdt));
+ 	memmove(buf, tmp, newsize);
+ 
+ 	fdt_set_magic(buf, FDT_MAGIC);
+@@ -487,7 +492,8 @@ int fdt_pack(void *fdt)
+ 
+ 	mem_rsv_size = (fdt_num_mem_rsv(fdt)+1)
+ 		* sizeof(struct fdt_reserve_entry);
+-	fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt));
++	fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt),
++			fdt_size_dt_strings(fdt));
+ 	fdt_set_totalsize(fdt, fdt_data_size_(fdt));
+ 
+ 	return 0;
+--- a/scripts/dtc/libfdt/fdt_strerror.c
++++ b/scripts/dtc/libfdt/fdt_strerror.c
+@@ -39,6 +39,7 @@ static struct fdt_errtabent fdt_errtable
+ 	FDT_ERRTABENT(FDT_ERR_BADOVERLAY),
+ 	FDT_ERRTABENT(FDT_ERR_NOPHANDLES),
+ 	FDT_ERRTABENT(FDT_ERR_BADFLAGS),
++	FDT_ERRTABENT(FDT_ERR_ALIGNMENT),
+ };
+ #define FDT_ERRTABSIZE	((int)(sizeof(fdt_errtable) / sizeof(fdt_errtable[0])))
+ 
+--- a/scripts/dtc/libfdt/libfdt.h
++++ b/scripts/dtc/libfdt/libfdt.h
+@@ -131,6 +131,13 @@ uint32_t fdt_next_tag(const void *fdt, i
+  * to work even with unaligned pointers on platforms (such as ARMv5) that don't
+  * like unaligned loads and stores.
+  */
++static inline uint16_t fdt16_ld(const fdt16_t *p)
++{
++	const uint8_t *bp = (const uint8_t *)p;
++
++	return ((uint16_t)bp[0] << 8) | bp[1];
++}
++
+ static inline uint32_t fdt32_ld(const fdt32_t *p)
+ {
+ 	const uint8_t *bp = (const uint8_t *)p;
+--- a/scripts/dtc/livetree.c
++++ b/scripts/dtc/livetree.c
+@@ -526,7 +526,7 @@ struct node *get_node_by_path(struct nod
+ 	p = strchr(path, '/');
+ 
+ 	for_each_child(tree, child) {
+-		if (p && strprefixeq(path, p - path, child->name))
++		if (p && strprefixeq(path, (size_t)(p - path), child->name))
+ 			return get_node_by_path(child, p+1);
+ 		else if (!p && streq(path, child->name))
+ 			return child;
+@@ -559,7 +559,7 @@ struct node *get_node_by_phandle(struct
+ {
+ 	struct node *child, *node;
+ 
+-	if ((phandle == 0) || (phandle == -1)) {
++	if (!phandle_is_valid(phandle)) {
+ 		assert(generate_fixups);
+ 		return NULL;
+ 	}
+@@ -594,7 +594,7 @@ cell_t get_node_phandle(struct node *roo
+ 	static cell_t phandle = 1; /* FIXME: ick, static local */
+ 	struct data d = empty_data;
+ 
+-	if ((node->phandle != 0) && (node->phandle != -1))
++	if (phandle_is_valid(node->phandle))
+ 		return node->phandle;
+ 
+ 	while (get_node_by_phandle(root, phandle))
+--- a/scripts/dtc/treesource.c
++++ b/scripts/dtc/treesource.c
+@@ -124,27 +124,6 @@ static void write_propval_int(FILE *f, c
+ 	}
+ }
+ 
+-static bool has_data_type_information(struct marker *m)
+-{
+-	return m->type >= TYPE_UINT8;
+-}
+-
+-static struct marker *next_type_marker(struct marker *m)
+-{
+-	while (m && !has_data_type_information(m))
+-		m = m->next;
+-	return m;
+-}
+-
+-size_t type_marker_length(struct marker *m)
+-{
+-	struct marker *next = next_type_marker(m->next);
+-
+-	if (next)
+-		return next->offset - m->offset;
+-	return 0;
+-}
+-
+ static const char *delim_start[] = {
+ 	[TYPE_UINT8] = "[",
+ 	[TYPE_UINT16] = "/bits/ 16 <",
+@@ -229,26 +208,39 @@ static void write_propval(FILE *f, struc
+ 		size_t chunk_len = (m->next ? m->next->offset : len) - m->offset;
+ 		size_t data_len = type_marker_length(m) ? : len - m->offset;
+ 		const char *p = &prop->val.val[m->offset];
++		struct marker *m_phandle;
+ 
+-		if (has_data_type_information(m)) {
++		if (is_type_marker(m->type)) {
+ 			emit_type = m->type;
+ 			fprintf(f, " %s", delim_start[emit_type]);
+ 		} else if (m->type == LABEL)
+ 			fprintf(f, " %s:", m->ref);
+-		else if (m->offset)
+-			fputc(' ', f);
+ 
+-		if (emit_type == TYPE_NONE) {
+-			assert(chunk_len == 0);
++		if (emit_type == TYPE_NONE || chunk_len == 0)
+ 			continue;
+-		}
+ 
+ 		switch(emit_type) {
+ 		case TYPE_UINT16:
+ 			write_propval_int(f, p, chunk_len, 2);
+ 			break;
+ 		case TYPE_UINT32:
+-			write_propval_int(f, p, chunk_len, 4);
++			m_phandle = prop->val.markers;
++			for_each_marker_of_type(m_phandle, REF_PHANDLE)
++				if (m->offset == m_phandle->offset)
++					break;
++
++			if (m_phandle) {
++				if (m_phandle->ref[0] == '/')
++					fprintf(f, "&{%s}", m_phandle->ref);
++				else
++					fprintf(f, "&%s", m_phandle->ref);
++				if (chunk_len > 4) {
++					fputc(' ', f);
++					write_propval_int(f, p + 4, chunk_len - 4, 4);
++				}
++			} else {
++				write_propval_int(f, p, chunk_len, 4);
++			}
+ 			break;
+ 		case TYPE_UINT64:
+ 			write_propval_int(f, p, chunk_len, 8);
+--- a/scripts/dtc/util.h
++++ b/scripts/dtc/util.h
+@@ -13,10 +13,10 @@
+  */
+ 
+ #ifdef __GNUC__
+-#ifdef __clang__
+-#define PRINTF(i, j)	__attribute__((format (printf, i, j)))
+-#else
++#if __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
+ #define PRINTF(i, j)	__attribute__((format (gnu_printf, i, j)))
++#else
++#define PRINTF(i, j)	__attribute__((format (printf, i, j)))
+ #endif
+ #define NORETURN	__attribute__((noreturn))
+ #else
+--- a/scripts/dtc/version_gen.h
++++ b/scripts/dtc/version_gen.h
+@@ -1 +1 @@
+-#define DTC_VERSION "DTC 1.6.0-g183df9e9"
++#define DTC_VERSION "DTC 1.6.1-g0a3a9d34"
+--- a/scripts/dtc/yamltree.c
++++ b/scripts/dtc/yamltree.c
+@@ -29,11 +29,12 @@ char *yaml_error_name[] = {
+ 		    (emitter)->problem, __func__, __LINE__);		\
+ })
+ 
+-static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers, char *data, unsigned int len, int width)
++static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers,
++	char *data, unsigned int seq_offset, unsigned int len, int width)
+ {
+ 	yaml_event_t event;
+ 	void *tag;
+-	unsigned int off, start_offset = markers->offset;
++	unsigned int off;
+ 
+ 	switch(width) {
+ 		case 1: tag = "!u8"; break;
+@@ -66,7 +67,7 @@ static void yaml_propval_int(yaml_emitte
+ 			m = markers;
+ 			is_phandle = false;
+ 			for_each_marker_of_type(m, REF_PHANDLE) {
+-				if (m->offset == (start_offset + off)) {
++				if (m->offset == (seq_offset + off)) {
+ 					is_phandle = true;
+ 					break;
+ 				}
+@@ -114,6 +115,7 @@ static void yaml_propval(yaml_emitter_t
+ 	yaml_event_t event;
+ 	unsigned int len = prop->val.len;
+ 	struct marker *m = prop->val.markers;
++	struct marker *markers = prop->val.markers;
+ 
+ 	/* Emit the property name */
+ 	yaml_scalar_event_initialize(&event, NULL,
+@@ -151,19 +153,19 @@ static void yaml_propval(yaml_emitter_t
+ 
+ 		switch(m->type) {
+ 		case TYPE_UINT16:
+-			yaml_propval_int(emitter, m, data, chunk_len, 2);
++			yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 2);
+ 			break;
+ 		case TYPE_UINT32:
+-			yaml_propval_int(emitter, m, data, chunk_len, 4);
++			yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 4);
+ 			break;
+ 		case TYPE_UINT64:
+-			yaml_propval_int(emitter, m, data, chunk_len, 8);
++			yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 8);
+ 			break;
+ 		case TYPE_STRING:
+ 			yaml_propval_string(emitter, data, chunk_len);
+ 			break;
+ 		default:
+-			yaml_propval_int(emitter, m, data, chunk_len, 1);
++			yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 1);
+ 			break;
+ 		}
+ 	}

+ 48 - 0
target/linux/generic/backport-6.1/300-v5.18-pinctrl-qcom-Return--EINVAL-for-setting-affinity-if-no-IRQ-parent.patch

@@ -0,0 +1,48 @@
+From: Manivannan Sadhasivam <[email protected]>
+To: [email protected]
+Cc: [email protected], [email protected],
+        [email protected], [email protected],
+        [email protected],
+        Manivannan Sadhasivam <[email protected]>
+Subject: [PATCH] pinctrl: qcom: Return -EINVAL for setting affinity if no IRQ
+ parent
+Date: Thu, 13 Jan 2022 21:56:17 +0530
+Message-Id: <[email protected]>
+
+The MSM GPIO IRQ controller relies on the parent IRQ controller to set the
+CPU affinity for the IRQ. And this is only valid if there is any wakeup
+parent available and defined in DT.
+
+For the case of no parent IRQ controller defined in DT,
+msm_gpio_irq_set_affinity() and msm_gpio_irq_set_vcpu_affinity() should
+return -EINVAL instead of 0 as the affinity can't be set.
+
+Otherwise, below warning will be printed by genirq:
+
+genirq: irq_chip msmgpio did not update eff. affinity mask of irq 70
+
+Signed-off-by: Manivannan Sadhasivam <[email protected]>
+---
+ drivers/pinctrl/qcom/pinctrl-msm.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/pinctrl/qcom/pinctrl-msm.c
++++ b/drivers/pinctrl/qcom/pinctrl-msm.c
+@@ -1157,7 +1157,7 @@ static int msm_gpio_irq_set_affinity(str
+ 	if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
+ 		return irq_chip_set_affinity_parent(d, dest, force);
+ 
+-	return 0;
++	return -EINVAL;
+ }
+ 
+ static int msm_gpio_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
+@@ -1168,7 +1168,7 @@ static int msm_gpio_irq_set_vcpu_affinit
+ 	if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
+ 		return irq_chip_set_vcpu_affinity_parent(d, vcpu_info);
+ 
+-	return 0;
++	return -EINVAL;
+ }
+ 
+ static void msm_gpio_irq_handler(struct irq_desc *desc)

+ 166 - 0
target/linux/generic/backport-6.1/301-v5.16-soc-qcom-smem-Support-reserved-memory-description.patch

@@ -0,0 +1,166 @@
+From b5af64fceb04dc298c5e69c517b4d83893ff060b Mon Sep 17 00:00:00 2001
+From: Bjorn Andersson <[email protected]>
+Date: Thu, 30 Sep 2021 11:21:10 -0700
+Subject: [PATCH 1/1] soc: qcom: smem: Support reserved-memory description
+
+Practically all modern Qualcomm platforms have a single reserved-memory
+region for SMEM. So rather than having to describe SMEM in the form of a
+node with a reference to a reserved-memory node, allow the SMEM device
+to be instantiated directly from the reserved-memory node.
+
+The current means of falling back to dereferencing the "memory-region"
+is kept as a fallback, if it's determined that the SMEM node is a
+reserved-memory node.
+
+The "qcom,smem" compatible is added to the reserved_mem_matches list, to
+allow the reserved-memory device to be probed.
+
+In order to retain the readability of the code, the resolution of
+resources is split from the actual ioremapping.
+
+Signed-off-by: Bjorn Andersson <[email protected]>
+Acked-by: Rob Herring <[email protected]>
+Reviewed-by: Vladimir Zapolskiy <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+---
+ drivers/of/platform.c   |  1 +
+ drivers/soc/qcom/smem.c | 57 ++++++++++++++++++++++++++++-------------
+ 2 files changed, 40 insertions(+), 18 deletions(-)
+
+--- a/drivers/of/platform.c
++++ b/drivers/of/platform.c
+@@ -509,6 +509,7 @@ EXPORT_SYMBOL_GPL(of_platform_default_po
+ static const struct of_device_id reserved_mem_matches[] = {
+ 	{ .compatible = "qcom,rmtfs-mem" },
+ 	{ .compatible = "qcom,cmd-db" },
++	{ .compatible = "qcom,smem" },
+ 	{ .compatible = "ramoops" },
+ 	{ .compatible = "nvmem-rmem" },
+ 	{}
+--- a/drivers/soc/qcom/smem.c
++++ b/drivers/soc/qcom/smem.c
+@@ -9,6 +9,7 @@
+ #include <linux/module.h>
+ #include <linux/of.h>
+ #include <linux/of_address.h>
++#include <linux/of_reserved_mem.h>
+ #include <linux/platform_device.h>
+ #include <linux/sizes.h>
+ #include <linux/slab.h>
+@@ -240,7 +241,7 @@ static const u8 SMEM_INFO_MAGIC[] = { 0x
+  * @size:	size of the memory region
+  */
+ struct smem_region {
+-	u32 aux_base;
++	phys_addr_t aux_base;
+ 	void __iomem *virt_base;
+ 	size_t size;
+ };
+@@ -499,7 +500,7 @@ static void *qcom_smem_get_global(struct
+ 	for (i = 0; i < smem->num_regions; i++) {
+ 		region = &smem->regions[i];
+ 
+-		if (region->aux_base == aux_base || !aux_base) {
++		if ((u32)region->aux_base == aux_base || !aux_base) {
+ 			if (size != NULL)
+ 				*size = le32_to_cpu(entry->size);
+ 			return region->virt_base + le32_to_cpu(entry->offset);
+@@ -664,7 +665,7 @@ phys_addr_t qcom_smem_virt_to_phys(void
+ 		if (p < region->virt_base + region->size) {
+ 			u64 offset = p - region->virt_base;
+ 
+-			return (phys_addr_t)region->aux_base + offset;
++			return region->aux_base + offset;
+ 		}
+ 	}
+ 
+@@ -863,12 +864,12 @@ qcom_smem_enumerate_partitions(struct qc
+ 	return 0;
+ }
+ 
+-static int qcom_smem_map_memory(struct qcom_smem *smem, struct device *dev,
+-				const char *name, int i)
++static int qcom_smem_resolve_mem(struct qcom_smem *smem, const char *name,
++				 struct smem_region *region)
+ {
++	struct device *dev = smem->dev;
+ 	struct device_node *np;
+ 	struct resource r;
+-	resource_size_t size;
+ 	int ret;
+ 
+ 	np = of_parse_phandle(dev->of_node, name, 0);
+@@ -881,13 +882,9 @@ static int qcom_smem_map_memory(struct q
+ 	of_node_put(np);
+ 	if (ret)
+ 		return ret;
+-	size = resource_size(&r);
+ 
+-	smem->regions[i].virt_base = devm_ioremap_wc(dev, r.start, size);
+-	if (!smem->regions[i].virt_base)
+-		return -ENOMEM;
+-	smem->regions[i].aux_base = (u32)r.start;
+-	smem->regions[i].size = size;
++	region->aux_base = r.start;
++	region->size = resource_size(&r);
+ 
+ 	return 0;
+ }
+@@ -895,12 +892,14 @@ static int qcom_smem_map_memory(struct q
+ static int qcom_smem_probe(struct platform_device *pdev)
+ {
+ 	struct smem_header *header;
++	struct reserved_mem *rmem;
+ 	struct qcom_smem *smem;
+ 	size_t array_size;
+ 	int num_regions;
+ 	int hwlock_id;
+ 	u32 version;
+ 	int ret;
++	int i;
+ 
+ 	num_regions = 1;
+ 	if (of_find_property(pdev->dev.of_node, "qcom,rpm-msg-ram", NULL))
+@@ -914,13 +913,35 @@ static int qcom_smem_probe(struct platfo
+ 	smem->dev = &pdev->dev;
+ 	smem->num_regions = num_regions;
+ 
+-	ret = qcom_smem_map_memory(smem, &pdev->dev, "memory-region", 0);
+-	if (ret)
+-		return ret;
+-
+-	if (num_regions > 1 && (ret = qcom_smem_map_memory(smem, &pdev->dev,
+-					"qcom,rpm-msg-ram", 1)))
+-		return ret;
++	rmem = of_reserved_mem_lookup(pdev->dev.of_node);
++	if (rmem) {
++		smem->regions[0].aux_base = rmem->base;
++		smem->regions[0].size = rmem->size;
++	} else {
++		/*
++		 * Fall back to the memory-region reference, if we're not a
++		 * reserved-memory node.
++		 */
++		ret = qcom_smem_resolve_mem(smem, "memory-region", &smem->regions[0]);
++		if (ret)
++			return ret;
++	}
++
++	if (num_regions > 1) {
++		ret = qcom_smem_resolve_mem(smem, "qcom,rpm-msg-ram", &smem->regions[1]);
++		if (ret)
++			return ret;
++	}
++
++	for (i = 0; i < num_regions; i++) {
++		smem->regions[i].virt_base = devm_ioremap_wc(&pdev->dev,
++							     smem->regions[i].aux_base,
++							     smem->regions[i].size);
++		if (!smem->regions[i].virt_base) {
++			dev_err(&pdev->dev, "failed to remap %pa\n", &smem->regions[i].aux_base);
++			return -ENOMEM;
++		}
++	}
+ 
+ 	header = smem->regions[0].virt_base;
+ 	if (le32_to_cpu(header->initialized) != 1 ||

+ 33 - 0
target/linux/generic/backport-6.1/302-v5.16-watchdog-bcm63xx_wdt-fix-fallthrough-warning.patch

@@ -0,0 +1,33 @@
+From ee1a0696934a8b77a6a2098f92832c46d34ec5da Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Wed, 27 Oct 2021 14:31:35 +0200
+Subject: [PATCH] watchdog: bcm63xx_wdt: fix fallthrough warning
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This fixes:
+drivers/watchdog/bcm63xx_wdt.c: In function 'bcm63xx_wdt_ioctl':
+drivers/watchdog/bcm63xx_wdt.c:208:17: warning: this statement may fall through [-Wimplicit-fallthrough=]
+
+Signed-off-by: Rafał Miłecki <[email protected]>
+Reviewed-by: Florian Fainelli <[email protected]>
+Reviewed-by: Guenter Roeck <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Guenter Roeck <[email protected]>
+Signed-off-by: Wim Van Sebroeck <[email protected]>
+---
+ drivers/watchdog/bcm63xx_wdt.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/watchdog/bcm63xx_wdt.c
++++ b/drivers/watchdog/bcm63xx_wdt.c
+@@ -207,6 +207,8 @@ static long bcm63xx_wdt_ioctl(struct fil
+ 
+ 		bcm63xx_wdt_pet();
+ 
++		fallthrough;
++
+ 	case WDIOC_GETTIMEOUT:
+ 		return put_user(wdt_time, p);
+ 

+ 162 - 0
target/linux/generic/backport-6.1/330-v5.16-01-MIPS-kernel-proc-add-CPU-option-reporting.patch

@@ -0,0 +1,162 @@
+From 626bfa03729959ea9917181fb3d8ffaa1594d02a Mon Sep 17 00:00:00 2001
+From: Hauke Mehrtens <[email protected]>
+Date: Wed, 13 Oct 2021 22:40:18 -0700
+Subject: [PATCH 1/1] MIPS: kernel: proc: add CPU option reporting
+
+Many MIPS CPUs have optional CPU features which are not activated for
+all CPU cores. Print the CPU options, which are implemented in the core,
+in /proc/cpuinfo. This makes it possible to see which features are
+supported and which are not supported. This should cover all standard
+MIPS extensions. Before, it only printed information about the main MIPS
+ASEs.
+
+Signed-off-by: Hauke Mehrtens <[email protected]>
+
+Changes from original patch[0]:
+- Remove cpu_has_6k_cache and cpu_has_8k_cache due to commit 6ce91ba8589a
+  ("MIPS: Remove cpu_has_6k_cache and cpu_has_8k_cache in cpu_cache_init()")
+- Add new options: mac2008_only, ftlbparex, gsexcex, mmid, mm_sysad,
+  mm_full
+- Use seq_puts instead of seq_printf as suggested by checkpatch
+- Minor commit message reword
+
+[0]: https://lore.kernel.org/linux-mips/[email protected]/
+
+Signed-off-by: Ilya Lipnitskiy <[email protected]>
+Acked-by: Hauke Mehrtens <[email protected]>
+Signed-off-by: Thomas Bogendoerfer <[email protected]>
+---
+ arch/mips/kernel/proc.c | 122 ++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 122 insertions(+)
+
+--- a/arch/mips/kernel/proc.c
++++ b/arch/mips/kernel/proc.c
+@@ -138,6 +138,128 @@ static int show_cpuinfo(struct seq_file
+ 		seq_printf(m, "micromips kernel\t: %s\n",
+ 		      (read_c0_config3() & MIPS_CONF3_ISA_OE) ?  "yes" : "no");
+ 	}
++
++	seq_puts(m, "Options implemented\t:");
++	if (cpu_has_tlb)
++		seq_puts(m, " tlb");
++	if (cpu_has_ftlb)
++		seq_puts(m, " ftlb");
++	if (cpu_has_tlbinv)
++		seq_puts(m, " tlbinv");
++	if (cpu_has_segments)
++		seq_puts(m, " segments");
++	if (cpu_has_rixiex)
++		seq_puts(m, " rixiex");
++	if (cpu_has_ldpte)
++		seq_puts(m, " ldpte");
++	if (cpu_has_maar)
++		seq_puts(m, " maar");
++	if (cpu_has_rw_llb)
++		seq_puts(m, " rw_llb");
++	if (cpu_has_4kex)
++		seq_puts(m, " 4kex");
++	if (cpu_has_3k_cache)
++		seq_puts(m, " 3k_cache");
++	if (cpu_has_4k_cache)
++		seq_puts(m, " 4k_cache");
++	if (cpu_has_tx39_cache)
++		seq_puts(m, " tx39_cache");
++	if (cpu_has_octeon_cache)
++		seq_puts(m, " octeon_cache");
++	if (cpu_has_fpu)
++		seq_puts(m, " fpu");
++	if (cpu_has_32fpr)
++		seq_puts(m, " 32fpr");
++	if (cpu_has_cache_cdex_p)
++		seq_puts(m, " cache_cdex_p");
++	if (cpu_has_cache_cdex_s)
++		seq_puts(m, " cache_cdex_s");
++	if (cpu_has_prefetch)
++		seq_puts(m, " prefetch");
++	if (cpu_has_mcheck)
++		seq_puts(m, " mcheck");
++	if (cpu_has_ejtag)
++		seq_puts(m, " ejtag");
++	if (cpu_has_llsc)
++		seq_puts(m, " llsc");
++	if (cpu_has_guestctl0ext)
++		seq_puts(m, " guestctl0ext");
++	if (cpu_has_guestctl1)
++		seq_puts(m, " guestctl1");
++	if (cpu_has_guestctl2)
++		seq_puts(m, " guestctl2");
++	if (cpu_has_guestid)
++		seq_puts(m, " guestid");
++	if (cpu_has_drg)
++		seq_puts(m, " drg");
++	if (cpu_has_rixi)
++		seq_puts(m, " rixi");
++	if (cpu_has_lpa)
++		seq_puts(m, " lpa");
++	if (cpu_has_mvh)
++		seq_puts(m, " mvh");
++	if (cpu_has_vtag_icache)
++		seq_puts(m, " vtag_icache");
++	if (cpu_has_dc_aliases)
++		seq_puts(m, " dc_aliases");
++	if (cpu_has_ic_fills_f_dc)
++		seq_puts(m, " ic_fills_f_dc");
++	if (cpu_has_pindexed_dcache)
++		seq_puts(m, " pindexed_dcache");
++	if (cpu_has_userlocal)
++		seq_puts(m, " userlocal");
++	if (cpu_has_nofpuex)
++		seq_puts(m, " nofpuex");
++	if (cpu_has_vint)
++		seq_puts(m, " vint");
++	if (cpu_has_veic)
++		seq_puts(m, " veic");
++	if (cpu_has_inclusive_pcaches)
++		seq_puts(m, " inclusive_pcaches");
++	if (cpu_has_perf_cntr_intr_bit)
++		seq_puts(m, " perf_cntr_intr_bit");
++	if (cpu_has_ufr)
++		seq_puts(m, " ufr");
++	if (cpu_has_fre)
++		seq_puts(m, " fre");
++	if (cpu_has_cdmm)
++		seq_puts(m, " cdmm");
++	if (cpu_has_small_pages)
++		seq_puts(m, " small_pages");
++	if (cpu_has_nan_legacy)
++		seq_puts(m, " nan_legacy");
++	if (cpu_has_nan_2008)
++		seq_puts(m, " nan_2008");
++	if (cpu_has_ebase_wg)
++		seq_puts(m, " ebase_wg");
++	if (cpu_has_badinstr)
++		seq_puts(m, " badinstr");
++	if (cpu_has_badinstrp)
++		seq_puts(m, " badinstrp");
++	if (cpu_has_contextconfig)
++		seq_puts(m, " contextconfig");
++	if (cpu_has_perf)
++		seq_puts(m, " perf");
++	if (cpu_has_mac2008_only)
++		seq_puts(m, " mac2008_only");
++	if (cpu_has_ftlbparex)
++		seq_puts(m, " ftlbparex");
++	if (cpu_has_gsexcex)
++		seq_puts(m, " gsexcex");
++	if (cpu_has_shared_ftlb_ram)
++		seq_puts(m, " shared_ftlb_ram");
++	if (cpu_has_shared_ftlb_entries)
++		seq_puts(m, " shared_ftlb_entries");
++	if (cpu_has_mipsmt_pertccounters)
++		seq_puts(m, " mipsmt_pertccounters");
++	if (cpu_has_mmid)
++		seq_puts(m, " mmid");
++	if (cpu_has_mm_sysad)
++		seq_puts(m, " mm_sysad");
++	if (cpu_has_mm_full)
++		seq_puts(m, " mm_full");
++	seq_puts(m, "\n");
++
+ 	seq_printf(m, "shadow register sets\t: %d\n",
+ 		      cpu_data[n].srsets);
+ 	seq_printf(m, "kscratch registers\t: %d\n",

+ 62 - 0
target/linux/generic/backport-6.1/330-v5.16-02-MIPS-Fix-using-smp_processor_id-in-preemptible-in-sh.patch

@@ -0,0 +1,62 @@
+From 1cab5bd69eb1f995ced2d7576cb15f8a8941fd85 Mon Sep 17 00:00:00 2001
+From: Tiezhu Yang <[email protected]>
+Date: Thu, 25 Nov 2021 19:39:32 +0800
+Subject: [PATCH 1/1] MIPS: Fix using smp_processor_id() in preemptible in
+ show_cpuinfo()
+
+There exists the following issue under DEBUG_PREEMPT:
+
+ BUG: using smp_processor_id() in preemptible [00000000] code: systemd/1
+ caller is show_cpuinfo+0x460/0xea0
+ ...
+ Call Trace:
+ [<ffffffff8020f0dc>] show_stack+0x94/0x128
+ [<ffffffff80e6cab4>] dump_stack_lvl+0x94/0xd8
+ [<ffffffff80e74c5c>] check_preemption_disabled+0x104/0x110
+ [<ffffffff802209c8>] show_cpuinfo+0x460/0xea0
+ [<ffffffff80539d54>] seq_read_iter+0xfc/0x4f8
+ [<ffffffff804fcc10>] new_sync_read+0x110/0x1b8
+ [<ffffffff804ff57c>] vfs_read+0x1b4/0x1d0
+ [<ffffffff804ffb18>] ksys_read+0xd0/0x110
+ [<ffffffff8021c090>] syscall_common+0x34/0x58
+
+We can see the following call trace:
+ show_cpuinfo()
+   cpu_has_fpu
+     current_cpu_data
+       smp_processor_id()
+
+ $ addr2line -f -e vmlinux 0xffffffff802209c8
+ show_cpuinfo
+ arch/mips/kernel/proc.c:188
+
+ $ head -188 arch/mips/kernel/proc.c | tail -1
+	 if (cpu_has_fpu)
+
+ arch/mips/include/asm/cpu-features.h
+ #  define cpu_has_fpu		(current_cpu_data.options & MIPS_CPU_FPU)
+
+ arch/mips/include/asm/cpu-info.h
+ #define current_cpu_data cpu_data[smp_processor_id()]
+
+Based on the above analysis, fix the issue by using raw_cpu_has_fpu
+which calls raw_smp_processor_id() in show_cpuinfo().
+
+Fixes: 626bfa037299 ("MIPS: kernel: proc: add CPU option reporting")
+Signed-off-by: Tiezhu Yang <[email protected]>
+Signed-off-by: Thomas Bogendoerfer <[email protected]>
+---
+ arch/mips/kernel/proc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/mips/kernel/proc.c
++++ b/arch/mips/kernel/proc.c
+@@ -166,7 +166,7 @@ static int show_cpuinfo(struct seq_file
+ 		seq_puts(m, " tx39_cache");
+ 	if (cpu_has_octeon_cache)
+ 		seq_puts(m, " octeon_cache");
+-	if (cpu_has_fpu)
++	if (raw_cpu_has_fpu)
+ 		seq_puts(m, " fpu");
+ 	if (cpu_has_32fpr)
+ 		seq_puts(m, " 32fpr");

+ 186 - 0
target/linux/generic/backport-6.1/331-v5.19-mtd-spinand-Add-support-for-XTX-XT26G0xA.patch

@@ -0,0 +1,186 @@
+From f4c5c7f9d2e5ab005d57826b740b694b042a737c Mon Sep 17 00:00:00 2001
+From: Felix Matouschek <[email protected]>
+Date: Mon, 18 Apr 2022 15:28:03 +0200
+Subject: [PATCH 1/1] mtd: spinand: Add support for XTX XT26G0xA
+
+Add support for XTX Technology XT26G01AXXXXX, XT26G02AXXXXX and
+XT26G04AXXXXX SPI NAND.
+
+These are 3V, 1G/2G/4Gbit serial SLC NAND flash devices with on-die ECC
+(8bit strength per 512bytes).
+
+Tested on Teltonika RUTX10 flashed with OpenWrt.
+
+Links:
+  - http://www.xtxtech.com/download/?AId=225
+  - https://datasheet.lcsc.com/szlcsc/2005251034_XTX-XT26G01AWSEGA_C558841.pdf
+Signed-off-by: Felix Matouschek <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/nand/spi/Makefile |   2 +-
+ drivers/mtd/nand/spi/core.c   |   1 +
+ drivers/mtd/nand/spi/xtx.c    | 129 ++++++++++++++++++++++++++++++++++
+ include/linux/mtd/spinand.h   |   1 +
+ 4 files changed, 132 insertions(+), 1 deletion(-)
+ create mode 100644 drivers/mtd/nand/spi/xtx.c
+
+--- a/drivers/mtd/nand/spi/Makefile
++++ b/drivers/mtd/nand/spi/Makefile
+@@ -1,3 +1,3 @@
+ # SPDX-License-Identifier: GPL-2.0
+-spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o
++spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
+ obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
+--- a/drivers/mtd/nand/spi/core.c
++++ b/drivers/mtd/nand/spi/core.c
+@@ -902,6 +902,7 @@ static const struct spinand_manufacturer
+ 	&paragon_spinand_manufacturer,
+ 	&toshiba_spinand_manufacturer,
+ 	&winbond_spinand_manufacturer,
++	&xtx_spinand_manufacturer,
+ };
+ 
+ static int spinand_manufacturer_match(struct spinand_device *spinand,
+--- /dev/null
++++ b/drivers/mtd/nand/spi/xtx.c
+@@ -0,0 +1,129 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Author:
++ * Felix Matouschek <[email protected]>
++ */
++
++#include <linux/device.h>
++#include <linux/kernel.h>
++#include <linux/mtd/spinand.h>
++
++#define SPINAND_MFR_XTX	0x0B
++
++#define XT26G0XA_STATUS_ECC_MASK	GENMASK(5, 2)
++#define XT26G0XA_STATUS_ECC_NO_DETECTED	(0 << 2)
++#define XT26G0XA_STATUS_ECC_8_CORRECTED	(3 << 4)
++#define XT26G0XA_STATUS_ECC_UNCOR_ERROR	(2 << 4)
++
++static SPINAND_OP_VARIANTS(read_cache_variants,
++		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
++
++static SPINAND_OP_VARIANTS(write_cache_variants,
++		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
++		SPINAND_PROG_LOAD(true, 0, NULL, 0));
++
++static SPINAND_OP_VARIANTS(update_cache_variants,
++		SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
++		SPINAND_PROG_LOAD(false, 0, NULL, 0));
++
++static int xt26g0xa_ooblayout_ecc(struct mtd_info *mtd, int section,
++				   struct mtd_oob_region *region)
++{
++	if (section)
++		return -ERANGE;
++
++	region->offset = 48;
++	region->length = 16;
++
++	return 0;
++}
++
++static int xt26g0xa_ooblayout_free(struct mtd_info *mtd, int section,
++				   struct mtd_oob_region *region)
++{
++	if (section)
++		return -ERANGE;
++
++	region->offset = 1;
++	region->length = 47;
++
++	return 0;
++}
++
++static const struct mtd_ooblayout_ops xt26g0xa_ooblayout = {
++	.ecc = xt26g0xa_ooblayout_ecc,
++	.free = xt26g0xa_ooblayout_free,
++};
++
++static int xt26g0xa_ecc_get_status(struct spinand_device *spinand,
++					 u8 status)
++{
++	status = status & XT26G0XA_STATUS_ECC_MASK;
++
++	switch (status) {
++	case XT26G0XA_STATUS_ECC_NO_DETECTED:
++		return 0;
++	case XT26G0XA_STATUS_ECC_8_CORRECTED:
++		return 8;
++	case XT26G0XA_STATUS_ECC_UNCOR_ERROR:
++		return -EBADMSG;
++	default:
++		break;
++	}
++
++	/* At this point values greater than (2 << 4) are invalid  */
++	if (status > XT26G0XA_STATUS_ECC_UNCOR_ERROR)
++		return -EINVAL;
++
++	/* (1 << 2) through (7 << 2) are 1-7 corrected errors */
++	return status >> 2;
++}
++
++static const struct spinand_info xtx_spinand_table[] = {
++	SPINAND_INFO("XT26G01A",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE1),
++		     NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&xt26g0xa_ooblayout,
++				     xt26g0xa_ecc_get_status)),
++	SPINAND_INFO("XT26G02A",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE2),
++		     NAND_MEMORG(1, 2048, 64, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&xt26g0xa_ooblayout,
++				     xt26g0xa_ecc_get_status)),
++	SPINAND_INFO("XT26G04A",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE3),
++		     NAND_MEMORG(1, 2048, 64, 128, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&xt26g0xa_ooblayout,
++				     xt26g0xa_ecc_get_status)),
++};
++
++static const struct spinand_manufacturer_ops xtx_spinand_manuf_ops = {
++};
++
++const struct spinand_manufacturer xtx_spinand_manufacturer = {
++	.id = SPINAND_MFR_XTX,
++	.name = "XTX",
++	.chips = xtx_spinand_table,
++	.nchips = ARRAY_SIZE(xtx_spinand_table),
++	.ops = &xtx_spinand_manuf_ops,
++};
+--- a/include/linux/mtd/spinand.h
++++ b/include/linux/mtd/spinand.h
+@@ -266,6 +266,7 @@ extern const struct spinand_manufacturer
+ extern const struct spinand_manufacturer paragon_spinand_manufacturer;
+ extern const struct spinand_manufacturer toshiba_spinand_manufacturer;
+ extern const struct spinand_manufacturer winbond_spinand_manufacturer;
++extern const struct spinand_manufacturer xtx_spinand_manufacturer;
+ 
+ /**
+  * struct spinand_op_variants - SPI NAND operation variants

+ 219 - 0
target/linux/generic/backport-6.1/344-v5.18-01-phy-marvell-phy-mvebu-a3700-comphy-Remove-port-from-.patch

@@ -0,0 +1,219 @@
+From 4bf18d5a2dd02db8c5b16a2cfae513510506df5b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Thu, 3 Feb 2022 22:44:40 +0100
+Subject: [PATCH 1/2] phy: marvell: phy-mvebu-a3700-comphy: Remove port from
+ driver configuration
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Port number is encoded into argument for SMC call. It is zero for SATA,
+PCIe and also both USB 3.0 PHYs. It is non-zero only for Ethernet PHY
+(incorrectly called SGMII) on lane 0. Ethernet PHY on lane 1 also uses zero
+port number.
+
+So the "port" bits for the SMC call argument can be constructed directly
+from PHY type and lane number.
+
+Change driver code to always pass zero port number for non-ethernet PHYs
+and for ethernet PHYs determine the port number from the lane number. This
+simplifies the driver.
+
+As the port number from the DT PHY configuration is not used anymore, remove the
+driver code which parses it. This also simplifies the driver.
+
+Signed-off-by: Pali Rohár <[email protected]>
+Signed-off-by: Marek Behún <[email protected]>
+Reviewed-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Vinod Koul <[email protected]>
+---
+ drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 62 +++++++++-----------
+ 1 file changed, 29 insertions(+), 33 deletions(-)
+
+--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
++++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+@@ -20,7 +20,6 @@
+ #include <linux/platform_device.h>
+ 
+ #define MVEBU_A3700_COMPHY_LANES		3
+-#define MVEBU_A3700_COMPHY_PORTS		2
+ 
+ /* COMPHY Fast SMC function identifiers */
+ #define COMPHY_SIP_POWER_ON			0x82000001
+@@ -45,51 +44,47 @@
+ #define COMPHY_FW_NET(mode, idx, speed)		(COMPHY_FW_MODE(mode) | \
+ 						 ((idx) << 8) |	\
+ 						 ((speed) << 2))
+-#define COMPHY_FW_PCIE(mode, idx, speed, width)	(COMPHY_FW_NET(mode, idx, speed) | \
++#define COMPHY_FW_PCIE(mode, speed, width)	(COMPHY_FW_NET(mode, 0, speed) | \
+ 						 ((width) << 18))
+ 
+ struct mvebu_a3700_comphy_conf {
+ 	unsigned int lane;
+ 	enum phy_mode mode;
+ 	int submode;
+-	unsigned int port;
+ 	u32 fw_mode;
+ };
+ 
+-#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _port, _fw)	\
++#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _fw)		\
+ 	{								\
+ 		.lane = _lane,						\
+ 		.mode = _mode,						\
+ 		.submode = _smode,					\
+-		.port = _port,						\
+ 		.fw_mode = _fw,						\
+ 	}
+ 
+-#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _port, _fw) \
+-	MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _port, _fw)
++#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _fw) \
++	MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _fw)
+ 
+-#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _port, _fw) \
+-	MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _port, _fw)
++#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _fw) \
++	MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _fw)
+ 
+ static const struct mvebu_a3700_comphy_conf mvebu_a3700_comphy_modes[] = {
+ 	/* lane 0 */
+-	MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS, 0,
++	MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS,
+ 				    COMPHY_FW_MODE_USB3H),
+-	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII, 1,
++	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII,
+ 				    COMPHY_FW_MODE_SGMII),
+-	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX, 1,
++	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX,
+ 				    COMPHY_FW_MODE_2500BASEX),
+ 	/* lane 1 */
+-	MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, 0,
+-				    COMPHY_FW_MODE_PCIE),
+-	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII, 0,
++	MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, COMPHY_FW_MODE_PCIE),
++	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII,
+ 				    COMPHY_FW_MODE_SGMII),
+-	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX, 0,
++	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX,
+ 				    COMPHY_FW_MODE_2500BASEX),
+ 	/* lane 2 */
+-	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, 0,
+-				    COMPHY_FW_MODE_SATA),
+-	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS, 0,
++	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, COMPHY_FW_MODE_SATA),
++	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS,
+ 				    COMPHY_FW_MODE_USB3H),
+ };
+ 
+@@ -98,7 +93,6 @@ struct mvebu_a3700_comphy_lane {
+ 	unsigned int id;
+ 	enum phy_mode mode;
+ 	int submode;
+-	int port;
+ };
+ 
+ static int mvebu_a3700_comphy_smc(unsigned long function, unsigned long lane,
+@@ -120,7 +114,7 @@ static int mvebu_a3700_comphy_smc(unsign
+ 	}
+ }
+ 
+-static int mvebu_a3700_comphy_get_fw_mode(int lane, int port,
++static int mvebu_a3700_comphy_get_fw_mode(int lane,
+ 					  enum phy_mode mode,
+ 					  int submode)
+ {
+@@ -132,7 +126,6 @@ static int mvebu_a3700_comphy_get_fw_mod
+ 
+ 	for (i = 0; i < n; i++) {
+ 		if (mvebu_a3700_comphy_modes[i].lane == lane &&
+-		    mvebu_a3700_comphy_modes[i].port == port &&
+ 		    mvebu_a3700_comphy_modes[i].mode == mode &&
+ 		    mvebu_a3700_comphy_modes[i].submode == submode)
+ 			break;
+@@ -153,7 +146,7 @@ static int mvebu_a3700_comphy_set_mode(s
+ 	if (submode == PHY_INTERFACE_MODE_1000BASEX)
+ 		submode = PHY_INTERFACE_MODE_SGMII;
+ 
+-	fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, lane->port, mode,
++	fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, mode,
+ 						 submode);
+ 	if (fw_mode < 0) {
+ 		dev_err(lane->dev, "invalid COMPHY mode\n");
+@@ -172,9 +165,10 @@ static int mvebu_a3700_comphy_power_on(s
+ 	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+ 	u32 fw_param;
+ 	int fw_mode;
++	int fw_port;
+ 	int ret;
+ 
+-	fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, lane->port,
++	fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id,
+ 						 lane->mode, lane->submode);
+ 	if (fw_mode < 0) {
+ 		dev_err(lane->dev, "invalid COMPHY mode\n");
+@@ -191,17 +185,18 @@ static int mvebu_a3700_comphy_power_on(s
+ 		fw_param = COMPHY_FW_MODE(fw_mode);
+ 		break;
+ 	case PHY_MODE_ETHERNET:
++		fw_port = (lane->id == 0) ? 1 : 0;
+ 		switch (lane->submode) {
+ 		case PHY_INTERFACE_MODE_SGMII:
+ 			dev_dbg(lane->dev, "set lane %d to SGMII mode\n",
+ 				lane->id);
+-			fw_param = COMPHY_FW_NET(fw_mode, lane->port,
++			fw_param = COMPHY_FW_NET(fw_mode, fw_port,
+ 						 COMPHY_FW_SPEED_1_25G);
+ 			break;
+ 		case PHY_INTERFACE_MODE_2500BASEX:
+ 			dev_dbg(lane->dev, "set lane %d to 2500BASEX mode\n",
+ 				lane->id);
+-			fw_param = COMPHY_FW_NET(fw_mode, lane->port,
++			fw_param = COMPHY_FW_NET(fw_mode, fw_port,
+ 						 COMPHY_FW_SPEED_3_125G);
+ 			break;
+ 		default:
+@@ -212,8 +207,7 @@ static int mvebu_a3700_comphy_power_on(s
+ 		break;
+ 	case PHY_MODE_PCIE:
+ 		dev_dbg(lane->dev, "set lane %d to PCIe mode\n", lane->id);
+-		fw_param = COMPHY_FW_PCIE(fw_mode, lane->port,
+-					  COMPHY_FW_SPEED_5G,
++		fw_param = COMPHY_FW_PCIE(fw_mode, COMPHY_FW_SPEED_5G,
+ 					  phy->attrs.bus_width);
+ 		break;
+ 	default:
+@@ -247,17 +241,20 @@ static struct phy *mvebu_a3700_comphy_xl
+ 					    struct of_phandle_args *args)
+ {
+ 	struct mvebu_a3700_comphy_lane *lane;
++	unsigned int port;
+ 	struct phy *phy;
+ 
+-	if (WARN_ON(args->args[0] >= MVEBU_A3700_COMPHY_PORTS))
+-		return ERR_PTR(-EINVAL);
+-
+ 	phy = of_phy_simple_xlate(dev, args);
+ 	if (IS_ERR(phy))
+ 		return phy;
+ 
+ 	lane = phy_get_drvdata(phy);
+-	lane->port = args->args[0];
++
++	port = args->args[0];
++	if (port != 0 && (port != 1 || lane->id != 0)) {
++		dev_err(lane->dev, "invalid port number %u\n", port);
++		return ERR_PTR(-EINVAL);
++	}
+ 
+ 	return phy;
+ }
+@@ -302,7 +299,6 @@ static int mvebu_a3700_comphy_probe(stru
+ 		lane->mode = PHY_MODE_INVALID;
+ 		lane->submode = PHY_INTERFACE_MODE_NA;
+ 		lane->id = lane_id;
+-		lane->port = -1;
+ 		phy_set_drvdata(phy, lane);
+ 	}
+ 
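
For reference, the firmware RPC path that the hunks above reshape (and that the next patch in this series removes entirely) boils down to one SMC call per lane. Below is a minimal sketch, reusing the SIP function ID and error mapping from the driver code quoted in this series; the helper name is made up and this is not a drop-in replacement:

#include <linux/arm-smccc.h>
#include <linux/errno.h>

#define COMPHY_SIP_POWER_ON	0x82000001

/* Ask the firmware to power on one comphy lane (old RPC-based flow). */
static int a3700_comphy_fw_power_on(unsigned long lane, unsigned long fw_param)
{
	struct arm_smccc_res res;
	s32 ret;

	arm_smccc_smc(COMPHY_SIP_POWER_ON, lane, fw_param, 0, 0, 0, 0, 0, &res);
	ret = res.a0;

	switch (ret) {
	case SMCCC_RET_SUCCESS:
		return 0;
	case SMCCC_RET_NOT_SUPPORTED:
		/* Old firmware without comphy support ends up here. */
		return -EOPNOTSUPP;
	default:
		return -EINVAL;
	}
}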

+ 1552 - 0
target/linux/generic/backport-6.1/344-v5.18-02-phy-marvell-phy-mvebu-a3700-comphy-Add-native-kernel.patch

@@ -0,0 +1,1552 @@
+From 934337080c6c59b75db76b180b509f218640ad48 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Thu, 3 Feb 2022 22:44:41 +0100
+Subject: [PATCH 2/2] phy: marvell: phy-mvebu-a3700-comphy: Add native kernel
+ implementation
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Remove old RPC implementation and add a new native kernel implementation.
+
+The old implementation uses the ARM SMC API to issue RPC calls to ARM Trusted
+Firmware, which provides the real implementation of the PHY configuration.
+
+But older versions of ARM Trusted Firmware do not provide this PHY
+configuration functionality and simply return "operation not supported"; or
+worse, some versions provide the configuration functionality incorrectly.
+
+For example, the firmware shipped on the ESPRESSObin board has this older
+version of ARM Trusted Firmware, and therefore SATA, USB 3.0 and PCIe
+functionality does not work with newer versions of the Linux kernel.
+
+Due to the above reasons, the following commits were introduced into Linux
+to work around these issues by ignoring the -EOPNOTSUPP error code returned
+by the phy-mvebu-a3700-comphy driver's phy_power_on() function:
+
+commit 45aefe3d2251 ("ata: ahci: mvebu: Make SATA PHY optional for Armada
+3720")
+commit 3241929b67d2 ("usb: host: xhci: mvebu: make USB 3.0 PHY optional for
+Armada 3720")
+commit b0c6ae0f8948 ("PCI: aardvark: Fix initialization with old Marvell's
+Arm Trusted Firmware")
+
+Replace this RPC implementation with a proper native kernel implementation
+that is independent of the firmware and never returns -EOPNOTSUPP for valid
+arguments.
+
+This should solve multiple issues with real-world boards, where it is not
+possible or really inconvenient to change the firmware. Let's eliminate
+these issues.
+
+This implementation is ported directly from the Armada 3720 comphy driver
+found in the newest version of the ARM Trusted Firmware source code, with
+various register-name fixes, some added comments, and some refactoring
+because the original code did not conform to kernel standards. PCIe mode
+poweroff support and PHY reset support were also added here. These changes
+are also going to be sent to ARM Trusted Firmware.
+
+[ Pali did the porting from ATF.
+  I (Marek) then fixed some register names, some various other things,
+  added some comments and refactored the code to kernel standards. Also
+  fixed PHY poweroff and added PHY reset. ]
+
+Signed-off-by: Pali Rohár <[email protected]>
+Acked-by: Miquel Raynal <[email protected]>
+Signed-off-by: Marek Behún <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Vinod Koul <[email protected]>
+---
+ drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 1332 ++++++++++++++++--
+ 1 file changed, 1215 insertions(+), 117 deletions(-)
+
+--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
++++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+@@ -5,12 +5,16 @@
+  * Authors:
+  *   Evan Wang <[email protected]>
+  *   Miquèl Raynal <[email protected]>
++ *   Pali Rohár <[email protected]>
++ *   Marek Behún <[email protected]>
+  *
+  * Structure inspired from phy-mvebu-cp110-comphy.c written by Antoine Tenart.
+- * SMC call initial support done by Grzegorz Jaszczyk.
++ * Comphy code from ARM Trusted Firmware ported by Pali Rohár <[email protected]>
++ * and Marek Behún <[email protected]>.
+  */
+ 
+-#include <linux/arm-smccc.h>
++#include <linux/bitfield.h>
++#include <linux/clk.h>
+ #include <linux/io.h>
+ #include <linux/iopoll.h>
+ #include <linux/mfd/syscon.h>
+@@ -18,103 +22,1118 @@
+ #include <linux/phy.h>
+ #include <linux/phy/phy.h>
+ #include <linux/platform_device.h>
++#include <linux/spinlock.h>
+ 
+-#define MVEBU_A3700_COMPHY_LANES		3
++#define PLL_SET_DELAY_US		600
++#define COMPHY_PLL_SLEEP		1000
++#define COMPHY_PLL_TIMEOUT		150000
++
++/* Comphy lane2 indirect access register offset */
++#define COMPHY_LANE2_INDIR_ADDR		0x0
++#define COMPHY_LANE2_INDIR_DATA		0x4
++
++/* SATA and USB3 PHY offset compared to SATA PHY */
++#define COMPHY_LANE2_REGS_BASE		0x200
++
++/*
++ * When accessing common PHY lane registers directly, we need to shift by 1,
++ * since the registers are 16-bit.
++ */
++#define COMPHY_LANE_REG_DIRECT(reg)	(((reg) & 0x7FF) << 1)
++
++/* COMPHY registers */
++#define COMPHY_POWER_PLL_CTRL		0x01
++#define PU_IVREF_BIT			BIT(15)
++#define PU_PLL_BIT			BIT(14)
++#define PU_RX_BIT			BIT(13)
++#define PU_TX_BIT			BIT(12)
++#define PU_TX_INTP_BIT			BIT(11)
++#define PU_DFE_BIT			BIT(10)
++#define RESET_DTL_RX_BIT		BIT(9)
++#define PLL_LOCK_BIT			BIT(8)
++#define REF_FREF_SEL_MASK		GENMASK(4, 0)
++#define REF_FREF_SEL_SERDES_25MHZ	FIELD_PREP(REF_FREF_SEL_MASK, 0x1)
++#define REF_FREF_SEL_SERDES_40MHZ	FIELD_PREP(REF_FREF_SEL_MASK, 0x3)
++#define REF_FREF_SEL_SERDES_50MHZ	FIELD_PREP(REF_FREF_SEL_MASK, 0x4)
++#define REF_FREF_SEL_PCIE_USB3_25MHZ	FIELD_PREP(REF_FREF_SEL_MASK, 0x2)
++#define REF_FREF_SEL_PCIE_USB3_40MHZ	FIELD_PREP(REF_FREF_SEL_MASK, 0x3)
++#define COMPHY_MODE_MASK		GENMASK(7, 5)
++#define COMPHY_MODE_SATA		FIELD_PREP(COMPHY_MODE_MASK, 0x0)
++#define COMPHY_MODE_PCIE		FIELD_PREP(COMPHY_MODE_MASK, 0x3)
++#define COMPHY_MODE_SERDES		FIELD_PREP(COMPHY_MODE_MASK, 0x4)
++#define COMPHY_MODE_USB3		FIELD_PREP(COMPHY_MODE_MASK, 0x5)
++
++#define COMPHY_KVCO_CAL_CTRL		0x02
++#define USE_MAX_PLL_RATE_BIT		BIT(12)
++#define SPEED_PLL_MASK			GENMASK(7, 2)
++#define SPEED_PLL_VALUE_16		FIELD_PREP(SPEED_PLL_MASK, 0x10)
++
++#define COMPHY_DIG_LOOPBACK_EN		0x23
++#define SEL_DATA_WIDTH_MASK		GENMASK(11, 10)
++#define DATA_WIDTH_10BIT		FIELD_PREP(SEL_DATA_WIDTH_MASK, 0x0)
++#define DATA_WIDTH_20BIT		FIELD_PREP(SEL_DATA_WIDTH_MASK, 0x1)
++#define DATA_WIDTH_40BIT		FIELD_PREP(SEL_DATA_WIDTH_MASK, 0x2)
++#define PLL_READY_TX_BIT		BIT(4)
++
++#define COMPHY_SYNC_PATTERN		0x24
++#define TXD_INVERT_BIT			BIT(10)
++#define RXD_INVERT_BIT			BIT(11)
++
++#define COMPHY_SYNC_MASK_GEN		0x25
++#define PHY_GEN_MAX_MASK		GENMASK(11, 10)
++#define PHY_GEN_MAX_USB3_5G		FIELD_PREP(PHY_GEN_MAX_MASK, 0x1)
++
++#define COMPHY_ISOLATION_CTRL		0x26
++#define PHY_ISOLATE_MODE		BIT(15)
++
++#define COMPHY_GEN2_SET2		0x3e
++#define GS2_TX_SSC_AMP_MASK		GENMASK(15, 9)
++#define GS2_TX_SSC_AMP_4128		FIELD_PREP(GS2_TX_SSC_AMP_MASK, 0x20)
++#define GS2_VREG_RXTX_MAS_ISET_MASK	GENMASK(8, 7)
++#define GS2_VREG_RXTX_MAS_ISET_60U	FIELD_PREP(GS2_VREG_RXTX_MAS_ISET_MASK,\
++						   0x0)
++#define GS2_VREG_RXTX_MAS_ISET_80U	FIELD_PREP(GS2_VREG_RXTX_MAS_ISET_MASK,\
++						   0x1)
++#define GS2_VREG_RXTX_MAS_ISET_100U	FIELD_PREP(GS2_VREG_RXTX_MAS_ISET_MASK,\
++						   0x2)
++#define GS2_VREG_RXTX_MAS_ISET_120U	FIELD_PREP(GS2_VREG_RXTX_MAS_ISET_MASK,\
++						   0x3)
++#define GS2_RSVD_6_0_MASK		GENMASK(6, 0)
++
++#define COMPHY_GEN3_SET2		0x3f
++
++#define COMPHY_IDLE_SYNC_EN		0x48
++#define IDLE_SYNC_EN			BIT(12)
++
++#define COMPHY_MISC_CTRL0		0x4F
++#define CLK100M_125M_EN			BIT(4)
++#define TXDCLK_2X_SEL			BIT(6)
++#define CLK500M_EN			BIT(7)
++#define PHY_REF_CLK_SEL			BIT(10)
++
++#define COMPHY_SFT_RESET		0x52
++#define SFT_RST				BIT(9)
++#define SFT_RST_NO_REG			BIT(10)
++
++#define COMPHY_MISC_CTRL1		0x73
++#define SEL_BITS_PCIE_FORCE		BIT(15)
++
++#define COMPHY_GEN2_SET3		0x112
++#define GS3_FFE_CAP_SEL_MASK		GENMASK(3, 0)
++#define GS3_FFE_CAP_SEL_VALUE		FIELD_PREP(GS3_FFE_CAP_SEL_MASK, 0xF)
++
++/* PIPE registers */
++#define COMPHY_PIPE_LANE_CFG0		0x180
++#define PRD_TXDEEMPH0_MASK		BIT(0)
++#define PRD_TXMARGIN_MASK		GENMASK(3, 1)
++#define PRD_TXSWING_MASK		BIT(4)
++#define CFG_TX_ALIGN_POS_MASK		GENMASK(8, 5)
++
++#define COMPHY_PIPE_LANE_CFG1		0x181
++#define PRD_TXDEEMPH1_MASK		BIT(15)
++#define USE_MAX_PLL_RATE_EN		BIT(9)
++#define TX_DET_RX_MODE			BIT(6)
++#define GEN2_TX_DATA_DLY_MASK		GENMASK(4, 3)
++#define GEN2_TX_DATA_DLY_DEFT		FIELD_PREP(GEN2_TX_DATA_DLY_MASK, 2)
++#define TX_ELEC_IDLE_MODE_EN		BIT(0)
++
++#define COMPHY_PIPE_LANE_STAT1		0x183
++#define TXDCLK_PCLK_EN			BIT(0)
++
++#define COMPHY_PIPE_LANE_CFG4		0x188
++#define SPREAD_SPECTRUM_CLK_EN		BIT(7)
++
++#define COMPHY_PIPE_RST_CLK_CTRL	0x1C1
++#define PIPE_SOFT_RESET			BIT(0)
++#define PIPE_REG_RESET			BIT(1)
++#define MODE_CORE_CLK_FREQ_SEL		BIT(9)
++#define MODE_PIPE_WIDTH_32		BIT(3)
++#define MODE_REFDIV_MASK		GENMASK(5, 4)
++#define MODE_REFDIV_BY_4		FIELD_PREP(MODE_REFDIV_MASK, 0x2)
++
++#define COMPHY_PIPE_TEST_MODE_CTRL	0x1C2
++#define MODE_MARGIN_OVERRIDE		BIT(2)
++
++#define COMPHY_PIPE_CLK_SRC_LO		0x1C3
++#define MODE_CLK_SRC			BIT(0)
++#define BUNDLE_PERIOD_SEL		BIT(1)
++#define BUNDLE_PERIOD_SCALE_MASK	GENMASK(3, 2)
++#define BUNDLE_SAMPLE_CTRL		BIT(4)
++#define PLL_READY_DLY_MASK		GENMASK(7, 5)
++#define CFG_SEL_20B			BIT(15)
++
++#define COMPHY_PIPE_PWR_MGM_TIM1	0x1D0
++#define CFG_PM_OSCCLK_WAIT_MASK		GENMASK(15, 12)
++#define CFG_PM_RXDEN_WAIT_MASK		GENMASK(11, 8)
++#define CFG_PM_RXDEN_WAIT_1_UNIT	FIELD_PREP(CFG_PM_RXDEN_WAIT_MASK, 0x1)
++#define CFG_PM_RXDLOZ_WAIT_MASK		GENMASK(7, 0)
++#define CFG_PM_RXDLOZ_WAIT_7_UNIT	FIELD_PREP(CFG_PM_RXDLOZ_WAIT_MASK, 0x7)
++#define CFG_PM_RXDLOZ_WAIT_12_UNIT	FIELD_PREP(CFG_PM_RXDLOZ_WAIT_MASK, 0xC)
++
++/*
++ * This register is not from PHY lane register space. It only exists in the
++ * indirect register space, before the actual PHY lane 2 registers. So the
++ * offset is absolute, not relative to COMPHY_LANE2_REGS_BASE.
++ * It is used only for SATA PHY initialization.
++ */
++#define COMPHY_RESERVED_REG		0x0E
++#define PHYCTRL_FRM_PIN_BIT		BIT(13)
+ 
+-/* COMPHY Fast SMC function identifiers */
+-#define COMPHY_SIP_POWER_ON			0x82000001
+-#define COMPHY_SIP_POWER_OFF			0x82000002
+-#define COMPHY_SIP_PLL_LOCK			0x82000003
+-
+-#define COMPHY_FW_MODE_SATA			0x1
+-#define COMPHY_FW_MODE_SGMII			0x2
+-#define COMPHY_FW_MODE_2500BASEX		0x3
+-#define COMPHY_FW_MODE_USB3H			0x4
+-#define COMPHY_FW_MODE_USB3D			0x5
+-#define COMPHY_FW_MODE_PCIE			0x6
+-#define COMPHY_FW_MODE_USB3			0xa
+-
+-#define COMPHY_FW_SPEED_1_25G			0 /* SGMII 1G */
+-#define COMPHY_FW_SPEED_2_5G			1
+-#define COMPHY_FW_SPEED_3_125G			2 /* 2500BASE-X */
+-#define COMPHY_FW_SPEED_5G			3
+-#define COMPHY_FW_SPEED_MAX			0x3F
+-
+-#define COMPHY_FW_MODE(mode)			((mode) << 12)
+-#define COMPHY_FW_NET(mode, idx, speed)		(COMPHY_FW_MODE(mode) | \
+-						 ((idx) << 8) |	\
+-						 ((speed) << 2))
+-#define COMPHY_FW_PCIE(mode, speed, width)	(COMPHY_FW_NET(mode, 0, speed) | \
+-						 ((width) << 18))
++/* South Bridge PHY Configuration Registers */
++#define COMPHY_PHY_REG(lane, reg)	(((1 - (lane)) * 0x28) + ((reg) & 0x3f))
++
++/*
++ * lane0: USB3/GbE1 PHY Configuration 1
++ * lane1: PCIe/GbE0 PHY Configuration 1
++ * (used only by SGMII code)
++ */
++#define COMPHY_PHY_CFG1			0x0
++#define PIN_PU_IVREF_BIT		BIT(1)
++#define PIN_RESET_CORE_BIT		BIT(11)
++#define PIN_RESET_COMPHY_BIT		BIT(12)
++#define PIN_PU_PLL_BIT			BIT(16)
++#define PIN_PU_RX_BIT			BIT(17)
++#define PIN_PU_TX_BIT			BIT(18)
++#define PIN_TX_IDLE_BIT			BIT(19)
++#define GEN_RX_SEL_MASK			GENMASK(25, 22)
++#define GEN_RX_SEL_VALUE(val)		FIELD_PREP(GEN_RX_SEL_MASK, (val))
++#define GEN_TX_SEL_MASK			GENMASK(29, 26)
++#define GEN_TX_SEL_VALUE(val)		FIELD_PREP(GEN_TX_SEL_MASK, (val))
++#define SERDES_SPEED_1_25_G		0x6
++#define SERDES_SPEED_3_125_G		0x8
++#define PHY_RX_INIT_BIT			BIT(30)
++
++/*
++ * lane0: USB3/GbE1 PHY Status 1
++ * lane1: PCIe/GbE0 PHY Status 1
++ * (used only by SGMII code)
++ */
++#define COMPHY_PHY_STAT1		0x18
++#define PHY_RX_INIT_DONE_BIT		BIT(0)
++#define PHY_PLL_READY_RX_BIT		BIT(2)
++#define PHY_PLL_READY_TX_BIT		BIT(3)
++
++/* PHY Selector */
++#define COMPHY_SELECTOR_PHY_REG			0xFC
++/* bit0: 0: Lane1 is GbE0; 1: Lane1 is PCIe */
++#define COMPHY_SELECTOR_PCIE_GBE0_SEL_BIT	BIT(0)
++/* bit4: 0: Lane0 is GbE1; 1: Lane0 is USB3 */
++#define COMPHY_SELECTOR_USB3_GBE1_SEL_BIT	BIT(4)
++/* bit8: 0: Lane0 is USB3 instead of GbE1, Lane2 is SATA; 1: Lane2 is USB3 */
++#define COMPHY_SELECTOR_USB3_PHY_SEL_BIT	BIT(8)
+ 
+ struct mvebu_a3700_comphy_conf {
+ 	unsigned int lane;
+ 	enum phy_mode mode;
+ 	int submode;
+-	u32 fw_mode;
+ };
+ 
+-#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _fw)		\
++#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode)			\
+ 	{								\
+ 		.lane = _lane,						\
+ 		.mode = _mode,						\
+ 		.submode = _smode,					\
+-		.fw_mode = _fw,						\
+ 	}
+ 
+-#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _fw) \
+-	MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _fw)
++#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode) \
++	MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA)
+ 
+-#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _fw) \
+-	MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _fw)
++#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode) \
++	MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode)
+ 
+ static const struct mvebu_a3700_comphy_conf mvebu_a3700_comphy_modes[] = {
+ 	/* lane 0 */
+-	MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS,
+-				    COMPHY_FW_MODE_USB3H),
+-	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII,
+-				    COMPHY_FW_MODE_SGMII),
+-	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX,
+-				    COMPHY_FW_MODE_2500BASEX),
++	MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS),
++	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII),
++	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_1000BASEX),
++	MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX),
+ 	/* lane 1 */
+-	MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, COMPHY_FW_MODE_PCIE),
+-	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII,
+-				    COMPHY_FW_MODE_SGMII),
+-	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX,
+-				    COMPHY_FW_MODE_2500BASEX),
++	MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE),
++	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII),
++	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_1000BASEX),
++	MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX),
+ 	/* lane 2 */
+-	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, COMPHY_FW_MODE_SATA),
+-	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS,
+-				    COMPHY_FW_MODE_USB3H),
++	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA),
++	MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS),
++};
++
++struct mvebu_a3700_comphy_priv {
++	void __iomem *comphy_regs;
++	void __iomem *lane0_phy_regs; /* USB3 and GbE1 */
++	void __iomem *lane1_phy_regs; /* PCIe and GbE0 */
++	void __iomem *lane2_phy_indirect; /* SATA and USB3 */
++	spinlock_t lock; /* for PHY selector access */
++	bool xtal_is_40m;
+ };
+ 
+ struct mvebu_a3700_comphy_lane {
++	struct mvebu_a3700_comphy_priv *priv;
+ 	struct device *dev;
+ 	unsigned int id;
+ 	enum phy_mode mode;
+ 	int submode;
++	bool invert_tx;
++	bool invert_rx;
++	bool needs_reset;
++};
++
++struct gbe_phy_init_data_fix {
++	u16 addr;
++	u16 value;
++};
++
++/* Changes to 40M1G25 mode data required for running 40M3G125 init mode */
++static struct gbe_phy_init_data_fix gbe_phy_init_fix[] = {
++	{ 0x005, 0x07CC }, { 0x015, 0x0000 }, { 0x01B, 0x0000 },
++	{ 0x01D, 0x0000 }, { 0x01E, 0x0000 }, { 0x01F, 0x0000 },
++	{ 0x020, 0x0000 }, { 0x021, 0x0030 }, { 0x026, 0x0888 },
++	{ 0x04D, 0x0152 }, { 0x04F, 0xA020 }, { 0x050, 0x07CC },
++	{ 0x053, 0xE9CA }, { 0x055, 0xBD97 }, { 0x071, 0x3015 },
++	{ 0x076, 0x03AA }, { 0x07C, 0x0FDF }, { 0x0C2, 0x3030 },
++	{ 0x0C3, 0x8000 }, { 0x0E2, 0x5550 }, { 0x0E3, 0x12A4 },
++	{ 0x0E4, 0x7D00 }, { 0x0E6, 0x0C83 }, { 0x101, 0xFCC0 },
++	{ 0x104, 0x0C10 }
+ };
+ 
+-static int mvebu_a3700_comphy_smc(unsigned long function, unsigned long lane,
+-				  unsigned long mode)
++/* 40M1G25 mode init data */
++static u16 gbe_phy_init[512] = {
++	/* 0       1       2       3       4       5       6       7 */
++	/*-----------------------------------------------------------*/
++	/* 8       9       A       B       C       D       E       F */
++	0x3110, 0xFD83, 0x6430, 0x412F, 0x82C0, 0x06FA, 0x4500, 0x6D26,	/* 00 */
++	0xAFC0, 0x8000, 0xC000, 0x0000, 0x2000, 0x49CC, 0x0BC9, 0x2A52,	/* 08 */
++	0x0BD2, 0x0CDE, 0x13D2, 0x0CE8, 0x1149, 0x10E0, 0x0000, 0x0000,	/* 10 */
++	0x0000, 0x0000, 0x0000, 0x0001, 0x0000, 0x4134, 0x0D2D, 0xFFFF,	/* 18 */
++	0xFFE0, 0x4030, 0x1016, 0x0030, 0x0000, 0x0800, 0x0866, 0x0000,	/* 20 */
++	0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,	/* 28 */
++	0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/* 30 */
++	0x0000, 0x0000, 0x000F, 0x6A62, 0x1988, 0x3100, 0x3100, 0x3100,	/* 38 */
++	0x3100, 0xA708, 0x2430, 0x0830, 0x1030, 0x4610, 0xFF00, 0xFF00,	/* 40 */
++	0x0060, 0x1000, 0x0400, 0x0040, 0x00F0, 0x0155, 0x1100, 0xA02A,	/* 48 */
++	0x06FA, 0x0080, 0xB008, 0xE3ED, 0x5002, 0xB592, 0x7A80, 0x0001,	/* 50 */
++	0x020A, 0x8820, 0x6014, 0x8054, 0xACAA, 0xFC88, 0x2A02, 0x45CF,	/* 58 */
++	0x000F, 0x1817, 0x2860, 0x064F, 0x0000, 0x0204, 0x1800, 0x6000,	/* 60 */
++	0x810F, 0x4F23, 0x4000, 0x4498, 0x0850, 0x0000, 0x000E, 0x1002,	/* 68 */
++	0x9D3A, 0x3009, 0xD066, 0x0491, 0x0001, 0x6AB0, 0x0399, 0x3780,	/* 70 */
++	0x0040, 0x5AC0, 0x4A80, 0x0000, 0x01DF, 0x0000, 0x0007, 0x0000,	/* 78 */
++	0x2D54, 0x00A1, 0x4000, 0x0100, 0xA20A, 0x0000, 0x0000, 0x0000,	/* 80 */
++	0x0000, 0x0000, 0x0000, 0x7400, 0x0E81, 0x1000, 0x1242, 0x0210,	/* 88 */
++	0x80DF, 0x0F1F, 0x2F3F, 0x4F5F, 0x6F7F, 0x0F1F, 0x2F3F, 0x4F5F,	/* 90 */
++	0x6F7F, 0x4BAD, 0x0000, 0x0000, 0x0800, 0x0000, 0x2400, 0xB651,	/* 98 */
++	0xC9E0, 0x4247, 0x0A24, 0x0000, 0xAF19, 0x1004, 0x0000, 0x0000,	/* A0 */
++	0x0000, 0x0013, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/* A8 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/* B0 */
++	0x0000, 0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0x0000, 0x0000,	/* B8 */
++	0x0000, 0x0000, 0x3010, 0xFA00, 0x0000, 0x0000, 0x0000, 0x0003,	/* C0 */
++	0x1618, 0x8200, 0x8000, 0x0400, 0x050F, 0x0000, 0x0000, 0x0000,	/* C8 */
++	0x4C93, 0x0000, 0x1000, 0x1120, 0x0010, 0x1242, 0x1242, 0x1E00,	/* D0 */
++	0x0000, 0x0000, 0x0000, 0x00F8, 0x0000, 0x0041, 0x0800, 0x0000,	/* D8 */
++	0x82A0, 0x572E, 0x2490, 0x14A9, 0x4E00, 0x0000, 0x0803, 0x0541,	/* E0 */
++	0x0C15, 0x0000, 0x0000, 0x0400, 0x2626, 0x0000, 0x0000, 0x4200,	/* E8 */
++	0x0000, 0xAA55, 0x1020, 0x0000, 0x0000, 0x5010, 0x0000, 0x0000,	/* F0 */
++	0x0000, 0x0000, 0x5000, 0x0000, 0x0000, 0x0000, 0x02F2, 0x0000,	/* F8 */
++	0x101F, 0xFDC0, 0x4000, 0x8010, 0x0110, 0x0006, 0x0000, 0x0000,	/*100 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*108 */
++	0x04CF, 0x0000, 0x04CF, 0x0000, 0x04CF, 0x0000, 0x04C6, 0x0000,	/*110 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*118 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*120 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*128 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*130 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*138 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*140 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*148 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*150 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*158 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*160 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*168 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*170 */
++	0x0000, 0x0000, 0x0000, 0x00F0, 0x08A2, 0x3112, 0x0A14, 0x0000,	/*178 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*180 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*188 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*190 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*198 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1A0 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1A8 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1B0 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1B8 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1C0 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1C8 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1D0 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1D8 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1E0 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1E8 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,	/*1F0 */
++	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000	/*1F8 */
++};
++
++static inline void comphy_reg_set(void __iomem *addr, u32 data, u32 mask)
+ {
+-	struct arm_smccc_res res;
+-	s32 ret;
++	u32 val;
++
++	val = readl(addr);
++	val = (val & ~mask) | (data & mask);
++	writel(val, addr);
++}
+ 
+-	arm_smccc_smc(function, lane, mode, 0, 0, 0, 0, 0, &res);
+-	ret = res.a0;
++static inline void comphy_reg_set16(void __iomem *addr, u16 data, u16 mask)
++{
++	u16 val;
+ 
+-	switch (ret) {
+-	case SMCCC_RET_SUCCESS:
+-		return 0;
+-	case SMCCC_RET_NOT_SUPPORTED:
+-		return -EOPNOTSUPP;
++	val = readw(addr);
++	val = (val & ~mask) | (data & mask);
++	writew(val, addr);
++}
++
++/* Used for accessing lane 2 registers (SATA/USB3 PHY) */
++static void comphy_set_indirect(struct mvebu_a3700_comphy_priv *priv,
++				u32 offset, u16 data, u16 mask)
++{
++	writel(offset,
++	       priv->lane2_phy_indirect + COMPHY_LANE2_INDIR_ADDR);
++	comphy_reg_set(priv->lane2_phy_indirect + COMPHY_LANE2_INDIR_DATA,
++		       data, mask);
++}
++
++static void comphy_lane_reg_set(struct mvebu_a3700_comphy_lane *lane,
++				u16 reg, u16 data, u16 mask)
++{
++	if (lane->id == 2) {
++		/* lane 2 PHY registers are accessed indirectly */
++		comphy_set_indirect(lane->priv,
++				    reg + COMPHY_LANE2_REGS_BASE,
++				    data, mask);
++	} else {
++		void __iomem *base = lane->id == 1 ?
++				     lane->priv->lane1_phy_regs :
++				     lane->priv->lane0_phy_regs;
++
++		comphy_reg_set16(base + COMPHY_LANE_REG_DIRECT(reg),
++				 data, mask);
++	}
++}
++
++static int comphy_lane_reg_poll(struct mvebu_a3700_comphy_lane *lane,
++				u16 reg, u16 bits,
++				ulong sleep_us, ulong timeout_us)
++{
++	int ret;
++
++	if (lane->id == 2) {
++		u32 data;
++
++		/* lane 2 PHY registers are accessed indirectly */
++		writel(reg + COMPHY_LANE2_REGS_BASE,
++		       lane->priv->lane2_phy_indirect +
++		       COMPHY_LANE2_INDIR_ADDR);
++
++		ret = readl_poll_timeout(lane->priv->lane2_phy_indirect +
++					 COMPHY_LANE2_INDIR_DATA,
++					 data, (data & bits) == bits,
++					 sleep_us, timeout_us);
++	} else {
++		void __iomem *base = lane->id == 1 ?
++				     lane->priv->lane1_phy_regs :
++				     lane->priv->lane0_phy_regs;
++		u16 data;
++
++		ret = readw_poll_timeout(base + COMPHY_LANE_REG_DIRECT(reg),
++					 data, (data & bits) == bits,
++					 sleep_us, timeout_us);
++	}
++
++	return ret;
++}
++
++static void comphy_periph_reg_set(struct mvebu_a3700_comphy_lane *lane,
++				  u8 reg, u32 data, u32 mask)
++{
++	comphy_reg_set(lane->priv->comphy_regs + COMPHY_PHY_REG(lane->id, reg),
++		       data, mask);
++}
++
++static int comphy_periph_reg_poll(struct mvebu_a3700_comphy_lane *lane,
++				  u8 reg, u32 bits,
++				  ulong sleep_us, ulong timeout_us)
++{
++	u32 data;
++
++	return readl_poll_timeout(lane->priv->comphy_regs +
++				  COMPHY_PHY_REG(lane->id, reg),
++				  data, (data & bits) == bits,
++				  sleep_us, timeout_us);
++}
++
++/* PHY selector configures with corresponding modes */
++static int
++mvebu_a3700_comphy_set_phy_selector(struct mvebu_a3700_comphy_lane *lane)
++{
++	u32 old, new, clr = 0, set = 0;
++	unsigned long flags;
++
++	switch (lane->mode) {
++	case PHY_MODE_SATA:
++		/* SATA must be in Lane2 */
++		if (lane->id == 2)
++			clr = COMPHY_SELECTOR_USB3_PHY_SEL_BIT;
++		else
++			goto error;
++		break;
++
++	case PHY_MODE_ETHERNET:
++		if (lane->id == 0)
++			clr = COMPHY_SELECTOR_USB3_GBE1_SEL_BIT;
++		else if (lane->id == 1)
++			clr = COMPHY_SELECTOR_PCIE_GBE0_SEL_BIT;
++		else
++			goto error;
++		break;
++
++	case PHY_MODE_USB_HOST_SS:
++		if (lane->id == 2)
++			set = COMPHY_SELECTOR_USB3_PHY_SEL_BIT;
++		else if (lane->id == 0)
++			set = COMPHY_SELECTOR_USB3_GBE1_SEL_BIT;
++		else
++			goto error;
++		break;
++
++	case PHY_MODE_PCIE:
++		/* PCIE must be in Lane1 */
++		if (lane->id == 1)
++			set = COMPHY_SELECTOR_PCIE_GBE0_SEL_BIT;
++		else
++			goto error;
++		break;
++
++	default:
++		goto error;
++	}
++
++	spin_lock_irqsave(&lane->priv->lock, flags);
++
++	old = readl(lane->priv->comphy_regs + COMPHY_SELECTOR_PHY_REG);
++	new = (old & ~clr) | set;
++	writel(new, lane->priv->comphy_regs + COMPHY_SELECTOR_PHY_REG);
++
++	spin_unlock_irqrestore(&lane->priv->lock, flags);
++
++	dev_dbg(lane->dev,
++		"COMPHY[%d] mode[%d] changed PHY selector 0x%08x -> 0x%08x\n",
++		lane->id, lane->mode, old, new);
++
++	return 0;
++error:
++	dev_err(lane->dev, "COMPHY[%d] mode[%d] is invalid\n", lane->id,
++		lane->mode);
++	return -EINVAL;
++}
++
++static int
++mvebu_a3700_comphy_sata_power_on(struct mvebu_a3700_comphy_lane *lane)
++{
++	u32 mask, data, ref_clk;
++	int ret;
++
++	/* Configure phy selector for SATA */
++	ret = mvebu_a3700_comphy_set_phy_selector(lane);
++	if (ret)
++		return ret;
++
++	/* Clear phy isolation mode to make it work in normal mode */
++	comphy_lane_reg_set(lane, COMPHY_ISOLATION_CTRL,
++			    0x0, PHY_ISOLATE_MODE);
++
++	/* 0. Check the Polarity invert bits */
++	data = 0x0;
++	if (lane->invert_tx)
++		data |= TXD_INVERT_BIT;
++	if (lane->invert_rx)
++		data |= RXD_INVERT_BIT;
++	mask = TXD_INVERT_BIT | RXD_INVERT_BIT;
++	comphy_lane_reg_set(lane, COMPHY_SYNC_PATTERN, data, mask);
++
++	/* 1. Select 40-bit data width */
++	comphy_lane_reg_set(lane, COMPHY_DIG_LOOPBACK_EN,
++			    DATA_WIDTH_40BIT, SEL_DATA_WIDTH_MASK);
++
++	/* 2. Select reference clock(25M) and PHY mode (SATA) */
++	if (lane->priv->xtal_is_40m)
++		ref_clk = REF_FREF_SEL_SERDES_40MHZ;
++	else
++		ref_clk = REF_FREF_SEL_SERDES_25MHZ;
++
++	data = ref_clk | COMPHY_MODE_SATA;
++	mask = REF_FREF_SEL_MASK | COMPHY_MODE_MASK;
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL, data, mask);
++
++	/* 3. Use maximum PLL rate (no power save) */
++	comphy_lane_reg_set(lane, COMPHY_KVCO_CAL_CTRL,
++			    USE_MAX_PLL_RATE_BIT, USE_MAX_PLL_RATE_BIT);
++
++	/* 4. Reset reserved bit */
++	comphy_set_indirect(lane->priv, COMPHY_RESERVED_REG,
++			    0x0, PHYCTRL_FRM_PIN_BIT);
++
++	/* 5. Set vendor-specific configuration (It is done in sata driver) */
++	/* XXX: in U-Boot below sequence was executed in this place, in Linux
++	 * not.  Now it is done only in U-Boot before this comphy
++	 * initialization - tests shows that it works ok, but in case of any
++	 * future problem it is left for reference.
++	 *   reg_set(MVEBU_REGS_BASE + 0xe00a0, 0, 0xffffffff);
++	 *   reg_set(MVEBU_REGS_BASE + 0xe00a4, BIT(6), BIT(6));
++	 */
++
++	/* Wait for > 55 us to allow PLL be enabled */
++	udelay(PLL_SET_DELAY_US);
++
++	/* Polling status */
++	ret = comphy_lane_reg_poll(lane, COMPHY_DIG_LOOPBACK_EN,
++				   PLL_READY_TX_BIT, COMPHY_PLL_SLEEP,
++				   COMPHY_PLL_TIMEOUT);
++	if (ret)
++		dev_err(lane->dev, "Failed to lock SATA PLL\n");
++
++	return ret;
++}
++
++static void comphy_gbe_phy_init(struct mvebu_a3700_comphy_lane *lane,
++				bool is_1gbps)
++{
++	int addr, fix_idx;
++	u16 val;
++
++	fix_idx = 0;
++	for (addr = 0; addr < 512; addr++) {
++		/*
++		 * All PHY register values are defined in full for 3.125Gbps
++		 * SERDES speed. The values required for 1.25 Gbps are almost
++		 * the same and only few registers should be "fixed" in
++		 * comparison to 3.125 Gbps values. These register values are
++		 * stored in "gbe_phy_init_fix" array.
++		 */
++		if (!is_1gbps && gbe_phy_init_fix[fix_idx].addr == addr) {
++			/* Use new value */
++			val = gbe_phy_init_fix[fix_idx].value;
++			if (fix_idx < ARRAY_SIZE(gbe_phy_init_fix))
++				fix_idx++;
++		} else {
++			val = gbe_phy_init[addr];
++		}
++
++		comphy_lane_reg_set(lane, addr, val, 0xFFFF);
++	}
++}
++
++static int
++mvebu_a3700_comphy_ethernet_power_on(struct mvebu_a3700_comphy_lane *lane)
++{
++	u32 mask, data, speed_sel;
++	int ret;
++
++	/* Set selector */
++	ret = mvebu_a3700_comphy_set_phy_selector(lane);
++	if (ret)
++		return ret;
++
++	/*
++	 * 1. Reset PHY by setting PHY input port PIN_RESET=1.
++	 * 2. Set PHY input port PIN_TX_IDLE=1, PIN_PU_IVREF=1 to keep
++	 *    PHY TXP/TXN output to idle state during PHY initialization
++	 * 3. Set PHY input port PIN_PU_PLL=0, PIN_PU_RX=0, PIN_PU_TX=0.
++	 */
++	data = PIN_PU_IVREF_BIT | PIN_TX_IDLE_BIT | PIN_RESET_COMPHY_BIT;
++	mask = data | PIN_RESET_CORE_BIT | PIN_PU_PLL_BIT | PIN_PU_RX_BIT |
++	       PIN_PU_TX_BIT | PHY_RX_INIT_BIT;
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
++
++	/* 4. Release reset to the PHY by setting PIN_RESET=0. */
++	data = 0x0;
++	mask = PIN_RESET_COMPHY_BIT;
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
++
++	/*
++	 * 5. Set PIN_PHY_GEN_TX[3:0] and PIN_PHY_GEN_RX[3:0] to decide COMPHY
++	 * bit rate
++	 */
++	switch (lane->submode) {
++	case PHY_INTERFACE_MODE_SGMII:
++	case PHY_INTERFACE_MODE_1000BASEX:
++		/* SGMII 1G, SerDes speed 1.25G */
++		speed_sel = SERDES_SPEED_1_25_G;
++		break;
++	case PHY_INTERFACE_MODE_2500BASEX:
++		/* 2500Base-X, SerDes speed 3.125G */
++		speed_sel = SERDES_SPEED_3_125_G;
++		break;
+ 	default:
++		/* Other rates are not supported */
++		dev_err(lane->dev,
++			"unsupported phy speed %d on comphy lane%d\n",
++			lane->submode, lane->id);
+ 		return -EINVAL;
+ 	}
++	data = GEN_RX_SEL_VALUE(speed_sel) | GEN_TX_SEL_VALUE(speed_sel);
++	mask = GEN_RX_SEL_MASK | GEN_TX_SEL_MASK;
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
++
++	/*
++	 * 6. Wait 10mS for bandgap and reference clocks to stabilize; then
++	 * start SW programming.
++	 */
++	mdelay(10);
++
++	/* 7. Program COMPHY register PHY_MODE */
++	data = COMPHY_MODE_SERDES;
++	mask = COMPHY_MODE_MASK;
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL, data, mask);
++
++	/*
++	 * 8. Set COMPHY register REFCLK_SEL to select the correct REFCLK
++	 * source
++	 */
++	data = 0x0;
++	mask = PHY_REF_CLK_SEL;
++	comphy_lane_reg_set(lane, COMPHY_MISC_CTRL0, data, mask);
++
++	/*
++	 * 9. Set correct reference clock frequency in COMPHY register
++	 * REF_FREF_SEL.
++	 */
++	if (lane->priv->xtal_is_40m)
++		data = REF_FREF_SEL_SERDES_50MHZ;
++	else
++		data = REF_FREF_SEL_SERDES_25MHZ;
++
++	mask = REF_FREF_SEL_MASK;
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL, data, mask);
++
++	/*
++	 * 10. Program COMPHY register PHY_GEN_MAX[1:0]
++	 * This step is mentioned in the flow received from verification team.
++	 * However the PHY_GEN_MAX value is only meaningful for other interfaces
++	 * (not SERDES). For instance, it selects SATA speed 1.5/3/6 Gbps or
++	 * PCIe speed 2.5/5 Gbps
++	 */
++
++	/*
++	 * 11. Program COMPHY register SEL_BITS to set correct parallel data
++	 * bus width
++	 */
++	data = DATA_WIDTH_10BIT;
++	mask = SEL_DATA_WIDTH_MASK;
++	comphy_lane_reg_set(lane, COMPHY_DIG_LOOPBACK_EN, data, mask);
++
++	/*
++	 * 12. As long as DFE function needs to be enabled in any mode,
++	 * COMPHY register DFE_UPDATE_EN[5:0] shall be programmed to 0x3F
++	 * for real chip during COMPHY power on.
++	 * The value of the DFE_UPDATE_EN already is 0x3F, because it is the
++	 * default value after reset of the PHY.
++	 */
++
++	/*
++	 * 13. Program COMPHY GEN registers.
++	 * These registers should be programmed based on the lab testing result
++	 * to achieve optimal performance. Please contact the CEA group to get
++	 * the related GEN table during real chip bring-up. We only required to
++	 * run though the entire registers programming flow defined by
++	 * "comphy_gbe_phy_init" when the REF clock is 40 MHz. For REF clock
++	 * 25 MHz the default values stored in PHY registers are OK.
++	 */
++	dev_dbg(lane->dev, "Running C-DPI phy init %s mode\n",
++		lane->submode == PHY_INTERFACE_MODE_2500BASEX ? "2G5" : "1G");
++	if (lane->priv->xtal_is_40m)
++		comphy_gbe_phy_init(lane,
++				    lane->submode != PHY_INTERFACE_MODE_2500BASEX);
++
++	/*
++	 * 14. Check the PHY Polarity invert bit
++	 */
++	data = 0x0;
++	if (lane->invert_tx)
++		data |= TXD_INVERT_BIT;
++	if (lane->invert_rx)
++		data |= RXD_INVERT_BIT;
++	mask = TXD_INVERT_BIT | RXD_INVERT_BIT;
++	comphy_lane_reg_set(lane, COMPHY_SYNC_PATTERN, data, mask);
++
++	/*
++	 * 15. Set PHY input ports PIN_PU_PLL, PIN_PU_TX and PIN_PU_RX to 1 to
++	 * start PHY power up sequence. All the PHY register programming should
++	 * be done before PIN_PU_PLL=1. There should be no register programming
++	 * for normal PHY operation from this point.
++	 */
++	data = PIN_PU_PLL_BIT | PIN_PU_RX_BIT | PIN_PU_TX_BIT;
++	mask = data;
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
++
++	/*
++	 * 16. Wait for PHY power up sequence to finish by checking output ports
++	 * PIN_PLL_READY_TX=1 and PIN_PLL_READY_RX=1.
++	 */
++	ret = comphy_periph_reg_poll(lane, COMPHY_PHY_STAT1,
++				     PHY_PLL_READY_TX_BIT |
++				     PHY_PLL_READY_RX_BIT,
++				     COMPHY_PLL_SLEEP, COMPHY_PLL_TIMEOUT);
++	if (ret) {
++		dev_err(lane->dev, "Failed to lock PLL for SERDES PHY %d\n",
++			lane->id);
++		return ret;
++	}
++
++	/*
++	 * 17. Set COMPHY input port PIN_TX_IDLE=0
++	 */
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, 0x0, PIN_TX_IDLE_BIT);
++
++	/*
++	 * 18. After valid data appear on PIN_RXDATA bus, set PIN_RX_INIT=1. To
++	 * start RX initialization. PIN_RX_INIT_DONE will be cleared to 0 by the
++	 * PHY After RX initialization is done, PIN_RX_INIT_DONE will be set to
++	 * 1 by COMPHY Set PIN_RX_INIT=0 after PIN_RX_INIT_DONE= 1. Please
++	 * refer to RX initialization part for details.
++	 */
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1,
++			      PHY_RX_INIT_BIT, PHY_RX_INIT_BIT);
++
++	ret = comphy_periph_reg_poll(lane, COMPHY_PHY_STAT1,
++				     PHY_PLL_READY_TX_BIT |
++				     PHY_PLL_READY_RX_BIT,
++				     COMPHY_PLL_SLEEP, COMPHY_PLL_TIMEOUT);
++	if (ret) {
++		dev_err(lane->dev, "Failed to lock PLL for SERDES PHY %d\n",
++			lane->id);
++		return ret;
++	}
++
++	ret = comphy_periph_reg_poll(lane, COMPHY_PHY_STAT1,
++				     PHY_RX_INIT_DONE_BIT,
++				     COMPHY_PLL_SLEEP, COMPHY_PLL_TIMEOUT);
++	if (ret)
++		dev_err(lane->dev, "Failed to init RX of SERDES PHY %d\n",
++			lane->id);
++
++	return ret;
+ }
+ 
+-static int mvebu_a3700_comphy_get_fw_mode(int lane,
++static int
++mvebu_a3700_comphy_usb3_power_on(struct mvebu_a3700_comphy_lane *lane)
++{
++	u32 mask, data, cfg, ref_clk;
++	int ret;
++
++	/* Set phy seclector */
++	ret = mvebu_a3700_comphy_set_phy_selector(lane);
++	if (ret)
++		return ret;
++
++	/*
++	 * 0. Set PHY OTG Control(0x5d034), bit 4, Power up OTG module The
++	 * register belong to UTMI module, so it is set in UTMI phy driver.
++	 */
++
++	/*
++	 * 1. Set PRD_TXDEEMPH (3.5db de-emph)
++	 */
++	data = PRD_TXDEEMPH0_MASK;
++	mask = PRD_TXDEEMPH0_MASK | PRD_TXMARGIN_MASK | PRD_TXSWING_MASK |
++	       CFG_TX_ALIGN_POS_MASK;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_LANE_CFG0, data, mask);
++
++	/*
++	 * 2. Set BIT0: enable transmitter in high impedance mode
++	 *    Set BIT[3:4]: delay 2 clock cycles for HiZ off latency
++	 *    Set BIT6: Tx detect Rx at HiZ mode
++	 *    Unset BIT15: set to 0 to set USB3 De-emphasize level to -3.5db
++	 *            together with bit 0 of COMPHY_PIPE_LANE_CFG0 register
++	 */
++	data = TX_DET_RX_MODE | GEN2_TX_DATA_DLY_DEFT | TX_ELEC_IDLE_MODE_EN;
++	mask = PRD_TXDEEMPH1_MASK | TX_DET_RX_MODE | GEN2_TX_DATA_DLY_MASK |
++	       TX_ELEC_IDLE_MODE_EN;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_LANE_CFG1, data, mask);
++
++	/*
++	 * 3. Set Spread Spectrum Clock Enabled
++	 */
++	comphy_lane_reg_set(lane, COMPHY_PIPE_LANE_CFG4,
++			    SPREAD_SPECTRUM_CLK_EN, SPREAD_SPECTRUM_CLK_EN);
++
++	/*
++	 * 4. Set Override Margining Controls From the MAC:
++	 *    Use margining signals from lane configuration
++	 */
++	comphy_lane_reg_set(lane, COMPHY_PIPE_TEST_MODE_CTRL,
++			    MODE_MARGIN_OVERRIDE, 0xFFFF);
++
++	/*
++	 * 5. Set Lane-to-Lane Bundle Clock Sampling Period = per PCLK cycles
++	 *    set Mode Clock Source = PCLK is generated from REFCLK
++	 */
++	data = 0x0;
++	mask = MODE_CLK_SRC | BUNDLE_PERIOD_SEL | BUNDLE_PERIOD_SCALE_MASK |
++	       BUNDLE_SAMPLE_CTRL | PLL_READY_DLY_MASK;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_CLK_SRC_LO, data, mask);
++
++	/*
++	 * 6. Set G2 Spread Spectrum Clock Amplitude at 4K
++	 */
++	comphy_lane_reg_set(lane, COMPHY_GEN2_SET2,
++			    GS2_TX_SSC_AMP_4128, GS2_TX_SSC_AMP_MASK);
++
++	/*
++	 * 7. Unset G3 Spread Spectrum Clock Amplitude
++	 *    set G3 TX and RX Register Master Current Select
++	 */
++	data = GS2_VREG_RXTX_MAS_ISET_60U;
++	mask = GS2_TX_SSC_AMP_MASK | GS2_VREG_RXTX_MAS_ISET_MASK |
++	       GS2_RSVD_6_0_MASK;
++	comphy_lane_reg_set(lane, COMPHY_GEN3_SET2, data, mask);
++
++	/*
++	 * 8. Check crystal jumper setting and program the Power and PLL Control
++	 * accordingly Change RX wait
++	 */
++	if (lane->priv->xtal_is_40m) {
++		ref_clk = REF_FREF_SEL_PCIE_USB3_40MHZ;
++		cfg = CFG_PM_RXDLOZ_WAIT_12_UNIT;
++	} else {
++		ref_clk = REF_FREF_SEL_PCIE_USB3_25MHZ;
++		cfg = CFG_PM_RXDLOZ_WAIT_7_UNIT;
++	}
++
++	data = PU_IVREF_BIT | PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT |
++	       PU_TX_INTP_BIT | PU_DFE_BIT | COMPHY_MODE_USB3 | ref_clk;
++	mask = PU_IVREF_BIT | PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT |
++	       PU_TX_INTP_BIT | PU_DFE_BIT | PLL_LOCK_BIT | COMPHY_MODE_MASK |
++	       REF_FREF_SEL_MASK;
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL, data, mask);
++
++	data = CFG_PM_RXDEN_WAIT_1_UNIT | cfg;
++	mask = CFG_PM_OSCCLK_WAIT_MASK | CFG_PM_RXDEN_WAIT_MASK |
++	       CFG_PM_RXDLOZ_WAIT_MASK;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_PWR_MGM_TIM1, data, mask);
++
++	/*
++	 * 9. Enable idle sync
++	 */
++	comphy_lane_reg_set(lane, COMPHY_IDLE_SYNC_EN,
++			    IDLE_SYNC_EN, IDLE_SYNC_EN);
++
++	/*
++	 * 10. Enable the output of 500M clock
++	 */
++	comphy_lane_reg_set(lane, COMPHY_MISC_CTRL0, CLK500M_EN, CLK500M_EN);
++
++	/*
++	 * 11. Set 20-bit data width
++	 */
++	comphy_lane_reg_set(lane, COMPHY_DIG_LOOPBACK_EN,
++			    DATA_WIDTH_20BIT, 0xFFFF);
++
++	/*
++	 * 12. Override Speed_PLL value and use MAC PLL
++	 */
++	data = SPEED_PLL_VALUE_16 | USE_MAX_PLL_RATE_BIT;
++	mask = 0xFFFF;
++	comphy_lane_reg_set(lane, COMPHY_KVCO_CAL_CTRL, data, mask);
++
++	/*
++	 * 13. Check the Polarity invert bit
++	 */
++	data = 0x0;
++	if (lane->invert_tx)
++		data |= TXD_INVERT_BIT;
++	if (lane->invert_rx)
++		data |= RXD_INVERT_BIT;
++	mask = TXD_INVERT_BIT | RXD_INVERT_BIT;
++	comphy_lane_reg_set(lane, COMPHY_SYNC_PATTERN, data, mask);
++
++	/*
++	 * 14. Set max speed generation to USB3.0 5Gbps
++	 */
++	comphy_lane_reg_set(lane, COMPHY_SYNC_MASK_GEN,
++			    PHY_GEN_MAX_USB3_5G, PHY_GEN_MAX_MASK);
++
++	/*
++	 * 15. Set capacitor value for FFE gain peaking to 0xF
++	 */
++	comphy_lane_reg_set(lane, COMPHY_GEN2_SET3,
++			    GS3_FFE_CAP_SEL_VALUE, GS3_FFE_CAP_SEL_MASK);
++
++	/*
++	 * 16. Release SW reset
++	 */
++	data = MODE_CORE_CLK_FREQ_SEL | MODE_PIPE_WIDTH_32 | MODE_REFDIV_BY_4;
++	mask = 0xFFFF;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask);
++
++	/* Wait for > 55 us to allow PCLK be enabled */
++	udelay(PLL_SET_DELAY_US);
++
++	ret = comphy_lane_reg_poll(lane, COMPHY_PIPE_LANE_STAT1, TXDCLK_PCLK_EN,
++				   COMPHY_PLL_SLEEP, COMPHY_PLL_TIMEOUT);
++	if (ret)
++		dev_err(lane->dev, "Failed to lock USB3 PLL\n");
++
++	return ret;
++}
++
++static int
++mvebu_a3700_comphy_pcie_power_on(struct mvebu_a3700_comphy_lane *lane)
++{
++	u32 mask, data, ref_clk;
++	int ret;
++
++	/* Configure phy selector for PCIe */
++	ret = mvebu_a3700_comphy_set_phy_selector(lane);
++	if (ret)
++		return ret;
++
++	/* 1. Enable max PLL. */
++	comphy_lane_reg_set(lane, COMPHY_PIPE_LANE_CFG1,
++			    USE_MAX_PLL_RATE_EN, USE_MAX_PLL_RATE_EN);
++
++	/* 2. Select 20 bit SERDES interface. */
++	comphy_lane_reg_set(lane, COMPHY_PIPE_CLK_SRC_LO,
++			    CFG_SEL_20B, CFG_SEL_20B);
++
++	/* 3. Force to use reg setting for PCIe mode */
++	comphy_lane_reg_set(lane, COMPHY_MISC_CTRL1,
++			    SEL_BITS_PCIE_FORCE, SEL_BITS_PCIE_FORCE);
++
++	/* 4. Change RX wait */
++	data = CFG_PM_RXDEN_WAIT_1_UNIT | CFG_PM_RXDLOZ_WAIT_12_UNIT;
++	mask = CFG_PM_OSCCLK_WAIT_MASK | CFG_PM_RXDEN_WAIT_MASK |
++	       CFG_PM_RXDLOZ_WAIT_MASK;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_PWR_MGM_TIM1, data, mask);
++
++	/* 5. Enable idle sync */
++	comphy_lane_reg_set(lane, COMPHY_IDLE_SYNC_EN,
++			    IDLE_SYNC_EN, IDLE_SYNC_EN);
++
++	/* 6. Enable the output of 100M/125M/500M clock */
++	data = CLK500M_EN | TXDCLK_2X_SEL | CLK100M_125M_EN;
++	mask = data;
++	comphy_lane_reg_set(lane, COMPHY_MISC_CTRL0, data, mask);
++
++	/*
++	 * 7. Enable TX, PCIE global register, 0xd0074814, it is done in
++	 * PCI-E driver
++	 */
++
++	/*
++	 * 8. Check crystal jumper setting and program the Power and PLL
++	 * Control accordingly
++	 */
++
++	if (lane->priv->xtal_is_40m)
++		ref_clk = REF_FREF_SEL_PCIE_USB3_40MHZ;
++	else
++		ref_clk = REF_FREF_SEL_PCIE_USB3_25MHZ;
++
++	data = PU_IVREF_BIT | PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT |
++	       PU_TX_INTP_BIT | PU_DFE_BIT | COMPHY_MODE_PCIE | ref_clk;
++	mask = 0xFFFF;
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL, data, mask);
++
++	/* 9. Override Speed_PLL value and use MAC PLL */
++	comphy_lane_reg_set(lane, COMPHY_KVCO_CAL_CTRL,
++			    SPEED_PLL_VALUE_16 | USE_MAX_PLL_RATE_BIT,
++			    0xFFFF);
++
++	/* 10. Check the Polarity invert bit */
++	data = 0x0;
++	if (lane->invert_tx)
++		data |= TXD_INVERT_BIT;
++	if (lane->invert_rx)
++		data |= RXD_INVERT_BIT;
++	mask = TXD_INVERT_BIT | RXD_INVERT_BIT;
++	comphy_lane_reg_set(lane, COMPHY_SYNC_PATTERN, data, mask);
++
++	/* 11. Release SW reset */
++	data = MODE_CORE_CLK_FREQ_SEL | MODE_PIPE_WIDTH_32;
++	mask = data | PIPE_SOFT_RESET | MODE_REFDIV_MASK;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask);
++
++	/* Wait for > 55 us to allow PCLK be enabled */
++	udelay(PLL_SET_DELAY_US);
++
++	ret = comphy_lane_reg_poll(lane, COMPHY_PIPE_LANE_STAT1, TXDCLK_PCLK_EN,
++				   COMPHY_PLL_SLEEP, COMPHY_PLL_TIMEOUT);
++	if (ret)
++		dev_err(lane->dev, "Failed to lock PCIE PLL\n");
++
++	return ret;
++}
++
++static void
++mvebu_a3700_comphy_sata_power_off(struct mvebu_a3700_comphy_lane *lane)
++{
++	/* Set phy isolation mode */
++	comphy_lane_reg_set(lane, COMPHY_ISOLATION_CTRL,
++			    PHY_ISOLATE_MODE, PHY_ISOLATE_MODE);
++
++	/* Power off PLL, Tx, Rx */
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL,
++			    0x0, PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT);
++}
++
++static void
++mvebu_a3700_comphy_ethernet_power_off(struct mvebu_a3700_comphy_lane *lane)
++{
++	u32 mask, data;
++
++	data = PIN_RESET_CORE_BIT | PIN_RESET_COMPHY_BIT | PIN_PU_IVREF_BIT |
++	       PHY_RX_INIT_BIT;
++	mask = data;
++	comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
++}
++
++static void
++mvebu_a3700_comphy_pcie_power_off(struct mvebu_a3700_comphy_lane *lane)
++{
++	/* Power off PLL, Tx, Rx */
++	comphy_lane_reg_set(lane, COMPHY_POWER_PLL_CTRL,
++			    0x0, PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT);
++}
++
++static int mvebu_a3700_comphy_reset(struct phy *phy)
++{
++	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
++	u16 mask, data;
++
++	dev_dbg(lane->dev, "resetting lane %d\n", lane->id);
++
++	/* COMPHY reset for internal logic */
++	comphy_lane_reg_set(lane, COMPHY_SFT_RESET,
++			    SFT_RST_NO_REG, SFT_RST_NO_REG);
++
++	/* COMPHY register reset (cleared automatically) */
++	comphy_lane_reg_set(lane, COMPHY_SFT_RESET, SFT_RST, SFT_RST);
++
++	/* PIPE soft and register reset */
++	data = PIPE_SOFT_RESET | PIPE_REG_RESET;
++	mask = data;
++	comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask);
++
++	/* Release PIPE register reset */
++	comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL,
++			    0x0, PIPE_REG_RESET);
++
++	/* Reset SB configuration register (only for lanes 0 and 1) */
++	if (lane->id == 0 || lane->id == 1) {
++		u32 mask, data;
++
++		data = PIN_RESET_CORE_BIT | PIN_RESET_COMPHY_BIT |
++		       PIN_PU_PLL_BIT | PIN_PU_RX_BIT | PIN_PU_TX_BIT;
++		mask = data | PIN_PU_IVREF_BIT | PIN_TX_IDLE_BIT;
++		comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
++	}
++
++	return 0;
++}
++
++static bool mvebu_a3700_comphy_check_mode(int lane,
+ 					  enum phy_mode mode,
+ 					  int submode)
+ {
+@@ -122,7 +1141,7 @@ static int mvebu_a3700_comphy_get_fw_mod
+ 
+ 	/* Unused PHY mux value is 0x0 */
+ 	if (mode == PHY_MODE_INVALID)
+-		return -EINVAL;
++		return false;
+ 
+ 	for (i = 0; i < n; i++) {
+ 		if (mvebu_a3700_comphy_modes[i].lane == lane &&
+@@ -132,27 +1151,30 @@ static int mvebu_a3700_comphy_get_fw_mod
+ 	}
+ 
+ 	if (i == n)
+-		return -EINVAL;
++		return false;
+ 
+-	return mvebu_a3700_comphy_modes[i].fw_mode;
++	return true;
+ }
+ 
+ static int mvebu_a3700_comphy_set_mode(struct phy *phy, enum phy_mode mode,
+ 				       int submode)
+ {
+ 	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+-	int fw_mode;
+ 
+-	if (submode == PHY_INTERFACE_MODE_1000BASEX)
+-		submode = PHY_INTERFACE_MODE_SGMII;
+-
+-	fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, mode,
+-						 submode);
+-	if (fw_mode < 0) {
++	if (!mvebu_a3700_comphy_check_mode(lane->id, mode, submode)) {
+ 		dev_err(lane->dev, "invalid COMPHY mode\n");
+-		return fw_mode;
++		return -EINVAL;
+ 	}
+ 
++	/* Mode cannot be changed while the PHY is powered on */
++	if (phy->power_count &&
++	    (lane->mode != mode || lane->submode != submode))
++		return -EBUSY;
++
++	/* If changing mode, ensure reset is called */
++	if (lane->mode != PHY_MODE_INVALID && lane->mode != mode)
++		lane->needs_reset = true;
++
+ 	/* Just remember the mode, ->power_on() will do the real setup */
+ 	lane->mode = mode;
+ 	lane->submode = submode;
+@@ -163,76 +1185,77 @@ static int mvebu_a3700_comphy_set_mode(s
+ static int mvebu_a3700_comphy_power_on(struct phy *phy)
+ {
+ 	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+-	u32 fw_param;
+-	int fw_mode;
+-	int fw_port;
+ 	int ret;
+ 
+-	fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id,
+-						 lane->mode, lane->submode);
+-	if (fw_mode < 0) {
++	if (!mvebu_a3700_comphy_check_mode(lane->id, lane->mode,
++					   lane->submode)) {
+ 		dev_err(lane->dev, "invalid COMPHY mode\n");
+-		return fw_mode;
++		return -EINVAL;
++	}
++
++	if (lane->needs_reset) {
++		ret = mvebu_a3700_comphy_reset(phy);
++		if (ret)
++			return ret;
++
++		lane->needs_reset = false;
+ 	}
+ 
+ 	switch (lane->mode) {
+ 	case PHY_MODE_USB_HOST_SS:
+ 		dev_dbg(lane->dev, "set lane %d to USB3 host mode\n", lane->id);
+-		fw_param = COMPHY_FW_MODE(fw_mode);
+-		break;
++		return mvebu_a3700_comphy_usb3_power_on(lane);
+ 	case PHY_MODE_SATA:
+ 		dev_dbg(lane->dev, "set lane %d to SATA mode\n", lane->id);
+-		fw_param = COMPHY_FW_MODE(fw_mode);
+-		break;
++		return mvebu_a3700_comphy_sata_power_on(lane);
+ 	case PHY_MODE_ETHERNET:
+-		fw_port = (lane->id == 0) ? 1 : 0;
+-		switch (lane->submode) {
+-		case PHY_INTERFACE_MODE_SGMII:
+-			dev_dbg(lane->dev, "set lane %d to SGMII mode\n",
+-				lane->id);
+-			fw_param = COMPHY_FW_NET(fw_mode, fw_port,
+-						 COMPHY_FW_SPEED_1_25G);
+-			break;
+-		case PHY_INTERFACE_MODE_2500BASEX:
+-			dev_dbg(lane->dev, "set lane %d to 2500BASEX mode\n",
+-				lane->id);
+-			fw_param = COMPHY_FW_NET(fw_mode, fw_port,
+-						 COMPHY_FW_SPEED_3_125G);
+-			break;
+-		default:
+-			dev_err(lane->dev, "unsupported PHY submode (%d)\n",
+-				lane->submode);
+-			return -ENOTSUPP;
+-		}
+-		break;
++		dev_dbg(lane->dev, "set lane %d to Ethernet mode\n", lane->id);
++		return mvebu_a3700_comphy_ethernet_power_on(lane);
+ 	case PHY_MODE_PCIE:
+ 		dev_dbg(lane->dev, "set lane %d to PCIe mode\n", lane->id);
+-		fw_param = COMPHY_FW_PCIE(fw_mode, COMPHY_FW_SPEED_5G,
+-					  phy->attrs.bus_width);
+-		break;
++		return mvebu_a3700_comphy_pcie_power_on(lane);
+ 	default:
+ 		dev_err(lane->dev, "unsupported PHY mode (%d)\n", lane->mode);
+-		return -ENOTSUPP;
++		return -EOPNOTSUPP;
+ 	}
+-
+-	ret = mvebu_a3700_comphy_smc(COMPHY_SIP_POWER_ON, lane->id, fw_param);
+-	if (ret == -EOPNOTSUPP)
+-		dev_err(lane->dev,
+-			"unsupported SMC call, try updating your firmware\n");
+-
+-	return ret;
+ }
+ 
+ static int mvebu_a3700_comphy_power_off(struct phy *phy)
+ {
+ 	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+ 
+-	return mvebu_a3700_comphy_smc(COMPHY_SIP_POWER_OFF, lane->id, 0);
++	switch (lane->mode) {
++	case PHY_MODE_USB_HOST_SS:
++		/*
++		 * The USB3 MAC sets the USB3 PHY to low state, so we do not
++		 * need to power off USB3 PHY again.
++		 */
++		break;
++
++	case PHY_MODE_SATA:
++		mvebu_a3700_comphy_sata_power_off(lane);
++		break;
++
++	case PHY_MODE_ETHERNET:
++		mvebu_a3700_comphy_ethernet_power_off(lane);
++		break;
++
++	case PHY_MODE_PCIE:
++		mvebu_a3700_comphy_pcie_power_off(lane);
++		break;
++
++	default:
++		dev_err(lane->dev, "invalid COMPHY mode\n");
++		return -EINVAL;
++	}
++
++	return 0;
+ }
+ 
+ static const struct phy_ops mvebu_a3700_comphy_ops = {
+ 	.power_on	= mvebu_a3700_comphy_power_on,
+ 	.power_off	= mvebu_a3700_comphy_power_off,
++	.reset		= mvebu_a3700_comphy_reset,
+ 	.set_mode	= mvebu_a3700_comphy_set_mode,
+ 	.owner		= THIS_MODULE,
+ };
+@@ -256,13 +1279,75 @@ static struct phy *mvebu_a3700_comphy_xl
+ 		return ERR_PTR(-EINVAL);
+ 	}
+ 
++	lane->invert_tx = args->args[1] & BIT(0);
++	lane->invert_rx = args->args[1] & BIT(1);
++
+ 	return phy;
+ }
+ 
+ static int mvebu_a3700_comphy_probe(struct platform_device *pdev)
+ {
++	struct mvebu_a3700_comphy_priv *priv;
+ 	struct phy_provider *provider;
+ 	struct device_node *child;
++	struct resource *res;
++	struct clk *clk;
++	int ret;
++
++	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
++	if (!priv)
++		return -ENOMEM;
++
++	spin_lock_init(&priv->lock);
++
++	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "comphy");
++	priv->comphy_regs = devm_ioremap_resource(&pdev->dev, res);
++	if (IS_ERR(priv->comphy_regs))
++		return PTR_ERR(priv->comphy_regs);
++
++	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
++					   "lane1_pcie_gbe");
++	priv->lane1_phy_regs = devm_ioremap_resource(&pdev->dev, res);
++	if (IS_ERR(priv->lane1_phy_regs))
++		return PTR_ERR(priv->lane1_phy_regs);
++
++	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
++					   "lane0_usb3_gbe");
++	priv->lane0_phy_regs = devm_ioremap_resource(&pdev->dev, res);
++	if (IS_ERR(priv->lane0_phy_regs))
++		return PTR_ERR(priv->lane0_phy_regs);
++
++	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
++					   "lane2_sata_usb3");
++	priv->lane2_phy_indirect = devm_ioremap_resource(&pdev->dev, res);
++	if (IS_ERR(priv->lane2_phy_indirect))
++		return PTR_ERR(priv->lane2_phy_indirect);
++
++	/*
++	 * Driver needs to know if reference xtal clock is 40MHz or 25MHz.
++	 * Old DT bindings do not have xtal clk present. So do not fail here
++	 * and expects that default 25MHz reference clock is used.
++	 */
++	clk = clk_get(&pdev->dev, "xtal");
++	if (IS_ERR(clk)) {
++		if (PTR_ERR(clk) == -EPROBE_DEFER)
++			return -EPROBE_DEFER;
++		dev_warn(&pdev->dev, "missing 'xtal' clk (%ld)\n",
++			 PTR_ERR(clk));
++	} else {
++		ret = clk_prepare_enable(clk);
++		if (ret) {
++			dev_warn(&pdev->dev, "enabling xtal clk failed (%d)\n",
++				 ret);
++		} else {
++			if (clk_get_rate(clk) == 40000000)
++				priv->xtal_is_40m = true;
++			clk_disable_unprepare(clk);
++		}
++		clk_put(clk);
++	}
++
++	dev_set_drvdata(&pdev->dev, priv);
+ 
+ 	for_each_available_child_of_node(pdev->dev.of_node, child) {
+ 		struct mvebu_a3700_comphy_lane *lane;
+@@ -277,7 +1362,7 @@ static int mvebu_a3700_comphy_probe(stru
+ 			continue;
+ 		}
+ 
+-		if (lane_id >= MVEBU_A3700_COMPHY_LANES) {
++		if (lane_id >= 3) {
+ 			dev_err(&pdev->dev, "invalid 'reg' property\n");
+ 			continue;
+ 		}
+@@ -295,15 +1380,26 @@ static int mvebu_a3700_comphy_probe(stru
+ 			return PTR_ERR(phy);
+ 		}
+ 
++		lane->priv = priv;
+ 		lane->dev = &pdev->dev;
+ 		lane->mode = PHY_MODE_INVALID;
+ 		lane->submode = PHY_INTERFACE_MODE_NA;
+ 		lane->id = lane_id;
++		lane->invert_tx = false;
++		lane->invert_rx = false;
+ 		phy_set_drvdata(phy, lane);
++
++		/*
++		 * To avoid relying on the bootloader/firmware configuration,
++		 * power off all comphys.
++		 */
++		mvebu_a3700_comphy_reset(phy);
++		lane->needs_reset = false;
+ 	}
+ 
+ 	provider = devm_of_phy_provider_register(&pdev->dev,
+ 						 mvebu_a3700_comphy_xlate);
++
+ 	return PTR_ERR_OR_ZERO(provider);
+ }
+ 
+@@ -323,5 +1419,7 @@ static struct platform_driver mvebu_a370
+ module_platform_driver(mvebu_a3700_comphy_driver);
+ 
+ MODULE_AUTHOR("Miquèl Raynal <[email protected]>");
++MODULE_AUTHOR("Pali Rohár <[email protected]>");
++MODULE_AUTHOR("Marek Behún <[email protected]>");
+ MODULE_DESCRIPTION("Common PHY driver for A3700");
+ MODULE_LICENSE("GPL v2");
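
With the native implementation above in place, a consumer simply drives the lane through the generic PHY API and gets a real error, never -EOPNOTSUPP, for a valid configuration. A minimal consumer-side sketch follows, assuming a device-tree PHY handle named "sata"; the function and phandle names are illustrative, not taken from the patches:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/phy/phy.h>

static int example_enable_comphy(struct device *dev)
{
	struct phy *phy;
	int ret;

	phy = devm_phy_get(dev, "sata");	/* illustrative phandle name */
	if (IS_ERR(phy))
		return PTR_ERR(phy);

	ret = phy_init(phy);
	if (ret)
		return ret;

	/* Routes to mvebu_a3700_comphy_set_mode() in the driver above. */
	ret = phy_set_mode(phy, PHY_MODE_SATA);
	if (ret)
		goto err_exit;

	/* Runs the full native power-on sequence for the lane. */
	ret = phy_power_on(phy);
	if (ret)
		goto err_exit;

	return 0;

err_exit:
	phy_exit(phy);
	return ret;
}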

+ 32 - 0
target/linux/generic/backport-6.1/345-v5.17-arm64-dts-marvell-armada-37xx-Add-xtal-clock-to-comp.patch

@@ -0,0 +1,32 @@
+From 73a78b6130d9e13daca22b86ad52f063b9403e03 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Wed, 8 Dec 2021 03:40:35 +0100
+Subject: [PATCH 1/1] arm64: dts: marvell: armada-37xx: Add xtal clock to
+ comphy node
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The kernel driver phy-mvebu-a3700-comphy.c needs to know the rate of the
+reference xtal clock, so add the missing xtal clock source to the comphy
+device tree node. If the property is not present, the driver defaults to a
+25 MHz xtal rate (which, as far as we know, is used by all existing boards).
+
+Signed-off-by: Pali Rohár <[email protected]>
+Signed-off-by: Marek Behún <[email protected]>
+Signed-off-by: Gregory CLEMENT <[email protected]>
+---
+ arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
++++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+@@ -265,6 +265,8 @@
+ 					    "lane2_sata_usb3";
+ 				#address-cells = <1>;
+ 				#size-cells = <0>;
++				clocks = <&xtalclk>;
++				clock-names = "xtal";
+ 
+ 				comphy0: phy@0 {
+ 					reg = <0>;
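
On the driver side this clock is picked up at probe time, as in the comphy patch above; it is condensed here into a helper for illustration (the name is made up and -EPROBE_DEFER handling is omitted). A missing clock simply keeps the 25 MHz default:

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>

/* Detect a 40 MHz reference xtal; anything else (or no clock) means 25 MHz. */
static bool a3700_comphy_xtal_is_40m(struct device *dev)
{
	struct clk *clk;
	bool is_40m = false;

	clk = clk_get(dev, "xtal");
	if (IS_ERR(clk))
		return false;	/* old DT without the clock: keep the 25 MHz default */

	if (!clk_prepare_enable(clk)) {
		is_40m = (clk_get_rate(clk) == 40000000);
		clk_disable_unprepare(clk);
	}
	clk_put(clk);

	return is_40m;
}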

+ 64 - 0
target/linux/generic/backport-6.1/346-v5.18-01-Revert-ata-ahci-mvebu-Make-SATA-PHY-optional-for-Arm.patch

@@ -0,0 +1,64 @@
+From ee995101fde67f85a3cd4c74f4f92fc4592e726b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Thu, 3 Feb 2022 22:44:42 +0100
+Subject: [PATCH 1/3] Revert "ata: ahci: mvebu: Make SATA PHY optional for
+ Armada 3720"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This reverts commit 45aefe3d2251e4e229d7662052739f96ad1d08d9.
+
+The Armada 3720 PHY driver (phy-mvebu-a3700-comphy.c) no longer returns
+-EOPNOTSUPP from its phy_power_on() callback.
+
+So remove the AHCI_HFLAG_IGN_NOTSUPP_POWER_ON flag from the Armada 3720
+platform data.
+
+AHCI_HFLAG_IGN_NOTSUPP_POWER_ON is not used by any other ahci driver, so
+remove this flag completely.
+
+Signed-off-by: Pali Rohár <[email protected]>
+Signed-off-by: Marek Behún <[email protected]>
+Acked-by: Miquel Raynal <[email protected]>
+Acked-by: Damien Le Moal <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Vinod Koul <[email protected]>
+---
+ drivers/ata/ahci.h             | 2 --
+ drivers/ata/ahci_mvebu.c       | 2 +-
+ drivers/ata/libahci_platform.c | 2 +-
+ 3 files changed, 2 insertions(+), 4 deletions(-)
+
+--- a/drivers/ata/ahci.h
++++ b/drivers/ata/ahci.h
+@@ -240,8 +240,6 @@ enum {
+ 							as default lpm_policy */
+ 	AHCI_HFLAG_SUSPEND_PHYS		= (1 << 26), /* handle PHYs during
+ 							suspend/resume */
+-	AHCI_HFLAG_IGN_NOTSUPP_POWER_ON	= (1 << 27), /* ignore -EOPNOTSUPP
+-							from phy_power_on() */
+ 	AHCI_HFLAG_NO_SXS		= (1 << 28), /* SXS not supported */
+ 
+ 	/* ap->flags bits */
+--- a/drivers/ata/ahci_mvebu.c
++++ b/drivers/ata/ahci_mvebu.c
+@@ -227,7 +227,7 @@ static const struct ahci_mvebu_plat_data
+ 
+ static const struct ahci_mvebu_plat_data ahci_mvebu_armada_3700_plat_data = {
+ 	.plat_config = ahci_mvebu_armada_3700_config,
+-	.flags = AHCI_HFLAG_SUSPEND_PHYS | AHCI_HFLAG_IGN_NOTSUPP_POWER_ON,
++	.flags = AHCI_HFLAG_SUSPEND_PHYS,
+ };
+ 
+ static const struct of_device_id ahci_mvebu_of_match[] = {
+--- a/drivers/ata/libahci_platform.c
++++ b/drivers/ata/libahci_platform.c
+@@ -59,7 +59,7 @@ int ahci_platform_enable_phys(struct ahc
+ 		}
+ 
+ 		rc = phy_power_on(hpriv->phys[i]);
+-		if (rc && !(rc == -EOPNOTSUPP && (hpriv->flags & AHCI_HFLAG_IGN_NOTSUPP_POWER_ON))) {
++		if (rc) {
+ 			phy_exit(hpriv->phys[i]);
+ 			goto disable_phys;
+ 		}

+ 166 - 0
target/linux/generic/backport-6.1/346-v5.18-02-Revert-usb-host-xhci-mvebu-make-USB-3.0-PHY-optional.patch

@@ -0,0 +1,166 @@
+From 8e10548f7f4814e530857d2049d6af6bc78add53 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Thu, 3 Feb 2022 22:44:43 +0100
+Subject: [PATCH 2/3] Revert "usb: host: xhci: mvebu: make USB 3.0 PHY optional
+ for Armada 3720"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This reverts commit 3241929b67d28c83945d3191c6816a3271fd6b85.
+
+Armada 3720 phy driver (phy-mvebu-a3700-comphy.c) does not return
+-EOPNOTSUPP from phy_power_on() callback anymore.
+
+So remove XHCI_SKIP_PHY_INIT flag from xhci_mvebu_a3700_plat_setup() and
+then also whole xhci_mvebu_a3700_plat_setup() function which is there just
+to handle -EOPNOTSUPP for XHCI_SKIP_PHY_INIT.
+
+xhci plat_setup callback is not used by any other xhci plat driver, so
+remove this callback completely.
+
+Signed-off-by: Pali Rohár <[email protected]>
+Signed-off-by: Marek Behún <[email protected]>
+Acked-by: Miquel Raynal <[email protected]>
+Acked-by: Greg Kroah-Hartman <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Vinod Koul <[email protected]>
+---
+ drivers/usb/host/xhci-mvebu.c | 42 -----------------------------------
+ drivers/usb/host/xhci-mvebu.h |  6 -----
+ drivers/usb/host/xhci-plat.c  | 20 +----------------
+ drivers/usb/host/xhci-plat.h  |  1 -
+ 4 files changed, 1 insertion(+), 68 deletions(-)
+
+--- a/drivers/usb/host/xhci-mvebu.c
++++ b/drivers/usb/host/xhci-mvebu.c
+@@ -8,7 +8,6 @@
+ #include <linux/mbus.h>
+ #include <linux/of.h>
+ #include <linux/platform_device.h>
+-#include <linux/phy/phy.h>
+ 
+ #include <linux/usb.h>
+ #include <linux/usb/hcd.h>
+@@ -74,47 +73,6 @@ int xhci_mvebu_mbus_init_quirk(struct us
+ 
+ 	return 0;
+ }
+-
+-int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd)
+-{
+-	struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+-	struct device *dev = hcd->self.controller;
+-	struct phy *phy;
+-	int ret;
+-
+-	/* Old bindings miss the PHY handle */
+-	phy = of_phy_get(dev->of_node, "usb3-phy");
+-	if (IS_ERR(phy) && PTR_ERR(phy) == -EPROBE_DEFER)
+-		return -EPROBE_DEFER;
+-	else if (IS_ERR(phy))
+-		goto phy_out;
+-
+-	ret = phy_init(phy);
+-	if (ret)
+-		goto phy_put;
+-
+-	ret = phy_set_mode(phy, PHY_MODE_USB_HOST_SS);
+-	if (ret)
+-		goto phy_exit;
+-
+-	ret = phy_power_on(phy);
+-	if (ret == -EOPNOTSUPP) {
+-		/* Skip initializatin of XHCI PHY when it is unsupported by firmware */
+-		dev_warn(dev, "PHY unsupported by firmware\n");
+-		xhci->quirks |= XHCI_SKIP_PHY_INIT;
+-	}
+-	if (ret)
+-		goto phy_exit;
+-
+-	phy_power_off(phy);
+-phy_exit:
+-	phy_exit(phy);
+-phy_put:
+-	of_phy_put(phy);
+-phy_out:
+-
+-	return 0;
+-}
+ 
+ int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd)
+ {
+--- a/drivers/usb/host/xhci-mvebu.h
++++ b/drivers/usb/host/xhci-mvebu.h
+@@ -12,18 +12,12 @@ struct usb_hcd;
+ 
+ #if IS_ENABLED(CONFIG_USB_XHCI_MVEBU)
+ int xhci_mvebu_mbus_init_quirk(struct usb_hcd *hcd);
+-int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd);
+ int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd);
+ #else
+ static inline int xhci_mvebu_mbus_init_quirk(struct usb_hcd *hcd)
+ {
+ 	return 0;
+ }
+-
+-static inline int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd)
+-{
+-	return 0;
+-}
+ 
+ static inline int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd)
+ {
+--- a/drivers/usb/host/xhci-plat.c
++++ b/drivers/usb/host/xhci-plat.c
+@@ -44,16 +44,6 @@ static void xhci_priv_plat_start(struct
+ 		priv->plat_start(hcd);
+ }
+ 
+-static int xhci_priv_plat_setup(struct usb_hcd *hcd)
+-{
+-	struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
+-
+-	if (!priv->plat_setup)
+-		return 0;
+-
+-	return priv->plat_setup(hcd);
+-}
+-
+ static int xhci_priv_init_quirk(struct usb_hcd *hcd)
+ {
+ 	struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
+@@ -121,7 +111,6 @@ static const struct xhci_plat_priv xhci_
+ };
+ 
+ static const struct xhci_plat_priv xhci_plat_marvell_armada3700 = {
+-	.plat_setup = xhci_mvebu_a3700_plat_setup,
+ 	.init_quirk = xhci_mvebu_a3700_init_quirk,
+ };
+ 
+@@ -341,14 +330,7 @@ static int xhci_plat_probe(struct platfo
+ 
+ 	hcd->tpl_support = of_usb_host_tpl_support(sysdev->of_node);
+ 	xhci->shared_hcd->tpl_support = hcd->tpl_support;
+-
+-	if (priv) {
+-		ret = xhci_priv_plat_setup(hcd);
+-		if (ret)
+-			goto disable_usb_phy;
+-	}
+-
+-	if ((xhci->quirks & XHCI_SKIP_PHY_INIT) || (priv && (priv->quirks & XHCI_SKIP_PHY_INIT)))
++	if (priv && (priv->quirks & XHCI_SKIP_PHY_INIT))
+ 		hcd->skip_phy_initialization = 1;
+ 
+ 	if (priv && (priv->quirks & XHCI_SG_TRB_CACHE_SIZE_QUIRK))
+--- a/drivers/usb/host/xhci-plat.h
++++ b/drivers/usb/host/xhci-plat.h
+@@ -13,7 +13,6 @@
+ struct xhci_plat_priv {
+ 	const char *firmware_name;
+ 	unsigned long long quirks;
+-	int (*plat_setup)(struct usb_hcd *);
+ 	void (*plat_start)(struct usb_hcd *);
+ 	int (*init_quirk)(struct usb_hcd *);
+ 	int (*suspend_quirk)(struct usb_hcd *);

+ 39 - 0
target/linux/generic/backport-6.1/346-v5.18-03-Revert-PCI-aardvark-Fix-initialization-with-old-Marv.patch

@@ -0,0 +1,39 @@
+From 9a4556dad7bd0a6b8339cb72e169f5c76f2af6f1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Thu, 3 Feb 2022 22:44:44 +0100
+Subject: [PATCH 3/3] Revert "PCI: aardvark: Fix initialization with old
+ Marvell's Arm Trusted Firmware"
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This reverts commit b0c6ae0f8948a2be6bf4e8b4bbab9ca1343289b6.
+
+Armada 3720 phy driver (phy-mvebu-a3700-comphy.c) does not return
+-EOPNOTSUPP from phy_power_on() callback anymore.
+
+So remove dead code which handles -EOPNOTSUPP return value.
+
+Signed-off-by: Pali Rohár <[email protected]>
+Signed-off-by: Marek Behún <[email protected]>
+Acked-by: Miquel Raynal <[email protected]>
+Acked-by: Lorenzo Pieralisi <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Vinod Koul <[email protected]>
+---
+ drivers/pci/controller/pci-aardvark.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/drivers/pci/controller/pci-aardvark.c
++++ b/drivers/pci/controller/pci-aardvark.c
+@@ -1642,9 +1642,7 @@ static int advk_pcie_enable_phy(struct a
+ 	}
+ 
+ 	ret = phy_power_on(pcie->phy);
+-	if (ret == -EOPNOTSUPP) {
+-		dev_warn(&pcie->pdev->dev, "PHY unsupported by firmware\n");
+-	} else if (ret) {
++	if (ret) {
+ 		phy_exit(pcie->phy);
+ 		return ret;
+ 	}

+ 194 - 0
target/linux/generic/backport-6.1/347-v6.0-phy-marvell-phy-mvebu-a3700-comphy-Remove-broken-res.patch

@@ -0,0 +1,194 @@
+From 0a6fc70d76bddf98278af2ac000379c82aec8f11 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <[email protected]>
+Date: Mon, 29 Aug 2022 10:30:46 +0200
+Subject: [PATCH] phy: marvell: phy-mvebu-a3700-comphy: Remove broken reset
+ support
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Reset support for SATA PHY is somehow broken and after calling it, kernel
+is not able to detect and initialize SATA disk Samsung SSD 850 EMT0 [1].
+
+Reset support was introduced in commit 934337080c6c ("phy: marvell:
+phy-mvebu-a3700-comphy: Add native kernel implementation") as part of
+complete rewrite of this driver. v1 patch series of that commit [2] did
+not contain reset support and was tested that is working fine with
+Ethernet, SATA and USB PHYs without issues too.
+
+So for now remove broken reset support and change implementation of
+power_off callback to power off all functions on specified lane (and not
+only selected function) because during startup kernel does not know which
+function was selected and configured by bootloader. Same logic was used
+also in v1 patch series of that commit.
+
+This change fixes issues with initialization of SATA disk Samsung SSD 850
+and disk is working again, like before mentioned commit.
+
+Once problem with PHY reset callback is solved its functionality could be
+re-introduced. But for now it is unknown why it does not work.
+
+[1] - https://lore.kernel.org/r/20220531124159.3e4lgn2v462irbtz@shindev/
+[2] - https://lore.kernel.org/r/[email protected]/
+
+Reported-by: Shinichiro Kawasaki <[email protected]>
+Fixes: 934337080c6c ("phy: marvell: phy-mvebu-a3700-comphy: Add native kernel implementation")
+Cc: [email protected] # v5.18+
+Signed-off-by: Pali Rohár <[email protected]>
+Tested-by: Shinichiro Kawasaki <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Vinod Koul <[email protected]>
+---
+ drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 87 ++++----------------
+ 1 file changed, 17 insertions(+), 70 deletions(-)
+
+--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
++++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+@@ -274,7 +274,6 @@ struct mvebu_a3700_comphy_lane {
+ 	int submode;
+ 	bool invert_tx;
+ 	bool invert_rx;
+-	bool needs_reset;
+ };
+ 
+ struct gbe_phy_init_data_fix {
+@@ -1097,40 +1096,12 @@ mvebu_a3700_comphy_pcie_power_off(struct
+ 			    0x0, PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT);
+ }
+ 
+-static int mvebu_a3700_comphy_reset(struct phy *phy)
++static void mvebu_a3700_comphy_usb3_power_off(struct mvebu_a3700_comphy_lane *lane)
+ {
+-	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+-	u16 mask, data;
+-
+-	dev_dbg(lane->dev, "resetting lane %d\n", lane->id);
+-
+-	/* COMPHY reset for internal logic */
+-	comphy_lane_reg_set(lane, COMPHY_SFT_RESET,
+-			    SFT_RST_NO_REG, SFT_RST_NO_REG);
+-
+-	/* COMPHY register reset (cleared automatically) */
+-	comphy_lane_reg_set(lane, COMPHY_SFT_RESET, SFT_RST, SFT_RST);
+-
+-	/* PIPE soft and register reset */
+-	data = PIPE_SOFT_RESET | PIPE_REG_RESET;
+-	mask = data;
+-	comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask);
+-
+-	/* Release PIPE register reset */
+-	comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL,
+-			    0x0, PIPE_REG_RESET);
+-
+-	/* Reset SB configuration register (only for lanes 0 and 1) */
+-	if (lane->id == 0 || lane->id == 1) {
+-		u32 mask, data;
+-
+-		data = PIN_RESET_CORE_BIT | PIN_RESET_COMPHY_BIT |
+-		       PIN_PU_PLL_BIT | PIN_PU_RX_BIT | PIN_PU_TX_BIT;
+-		mask = data | PIN_PU_IVREF_BIT | PIN_TX_IDLE_BIT;
+-		comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
+-	}
+-
+-	return 0;
++	/*
++	 * The USB3 MAC sets the USB3 PHY to low state, so we do not
++	 * need to power off USB3 PHY again.
++	 */
+ }
+ 
+ static bool mvebu_a3700_comphy_check_mode(int lane,
+@@ -1171,10 +1142,6 @@ static int mvebu_a3700_comphy_set_mode(s
+ 	    (lane->mode != mode || lane->submode != submode))
+ 		return -EBUSY;
+ 
+-	/* If changing mode, ensure reset is called */
+-	if (lane->mode != PHY_MODE_INVALID && lane->mode != mode)
+-		lane->needs_reset = true;
+-
+ 	/* Just remember the mode, ->power_on() will do the real setup */
+ 	lane->mode = mode;
+ 	lane->submode = submode;
+@@ -1185,7 +1152,6 @@ static int mvebu_a3700_comphy_set_mode(s
+ static int mvebu_a3700_comphy_power_on(struct phy *phy)
+ {
+ 	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+-	int ret;
+ 
+ 	if (!mvebu_a3700_comphy_check_mode(lane->id, lane->mode,
+ 					   lane->submode)) {
+@@ -1193,14 +1159,6 @@ static int mvebu_a3700_comphy_power_on(s
+ 		return -EINVAL;
+ 	}
+ 
+-	if (lane->needs_reset) {
+-		ret = mvebu_a3700_comphy_reset(phy);
+-		if (ret)
+-			return ret;
+-
+-		lane->needs_reset = false;
+-	}
+-
+ 	switch (lane->mode) {
+ 	case PHY_MODE_USB_HOST_SS:
+ 		dev_dbg(lane->dev, "set lane %d to USB3 host mode\n", lane->id);
+@@ -1224,38 +1182,28 @@ static int mvebu_a3700_comphy_power_off(
+ {
+ 	struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
+ 
+-	switch (lane->mode) {
+-	case PHY_MODE_USB_HOST_SS:
+-		/*
+-		 * The USB3 MAC sets the USB3 PHY to low state, so we do not
+-		 * need to power off USB3 PHY again.
+-		 */
+-		break;
+-
+-	case PHY_MODE_SATA:
+-		mvebu_a3700_comphy_sata_power_off(lane);
+-		break;
+-
+-	case PHY_MODE_ETHERNET:
++	switch (lane->id) {
++	case 0:
++		mvebu_a3700_comphy_usb3_power_off(lane);
+ 		mvebu_a3700_comphy_ethernet_power_off(lane);
+-		break;
+-
+-	case PHY_MODE_PCIE:
++		return 0;
++	case 1:
+ 		mvebu_a3700_comphy_pcie_power_off(lane);
+-		break;
+-
++		mvebu_a3700_comphy_ethernet_power_off(lane);
++		return 0;
++	case 2:
++		mvebu_a3700_comphy_usb3_power_off(lane);
++		mvebu_a3700_comphy_sata_power_off(lane);
++		return 0;
+ 	default:
+ 		dev_err(lane->dev, "invalid COMPHY mode\n");
+ 		return -EINVAL;
+ 	}
+-
+-	return 0;
+ }
+ 
+ static const struct phy_ops mvebu_a3700_comphy_ops = {
+ 	.power_on	= mvebu_a3700_comphy_power_on,
+ 	.power_off	= mvebu_a3700_comphy_power_off,
+-	.reset		= mvebu_a3700_comphy_reset,
+ 	.set_mode	= mvebu_a3700_comphy_set_mode,
+ 	.owner		= THIS_MODULE,
+ };
+@@ -1393,8 +1341,7 @@ static int mvebu_a3700_comphy_probe(stru
+ 		 * To avoid relying on the bootloader/firmware configuration,
+ 		 * power off all comphys.
+ 		 */
+-		mvebu_a3700_comphy_reset(phy);
+-		lane->needs_reset = false;
++		mvebu_a3700_comphy_power_off(phy);
+ 	}
+ 
+ 	provider = devm_of_phy_provider_register(&pdev->dev,

+ 90 - 0
target/linux/generic/backport-6.1/350-v5.18-regmap-add-configurable-downshift-for-addresses.patch

@@ -0,0 +1,90 @@
+From 86fc59ef818beb0e1945d17f8e734898baba7e4e Mon Sep 17 00:00:00 2001
+From: Colin Foster <[email protected]>
+Date: Sun, 13 Mar 2022 15:45:23 -0700
+Subject: [PATCH 1/2] regmap: add configurable downshift for addresses
+
+Add an additional reg_downshift to be applied to register addresses before
+any register accesses. An example of a device that uses this is a VSC7514
+chip, which require each register address to be downshifted by two if the
+access is performed over a SPI bus.
+
+Signed-off-by: Colin Foster <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Mark Brown <[email protected]>
+---
+ drivers/base/regmap/internal.h | 1 +
+ drivers/base/regmap/regmap.c   | 5 +++++
+ include/linux/regmap.h         | 3 +++
+ 3 files changed, 9 insertions(+)
+
+--- a/drivers/base/regmap/internal.h
++++ b/drivers/base/regmap/internal.h
+@@ -31,6 +31,7 @@ struct regmap_format {
+ 	size_t buf_size;
+ 	size_t reg_bytes;
+ 	size_t pad_bytes;
++	size_t reg_downshift;
+ 	size_t val_bytes;
+ 	void (*format_write)(struct regmap *map,
+ 			     unsigned int reg, unsigned int val);
+--- a/drivers/base/regmap/regmap.c
++++ b/drivers/base/regmap/regmap.c
+@@ -823,6 +823,7 @@ struct regmap *__regmap_init(struct devi
+ 
+ 	map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
+ 	map->format.pad_bytes = config->pad_bits / 8;
++	map->format.reg_downshift = config->reg_downshift;
+ 	map->format.val_bytes = DIV_ROUND_UP(config->val_bits, 8);
+ 	map->format.buf_size = DIV_ROUND_UP(config->reg_bits +
+ 			config->val_bits + config->pad_bits, 8);
+@@ -1735,6 +1736,7 @@ static int _regmap_raw_write_impl(struct
+ 			return ret;
+ 	}
+ 
++	reg >>= map->format.reg_downshift;
+ 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
+ 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
+ 				      map->write_flag_mask);
+@@ -1905,6 +1907,7 @@ static int _regmap_bus_formatted_write(v
+ 			return ret;
+ 	}
+ 
++	reg >>= map->format.reg_downshift;
+ 	map->format.format_write(map, reg, val);
+ 
+ 	trace_regmap_hw_write_start(map, reg, 1);
+@@ -2346,6 +2349,7 @@ static int _regmap_raw_multi_reg_write(s
+ 		unsigned int reg = regs[i].reg;
+ 		unsigned int val = regs[i].def;
+ 		trace_regmap_hw_write_start(map, reg, 1);
++		reg >>= map->format.reg_downshift;
+ 		map->format.format_reg(u8, reg, map->reg_shift);
+ 		u8 += reg_bytes + pad_bytes;
+ 		map->format.format_val(u8, val, 0);
+@@ -2673,6 +2677,7 @@ static int _regmap_raw_read(struct regma
+ 			return ret;
+ 	}
+ 
++	reg >>= map->format.reg_downshift;
+ 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
+ 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
+ 				      map->read_flag_mask);
+--- a/include/linux/regmap.h
++++ b/include/linux/regmap.h
+@@ -237,6 +237,8 @@ typedef void (*regmap_unlock)(void *);
+  * @reg_stride: The register address stride. Valid register addresses are a
+  *              multiple of this value. If set to 0, a value of 1 will be
+  *              used.
++ * @reg_downshift: The number of bits to downshift the register before
++ *		   performing any operations.
+  * @pad_bits: Number of bits of padding between register and value.
+  * @val_bits: Number of bits in a register value, mandatory.
+  *
+@@ -360,6 +362,7 @@ struct regmap_config {
+ 
+ 	int reg_bits;
+ 	int reg_stride;
++	int reg_downshift;
+ 	int pad_bits;
+ 	int val_bits;
+ 
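
As a hypothetical illustration of the new field (the values below are invented, not taken from a real driver), a SPI-attached register map whose addresses must be divided by four before they go on the wire could be described as:

#include <linux/regmap.h>

/* Illustrative only: 32-bit registers at 4-byte stride, addresses shifted right by 2 on the bus. */
static const struct regmap_config example_downshift_regmap_config = {
        .reg_bits      = 32,
        .val_bits      = 32,
        .reg_stride    = 4,
        .reg_downshift = 2,
};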

+ 95 - 0
target/linux/generic/backport-6.1/351-v5.18-regmap-allow-a-defined-reg_base-to-be-added-to-every.patch

@@ -0,0 +1,95 @@
+From 0074f3f2b1e43d3cedd97e47fb6980db6d2ba79e Mon Sep 17 00:00:00 2001
+From: Colin Foster <[email protected]>
+Date: Sun, 13 Mar 2022 15:45:24 -0700
+Subject: [PATCH 2/2] regmap: allow a defined reg_base to be added to every
+ address
+
+There's an inconsistency that arises when a register set can be accessed
+internally via MMIO, or externally via SPI. The VSC7514 chip allows both
+modes of operation. When internally accessed, the system utilizes __iomem,
+devm_ioremap_resource, and devm_regmap_init_mmio.
+
+For SPI it isn't possible to utilize memory-mapped IO. To properly operate,
+the resource base must be added to the register before every operation.
+
+Signed-off-by: Colin Foster <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Mark Brown <[email protected]>
+---
+ drivers/base/regmap/internal.h | 1 +
+ drivers/base/regmap/regmap.c   | 6 ++++++
+ include/linux/regmap.h         | 3 +++
+ 3 files changed, 10 insertions(+)
+
+--- a/drivers/base/regmap/internal.h
++++ b/drivers/base/regmap/internal.h
+@@ -63,6 +63,7 @@ struct regmap {
+ 	regmap_unlock unlock;
+ 	void *lock_arg; /* This is passed to lock/unlock functions */
+ 	gfp_t alloc_flags;
++	unsigned int reg_base;
+ 
+ 	struct device *dev; /* Device we do I/O on */
+ 	void *work_buf;     /* Scratch buffer used to format I/O */
+--- a/drivers/base/regmap/regmap.c
++++ b/drivers/base/regmap/regmap.c
+@@ -821,6 +821,8 @@ struct regmap *__regmap_init(struct devi
+ 	else
+ 		map->alloc_flags = GFP_KERNEL;
+ 
++	map->reg_base = config->reg_base;
++
+ 	map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
+ 	map->format.pad_bytes = config->pad_bits / 8;
+ 	map->format.reg_downshift = config->reg_downshift;
+@@ -1736,6 +1738,7 @@ static int _regmap_raw_write_impl(struct
+ 			return ret;
+ 	}
+ 
++	reg += map->reg_base;
+ 	reg >>= map->format.reg_downshift;
+ 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
+ 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
+@@ -1907,6 +1910,7 @@ static int _regmap_bus_formatted_write(v
+ 			return ret;
+ 	}
+ 
++	reg += map->reg_base;
+ 	reg >>= map->format.reg_downshift;
+ 	map->format.format_write(map, reg, val);
+ 
+@@ -2349,6 +2353,7 @@ static int _regmap_raw_multi_reg_write(s
+ 		unsigned int reg = regs[i].reg;
+ 		unsigned int val = regs[i].def;
+ 		trace_regmap_hw_write_start(map, reg, 1);
++		reg += map->reg_base;
+ 		reg >>= map->format.reg_downshift;
+ 		map->format.format_reg(u8, reg, map->reg_shift);
+ 		u8 += reg_bytes + pad_bytes;
+@@ -2677,6 +2682,7 @@ static int _regmap_raw_read(struct regma
+ 			return ret;
+ 	}
+ 
++	reg += map->reg_base;
+ 	reg >>= map->format.reg_downshift;
+ 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
+ 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
+--- a/include/linux/regmap.h
++++ b/include/linux/regmap.h
+@@ -239,6 +239,8 @@ typedef void (*regmap_unlock)(void *);
+  *              used.
+  * @reg_downshift: The number of bits to downshift the register before
+  *		   performing any operations.
++ * @reg_base: Value to be added to every register address before performing any
++ *	      operation.
+  * @pad_bits: Number of bits of padding between register and value.
+  * @val_bits: Number of bits in a register value, mandatory.
+  *
+@@ -363,6 +365,7 @@ struct regmap_config {
+ 	int reg_bits;
+ 	int reg_stride;
+ 	int reg_downshift;
++	unsigned int reg_base;
+ 	int pad_bits;
+ 	int val_bits;
+ 
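
A hypothetical counterpart for reg_base: when the same register block sits behind an external bus window, the window offset can be folded into every access. The base value below is invented; per the code above, reg_base is added before reg_downshift is applied.

#include <linux/regmap.h>

/* Illustrative only: reg_base is added first, then the address is downshifted. */
static const struct regmap_config example_ext_window_regmap_config = {
        .reg_bits      = 32,
        .val_bits      = 32,
        .reg_stride    = 4,
        .reg_base      = 0x71000000,    /* hypothetical window offset */
        .reg_downshift = 2,
};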

+ 57 - 0
target/linux/generic/backport-6.1/352-v6.3-regmap-apply-reg_base-and-reg_downshift-for-single-r.patch

@@ -0,0 +1,57 @@
+From 697c3892d825fb78f42ec8e53bed065dd728db3e Mon Sep 17 00:00:00 2001
+From: Daniel Golle <[email protected]>
+Date: Mon, 30 Jan 2023 02:04:57 +0000
+Subject: [PATCH] regmap: apply reg_base and reg_downshift for single register
+ ops
+
+reg_base and reg_downshift currently don't have any effect if used with
+a regmap_bus or regmap_config which only offers single register
+operations (ie. reg_read, reg_write and optionally reg_update_bits).
+
+Fix that and take them into account also for regmap_bus with only
+reg_read and read_write operations by applying reg_base and
+reg_downshift in _regmap_bus_reg_write, _regmap_bus_reg_read.
+
+Also apply reg_base and reg_downshift in _regmap_update_bits, but only
+in case the operation is carried out with a reg_update_bits call
+defined in either regmap_bus or regmap_config.
+
+Fixes: 0074f3f2b1e43d ("regmap: allow a defined reg_base to be added to every address")
+Fixes: 86fc59ef818beb ("regmap: add configurable downshift for addresses")
+Signed-off-by: Daniel Golle <[email protected]>
+Tested-by: Colin Foster <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Mark Brown <[email protected]>
+---
+ drivers/base/regmap/regmap.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/base/regmap/regmap.c
++++ b/drivers/base/regmap/regmap.c
+@@ -1929,6 +1929,8 @@ static int _regmap_bus_reg_write(void *c
+ {
+ 	struct regmap *map = context;
+ 
++	reg += map->reg_base;
++	reg >>= map->format.reg_downshift;
+ 	return map->bus->reg_write(map->bus_context, reg, val);
+ }
+ 
+@@ -2703,6 +2705,8 @@ static int _regmap_bus_reg_read(void *co
+ {
+ 	struct regmap *map = context;
+ 
++	reg += map->reg_base;
++	reg >>= map->format.reg_downshift;
+ 	return map->bus->reg_read(map->bus_context, reg, val);
+ }
+ 
+@@ -3078,6 +3082,8 @@ static int _regmap_update_bits(struct re
+ 		*change = false;
+ 
+ 	if (regmap_volatile(map, reg) && map->reg_update_bits) {
++		reg += map->reg_base;
++		reg >>= map->format.reg_downshift;
+ 		ret = map->reg_update_bits(map->bus_context, reg, mask, val);
+ 		if (ret == 0 && change)
+ 			*change = true;
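
With this fix, every access path (formatted, raw, and the single-register reg_read/reg_write/reg_update_bits callbacks) applies the same address transformation, which reduces to the following sketch:

/* Effective wire address derived from the regmap_config fields (sketch only). */
static inline unsigned int example_effective_reg(unsigned int reg,
                                                 unsigned int reg_base,
                                                 unsigned int reg_downshift)
{
        return (reg + reg_base) >> reg_downshift;
}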

+ 72 - 0
target/linux/generic/backport-6.1/400-v5.19-mtd-call-of_platform_populate-for-MTD-partitions.patch

@@ -0,0 +1,72 @@
+From bcdf0315a61a29eb753a607d3a85a4032de72d94 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Tue, 10 May 2022 15:12:59 +0200
+Subject: [PATCH] mtd: call of_platform_populate() for MTD partitions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Until this change MTD subsystem supported handling partitions only with
+MTD partitions parsers. That's a specific / limited API designed around
+partitions.
+
+Some MTD partitions may however require different handling. They may
+contain specific data that needs to be parsed and somehow extracted. For
+that purpose MTD subsystem should allow binding of standard platform
+drivers.
+
+An example can be U-Boot (sub)partition with environment variables.
+There exist a "u-boot,env" DT binding for MTD (sub)partition that
+requires an NVMEM driver.
+
+Ref: 5db1c2dbc04c ("dt-bindings: nvmem: add U-Boot environment variables binding")
+Signed-off-by: Rafał Miłecki <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdpart.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/drivers/mtd/mtdpart.c
++++ b/drivers/mtd/mtdpart.c
+@@ -17,6 +17,7 @@
+ #include <linux/mtd/partitions.h>
+ #include <linux/err.h>
+ #include <linux/of.h>
++#include <linux/of_platform.h>
+ 
+ #include "mtdcore.h"
+ 
+@@ -577,10 +578,16 @@ static int mtd_part_of_parse(struct mtd_
+ 	struct mtd_part_parser *parser;
+ 	struct device_node *np;
+ 	struct property *prop;
++	struct device *dev;
+ 	const char *compat;
+ 	const char *fixed = "fixed-partitions";
+ 	int ret, err = 0;
+ 
++	dev = &master->dev;
++	/* Use parent device (controller) if the top level MTD is not registered */
++	if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master))
++		dev = master->dev.parent;
++
+ 	np = mtd_get_of_node(master);
+ 	if (mtd_is_partition(master))
+ 		of_node_get(np);
+@@ -593,6 +600,7 @@ static int mtd_part_of_parse(struct mtd_
+ 			continue;
+ 		ret = mtd_part_do_parse(parser, master, pparts, NULL);
+ 		if (ret > 0) {
++			of_platform_populate(np, NULL, NULL, dev);
+ 			of_node_put(np);
+ 			return ret;
+ 		}
+@@ -600,6 +608,7 @@ static int mtd_part_of_parse(struct mtd_
+ 		if (ret < 0 && !err)
+ 			err = ret;
+ 	}
++	of_platform_populate(np, NULL, NULL, dev);
+ 	of_node_put(np);
+ 
+ 	/*

+ 302 - 0
target/linux/generic/backport-6.1/401-v6.0-mtd-parsers-add-support-for-Sercomm-partitions.patch

@@ -0,0 +1,302 @@
+From 9b78ef0c7997052e9eaa0f7a4513d546fa17358c Mon Sep 17 00:00:00 2001
+From: Mikhail Zhilkin <[email protected]>
+Date: Sun, 29 May 2022 11:07:14 +0000
+Subject: [PATCH] mtd: parsers: add support for Sercomm partitions
+
+This adds an MTD partition parser for the Sercomm partition table that
+is used in some Beeline, Netgear and Sercomm routers.
+
+The Sercomm partition map table contains real partition offsets, which
+may differ from device to device depending on the number and location of
+bad blocks on NAND.
+
+Original patch (proposed by NOGUCHI Hiroshi):
+Link: https://github.com/openwrt/openwrt/pull/1318#issuecomment-420607394
+
+Signed-off-by: NOGUCHI Hiroshi <[email protected]>
+Signed-off-by: Mikhail Zhilkin <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/parsers/Kconfig  |   9 ++
+ drivers/mtd/parsers/Makefile |   1 +
+ drivers/mtd/parsers/scpart.c | 248 +++++++++++++++++++++++++++++++++++
+ 3 files changed, 258 insertions(+)
+ create mode 100644 drivers/mtd/parsers/scpart.c
+
+--- a/drivers/mtd/parsers/Kconfig
++++ b/drivers/mtd/parsers/Kconfig
+@@ -186,3 +186,12 @@ config MTD_QCOMSMEM_PARTS
+ 	help
+ 	  This provides support for parsing partitions from Shared Memory (SMEM)
+ 	  for NAND and SPI flash on Qualcomm platforms.
++
++config MTD_SERCOMM_PARTS
++	tristate "Sercomm partition table parser"
++	depends on MTD && RALINK
++	help
++	  This provides partitions table parser for devices with Sercomm
++	  partition map. This partition table contains real partition
++	  offsets, which may differ from device to device depending on the
++	  number and location of bad blocks on NAND.
+--- a/drivers/mtd/parsers/Makefile
++++ b/drivers/mtd/parsers/Makefile
+@@ -10,6 +10,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)
+ obj-$(CONFIG_MTD_PARSER_IMAGETAG)	+= parser_imagetag.o
+ obj-$(CONFIG_MTD_AFS_PARTS)		+= afs.o
+ obj-$(CONFIG_MTD_PARSER_TRX)		+= parser_trx.o
++obj-$(CONFIG_MTD_SERCOMM_PARTS)		+= scpart.o
+ obj-$(CONFIG_MTD_SHARPSL_PARTS)		+= sharpslpart.o
+ obj-$(CONFIG_MTD_REDBOOT_PARTS)		+= redboot.o
+ obj-$(CONFIG_MTD_QCOMSMEM_PARTS)	+= qcomsmempart.o
+--- /dev/null
++++ b/drivers/mtd/parsers/scpart.c
+@@ -0,0 +1,248 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ *    drivers/mtd/scpart.c: Sercomm Partition Parser
++ *
++ *    Copyright (C) 2018 NOGUCHI Hiroshi
++ *    Copyright (C) 2022 Mikhail Zhilkin
++ */
++
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/mtd/mtd.h>
++#include <linux/mtd/partitions.h>
++#include <linux/module.h>
++
++#define	MOD_NAME	"scpart"
++
++#ifdef pr_fmt
++#undef pr_fmt
++#endif
++
++#define pr_fmt(fmt) MOD_NAME ": " fmt
++
++#define	ID_ALREADY_FOUND	0xffffffffUL
++
++#define	MAP_OFFS_IN_BLK		0x800
++#define	MAP_MIRROR_NUM		2
++
++static const char sc_part_magic[] = {
++	'S', 'C', 'F', 'L', 'M', 'A', 'P', 'O', 'K', '\0',
++};
++#define	PART_MAGIC_LEN		sizeof(sc_part_magic)
++
++/* assumes that all fields are set by CPU native endian */
++struct sc_part_desc {
++	uint32_t	part_id;
++	uint32_t	part_offs;
++	uint32_t	part_bytes;
++};
++
++static uint32_t scpart_desc_is_valid(struct sc_part_desc *pdesc)
++{
++	return ((pdesc->part_id != 0xffffffffUL) &&
++		(pdesc->part_offs != 0xffffffffUL) &&
++		(pdesc->part_bytes != 0xffffffffUL));
++}
++
++static int scpart_scan_partmap(struct mtd_info *master, loff_t partmap_offs,
++			       struct sc_part_desc **ppdesc)
++{
++	int cnt = 0;
++	int res = 0;
++	int res2;
++	loff_t offs;
++	size_t retlen;
++	struct sc_part_desc *pdesc = NULL;
++	struct sc_part_desc *tmpdesc;
++	uint8_t *buf;
++
++	buf = kzalloc(master->erasesize, GFP_KERNEL);
++	if (!buf) {
++		res = -ENOMEM;
++		goto out;
++	}
++
++	res2 = mtd_read(master, partmap_offs, master->erasesize, &retlen, buf);
++	if (res2 || retlen != master->erasesize) {
++		res = -EIO;
++		goto free;
++	}
++
++	for (offs = MAP_OFFS_IN_BLK;
++	     offs < master->erasesize - sizeof(*tmpdesc);
++	     offs += sizeof(*tmpdesc)) {
++		tmpdesc = (struct sc_part_desc *)&buf[offs];
++		if (!scpart_desc_is_valid(tmpdesc))
++			break;
++		cnt++;
++	}
++
++	if (cnt > 0) {
++		int bytes = cnt * sizeof(*pdesc);
++
++		pdesc = kcalloc(cnt, sizeof(*pdesc), GFP_KERNEL);
++		if (!pdesc) {
++			res = -ENOMEM;
++			goto free;
++		}
++		memcpy(pdesc, &(buf[MAP_OFFS_IN_BLK]), bytes);
++
++		*ppdesc = pdesc;
++		res = cnt;
++	}
++
++free:
++	kfree(buf);
++
++out:
++	return res;
++}
++
++static int scpart_find_partmap(struct mtd_info *master,
++			       struct sc_part_desc **ppdesc)
++{
++	int magic_found = 0;
++	int res = 0;
++	int res2;
++	loff_t offs = 0;
++	size_t retlen;
++	uint8_t rdbuf[PART_MAGIC_LEN];
++
++	while ((magic_found < MAP_MIRROR_NUM) &&
++			(offs < master->size) &&
++			 !mtd_block_isbad(master, offs)) {
++		res2 = mtd_read(master, offs, PART_MAGIC_LEN, &retlen, rdbuf);
++		if (res2 || retlen != PART_MAGIC_LEN) {
++			res = -EIO;
++			goto out;
++		}
++		if (!memcmp(rdbuf, sc_part_magic, PART_MAGIC_LEN)) {
++			pr_debug("Signature found at 0x%llx\n", offs);
++			magic_found++;
++			res = scpart_scan_partmap(master, offs, ppdesc);
++			if (res > 0)
++				goto out;
++		}
++		offs += master->erasesize;
++	}
++
++out:
++	if (res > 0)
++		pr_info("Valid 'SC PART MAP' (%d partitions) found at 0x%llx\n", res, offs);
++	else
++		pr_info("No valid 'SC PART MAP' was found\n");
++
++	return res;
++}
++
++static int scpart_parse(struct mtd_info *master,
++			const struct mtd_partition **pparts,
++			struct mtd_part_parser_data *data)
++{
++	const char *partname;
++	int n;
++	int nr_scparts;
++	int nr_parts = 0;
++	int res = 0;
++	struct sc_part_desc *scpart_map = NULL;
++	struct mtd_partition *parts = NULL;
++	struct device_node *mtd_node;
++	struct device_node *ofpart_node;
++	struct device_node *pp;
++
++	mtd_node = mtd_get_of_node(master);
++	if (!mtd_node) {
++		res = -ENOENT;
++		goto out;
++	}
++
++	ofpart_node = of_get_child_by_name(mtd_node, "partitions");
++	if (!ofpart_node) {
++		pr_info("%s: 'partitions' subnode not found on %pOF.\n",
++				master->name, mtd_node);
++		res = -ENOENT;
++		goto out;
++	}
++
++	nr_scparts = scpart_find_partmap(master, &scpart_map);
++	if (nr_scparts <= 0) {
++		pr_info("No any partitions was found in 'SC PART MAP'.\n");
++		res = -ENOENT;
++		goto free;
++	}
++
++	parts = kcalloc(of_get_child_count(ofpart_node), sizeof(*parts),
++		GFP_KERNEL);
++	if (!parts) {
++		res = -ENOMEM;
++		goto free;
++	}
++
++	for_each_child_of_node(ofpart_node, pp) {
++		u32 scpart_id;
++
++		if (of_property_read_u32(pp, "sercomm,scpart-id", &scpart_id))
++			continue;
++
++		for (n = 0 ; n < nr_scparts ; n++)
++			if ((scpart_map[n].part_id != ID_ALREADY_FOUND) &&
++					(scpart_id == scpart_map[n].part_id))
++				break;
++		if (n >= nr_scparts)
++			/* not match */
++			continue;
++
++		/* add the partition found in OF into MTD partition array */
++		parts[nr_parts].offset = scpart_map[n].part_offs;
++		parts[nr_parts].size = scpart_map[n].part_bytes;
++		parts[nr_parts].of_node = pp;
++
++		if (!of_property_read_string(pp, "label", &partname))
++			parts[nr_parts].name = partname;
++		if (of_property_read_bool(pp, "read-only"))
++			parts[nr_parts].mask_flags |= MTD_WRITEABLE;
++		if (of_property_read_bool(pp, "lock"))
++			parts[nr_parts].mask_flags |= MTD_POWERUP_LOCK;
++
++		/* mark as 'done' */
++		scpart_map[n].part_id = ID_ALREADY_FOUND;
++
++		nr_parts++;
++	}
++
++	if (nr_parts > 0) {
++		*pparts = parts;
++		res = nr_parts;
++	} else
++		pr_info("No partition in OF matches partition ID with 'SC PART MAP'.\n");
++
++	of_node_put(pp);
++
++free:
++	kfree(scpart_map);
++	if (res <= 0)
++		kfree(parts);
++
++out:
++	return res;
++}
++
++static const struct of_device_id scpart_parser_of_match_table[] = {
++	{ .compatible = "sercomm,sc-partitions" },
++	{},
++};
++MODULE_DEVICE_TABLE(of, scpart_parser_of_match_table);
++
++static struct mtd_part_parser scpart_parser = {
++	.parse_fn = scpart_parse,
++	.name = "scpart",
++	.of_match_table = scpart_parser_of_match_table,
++};
++module_mtd_part_parser(scpart_parser);
++
++/* mtd parsers will request the module by parser name */
++MODULE_ALIAS("scpart");
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("NOGUCHI Hiroshi <[email protected]>");
++MODULE_AUTHOR("Mikhail Zhilkin <[email protected]>");
++MODULE_DESCRIPTION("Sercomm partition parser");

+ 106 - 0
target/linux/generic/backport-6.1/402-v6.0-mtd-next-mtd-core-introduce-of-support-for-dynamic-partitions.patch

@@ -0,0 +1,106 @@
+From ad9b10d1eaada169bd764abcab58f08538877e26 Mon Sep 17 00:00:00 2001
+From: Christian Marangi <[email protected]>
+Date: Wed, 22 Jun 2022 03:06:28 +0200
+Subject: mtd: core: introduce of support for dynamic partitions
+
+We have many parser that register mtd partitions at runtime. One example
+is the cmdlinepart or the smem-part parser where the compatible is defined
+in the dts and the partitions gets detected and registered by the
+parser. This is problematic for the NVMEM subsystem that requires an OF
+node to detect NVMEM cells.
+
+To fix this problem, introduce an additional logic that will try to
+assign an OF node to the MTD if declared.
+
+On MTD addition, it will be checked if the MTD has an OF node and if
+not declared will check if a partition with the same label / node name is
+declared in DTS. If an exact match is found, the partition dynamically
+allocated by the parser will have a connected OF node.
+
+The NVMEM subsystem will detect the OF node and register any NVMEM cells
+declared statically in the DTS.
+
+Signed-off-by: Christian Marangi <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c | 61 +++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 61 insertions(+)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -564,6 +564,66 @@ static int mtd_nvmem_add(struct mtd_info
+ 	return 0;
+ }
+ 
++static void mtd_check_of_node(struct mtd_info *mtd)
++{
++	struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
++	const char *pname, *prefix = "partition-";
++	int plen, mtd_name_len, offset, prefix_len;
++	struct mtd_info *parent;
++	bool found = false;
++
++	/* Check if MTD already has a device node */
++	if (dev_of_node(&mtd->dev))
++		return;
++
++	/* Check if a partitions node exist */
++	parent = mtd->parent;
++	parent_dn = dev_of_node(&parent->dev);
++	if (!parent_dn)
++		return;
++
++	partitions = of_get_child_by_name(parent_dn, "partitions");
++	if (!partitions)
++		goto exit_parent;
++
++	prefix_len = strlen(prefix);
++	mtd_name_len = strlen(mtd->name);
++
++	/* Search if a partition is defined with the same name */
++	for_each_child_of_node(partitions, mtd_dn) {
++		offset = 0;
++
++		/* Skip partition with no/wrong prefix */
++		if (!of_node_name_prefix(mtd_dn, "partition-"))
++			continue;
++
++		/* Label have priority. Check that first */
++		if (of_property_read_string(mtd_dn, "label", &pname)) {
++			of_property_read_string(mtd_dn, "name", &pname);
++			offset = prefix_len;
++		}
++
++		plen = strlen(pname) - offset;
++		if (plen == mtd_name_len &&
++		    !strncmp(mtd->name, pname + offset, plen)) {
++			found = true;
++			break;
++		}
++	}
++
++	if (!found)
++		goto exit_partitions;
++
++	/* Set of_node only for nvmem */
++	if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
++		mtd_set_of_node(mtd, mtd_dn);
++
++exit_partitions:
++	of_node_put(partitions);
++exit_parent:
++	of_node_put(parent_dn);
++}
++
+ /**
+  *	add_mtd_device - register an MTD device
+  *	@mtd: pointer to new MTD device info structure
+@@ -669,6 +729,7 @@ int add_mtd_device(struct mtd_info *mtd)
+ 	mtd->dev.devt = MTD_DEVT(i);
+ 	dev_set_name(&mtd->dev, "mtd%d", i);
+ 	dev_set_drvdata(&mtd->dev, mtd);
++	mtd_check_of_node(mtd);
+ 	of_node_get(mtd_get_of_node(mtd));
+ 	error = device_register(&mtd->dev);
+ 	if (error) {
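
The matching rule described above can be restated as a small sketch (the helper is invented; the in-kernel code works with cached string lengths rather than strcmp, but the effect is the same): an OF child matches when its "label" property equals the MTD name, or, lacking a label, when its node name is "partition-" followed by the MTD name.

#include <linux/of.h>
#include <linux/string.h>

/* Sketch of the name-matching rule only; not the kernel's implementation. */
static bool example_partition_matches(const struct device_node *dn,
                                      const char *mtd_name)
{
        const char *label;

        if (!of_property_read_string(dn, "label", &label))
                return !strcmp(label, mtd_name);

        return of_node_name_prefix(dn, "partition-") &&
               !strcmp(dn->name + strlen("partition-"), mtd_name);
}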

+ 72 - 0
target/linux/generic/backport-6.1/403-v6.1-mtd-allow-getting-MTD-device-associated-with-a-speci.patch

@@ -0,0 +1,72 @@
+From b0321721be50b80c03a51866a94fde4f94690e18 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Wed, 15 Jun 2022 21:42:59 +0200
+Subject: [PATCH] mtd: allow getting MTD device associated with a specific DT
+ node
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+MTD subsystem API allows interacting with MTD devices (e.g. reading,
+writing, handling bad blocks). So far a random driver could get MTD
+device only by its name (get_mtd_device_nm()). This change allows
+getting them also by a DT node.
+
+This API is required for drivers handling DT defined MTD partitions in a
+specific way (e.g. U-Boot (sub)partition with environment variables).
+
+Signed-off-by: Rafał Miłecki <[email protected]>
+Acked-by: Miquel Raynal <[email protected]>
+Signed-off-by: Srinivas Kandagatla <[email protected]>
+---
+ drivers/mtd/mtdcore.c   | 28 ++++++++++++++++++++++++++++
+ include/linux/mtd/mtd.h |  1 +
+ 2 files changed, 29 insertions(+)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -1236,6 +1236,34 @@ int __get_mtd_device(struct mtd_info *mt
+ EXPORT_SYMBOL_GPL(__get_mtd_device);
+ 
+ /**
++ * of_get_mtd_device_by_node - obtain an MTD device associated with a given node
++ *
++ * @np: device tree node
++ */
++struct mtd_info *of_get_mtd_device_by_node(struct device_node *np)
++{
++	struct mtd_info *mtd = NULL;
++	struct mtd_info *tmp;
++	int err;
++
++	mutex_lock(&mtd_table_mutex);
++
++	err = -EPROBE_DEFER;
++	mtd_for_each_device(tmp) {
++		if (mtd_get_of_node(tmp) == np) {
++			mtd = tmp;
++			err = __get_mtd_device(mtd);
++			break;
++		}
++	}
++
++	mutex_unlock(&mtd_table_mutex);
++
++	return err ? ERR_PTR(err) : mtd;
++}
++EXPORT_SYMBOL_GPL(of_get_mtd_device_by_node);
++
++/**
+  *	get_mtd_device_nm - obtain a validated handle for an MTD device by
+  *	device name
+  *	@name: MTD device name to open
+--- a/include/linux/mtd/mtd.h
++++ b/include/linux/mtd/mtd.h
+@@ -682,6 +682,7 @@ extern int mtd_device_unregister(struct
+ extern struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num);
+ extern int __get_mtd_device(struct mtd_info *mtd);
+ extern void __put_mtd_device(struct mtd_info *mtd);
++extern struct mtd_info *of_get_mtd_device_by_node(struct device_node *np);
+ extern struct mtd_info *get_mtd_device_nm(const char *name);
+ extern void put_mtd_device(struct mtd_info *mtd);
+ 
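
A consumer of the new helper might look roughly like the sketch below; the "mtd" phandle property name and the function name are invented for illustration, and error handling is trimmed:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/mtd/mtd.h>
#include <linux/of.h>

/* Sketch only: resolve a DT phandle to its MTD device. */
static struct mtd_info *example_get_mtd_from_phandle(struct device_node *np)
{
        struct device_node *mtd_np;
        struct mtd_info *mtd;

        mtd_np = of_parse_phandle(np, "mtd", 0);
        if (!mtd_np)
                return ERR_PTR(-ENODEV);

        mtd = of_get_mtd_device_by_node(mtd_np);
        of_node_put(mtd_np);

        return mtd;     /* ERR_PTR(-EPROBE_DEFER) until the MTD is registered */
}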

+ 30 - 0
target/linux/generic/backport-6.1/404-v6.0-mtd-core-check-partition-before-dereference.patch

@@ -0,0 +1,30 @@
+From 7ec4cdb321738d44ae5d405e7b6ac73dfbf99caa Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <[email protected]>
+Date: Mon, 25 Jul 2022 22:49:25 +0900
+Subject: [PATCH] mtd: core: check partition before dereference
+
+syzbot is reporting NULL pointer dereference at mtd_check_of_node() [1],
+for mtdram test device (CONFIG_MTD_MTDRAM) is not partition.
+
+Link: https://syzkaller.appspot.com/bug?extid=fe013f55a2814a9e8cfd [1]
+Reported-by: syzbot <[email protected]>
+Reported-by: kernel test robot <[email protected]>
+Fixes: ad9b10d1eaada169 ("mtd: core: introduce of support for dynamic partitions")
+Signed-off-by: Tetsuo Handa <[email protected]>
+CC: [email protected]
+Signed-off-by: Richard Weinberger <[email protected]>
+---
+ drivers/mtd/mtdcore.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -577,6 +577,8 @@ static void mtd_check_of_node(struct mtd
+ 		return;
+ 
+ 	/* Check if a partitions node exist */
++	if (!mtd_is_partition(mtd))
++		return;
+ 	parent = mtd->parent;
+ 	parent_dn = dev_of_node(&parent->dev);
+ 	if (!parent_dn)

+ 101 - 0
target/linux/generic/backport-6.1/405-v6.1-mtd-core-add-missing-of_node_get-in-dynamic-partitio.patch

@@ -0,0 +1,101 @@
+From 12b58961de0bd88b3c7dfa5d21f6d67f4678b780 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Tue, 18 Oct 2022 07:18:22 +0200
+Subject: [PATCH] mtd: core: add missing of_node_get() in dynamic partitions
+ code
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This fixes unbalanced of_node_put():
+[    1.078910] 6 cmdlinepart partitions found on MTD device gpmi-nand
+[    1.085116] Creating 6 MTD partitions on "gpmi-nand":
+[    1.090181] 0x000000000000-0x000008000000 : "nandboot"
+[    1.096952] 0x000008000000-0x000009000000 : "nandfit"
+[    1.103547] 0x000009000000-0x00000b000000 : "nandkernel"
+[    1.110317] 0x00000b000000-0x00000c000000 : "nanddtb"
+[    1.115525] ------------[ cut here ]------------
+[    1.120141] refcount_t: addition on 0; use-after-free.
+[    1.125328] WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0xdc/0x148
+[    1.133528] Modules linked in:
+[    1.136589] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc7-next-20220930-04543-g8cf3f7
+[    1.146342] Hardware name: Freescale i.MX8DXL DDR3L EVK (DT)
+[    1.151999] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
+[    1.158965] pc : refcount_warn_saturate+0xdc/0x148
+[    1.163760] lr : refcount_warn_saturate+0xdc/0x148
+[    1.168556] sp : ffff800009ddb080
+[    1.171866] x29: ffff800009ddb080 x28: ffff800009ddb35a x27: 0000000000000002
+[    1.179015] x26: ffff8000098b06ad x25: ffffffffffffffff x24: ffff0a00ffffff05
+[    1.186165] x23: ffff00001fdf6470 x22: ffff800009ddb367 x21: 0000000000000000
+[    1.193314] x20: ffff00001fdfebe8 x19: ffff00001fdfec50 x18: ffffffffffffffff
+[    1.200464] x17: 0000000000000000 x16: 0000000000000118 x15: 0000000000000004
+[    1.207614] x14: 0000000000000fff x13: ffff800009bca248 x12: 0000000000000003
+[    1.214764] x11: 00000000ffffefff x10: c0000000ffffefff x9 : 4762cb2ccb52de00
+[    1.221914] x8 : 4762cb2ccb52de00 x7 : 205d313431303231 x6 : 312e31202020205b
+[    1.229063] x5 : ffff800009d55c1f x4 : 0000000000000001 x3 : 0000000000000000
+[    1.236213] x2 : 0000000000000000 x1 : ffff800009954be6 x0 : 000000000000002a
+[    1.243365] Call trace:
+[    1.245806]  refcount_warn_saturate+0xdc/0x148
+[    1.250253]  kobject_get+0x98/0x9c
+[    1.253658]  of_node_get+0x20/0x34
+[    1.257072]  of_fwnode_get+0x3c/0x54
+[    1.260652]  fwnode_get_nth_parent+0xd8/0xf4
+[    1.264926]  fwnode_full_name_string+0x3c/0xb4
+[    1.269373]  device_node_string+0x498/0x5b4
+[    1.273561]  pointer+0x41c/0x5d0
+[    1.276793]  vsnprintf+0x4d8/0x694
+[    1.280198]  vprintk_store+0x164/0x528
+[    1.283951]  vprintk_emit+0x98/0x164
+[    1.287530]  vprintk_default+0x44/0x6c
+[    1.291284]  vprintk+0xf0/0x134
+[    1.294428]  _printk+0x54/0x7c
+[    1.297486]  of_node_release+0xe8/0x128
+[    1.301326]  kobject_put+0x98/0xfc
+[    1.304732]  of_node_put+0x1c/0x28
+[    1.308137]  add_mtd_device+0x484/0x6d4
+[    1.311977]  add_mtd_partitions+0xf0/0x1d0
+[    1.316078]  parse_mtd_partitions+0x45c/0x518
+[    1.320439]  mtd_device_parse_register+0xb0/0x274
+[    1.325147]  gpmi_nand_probe+0x51c/0x650
+[    1.329074]  platform_probe+0xa8/0xd0
+[    1.332740]  really_probe+0x130/0x334
+[    1.336406]  __driver_probe_device+0xb4/0xe0
+[    1.340681]  driver_probe_device+0x3c/0x1f8
+[    1.344869]  __driver_attach+0xdc/0x1a4
+[    1.348708]  bus_for_each_dev+0x80/0xcc
+[    1.352548]  driver_attach+0x24/0x30
+[    1.356127]  bus_add_driver+0x108/0x1f4
+[    1.359967]  driver_register+0x78/0x114
+[    1.363807]  __platform_driver_register+0x24/0x30
+[    1.368515]  gpmi_nand_driver_init+0x1c/0x28
+[    1.372798]  do_one_initcall+0xbc/0x238
+[    1.376638]  do_initcall_level+0x94/0xb4
+[    1.380565]  do_initcalls+0x54/0x94
+[    1.384058]  do_basic_setup+0x1c/0x28
+[    1.387724]  kernel_init_freeable+0x110/0x188
+[    1.392084]  kernel_init+0x20/0x1a0
+[    1.395578]  ret_from_fork+0x10/0x20
+[    1.399157] ---[ end trace 0000000000000000 ]---
+[    1.403782] ------------[ cut here ]------------
+
+Reported-by: Han Xu <[email protected]>
+Fixes: ad9b10d1eaada169 ("mtd: core: introduce of support for dynamic partitions")
+Signed-off-by: Rafał Miłecki <[email protected]>
+Tested-by: Han Xu <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -580,7 +580,7 @@ static void mtd_check_of_node(struct mtd
+ 	if (!mtd_is_partition(mtd))
+ 		return;
+ 	parent = mtd->parent;
+-	parent_dn = dev_of_node(&parent->dev);
++	parent_dn = of_node_get(dev_of_node(&parent->dev));
+ 	if (!parent_dn)
+ 		return;
+ 

+ 65 - 0
target/linux/generic/backport-6.1/406-v6.2-0001-mtd-core-simplify-a-bit-code-find-partition-matching.patch

@@ -0,0 +1,65 @@
+From 63db0cb35e1cb3b3c134906d1062f65513fdda2d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Tue, 4 Oct 2022 10:37:09 +0200
+Subject: [PATCH] mtd: core: simplify (a bit) code find partition-matching
+ dynamic OF node
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+1. Don't hardcode "partition-" string twice
+2. Use simpler logic & use ->name to avoid of_property_read_string()
+3. Use mtd_get_of_node() helper
+
+Cc: Christian Marangi <[email protected]>
+Signed-off-by: Rafał Miłecki <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c | 16 +++++++---------
+ 1 file changed, 7 insertions(+), 9 deletions(-)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -569,18 +569,16 @@ static void mtd_check_of_node(struct mtd
+ 	struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
+ 	const char *pname, *prefix = "partition-";
+ 	int plen, mtd_name_len, offset, prefix_len;
+-	struct mtd_info *parent;
+ 	bool found = false;
+ 
+ 	/* Check if MTD already has a device node */
+-	if (dev_of_node(&mtd->dev))
++	if (mtd_get_of_node(mtd))
+ 		return;
+ 
+ 	/* Check if a partitions node exist */
+ 	if (!mtd_is_partition(mtd))
+ 		return;
+-	parent = mtd->parent;
+-	parent_dn = of_node_get(dev_of_node(&parent->dev));
++	parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
+ 	if (!parent_dn)
+ 		return;
+ 
+@@ -593,15 +591,15 @@ static void mtd_check_of_node(struct mtd
+ 
+ 	/* Search if a partition is defined with the same name */
+ 	for_each_child_of_node(partitions, mtd_dn) {
+-		offset = 0;
+-
+ 		/* Skip partition with no/wrong prefix */
+-		if (!of_node_name_prefix(mtd_dn, "partition-"))
++		if (!of_node_name_prefix(mtd_dn, prefix))
+ 			continue;
+ 
+ 		/* Label have priority. Check that first */
+-		if (of_property_read_string(mtd_dn, "label", &pname)) {
+-			of_property_read_string(mtd_dn, "name", &pname);
++		if (!of_property_read_string(mtd_dn, "label", &pname)) {
++			offset = 0;
++		} else {
++			pname = mtd_dn->name;
+ 			offset = prefix_len;
+ 		}
+ 

+ 84 - 0
target/linux/generic/backport-6.1/406-v6.2-0002-mtd-core-try-to-find-OF-node-for-every-MTD-partition.patch

@@ -0,0 +1,84 @@
+From ddb8cefb7af288950447ca6eeeafb09977dab56f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Tue, 4 Oct 2022 10:37:10 +0200
+Subject: [PATCH] mtd: core: try to find OF node for every MTD partition
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+So far this feature was limited to the top-level "nvmem-cells" node.
+There are multiple parsers creating partitions and subpartitions
+dynamically. Extend that code to handle them too.
+
+This allows finding partition-* node for every MTD (sub)partition.
+
+Random example:
+
+partitions {
+	compatible = "brcm,bcm947xx-cfe-partitions";
+
+	partition-firmware {
+		compatible = "brcm,trx";
+
+		partition-loader {
+		};
+	};
+};
+
+Cc: Christian Marangi <[email protected]>
+Signed-off-by: Rafał Miłecki <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c | 18 ++++++------------
+ 1 file changed, 6 insertions(+), 12 deletions(-)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -569,20 +569,22 @@ static void mtd_check_of_node(struct mtd
+ 	struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
+ 	const char *pname, *prefix = "partition-";
+ 	int plen, mtd_name_len, offset, prefix_len;
+-	bool found = false;
+ 
+ 	/* Check if MTD already has a device node */
+ 	if (mtd_get_of_node(mtd))
+ 		return;
+ 
+-	/* Check if a partitions node exist */
+ 	if (!mtd_is_partition(mtd))
+ 		return;
++
+ 	parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
+ 	if (!parent_dn)
+ 		return;
+ 
+-	partitions = of_get_child_by_name(parent_dn, "partitions");
++	if (mtd_is_partition(mtd->parent))
++		partitions = of_node_get(parent_dn);
++	else
++		partitions = of_get_child_by_name(parent_dn, "partitions");
+ 	if (!partitions)
+ 		goto exit_parent;
+ 
+@@ -606,19 +608,11 @@ static void mtd_check_of_node(struct mtd
+ 		plen = strlen(pname) - offset;
+ 		if (plen == mtd_name_len &&
+ 		    !strncmp(mtd->name, pname + offset, plen)) {
+-			found = true;
++			mtd_set_of_node(mtd, mtd_dn);
+ 			break;
+ 		}
+ 	}
+ 
+-	if (!found)
+-		goto exit_partitions;
+-
+-	/* Set of_node only for nvmem */
+-	if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
+-		mtd_set_of_node(mtd, mtd_dn);
+-
+-exit_partitions:
+ 	of_node_put(partitions);
+ exit_parent:
+ 	of_node_put(parent_dn);

+ 32 - 0
target/linux/generic/backport-6.1/407-v5.17-mtd-parsers-qcom-Don-t-print-error-message-on-EPROBE.patch

@@ -0,0 +1,32 @@
+From 26bccc9671ba5e01f7153addbe94e7dc3f677375 Mon Sep 17 00:00:00 2001
+From: Bryan O'Donoghue <[email protected]>
+Date: Mon, 3 Jan 2022 03:03:16 +0000
+Subject: [PATCH 13/14] mtd: parsers: qcom: Don't print error message on
+ -EPROBE_DEFER
+
+Its possible for the main smem driver to not be loaded by the time we come
+along to parse the smem partition description but, this is a perfectly
+normal thing.
+
+No need to print out an error message in this case.
+
+Signed-off-by: Bryan O'Donoghue <[email protected]>
+Reviewed-by: Manivannan Sadhasivam <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/parsers/qcomsmempart.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/mtd/parsers/qcomsmempart.c
++++ b/drivers/mtd/parsers/qcomsmempart.c
+@@ -75,7 +75,8 @@ static int parse_qcomsmem_part(struct mt
+ 	pr_debug("Parsing partition table info from SMEM\n");
+ 	ptable = qcom_smem_get(SMEM_APPS, SMEM_AARM_PARTITION_TABLE, &len);
+ 	if (IS_ERR(ptable)) {
+-		pr_err("Error reading partition table header\n");
++		if (PTR_ERR(ptable) != -EPROBE_DEFER)
++			pr_err("Error reading partition table header\n");
+ 		return PTR_ERR(ptable);
+ 	}
+ 

+ 47 - 0
target/linux/generic/backport-6.1/408-v6.2-mtd-core-set-ROOT_DEV-for-partitions-marked-as-rootf.patch

@@ -0,0 +1,47 @@
+From 26422ac78e9d8767bd4aabfbae616b15edbf6a1b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Sat, 22 Oct 2022 23:13:18 +0200
+Subject: [PATCH] mtd: core: set ROOT_DEV for partitions marked as rootfs in DT
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This adds support for "linux,rootfs" binding that is used to mark flash
+partition containing rootfs. It's useful for devices using device tree
+that don't have bootloader passing root info in cmdline.
+
+Signed-off-by: Rafał Miłecki <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -28,6 +28,7 @@
+ #include <linux/leds.h>
+ #include <linux/debugfs.h>
+ #include <linux/nvmem-provider.h>
++#include <linux/root_dev.h>
+ 
+ #include <linux/mtd/mtd.h>
+ #include <linux/mtd/partitions.h>
+@@ -748,6 +749,17 @@ int add_mtd_device(struct mtd_info *mtd)
+ 		not->add(mtd);
+ 
+ 	mutex_unlock(&mtd_table_mutex);
++
++	if (of_find_property(mtd_get_of_node(mtd), "linux,rootfs", NULL)) {
++		if (IS_BUILTIN(CONFIG_MTD)) {
++			pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name);
++			ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
++		} else {
++			pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n",
++				mtd->index, mtd->name);
++		}
++	}
++
+ 	/* We _know_ we aren't being removed, because
+ 	   our caller is still holding us here. So none
+ 	   of this try_ nonsense, and no bitching about it

+ 33 - 0
target/linux/generic/backport-6.1/410-v5.18-mtd-parsers-trx-allow-to-use-on-MediaTek-MIPS-SoCs.patch

@@ -0,0 +1,33 @@
+From 2365f91c861cbfeef7141c69842848c7b2d3c2db Mon Sep 17 00:00:00 2001
+From: INAGAKI Hiroshi <[email protected]>
+Date: Sun, 13 Feb 2022 15:40:44 +0900
+Subject: [PATCH] mtd: parsers: trx: allow to use on MediaTek MIPS SoCs
+
+Buffalo sells some router devices which have trx-formatted firmware,
+based on MediaTek MIPS SoCs. To use parser_trx on those devices, add
+"RALINK" to dependency and allow to compile for MediaTek MIPS SoCs.
+
+examples:
+
+- WCR-1166DS  (MT7628)
+- WSR-1166DHP (MT7621)
+- WSR-2533DHP (MT7621)
+
+Signed-off-by: INAGAKI Hiroshi <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/parsers/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/mtd/parsers/Kconfig
++++ b/drivers/mtd/parsers/Kconfig
+@@ -115,7 +115,7 @@ config MTD_AFS_PARTS
+ 
+ config MTD_PARSER_TRX
+ 	tristate "Parser for TRX format partitions"
+-	depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || COMPILE_TEST)
++	depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
+ 	help
+ 	  TRX is a firmware format used by Broadcom on their devices. It
+ 	  may contain up to 3/4 partitions (depending on the version).

+ 58 - 0
target/linux/generic/backport-6.1/420-v5.19-02-mtd-spinand-gigadevice-add-support-for-GD5FxGQ4xExxG.patch

@@ -0,0 +1,58 @@
+From 573eec222bc82fb5e724586267fbbb1aed9ffd03 Mon Sep 17 00:00:00 2001
+From: Chuanhong Guo <[email protected]>
+Date: Sun, 20 Mar 2022 17:59:58 +0800
+Subject: [PATCH 2/5] mtd: spinand: gigadevice: add support for GD5FxGQ4xExxG
+
+Add support for:
+ GD5F1GQ4RExxG
+ GD5F2GQ4{U,R}ExxG
+
+These chips differ from GD5F1GQ4UExxG only in chip ID, voltage
+and capacity.
+
+Signed-off-by: Chuanhong Guo <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/nand/spi/gigadevice.c | 30 ++++++++++++++++++++++++++++++
+ 1 file changed, 30 insertions(+)
+
+--- a/drivers/mtd/nand/spi/gigadevice.c
++++ b/drivers/mtd/nand/spi/gigadevice.c
+@@ -333,6 +333,36 @@ static const struct spinand_info gigadev
+ 		     SPINAND_HAS_QE_BIT,
+ 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ 				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F1GQ4RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xc1),
++		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F2GQ4UExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xd2),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F2GQ4RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xc2),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
+ 	SPINAND_INFO("GD5F1GQ4UFxxG",
+ 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE, 0xb1, 0x48),
+ 		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),

+ 33 - 0
target/linux/generic/backport-6.1/420-v5.19-03-mtd-spinand-gigadevice-add-support-for-GD5F1GQ5RExxG.patch

@@ -0,0 +1,33 @@
+From 620a988813403318023296b61228ee8f3fcdb8e0 Mon Sep 17 00:00:00 2001
+From: Chuanhong Guo <[email protected]>
+Date: Sun, 20 Mar 2022 17:59:59 +0800
+Subject: [PATCH 3/5] mtd: spinand: gigadevice: add support for GD5F1GQ5RExxG
+
+This chip is the 1.8V version of the GD5F1GQ5UExxG.
+
+Signed-off-by: Chuanhong Guo <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/nand/spi/gigadevice.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/drivers/mtd/nand/spi/gigadevice.c
++++ b/drivers/mtd/nand/spi/gigadevice.c
+@@ -383,6 +383,16 @@ static const struct spinand_info gigadev
+ 		     SPINAND_HAS_QE_BIT,
+ 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ 				     gd5fxgq5xexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F1GQ5RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x41),
++		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
++		     NAND_ECCREQ(4, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq5xexxg_ecc_get_status)),
+ };
+ 
+ static const struct spinand_manufacturer_ops gigadevice_spinand_manuf_ops = {

+ 84 - 0
target/linux/generic/backport-6.1/420-v5.19-04-mtd-spinand-gigadevice-add-support-for-GD5F-2-4-GQ5x.patch

@@ -0,0 +1,84 @@
+From 194ec04b3a9e7fa97d1fbef296410631bc3cf1c8 Mon Sep 17 00:00:00 2001
+From: Chuanhong Guo <[email protected]>
+Date: Sun, 20 Mar 2022 18:00:00 +0800
+Subject: [PATCH 4/5] mtd: spinand: gigadevice: add support for GD5F{2,
+ 4}GQ5xExxG
+
+Add support for:
+ GD5F2GQ5{U,R}ExxG
+ GD5F4GQ6{U,R}ExxG
+
+These chips use 4 dummy bytes for quad I/O and 2 dummy bytes for dual I/O.
+Apart from that and the memory layout, they are identical to their 1G
+variants.
+
+Signed-off-by: Chuanhong Guo <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/nand/spi/gigadevice.c | 48 +++++++++++++++++++++++++++++++
+ 1 file changed, 48 insertions(+)
+
+--- a/drivers/mtd/nand/spi/gigadevice.c
++++ b/drivers/mtd/nand/spi/gigadevice.c
+@@ -47,6 +47,14 @@ static SPINAND_OP_VARIANTS(read_cache_va
+ 		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+ 		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+ 
++static SPINAND_OP_VARIANTS(read_cache_variants_2gq5,
++		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 4, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 2, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
++		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
++
+ static SPINAND_OP_VARIANTS(write_cache_variants,
+ 		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+ 		SPINAND_PROG_LOAD(true, 0, NULL, 0));
+@@ -391,6 +399,46 @@ static const struct spinand_info gigadev
+ 					      &write_cache_variants,
+ 					      &update_cache_variants),
+ 		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq5xexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F2GQ5UExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x52),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(4, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq5xexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F2GQ5RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x42),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(4, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq5xexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F4GQ6UExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x55),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 2, 1),
++		     NAND_ECCREQ(4, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq5xexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F4GQ6RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x45),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 2, 1),
++		     NAND_ECCREQ(4, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
+ 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ 				     gd5fxgq5xexxg_ecc_get_status)),
+ };

+ 91 - 0
target/linux/generic/backport-6.1/420-v5.19-05-mtd-spinand-gigadevice-add-support-for-GD5FxGM7xExxG.patch

@@ -0,0 +1,91 @@
+From 54647cd003c08b714474a5b599a147ec6a160486 Mon Sep 17 00:00:00 2001
+From: Chuanhong Guo <[email protected]>
+Date: Sun, 20 Mar 2022 18:00:01 +0800
+Subject: [PATCH 5/5] mtd: spinand: gigadevice: add support for GD5FxGM7xExxG
+
+Add support for:
+ GD5F{1,2}GM7{U,R}ExxG
+ GD5F4GM8{U,R}ExxG
+
+These are new 27nm counterparts for the GD5FxGQ4 chips from GigaDevice
+with 8b/512b on-die ECC capability.
+These chips (and the currently supported GD5FxGQ5 chips) have a QIO DTR
+instruction for reading the page cache. It isn't added in this patch
+because I don't have a DTR SPI controller for testing.
+
+Signed-off-by: Chuanhong Guo <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/nand/spi/gigadevice.c | 60 +++++++++++++++++++++++++++++++
+ 1 file changed, 60 insertions(+)
+
+--- a/drivers/mtd/nand/spi/gigadevice.c
++++ b/drivers/mtd/nand/spi/gigadevice.c
+@@ -441,6 +441,66 @@ static const struct spinand_info gigadev
+ 		     SPINAND_HAS_QE_BIT,
+ 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ 				     gd5fxgq5xexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F1GM7UExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x91),
++		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F1GM7RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x81),
++		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F2GM7UExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x92),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F2GM7RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x82),
++		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F4GM8UExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x95),
++		     NAND_MEMORG(1, 2048, 128, 64, 4096, 80, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
++	SPINAND_INFO("GD5F4GM8RExxG",
++		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x85),
++		     NAND_MEMORG(1, 2048, 128, 64, 4096, 80, 1, 1, 1),
++		     NAND_ECCREQ(8, 512),
++		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
++					      &write_cache_variants,
++					      &update_cache_variants),
++		     SPINAND_HAS_QE_BIT,
++		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
++				     gd5fxgq4uexxg_ecc_get_status)),
+ };
+ 
+ static const struct spinand_manufacturer_ops gigadevice_spinand_manuf_ops = {

+ 229 - 0
target/linux/generic/backport-6.1/421-v6.2-mtd-parsers-add-TP-Link-SafeLoader-partitions-table-.patch

@@ -0,0 +1,229 @@
+From aec4d5f5ffd0f0092bd9dc21ea90e0bc237d4b74 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <[email protected]>
+Date: Sat, 15 Oct 2022 11:29:50 +0200
+Subject: [PATCH] mtd: parsers: add TP-Link SafeLoader partitions table parser
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This parser deals with most TP-Link home routers. It reads info about
+partitions and registers them in the MTD subsystem.
+
+Example from TP-Link Archer C5 V2:
+
+spi-nor spi0.0: s25fl128s1 (16384 Kbytes)
+15 tplink-safeloader partitions found on MTD device spi0.0
+Creating 15 MTD partitions on "spi0.0":
+0x000000000000-0x000000040000 : "fs-uboot"
+0x000000040000-0x000000440000 : "os-image"
+0x000000440000-0x000000e40000 : "rootfs"
+0x000000e40000-0x000000e40200 : "default-mac"
+0x000000e40200-0x000000e40400 : "pin"
+0x000000e40400-0x000000e40600 : "product-info"
+0x000000e50000-0x000000e60000 : "partition-table"
+0x000000e60000-0x000000e60200 : "soft-version"
+0x000000e61000-0x000000e70000 : "support-list"
+0x000000e70000-0x000000e80000 : "profile"
+0x000000e80000-0x000000e90000 : "default-config"
+0x000000e90000-0x000000ee0000 : "user-config"
+0x000000ee0000-0x000000fe0000 : "log"
+0x000000fe0000-0x000000ff0000 : "radio_bk"
+0x000000ff0000-0x000001000000 : "radio"
+
+Signed-off-by: Rafał Miłecki <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/parsers/Kconfig             |  15 +++
+ drivers/mtd/parsers/Makefile            |   1 +
+ drivers/mtd/parsers/tplink_safeloader.c | 150 ++++++++++++++++++++++++
+ 3 files changed, 166 insertions(+)
+ create mode 100644 drivers/mtd/parsers/tplink_safeloader.c
+
+--- a/drivers/mtd/parsers/Kconfig
++++ b/drivers/mtd/parsers/Kconfig
+@@ -113,6 +113,21 @@ config MTD_AFS_PARTS
+ 	  for your particular device. It won't happen automatically. The
+ 	  'physmap' map driver (CONFIG_MTD_PHYSMAP) does this, for example.
+ 
++config MTD_PARSER_TPLINK_SAFELOADER
++	tristate "TP-Link Safeloader partitions parser"
++	depends on MTD && (ARCH_BCM_5301X || ATH79 || SOC_MT7620 || SOC_MT7621 || COMPILE_TEST)
++	help
++	  TP-Link home routers use flash partitions to store various data. Info
++	  about flash space layout is stored in a partitions table using a
++	  custom ASCII-based format.
++
++	  That format was first found in devices with SafeLoader bootloader and
++	  was named after it. Later it was adapted to CFE and U-Boot
++	  bootloaders.
++
++	  This driver reads partitions table, parses it and creates MTD
++	  partitions.
++
+ config MTD_PARSER_TRX
+ 	tristate "Parser for TRX format partitions"
+ 	depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
+--- a/drivers/mtd/parsers/Makefile
++++ b/drivers/mtd/parsers/Makefile
+@@ -9,6 +9,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_BCM4908)	+=
+ ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)+= ofpart_linksys_ns.o
+ obj-$(CONFIG_MTD_PARSER_IMAGETAG)	+= parser_imagetag.o
+ obj-$(CONFIG_MTD_AFS_PARTS)		+= afs.o
++obj-$(CONFIG_MTD_PARSER_TPLINK_SAFELOADER)	+= tplink_safeloader.o
+ obj-$(CONFIG_MTD_PARSER_TRX)		+= parser_trx.o
+ obj-$(CONFIG_MTD_SERCOMM_PARTS)		+= scpart.o
+ obj-$(CONFIG_MTD_SHARPSL_PARTS)		+= sharpslpart.o
+--- /dev/null
++++ b/drivers/mtd/parsers/tplink_safeloader.c
+@@ -0,0 +1,150 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/*
++ * Copyright © 2022 Rafał Miłecki <[email protected]>
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/mtd/mtd.h>
++#include <linux/mtd/partitions.h>
++#include <linux/of.h>
++#include <linux/slab.h>
++
++#define TPLINK_SAFELOADER_DATA_OFFSET		4
++#define TPLINK_SAFELOADER_MAX_PARTS		32
++
++struct safeloader_cmn_header {
++	__be32 size;
++	uint32_t unused;
++} __packed;
++
++static void *mtd_parser_tplink_safeloader_read_table(struct mtd_info *mtd)
++{
++	struct safeloader_cmn_header hdr;
++	struct device_node *np;
++	size_t bytes_read;
++	size_t offset;
++	size_t size;
++	char *buf;
++	int err;
++
++	np = mtd_get_of_node(mtd);
++	if (mtd_is_partition(mtd))
++		of_node_get(np);
++	else
++		np = of_get_child_by_name(np, "partitions");
++
++	if (of_property_read_u32(np, "partitions-table-offset", (u32 *)&offset)) {
++		pr_err("Failed to get partitions table offset\n");
++		goto err_put;
++	}
++
++	err = mtd_read(mtd, offset, sizeof(hdr), &bytes_read, (uint8_t *)&hdr);
++	if (err && !mtd_is_bitflip(err)) {
++		pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset);
++		goto err_put;
++	}
++
++	size = be32_to_cpu(hdr.size);
++
++	buf = kmalloc(size + 1, GFP_KERNEL);
++	if (!buf)
++		goto err_put;
++
++	err = mtd_read(mtd, offset + sizeof(hdr), size, &bytes_read, buf);
++	if (err && !mtd_is_bitflip(err)) {
++		pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset + sizeof(hdr));
++		goto err_kfree;
++	}
++
++	buf[size] = '\0';
++
++	of_node_put(np);
++
++	return buf;
++
++err_kfree:
++	kfree(buf);
++err_put:
++	of_node_put(np);
++	return NULL;
++}
++
++static int mtd_parser_tplink_safeloader_parse(struct mtd_info *mtd,
++					      const struct mtd_partition **pparts,
++					      struct mtd_part_parser_data *data)
++{
++	struct mtd_partition *parts;
++	char name[65];
++	size_t offset;
++	size_t bytes;
++	char *buf;
++	int idx;
++	int err;
++
++	parts = kcalloc(TPLINK_SAFELOADER_MAX_PARTS, sizeof(*parts), GFP_KERNEL);
++	if (!parts) {
++		err = -ENOMEM;
++		goto err_out;
++	}
++
++	buf = mtd_parser_tplink_safeloader_read_table(mtd);
++	if (!buf) {
++		err = -ENOENT;
++		goto err_out;
++	}
++
++	for (idx = 0, offset = TPLINK_SAFELOADER_DATA_OFFSET;
++	     idx < TPLINK_SAFELOADER_MAX_PARTS &&
++	     sscanf(buf + offset, "partition %64s base 0x%llx size 0x%llx%zn\n",
++		    name, &parts[idx].offset, &parts[idx].size, &bytes) == 3;
++	     idx++, offset += bytes + 1) {
++		parts[idx].name = kstrdup(name, GFP_KERNEL);
++		if (!parts[idx].name) {
++			err = -ENOMEM;
++			goto err_free;
++		}
++	}
++
++	if (idx == TPLINK_SAFELOADER_MAX_PARTS)
++		pr_warn("Reached maximum number of partitions!\n");
++
++	kfree(buf);
++
++	*pparts = parts;
++
++	return idx;
++
++err_free:
++	for (idx -= 1; idx >= 0; idx--)
++		kfree(parts[idx].name);
++err_out:
++	return err;
++};
++
++static void mtd_parser_tplink_safeloader_cleanup(const struct mtd_partition *pparts,
++						 int nr_parts)
++{
++	int i;
++
++	for (i = 0; i < nr_parts; i++)
++		kfree(pparts[i].name);
++
++	kfree(pparts);
++}
++
++static const struct of_device_id mtd_parser_tplink_safeloader_of_match_table[] = {
++	{ .compatible = "tplink,safeloader-partitions" },
++	{},
++};
++MODULE_DEVICE_TABLE(of, mtd_parser_tplink_safeloader_of_match_table);
++
++static struct mtd_part_parser mtd_parser_tplink_safeloader = {
++	.parse_fn = mtd_parser_tplink_safeloader_parse,
++	.cleanup = mtd_parser_tplink_safeloader_cleanup,
++	.name = "tplink-safeloader",
++	.of_match_table = mtd_parser_tplink_safeloader_of_match_table,
++};
++module_mtd_part_parser(mtd_parser_tplink_safeloader);
++
++MODULE_LICENSE("GPL");

+ 49 - 0
target/linux/generic/backport-6.1/422-v5.19-mtd-spi-nor-support-eon-en25qh256a.patch

@@ -0,0 +1,49 @@
+From 6abef37d16d0c570ef5a149e63762fba2a30804b Mon Sep 17 00:00:00 2001
+From: "Leon M. George" <[email protected]>
+Date: Wed, 30 Mar 2022 16:16:56 +0200
+Subject: [PATCH] mtd: spi-nor: support eon en25qh256a variant
+
+The EN25QH256A variant of the EN25QH256 doesn't initialize correctly from SFDP
+alone and only accesses memory below 8m (addr_width is 4 but read_opcode takes
+only 3 bytes).
+
+Set SNOR_F_4B_OPCODES if the flash chip variant was detected using hwcaps.
+
+The fix submitted upstream uses the PARSE_SFDP initializer, which is not
+available in the kernel used with OpenWrt.
+
+Signed-off-by: Leon M. George <[email protected]>
+---
+ drivers/mtd/spi-nor/eon.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/drivers/mtd/spi-nor/eon.c
++++ b/drivers/mtd/spi-nor/eon.c
+@@ -8,6 +8,16 @@
+ 
+ #include "core.h"
+ 
++static void en25qh256_post_sfdp_fixups(struct spi_nor *nor)
++{
++	if (nor->params->hwcaps.mask & SNOR_HWCAPS_READ_1_1_4)
++		nor->flags |= SNOR_F_4B_OPCODES;
++}
++
++static const struct spi_nor_fixups en25qh256_fixups = {
++	.post_sfdp = en25qh256_post_sfdp_fixups,
++};
++
+ static const struct flash_info eon_parts[] = {
+ 	/* EON -- en25xxx */
+ 	{ "en25f32",    INFO(0x1c3116, 0, 64 * 1024,   64, SECT_4K) },
+@@ -23,7 +33,9 @@ static const struct flash_info eon_parts
+ 	{ "en25qh64",   INFO(0x1c7017, 0, 64 * 1024,  128,
+ 			     SECT_4K | SPI_NOR_DUAL_READ) },
+ 	{ "en25qh128",  INFO(0x1c7018, 0, 64 * 1024,  256, 0) },
+-	{ "en25qh256",  INFO(0x1c7019, 0, 64 * 1024,  512, 0) },
++	{ "en25qh256",  INFO(0x1c7019, 0, 64 * 1024,  512,
++		SPI_NOR_DUAL_READ)
++		.fixups = &en25qh256_fixups },
+ 	{ "en25s64",	INFO(0x1c3817, 0, 64 * 1024,  128, SECT_4K) },
+ };
+ 

+ 73 - 0
target/linux/generic/backport-6.1/423-v6.1-0001-mtd-track-maximum-number-of-bitflips-for-each-read-r.patch

@@ -0,0 +1,73 @@
+From e237285113963bd1dd2e925770aa8b3aa8a1894c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <[email protected]>
+Date: Wed, 29 Jun 2022 14:57:34 +0200
+Subject: [PATCH 1/4] mtd: track maximum number of bitflips for each read
+ request
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+mtd_read_oob() callers are currently oblivious to the details of ECC
+errors detected during the read operation - they only learn (through the
+return value) whether any corrected bitflips or uncorrectable errors
+occurred.  More detailed ECC information can be useful to user-space
+applications for making better-informed choices about moving data
+around.
+
+Extend struct mtd_oob_ops with a pointer to a newly-introduced struct
+mtd_req_stats and set its 'max_bitflips' field to the maximum number of
+bitflips found in a single ECC step during the read operation performed
+by mtd_read_oob().  This is a prerequisite for ultimately passing that
+value back to user space.
+
+Suggested-by: Boris Brezillon <[email protected]>
+Signed-off-by: Michał Kępień <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c   | 5 +++++
+ include/linux/mtd/mtd.h | 5 +++++
+ 2 files changed, 10 insertions(+)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -1676,6 +1676,9 @@ int mtd_read_oob(struct mtd_info *mtd, l
+ 	if (!master->_read_oob && (!master->_read || ops->oobbuf))
+ 		return -EOPNOTSUPP;
+ 
++	if (ops->stats)
++		memset(ops->stats, 0, sizeof(*ops->stats));
++
+ 	if (mtd->flags & MTD_SLC_ON_MLC_EMULATION)
+ 		ret_code = mtd_io_emulated_slc(mtd, from, true, ops);
+ 	else
+@@ -1693,6 +1696,8 @@ int mtd_read_oob(struct mtd_info *mtd, l
+ 		return ret_code;
+ 	if (mtd->ecc_strength == 0)
+ 		return 0;	/* device lacks ecc */
++	if (ops->stats)
++		ops->stats->max_bitflips = ret_code;
+ 	return ret_code >= mtd->bitflip_threshold ? -EUCLEAN : 0;
+ }
+ EXPORT_SYMBOL_GPL(mtd_read_oob);
+--- a/include/linux/mtd/mtd.h
++++ b/include/linux/mtd/mtd.h
+@@ -40,6 +40,10 @@ struct mtd_erase_region_info {
+ 	unsigned long *lockmap;		/* If keeping bitmap of locks */
+ };
+ 
++struct mtd_req_stats {
++	unsigned int max_bitflips;
++};
++
+ /**
+  * struct mtd_oob_ops - oob operation operands
+  * @mode:	operation mode
+@@ -70,6 +74,7 @@ struct mtd_oob_ops {
+ 	uint32_t	ooboffs;
+ 	uint8_t		*datbuf;
+ 	uint8_t		*oobbuf;
++	struct mtd_req_stats *stats;
+ };
+ 
+ #define MTD_MAX_OOBFREE_ENTRIES_LARGE	32
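
A minimal sketch of how an in-kernel caller could opt into the new per-request
statistics. This is a hypothetical example (function name and context are not
from the patch); it only relies on the mtd_read_oob() interface and the fields
added above:

/* Hypothetical caller, assuming a kernel context with <linux/mtd/mtd.h>. */
static int example_read_with_stats(struct mtd_info *mtd, loff_t from,
				   size_t len, u_char *buf)
{
	struct mtd_req_stats stats = { };
	struct mtd_oob_ops ops = {
		.mode   = MTD_OPS_PLACE_OOB,
		.len    = len,
		.datbuf = buf,
		.stats  = &stats,	/* opt in to per-request statistics */
	};
	int ret;

	/* mtd_read_oob() zeroes *ops.stats and fills in max_bitflips. */
	ret = mtd_read_oob(mtd, from, &ops);
	if (ret && ret != -EUCLEAN)
		return ret;

	pr_info("read %zu bytes, max %u bitflips in a single ECC step\n",
		ops.retlen, stats.max_bitflips);
	return 0;
}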

+ 325 - 0
target/linux/generic/backport-6.1/423-v6.1-0002-mtd-always-initialize-stats-in-struct-mtd_oob_ops.patch

@@ -0,0 +1,325 @@
+From e97709c9d18903f5acd5fbe2985dd054da0432b1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <[email protected]>
+Date: Wed, 29 Jun 2022 14:57:35 +0200
+Subject: [PATCH 2/4] mtd: always initialize 'stats' in struct mtd_oob_ops
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+As the 'stats' field in struct mtd_oob_ops is used in conditional
+expressions, ensure it is always zero-initialized in all such structures
+to prevent random stack garbage from being interpreted as a pointer.
+
+Strictly speaking, this problem currently only needs to be fixed for
+struct mtd_oob_ops structures subsequently passed to mtd_read_oob().
+However, this commit goes a step further and makes all instances of
+struct mtd_oob_ops in the tree zero-initialized, in hope of preventing
+future problems, e.g. if struct mtd_req_stats gets extended with write
+statistics at some point.
+
+Signed-off-by: Michał Kępień <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/inftlcore.c                 | 6 +++---
+ drivers/mtd/mtdswap.c                   | 6 +++---
+ drivers/mtd/nand/onenand/onenand_base.c | 4 ++--
+ drivers/mtd/nand/onenand/onenand_bbt.c  | 2 +-
+ drivers/mtd/nand/raw/nand_bbt.c         | 8 ++++----
+ drivers/mtd/nand/raw/sm_common.c        | 2 +-
+ drivers/mtd/nftlcore.c                  | 6 +++---
+ drivers/mtd/sm_ftl.c                    | 4 ++--
+ drivers/mtd/ssfdc.c                     | 2 +-
+ drivers/mtd/tests/nandbiterrs.c         | 2 +-
+ drivers/mtd/tests/oobtest.c             | 8 ++++----
+ drivers/mtd/tests/readtest.c            | 2 +-
+ fs/jffs2/wbuf.c                         | 6 +++---
+ 13 files changed, 29 insertions(+), 29 deletions(-)
+
+--- a/drivers/mtd/inftlcore.c
++++ b/drivers/mtd/inftlcore.c
+@@ -136,7 +136,7 @@ static void inftl_remove_dev(struct mtd_
+ int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+ 		   size_t *retlen, uint8_t *buf)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+@@ -156,7 +156,7 @@ int inftl_read_oob(struct mtd_info *mtd,
+ int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
+ 		    size_t *retlen, uint8_t *buf)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+@@ -176,7 +176,7 @@ int inftl_write_oob(struct mtd_info *mtd
+ static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len,
+ 		       size_t *retlen, uint8_t *buf, uint8_t *oob)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+--- a/drivers/mtd/mtdswap.c
++++ b/drivers/mtd/mtdswap.c
+@@ -323,7 +323,7 @@ static int mtdswap_read_markers(struct m
+ 	struct mtdswap_oobdata *data, *data2;
+ 	int ret;
+ 	loff_t offset;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	offset = mtdswap_eb_offset(d, eb);
+ 
+@@ -370,7 +370,7 @@ static int mtdswap_write_marker(struct m
+ 	struct mtdswap_oobdata n;
+ 	int ret;
+ 	loff_t offset;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	ops.ooboffs = 0;
+ 	ops.oobbuf = (uint8_t *)&n;
+@@ -879,7 +879,7 @@ static unsigned int mtdswap_eblk_passes(
+ 	loff_t base, pos;
+ 	unsigned int *p1 = (unsigned int *)d->page_buf;
+ 	unsigned char *p2 = (unsigned char *)d->oob_buf;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int ret;
+ 
+ 	ops.mode = MTD_OPS_AUTO_OOB;
+--- a/drivers/mtd/nand/onenand/onenand_base.c
++++ b/drivers/mtd/nand/onenand/onenand_base.c
+@@ -2935,7 +2935,7 @@ static int do_otp_write(struct mtd_info
+ 	struct onenand_chip *this = mtd->priv;
+ 	unsigned char *pbuf = buf;
+ 	int ret;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	/* Force buffer page aligned */
+ 	if (len < mtd->writesize) {
+@@ -2977,7 +2977,7 @@ static int do_otp_lock(struct mtd_info *
+ 		size_t *retlen, u_char *buf)
+ {
+ 	struct onenand_chip *this = mtd->priv;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int ret;
+ 
+ 	if (FLEXONENAND(this)) {
+--- a/drivers/mtd/nand/onenand/onenand_bbt.c
++++ b/drivers/mtd/nand/onenand/onenand_bbt.c
+@@ -61,7 +61,7 @@ static int create_bbt(struct mtd_info *m
+ 	int startblock;
+ 	loff_t from;
+ 	size_t readlen, ooblen;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int rgn;
+ 
+ 	printk(KERN_INFO "Scanning device for bad blocks\n");
+--- a/drivers/mtd/nand/raw/nand_bbt.c
++++ b/drivers/mtd/nand/raw/nand_bbt.c
+@@ -313,7 +313,7 @@ static int scan_read_oob(struct nand_chi
+ 			 size_t len)
+ {
+ 	struct mtd_info *mtd = nand_to_mtd(this);
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res, ret = 0;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+@@ -354,7 +354,7 @@ static int scan_write_bbt(struct nand_ch
+ 			  uint8_t *buf, uint8_t *oob)
+ {
+ 	struct mtd_info *mtd = nand_to_mtd(this);
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+ 	ops.ooboffs = 0;
+@@ -416,7 +416,7 @@ static int scan_block_fast(struct nand_c
+ {
+ 	struct mtd_info *mtd = nand_to_mtd(this);
+ 
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int ret, page_offset;
+ 
+ 	ops.ooblen = mtd->oobsize;
+@@ -756,7 +756,7 @@ static int write_bbt(struct nand_chip *t
+ 	uint8_t rcode = td->reserved_block_code;
+ 	size_t retlen, len = 0;
+ 	loff_t to;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	ops.ooblen = mtd->oobsize;
+ 	ops.ooboffs = 0;
+--- a/drivers/mtd/nand/raw/sm_common.c
++++ b/drivers/mtd/nand/raw/sm_common.c
+@@ -99,7 +99,7 @@ static const struct mtd_ooblayout_ops oo
+ static int sm_block_markbad(struct nand_chip *chip, loff_t ofs)
+ {
+ 	struct mtd_info *mtd = nand_to_mtd(chip);
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	struct sm_oob oob;
+ 	int ret;
+ 
+--- a/drivers/mtd/nftlcore.c
++++ b/drivers/mtd/nftlcore.c
+@@ -124,7 +124,7 @@ int nftl_read_oob(struct mtd_info *mtd,
+ 		  size_t *retlen, uint8_t *buf)
+ {
+ 	loff_t mask = mtd->writesize - 1;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+@@ -145,7 +145,7 @@ int nftl_write_oob(struct mtd_info *mtd,
+ 		   size_t *retlen, uint8_t *buf)
+ {
+ 	loff_t mask = mtd->writesize - 1;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+@@ -168,7 +168,7 @@ static int nftl_write(struct mtd_info *m
+ 		      size_t *retlen, uint8_t *buf, uint8_t *oob)
+ {
+ 	loff_t mask = mtd->writesize - 1;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int res;
+ 
+ 	ops.mode = MTD_OPS_PLACE_OOB;
+--- a/drivers/mtd/sm_ftl.c
++++ b/drivers/mtd/sm_ftl.c
+@@ -239,7 +239,7 @@ static int sm_read_sector(struct sm_ftl
+ 			  uint8_t *buffer, struct sm_oob *oob)
+ {
+ 	struct mtd_info *mtd = ftl->trans->mtd;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	struct sm_oob tmp_oob;
+ 	int ret = -EIO;
+ 	int try = 0;
+@@ -323,7 +323,7 @@ static int sm_write_sector(struct sm_ftl
+ 			   int zone, int block, int boffset,
+ 			   uint8_t *buffer, struct sm_oob *oob)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	struct mtd_info *mtd = ftl->trans->mtd;
+ 	int ret;
+ 
+--- a/drivers/mtd/ssfdc.c
++++ b/drivers/mtd/ssfdc.c
+@@ -163,7 +163,7 @@ static int read_physical_sector(struct m
+ /* Read redundancy area (wrapper to MTD_READ_OOB */
+ static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int ret;
+ 
+ 	ops.mode = MTD_OPS_RAW;
+--- a/drivers/mtd/tests/nandbiterrs.c
++++ b/drivers/mtd/tests/nandbiterrs.c
+@@ -99,7 +99,7 @@ static int write_page(int log)
+ static int rewrite_page(int log)
+ {
+ 	int err = 0;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	if (log)
+ 		pr_info("rewrite page\n");
+--- a/drivers/mtd/tests/oobtest.c
++++ b/drivers/mtd/tests/oobtest.c
+@@ -56,7 +56,7 @@ static void do_vary_offset(void)
+ static int write_eraseblock(int ebnum)
+ {
+ 	int i;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int err = 0;
+ 	loff_t addr = (loff_t)ebnum * mtd->erasesize;
+ 
+@@ -165,7 +165,7 @@ static size_t memffshow(loff_t addr, lof
+ static int verify_eraseblock(int ebnum)
+ {
+ 	int i;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int err = 0;
+ 	loff_t addr = (loff_t)ebnum * mtd->erasesize;
+ 	size_t bitflips;
+@@ -260,7 +260,7 @@ static int verify_eraseblock(int ebnum)
+ 
+ static int verify_eraseblock_in_one_go(int ebnum)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int err = 0;
+ 	loff_t addr = (loff_t)ebnum * mtd->erasesize;
+ 	size_t len = mtd->oobavail * pgcnt;
+@@ -338,7 +338,7 @@ static int __init mtd_oobtest_init(void)
+ 	int err = 0;
+ 	unsigned int i;
+ 	uint64_t tmp;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	loff_t addr = 0, addr0;
+ 
+ 	printk(KERN_INFO "\n");
+--- a/drivers/mtd/tests/readtest.c
++++ b/drivers/mtd/tests/readtest.c
+@@ -47,7 +47,7 @@ static int read_eraseblock_by_page(int e
+ 				err = ret;
+ 		}
+ 		if (mtd->oobsize) {
+-			struct mtd_oob_ops ops;
++			struct mtd_oob_ops ops = { };
+ 
+ 			ops.mode      = MTD_OPS_PLACE_OOB;
+ 			ops.len       = 0;
+--- a/fs/jffs2/wbuf.c
++++ b/fs/jffs2/wbuf.c
+@@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_s
+ {
+ 	int i, ret;
+ 	int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 
+ 	ops.mode = MTD_OPS_AUTO_OOB;
+ 	ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
+@@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_s
+ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
+ 				 struct jffs2_eraseblock *jeb)
+ {
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
+ 
+ 	ops.mode = MTD_OPS_AUTO_OOB;
+@@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct
+ 				 struct jffs2_eraseblock *jeb)
+ {
+ 	int ret;
+-	struct mtd_oob_ops ops;
++	struct mtd_oob_ops ops = { };
+ 	int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
+ 
+ 	ops.mode = MTD_OPS_AUTO_OOB;

+ 172 - 0
target/linux/generic/backport-6.1/423-v6.1-0003-mtd-add-ECC-error-accounting-for-each-read-request.patch

@@ -0,0 +1,172 @@
+From 2ed18d818d1f7492172f8dd5904344c7d367e8ed Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <[email protected]>
+Date: Wed, 29 Jun 2022 14:57:36 +0200
+Subject: [PATCH 3/4] mtd: add ECC error accounting for each read request
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Extend struct mtd_req_stats with two new fields holding the number of
+corrected bitflips and uncorrectable errors detected during a read
+operation.  This is a prerequisite for ultimately passing those counters
+to user space, where they can be useful to applications for making
+better-informed choices about moving data around.
+
+Unlike 'max_bitflips' (which is set - in a common code path - to the
+return value of a function called while the MTD device's mutex is held),
+these counters have to be maintained in each MTD driver which defines
+the '_read_oob' callback because the statistics need to be calculated
+while the MTD device's mutex is held.
+
+Suggested-by: Boris Brezillon <[email protected]>
+Signed-off-by: Michał Kępień <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/devices/docg3.c             |  8 ++++++++
+ drivers/mtd/nand/onenand/onenand_base.c | 12 ++++++++++++
+ drivers/mtd/nand/raw/nand_base.c        | 10 ++++++++++
+ drivers/mtd/nand/spi/core.c             | 10 ++++++++++
+ include/linux/mtd/mtd.h                 |  2 ++
+ 5 files changed, 42 insertions(+)
+
+--- a/drivers/mtd/devices/docg3.c
++++ b/drivers/mtd/devices/docg3.c
+@@ -871,6 +871,7 @@ static int doc_read_oob(struct mtd_info
+ 	u8 *buf = ops->datbuf;
+ 	size_t len, ooblen, nbdata, nboob;
+ 	u8 hwecc[DOC_ECC_BCH_SIZE], eccconf1;
++	struct mtd_ecc_stats old_stats;
+ 	int max_bitflips = 0;
+ 
+ 	if (buf)
+@@ -895,6 +896,7 @@ static int doc_read_oob(struct mtd_info
+ 	ret = 0;
+ 	skip = from % DOC_LAYOUT_PAGE_SIZE;
+ 	mutex_lock(&docg3->cascade->lock);
++	old_stats = mtd->ecc_stats;
+ 	while (ret >= 0 && (len > 0 || ooblen > 0)) {
+ 		calc_block_sector(from - skip, &block0, &block1, &page, &ofs,
+ 			docg3->reliable);
+@@ -966,6 +968,12 @@ static int doc_read_oob(struct mtd_info
+ 	}
+ 
+ out:
++	if (ops->stats) {
++		ops->stats->uncorrectable_errors +=
++			mtd->ecc_stats.failed - old_stats.failed;
++		ops->stats->corrected_bitflips +=
++			mtd->ecc_stats.corrected - old_stats.corrected;
++	}
+ 	mutex_unlock(&docg3->cascade->lock);
+ 	return ret;
+ err_in_read:
+--- a/drivers/mtd/nand/onenand/onenand_base.c
++++ b/drivers/mtd/nand/onenand/onenand_base.c
+@@ -1440,6 +1440,7 @@ static int onenand_read_oob(struct mtd_i
+ 			    struct mtd_oob_ops *ops)
+ {
+ 	struct onenand_chip *this = mtd->priv;
++	struct mtd_ecc_stats old_stats;
+ 	int ret;
+ 
+ 	switch (ops->mode) {
+@@ -1453,12 +1454,23 @@ static int onenand_read_oob(struct mtd_i
+ 	}
+ 
+ 	onenand_get_device(mtd, FL_READING);
++
++	old_stats = mtd->ecc_stats;
++
+ 	if (ops->datbuf)
+ 		ret = ONENAND_IS_4KB_PAGE(this) ?
+ 			onenand_mlc_read_ops_nolock(mtd, from, ops) :
+ 			onenand_read_ops_nolock(mtd, from, ops);
+ 	else
+ 		ret = onenand_read_oob_nolock(mtd, from, ops);
++
++	if (ops->stats) {
++		ops->stats->uncorrectable_errors +=
++			mtd->ecc_stats.failed - old_stats.failed;
++		ops->stats->corrected_bitflips +=
++			mtd->ecc_stats.corrected - old_stats.corrected;
++	}
++
+ 	onenand_release_device(mtd);
+ 
+ 	return ret;
+--- a/drivers/mtd/nand/raw/nand_base.c
++++ b/drivers/mtd/nand/raw/nand_base.c
+@@ -3815,6 +3815,7 @@ static int nand_read_oob(struct mtd_info
+ 			 struct mtd_oob_ops *ops)
+ {
+ 	struct nand_chip *chip = mtd_to_nand(mtd);
++	struct mtd_ecc_stats old_stats;
+ 	int ret;
+ 
+ 	ops->retlen = 0;
+@@ -3826,11 +3827,20 @@ static int nand_read_oob(struct mtd_info
+ 
+ 	nand_get_device(chip);
+ 
++	old_stats = mtd->ecc_stats;
++
+ 	if (!ops->datbuf)
+ 		ret = nand_do_read_oob(chip, from, ops);
+ 	else
+ 		ret = nand_do_read_ops(chip, from, ops);
+ 
++	if (ops->stats) {
++		ops->stats->uncorrectable_errors +=
++			mtd->ecc_stats.failed - old_stats.failed;
++		ops->stats->corrected_bitflips +=
++			mtd->ecc_stats.corrected - old_stats.corrected;
++	}
++
+ 	nand_release_device(chip);
+ 	return ret;
+ }
+--- a/drivers/mtd/nand/spi/core.c
++++ b/drivers/mtd/nand/spi/core.c
+@@ -629,6 +629,7 @@ static int spinand_mtd_read(struct mtd_i
+ {
+ 	struct spinand_device *spinand = mtd_to_spinand(mtd);
+ 	struct nand_device *nand = mtd_to_nanddev(mtd);
++	struct mtd_ecc_stats old_stats;
+ 	unsigned int max_bitflips = 0;
+ 	struct nand_io_iter iter;
+ 	bool disable_ecc = false;
+@@ -640,6 +641,8 @@ static int spinand_mtd_read(struct mtd_i
+ 
+ 	mutex_lock(&spinand->lock);
+ 
++	old_stats = mtd->ecc_stats;
++
+ 	nanddev_io_for_each_page(nand, NAND_PAGE_READ, from, ops, &iter) {
+ 		if (disable_ecc)
+ 			iter.req.mode = MTD_OPS_RAW;
+@@ -662,6 +665,13 @@ static int spinand_mtd_read(struct mtd_i
+ 		ops->oobretlen += iter.req.ooblen;
+ 	}
+ 
++	if (ops->stats) {
++		ops->stats->uncorrectable_errors +=
++			mtd->ecc_stats.failed - old_stats.failed;
++		ops->stats->corrected_bitflips +=
++			mtd->ecc_stats.corrected - old_stats.corrected;
++	}
++
+ 	mutex_unlock(&spinand->lock);
+ 
+ 	if (ecc_failed && !ret)
+--- a/include/linux/mtd/mtd.h
++++ b/include/linux/mtd/mtd.h
+@@ -41,6 +41,8 @@ struct mtd_erase_region_info {
+ };
+ 
+ struct mtd_req_stats {
++	unsigned int uncorrectable_errors;
++	unsigned int corrected_bitflips;
+ 	unsigned int max_bitflips;
+ };
+ 

+ 321 - 0
target/linux/generic/backport-6.1/423-v6.1-0004-mtdchar-add-MEMREAD-ioctl.patch

@@ -0,0 +1,321 @@
+From 2c9745d36e04ac27161acd78514f647b9b587ad4 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <[email protected]>
+Date: Wed, 29 Jun 2022 14:57:37 +0200
+Subject: [PATCH 4/4] mtdchar: add MEMREAD ioctl
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+User-space applications making use of MTD devices via /dev/mtd*
+character devices currently have limited capabilities for reading data:
+
+  - only deprecated methods of accessing OOB layout information exist,
+
+  - there is no way to explicitly specify MTD operation mode to use; it
+    is auto-selected based on the MTD file mode (MTD_FILE_MODE_*) set
+    for the character device; in particular, this prevents using
+    MTD_OPS_AUTO_OOB for reads,
+
+  - all existing user-space interfaces which cause mtd_read() or
+    mtd_read_oob() to be called (via mtdchar_read() and
+    mtdchar_read_oob(), respectively) return success even when those
+    functions return -EUCLEAN or -EBADMSG; this renders user-space
+    applications using these interfaces unaware of any corrected
+    bitflips or uncorrectable ECC errors detected during reads.
+
+Note that the existing MEMWRITE ioctl allows the MTD operation mode to
+be explicitly set, allowing user-space applications to write page data
+and OOB data without requiring them to know anything about the OOB
+layout of the MTD device they are writing to (MTD_OPS_AUTO_OOB).  Also,
+the MEMWRITE ioctl does not mangle the return value of mtd_write_oob().
+
+Add a new ioctl, MEMREAD, which addresses the above issues.  It is
+intended to be a read-side counterpart of the existing MEMWRITE ioctl.
+Similarly to the latter, the read operation is performed in a loop which
+processes at most mtd->erasesize bytes in each iteration.  This is done
+to prevent unbounded memory allocations caused by calling kmalloc() with
+the 'size' argument taken directly from the struct mtd_read_req provided
+by user space.  However, the new ioctl is implemented so that the values
+it returns match those that would have been returned if just a single
+mtd_read_oob() call was issued to handle the entire read operation in
+one go.
+
+Note that while just returning -EUCLEAN or -EBADMSG to user space would
+already be a valid and useful indication of the ECC algorithm detecting
+errors during a read operation, that signal would not be granular enough
+to cover all use cases.  For example, knowing the maximum number of
+bitflips detected in a single ECC step during a read operation performed
+on a given page may be useful when dealing with an MTD partition whose
+ECC layout varies across pages (e.g. a partition consisting of a
+bootloader area using a "custom" ECC layout followed by data pages using
+a "standard" ECC layout).  To address that, include ECC statistics in
+the structure returned to user space by the new MEMREAD ioctl.
+
+Link: https://www.infradead.org/pipermail/linux-mtd/2016-April/067085.html
+
+Suggested-by: Boris Brezillon <[email protected]>
+Signed-off-by: Michał Kępień <[email protected]>
+Acked-by: Richard Weinberger <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdchar.c      | 139 +++++++++++++++++++++++++++++++++++++
+ include/uapi/mtd/mtd-abi.h |  64 +++++++++++++++--
+ 2 files changed, 198 insertions(+), 5 deletions(-)
+
+--- a/drivers/mtd/mtdchar.c
++++ b/drivers/mtd/mtdchar.c
+@@ -621,6 +621,137 @@ static int mtdchar_write_ioctl(struct mt
+ 	return ret;
+ }
+ 
++static int mtdchar_read_ioctl(struct mtd_info *mtd,
++		struct mtd_read_req __user *argp)
++{
++	struct mtd_info *master = mtd_get_master(mtd);
++	struct mtd_read_req req;
++	void __user *usr_data, *usr_oob;
++	uint8_t *datbuf = NULL, *oobbuf = NULL;
++	size_t datbuf_len, oobbuf_len;
++	size_t orig_len, orig_ooblen;
++	int ret = 0;
++
++	if (copy_from_user(&req, argp, sizeof(req)))
++		return -EFAULT;
++
++	orig_len = req.len;
++	orig_ooblen = req.ooblen;
++
++	usr_data = (void __user *)(uintptr_t)req.usr_data;
++	usr_oob = (void __user *)(uintptr_t)req.usr_oob;
++
++	if (!master->_read_oob)
++		return -EOPNOTSUPP;
++
++	if (!usr_data)
++		req.len = 0;
++
++	if (!usr_oob)
++		req.ooblen = 0;
++
++	req.ecc_stats.uncorrectable_errors = 0;
++	req.ecc_stats.corrected_bitflips = 0;
++	req.ecc_stats.max_bitflips = 0;
++
++	req.len &= 0xffffffff;
++	req.ooblen &= 0xffffffff;
++
++	if (req.start + req.len > mtd->size) {
++		ret = -EINVAL;
++		goto out;
++	}
++
++	datbuf_len = min_t(size_t, req.len, mtd->erasesize);
++	if (datbuf_len > 0) {
++		datbuf = kvmalloc(datbuf_len, GFP_KERNEL);
++		if (!datbuf) {
++			ret = -ENOMEM;
++			goto out;
++		}
++	}
++
++	oobbuf_len = min_t(size_t, req.ooblen, mtd->erasesize);
++	if (oobbuf_len > 0) {
++		oobbuf = kvmalloc(oobbuf_len, GFP_KERNEL);
++		if (!oobbuf) {
++			ret = -ENOMEM;
++			goto out;
++		}
++	}
++
++	while (req.len > 0 || (!usr_data && req.ooblen > 0)) {
++		struct mtd_req_stats stats;
++		struct mtd_oob_ops ops = {
++			.mode = req.mode,
++			.len = min_t(size_t, req.len, datbuf_len),
++			.ooblen = min_t(size_t, req.ooblen, oobbuf_len),
++			.datbuf = datbuf,
++			.oobbuf = oobbuf,
++			.stats = &stats,
++		};
++
++		/*
++		 * Shorten non-page-aligned, eraseblock-sized reads so that the
++		 * read ends on an eraseblock boundary.  This is necessary in
++		 * order to prevent OOB data for some pages from being
++		 * duplicated in the output of non-page-aligned reads requiring
++		 * multiple mtd_read_oob() calls to be completed.
++		 */
++		if (ops.len == mtd->erasesize)
++			ops.len -= mtd_mod_by_ws(req.start + ops.len, mtd);
++
++		ret = mtd_read_oob(mtd, (loff_t)req.start, &ops);
++
++		req.ecc_stats.uncorrectable_errors +=
++			stats.uncorrectable_errors;
++		req.ecc_stats.corrected_bitflips += stats.corrected_bitflips;
++		req.ecc_stats.max_bitflips =
++			max(req.ecc_stats.max_bitflips, stats.max_bitflips);
++
++		if (ret && !mtd_is_bitflip_or_eccerr(ret))
++			break;
++
++		if (copy_to_user(usr_data, ops.datbuf, ops.retlen) ||
++		    copy_to_user(usr_oob, ops.oobbuf, ops.oobretlen)) {
++			ret = -EFAULT;
++			break;
++		}
++
++		req.start += ops.retlen;
++		req.len -= ops.retlen;
++		usr_data += ops.retlen;
++
++		req.ooblen -= ops.oobretlen;
++		usr_oob += ops.oobretlen;
++	}
++
++	/*
++	 * As multiple iterations of the above loop (and therefore multiple
++	 * mtd_read_oob() calls) may be necessary to complete the read request,
++	 * adjust the final return code to ensure it accounts for all detected
++	 * ECC errors.
++	 */
++	if (!ret || mtd_is_bitflip(ret)) {
++		if (req.ecc_stats.uncorrectable_errors > 0)
++			ret = -EBADMSG;
++		else if (req.ecc_stats.corrected_bitflips > 0)
++			ret = -EUCLEAN;
++	}
++
++out:
++	req.len = orig_len - req.len;
++	req.ooblen = orig_ooblen - req.ooblen;
++
++	if (copy_to_user(argp, &req, sizeof(req)))
++		ret = -EFAULT;
++
++	kvfree(datbuf);
++	kvfree(oobbuf);
++
++	return ret;
++}
++
+ static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
+ {
+ 	struct mtd_file_info *mfi = file->private_data;
+@@ -643,6 +774,7 @@ static int mtdchar_ioctl(struct file *fi
+ 	case MEMGETINFO:
+ 	case MEMREADOOB:
+ 	case MEMREADOOB64:
++	case MEMREAD:
+ 	case MEMISLOCKED:
+ 	case MEMGETOOBSEL:
+ 	case MEMGETBADBLOCK:
+@@ -817,6 +949,13 @@ static int mtdchar_ioctl(struct file *fi
+ 		break;
+ 	}
+ 
++	case MEMREAD:
++	{
++		ret = mtdchar_read_ioctl(mtd,
++		      (struct mtd_read_req __user *)arg);
++		break;
++	}
++
+ 	case MEMLOCK:
+ 	{
+ 		struct erase_info_user einfo;
+--- a/include/uapi/mtd/mtd-abi.h
++++ b/include/uapi/mtd/mtd-abi.h
+@@ -55,9 +55,9 @@ struct mtd_oob_buf64 {
+  * @MTD_OPS_RAW:	data are transferred as-is, with no error correction;
+  *			this mode implies %MTD_OPS_PLACE_OOB
+  *
+- * These modes can be passed to ioctl(MEMWRITE) and are also used internally.
+- * See notes on "MTD file modes" for discussion on %MTD_OPS_RAW vs.
+- * %MTD_FILE_MODE_RAW.
++ * These modes can be passed to ioctl(MEMWRITE) and ioctl(MEMREAD); they are
++ * also used internally. See notes on "MTD file modes" for discussion on
++ * %MTD_OPS_RAW vs. %MTD_FILE_MODE_RAW.
+  */
+ enum {
+ 	MTD_OPS_PLACE_OOB = 0,
+@@ -91,6 +91,53 @@ struct mtd_write_req {
+ 	__u8 padding[7];
+ };
+ 
++/**
++ * struct mtd_read_req_ecc_stats - ECC statistics for a read operation
++ *
++ * @uncorrectable_errors: the number of uncorrectable errors that happened
++ *			  during the read operation
++ * @corrected_bitflips: the number of bitflips corrected during the read
++ *			operation
++ * @max_bitflips: the maximum number of bitflips detected in any single ECC
++ *		  step for the data read during the operation; this information
++ *		  can be used to decide whether the data stored in a specific
++ *		  region of the MTD device should be moved somewhere else to
++ *		  avoid data loss.
++ */
++struct mtd_read_req_ecc_stats {
++	__u32 uncorrectable_errors;
++	__u32 corrected_bitflips;
++	__u32 max_bitflips;
++};
++
++/**
++ * struct mtd_read_req - data structure for requesting a read operation
++ *
++ * @start:	start address
++ * @len:	length of data buffer (only lower 32 bits are used)
++ * @ooblen:	length of OOB buffer (only lower 32 bits are used)
++ * @usr_data:	user-provided data buffer
++ * @usr_oob:	user-provided OOB buffer
++ * @mode:	MTD mode (see "MTD operation modes")
++ * @padding:	reserved, must be set to 0
++ * @ecc_stats:	ECC statistics for the read operation
++ *
++ * This structure supports ioctl(MEMREAD) operations, allowing data and/or OOB
++ * reads in various modes. To read from OOB-only, set @usr_data == NULL, and to
++ * read data-only, set @usr_oob == NULL. However, setting both @usr_data and
++ * @usr_oob to NULL is not allowed.
++ */
++struct mtd_read_req {
++	__u64 start;
++	__u64 len;
++	__u64 ooblen;
++	__u64 usr_data;
++	__u64 usr_oob;
++	__u8 mode;
++	__u8 padding[7];
++	struct mtd_read_req_ecc_stats ecc_stats;
++};
++
+ #define MTD_ABSENT		0
+ #define MTD_RAM			1
+ #define MTD_ROM			2
+@@ -207,6 +254,12 @@ struct otp_info {
+ #define MEMWRITE		_IOWR('M', 24, struct mtd_write_req)
+ /* Erase a given range of user data (must be in mode %MTD_FILE_MODE_OTP_USER) */
+ #define OTPERASE		_IOW('M', 25, struct otp_info)
++/*
++ * Most generic read interface; can read in-band and/or out-of-band in various
++ * modes (see "struct mtd_read_req"). This ioctl is not supported for flashes
++ * without OOB, e.g., NOR flash.
++ */
++#define MEMREAD			_IOWR('M', 26, struct mtd_read_req)
+ 
+ /*
+  * Obsolete legacy interface. Keep it in order not to break userspace
+@@ -270,8 +323,9 @@ struct mtd_ecc_stats {
+  * Note: %MTD_FILE_MODE_RAW provides the same functionality as %MTD_OPS_RAW -
+  * raw access to the flash, without error correction or autoplacement schemes.
+  * Wherever possible, the MTD_OPS_* mode will override the MTD_FILE_MODE_* mode
+- * (e.g., when using ioctl(MEMWRITE)), but in some cases, the MTD_FILE_MODE is
+- * used out of necessity (e.g., `write()', ioctl(MEMWRITEOOB64)).
++ * (e.g., when using ioctl(MEMWRITE) or ioctl(MEMREAD)), but in some cases, the
++ * MTD_FILE_MODE is used out of necessity (e.g., `write()',
++ * ioctl(MEMWRITEOOB64)).
+  */
+ enum mtd_file_modes {
+ 	MTD_FILE_MODE_NORMAL = MTD_OTP_OFF,
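
To make the new interface concrete, here is a hedged user-space sketch. The
program, device path and page size are assumptions for illustration; it relies
only on the MEMREAD ioctl and struct mtd_read_req defined above and needs UAPI
headers that already contain this patch:

/* Hypothetical user-space example: data-only MEMREAD of the first page. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <mtd/mtd-user.h>

int main(void)
{
	uint8_t buf[2048];			/* assumes 2048-byte pages */
	struct mtd_read_req req = {
		.start    = 0,
		.len      = sizeof(buf),
		.usr_data = (uintptr_t)buf,
		.usr_oob  = 0,			/* data-only read */
		.mode     = MTD_OPS_AUTO_OOB,
	};
	int fd = open("/dev/mtd0", O_RDONLY);

	if (fd < 0 || ioctl(fd, MEMREAD, &req) < 0) {
		perror("MEMREAD");	/* -EUCLEAN/-EBADMSG arrive via errno */
		return 1;
	}

	printf("read %llu bytes, %u corrected bitflips, %u uncorrectable errors\n",
	       (unsigned long long)req.len,
	       req.ecc_stats.corrected_bitflips,
	       req.ecc_stats.uncorrectable_errors);
	return 0;
}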

+ 35 - 0
target/linux/generic/backport-6.1/423-v6.3-mtd-spinand-macronix-use-scratch-buffer-for-DMA-oper.patch

@@ -0,0 +1,35 @@
+From ebed787a0becb9354f0a23620a5130cccd6c730c Mon Sep 17 00:00:00 2001
+From: Daniel Golle <[email protected]>
+Date: Thu, 19 Jan 2023 03:45:43 +0000
+Subject: [PATCH] mtd: spinand: macronix: use scratch buffer for DMA operation
+
+The mx35lf1ge4ab_get_eccsr() function uses an SPI DMA operation to
+read the eccsr, hence the buffer should not be on the stack. Since commit
+380583227c0c7f ("spi: spi-mem: Add extra sanity checks on the op param")
+the kernel emits a warning and blocks such operations.
+
+Use the scratch buffer to get eccsr instead of trying to directly read
+into a stack-allocated variable.
+
+Signed-off-by: Daniel Golle <[email protected]>
+Reviewed-by: Dhruva Gole <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/nand/spi/macronix.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/mtd/nand/spi/macronix.c
++++ b/drivers/mtd/nand/spi/macronix.c
+@@ -83,9 +83,10 @@ static int mx35lf1ge4ab_ecc_get_status(s
+ 		 * in order to avoid forcing the wear-leveling layer to move
+ 		 * data around if it's not necessary.
+ 		 */
+-		if (mx35lf1ge4ab_get_eccsr(spinand, &eccsr))
++		if (mx35lf1ge4ab_get_eccsr(spinand, spinand->scratchbuf))
+ 			return nanddev_get_ecc_conf(nand)->strength;
+ 
++		eccsr = *spinand->scratchbuf;
+ 		if (WARN_ON(eccsr > nanddev_get_ecc_conf(nand)->strength ||
+ 			    !eccsr))
+ 			return nanddev_get_ecc_conf(nand)->strength;

+ 47 - 0
target/linux/generic/backport-6.1/424-v6.4-0004-mtd-core-prepare-mtd_otp_nvmem_add-to-handle-EPROBE_.patch

@@ -0,0 +1,47 @@
+From 281f7a6c1a33fffcde32001bacbb4f672140fbf9 Mon Sep 17 00:00:00 2001
+From: Michael Walle <[email protected]>
+Date: Wed, 8 Mar 2023 09:20:21 +0100
+Subject: [PATCH] mtd: core: prepare mtd_otp_nvmem_add() to handle
+ -EPROBE_DEFER
+
+NVMEM will soon gain support for nvmem layouts, and these might not be
+ready when nvmem_register() is called, so it might return -EPROBE_DEFER.
+Don't print the error message in this case.
+
+Signed-off-by: Michael Walle <[email protected]>
+Signed-off-by: Miquel Raynal <[email protected]>
+Link: https://lore.kernel.org/linux-mtd/[email protected]
+---
+ drivers/mtd/mtdcore.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/drivers/mtd/mtdcore.c
++++ b/drivers/mtd/mtdcore.c
+@@ -960,8 +960,8 @@ static int mtd_otp_nvmem_add(struct mtd_
+ 			nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size,
+ 						       mtd_nvmem_user_otp_reg_read);
+ 			if (IS_ERR(nvmem)) {
+-				dev_err(dev, "Failed to register OTP NVMEM device\n");
+-				return PTR_ERR(nvmem);
++				err = PTR_ERR(nvmem);
++				goto err;
+ 			}
+ 			mtd->otp_user_nvmem = nvmem;
+ 		}
+@@ -978,7 +978,6 @@ static int mtd_otp_nvmem_add(struct mtd_
+ 			nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size,
+ 						       mtd_nvmem_fact_otp_reg_read);
+ 			if (IS_ERR(nvmem)) {
+-				dev_err(dev, "Failed to register OTP NVMEM device\n");
+ 				err = PTR_ERR(nvmem);
+ 				goto err;
+ 			}
+@@ -991,7 +990,7 @@ static int mtd_otp_nvmem_add(struct mtd_
+ err:
+ 	if (mtd->otp_user_nvmem)
+ 		nvmem_unregister(mtd->otp_user_nvmem);
+-	return err;
++	return dev_err_probe(dev, err, "Failed to register OTP NVMEM device\n");
+ }
+ 
+ /**

+ 165 - 0
target/linux/generic/backport-6.1/600-v5.18-page_pool-Add-allocation-stats.patch

@@ -0,0 +1,165 @@
+From 8610037e8106b48c79cfe0afb92b2b2466e51c3d Mon Sep 17 00:00:00 2001
+From: Joe Damato <[email protected]>
+Date: Tue, 1 Mar 2022 23:55:47 -0800
+Subject: [PATCH] page_pool: Add allocation stats
+
+Add per-pool statistics counters for the allocation path of a page pool.
+These stats are incremented in softirq context, so no locking or per-cpu
+variables are needed.
+
+This code is disabled by default, and a kernel config option is provided for
+users who wish to enable it.
+
+The statistics added are:
+	- fast: successful fast path allocations
+	- slow: slow path order-0 allocations
+	- slow_high_order: slow path high order allocations
+	- empty: ptr ring is empty, so a slow path allocation was forced.
+	- refill: an allocation which triggered a refill of the cache
+	- waive: pages obtained from the ptr ring that cannot be added to
+	  the cache due to a NUMA mismatch.
+
+Signed-off-by: Joe Damato <[email protected]>
+Acked-by: Jesper Dangaard Brouer <[email protected]>
+Reviewed-by: Ilias Apalodimas <[email protected]>
+Signed-off-by: David S. Miller <[email protected]>
+---
+ include/net/page_pool.h | 18 ++++++++++++++++++
+ net/Kconfig             | 13 +++++++++++++
+ net/core/page_pool.c    | 24 ++++++++++++++++++++----
+ 3 files changed, 51 insertions(+), 4 deletions(-)
+
+--- a/include/net/page_pool.h
++++ b/include/net/page_pool.h
+@@ -82,6 +82,19 @@ struct page_pool_params {
+ 	unsigned int	offset;  /* DMA addr offset */
+ };
+ 
++#ifdef CONFIG_PAGE_POOL_STATS
++struct page_pool_alloc_stats {
++	u64 fast; /* fast path allocations */
++	u64 slow; /* slow-path order 0 allocations */
++	u64 slow_high_order; /* slow-path high order allocations */
++	u64 empty; /* failed refills due to empty ptr ring, forcing
++		    * slow path allocation
++		    */
++	u64 refill; /* allocations via successful refill */
++	u64 waive;  /* failed refills due to numa zone mismatch */
++};
++#endif
++
+ struct page_pool {
+ 	struct page_pool_params p;
+ 
+@@ -132,6 +145,11 @@ struct page_pool {
+ 	refcount_t user_cnt;
+ 
+ 	u64 destroy_cnt;
++
++#ifdef CONFIG_PAGE_POOL_STATS
++	/* these stats are incremented while in softirq context */
++	struct page_pool_alloc_stats alloc_stats;
++#endif
+ };
+ 
+ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
+--- a/net/Kconfig
++++ b/net/Kconfig
+@@ -434,6 +434,19 @@ config NET_DEVLINK
+ config PAGE_POOL
+ 	bool
+ 
++config PAGE_POOL_STATS
++	default n
++	bool "Page pool stats"
++	depends on PAGE_POOL
++	help
++	  Enable page pool statistics to track page allocation and recycling
++	  in page pools. This option incurs additional CPU cost in allocation
++	  and recycle paths and additional memory cost to store the statistics.
++	  These statistics are only available if this option is enabled and if
++	  the driver using the page pool supports exporting this data.
++
++	  If unsure, say N.
++
+ config FAILOVER
+ 	tristate "Generic failover module"
+ 	help
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -26,6 +26,13 @@
+ 
+ #define BIAS_MAX	LONG_MAX
+ 
++#ifdef CONFIG_PAGE_POOL_STATS
++/* alloc_stat_inc is intended to be used in softirq context */
++#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
++#else
++#define alloc_stat_inc(pool, __stat)
++#endif
++
+ static int page_pool_init(struct page_pool *pool,
+ 			  const struct page_pool_params *params)
+ {
+@@ -117,8 +124,10 @@ static struct page *page_pool_refill_all
+ 	int pref_nid; /* preferred NUMA node */
+ 
+ 	/* Quicker fallback, avoid locks when ring is empty */
+-	if (__ptr_ring_empty(r))
++	if (__ptr_ring_empty(r)) {
++		alloc_stat_inc(pool, empty);
+ 		return NULL;
++	}
+ 
+ 	/* Softirq guarantee CPU and thus NUMA node is stable. This,
+ 	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
+@@ -148,14 +157,17 @@ static struct page *page_pool_refill_all
+ 			 * This limit stress on page buddy alloactor.
+ 			 */
+ 			page_pool_return_page(pool, page);
++			alloc_stat_inc(pool, waive);
+ 			page = NULL;
+ 			break;
+ 		}
+ 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
+ 
+ 	/* Return last page */
+-	if (likely(pool->alloc.count > 0))
++	if (likely(pool->alloc.count > 0)) {
+ 		page = pool->alloc.cache[--pool->alloc.count];
++		alloc_stat_inc(pool, refill);
++	}
+ 
+ 	spin_unlock(&r->consumer_lock);
+ 	return page;
+@@ -170,6 +182,7 @@ static struct page *__page_pool_get_cach
+ 	if (likely(pool->alloc.count)) {
+ 		/* Fast-path */
+ 		page = pool->alloc.cache[--pool->alloc.count];
++		alloc_stat_inc(pool, fast);
+ 	} else {
+ 		page = page_pool_refill_alloc_cache(pool);
+ 	}
+@@ -241,6 +254,7 @@ static struct page *__page_pool_alloc_pa
+ 		return NULL;
+ 	}
+ 
++	alloc_stat_inc(pool, slow_high_order);
+ 	page_pool_set_pp_info(pool, page);
+ 
+ 	/* Track how many pages are held 'in-flight' */
+@@ -295,10 +309,12 @@ static struct page *__page_pool_alloc_pa
+ 	}
+ 
+ 	/* Return last page */
+-	if (likely(pool->alloc.count > 0))
++	if (likely(pool->alloc.count > 0)) {
+ 		page = pool->alloc.cache[--pool->alloc.count];
+-	else
++		alloc_stat_inc(pool, slow);
++	} else {
+ 		page = NULL;
++	}
+ 
+ 	/* When page just alloc'ed is should/must have refcnt 1. */
+ 	return page;
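
A note on enabling this: the option defaults to n, so a build that wants these counters has to switch it on in the kernel config. For an OpenWrt-style target that is a single symbol, e.g. (path illustrative):

    # target/linux/<target>/config-6.1
    CONFIG_PAGE_POOL_STATS=y

Without it, alloc_stat_inc() compiles away to nothing and struct page_pool carries no extra fields.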

+ 140 - 0
target/linux/generic/backport-6.1/601-v5.18-page_pool-Add-recycle-stats.patch

@@ -0,0 +1,140 @@
+From ad6fa1e1ab1b8164f1ba296b1b4dc556a483bcad Mon Sep 17 00:00:00 2001
+From: Joe Damato <[email protected]>
+Date: Tue, 1 Mar 2022 23:55:48 -0800
+Subject: [PATCH 2/3] page_pool: Add recycle stats
+
+Add per-cpu stats tracking page pool recycling events:
+	- cached: recycling placed page in the page pool cache
+	- cache_full: page pool cache was full
+	- ring: page placed into the ptr ring
+	- ring_full: page released from page pool because the ptr ring was full
+	- released_refcnt: page released (and not recycled) because refcnt > 1
+
+Signed-off-by: Joe Damato <[email protected]>
+Acked-by: Jesper Dangaard Brouer <[email protected]>
+Reviewed-by: Ilias Apalodimas <[email protected]>
+Signed-off-by: David S. Miller <[email protected]>
+---
+ include/net/page_pool.h | 16 ++++++++++++++++
+ net/core/page_pool.c    | 30 ++++++++++++++++++++++++++++--
+ 2 files changed, 44 insertions(+), 2 deletions(-)
+
+--- a/include/net/page_pool.h
++++ b/include/net/page_pool.h
+@@ -93,6 +93,18 @@ struct page_pool_alloc_stats {
+ 	u64 refill; /* allocations via successful refill */
+ 	u64 waive;  /* failed refills due to numa zone mismatch */
+ };
++
++struct page_pool_recycle_stats {
++	u64 cached;	/* recycling placed page in the cache. */
++	u64 cache_full; /* cache was full */
++	u64 ring;	/* recycling placed page back into ptr ring */
++	u64 ring_full;	/* page was released from page-pool because
++			 * PTR ring was full.
++			 */
++	u64 released_refcnt; /* page released because of elevated
++			      * refcnt
++			      */
++};
+ #endif
+ 
+ struct page_pool {
+@@ -136,6 +148,10 @@ struct page_pool {
+ 	 */
+ 	struct ptr_ring ring;
+ 
++#ifdef CONFIG_PAGE_POOL_STATS
++	/* recycle stats are per-cpu to avoid locking */
++	struct page_pool_recycle_stats __percpu *recycle_stats;
++#endif
+ 	atomic_t pages_state_release_cnt;
+ 
+ 	/* A page_pool is strictly tied to a single RX-queue being
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -29,8 +29,15 @@
+ #ifdef CONFIG_PAGE_POOL_STATS
+ /* alloc_stat_inc is intended to be used in softirq context */
+ #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
++/* recycle_stat_inc is safe to use when preemption is possible. */
++#define recycle_stat_inc(pool, __stat)							\
++	do {										\
++		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
++		this_cpu_inc(s->__stat);						\
++	} while (0)
+ #else
+ #define alloc_stat_inc(pool, __stat)
++#define recycle_stat_inc(pool, __stat)
+ #endif
+ 
+ static int page_pool_init(struct page_pool *pool,
+@@ -80,6 +87,12 @@ static int page_pool_init(struct page_po
+ 	    pool->p.flags & PP_FLAG_PAGE_FRAG)
+ 		return -EINVAL;
+ 
++#ifdef CONFIG_PAGE_POOL_STATS
++	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
++	if (!pool->recycle_stats)
++		return -ENOMEM;
++#endif
++
+ 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+ 		return -ENOMEM;
+ 
+@@ -412,7 +425,12 @@ static bool page_pool_recycle_in_ring(st
+ 	else
+ 		ret = ptr_ring_produce_bh(&pool->ring, page);
+ 
+-	return (ret == 0) ? true : false;
++	if (!ret) {
++		recycle_stat_inc(pool, ring);
++		return true;
++	}
++
++	return false;
+ }
+ 
+ /* Only allow direct recycling in special circumstances, into the
+@@ -423,11 +441,14 @@ static bool page_pool_recycle_in_ring(st
+ static bool page_pool_recycle_in_cache(struct page *page,
+ 				       struct page_pool *pool)
+ {
+-	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
++	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
++		recycle_stat_inc(pool, cache_full);
+ 		return false;
++	}
+ 
+ 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
+ 	pool->alloc.cache[pool->alloc.count++] = page;
++	recycle_stat_inc(pool, cached);
+ 	return true;
+ }
+ 
+@@ -482,6 +503,7 @@ __page_pool_put_page(struct page_pool *p
+ 	 * doing refcnt based recycle tricks, meaning another process
+ 	 * will be invoking put_page.
+ 	 */
++	recycle_stat_inc(pool, released_refcnt);
+ 	/* Do not replace this with page_pool_return_page() */
+ 	page_pool_release_page(pool, page);
+ 	put_page(page);
+@@ -495,6 +517,7 @@ void page_pool_put_page(struct page_pool
+ 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
+ 	if (page && !page_pool_recycle_in_ring(pool, page)) {
+ 		/* Cache full, fallback to free pages */
++		recycle_stat_inc(pool, ring_full);
+ 		page_pool_return_page(pool, page);
+ 	}
+ }
+@@ -641,6 +664,9 @@ static void page_pool_free(struct page_p
+ 	if (pool->p.flags & PP_FLAG_DMA_MAP)
+ 		put_device(pool->p.dev);
+ 
++#ifdef CONFIG_PAGE_POOL_STATS
++	free_percpu(pool->recycle_stats);
++#endif
+ 	kfree(pool);
+ }
+ 

+ 77 - 0
target/linux/generic/backport-6.1/602-v5.18-page_pool-Add-function-to-batch-and-return-stats.patch

@@ -0,0 +1,77 @@
+From 6b95e3388b1ea0ca63500c5a6e39162dbf828433 Mon Sep 17 00:00:00 2001
+From: Joe Damato <[email protected]>
+Date: Tue, 1 Mar 2022 23:55:49 -0800
+Subject: [PATCH 3/3] page_pool: Add function to batch and return stats
+
+Adds a function page_pool_get_stats which can be used by drivers to obtain
+stats for a specified page_pool.
+
+Signed-off-by: Joe Damato <[email protected]>
+Acked-by: Jesper Dangaard Brouer <[email protected]>
+Reviewed-by: Ilias Apalodimas <[email protected]>
+Signed-off-by: David S. Miller <[email protected]>
+---
+ include/net/page_pool.h | 17 +++++++++++++++++
+ net/core/page_pool.c    | 25 +++++++++++++++++++++++++
+ 2 files changed, 42 insertions(+)
+
+--- a/include/net/page_pool.h
++++ b/include/net/page_pool.h
+@@ -105,6 +105,23 @@ struct page_pool_recycle_stats {
+ 			      * refcnt
+ 			      */
+ };
++
++/* This struct wraps the above stats structs so users of the
++ * page_pool_get_stats API can pass a single argument when requesting the
++ * stats for the page pool.
++ */
++struct page_pool_stats {
++	struct page_pool_alloc_stats alloc_stats;
++	struct page_pool_recycle_stats recycle_stats;
++};
++
++/*
++ * Drivers that wish to harvest page pool stats and report them to users
++ * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
++ * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool.
++ */
++bool page_pool_get_stats(struct page_pool *pool,
++			 struct page_pool_stats *stats);
+ #endif
+ 
+ struct page_pool {
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -35,6 +35,31 @@
+ 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
+ 		this_cpu_inc(s->__stat);						\
+ 	} while (0)
++
++bool page_pool_get_stats(struct page_pool *pool,
++			 struct page_pool_stats *stats)
++{
++	int cpu = 0;
++
++	if (!stats)
++		return false;
++
++	memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
++
++	for_each_possible_cpu(cpu) {
++		const struct page_pool_recycle_stats *pcpu =
++			per_cpu_ptr(pool->recycle_stats, cpu);
++
++		stats->recycle_stats.cached += pcpu->cached;
++		stats->recycle_stats.cache_full += pcpu->cache_full;
++		stats->recycle_stats.ring += pcpu->ring;
++		stats->recycle_stats.ring_full += pcpu->ring_full;
++		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
++	}
++
++	return true;
++}
++EXPORT_SYMBOL(page_pool_get_stats);
+ #else
+ #define alloc_stat_inc(pool, __stat)
+ #define recycle_stat_inc(pool, __stat)
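
To illustrate the intended use (this snippet is not part of the patch): a driver that owns a single pool and is built with CONFIG_PAGE_POOL_STATS could snapshot the counters like this, where pool and dev are assumed to be the driver's page pool and net_device:

	struct page_pool_stats stats = {};

	if (page_pool_get_stats(pool, &stats))
		netdev_info(dev, "pp: fast=%llu slow=%llu cached=%llu ring=%llu\n",
			    stats.alloc_stats.fast, stats.alloc_stats.slow,
			    stats.recycle_stats.cached, stats.recycle_stats.ring);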

+ 55 - 0
target/linux/generic/backport-6.1/603-v5.19-page_pool-Add-recycle-stats-to-page_pool_put_page_bu.patch

@@ -0,0 +1,55 @@
+From 590032a4d2133ecc10d3078a8db1d85a4842f12c Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Mon, 11 Apr 2022 16:05:26 +0200
+Subject: [PATCH] page_pool: Add recycle stats to page_pool_put_page_bulk
+
+Add missing recycle stats to page_pool_put_page_bulk routine.
+
+Reviewed-by: Joe Damato <[email protected]>
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Reviewed-by: Ilias Apalodimas <[email protected]>
+Link: https://lore.kernel.org/r/3712178b51c007cfaed910ea80e68f00c916b1fa.1649685634.git.lorenzo@kernel.org
+Signed-off-by: Paolo Abeni <[email protected]>
+---
+ net/core/page_pool.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -36,6 +36,12 @@
+ 		this_cpu_inc(s->__stat);						\
+ 	} while (0)
+ 
++#define recycle_stat_add(pool, __stat, val)						\
++	do {										\
++		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
++		this_cpu_add(s->__stat, val);						\
++	} while (0)
++
+ bool page_pool_get_stats(struct page_pool *pool,
+ 			 struct page_pool_stats *stats)
+ {
+@@ -63,6 +69,7 @@ EXPORT_SYMBOL(page_pool_get_stats);
+ #else
+ #define alloc_stat_inc(pool, __stat)
+ #define recycle_stat_inc(pool, __stat)
++#define recycle_stat_add(pool, __stat, val)
+ #endif
+ 
+ static int page_pool_init(struct page_pool *pool,
+@@ -569,9 +576,13 @@ void page_pool_put_page_bulk(struct page
+ 	/* Bulk producer into ptr_ring page_pool cache */
+ 	page_pool_ring_lock(pool);
+ 	for (i = 0; i < bulk_len; i++) {
+-		if (__ptr_ring_produce(&pool->ring, data[i]))
+-			break; /* ring full */
++		if (__ptr_ring_produce(&pool->ring, data[i])) {
++			/* ring full */
++			recycle_stat_inc(pool, ring_full);
++			break;
++		}
+ 	}
++	recycle_stat_add(pool, ring, i);
+ 	page_pool_ring_unlock(pool);
+ 
+ 	/* Hopefully all pages was return into ptr_ring */

+ 147 - 0
target/linux/generic/backport-6.1/604-v5.19-net-page_pool-introduce-ethtool-stats.patch

@@ -0,0 +1,147 @@
+From f3c5264f452a5b0ac1de1f2f657efbabdea3c76a Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Tue, 12 Apr 2022 18:31:58 +0200
+Subject: [PATCH] net: page_pool: introduce ethtool stats
+
+Introduce page_pool APIs to report stats through ethtool and reduce
+duplicated code in each driver.
+
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Reviewed-by: Jakub Kicinski <[email protected]>
+Reviewed-by: Ilias Apalodimas <[email protected]>
+Signed-off-by: David S. Miller <[email protected]>
+---
+ include/net/page_pool.h | 21 ++++++++++++++
+ net/core/page_pool.c    | 63 ++++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 83 insertions(+), 1 deletion(-)
+
+--- a/include/net/page_pool.h
++++ b/include/net/page_pool.h
+@@ -115,6 +115,10 @@ struct page_pool_stats {
+ 	struct page_pool_recycle_stats recycle_stats;
+ };
+ 
++int page_pool_ethtool_stats_get_count(void);
++u8 *page_pool_ethtool_stats_get_strings(u8 *data);
++u64 *page_pool_ethtool_stats_get(u64 *data, void *stats);
++
+ /*
+  * Drivers that wish to harvest page pool stats and report them to users
+  * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
+@@ -122,6 +126,23 @@ struct page_pool_stats {
+  */
+ bool page_pool_get_stats(struct page_pool *pool,
+ 			 struct page_pool_stats *stats);
++#else
++
++static inline int page_pool_ethtool_stats_get_count(void)
++{
++	return 0;
++}
++
++static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
++{
++	return data;
++}
++
++static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
++{
++	return data;
++}
++
+ #endif
+ 
+ struct page_pool {
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -18,6 +18,7 @@
+ #include <linux/page-flags.h>
+ #include <linux/mm.h> /* for __put_page() */
+ #include <linux/poison.h>
++#include <linux/ethtool.h>
+ 
+ #include <trace/events/page_pool.h>
+ 
+@@ -42,6 +43,20 @@
+ 		this_cpu_add(s->__stat, val);						\
+ 	} while (0)
+ 
++static const char pp_stats[][ETH_GSTRING_LEN] = {
++	"rx_pp_alloc_fast",
++	"rx_pp_alloc_slow",
++	"rx_pp_alloc_slow_ho",
++	"rx_pp_alloc_empty",
++	"rx_pp_alloc_refill",
++	"rx_pp_alloc_waive",
++	"rx_pp_recycle_cached",
++	"rx_pp_recycle_cache_full",
++	"rx_pp_recycle_ring",
++	"rx_pp_recycle_ring_full",
++	"rx_pp_recycle_released_ref",
++};
++
+ bool page_pool_get_stats(struct page_pool *pool,
+ 			 struct page_pool_stats *stats)
+ {
+@@ -50,7 +65,13 @@ bool page_pool_get_stats(struct page_poo
+ 	if (!stats)
+ 		return false;
+ 
+-	memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
++	/* The caller is responsible to initialize stats. */
++	stats->alloc_stats.fast += pool->alloc_stats.fast;
++	stats->alloc_stats.slow += pool->alloc_stats.slow;
++	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
++	stats->alloc_stats.empty += pool->alloc_stats.empty;
++	stats->alloc_stats.refill += pool->alloc_stats.refill;
++	stats->alloc_stats.waive += pool->alloc_stats.waive;
+ 
+ 	for_each_possible_cpu(cpu) {
+ 		const struct page_pool_recycle_stats *pcpu =
+@@ -66,6 +87,46 @@ bool page_pool_get_stats(struct page_poo
+ 	return true;
+ }
+ EXPORT_SYMBOL(page_pool_get_stats);
++
++u8 *page_pool_ethtool_stats_get_strings(u8 *data)
++{
++	int i;
++
++	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
++		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
++		data += ETH_GSTRING_LEN;
++	}
++
++	return data;
++}
++EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
++
++int page_pool_ethtool_stats_get_count(void)
++{
++	return ARRAY_SIZE(pp_stats);
++}
++EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
++
++u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
++{
++	struct page_pool_stats *pool_stats = stats;
++
++	*data++ = pool_stats->alloc_stats.fast;
++	*data++ = pool_stats->alloc_stats.slow;
++	*data++ = pool_stats->alloc_stats.slow_high_order;
++	*data++ = pool_stats->alloc_stats.empty;
++	*data++ = pool_stats->alloc_stats.refill;
++	*data++ = pool_stats->alloc_stats.waive;
++	*data++ = pool_stats->recycle_stats.cached;
++	*data++ = pool_stats->recycle_stats.cache_full;
++	*data++ = pool_stats->recycle_stats.ring;
++	*data++ = pool_stats->recycle_stats.ring_full;
++	*data++ = pool_stats->recycle_stats.released_refcnt;
++
++	return data;
++}
++EXPORT_SYMBOL(page_pool_ethtool_stats_get);
++
+ #else
+ #define alloc_stat_inc(pool, __stat)
+ #define recycle_stat_inc(pool, __stat)
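
For reference, a rough sketch of how a driver could hook these three helpers into its ethtool_ops. Everything prefixed mydrv_, plus priv->rxq[] and priv->num_rx_rings, is made up for the example; the driver is assumed to select PAGE_POOL_STATS so that struct page_pool_stats and page_pool_get_stats() are available:

static int mydrv_get_sset_count(struct net_device *dev, int sset)
{
	if (sset == ETH_SS_STATS)
		return page_pool_ethtool_stats_get_count();

	return -EOPNOTSUPP;
}

static void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
{
	if (sset == ETH_SS_STATS)
		page_pool_ethtool_stats_get_strings(data);
}

static void mydrv_get_ethtool_stats(struct net_device *dev,
				    struct ethtool_stats *stats, u64 *data)
{
	struct mydrv_priv *priv = netdev_priv(dev);
	struct page_pool_stats pp_stats = {};
	int i;

	/* page_pool_get_stats() accumulates, so one struct covers all rings */
	for (i = 0; i < priv->num_rx_rings; i++)
		page_pool_get_stats(priv->rxq[i].page_pool, &pp_stats);

	data = page_pool_ethtool_stats_get(data, &pp_stats);
}

static const struct ethtool_ops mydrv_ethtool_ops = {
	.get_sset_count		= mydrv_get_sset_count,
	.get_strings		= mydrv_get_strings,
	.get_ethtool_stats	= mydrv_get_ethtool_stats,
};

If the driver also exports its own counters, the page pool values are simply appended after them; the strings and stats helpers return the advanced data pointer for exactly that reason.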

+ 99 - 0
target/linux/generic/backport-6.1/605-v5.18-xdp-introduce-flags-field-in-xdp_buff-xdp_frame.patch

@@ -0,0 +1,99 @@
+From 2e88d4ff03013937028f5397268b21e10cf68713 Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Fri, 21 Jan 2022 11:09:45 +0100
+Subject: [PATCH] xdp: introduce flags field in xdp_buff/xdp_frame
+
+Introduce flags field in xdp_frame and xdp_buffer data structures
+to define additional buffer features. At the moment the only
+supported buffer feature is frags bit (XDP_FLAGS_HAS_FRAGS).
+frags bit is used to specify if this is a linear buffer
+(XDP_FLAGS_HAS_FRAGS not set) or a frags frame (XDP_FLAGS_HAS_FRAGS
+set). In the latter case the driver is expected to initialize the
+skb_shared_info structure at the end of the first buffer to link together
+subsequent buffers belonging to the same frame.
+
+Acked-by: Toke Hoiland-Jorgensen <[email protected]>
+Acked-by: John Fastabend <[email protected]>
+Acked-by: Jesper Dangaard Brouer <[email protected]>
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Link: https://lore.kernel.org/r/e389f14f3a162c0a5bc6a2e1aa8dd01a90be117d.1642758637.git.lorenzo@kernel.org
+Signed-off-by: Alexei Starovoitov <[email protected]>
+---
+ include/net/xdp.h | 29 +++++++++++++++++++++++++++++
+ 1 file changed, 29 insertions(+)
+
+--- a/include/net/xdp.h
++++ b/include/net/xdp.h
+@@ -66,6 +66,10 @@ struct xdp_txq_info {
+ 	struct net_device *dev;
+ };
+ 
++enum xdp_buff_flags {
++	XDP_FLAGS_HAS_FRAGS	= BIT(0), /* non-linear xdp buff */
++};
++
+ struct xdp_buff {
+ 	void *data;
+ 	void *data_end;
+@@ -74,13 +78,30 @@ struct xdp_buff {
+ 	struct xdp_rxq_info *rxq;
+ 	struct xdp_txq_info *txq;
+ 	u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
++	u32 flags; /* supported values defined in xdp_buff_flags */
+ };
+ 
++static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
++{
++	return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS);
++}
++
++static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
++{
++	xdp->flags |= XDP_FLAGS_HAS_FRAGS;
++}
++
++static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
++{
++	xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
++}
++
+ static __always_inline void
+ xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
+ {
+ 	xdp->frame_sz = frame_sz;
+ 	xdp->rxq = rxq;
++	xdp->flags = 0;
+ }
+ 
+ static __always_inline void
+@@ -122,8 +143,14 @@ struct xdp_frame {
+ 	 */
+ 	struct xdp_mem_info mem;
+ 	struct net_device *dev_rx; /* used by cpumap */
++	u32 flags; /* supported values defined in xdp_buff_flags */
+ };
+ 
++static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
++{
++	return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
++}
++
+ #define XDP_BULK_QUEUE_SIZE	16
+ struct xdp_frame_bulk {
+ 	int count;
+@@ -180,6 +207,7 @@ void xdp_convert_frame_to_buff(struct xd
+ 	xdp->data_end = frame->data + frame->len;
+ 	xdp->data_meta = frame->data - frame->metasize;
+ 	xdp->frame_sz = frame->frame_sz;
++	xdp->flags = frame->flags;
+ }
+ 
+ static inline
+@@ -206,6 +234,7 @@ int xdp_update_frame_from_buff(struct xd
+ 	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
+ 	xdp_frame->metasize = metasize;
+ 	xdp_frame->frame_sz = xdp->frame_sz;
++	xdp_frame->flags = xdp->flags;
+ 
+ 	return 0;
+ }
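
A hedged driver-side sketch (not from the patch) of when the new flag gets set: after the usual xdp_init_buff()/xdp_prepare_buff() setup, the buffer is marked as multi-buffer once fragments have been placed in the skb_shared_info area. frame_sz, hard_start, headroom, len, nr_frags and rxq are assumed to come from the driver's RX descriptor parsing:

	struct xdp_buff xdp;

	xdp_init_buff(&xdp, frame_sz, &rxq->xdp_rxq);
	xdp_prepare_buff(&xdp, hard_start, headroom, len, true);

	if (nr_frags) {
		struct skb_shared_info *sinfo;

		sinfo = xdp_get_shared_info_from_buff(&xdp);
		sinfo->nr_frags = nr_frags;
		xdp_buff_set_frags_flag(&xdp);
	}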

+ 137 - 0
target/linux/generic/backport-6.1/606-v5.18-xdp-add-frags-support-to-xdp_return_-buff-frame.patch

@@ -0,0 +1,137 @@
+From 7c48cb0176c6d6d3b55029f7ff4ffa05faee6446 Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Fri, 21 Jan 2022 11:09:50 +0100
+Subject: [PATCH] xdp: add frags support to xdp_return_{buff/frame}
+
+Take into account if the received xdp_buff/xdp_frame is non-linear
+recycling/returning the frame memory to the allocator or into
+xdp_frame_bulk.
+
+Acked-by: Toke Hoiland-Jorgensen <[email protected]>
+Acked-by: John Fastabend <[email protected]>
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Link: https://lore.kernel.org/r/a961069febc868508ce1bdf5e53a343eb4e57cb2.1642758637.git.lorenzo@kernel.org
+Signed-off-by: Alexei Starovoitov <[email protected]>
+---
+ include/net/xdp.h | 18 ++++++++++++++--
+ net/core/xdp.c    | 54 ++++++++++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 69 insertions(+), 3 deletions(-)
+
+--- a/include/net/xdp.h
++++ b/include/net/xdp.h
+@@ -275,10 +275,24 @@ void __xdp_release_frame(void *data, str
+ static inline void xdp_release_frame(struct xdp_frame *xdpf)
+ {
+ 	struct xdp_mem_info *mem = &xdpf->mem;
++	struct skb_shared_info *sinfo;
++	int i;
+ 
+ 	/* Curr only page_pool needs this */
+-	if (mem->type == MEM_TYPE_PAGE_POOL)
+-		__xdp_release_frame(xdpf->data, mem);
++	if (mem->type != MEM_TYPE_PAGE_POOL)
++		return;
++
++	if (likely(!xdp_frame_has_frags(xdpf)))
++		goto out;
++
++	sinfo = xdp_get_shared_info_from_frame(xdpf);
++	for (i = 0; i < sinfo->nr_frags; i++) {
++		struct page *page = skb_frag_page(&sinfo->frags[i]);
++
++		__xdp_release_frame(page_address(page), mem);
++	}
++out:
++	__xdp_release_frame(xdpf->data, mem);
+ }
+ 
+ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+--- a/net/core/xdp.c
++++ b/net/core/xdp.c
+@@ -376,12 +376,38 @@ static void __xdp_return(void *data, str
+ 
+ void xdp_return_frame(struct xdp_frame *xdpf)
+ {
++	struct skb_shared_info *sinfo;
++	int i;
++
++	if (likely(!xdp_frame_has_frags(xdpf)))
++		goto out;
++
++	sinfo = xdp_get_shared_info_from_frame(xdpf);
++	for (i = 0; i < sinfo->nr_frags; i++) {
++		struct page *page = skb_frag_page(&sinfo->frags[i]);
++
++		__xdp_return(page_address(page), &xdpf->mem, false, NULL);
++	}
++out:
+ 	__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ }
+ EXPORT_SYMBOL_GPL(xdp_return_frame);
+ 
+ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+ {
++	struct skb_shared_info *sinfo;
++	int i;
++
++	if (likely(!xdp_frame_has_frags(xdpf)))
++		goto out;
++
++	sinfo = xdp_get_shared_info_from_frame(xdpf);
++	for (i = 0; i < sinfo->nr_frags; i++) {
++		struct page *page = skb_frag_page(&sinfo->frags[i]);
++
++		__xdp_return(page_address(page), &xdpf->mem, true, NULL);
++	}
++out:
+ 	__xdp_return(xdpf->data, &xdpf->mem, true, NULL);
+ }
+ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+@@ -417,7 +443,7 @@ void xdp_return_frame_bulk(struct xdp_fr
+ 	struct xdp_mem_allocator *xa;
+ 
+ 	if (mem->type != MEM_TYPE_PAGE_POOL) {
+-		__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
++		xdp_return_frame(xdpf);
+ 		return;
+ 	}
+ 
+@@ -436,12 +462,38 @@ void xdp_return_frame_bulk(struct xdp_fr
+ 		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ 	}
+ 
++	if (unlikely(xdp_frame_has_frags(xdpf))) {
++		struct skb_shared_info *sinfo;
++		int i;
++
++		sinfo = xdp_get_shared_info_from_frame(xdpf);
++		for (i = 0; i < sinfo->nr_frags; i++) {
++			skb_frag_t *frag = &sinfo->frags[i];
++
++			bq->q[bq->count++] = skb_frag_address(frag);
++			if (bq->count == XDP_BULK_QUEUE_SIZE)
++				xdp_flush_frame_bulk(bq);
++		}
++	}
+ 	bq->q[bq->count++] = xdpf->data;
+ }
+ EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+ 
+ void xdp_return_buff(struct xdp_buff *xdp)
+ {
++	struct skb_shared_info *sinfo;
++	int i;
++
++	if (likely(!xdp_buff_has_frags(xdp)))
++		goto out;
++
++	sinfo = xdp_get_shared_info_from_buff(xdp);
++	for (i = 0; i < sinfo->nr_frags; i++) {
++		struct page *page = skb_frag_page(&sinfo->frags[i]);
++
++		__xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
++	}
++out:
+ 	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
+ }
+ 
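
In practice this means (illustrative, not part of the patch) that a driver's verdict handling can keep calling the generic helper for the drop case and does not have to walk the fragment list itself:

	switch (act) {
	case XDP_DROP:
	default:
		/* releases the head page and, with this change, all frags */
		xdp_return_buff(&xdp);
		break;
	}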

+ 31 - 0
target/linux/generic/backport-6.1/607-v5.18-net-skbuff-add-size-metadata-to-skb_shared_info-for-.patch

@@ -0,0 +1,31 @@
+From d16697cb6261d4cc23422e6b1cb2759df8aa76d0 Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Fri, 21 Jan 2022 11:09:44 +0100
+Subject: [PATCH] net: skbuff: add size metadata to skb_shared_info for xdp
+
+Introduce xdp_frags_size field in skb_shared_info data structure
+to store xdp_buff/xdp_frame frame paged size (xdp_frags_size will
+be used in xdp frags support). In order to not increase
+skb_shared_info size we will use a hole due to skb_shared_info
+alignment.
+
+Acked-by: Toke Hoiland-Jorgensen <[email protected]>
+Acked-by: John Fastabend <[email protected]>
+Acked-by: Jesper Dangaard Brouer <[email protected]>
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Link: https://lore.kernel.org/r/8a849819a3e0a143d540f78a3a5add76e17e980d.1642758637.git.lorenzo@kernel.org
+Signed-off-by: Alexei Starovoitov <[email protected]>
+---
+ include/linux/skbuff.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -568,6 +568,7 @@ struct skb_shared_info {
+ 	 * Warning : all fields before dataref are cleared in __alloc_skb()
+ 	 */
+ 	atomic_t	dataref;
++	unsigned int	xdp_frags_size;
+ 
+ 	/* Intermediate layers must ensure that destructor_arg
+ 	 * remains valid until skb destructor */
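
Sketch of how a multi-buffer driver is expected to maintain the new field when it attaches a fragment to an xdp_buff (xdp, page, offset and frag_len are assumed to exist in the RX path):

	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(&xdp);
	skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags++];

	__skb_frag_set_page(frag, page);
	skb_frag_off_set(frag, offset);
	skb_frag_size_set(frag, frag_len);
	sinfo->xdp_frags_size += frag_len;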

+ 65 - 0
target/linux/generic/backport-6.1/608-v5.18-net-veth-Account-total-xdp_frame-len-running-ndo_xdp.patch

@@ -0,0 +1,65 @@
+From 5142239a22219921a7863cf00c9ab853c00689d8 Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Fri, 11 Mar 2022 10:14:18 +0100
+Subject: [PATCH] net: veth: Account total xdp_frame len running ndo_xdp_xmit
+
+Even if this is a theoretical issue since it is not possible to perform
+XDP_REDIRECT on a non-linear xdp_frame, veth driver does not account
+paged area in ndo_xdp_xmit function pointer.
+Introduce xdp_get_frame_len utility routine to get the xdp_frame full
+length and account total frame size running XDP_REDIRECT of a
+non-linear xdp frame into a veth device.
+
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Signed-off-by: Daniel Borkmann <[email protected]>
+Acked-by: Toke Hoiland-Jorgensen <[email protected]>
+Acked-by: John Fastabend <[email protected]>
+Link: https://lore.kernel.org/bpf/54f9fd3bb65d190daf2c0bbae2f852ff16cfbaa0.1646989407.git.lorenzo@kernel.org
+---
+ drivers/net/veth.c |  4 ++--
+ include/net/xdp.h  | 14 ++++++++++++++
+ 2 files changed, 16 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/veth.c
++++ b/drivers/net/veth.c
+@@ -501,7 +501,7 @@ static int veth_xdp_xmit(struct net_devi
+ 		struct xdp_frame *frame = frames[i];
+ 		void *ptr = veth_xdp_to_ptr(frame);
+ 
+-		if (unlikely(frame->len > max_len ||
++		if (unlikely(xdp_get_frame_len(frame) > max_len ||
+ 			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
+ 			break;
+ 		nxmit++;
+@@ -862,7 +862,7 @@ static int veth_xdp_rcv(struct veth_rq *
+ 			/* ndo_xdp_xmit */
+ 			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);
+ 
+-			stats->xdp_bytes += frame->len;
++			stats->xdp_bytes += xdp_get_frame_len(frame);
+ 			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
+ 			if (frame) {
+ 				/* XDP_PASS */
+--- a/include/net/xdp.h
++++ b/include/net/xdp.h
+@@ -295,6 +295,20 @@ out:
+ 	__xdp_release_frame(xdpf->data, mem);
+ }
+ 
++static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
++{
++	struct skb_shared_info *sinfo;
++	unsigned int len = xdpf->len;
++
++	if (likely(!xdp_frame_has_frags(xdpf)))
++		goto out;
++
++	sinfo = xdp_get_shared_info_from_frame(xdpf);
++	len += sinfo->xdp_frags_size;
++out:
++	return len;
++}
++
+ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ 		     struct net_device *dev, u32 queue_index, unsigned int napi_id);
+ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
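
The same helper is useful for any driver whose TX ring has a hard per-descriptor length limit; a rough ndo_xdp_xmit() loop with made-up mydrv_* names:

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		/* xdp_get_frame_len() also counts the paged area of frags */
		if (xdp_get_frame_len(xdpf) > MYDRV_MAX_XDP_LEN ||
		    mydrv_queue_xdp_frame(txq, xdpf))
			break;

		nxmit++;
	}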

+ 40 - 0
target/linux/generic/backport-6.1/609-v5.18-veth-Allow-jumbo-frames-in-xdp-mode.patch

@@ -0,0 +1,40 @@
+From 7cda76d858a4e71ac4a04066c093679a12e1312c Mon Sep 17 00:00:00 2001
+From: Lorenzo Bianconi <[email protected]>
+Date: Fri, 11 Mar 2022 10:14:20 +0100
+Subject: [PATCH] veth: Allow jumbo frames in xdp mode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Allow increasing the MTU over page boundaries on veth devices
+if the attached xdp program declares to support xdp fragments.
+
+Signed-off-by: Lorenzo Bianconi <[email protected]>
+Signed-off-by: Daniel Borkmann <[email protected]>
+Acked-by: Toke Høiland-Jørgensen <[email protected]>
+Acked-by: John Fastabend <[email protected]>
+Link: https://lore.kernel.org/bpf/d5dc039c3d4123426e7023a488c449181a7bc57f.1646989407.git.lorenzo@kernel.org
+---
+ drivers/net/veth.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/veth.c
++++ b/drivers/net/veth.c
+@@ -1471,9 +1471,14 @@ static int veth_xdp_set(struct net_devic
+ 			goto err;
+ 		}
+ 
+-		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
+-			  peer->hard_header_len -
+-			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
++		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
++			  peer->hard_header_len;
++		/* Allow increasing the max_mtu if the program supports
++		 * XDP fragments.
++		 */
++		//if (prog->aux->xdp_has_frags)
++		max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
++
+ 		if (peer->mtu > max_mtu) {
+ 			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
+ 			err = -ERANGE;

+ 56 - 0
target/linux/generic/backport-6.1/610-v6.3-net-page_pool-use-in_softirq-instead.patch

@@ -0,0 +1,56 @@
+From: Qingfang DENG <[email protected]>
+Date: Fri, 3 Feb 2023 09:16:11 +0800
+Subject: [PATCH] net: page_pool: use in_softirq() instead
+
+We use BH context only for synchronization, so we don't care if it's
+actually serving softirq or not.
+
+As a side node, in case of threaded NAPI, in_serving_softirq() will
+return false because it's in process context with BH off, making
+page_pool_recycle_in_cache() unreachable.
+
+Signed-off-by: Qingfang DENG <[email protected]>
+---
+
+--- a/include/net/page_pool.h
++++ b/include/net/page_pool.h
+@@ -357,7 +357,7 @@ static inline void page_pool_nid_changed
+ static inline void page_pool_ring_lock(struct page_pool *pool)
+ 	__acquires(&pool->ring.producer_lock)
+ {
+-	if (in_serving_softirq())
++	if (in_softirq())
+ 		spin_lock(&pool->ring.producer_lock);
+ 	else
+ 		spin_lock_bh(&pool->ring.producer_lock);
+@@ -366,7 +366,7 @@ static inline void page_pool_ring_lock(s
+ static inline void page_pool_ring_unlock(struct page_pool *pool)
+ 	__releases(&pool->ring.producer_lock)
+ {
+-	if (in_serving_softirq())
++	if (in_softirq())
+ 		spin_unlock(&pool->ring.producer_lock);
+ 	else
+ 		spin_unlock_bh(&pool->ring.producer_lock);
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -512,8 +512,8 @@ static void page_pool_return_page(struct
+ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
+ {
+ 	int ret;
+-	/* BH protection not needed if current is serving softirq */
+-	if (in_serving_softirq())
++	/* BH protection not needed if current is softirq */
++	if (in_softirq())
+ 		ret = ptr_ring_produce(&pool->ring, page);
+ 	else
+ 		ret = ptr_ring_produce_bh(&pool->ring, page);
+@@ -576,7 +576,7 @@ __page_pool_put_page(struct page_pool *p
+ 			page_pool_dma_sync_for_device(pool, page,
+ 						      dma_sync_size);
+ 
+-		if (allow_direct && in_serving_softirq() &&
++		if (allow_direct && in_softirq() &&
+ 		    page_pool_recycle_in_cache(page, pool))
+ 			return NULL;
+ 

+ 41 - 0
target/linux/generic/backport-6.1/611-v6.3-net-add-helper-eth_addr_add.patch

@@ -0,0 +1,41 @@
+From 7390609b0121a1b982c5ecdfcd72dc328e5784ee Mon Sep 17 00:00:00 2001
+From: Michael Walle <[email protected]>
+Date: Mon, 6 Feb 2023 13:43:42 +0000
+Subject: [PATCH] net: add helper eth_addr_add()
+
+Add a helper to add an offset to a ethernet address. This comes in handy
+if you have a base ethernet address for multiple interfaces.
+
+Signed-off-by: Michael Walle <[email protected]>
+Reviewed-by: Andrew Lunn <[email protected]>
+Acked-by: Jakub Kicinski <[email protected]>
+Signed-off-by: Srinivas Kandagatla <[email protected]>
+Link: https://lore.kernel.org/r/[email protected]
+Signed-off-by: Greg Kroah-Hartman <[email protected]>
+---
+ include/linux/etherdevice.h | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+--- a/include/linux/etherdevice.h
++++ b/include/linux/etherdevice.h
+@@ -478,6 +478,20 @@ static inline void eth_addr_inc(u8 *addr
+ }
+ 
+ /**
++ * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address.
++ *
++ * @offset: Offset to add.
++ * @addr: Pointer to a six-byte array containing Ethernet address to increment.
++ */
++static inline void eth_addr_add(u8 *addr, long offset)
++{
++	u64 u = ether_addr_to_u64(addr);
++
++	u += offset;
++	u64_to_ether_addr(u, addr);
++}
++
++/**
+  * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
+  * @dev: Pointer to a device structure
+  * @addr: Pointer to a six-byte array containing the Ethernet address
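
Typical use, sketched with assumed variable names: deriving the MAC address of the n-th interface from a base address, for example one read from NVMEM or OTP:

	u8 addr[ETH_ALEN];

	ether_addr_copy(addr, base_mac);
	eth_addr_add(addr, port_index);		/* base_mac + port_index */
	eth_hw_addr_set(netdev, addr);

The offset may also be negative, since the helper converts the address to a u64, adds, and converts back.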

+ 279 - 0
target/linux/generic/backport-6.1/700-v5.17-net-dsa-introduce-tagger-owned-storage-for-private.patch

@@ -0,0 +1,279 @@
+From dc452a471dbae8aca8257c565174212620880093 Mon Sep 17 00:00:00 2001
+From: Vladimir Oltean <[email protected]>
+Date: Fri, 10 Dec 2021 01:34:37 +0200
+Subject: net: dsa: introduce tagger-owned storage for private and shared data
+
+Ansuel is working on register access over Ethernet for the qca8k switch
+family. This requires the qca8k tagging protocol driver to receive
+frames which aren't intended for the network stack, but instead for the
+qca8k switch driver itself.
+
+The dp->priv is currently the prevailing method for passing data back
+and forth between the tagging protocol driver and the switch driver.
+However, this method is riddled with caveats.
+
+The DSA design allows in principle for any switch driver to return any
+protocol it desires in ->get_tag_protocol(). The dsa_loop driver can be
+modified to do just that. But in the current design, the memory behind
+dp->priv has to be allocated by the switch driver, so if the tagging
+protocol is paired to an unexpected switch driver, we may end up in NULL
+pointer dereferences inside the kernel, or worse (a switch driver may
+allocate dp->priv according to the expectations of a different tagger).
+
+The latter possibility is even more plausible considering that DSA
+switches can dynamically change tagging protocols in certain cases
+(dsa <-> edsa, ocelot <-> ocelot-8021q), and the current design lends
+itself to mistakes that are all too easy to make.
+
+This patch proposes that the tagging protocol driver should manage its
+own memory, instead of relying on the switch driver to do so.
+After analyzing the different in-tree needs, it can be observed that the
+required tagger storage is per switch, therefore a ds->tagger_data
+pointer is introduced. In principle, per-port storage could also be
+introduced, although there is no need for it at the moment. Future
+changes will replace the current usage of dp->priv with ds->tagger_data.
+
+We define a "binding" event between the DSA switch tree and the tagging
+protocol. During this binding event, the tagging protocol's ->connect()
+method is called first, and this may allocate some memory for each
+switch of the tree. Then a cross-chip notifier is emitted for the
+switches within that tree, and they are given the opportunity to fix up
+the tagger's memory (for example, they might set up some function
+pointers that represent virtual methods for consuming packets).
+Because the memory is owned by the tagger, there exists a ->disconnect()
+method for the tagger (which is the place to free the resources), but
+there doesn't exist a ->disconnect() method for the switch driver.
+This is part of the design. The switch driver should make minimal use of
+the public part of the tagger data, and only after type-checking it
+using the supplied "proto" argument.
+
+In the code there are in fact two binding events, one is the initial
+event in dsa_switch_setup_tag_protocol(). At this stage, the cross chip
+notifier chains aren't initialized, so we call each switch's connect()
+method by hand. Then there is dsa_tree_bind_tag_proto() during
+dsa_tree_change_tag_proto(), and here we have an old protocol and a new
+one. We first connect to the new one before disconnecting from the old
+one, to simplify error handling a bit and to ensure we remain in a valid
+state at all times.
+
+Co-developed-by: Ansuel Smith <[email protected]>
+Signed-off-by: Ansuel Smith <[email protected]>
+Signed-off-by: Vladimir Oltean <[email protected]>
+Signed-off-by: David S. Miller <[email protected]>
+---
+ include/net/dsa.h  | 12 +++++++++
+ net/dsa/dsa2.c     | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++---
+ net/dsa/dsa_priv.h |  1 +
+ net/dsa/switch.c   | 14 +++++++++++
+ 4 files changed, 96 insertions(+), 4 deletions(-)
+
+--- a/include/net/dsa.h
++++ b/include/net/dsa.h
+@@ -80,12 +80,15 @@ enum dsa_tag_protocol {
+ };
+ 
+ struct dsa_switch;
++struct dsa_switch_tree;
+ 
+ struct dsa_device_ops {
+ 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
+ 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
+ 	void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
+ 			     int *offset);
++	int (*connect)(struct dsa_switch_tree *dst);
++	void (*disconnect)(struct dsa_switch_tree *dst);
+ 	unsigned int needed_headroom;
+ 	unsigned int needed_tailroom;
+ 	const char *name;
+@@ -329,6 +332,8 @@ struct dsa_switch {
+ 	 */
+ 	void *priv;
+ 
++	void *tagger_data;
++
+ 	/*
+ 	 * Configuration data for this switch.
+ 	 */
+@@ -584,6 +589,13 @@ struct dsa_switch_ops {
+ 						  enum dsa_tag_protocol mprot);
+ 	int	(*change_tag_protocol)(struct dsa_switch *ds, int port,
+ 				       enum dsa_tag_protocol proto);
++	/*
++	 * Method for switch drivers to connect to the tagging protocol driver
++	 * in current use. The switch driver can provide handlers for certain
++	 * types of packets for switch management.
++	 */
++	int	(*connect_tag_protocol)(struct dsa_switch *ds,
++					enum dsa_tag_protocol proto);
+ 
+ 	/* Optional switch-wide initialization and destruction methods */
+ 	int	(*setup)(struct dsa_switch *ds);
+--- a/net/dsa/dsa2.c
++++ b/net/dsa/dsa2.c
+@@ -230,8 +230,12 @@ static struct dsa_switch_tree *dsa_tree_
+ 
+ static void dsa_tree_free(struct dsa_switch_tree *dst)
+ {
+-	if (dst->tag_ops)
++	if (dst->tag_ops) {
++		if (dst->tag_ops->disconnect)
++			dst->tag_ops->disconnect(dst);
++
+ 		dsa_tag_driver_put(dst->tag_ops);
++	}
+ 	list_del(&dst->list);
+ 	kfree(dst);
+ }
+@@ -805,7 +809,7 @@ static int dsa_switch_setup_tag_protocol
+ 	int port, err;
+ 
+ 	if (tag_ops->proto == dst->default_proto)
+-		return 0;
++		goto connect;
+ 
+ 	for (port = 0; port < ds->num_ports; port++) {
+ 		if (!dsa_is_cpu_port(ds, port))
+@@ -821,6 +825,17 @@ static int dsa_switch_setup_tag_protocol
+ 		}
+ 	}
+ 
++connect:
++	if (ds->ops->connect_tag_protocol) {
++		err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
++		if (err) {
++			dev_err(ds->dev,
++				"Unable to connect to tag protocol \"%s\": %pe\n",
++				tag_ops->name, ERR_PTR(err));
++			return err;
++		}
++	}
++
+ 	return 0;
+ }
+ 
+@@ -1132,6 +1147,46 @@ static void dsa_tree_teardown(struct dsa
+ 	dst->setup = false;
+ }
+ 
++static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
++				   const struct dsa_device_ops *tag_ops)
++{
++	const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
++	struct dsa_notifier_tag_proto_info info;
++	int err;
++
++	dst->tag_ops = tag_ops;
++
++	/* Notify the new tagger about the connection to this tree */
++	if (tag_ops->connect) {
++		err = tag_ops->connect(dst);
++		if (err)
++			goto out_revert;
++	}
++
++	/* Notify the switches from this tree about the connection
++	 * to the new tagger
++	 */
++	info.tag_ops = tag_ops;
++	err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
++	if (err && err != -EOPNOTSUPP)
++		goto out_disconnect;
++
++	/* Notify the old tagger about the disconnection from this tree */
++	if (old_tag_ops->disconnect)
++		old_tag_ops->disconnect(dst);
++
++	return 0;
++
++out_disconnect:
++	/* Revert the new tagger's connection to this tree */
++	if (tag_ops->disconnect)
++		tag_ops->disconnect(dst);
++out_revert:
++	dst->tag_ops = old_tag_ops;
++
++	return err;
++}
++
+ /* Since the dsa/tagging sysfs device attribute is per master, the assumption
+  * is that all DSA switches within a tree share the same tagger, otherwise
+  * they would have formed disjoint trees (different "dsa,member" values).
+@@ -1164,12 +1219,15 @@ int dsa_tree_change_tag_proto(struct dsa
+ 			goto out_unlock;
+ 	}
+ 
++	/* Notify the tag protocol change */
+ 	info.tag_ops = tag_ops;
+ 	err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+ 	if (err)
+-		goto out_unwind_tagger;
++		return err;
+ 
+-	dst->tag_ops = tag_ops;
++	err = dsa_tree_bind_tag_proto(dst, tag_ops);
++	if (err)
++		goto out_unwind_tagger;
+ 
+ 	rtnl_unlock();
+ 
+@@ -1257,6 +1315,7 @@ static int dsa_port_parse_cpu(struct dsa
+ 	struct dsa_switch *ds = dp->ds;
+ 	struct dsa_switch_tree *dst = ds->dst;
+ 	enum dsa_tag_protocol default_proto;
++	int err;
+ 
+ 	/* Find out which protocol the switch would prefer. */
+ 	default_proto = dsa_get_tag_protocol(dp, master);
+@@ -1311,6 +1370,12 @@ static int dsa_port_parse_cpu(struct dsa
+ 		 */
+ 		dsa_tag_driver_put(tag_ops);
+ 	} else {
++		if (tag_ops->connect) {
++			err = tag_ops->connect(dst);
++			if (err)
++				return err;
++		}
++
+ 		dst->tag_ops = tag_ops;
+ 	}
+ 
+--- a/net/dsa/dsa_priv.h
++++ b/net/dsa/dsa_priv.h
+@@ -37,6 +37,7 @@ enum {
+ 	DSA_NOTIFIER_VLAN_DEL,
+ 	DSA_NOTIFIER_MTU,
+ 	DSA_NOTIFIER_TAG_PROTO,
++	DSA_NOTIFIER_TAG_PROTO_CONNECT,
+ 	DSA_NOTIFIER_MRP_ADD,
+ 	DSA_NOTIFIER_MRP_DEL,
+ 	DSA_NOTIFIER_MRP_ADD_RING_ROLE,
+--- a/net/dsa/switch.c
++++ b/net/dsa/switch.c
+@@ -616,6 +616,17 @@ static int dsa_switch_change_tag_proto(s
+ 	return 0;
+ }
+ 
++static int dsa_switch_connect_tag_proto(struct dsa_switch *ds,
++					struct dsa_notifier_tag_proto_info *info)
++{
++	const struct dsa_device_ops *tag_ops = info->tag_ops;
++
++	if (!ds->ops->connect_tag_protocol)
++		return -EOPNOTSUPP;
++
++	return ds->ops->connect_tag_protocol(ds, tag_ops->proto);
++}
++
+ static int dsa_switch_mrp_add(struct dsa_switch *ds,
+ 			      struct dsa_notifier_mrp_info *info)
+ {
+@@ -735,6 +746,9 @@ static int dsa_switch_event(struct notif
+ 	case DSA_NOTIFIER_TAG_PROTO:
+ 		err = dsa_switch_change_tag_proto(ds, info);
+ 		break;
++	case DSA_NOTIFIER_TAG_PROTO_CONNECT:
++		err = dsa_switch_connect_tag_proto(ds, info);
++		break;
+ 	case DSA_NOTIFIER_MRP_ADD:
+ 		err = dsa_switch_mrp_add(ds, info);
+ 		break;
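
A hedged sketch of the switch-driver side of this contract; struct mytag_data, the handler and the chosen protocol are all placeholders, not taken from the patch. The idea is to type-check proto first and only then touch the tagger-owned storage:

/* Normally shared between tagger and switch driver via a header. */
struct mytag_data {
	void (*rx_mgmt)(struct dsa_switch *ds, struct sk_buff *skb);
};

static void mydrv_handle_mgmt_frame(struct dsa_switch *ds, struct sk_buff *skb)
{
	/* consume switch management traffic here */
	kfree_skb(skb);
}

static int mydrv_connect_tag_protocol(struct dsa_switch *ds,
				      enum dsa_tag_protocol proto)
{
	struct mytag_data *td = ds->tagger_data;

	if (proto != DSA_TAG_PROTO_OCELOT_8021Q)	/* whichever proto the driver expects */
		return -EOPNOTSUPP;

	td->rx_mgmt = mydrv_handle_mgmt_frame;

	return 0;
}

The function is then plugged into the driver's dsa_switch_ops as .connect_tag_protocol.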

+ 274 - 0
target/linux/generic/backport-6.1/701-v5.17-dsa-make-tagging-protocols-connect-to-individual-switches.patch

@@ -0,0 +1,274 @@
+From 7f2973149c22e7a6fee4c0c9fa6b8e4108e9c208 Mon Sep 17 00:00:00 2001
+From: Vladimir Oltean <[email protected]>
+Date: Tue, 14 Dec 2021 03:45:36 +0200
+Subject: net: dsa: make tagging protocols connect to individual switches from
+ a tree
+
+On the NXP Bluebox 3 board which uses a multi-switch setup with sja1105,
+the mechanism through which the tagger connects to the switch tree is
+broken, due to improper DSA code design. At the time when tag_ops->connect()
+is called in dsa_port_parse_cpu(), DSA hasn't finished "touching" all
+the ports, so it doesn't know how large the tree is and how many ports
+it has. It has just seen the first CPU port by this time. As a result,
+this function will call the tagger's ->connect method too early, and the
+tagger will connect only to the first switch from the tree.
+
+This could be perhaps addressed a bit more simply by just moving the
+tag_ops->connect(dst) call a bit later (for example in dsa_tree_setup),
+but there is already a design inconsistency at present: on the switch
+side, the notification is on a per-switch basis, but on the tagger side,
+it is on a per-tree basis. Furthermore, the persistent storage itself is
+per switch (ds->tagger_data). And the tagger connect and disconnect
+procedures (at least the ones that exist currently) could see a fair bit
+of simplification if they didn't have to iterate through the switches of
+a tree.
+
+To fix the issue, this change transforms tag_ops->connect(dst) into
+tag_ops->connect(ds) and moves it somewhere where we already iterate
+over all switches of a tree. That is in dsa_switch_setup_tag_protocol(),
+which is a good placement because we already have there the connection
+call to the switch side of things.
+
+As for the dsa_tree_bind_tag_proto() method (called from the code path
+that changes the tag protocol), things are a bit more complicated
+because we receive the tree as argument, yet when we unwind on errors,
+it would be nice to not call tag_ops->disconnect(ds) where we didn't
+previously call tag_ops->connect(ds). We didn't have this problem before
+because the tag_ops connection operations passed the entire dst before,
+and this is more fine grained now. To solve the error rewind case using
+the new API, we have to create yet one more cross-chip notifier for
+disconnection, and stay connected with the old tag protocol to all the
+switches in the tree until we've succeeded to connect with the new one
+as well. So if something fails half way, the whole tree is still
+connected to the old tagger. But there may still be leaks if the tagger
+fails to connect to the 2nd out of 3 switches in a tree: somebody needs
+to tell the tagger to disconnect from the first switch. Nothing comes
+for free, and this was previously handled privately by the tagging
+protocol driver before, but now we need to emit a disconnect cross-chip
+notifier for that, because DSA has to take care of the unwind path. We
+assume that the tagging protocol has connected to a switch if it has set
+ds->tagger_data to something, otherwise we avoid calling its
+disconnection method in the error rewind path.
+
+The rest of the changes are in the tagging protocol drivers, and have to
+do with the replacement of dst with ds. The iteration is removed and the
+error unwind path is simplified, as mentioned above.
+
+Signed-off-by: Vladimir Oltean <[email protected]>
+Signed-off-by: David S. Miller <[email protected]>
+---
+ include/net/dsa.h          |  5 ++--
+ net/dsa/dsa2.c             | 44 +++++++++++++-----------------
+ net/dsa/dsa_priv.h         |  1 +
+ net/dsa/switch.c           | 52 ++++++++++++++++++++++++++++++++---
+ net/dsa/tag_ocelot_8021q.c | 53 +++++++++++-------------------------
+ net/dsa/tag_sja1105.c      | 67 ++++++++++++++++------------------------------
+ 6 files changed, 109 insertions(+), 113 deletions(-)
+
+--- a/include/net/dsa.h
++++ b/include/net/dsa.h
+@@ -80,15 +80,14 @@ enum dsa_tag_protocol {
+ };
+ 
+ struct dsa_switch;
+-struct dsa_switch_tree;
+ 
+ struct dsa_device_ops {
+ 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
+ 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
+ 	void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
+ 			     int *offset);
+-	int (*connect)(struct dsa_switch_tree *dst);
+-	void (*disconnect)(struct dsa_switch_tree *dst);
++	int (*connect)(struct dsa_switch *ds);
++	void (*disconnect)(struct dsa_switch *ds);
+ 	unsigned int needed_headroom;
+ 	unsigned int needed_tailroom;
+ 	const char *name;
+--- a/net/dsa/dsa2.c
++++ b/net/dsa/dsa2.c
+@@ -230,12 +230,8 @@ static struct dsa_switch_tree *dsa_tree_
+ 
+ static void dsa_tree_free(struct dsa_switch_tree *dst)
+ {
+-	if (dst->tag_ops) {
+-		if (dst->tag_ops->disconnect)
+-			dst->tag_ops->disconnect(dst);
+-
++	if (dst->tag_ops)
+ 		dsa_tag_driver_put(dst->tag_ops);
+-	}
+ 	list_del(&dst->list);
+ 	kfree(dst);
+ }
+@@ -826,17 +822,29 @@ static int dsa_switch_setup_tag_protocol
+ 	}
+ 
+ connect:
++	if (tag_ops->connect) {
++		err = tag_ops->connect(ds);
++		if (err)
++			return err;
++	}
++
+ 	if (ds->ops->connect_tag_protocol) {
+ 		err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ 		if (err) {
+ 			dev_err(ds->dev,
+ 				"Unable to connect to tag protocol \"%s\": %pe\n",
+ 				tag_ops->name, ERR_PTR(err));
+-			return err;
++			goto disconnect;
+ 		}
+ 	}
+ 
+ 	return 0;
++
++disconnect:
++	if (tag_ops->disconnect)
++		tag_ops->disconnect(ds);
++
++	return err;
+ }
+ 
+ static int dsa_switch_setup(struct dsa_switch *ds)
+@@ -1156,13 +1164,6 @@ static int dsa_tree_bind_tag_proto(struc
+ 
+ 	dst->tag_ops = tag_ops;
+ 
+-	/* Notify the new tagger about the connection to this tree */
+-	if (tag_ops->connect) {
+-		err = tag_ops->connect(dst);
+-		if (err)
+-			goto out_revert;
+-	}
+-
+ 	/* Notify the switches from this tree about the connection
+ 	 * to the new tagger
+ 	 */
+@@ -1172,16 +1173,14 @@ static int dsa_tree_bind_tag_proto(struc
+ 		goto out_disconnect;
+ 
+ 	/* Notify the old tagger about the disconnection from this tree */
+-	if (old_tag_ops->disconnect)
+-		old_tag_ops->disconnect(dst);
++	info.tag_ops = old_tag_ops;
++	dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+ 
+ 	return 0;
+ 
+ out_disconnect:
+-	/* Revert the new tagger's connection to this tree */
+-	if (tag_ops->disconnect)
+-		tag_ops->disconnect(dst);
+-out_revert:
++	info.tag_ops = tag_ops;
++	dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+ 	dst->tag_ops = old_tag_ops;
+ 
+ 	return err;
+@@ -1315,7 +1314,6 @@ static int dsa_port_parse_cpu(struct dsa
+ 	struct dsa_switch *ds = dp->ds;
+ 	struct dsa_switch_tree *dst = ds->dst;
+ 	enum dsa_tag_protocol default_proto;
+-	int err;
+ 
+ 	/* Find out which protocol the switch would prefer. */
+ 	default_proto = dsa_get_tag_protocol(dp, master);
+@@ -1370,12 +1368,6 @@ static int dsa_port_parse_cpu(struct dsa
+ 		 */
+ 		dsa_tag_driver_put(tag_ops);
+ 	} else {
+-		if (tag_ops->connect) {
+-			err = tag_ops->connect(dst);
+-			if (err)
+-				return err;
+-		}
+-
+ 		dst->tag_ops = tag_ops;
+ 	}
+ 
+--- a/net/dsa/dsa_priv.h
++++ b/net/dsa/dsa_priv.h
+@@ -38,6 +38,7 @@ enum {
+ 	DSA_NOTIFIER_MTU,
+ 	DSA_NOTIFIER_TAG_PROTO,
+ 	DSA_NOTIFIER_TAG_PROTO_CONNECT,
++	DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
+ 	DSA_NOTIFIER_MRP_ADD,
+ 	DSA_NOTIFIER_MRP_DEL,
+ 	DSA_NOTIFIER_MRP_ADD_RING_ROLE,
+--- a/net/dsa/switch.c
++++ b/net/dsa/switch.c
+@@ -616,15 +616,58 @@ static int dsa_switch_change_tag_proto(s
+ 	return 0;
+ }
+ 
+-static int dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+-					struct dsa_notifier_tag_proto_info *info)
++/* We use the same cross-chip notifiers to inform both the tagger side, as well
++ * as the switch side, of connection and disconnection events.
++ * Since ds->tagger_data is owned by the tagger, it isn't a hard error if the
++ * switch side doesn't support connecting to this tagger, and therefore, the
++ * fact that we don't disconnect the tagger side doesn't constitute a memory
++ * leak: the tagger will still operate with persistent per-switch memory, just
++ * with the switch side unconnected to it. What does constitute a hard error is
++ * when the switch side supports connecting but fails.
++ */
++static int
++dsa_switch_connect_tag_proto(struct dsa_switch *ds,
++			     struct dsa_notifier_tag_proto_info *info)
+ {
+ 	const struct dsa_device_ops *tag_ops = info->tag_ops;
++	int err;
++
++	/* Notify the new tagger about the connection to this switch */
++	if (tag_ops->connect) {
++		err = tag_ops->connect(ds);
++		if (err)
++			return err;
++	}
+ 
+ 	if (!ds->ops->connect_tag_protocol)
+ 		return -EOPNOTSUPP;
+ 
+-	return ds->ops->connect_tag_protocol(ds, tag_ops->proto);
++	/* Notify the switch about the connection to the new tagger */
++	err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
++	if (err) {
++		/* Revert the new tagger's connection to this tree */
++		if (tag_ops->disconnect)
++			tag_ops->disconnect(ds);
++		return err;
++	}
++
++	return 0;
++}
++
++static int
++dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
++				struct dsa_notifier_tag_proto_info *info)
++{
++	const struct dsa_device_ops *tag_ops = info->tag_ops;
++
++	/* Notify the tagger about the disconnection from this switch */
++	if (tag_ops->disconnect && ds->tagger_data)
++		tag_ops->disconnect(ds);
++
++	/* No need to notify the switch, since it shouldn't have any
++	 * resources to tear down
++	 */
++	return 0;
+ }
+ 
+ static int dsa_switch_mrp_add(struct dsa_switch *ds,
+@@ -749,6 +792,9 @@ static int dsa_switch_event(struct notif
+ 	case DSA_NOTIFIER_TAG_PROTO_CONNECT:
+ 		err = dsa_switch_connect_tag_proto(ds, info);
+ 		break;
++	case DSA_NOTIFIER_TAG_PROTO_DISCONNECT:
++		err = dsa_switch_disconnect_tag_proto(ds, info);
++		break;
+ 	case DSA_NOTIFIER_MRP_ADD:
+ 		err = dsa_switch_mrp_add(ds, info);
+ 		break;
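
And the matching tagger-side hooks, which after this change receive a single switch rather than the whole tree; a minimal sketch reusing the illustrative struct mytag_data from the note above:

static int mytag_connect(struct dsa_switch *ds)
{
	struct mytag_data *priv;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	ds->tagger_data = priv;

	return 0;
}

static void mytag_disconnect(struct dsa_switch *ds)
{
	kfree(ds->tagger_data);
	ds->tagger_data = NULL;
}

These go into the tagger's struct dsa_device_ops as .connect and .disconnect, next to .xmit and .rcv.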

+ 327 - 0
target/linux/generic/backport-6.1/702-v5.19-00-net-ethernet-mtk_eth_soc-add-support-for-coherent-DM.patch

@@ -0,0 +1,327 @@
+From: Felix Fietkau <[email protected]>
+Date: Sat, 5 Feb 2022 17:59:07 +0100
+Subject: [PATCH] net: ethernet: mtk_eth_soc: add support for coherent
+ DMA
+
+It improves performance by eliminating the need for a cache flush on rx and tx
+In preparation for supporting WED (Wireless Ethernet Dispatch), also add a
+function for disabling coherent DMA at runtime.
+
+Signed-off-by: Felix Fietkau <[email protected]>
+---
+
+--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
++++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+@@ -9,6 +9,7 @@
+ #include <linux/of_device.h>
+ #include <linux/of_mdio.h>
+ #include <linux/of_net.h>
++#include <linux/of_address.h>
+ #include <linux/mfd/syscon.h>
+ #include <linux/regmap.h>
+ #include <linux/clk.h>
+@@ -840,7 +841,7 @@ static int mtk_init_fq_dma(struct mtk_et
+ 	dma_addr_t dma_addr;
+ 	int i;
+ 
+-	eth->scratch_ring = dma_alloc_coherent(eth->dev,
++	eth->scratch_ring = dma_alloc_coherent(eth->dma_dev,
+ 					       cnt * sizeof(struct mtk_tx_dma),
+ 					       &eth->phy_scratch_ring,
+ 					       GFP_ATOMIC);
+@@ -852,10 +853,10 @@ static int mtk_init_fq_dma(struct mtk_et
+ 	if (unlikely(!eth->scratch_head))
+ 		return -ENOMEM;
+ 
+-	dma_addr = dma_map_single(eth->dev,
++	dma_addr = dma_map_single(eth->dma_dev,
+ 				  eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE,
+ 				  DMA_FROM_DEVICE);
+-	if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
++	if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
+ 		return -ENOMEM;
+ 
+ 	phy_ring_tail = eth->phy_scratch_ring +
+@@ -909,26 +910,26 @@ static void mtk_tx_unmap(struct mtk_eth
+ {
+ 	if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
+ 		if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
+-			dma_unmap_single(eth->dev,
++			dma_unmap_single(eth->dma_dev,
+ 					 dma_unmap_addr(tx_buf, dma_addr0),
+ 					 dma_unmap_len(tx_buf, dma_len0),
+ 					 DMA_TO_DEVICE);
+ 		} else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) {
+-			dma_unmap_page(eth->dev,
++			dma_unmap_page(eth->dma_dev,
+ 				       dma_unmap_addr(tx_buf, dma_addr0),
+ 				       dma_unmap_len(tx_buf, dma_len0),
+ 				       DMA_TO_DEVICE);
+ 		}
+ 	} else {
+ 		if (dma_unmap_len(tx_buf, dma_len0)) {
+-			dma_unmap_page(eth->dev,
++			dma_unmap_page(eth->dma_dev,
+ 				       dma_unmap_addr(tx_buf, dma_addr0),
+ 				       dma_unmap_len(tx_buf, dma_len0),
+ 				       DMA_TO_DEVICE);
+ 		}
+ 
+ 		if (dma_unmap_len(tx_buf, dma_len1)) {
+-			dma_unmap_page(eth->dev,
++			dma_unmap_page(eth->dma_dev,
+ 				       dma_unmap_addr(tx_buf, dma_addr1),
+ 				       dma_unmap_len(tx_buf, dma_len1),
+ 				       DMA_TO_DEVICE);
+@@ -1006,9 +1007,9 @@ static int mtk_tx_map(struct sk_buff *sk
+ 	if (skb_vlan_tag_present(skb))
+ 		txd4 |= TX_DMA_INS_VLAN | skb_vlan_tag_get(skb);
+ 
+-	mapped_addr = dma_map_single(eth->dev, skb->data,
++	mapped_addr = dma_map_single(eth->dma_dev, skb->data,
+ 				     skb_headlen(skb), DMA_TO_DEVICE);
+-	if (unlikely(dma_mapping_error(eth->dev, mapped_addr)))
++	if (unlikely(dma_mapping_error(eth->dma_dev, mapped_addr)))
+ 		return -ENOMEM;
+ 
+ 	WRITE_ONCE(itxd->txd1, mapped_addr);
+@@ -1047,10 +1048,10 @@ static int mtk_tx_map(struct sk_buff *sk
+ 
+ 
+ 			frag_map_size = min(frag_size, MTK_TX_DMA_BUF_LEN);
+-			mapped_addr = skb_frag_dma_map(eth->dev, frag, offset,
++			mapped_addr = skb_frag_dma_map(eth->dma_dev, frag, offset,
+ 						       frag_map_size,
+ 						       DMA_TO_DEVICE);
+-			if (unlikely(dma_mapping_error(eth->dev, mapped_addr)))
++			if (unlikely(dma_mapping_error(eth->dma_dev, mapped_addr)))
+ 				goto err_dma;
+ 
+ 			if (i == nr_frags - 1 &&
+@@ -1331,18 +1332,18 @@ static int mtk_poll_rx(struct napi_struc
+ 			netdev->stats.rx_dropped++;
+ 			goto release_desc;
+ 		}
+-		dma_addr = dma_map_single(eth->dev,
++		dma_addr = dma_map_single(eth->dma_dev,
+ 					  new_data + NET_SKB_PAD +
+ 					  eth->ip_align,
+ 					  ring->buf_size,
+ 					  DMA_FROM_DEVICE);
+-		if (unlikely(dma_mapping_error(eth->dev, dma_addr))) {
++		if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) {
+ 			skb_free_frag(new_data);
+ 			netdev->stats.rx_dropped++;
+ 			goto release_desc;
+ 		}
+ 
+-		dma_unmap_single(eth->dev, trxd.rxd1,
++		dma_unmap_single(eth->dma_dev, trxd.rxd1,
+ 				 ring->buf_size, DMA_FROM_DEVICE);
+ 
+ 		/* receive data */
+@@ -1615,7 +1616,7 @@ static int mtk_tx_alloc(struct mtk_eth *
+ 	if (!ring->buf)
+ 		goto no_tx_mem;
+ 
+-	ring->dma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
++	ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
+ 				       &ring->phys, GFP_ATOMIC);
+ 	if (!ring->dma)
+ 		goto no_tx_mem;
+@@ -1633,7 +1634,7 @@ static int mtk_tx_alloc(struct mtk_eth *
+ 	 * descriptors in ring->dma_pdma.
+ 	 */
+ 	if (!MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
+-		ring->dma_pdma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
++		ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
+ 						    &ring->phys_pdma,
+ 						    GFP_ATOMIC);
+ 		if (!ring->dma_pdma)
+@@ -1692,7 +1693,7 @@ static void mtk_tx_clean(struct mtk_eth
+ 	}
+ 
+ 	if (ring->dma) {
+-		dma_free_coherent(eth->dev,
++		dma_free_coherent(eth->dma_dev,
+ 				  MTK_DMA_SIZE * sizeof(*ring->dma),
+ 				  ring->dma,
+ 				  ring->phys);
+@@ -1700,7 +1701,7 @@ static void mtk_tx_clean(struct mtk_eth
+ 	}
+ 
+ 	if (ring->dma_pdma) {
+-		dma_free_coherent(eth->dev,
++		dma_free_coherent(eth->dma_dev,
+ 				  MTK_DMA_SIZE * sizeof(*ring->dma_pdma),
+ 				  ring->dma_pdma,
+ 				  ring->phys_pdma);
+@@ -1748,18 +1749,18 @@ static int mtk_rx_alloc(struct mtk_eth *
+ 			return -ENOMEM;
+ 	}
+ 
+-	ring->dma = dma_alloc_coherent(eth->dev,
++	ring->dma = dma_alloc_coherent(eth->dma_dev,
+ 				       rx_dma_size * sizeof(*ring->dma),
+ 				       &ring->phys, GFP_ATOMIC);
+ 	if (!ring->dma)
+ 		return -ENOMEM;
+ 
+ 	for (i = 0; i < rx_dma_size; i++) {
+-		dma_addr_t dma_addr = dma_map_single(eth->dev,
++		dma_addr_t dma_addr = dma_map_single(eth->dma_dev,
+ 				ring->data[i] + NET_SKB_PAD + eth->ip_align,
+ 				ring->buf_size,
+ 				DMA_FROM_DEVICE);
+-		if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
++		if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
+ 			return -ENOMEM;
+ 		ring->dma[i].rxd1 = (unsigned int)dma_addr;
+ 
+@@ -1795,7 +1796,7 @@ static void mtk_rx_clean(struct mtk_eth
+ 				continue;
+ 			if (!ring->dma[i].rxd1)
+ 				continue;
+-			dma_unmap_single(eth->dev,
++			dma_unmap_single(eth->dma_dev,
+ 					 ring->dma[i].rxd1,
+ 					 ring->buf_size,
+ 					 DMA_FROM_DEVICE);
+@@ -1806,7 +1807,7 @@ static void mtk_rx_clean(struct mtk_eth
+ 	}
+ 
+ 	if (ring->dma) {
+-		dma_free_coherent(eth->dev,
++		dma_free_coherent(eth->dma_dev,
+ 				  ring->dma_size * sizeof(*ring->dma),
+ 				  ring->dma,
+ 				  ring->phys);
+@@ -2162,7 +2163,7 @@ static void mtk_dma_free(struct mtk_eth
+ 		if (eth->netdev[i])
+ 			netdev_reset_queue(eth->netdev[i]);
+ 	if (eth->scratch_ring) {
+-		dma_free_coherent(eth->dev,
++		dma_free_coherent(eth->dma_dev,
+ 				  MTK_DMA_SIZE * sizeof(struct mtk_tx_dma),
+ 				  eth->scratch_ring,
+ 				  eth->phy_scratch_ring);
+@@ -2514,6 +2515,8 @@ static void mtk_dim_tx(struct work_struc
+ 
+ static int mtk_hw_init(struct mtk_eth *eth)
+ {
++	u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA |
++		       ETHSYS_DMA_AG_MAP_PPE;
+ 	int i, val, ret;
+ 
+ 	if (test_and_set_bit(MTK_HW_INIT, &eth->state))
+@@ -2526,6 +2529,10 @@ static int mtk_hw_init(struct mtk_eth *e
+ 	if (ret)
+ 		goto err_disable_pm;
+ 
++	if (eth->ethsys)
++		regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask,
++				   of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask);
++
+ 	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
+ 		ret = device_reset(eth->dev);
+ 		if (ret) {
+@@ -3079,6 +3086,35 @@ free_netdev:
+ 	return err;
+ }
+ 
++void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev)
++{
++	struct net_device *dev, *tmp;
++	LIST_HEAD(dev_list);
++	int i;
++
++	rtnl_lock();
++
++	for (i = 0; i < MTK_MAC_COUNT; i++) {
++		dev = eth->netdev[i];
++
++		if (!dev || !(dev->flags & IFF_UP))
++			continue;
++
++		list_add_tail(&dev->close_list, &dev_list);
++	}
++
++	dev_close_many(&dev_list, false);
++
++	eth->dma_dev = dma_dev;
++
++	list_for_each_entry_safe(dev, tmp, &dev_list, close_list) {
++		list_del_init(&dev->close_list);
++		dev_open(dev, NULL);
++	}
++
++	rtnl_unlock();
++}
++
+ static int mtk_probe(struct platform_device *pdev)
+ {
+ 	struct device_node *mac_np;
+@@ -3092,6 +3128,7 @@ static int mtk_probe(struct platform_dev
+ 	eth->soc = of_device_get_match_data(&pdev->dev);
+ 
+ 	eth->dev = &pdev->dev;
++	eth->dma_dev = &pdev->dev;
+ 	eth->base = devm_platform_ioremap_resource(pdev, 0);
+ 	if (IS_ERR(eth->base))
+ 		return PTR_ERR(eth->base);
+@@ -3140,6 +3177,16 @@ static int mtk_probe(struct platform_dev
+ 		}
+ 	}
+ 
++	if (of_dma_is_coherent(pdev->dev.of_node)) {
++		struct regmap *cci;
++
++		cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
++						      "mediatek,cci-control");
++		/* enable CPU/bus coherency */
++		if (!IS_ERR(cci))
++			regmap_write(cci, 0, 3);
++	}
++
+ 	if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) {
+ 		eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii),
+ 					  GFP_KERNEL);
+--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
++++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+@@ -463,6 +463,12 @@
+ #define RSTCTRL_FE		BIT(6)
+ #define RSTCTRL_PPE		BIT(31)
+ 
++/* ethernet dma channel agent map */
++#define ETHSYS_DMA_AG_MAP	0x408
++#define ETHSYS_DMA_AG_MAP_PDMA	BIT(0)
++#define ETHSYS_DMA_AG_MAP_QDMA	BIT(1)
++#define ETHSYS_DMA_AG_MAP_PPE	BIT(2)
++
+ /* SGMII subsystem config registers */
+ /* Register to auto-negotiation restart */
+ #define SGMSYS_PCS_CONTROL_1	0x0
+@@ -880,6 +886,7 @@ struct mtk_sgmii {
+ /* struct mtk_eth -	This is the main datasructure for holding the state
+  *			of the driver
+  * @dev:		The device pointer
++ * @dma_dev:	The device pointer used for dma mapping/alloc
+  * @base:		The mapped register i/o base
+  * @page_lock:		Make sure that register operations are atomic
+  * @tx_irq__lock:	Make sure that IRQ register operations are atomic
+@@ -923,6 +930,7 @@ struct mtk_sgmii {
+ 
+ struct mtk_eth {
+ 	struct device			*dev;
++	struct device			*dma_dev;
+ 	void __iomem			*base;
+ 	spinlock_t			page_lock;
+ 	spinlock_t			tx_irq_lock;
+@@ -1021,6 +1029,7 @@ int mtk_gmac_rgmii_path_setup(struct mtk
+ int mtk_eth_offload_init(struct mtk_eth *eth);
+ int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ 		     void *type_data);
++void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev);
+ 
+ 
+ #endif /* MTK_ETH_H */
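
Not part of the patch above, just orientation: eth->dma_dev starts out identical to &pdev->dev, and every dma_alloc_coherent()/dma_map_*() call in the driver now goes through it instead of eth->dev. A subsystem that takes over the DMA rings (such as the WED driver added further down) is expected to repoint it via mtk_eth_set_dma_device(), which closes the affected netdevs, swaps the pointer and reopens them under rtnl_lock. A minimal sketch of such a caller, with a hypothetical helper name, mirroring what the WED attach path does later:

#include <linux/of_address.h>	/* of_dma_is_coherent() */
#include "mtk_eth_soc.h"

/* Hypothetical helper: hand DMA mapping over to offload_dev (e.g. the WED
 * platform device) when the ethernet node is marked dma-coherent, while
 * eth->dev keeps handling MMIO, clocks and interrupts. */
static void example_redirect_dma(struct mtk_eth *eth, struct device *offload_dev)
{
	if (eth->dma_dev == eth->dev &&
	    of_dma_is_coherent(eth->dev->of_node))
		mtk_eth_set_dma_device(eth, offload_dev);
}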

+ 30 - 0
target/linux/generic/backport-6.1/702-v5.19-01-arm64-dts-mediatek-mt7622-add-support-for-coherent-D.patch

@@ -0,0 +1,30 @@
+From: Felix Fietkau <[email protected]>
+Date: Mon, 7 Feb 2022 10:27:22 +0100
+Subject: [PATCH] arm64: dts: mediatek: mt7622: add support for coherent
+ DMA
+
+It improves performance by eliminating the need for a cache flush on rx and tx
+
+Signed-off-by: Felix Fietkau <[email protected]>
+---
+
+--- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi
++++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
+@@ -357,7 +357,7 @@
+ 		};
+ 
+ 		cci_control2: slave-if@5000 {
+-			compatible = "arm,cci-400-ctrl-if";
++			compatible = "arm,cci-400-ctrl-if", "syscon";
+ 			interface-type = "ace";
+ 			reg = <0x5000 0x1000>;
+ 		};
+@@ -938,6 +938,8 @@
+ 		power-domains = <&scpsys MT7622_POWER_DOMAIN_ETHSYS>;
+ 		mediatek,ethsys = <&ethsys>;
+ 		mediatek,sgmiisys = <&sgmiisys>;
++		mediatek,cci-control = <&cci_control2>;
++		dma-coherent;
+ 		#address-cells = <1>;
+ 		#size-cells = <0>;
+ 		status = "disabled";

+ 1679 - 0
target/linux/generic/backport-6.1/702-v5.19-02-net-ethernet-mtk_eth_soc-add-support-for-Wireless-Et.patch

@@ -0,0 +1,1679 @@
+From: Felix Fietkau <[email protected]>
+Date: Sat, 5 Feb 2022 17:56:08 +0100
+Subject: [PATCH] net: ethernet: mtk_eth_soc: add support for Wireless
+ Ethernet Dispatch (WED)
+
+The Wireless Ethernet Dispatch subsystem on the MT7622 SoC can be
+configured to intercept and handle access to the DMA queues and
+PCIe interrupts for a MT7615/MT7915 wireless card.
+It can manage the internal WDMA (Wireless DMA) controller, which allows
+ethernet packets to be passed from the packet switch engine (PSE) to the
+wireless card, bypassing the CPU entirely.
+This can be used to implement hardware flow offloading from ethernet to
+WLAN.
+
+Signed-off-by: Felix Fietkau <[email protected]>
+---
+ create mode 100644 drivers/net/ethernet/mediatek/mtk_wed.c
+ create mode 100644 drivers/net/ethernet/mediatek/mtk_wed.h
+ create mode 100644 drivers/net/ethernet/mediatek/mtk_wed_debugfs.c
+ create mode 100644 drivers/net/ethernet/mediatek/mtk_wed_ops.c
+ create mode 100644 drivers/net/ethernet/mediatek/mtk_wed_regs.h
+ create mode 100644 include/linux/soc/mediatek/mtk_wed.h
+
+--- a/drivers/net/ethernet/mediatek/Kconfig
++++ b/drivers/net/ethernet/mediatek/Kconfig
+@@ -7,6 +7,10 @@ config NET_VENDOR_MEDIATEK
+ 
+ if NET_VENDOR_MEDIATEK
+ 
++config NET_MEDIATEK_SOC_WED
++	depends on ARCH_MEDIATEK || COMPILE_TEST
++	def_bool NET_MEDIATEK_SOC != n
++
+ config NET_MEDIATEK_SOC
+ 	tristate "MediaTek SoC Gigabit Ethernet support"
+ 	depends on NET_DSA || !NET_DSA
+--- a/drivers/net/ethernet/mediatek/Makefile
++++ b/drivers/net/ethernet/mediatek/Makefile
+@@ -5,4 +5,9 @@
+ 
+ obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o
+ mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o mtk_ppe.o mtk_ppe_debugfs.o mtk_ppe_offload.o
++mtk_eth-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed.o
++ifdef CONFIG_DEBUG_FS
++mtk_eth-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed_debugfs.o
++endif
++obj-$(CONFIG_NET_MEDIATEK_SOC_WED) += mtk_wed_ops.o
+ obj-$(CONFIG_NET_MEDIATEK_STAR_EMAC) += mtk_star_emac.o
+--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
++++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+@@ -24,6 +24,7 @@
+ #include <net/dsa.h>
+ 
+ #include "mtk_eth_soc.h"
++#include "mtk_wed.h"
+ 
+ static int mtk_msg_level = -1;
+ module_param_named(msg_level, mtk_msg_level, int, 0);
+@@ -3209,6 +3210,22 @@ static int mtk_probe(struct platform_dev
+ 		}
+ 	}
+ 
++	for (i = 0;; i++) {
++		struct device_node *np = of_parse_phandle(pdev->dev.of_node,
++							  "mediatek,wed", i);
++		static const u32 wdma_regs[] = {
++			MTK_WDMA0_BASE,
++			MTK_WDMA1_BASE
++		};
++		void __iomem *wdma;
++
++		if (!np || i >= ARRAY_SIZE(wdma_regs))
++			break;
++
++		wdma = eth->base + wdma_regs[i];
++		mtk_wed_add_hw(np, eth, wdma, i);
++	}
++
+ 	for (i = 0; i < 3; i++) {
+ 		if (MTK_HAS_CAPS(eth->soc->caps, MTK_SHARED_INT) && i > 0)
+ 			eth->irq[i] = eth->irq[0];
+--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
++++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+@@ -295,6 +295,9 @@
+ #define MTK_GDM1_TX_GPCNT	0x2438
+ #define MTK_STAT_OFFSET		0x40
+ 
++#define MTK_WDMA0_BASE		0x2800
++#define MTK_WDMA1_BASE		0x2c00
++
+ /* QDMA descriptor txd4 */
+ #define TX_DMA_CHKSUM		(0x7 << 29)
+ #define TX_DMA_TSO		BIT(28)
+--- /dev/null
++++ b/drivers/net/ethernet/mediatek/mtk_wed.c
+@@ -0,0 +1,875 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/* Copyright (C) 2021 Felix Fietkau <[email protected]> */
++
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/module.h>
++#include <linux/bitfield.h>
++#include <linux/dma-mapping.h>
++#include <linux/skbuff.h>
++#include <linux/of_platform.h>
++#include <linux/of_address.h>
++#include <linux/mfd/syscon.h>
++#include <linux/debugfs.h>
++#include <linux/soc/mediatek/mtk_wed.h>
++#include "mtk_eth_soc.h"
++#include "mtk_wed_regs.h"
++#include "mtk_wed.h"
++#include "mtk_ppe.h"
++
++#define MTK_PCIE_BASE(n)		(0x1a143000 + (n) * 0x2000)
++
++#define MTK_WED_PKT_SIZE		1900
++#define MTK_WED_BUF_SIZE		2048
++#define MTK_WED_BUF_PER_PAGE		(PAGE_SIZE / 2048)
++
++#define MTK_WED_TX_RING_SIZE		2048
++#define MTK_WED_WDMA_RING_SIZE		1024
++
++static struct mtk_wed_hw *hw_list[2];
++static DEFINE_MUTEX(hw_lock);
++
++static void
++wed_m32(struct mtk_wed_device *dev, u32 reg, u32 mask, u32 val)
++{
++	regmap_update_bits(dev->hw->regs, reg, mask | val, val);
++}
++
++static void
++wed_set(struct mtk_wed_device *dev, u32 reg, u32 mask)
++{
++	return wed_m32(dev, reg, 0, mask);
++}
++
++static void
++wed_clr(struct mtk_wed_device *dev, u32 reg, u32 mask)
++{
++	return wed_m32(dev, reg, mask, 0);
++}
++
++static void
++wdma_m32(struct mtk_wed_device *dev, u32 reg, u32 mask, u32 val)
++{
++	wdma_w32(dev, reg, (wdma_r32(dev, reg) & ~mask) | val);
++}
++
++static void
++wdma_set(struct mtk_wed_device *dev, u32 reg, u32 mask)
++{
++	wdma_m32(dev, reg, 0, mask);
++}
++
++static u32
++mtk_wed_read_reset(struct mtk_wed_device *dev)
++{
++	return wed_r32(dev, MTK_WED_RESET);
++}
++
++static void
++mtk_wed_reset(struct mtk_wed_device *dev, u32 mask)
++{
++	u32 status;
++
++	wed_w32(dev, MTK_WED_RESET, mask);
++	if (readx_poll_timeout(mtk_wed_read_reset, dev, status,
++			       !(status & mask), 0, 1000))
++		WARN_ON_ONCE(1);
++}
++
++static struct mtk_wed_hw *
++mtk_wed_assign(struct mtk_wed_device *dev)
++{
++	struct mtk_wed_hw *hw;
++
++	hw = hw_list[pci_domain_nr(dev->wlan.pci_dev->bus)];
++	if (!hw || hw->wed_dev)
++		return NULL;
++
++	hw->wed_dev = dev;
++	return hw;
++}
++
++static int
++mtk_wed_buffer_alloc(struct mtk_wed_device *dev)
++{
++	struct mtk_wdma_desc *desc;
++	dma_addr_t desc_phys;
++	void **page_list;
++	int token = dev->wlan.token_start;
++	int ring_size;
++	int n_pages;
++	int i, page_idx;
++
++	ring_size = dev->wlan.nbuf & ~(MTK_WED_BUF_PER_PAGE - 1);
++	n_pages = ring_size / MTK_WED_BUF_PER_PAGE;
++
++	page_list = kcalloc(n_pages, sizeof(*page_list), GFP_KERNEL);
++	if (!page_list)
++		return -ENOMEM;
++
++	dev->buf_ring.size = ring_size;
++	dev->buf_ring.pages = page_list;
++
++	desc = dma_alloc_coherent(dev->hw->dev, ring_size * sizeof(*desc),
++				  &desc_phys, GFP_KERNEL);
++	if (!desc)
++		return -ENOMEM;
++
++	dev->buf_ring.desc = desc;
++	dev->buf_ring.desc_phys = desc_phys;
++
++	for (i = 0, page_idx = 0; i < ring_size; i += MTK_WED_BUF_PER_PAGE) {
++		dma_addr_t page_phys, buf_phys;
++		struct page *page;
++		void *buf;
++		int s;
++
++		page = __dev_alloc_pages(GFP_KERNEL, 0);
++		if (!page)
++			return -ENOMEM;
++
++		page_phys = dma_map_page(dev->hw->dev, page, 0, PAGE_SIZE,
++					 DMA_BIDIRECTIONAL);
++		if (dma_mapping_error(dev->hw->dev, page_phys)) {
++			__free_page(page);
++			return -ENOMEM;
++		}
++
++		page_list[page_idx++] = page;
++		dma_sync_single_for_cpu(dev->hw->dev, page_phys, PAGE_SIZE,
++					DMA_BIDIRECTIONAL);
++
++		buf = page_to_virt(page);
++		buf_phys = page_phys;
++
++		for (s = 0; s < MTK_WED_BUF_PER_PAGE; s++) {
++			u32 txd_size;
++
++			txd_size = dev->wlan.init_buf(buf, buf_phys, token++);
++
++			desc->buf0 = buf_phys;
++			desc->buf1 = buf_phys + txd_size;
++			desc->ctrl = FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN0,
++						txd_size) |
++				     FIELD_PREP(MTK_WDMA_DESC_CTRL_LEN1,
++						MTK_WED_BUF_SIZE - txd_size) |
++				     MTK_WDMA_DESC_CTRL_LAST_SEG1;
++			desc->info = 0;
++			desc++;
++
++			buf += MTK_WED_BUF_SIZE;
++			buf_phys += MTK_WED_BUF_SIZE;
++		}
++
++		dma_sync_single_for_device(dev->hw->dev, page_phys, PAGE_SIZE,
++					   DMA_BIDIRECTIONAL);
++	}
++
++	return 0;
++}
++
++static void
++mtk_wed_free_buffer(struct mtk_wed_device *dev)
++{
++	struct mtk_wdma_desc *desc = dev->buf_ring.desc;
++	void **page_list = dev->buf_ring.pages;
++	int page_idx;
++	int i;
++
++	if (!page_list)
++		return;
++
++	if (!desc)
++		goto free_pagelist;
++
++	for (i = 0, page_idx = 0; i < dev->buf_ring.size; i += MTK_WED_BUF_PER_PAGE) {
++		void *page = page_list[page_idx++];
++
++		if (!page)
++			break;
++
++		dma_unmap_page(dev->hw->dev, desc[i].buf0,
++			       PAGE_SIZE, DMA_BIDIRECTIONAL);
++		__free_page(page);
++	}
++
++	dma_free_coherent(dev->hw->dev, dev->buf_ring.size * sizeof(*desc),
++			  desc, dev->buf_ring.desc_phys);
++
++free_pagelist:
++	kfree(page_list);
++}
++
++static void
++mtk_wed_free_ring(struct mtk_wed_device *dev, struct mtk_wed_ring *ring)
++{
++	if (!ring->desc)
++		return;
++
++	dma_free_coherent(dev->hw->dev, ring->size * sizeof(*ring->desc),
++			  ring->desc, ring->desc_phys);
++}
++
++static void
++mtk_wed_free_tx_rings(struct mtk_wed_device *dev)
++{
++	int i;
++
++	for (i = 0; i < ARRAY_SIZE(dev->tx_ring); i++)
++		mtk_wed_free_ring(dev, &dev->tx_ring[i]);
++	for (i = 0; i < ARRAY_SIZE(dev->tx_wdma); i++)
++		mtk_wed_free_ring(dev, &dev->tx_wdma[i]);
++}
++
++static void
++mtk_wed_set_ext_int(struct mtk_wed_device *dev, bool en)
++{
++	u32 mask = MTK_WED_EXT_INT_STATUS_ERROR_MASK;
++
++	if (!dev->hw->num_flows)
++		mask &= ~MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
++
++	wed_w32(dev, MTK_WED_EXT_INT_MASK, en ? mask : 0);
++	wed_r32(dev, MTK_WED_EXT_INT_MASK);
++}
++
++static void
++mtk_wed_stop(struct mtk_wed_device *dev)
++{
++	regmap_write(dev->hw->mirror, dev->hw->index * 4, 0);
++	mtk_wed_set_ext_int(dev, false);
++
++	wed_clr(dev, MTK_WED_CTRL,
++		MTK_WED_CTRL_WDMA_INT_AGENT_EN |
++		MTK_WED_CTRL_WPDMA_INT_AGENT_EN |
++		MTK_WED_CTRL_WED_TX_BM_EN |
++		MTK_WED_CTRL_WED_TX_FREE_AGENT_EN);
++	wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER, 0);
++	wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, 0);
++	wdma_w32(dev, MTK_WDMA_INT_MASK, 0);
++	wdma_w32(dev, MTK_WDMA_INT_GRP2, 0);
++	wed_w32(dev, MTK_WED_WPDMA_INT_MASK, 0);
++
++	wed_clr(dev, MTK_WED_GLO_CFG,
++		MTK_WED_GLO_CFG_TX_DMA_EN |
++		MTK_WED_GLO_CFG_RX_DMA_EN);
++	wed_clr(dev, MTK_WED_WPDMA_GLO_CFG,
++		MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
++		MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN);
++	wed_clr(dev, MTK_WED_WDMA_GLO_CFG,
++		MTK_WED_WDMA_GLO_CFG_RX_DRV_EN);
++}
++
++static void
++mtk_wed_detach(struct mtk_wed_device *dev)
++{
++	struct device_node *wlan_node = dev->wlan.pci_dev->dev.of_node;
++	struct mtk_wed_hw *hw = dev->hw;
++
++	mutex_lock(&hw_lock);
++
++	mtk_wed_stop(dev);
++
++	wdma_w32(dev, MTK_WDMA_RESET_IDX, MTK_WDMA_RESET_IDX_RX);
++	wdma_w32(dev, MTK_WDMA_RESET_IDX, 0);
++
++	mtk_wed_reset(dev, MTK_WED_RESET_WED);
++
++	mtk_wed_free_buffer(dev);
++	mtk_wed_free_tx_rings(dev);
++
++	if (of_dma_is_coherent(wlan_node))
++		regmap_update_bits(hw->hifsys, HIFSYS_DMA_AG_MAP,
++				   BIT(hw->index), BIT(hw->index));
++
++	if (!hw_list[!hw->index]->wed_dev &&
++	    hw->eth->dma_dev != hw->eth->dev)
++		mtk_eth_set_dma_device(hw->eth, hw->eth->dev);
++
++	memset(dev, 0, sizeof(*dev));
++	module_put(THIS_MODULE);
++
++	hw->wed_dev = NULL;
++	mutex_unlock(&hw_lock);
++}
++
++static void
++mtk_wed_hw_init_early(struct mtk_wed_device *dev)
++{
++	u32 mask, set;
++	u32 offset;
++
++	mtk_wed_stop(dev);
++	mtk_wed_reset(dev, MTK_WED_RESET_WED);
++
++	mask = MTK_WED_WDMA_GLO_CFG_BT_SIZE |
++	       MTK_WED_WDMA_GLO_CFG_DYNAMIC_DMAD_RECYCLE |
++	       MTK_WED_WDMA_GLO_CFG_RX_DIS_FSM_AUTO_IDLE;
++	set = FIELD_PREP(MTK_WED_WDMA_GLO_CFG_BT_SIZE, 2) |
++	      MTK_WED_WDMA_GLO_CFG_DYNAMIC_SKIP_DMAD_PREP |
++	      MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY;
++	wed_m32(dev, MTK_WED_WDMA_GLO_CFG, mask, set);
++
++	wdma_set(dev, MTK_WDMA_GLO_CFG, MTK_WDMA_GLO_CFG_RX_INFO_PRERES);
++
++	offset = dev->hw->index ? 0x04000400 : 0;
++	wed_w32(dev, MTK_WED_WDMA_OFFSET0, 0x2a042a20 + offset);
++	wed_w32(dev, MTK_WED_WDMA_OFFSET1, 0x29002800 + offset);
++
++	wed_w32(dev, MTK_WED_PCIE_CFG_BASE, MTK_PCIE_BASE(dev->hw->index));
++	wed_w32(dev, MTK_WED_WPDMA_CFG_BASE, dev->wlan.wpdma_phys);
++}
++
++static void
++mtk_wed_hw_init(struct mtk_wed_device *dev)
++{
++	if (dev->init_done)
++		return;
++
++	dev->init_done = true;
++	mtk_wed_set_ext_int(dev, false);
++	wed_w32(dev, MTK_WED_TX_BM_CTRL,
++		MTK_WED_TX_BM_CTRL_PAUSE |
++		FIELD_PREP(MTK_WED_TX_BM_CTRL_VLD_GRP_NUM,
++			   dev->buf_ring.size / 128) |
++		FIELD_PREP(MTK_WED_TX_BM_CTRL_RSV_GRP_NUM,
++			   MTK_WED_TX_RING_SIZE / 256));
++
++	wed_w32(dev, MTK_WED_TX_BM_BASE, dev->buf_ring.desc_phys);
++
++	wed_w32(dev, MTK_WED_TX_BM_TKID,
++		FIELD_PREP(MTK_WED_TX_BM_TKID_START,
++			   dev->wlan.token_start) |
++		FIELD_PREP(MTK_WED_TX_BM_TKID_END,
++			   dev->wlan.token_start + dev->wlan.nbuf - 1));
++
++	wed_w32(dev, MTK_WED_TX_BM_BUF_LEN, MTK_WED_PKT_SIZE);
++
++	wed_w32(dev, MTK_WED_TX_BM_DYN_THR,
++		FIELD_PREP(MTK_WED_TX_BM_DYN_THR_LO, 1) |
++		MTK_WED_TX_BM_DYN_THR_HI);
++
++	mtk_wed_reset(dev, MTK_WED_RESET_TX_BM);
++
++	wed_set(dev, MTK_WED_CTRL,
++		MTK_WED_CTRL_WED_TX_BM_EN |
++		MTK_WED_CTRL_WED_TX_FREE_AGENT_EN);
++
++	wed_clr(dev, MTK_WED_TX_BM_CTRL, MTK_WED_TX_BM_CTRL_PAUSE);
++}
++
++static void
++mtk_wed_ring_reset(struct mtk_wdma_desc *desc, int size)
++{
++	int i;
++
++	for (i = 0; i < size; i++) {
++		desc[i].buf0 = 0;
++		desc[i].ctrl = cpu_to_le32(MTK_WDMA_DESC_CTRL_DMA_DONE);
++		desc[i].buf1 = 0;
++		desc[i].info = 0;
++	}
++}
++
++static u32
++mtk_wed_check_busy(struct mtk_wed_device *dev)
++{
++	if (wed_r32(dev, MTK_WED_GLO_CFG) & MTK_WED_GLO_CFG_TX_DMA_BUSY)
++		return true;
++
++	if (wed_r32(dev, MTK_WED_WPDMA_GLO_CFG) &
++	    MTK_WED_WPDMA_GLO_CFG_TX_DRV_BUSY)
++		return true;
++
++	if (wed_r32(dev, MTK_WED_CTRL) & MTK_WED_CTRL_WDMA_INT_AGENT_BUSY)
++		return true;
++
++	if (wed_r32(dev, MTK_WED_WDMA_GLO_CFG) &
++	    MTK_WED_WDMA_GLO_CFG_RX_DRV_BUSY)
++		return true;
++
++	if (wdma_r32(dev, MTK_WDMA_GLO_CFG) &
++	    MTK_WED_WDMA_GLO_CFG_RX_DRV_BUSY)
++		return true;
++
++	if (wed_r32(dev, MTK_WED_CTRL) &
++	    (MTK_WED_CTRL_WED_TX_BM_BUSY | MTK_WED_CTRL_WED_TX_FREE_AGENT_BUSY))
++		return true;
++
++	return false;
++}
++
++static int
++mtk_wed_poll_busy(struct mtk_wed_device *dev)
++{
++	int sleep = 15000;
++	int timeout = 100 * sleep;
++	u32 val;
++
++	return read_poll_timeout(mtk_wed_check_busy, val, !val, sleep,
++				 timeout, false, dev);
++}
++
++static void
++mtk_wed_reset_dma(struct mtk_wed_device *dev)
++{
++	bool busy = false;
++	u32 val;
++	int i;
++
++	for (i = 0; i < ARRAY_SIZE(dev->tx_ring); i++) {
++		struct mtk_wdma_desc *desc = dev->tx_ring[i].desc;
++
++		if (!desc)
++			continue;
++
++		mtk_wed_ring_reset(desc, MTK_WED_TX_RING_SIZE);
++	}
++
++	if (mtk_wed_poll_busy(dev))
++		busy = mtk_wed_check_busy(dev);
++
++	if (busy) {
++		mtk_wed_reset(dev, MTK_WED_RESET_WED_TX_DMA);
++	} else {
++		wed_w32(dev, MTK_WED_RESET_IDX,
++			MTK_WED_RESET_IDX_TX |
++			MTK_WED_RESET_IDX_RX);
++		wed_w32(dev, MTK_WED_RESET_IDX, 0);
++	}
++
++	wdma_w32(dev, MTK_WDMA_RESET_IDX, MTK_WDMA_RESET_IDX_RX);
++	wdma_w32(dev, MTK_WDMA_RESET_IDX, 0);
++
++	if (busy) {
++		mtk_wed_reset(dev, MTK_WED_RESET_WDMA_INT_AGENT);
++		mtk_wed_reset(dev, MTK_WED_RESET_WDMA_RX_DRV);
++	} else {
++		wed_w32(dev, MTK_WED_WDMA_RESET_IDX,
++			MTK_WED_WDMA_RESET_IDX_RX | MTK_WED_WDMA_RESET_IDX_DRV);
++		wed_w32(dev, MTK_WED_WDMA_RESET_IDX, 0);
++
++		wed_set(dev, MTK_WED_WDMA_GLO_CFG,
++			MTK_WED_WDMA_GLO_CFG_RST_INIT_COMPLETE);
++
++		wed_clr(dev, MTK_WED_WDMA_GLO_CFG,
++			MTK_WED_WDMA_GLO_CFG_RST_INIT_COMPLETE);
++	}
++
++	for (i = 0; i < 100; i++) {
++		val = wed_r32(dev, MTK_WED_TX_BM_INTF);
++		if (FIELD_GET(MTK_WED_TX_BM_INTF_TKFIFO_FDEP, val) == 0x40)
++			break;
++	}
++
++	mtk_wed_reset(dev, MTK_WED_RESET_TX_FREE_AGENT);
++	mtk_wed_reset(dev, MTK_WED_RESET_TX_BM);
++
++	if (busy) {
++		mtk_wed_reset(dev, MTK_WED_RESET_WPDMA_INT_AGENT);
++		mtk_wed_reset(dev, MTK_WED_RESET_WPDMA_TX_DRV);
++		mtk_wed_reset(dev, MTK_WED_RESET_WPDMA_RX_DRV);
++	} else {
++		wed_w32(dev, MTK_WED_WPDMA_RESET_IDX,
++			MTK_WED_WPDMA_RESET_IDX_TX |
++			MTK_WED_WPDMA_RESET_IDX_RX);
++		wed_w32(dev, MTK_WED_WPDMA_RESET_IDX, 0);
++	}
++
++}
++
++static int
++mtk_wed_ring_alloc(struct mtk_wed_device *dev, struct mtk_wed_ring *ring,
++		   int size)
++{
++	ring->desc = dma_alloc_coherent(dev->hw->dev,
++					size * sizeof(*ring->desc),
++					&ring->desc_phys, GFP_KERNEL);
++	if (!ring->desc)
++		return -ENOMEM;
++
++	ring->size = size;
++	mtk_wed_ring_reset(ring->desc, size);
++
++	return 0;
++}
++
++static int
++mtk_wed_wdma_ring_setup(struct mtk_wed_device *dev, int idx, int size)
++{
++	struct mtk_wed_ring *wdma = &dev->tx_wdma[idx];
++
++	if (mtk_wed_ring_alloc(dev, wdma, MTK_WED_WDMA_RING_SIZE))
++		return -ENOMEM;
++
++	wdma_w32(dev, MTK_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_BASE,
++		 wdma->desc_phys);
++	wdma_w32(dev, MTK_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_COUNT,
++		 size);
++	wdma_w32(dev, MTK_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_CPU_IDX, 0);
++
++	wed_w32(dev, MTK_WED_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_BASE,
++		wdma->desc_phys);
++	wed_w32(dev, MTK_WED_WDMA_RING_RX(idx) + MTK_WED_RING_OFS_COUNT,
++		size);
++
++	return 0;
++}
++
++static void
++mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
++{
++	u32 wdma_mask;
++	u32 val;
++	int i;
++
++	for (i = 0; i < ARRAY_SIZE(dev->tx_wdma); i++)
++		if (!dev->tx_wdma[i].desc)
++			mtk_wed_wdma_ring_setup(dev, i, 16);
++
++	wdma_mask = FIELD_PREP(MTK_WDMA_INT_MASK_RX_DONE, GENMASK(1, 0));
++
++	mtk_wed_hw_init(dev);
++
++	wed_set(dev, MTK_WED_CTRL,
++		MTK_WED_CTRL_WDMA_INT_AGENT_EN |
++		MTK_WED_CTRL_WPDMA_INT_AGENT_EN |
++		MTK_WED_CTRL_WED_TX_BM_EN |
++		MTK_WED_CTRL_WED_TX_FREE_AGENT_EN);
++
++	wed_w32(dev, MTK_WED_PCIE_INT_TRIGGER, MTK_WED_PCIE_INT_TRIGGER_STATUS);
++
++	wed_w32(dev, MTK_WED_WPDMA_INT_TRIGGER,
++		MTK_WED_WPDMA_INT_TRIGGER_RX_DONE |
++		MTK_WED_WPDMA_INT_TRIGGER_TX_DONE);
++
++	wed_set(dev, MTK_WED_WPDMA_INT_CTRL,
++		MTK_WED_WPDMA_INT_CTRL_SUBRT_ADV);
++
++	wed_w32(dev, MTK_WED_WDMA_INT_TRIGGER, wdma_mask);
++	wed_clr(dev, MTK_WED_WDMA_INT_CTRL, wdma_mask);
++
++	wdma_w32(dev, MTK_WDMA_INT_MASK, wdma_mask);
++	wdma_w32(dev, MTK_WDMA_INT_GRP2, wdma_mask);
++
++	wed_w32(dev, MTK_WED_WPDMA_INT_MASK, irq_mask);
++	wed_w32(dev, MTK_WED_INT_MASK, irq_mask);
++
++	wed_set(dev, MTK_WED_GLO_CFG,
++		MTK_WED_GLO_CFG_TX_DMA_EN |
++		MTK_WED_GLO_CFG_RX_DMA_EN);
++	wed_set(dev, MTK_WED_WPDMA_GLO_CFG,
++		MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN |
++		MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN);
++	wed_set(dev, MTK_WED_WDMA_GLO_CFG,
++		MTK_WED_WDMA_GLO_CFG_RX_DRV_EN);
++
++	mtk_wed_set_ext_int(dev, true);
++	val = dev->wlan.wpdma_phys |
++	      MTK_PCIE_MIRROR_MAP_EN |
++	      FIELD_PREP(MTK_PCIE_MIRROR_MAP_WED_ID, dev->hw->index);
++
++	if (dev->hw->index)
++		val |= BIT(1);
++	val |= BIT(0);
++	regmap_write(dev->hw->mirror, dev->hw->index * 4, val);
++
++	dev->running = true;
++}
++
++static int
++mtk_wed_attach(struct mtk_wed_device *dev)
++	__releases(RCU)
++{
++	struct mtk_wed_hw *hw;
++	int ret = 0;
++
++	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
++			 "mtk_wed_attach without holding the RCU read lock");
++
++	if (pci_domain_nr(dev->wlan.pci_dev->bus) > 1 ||
++	    !try_module_get(THIS_MODULE))
++		ret = -ENODEV;
++
++	rcu_read_unlock();
++
++	if (ret)
++		return ret;
++
++	mutex_lock(&hw_lock);
++
++	hw = mtk_wed_assign(dev);
++	if (!hw) {
++		module_put(THIS_MODULE);
++		ret = -ENODEV;
++		goto out;
++	}
++
++	dev_info(&dev->wlan.pci_dev->dev, "attaching wed device %d\n", hw->index);
++
++	dev->hw = hw;
++	dev->dev = hw->dev;
++	dev->irq = hw->irq;
++	dev->wdma_idx = hw->index;
++
++	if (hw->eth->dma_dev == hw->eth->dev &&
++	    of_dma_is_coherent(hw->eth->dev->of_node))
++		mtk_eth_set_dma_device(hw->eth, hw->dev);
++
++	ret = mtk_wed_buffer_alloc(dev);
++	if (ret) {
++		mtk_wed_detach(dev);
++		goto out;
++	}
++
++	mtk_wed_hw_init_early(dev);
++	regmap_update_bits(hw->hifsys, HIFSYS_DMA_AG_MAP, BIT(hw->index), 0);
++
++out:
++	mutex_unlock(&hw_lock);
++
++	return ret;
++}
++
++static int
++mtk_wed_tx_ring_setup(struct mtk_wed_device *dev, int idx, void __iomem *regs)
++{
++	struct mtk_wed_ring *ring = &dev->tx_ring[idx];
++
++	/*
++	 * Tx ring redirection:
++	 * Instead of configuring the WLAN PDMA TX ring directly, the WLAN
++	 * driver allocated DMA ring gets configured into WED MTK_WED_RING_TX(n)
++	 * registers.
++	 *
++	 * WED driver posts its own DMA ring as WLAN PDMA TX and configures it
++	 * into MTK_WED_WPDMA_RING_TX(n) registers.
++	 * It gets filled with packets picked up from WED TX ring and from
++	 * WDMA RX.
++	 */
++
++	BUG_ON(idx > ARRAY_SIZE(dev->tx_ring));
++
++	if (mtk_wed_ring_alloc(dev, ring, MTK_WED_TX_RING_SIZE))
++		return -ENOMEM;
++
++	if (mtk_wed_wdma_ring_setup(dev, idx, MTK_WED_WDMA_RING_SIZE))
++		return -ENOMEM;
++
++	ring->reg_base = MTK_WED_RING_TX(idx);
++	ring->wpdma = regs;
++
++	/* WED -> WPDMA */
++	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_BASE, ring->desc_phys);
++	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_COUNT, MTK_WED_TX_RING_SIZE);
++	wpdma_tx_w32(dev, idx, MTK_WED_RING_OFS_CPU_IDX, 0);
++
++	wed_w32(dev, MTK_WED_WPDMA_RING_TX(idx) + MTK_WED_RING_OFS_BASE,
++		ring->desc_phys);
++	wed_w32(dev, MTK_WED_WPDMA_RING_TX(idx) + MTK_WED_RING_OFS_COUNT,
++		MTK_WED_TX_RING_SIZE);
++	wed_w32(dev, MTK_WED_WPDMA_RING_TX(idx) + MTK_WED_RING_OFS_CPU_IDX, 0);
++
++	return 0;
++}
++
++static int
++mtk_wed_txfree_ring_setup(struct mtk_wed_device *dev, void __iomem *regs)
++{
++	struct mtk_wed_ring *ring = &dev->txfree_ring;
++	int i;
++
++	/*
++	 * For txfree event handling, the same DMA ring is shared between WED
++	 * and WLAN. The WLAN driver accesses the ring index registers through
++	 * WED
++	 */
++	ring->reg_base = MTK_WED_RING_RX(1);
++	ring->wpdma = regs;
++
++	for (i = 0; i < 12; i += 4) {
++		u32 val = readl(regs + i);
++
++		wed_w32(dev, MTK_WED_RING_RX(1) + i, val);
++		wed_w32(dev, MTK_WED_WPDMA_RING_RX(1) + i, val);
++	}
++
++	return 0;
++}
++
++static u32
++mtk_wed_irq_get(struct mtk_wed_device *dev, u32 mask)
++{
++	u32 val;
++
++	val = wed_r32(dev, MTK_WED_EXT_INT_STATUS);
++	wed_w32(dev, MTK_WED_EXT_INT_STATUS, val);
++	val &= MTK_WED_EXT_INT_STATUS_ERROR_MASK;
++	if (!dev->hw->num_flows)
++		val &= ~MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD;
++	if (val && net_ratelimit())
++		pr_err("mtk_wed%d: error status=%08x\n", dev->hw->index, val);
++
++	val = wed_r32(dev, MTK_WED_INT_STATUS);
++	val &= mask;
++	wed_w32(dev, MTK_WED_INT_STATUS, val); /* ACK */
++
++	return val;
++}
++
++static void
++mtk_wed_irq_set_mask(struct mtk_wed_device *dev, u32 mask)
++{
++	if (!dev->running)
++		return;
++
++	mtk_wed_set_ext_int(dev, !!mask);
++	wed_w32(dev, MTK_WED_INT_MASK, mask);
++}
++
++int mtk_wed_flow_add(int index)
++{
++	struct mtk_wed_hw *hw = hw_list[index];
++	int ret;
++
++	if (!hw || !hw->wed_dev)
++		return -ENODEV;
++
++	if (hw->num_flows) {
++		hw->num_flows++;
++		return 0;
++	}
++
++	mutex_lock(&hw_lock);
++	if (!hw->wed_dev) {
++		ret = -ENODEV;
++		goto out;
++	}
++
++	ret = hw->wed_dev->wlan.offload_enable(hw->wed_dev);
++	if (!ret)
++		hw->num_flows++;
++	mtk_wed_set_ext_int(hw->wed_dev, true);
++
++out:
++	mutex_unlock(&hw_lock);
++
++	return ret;
++}
++
++void mtk_wed_flow_remove(int index)
++{
++	struct mtk_wed_hw *hw = hw_list[index];
++
++	if (!hw)
++		return;
++
++	if (--hw->num_flows)
++		return;
++
++	mutex_lock(&hw_lock);
++	if (!hw->wed_dev)
++		goto out;
++
++	hw->wed_dev->wlan.offload_disable(hw->wed_dev);
++	mtk_wed_set_ext_int(hw->wed_dev, true);
++
++out:
++	mutex_unlock(&hw_lock);
++}
++
++void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
++		    void __iomem *wdma, int index)
++{
++	static const struct mtk_wed_ops wed_ops = {
++		.attach = mtk_wed_attach,
++		.tx_ring_setup = mtk_wed_tx_ring_setup,
++		.txfree_ring_setup = mtk_wed_txfree_ring_setup,
++		.start = mtk_wed_start,
++		.stop = mtk_wed_stop,
++		.reset_dma = mtk_wed_reset_dma,
++		.reg_read = wed_r32,
++		.reg_write = wed_w32,
++		.irq_get = mtk_wed_irq_get,
++		.irq_set_mask = mtk_wed_irq_set_mask,
++		.detach = mtk_wed_detach,
++	};
++	struct device_node *eth_np = eth->dev->of_node;
++	struct platform_device *pdev;
++	struct mtk_wed_hw *hw;
++	struct regmap *regs;
++	int irq;
++
++	if (!np)
++		return;
++
++	pdev = of_find_device_by_node(np);
++	if (!pdev)
++		return;
++
++	get_device(&pdev->dev);
++	irq = platform_get_irq(pdev, 0);
++	if (irq < 0)
++		return;
++
++	regs = syscon_regmap_lookup_by_phandle(np, NULL);
++	if (!regs)
++		return;
++
++	rcu_assign_pointer(mtk_soc_wed_ops, &wed_ops);
++
++	mutex_lock(&hw_lock);
++
++	if (WARN_ON(hw_list[index]))
++		goto unlock;
++
++	hw = kzalloc(sizeof(*hw), GFP_KERNEL);
++	hw->node = np;
++	hw->regs = regs;
++	hw->eth = eth;
++	hw->dev = &pdev->dev;
++	hw->wdma = wdma;
++	hw->index = index;
++	hw->irq = irq;
++	hw->mirror = syscon_regmap_lookup_by_phandle(eth_np,
++						     "mediatek,pcie-mirror");
++	hw->hifsys = syscon_regmap_lookup_by_phandle(eth_np,
++						     "mediatek,hifsys");
++	if (IS_ERR(hw->mirror) || IS_ERR(hw->hifsys)) {
++		kfree(hw);
++		goto unlock;
++	}
++
++	if (!index) {
++		regmap_write(hw->mirror, 0, 0);
++		regmap_write(hw->mirror, 4, 0);
++	}
++	mtk_wed_hw_add_debugfs(hw);
++
++	hw_list[index] = hw;
++
++unlock:
++	mutex_unlock(&hw_lock);
++}
++
++void mtk_wed_exit(void)
++{
++	int i;
++
++	rcu_assign_pointer(mtk_soc_wed_ops, NULL);
++
++	synchronize_rcu();
++
++	for (i = 0; i < ARRAY_SIZE(hw_list); i++) {
++		struct mtk_wed_hw *hw;
++
++		hw = hw_list[i];
++		if (!hw)
++			continue;
++
++		hw_list[i] = NULL;
++		debugfs_remove(hw->debugfs_dir);
++		put_device(hw->dev);
++		kfree(hw);
++	}
++}
+--- /dev/null
++++ b/drivers/net/ethernet/mediatek/mtk_wed.h
+@@ -0,0 +1,128 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/* Copyright (C) 2021 Felix Fietkau <[email protected]> */
++
++#ifndef __MTK_WED_PRIV_H
++#define __MTK_WED_PRIV_H
++
++#include <linux/soc/mediatek/mtk_wed.h>
++#include <linux/debugfs.h>
++#include <linux/regmap.h>
++
++struct mtk_eth;
++
++struct mtk_wed_hw {
++	struct device_node *node;
++	struct mtk_eth *eth;
++	struct regmap *regs;
++	struct regmap *hifsys;
++	struct device *dev;
++	void __iomem *wdma;
++	struct regmap *mirror;
++	struct dentry *debugfs_dir;
++	struct mtk_wed_device *wed_dev;
++	u32 debugfs_reg;
++	u32 num_flows;
++	char dirname[5];
++	int irq;
++	int index;
++};
++
++
++#ifdef CONFIG_NET_MEDIATEK_SOC_WED
++static inline void
++wed_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
++{
++	regmap_write(dev->hw->regs, reg, val);
++}
++
++static inline u32
++wed_r32(struct mtk_wed_device *dev, u32 reg)
++{
++	unsigned int val;
++
++	regmap_read(dev->hw->regs, reg, &val);
++
++	return val;
++}
++
++static inline void
++wdma_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
++{
++	writel(val, dev->hw->wdma + reg);
++}
++
++static inline u32
++wdma_r32(struct mtk_wed_device *dev, u32 reg)
++{
++	return readl(dev->hw->wdma + reg);
++}
++
++static inline u32
++wpdma_tx_r32(struct mtk_wed_device *dev, int ring, u32 reg)
++{
++	if (!dev->tx_ring[ring].wpdma)
++		return 0;
++
++	return readl(dev->tx_ring[ring].wpdma + reg);
++}
++
++static inline void
++wpdma_tx_w32(struct mtk_wed_device *dev, int ring, u32 reg, u32 val)
++{
++	if (!dev->tx_ring[ring].wpdma)
++		return;
++
++	writel(val, dev->tx_ring[ring].wpdma + reg);
++}
++
++static inline u32
++wpdma_txfree_r32(struct mtk_wed_device *dev, u32 reg)
++{
++	if (!dev->txfree_ring.wpdma)
++		return 0;
++
++	return readl(dev->txfree_ring.wpdma + reg);
++}
++
++static inline void
++wpdma_txfree_w32(struct mtk_wed_device *dev, u32 reg, u32 val)
++{
++	if (!dev->txfree_ring.wpdma)
++		return;
++
++	writel(val, dev->txfree_ring.wpdma + reg);
++}
++
++void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
++		    void __iomem *wdma, int index);
++void mtk_wed_exit(void);
++int mtk_wed_flow_add(int index);
++void mtk_wed_flow_remove(int index);
++#else
++static inline void
++mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
++	       void __iomem *wdma, int index)
++{
++}
++static inline void
++mtk_wed_exit(void)
++{
++}
++static inline int mtk_wed_flow_add(int index)
++{
++	return -EINVAL;
++}
++static inline void mtk_wed_flow_remove(int index)
++{
++}
++#endif
++
++#ifdef CONFIG_DEBUG_FS
++void mtk_wed_hw_add_debugfs(struct mtk_wed_hw *hw);
++#else
++static inline void mtk_wed_hw_add_debugfs(struct mtk_wed_hw *hw)
++{
++}
++#endif
++
++#endif
+--- /dev/null
++++ b/drivers/net/ethernet/mediatek/mtk_wed_debugfs.c
+@@ -0,0 +1,175 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/* Copyright (C) 2021 Felix Fietkau <[email protected]> */
++
++#include <linux/seq_file.h>
++#include "mtk_wed.h"
++#include "mtk_wed_regs.h"
++
++struct reg_dump {
++	const char *name;
++	u16 offset;
++	u8 type;
++	u8 base;
++};
++
++enum {
++	DUMP_TYPE_STRING,
++	DUMP_TYPE_WED,
++	DUMP_TYPE_WDMA,
++	DUMP_TYPE_WPDMA_TX,
++	DUMP_TYPE_WPDMA_TXFREE,
++};
++
++#define DUMP_STR(_str) { _str, 0, DUMP_TYPE_STRING }
++#define DUMP_REG(_reg, ...) { #_reg, MTK_##_reg, __VA_ARGS__ }
++#define DUMP_RING(_prefix, _base, ...)				\
++	{ _prefix " BASE", _base, __VA_ARGS__ },		\
++	{ _prefix " CNT",  _base + 0x4, __VA_ARGS__ },	\
++	{ _prefix " CIDX", _base + 0x8, __VA_ARGS__ },	\
++	{ _prefix " DIDX", _base + 0xc, __VA_ARGS__ }
++
++#define DUMP_WED(_reg) DUMP_REG(_reg, DUMP_TYPE_WED)
++#define DUMP_WED_RING(_base) DUMP_RING(#_base, MTK_##_base, DUMP_TYPE_WED)
++
++#define DUMP_WDMA(_reg) DUMP_REG(_reg, DUMP_TYPE_WDMA)
++#define DUMP_WDMA_RING(_base) DUMP_RING(#_base, MTK_##_base, DUMP_TYPE_WDMA)
++
++#define DUMP_WPDMA_TX_RING(_n) DUMP_RING("WPDMA_TX" #_n, 0, DUMP_TYPE_WPDMA_TX, _n)
++#define DUMP_WPDMA_TXFREE_RING DUMP_RING("WPDMA_RX1", 0, DUMP_TYPE_WPDMA_TXFREE)
++
++static void
++print_reg_val(struct seq_file *s, const char *name, u32 val)
++{
++	seq_printf(s, "%-32s %08x\n", name, val);
++}
++
++static void
++dump_wed_regs(struct seq_file *s, struct mtk_wed_device *dev,
++	      const struct reg_dump *regs, int n_regs)
++{
++	const struct reg_dump *cur;
++	u32 val;
++
++	for (cur = regs; cur < &regs[n_regs]; cur++) {
++		switch (cur->type) {
++		case DUMP_TYPE_STRING:
++			seq_printf(s, "%s======== %s:\n",
++				   cur > regs ? "\n" : "",
++				   cur->name);
++			continue;
++		case DUMP_TYPE_WED:
++			val = wed_r32(dev, cur->offset);
++			break;
++		case DUMP_TYPE_WDMA:
++			val = wdma_r32(dev, cur->offset);
++			break;
++		case DUMP_TYPE_WPDMA_TX:
++			val = wpdma_tx_r32(dev, cur->base, cur->offset);
++			break;
++		case DUMP_TYPE_WPDMA_TXFREE:
++			val = wpdma_txfree_r32(dev, cur->offset);
++			break;
++		}
++		print_reg_val(s, cur->name, val);
++	}
++}
++
++
++static int
++wed_txinfo_show(struct seq_file *s, void *data)
++{
++	static const struct reg_dump regs[] = {
++		DUMP_STR("WED TX"),
++		DUMP_WED(WED_TX_MIB(0)),
++		DUMP_WED_RING(WED_RING_TX(0)),
++
++		DUMP_WED(WED_TX_MIB(1)),
++		DUMP_WED_RING(WED_RING_TX(1)),
++
++		DUMP_STR("WPDMA TX"),
++		DUMP_WED(WED_WPDMA_TX_MIB(0)),
++		DUMP_WED_RING(WED_WPDMA_RING_TX(0)),
++		DUMP_WED(WED_WPDMA_TX_COHERENT_MIB(0)),
++
++		DUMP_WED(WED_WPDMA_TX_MIB(1)),
++		DUMP_WED_RING(WED_WPDMA_RING_TX(1)),
++		DUMP_WED(WED_WPDMA_TX_COHERENT_MIB(1)),
++
++		DUMP_STR("WPDMA TX"),
++		DUMP_WPDMA_TX_RING(0),
++		DUMP_WPDMA_TX_RING(1),
++
++		DUMP_STR("WED WDMA RX"),
++		DUMP_WED(WED_WDMA_RX_MIB(0)),
++		DUMP_WED_RING(WED_WDMA_RING_RX(0)),
++		DUMP_WED(WED_WDMA_RX_THRES(0)),
++		DUMP_WED(WED_WDMA_RX_RECYCLE_MIB(0)),
++		DUMP_WED(WED_WDMA_RX_PROCESSED_MIB(0)),
++
++		DUMP_WED(WED_WDMA_RX_MIB(1)),
++		DUMP_WED_RING(WED_WDMA_RING_RX(1)),
++		DUMP_WED(WED_WDMA_RX_THRES(1)),
++		DUMP_WED(WED_WDMA_RX_RECYCLE_MIB(1)),
++		DUMP_WED(WED_WDMA_RX_PROCESSED_MIB(1)),
++
++		DUMP_STR("WDMA RX"),
++		DUMP_WDMA(WDMA_GLO_CFG),
++		DUMP_WDMA_RING(WDMA_RING_RX(0)),
++		DUMP_WDMA_RING(WDMA_RING_RX(1)),
++	};
++	struct mtk_wed_hw *hw = s->private;
++	struct mtk_wed_device *dev = hw->wed_dev;
++
++	if (!dev)
++		return 0;
++
++	dump_wed_regs(s, dev, regs, ARRAY_SIZE(regs));
++
++	return 0;
++}
++DEFINE_SHOW_ATTRIBUTE(wed_txinfo);
++
++
++static int
++mtk_wed_reg_set(void *data, u64 val)
++{
++	struct mtk_wed_hw *hw = data;
++
++	regmap_write(hw->regs, hw->debugfs_reg, val);
++
++	return 0;
++}
++
++static int
++mtk_wed_reg_get(void *data, u64 *val)
++{
++	struct mtk_wed_hw *hw = data;
++	unsigned int regval;
++	int ret;
++
++	ret = regmap_read(hw->regs, hw->debugfs_reg, &regval);
++	if (ret)
++		return ret;
++
++	*val = regval;
++
++	return 0;
++}
++
++DEFINE_DEBUGFS_ATTRIBUTE(fops_regval, mtk_wed_reg_get, mtk_wed_reg_set,
++             "0x%08llx\n");
++
++void mtk_wed_hw_add_debugfs(struct mtk_wed_hw *hw)
++{
++	struct dentry *dir;
++
++	snprintf(hw->dirname, sizeof(hw->dirname), "wed%d", hw->index);
++	dir = debugfs_create_dir(hw->dirname, NULL);
++	if (!dir)
++		return;
++
++	hw->debugfs_dir = dir;
++	debugfs_create_u32("regidx", 0600, dir, &hw->debugfs_reg);
++	debugfs_create_file_unsafe("regval", 0600, dir, hw, &fops_regval);
++	debugfs_create_file_unsafe("txinfo", 0400, dir, hw, &wed_txinfo_fops);
++}
+--- /dev/null
++++ b/drivers/net/ethernet/mediatek/mtk_wed_ops.c
+@@ -0,0 +1,8 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/* Copyright (C) 2020 Felix Fietkau <[email protected]> */
++
++#include <linux/kernel.h>
++#include <linux/soc/mediatek/mtk_wed.h>
++
++const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
++EXPORT_SYMBOL_GPL(mtk_soc_wed_ops);
+--- /dev/null
++++ b/drivers/net/ethernet/mediatek/mtk_wed_regs.h
+@@ -0,0 +1,251 @@
++// SPDX-License-Identifier: GPL-2.0-only
++/* Copyright (C) 2020 Felix Fietkau <[email protected]> */
++
++#ifndef __MTK_WED_REGS_H
++#define __MTK_WED_REGS_H
++
++#define MTK_WDMA_DESC_CTRL_LEN1			GENMASK(14, 0)
++#define MTK_WDMA_DESC_CTRL_LAST_SEG1		BIT(15)
++#define MTK_WDMA_DESC_CTRL_BURST		BIT(16)
++#define MTK_WDMA_DESC_CTRL_LEN0			GENMASK(29, 16)
++#define MTK_WDMA_DESC_CTRL_LAST_SEG0		BIT(30)
++#define MTK_WDMA_DESC_CTRL_DMA_DONE		BIT(31)
++
++struct mtk_wdma_desc {
++	__le32 buf0;
++	__le32 ctrl;
++	__le32 buf1;
++	__le32 info;
++} __packed __aligned(4);
++
++#define MTK_WED_RESET					0x008
++#define MTK_WED_RESET_TX_BM				BIT(0)
++#define MTK_WED_RESET_TX_FREE_AGENT			BIT(4)
++#define MTK_WED_RESET_WPDMA_TX_DRV			BIT(8)
++#define MTK_WED_RESET_WPDMA_RX_DRV			BIT(9)
++#define MTK_WED_RESET_WPDMA_INT_AGENT			BIT(11)
++#define MTK_WED_RESET_WED_TX_DMA			BIT(12)
++#define MTK_WED_RESET_WDMA_RX_DRV			BIT(17)
++#define MTK_WED_RESET_WDMA_INT_AGENT			BIT(19)
++#define MTK_WED_RESET_WED				BIT(31)
++
++#define MTK_WED_CTRL					0x00c
++#define MTK_WED_CTRL_WPDMA_INT_AGENT_EN			BIT(0)
++#define MTK_WED_CTRL_WPDMA_INT_AGENT_BUSY		BIT(1)
++#define MTK_WED_CTRL_WDMA_INT_AGENT_EN			BIT(2)
++#define MTK_WED_CTRL_WDMA_INT_AGENT_BUSY		BIT(3)
++#define MTK_WED_CTRL_WED_TX_BM_EN			BIT(8)
++#define MTK_WED_CTRL_WED_TX_BM_BUSY			BIT(9)
++#define MTK_WED_CTRL_WED_TX_FREE_AGENT_EN		BIT(10)
++#define MTK_WED_CTRL_WED_TX_FREE_AGENT_BUSY		BIT(11)
++#define MTK_WED_CTRL_RESERVE_EN				BIT(12)
++#define MTK_WED_CTRL_RESERVE_BUSY			BIT(13)
++#define MTK_WED_CTRL_FINAL_DIDX_READ			BIT(24)
++#define MTK_WED_CTRL_MIB_READ_CLEAR			BIT(28)
++
++#define MTK_WED_EXT_INT_STATUS				0x020
++#define MTK_WED_EXT_INT_STATUS_TF_LEN_ERR		BIT(0)
++#define MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD		BIT(1)
++#define MTK_WED_EXT_INT_STATUS_TKID_TITO_INVALID	BIT(4)
++#define MTK_WED_EXT_INT_STATUS_TX_FBUF_LO_TH		BIT(8)
++#define MTK_WED_EXT_INT_STATUS_TX_FBUF_HI_TH		BIT(9)
++#define MTK_WED_EXT_INT_STATUS_RX_FBUF_LO_TH		BIT(12)
++#define MTK_WED_EXT_INT_STATUS_RX_FBUF_HI_TH		BIT(13)
++#define MTK_WED_EXT_INT_STATUS_RX_DRV_R_RESP_ERR	BIT(16)
++#define MTK_WED_EXT_INT_STATUS_RX_DRV_W_RESP_ERR	BIT(17)
++#define MTK_WED_EXT_INT_STATUS_RX_DRV_COHERENT		BIT(18)
++#define MTK_WED_EXT_INT_STATUS_RX_DRV_INIT_WDMA_EN	BIT(19)
++#define MTK_WED_EXT_INT_STATUS_RX_DRV_BM_DMAD_COHERENT	BIT(20)
++#define MTK_WED_EXT_INT_STATUS_TX_DRV_R_RESP_ERR	BIT(21)
++#define MTK_WED_EXT_INT_STATUS_TX_DRV_W_RESP_ERR	BIT(22)
++#define MTK_WED_EXT_INT_STATUS_RX_DRV_DMA_RECYCLE	BIT(24)
++#define MTK_WED_EXT_INT_STATUS_ERROR_MASK		(MTK_WED_EXT_INT_STATUS_TF_LEN_ERR | \
++							 MTK_WED_EXT_INT_STATUS_TKID_WO_PYLD | \
++							 MTK_WED_EXT_INT_STATUS_TKID_TITO_INVALID | \
++							 MTK_WED_EXT_INT_STATUS_RX_DRV_R_RESP_ERR | \
++							 MTK_WED_EXT_INT_STATUS_RX_DRV_W_RESP_ERR | \
++							 MTK_WED_EXT_INT_STATUS_RX_DRV_INIT_WDMA_EN | \
++							 MTK_WED_EXT_INT_STATUS_TX_DRV_R_RESP_ERR | \
++							 MTK_WED_EXT_INT_STATUS_TX_DRV_W_RESP_ERR)
++
++#define MTK_WED_EXT_INT_MASK				0x028
++
++#define MTK_WED_STATUS					0x060
++#define MTK_WED_STATUS_TX				GENMASK(15, 8)
++
++#define MTK_WED_TX_BM_CTRL				0x080
++#define MTK_WED_TX_BM_CTRL_VLD_GRP_NUM			GENMASK(6, 0)
++#define MTK_WED_TX_BM_CTRL_RSV_GRP_NUM			GENMASK(22, 16)
++#define MTK_WED_TX_BM_CTRL_PAUSE			BIT(28)
++
++#define MTK_WED_TX_BM_BASE				0x084
++
++#define MTK_WED_TX_BM_TKID				0x088
++#define MTK_WED_TX_BM_TKID_START			GENMASK(15, 0)
++#define MTK_WED_TX_BM_TKID_END				GENMASK(31, 16)
++
++#define MTK_WED_TX_BM_BUF_LEN				0x08c
++
++#define MTK_WED_TX_BM_INTF				0x09c
++#define MTK_WED_TX_BM_INTF_TKID				GENMASK(15, 0)
++#define MTK_WED_TX_BM_INTF_TKFIFO_FDEP			GENMASK(23, 16)
++#define MTK_WED_TX_BM_INTF_TKID_VALID			BIT(28)
++#define MTK_WED_TX_BM_INTF_TKID_READ			BIT(29)
++
++#define MTK_WED_TX_BM_DYN_THR				0x0a0
++#define MTK_WED_TX_BM_DYN_THR_LO			GENMASK(6, 0)
++#define MTK_WED_TX_BM_DYN_THR_HI			GENMASK(22, 16)
++
++#define MTK_WED_INT_STATUS				0x200
++#define MTK_WED_INT_MASK				0x204
++
++#define MTK_WED_GLO_CFG					0x208
++#define MTK_WED_GLO_CFG_TX_DMA_EN			BIT(0)
++#define MTK_WED_GLO_CFG_TX_DMA_BUSY			BIT(1)
++#define MTK_WED_GLO_CFG_RX_DMA_EN			BIT(2)
++#define MTK_WED_GLO_CFG_RX_DMA_BUSY			BIT(3)
++#define MTK_WED_GLO_CFG_RX_BT_SIZE			GENMASK(5, 4)
++#define MTK_WED_GLO_CFG_TX_WB_DDONE			BIT(6)
++#define MTK_WED_GLO_CFG_BIG_ENDIAN			BIT(7)
++#define MTK_WED_GLO_CFG_DIS_BT_SIZE_ALIGN		BIT(8)
++#define MTK_WED_GLO_CFG_TX_BT_SIZE_LO			BIT(9)
++#define MTK_WED_GLO_CFG_MULTI_DMA_EN			GENMASK(11, 10)
++#define MTK_WED_GLO_CFG_FIFO_LITTLE_ENDIAN		BIT(12)
++#define MTK_WED_GLO_CFG_MI_DEPTH_RD			GENMASK(21, 13)
++#define MTK_WED_GLO_CFG_TX_BT_SIZE_HI			GENMASK(23, 22)
++#define MTK_WED_GLO_CFG_SW_RESET			BIT(24)
++#define MTK_WED_GLO_CFG_FIRST_TOKEN_ONLY		BIT(26)
++#define MTK_WED_GLO_CFG_OMIT_RX_INFO			BIT(27)
++#define MTK_WED_GLO_CFG_OMIT_TX_INFO			BIT(28)
++#define MTK_WED_GLO_CFG_BYTE_SWAP			BIT(29)
++#define MTK_WED_GLO_CFG_RX_2B_OFFSET			BIT(31)
++
++#define MTK_WED_RESET_IDX				0x20c
++#define MTK_WED_RESET_IDX_TX				GENMASK(3, 0)
++#define MTK_WED_RESET_IDX_RX				GENMASK(17, 16)
++
++#define MTK_WED_TX_MIB(_n)				(0x2a0 + (_n) * 4)
++
++#define MTK_WED_RING_TX(_n)				(0x300 + (_n) * 0x10)
++
++#define MTK_WED_RING_RX(_n)				(0x400 + (_n) * 0x10)
++
++#define MTK_WED_WPDMA_INT_TRIGGER			0x504
++#define MTK_WED_WPDMA_INT_TRIGGER_RX_DONE		BIT(1)
++#define MTK_WED_WPDMA_INT_TRIGGER_TX_DONE		GENMASK(5, 4)
++
++#define MTK_WED_WPDMA_GLO_CFG				0x508
++#define MTK_WED_WPDMA_GLO_CFG_TX_DRV_EN			BIT(0)
++#define MTK_WED_WPDMA_GLO_CFG_TX_DRV_BUSY		BIT(1)
++#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_EN			BIT(2)
++#define MTK_WED_WPDMA_GLO_CFG_RX_DRV_BUSY		BIT(3)
++#define MTK_WED_WPDMA_GLO_CFG_RX_BT_SIZE		GENMASK(5, 4)
++#define MTK_WED_WPDMA_GLO_CFG_TX_WB_DDONE		BIT(6)
++#define MTK_WED_WPDMA_GLO_CFG_BIG_ENDIAN		BIT(7)
++#define MTK_WED_WPDMA_GLO_CFG_DIS_BT_SIZE_ALIGN		BIT(8)
++#define MTK_WED_WPDMA_GLO_CFG_TX_BT_SIZE_LO		BIT(9)
++#define MTK_WED_WPDMA_GLO_CFG_MULTI_DMA_EN		GENMASK(11, 10)
++#define MTK_WED_WPDMA_GLO_CFG_FIFO_LITTLE_ENDIAN	BIT(12)
++#define MTK_WED_WPDMA_GLO_CFG_MI_DEPTH_RD		GENMASK(21, 13)
++#define MTK_WED_WPDMA_GLO_CFG_TX_BT_SIZE_HI		GENMASK(23, 22)
++#define MTK_WED_WPDMA_GLO_CFG_SW_RESET			BIT(24)
++#define MTK_WED_WPDMA_GLO_CFG_FIRST_TOKEN_ONLY		BIT(26)
++#define MTK_WED_WPDMA_GLO_CFG_OMIT_RX_INFO		BIT(27)
++#define MTK_WED_WPDMA_GLO_CFG_OMIT_TX_INFO		BIT(28)
++#define MTK_WED_WPDMA_GLO_CFG_BYTE_SWAP			BIT(29)
++#define MTK_WED_WPDMA_GLO_CFG_RX_2B_OFFSET		BIT(31)
++
++#define MTK_WED_WPDMA_RESET_IDX				0x50c
++#define MTK_WED_WPDMA_RESET_IDX_TX			GENMASK(3, 0)
++#define MTK_WED_WPDMA_RESET_IDX_RX			GENMASK(17, 16)
++
++#define MTK_WED_WPDMA_INT_CTRL				0x520
++#define MTK_WED_WPDMA_INT_CTRL_SUBRT_ADV		BIT(21)
++
++#define MTK_WED_WPDMA_INT_MASK				0x524
++
++#define MTK_WED_PCIE_CFG_BASE				0x560
++
++#define MTK_WED_PCIE_INT_TRIGGER			0x570
++#define MTK_WED_PCIE_INT_TRIGGER_STATUS			BIT(16)
++
++#define MTK_WED_WPDMA_CFG_BASE				0x580
++
++#define MTK_WED_WPDMA_TX_MIB(_n)			(0x5a0 + (_n) * 4)
++#define MTK_WED_WPDMA_TX_COHERENT_MIB(_n)		(0x5d0 + (_n) * 4)
++
++#define MTK_WED_WPDMA_RING_TX(_n)			(0x600 + (_n) * 0x10)
++#define MTK_WED_WPDMA_RING_RX(_n)			(0x700 + (_n) * 0x10)
++#define MTK_WED_WDMA_RING_RX(_n)			(0x900 + (_n) * 0x10)
++#define MTK_WED_WDMA_RX_THRES(_n)			(0x940 + (_n) * 0x4)
++
++#define MTK_WED_WDMA_GLO_CFG				0xa04
++#define MTK_WED_WDMA_GLO_CFG_TX_DRV_EN			BIT(0)
++#define MTK_WED_WDMA_GLO_CFG_RX_DRV_EN			BIT(2)
++#define MTK_WED_WDMA_GLO_CFG_RX_DRV_BUSY		BIT(3)
++#define MTK_WED_WDMA_GLO_CFG_BT_SIZE			GENMASK(5, 4)
++#define MTK_WED_WDMA_GLO_CFG_TX_WB_DDONE		BIT(6)
++#define MTK_WED_WDMA_GLO_CFG_RX_DIS_FSM_AUTO_IDLE	BIT(13)
++#define MTK_WED_WDMA_GLO_CFG_WCOMPLETE_SEL		BIT(16)
++#define MTK_WED_WDMA_GLO_CFG_INIT_PHASE_RXDMA_BYPASS	BIT(17)
++#define MTK_WED_WDMA_GLO_CFG_INIT_PHASE_BYPASS		BIT(18)
++#define MTK_WED_WDMA_GLO_CFG_FSM_RETURN_IDLE		BIT(19)
++#define MTK_WED_WDMA_GLO_CFG_WAIT_COHERENT		BIT(20)
++#define MTK_WED_WDMA_GLO_CFG_AXI_W_AFTER_AW		BIT(21)
++#define MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY_SINGLE_W	BIT(22)
++#define MTK_WED_WDMA_GLO_CFG_IDLE_DMAD_SUPPLY		BIT(23)
++#define MTK_WED_WDMA_GLO_CFG_DYNAMIC_SKIP_DMAD_PREP	BIT(24)
++#define MTK_WED_WDMA_GLO_CFG_DYNAMIC_DMAD_RECYCLE	BIT(25)
++#define MTK_WED_WDMA_GLO_CFG_RST_INIT_COMPLETE		BIT(26)
++#define MTK_WED_WDMA_GLO_CFG_RXDRV_CLKGATE_BYPASS	BIT(30)
++
++#define MTK_WED_WDMA_RESET_IDX				0xa08
++#define MTK_WED_WDMA_RESET_IDX_RX			GENMASK(17, 16)
++#define MTK_WED_WDMA_RESET_IDX_DRV			GENMASK(25, 24)
++
++#define MTK_WED_WDMA_INT_TRIGGER			0xa28
++#define MTK_WED_WDMA_INT_TRIGGER_RX_DONE		GENMASK(17, 16)
++
++#define MTK_WED_WDMA_INT_CTRL				0xa2c
++#define MTK_WED_WDMA_INT_CTRL_POLL_SRC_SEL		GENMASK(17, 16)
++
++#define MTK_WED_WDMA_OFFSET0				0xaa4
++#define MTK_WED_WDMA_OFFSET1				0xaa8
++
++#define MTK_WED_WDMA_RX_MIB(_n)				(0xae0 + (_n) * 4)
++#define MTK_WED_WDMA_RX_RECYCLE_MIB(_n)			(0xae8 + (_n) * 4)
++#define MTK_WED_WDMA_RX_PROCESSED_MIB(_n)		(0xaf0 + (_n) * 4)
++
++#define MTK_WED_RING_OFS_BASE				0x00
++#define MTK_WED_RING_OFS_COUNT				0x04
++#define MTK_WED_RING_OFS_CPU_IDX			0x08
++#define MTK_WED_RING_OFS_DMA_IDX			0x0c
++
++#define MTK_WDMA_RING_RX(_n)				(0x100 + (_n) * 0x10)
++
++#define MTK_WDMA_GLO_CFG				0x204
++#define MTK_WDMA_GLO_CFG_RX_INFO_PRERES			GENMASK(28, 26)
++
++#define MTK_WDMA_RESET_IDX				0x208
++#define MTK_WDMA_RESET_IDX_TX				GENMASK(3, 0)
++#define MTK_WDMA_RESET_IDX_RX				GENMASK(17, 16)
++
++#define MTK_WDMA_INT_MASK				0x228
++#define MTK_WDMA_INT_MASK_TX_DONE			GENMASK(3, 0)
++#define MTK_WDMA_INT_MASK_RX_DONE			GENMASK(17, 16)
++#define MTK_WDMA_INT_MASK_TX_DELAY			BIT(28)
++#define MTK_WDMA_INT_MASK_TX_COHERENT			BIT(29)
++#define MTK_WDMA_INT_MASK_RX_DELAY			BIT(30)
++#define MTK_WDMA_INT_MASK_RX_COHERENT			BIT(31)
++
++#define MTK_WDMA_INT_GRP1				0x250
++#define MTK_WDMA_INT_GRP2				0x254
++
++#define MTK_PCIE_MIRROR_MAP(n)				((n) ? 0x4 : 0x0)
++#define MTK_PCIE_MIRROR_MAP_EN				BIT(0)
++#define MTK_PCIE_MIRROR_MAP_WED_ID			BIT(1)
++
++/* DMA channel mapping */
++#define HIFSYS_DMA_AG_MAP				0x008
++
++#endif
+--- /dev/null
++++ b/include/linux/soc/mediatek/mtk_wed.h
+@@ -0,0 +1,131 @@
++#ifndef __MTK_WED_H
++#define __MTK_WED_H
++
++#include <linux/kernel.h>
++#include <linux/rcupdate.h>
++#include <linux/regmap.h>
++#include <linux/pci.h>
++
++#define MTK_WED_TX_QUEUES		2
++
++struct mtk_wed_hw;
++struct mtk_wdma_desc;
++
++struct mtk_wed_ring {
++	struct mtk_wdma_desc *desc;
++	dma_addr_t desc_phys;
++	int size;
++
++	u32 reg_base;
++	void __iomem *wpdma;
++};
++
++struct mtk_wed_device {
++#ifdef CONFIG_NET_MEDIATEK_SOC_WED
++	const struct mtk_wed_ops *ops;
++	struct device *dev;
++	struct mtk_wed_hw *hw;
++	bool init_done, running;
++	int wdma_idx;
++	int irq;
++
++	struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES];
++	struct mtk_wed_ring txfree_ring;
++	struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES];
++
++	struct {
++		int size;
++		void **pages;
++		struct mtk_wdma_desc *desc;
++		dma_addr_t desc_phys;
++	} buf_ring;
++
++	/* filled by driver: */
++	struct {
++		struct pci_dev *pci_dev;
++
++		u32 wpdma_phys;
++
++		u16 token_start;
++		unsigned int nbuf;
++
++		u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id);
++		int (*offload_enable)(struct mtk_wed_device *wed);
++		void (*offload_disable)(struct mtk_wed_device *wed);
++	} wlan;
++#endif
++};
++
++struct mtk_wed_ops {
++	int (*attach)(struct mtk_wed_device *dev);
++	int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring,
++			     void __iomem *regs);
++	int (*txfree_ring_setup)(struct mtk_wed_device *dev,
++				 void __iomem *regs);
++	void (*detach)(struct mtk_wed_device *dev);
++
++	void (*stop)(struct mtk_wed_device *dev);
++	void (*start)(struct mtk_wed_device *dev, u32 irq_mask);
++	void (*reset_dma)(struct mtk_wed_device *dev);
++
++	u32 (*reg_read)(struct mtk_wed_device *dev, u32 reg);
++	void (*reg_write)(struct mtk_wed_device *dev, u32 reg, u32 val);
++
++	u32 (*irq_get)(struct mtk_wed_device *dev, u32 mask);
++	void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask);
++};
++
++extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
++
++static inline int
++mtk_wed_device_attach(struct mtk_wed_device *dev)
++{
++	int ret = -ENODEV;
++
++#ifdef CONFIG_NET_MEDIATEK_SOC_WED
++	rcu_read_lock();
++	dev->ops = rcu_dereference(mtk_soc_wed_ops);
++	if (dev->ops)
++		ret = dev->ops->attach(dev);
++	else
++		rcu_read_unlock();
++
++	if (ret)
++		dev->ops = NULL;
++#endif
++
++	return ret;
++}
++
++#ifdef CONFIG_NET_MEDIATEK_SOC_WED
++#define mtk_wed_device_active(_dev) !!(_dev)->ops
++#define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev)
++#define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask)
++#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \
++	(_dev)->ops->tx_ring_setup(_dev, _ring, _regs)
++#define mtk_wed_device_txfree_ring_setup(_dev, _regs) \
++	(_dev)->ops->txfree_ring_setup(_dev, _regs)
++#define mtk_wed_device_reg_read(_dev, _reg) \
++	(_dev)->ops->reg_read(_dev, _reg)
++#define mtk_wed_device_reg_write(_dev, _reg, _val) \
++	(_dev)->ops->reg_write(_dev, _reg, _val)
++#define mtk_wed_device_irq_get(_dev, _mask) \
++	(_dev)->ops->irq_get(_dev, _mask)
++#define mtk_wed_device_irq_set_mask(_dev, _mask) \
++	(_dev)->ops->irq_set_mask(_dev, _mask)
++#else
++static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
++{
++	return false;
++}
++#define mtk_wed_device_detach(_dev) do {} while (0)
++#define mtk_wed_device_start(_dev, _mask) do {} while (0)
++#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV
++#define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV
++#define mtk_wed_device_reg_read(_dev, _reg) 0
++#define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0)
++#define mtk_wed_device_irq_get(_dev, _mask) 0
++#define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0)
++#endif
++
++#endif
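
For context (not part of the patch): a rough sketch, from the WLAN driver side, of how the helpers exported in include/linux/soc/mediatek/mtk_wed.h above are meant to be called. Everything prefixed example_ is hypothetical, and the token range, buffer count and register offsets are placeholders; the real values come from the MT7615/MT7915 driver.

#include <linux/pci.h>
#include <linux/io.h>
#include <linux/string.h>
#include <linux/soc/mediatek/mtk_wed.h>

/* Placeholder: write the wifi TX descriptor into each WED-owned buffer
 * and return its size; a real driver fills its own TXD format here. */
static u32 example_init_buf(void *ptr, dma_addr_t phys, int token_id)
{
	memset(ptr, 0, 128);
	return 128;
}

static int example_offload_enable(struct mtk_wed_device *wed)
{
	return 0;	/* switch the wifi firmware into offload mode here */
}

static void example_offload_disable(struct mtk_wed_device *wed)
{
}

static int example_wed_init(struct mtk_wed_device *wed, struct pci_dev *pdev,
			    void __iomem *wpdma, u32 wpdma_phys, u32 irq_mask)
{
	int i, ret;

	wed->wlan.pci_dev = pdev;
	wed->wlan.wpdma_phys = wpdma_phys;	/* PCIe view of the WPDMA base */
	wed->wlan.token_start = 0;		/* placeholder token range */
	wed->wlan.nbuf = 16384;			/* placeholder buffer count */
	wed->wlan.init_buf = example_init_buf;
	wed->wlan.offload_enable = example_offload_enable;
	wed->wlan.offload_disable = example_offload_disable;

	ret = mtk_wed_device_attach(wed);
	if (ret)
		return ret;	/* no WED hardware claimed, run without offload */

	/* Hand the per-queue TX rings and the txfree ring over to WED;
	 * the 0x300/0x400 offsets are placeholders for the real WPDMA
	 * ring register blocks of the wifi chip. */
	for (i = 0; i < MTK_WED_TX_QUEUES; i++)
		mtk_wed_device_tx_ring_setup(wed, i, wpdma + 0x300 + i * 0x10);
	mtk_wed_device_txfree_ring_setup(wed, wpdma + 0x400);

	mtk_wed_device_start(wed, irq_mask);
	return 0;
}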

Some files were not shown because too many files changed in this diff