vor 4 Jahren · 05158082f6
--- a/target/linux/generic/config-5.15
+++ b/target/linux/generic/config-5.15
@@ -3195,6 +3195,9 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
 
				 # CONFIG_LPC_ICH is not set
			
 
				 # CONFIG_LPC_SCH is not set
			
 
				 # CONFIG_LP_CONSOLE is not set
			
 
				+CONFIG_LRU_GEN=y
			
 
				+CONFIG_LRU_GEN_ENABLED=y
			
 
				+# CONFIG_LRU_GEN_STATS is not set
			
 
				 # CONFIG_LSI_ET1011C_PHY is not set
			
 
				 CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity"
			
 
				 CONFIG_LSM_MMAP_MIN_ADDR=65536
			
@@ -4388,6 +4391,7 @@ CONFIG_NMI_LOG_BUF_SHIFT=13
 
				 # CONFIG_NO_HZ is not set
			
 
				 # CONFIG_NO_HZ_FULL is not set
			
 
				 # CONFIG_NO_HZ_IDLE is not set
			
 
				+CONFIG_NR_LRU_GENS=7
			
 
				 # CONFIG_NS83820 is not set
			
 
				 # CONFIG_NTB is not set
			
 
				 # CONFIG_NTFS3_64BIT_CLUSTER is not set
			
@@ -6480,6 +6484,7 @@ CONFIG_THIN_ARCHIVES=y
 
				 # CONFIG_THUNDER_NIC_VF is not set
			
 
				 # CONFIG_TICK_CPU_ACCOUNTING is not set
			
 
				 CONFIG_TICK_ONESHOT=y
			
 
				+CONFIG_TIERS_PER_GEN=4
			
 
				 # CONFIG_TIFM_CORE is not set
			
 
				 # CONFIG_TIGON3 is not set
			
 
				 # CONFIG_TIMB_DMA is not set
			
--- a/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch
+++ b/target/linux/generic/pending-5.15/020-00-mm-x86-arm64-add-arch_has_hw_pte_young.patch
@@ -0,0 +1,169 @@
 
				+From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <yuzhao@google.com>
			
 
				+Date: Wed, 4 Aug 2021 01:31:34 -0600
			
 
				+Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young()
			
 
				+
			
 
				+Some architectures automatically set the accessed bit in PTEs, e.g.,
			
 
				+x86 and arm64 v8.2. On architectures that do not have this capability,
			
 
				+clearing the accessed bit in a PTE triggers a page fault following the
			
 
				+TLB miss of this PTE.
			
 
				+
			
 
				+Being aware of this capability can help make better decisions, i.e.,
			
 
				+whether to limit the size of each batch of PTEs and the burst of
			
 
				+batches when clearing the accessed bit.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <yuzhao@google.com>
			
 
				+Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
			
 
				+---
			
 
				+ arch/arm64/include/asm/cpufeature.h |  5 +++++
			
 
				+ arch/arm64/include/asm/pgtable.h    | 13 ++++++++-----
			
 
				+ arch/arm64/kernel/cpufeature.c      | 10 ++++++++++
			
 
				+ arch/arm64/tools/cpucaps            |  1 +
			
 
				+ arch/x86/include/asm/pgtable.h      |  6 +++---
			
 
				+ include/linux/pgtable.h             | 13 +++++++++++++
			
 
				+ mm/memory.c                         | 14 +-------------
			
 
				+ 7 files changed, 41 insertions(+), 21 deletions(-)
			
 
				+
			
 
				+--- a/arch/arm64/include/asm/cpufeature.h
			
 
				++++ b/arch/arm64/include/asm/cpufeature.h
			
 
				+@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r
			
 
				+ 		cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
			
 
				+ }
			
 
				+ 
			
 
				++static inline bool system_has_hw_af(void)
			
 
				++{
			
 
				++	return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF);
			
 
				++}
			
 
				++
			
 
				+ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
			
 
				+ 
			
 
				+ static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
			
 
				+--- a/arch/arm64/include/asm/pgtable.h
			
 
				++++ b/arch/arm64/include/asm/pgtable.h
			
 
				+@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru
			
 
				+  * page after fork() + CoW for pfn mappings. We don't always have a
			
 
				+  * hardware-managed access flag on arm64.
			
 
				+  */
			
 
				+-static inline bool arch_faults_on_old_pte(void)
			
 
				++static inline bool arch_has_hw_pte_young(bool local)
			
 
				+ {
			
 
				+-	WARN_ON(preemptible());
			
 
				++	if (local) {
			
 
				++		WARN_ON(preemptible());
			
 
				++		return cpu_has_hw_af();
			
 
				++	}
			
 
				+ 
			
 
				+-	return !cpu_has_hw_af();
			
 
				++	return system_has_hw_af();
			
 
				+ }
			
 
				+-#define arch_faults_on_old_pte		arch_faults_on_old_pte
			
 
				++#define arch_has_hw_pte_young		arch_has_hw_pte_young
			
 
				+ 
			
 
				+ /*
			
 
				+  * Experimentally, it's cheap to set the access flag in hardware and we
			
 
				+@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt
			
 
				+  */
			
 
				+ static inline bool arch_wants_old_prefaulted_pte(void)
			
 
				+ {
			
 
				+-	return !arch_faults_on_old_pte();
			
 
				++	return arch_has_hw_pte_young(true);
			
 
				+ }
			
 
				+ #define arch_wants_old_prefaulted_pte	arch_wants_old_prefaulted_pte
			
 
				+ 
			
 
				+--- a/arch/arm64/kernel/cpufeature.c
			
 
				++++ b/arch/arm64/kernel/cpufeature.c
			
 
				+@@ -2184,6 +2184,16 @@ static const struct arm64_cpu_capabiliti
			
 
				+ 		.matches = has_hw_dbm,
			
 
				+ 		.cpu_enable = cpu_enable_hw_dbm,
			
 
				+ 	},
			
 
				++	{
			
 
				++		.desc = "Hardware update of the Access flag",
			
 
				++		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
			
 
				++		.capability = ARM64_HW_AF,
			
 
				++		.sys_reg = SYS_ID_AA64MMFR1_EL1,
			
 
				++		.sign = FTR_UNSIGNED,
			
 
				++		.field_pos = ID_AA64MMFR1_HADBS_SHIFT,
			
 
				++		.min_field_value = 1,
			
 
				++		.matches = has_cpuid_feature,
			
 
				++	},
			
 
				+ #endif
			
 
				+ 	{
			
 
				+ 		.desc = "CRC32 instructions",
			
 
				+--- a/arch/arm64/tools/cpucaps
			
 
				++++ b/arch/arm64/tools/cpucaps
			
 
				+@@ -35,6 +35,7 @@ HAS_STAGE2_FWB
			
 
				+ HAS_SYSREG_GIC_CPUIF
			
 
				+ HAS_TLB_RANGE
			
 
				+ HAS_VIRT_HOST_EXTN
			
 
				++HW_AF
			
 
				+ HW_DBM
			
 
				+ KVM_PROTECTED_MODE
			
 
				+ MISMATCHED_CACHE_TYPE
			
 
				+--- a/arch/x86/include/asm/pgtable.h
			
 
				++++ b/arch/x86/include/asm/pgtable.h
			
 
				+@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
			
 
				+ 	return boot_cpu_has_bug(X86_BUG_L1TF);
			
 
				+ }
			
 
				+ 
			
 
				+-#define arch_faults_on_old_pte arch_faults_on_old_pte
			
 
				+-static inline bool arch_faults_on_old_pte(void)
			
 
				++#define arch_has_hw_pte_young arch_has_hw_pte_young
			
 
				++static inline bool arch_has_hw_pte_young(bool local)
			
 
				+ {
			
 
				+-	return false;
			
 
				++	return true;
			
 
				+ }
			
 
				+ 
			
 
				+ #endif	/* __ASSEMBLY__ */
			
 
				+--- a/include/linux/pgtable.h
			
 
				++++ b/include/linux/pgtable.h
			
 
				+@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
			
 
				+ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
			
 
				+ #endif
			
 
				+ 
			
 
				++#ifndef arch_has_hw_pte_young
			
 
				++/*
			
 
				++ * Return whether the accessed bit is supported by the local CPU or all CPUs.
			
 
				++ *
			
 
				++ * Architectures with a hardware-managed access flag must implement their
			
 
				++ * own helper. The default, "false", means an old PTE will take a page fault.
			
 
				++ */
			
 
				++static inline bool arch_has_hw_pte_young(bool local)
			
 
				++{
			
 
				++	return false;
			
 
				++}
			
 
				++#endif
			
 
				++
			
 
				+ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
			
 
				+ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
			
 
				+ 				       unsigned long address,
			
 
				+--- a/mm/memory.c
			
 
				++++ b/mm/memory.c
			
 
				+@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
			
 
				+ 					2;
			
 
				+ #endif
			
 
				+ 
			
 
				+-#ifndef arch_faults_on_old_pte
			
 
				+-static inline bool arch_faults_on_old_pte(void)
			
 
				+-{
			
 
				+-	/*
			
 
				+-	 * Those arches which don't have hw access flag feature need to
			
 
				+-	 * implement their own helper. By default, "true" means pagefault
			
 
				+-	 * will be hit on old pte.
			
 
				+-	 */
			
 
				+-	return true;
			
 
				+-}
			
 
				+-#endif
			
 
				+-
			
 
				+ #ifndef arch_wants_old_prefaulted_pte
			
 
				+ static inline bool arch_wants_old_prefaulted_pte(void)
			
 
				+ {
			
 
				+@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
			
 
				+ 	 * On architectures with software "accessed" bits, we would
			
 
				+ 	 * take a double page fault, so mark it accessed here.
			
 
				+ 	 */
			
 
				+-	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
			
 
				++	if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) {
			
 
				+ 		pte_t entry;
			
 
				+ 
			
 
				+ 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
			
--- a/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch
+++ b/target/linux/generic/pending-5.15/020-01-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch
@@ -0,0 +1,111 @@
 
				+From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <yuzhao@google.com>
			
 
				+Date: Sat, 26 Sep 2020 21:17:18 -0600
			
 
				+Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
			
 
				+
			
 
				+Some architectures support the accessed bit on non-leaf PMD entries,
			
 
				+e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using
			
 
				+it as part of linear address translation [1]. As an optimization, page
			
 
				+table walkers who are interested in the accessed bit can skip the PTEs
			
 
				+under a non-leaf PMD entry if the accessed bit is cleared on this PMD
			
 
				+entry.
			
 
				+
			
 
				+Although an inline function may be preferable, this capability is
			
 
				+added as a configuration option to look consistent when used with the
			
 
				+existing macros.
			
 
				+
			
 
				+[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
			
 
				+     Volume 3 (June 2021), section 4.8
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <yuzhao@google.com>
			
 
				+Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
			
 
				+Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
			
 
				+---
			
 
				+ arch/Kconfig                   | 9 +++++++++
			
 
				+ arch/x86/Kconfig               | 1 +
			
 
				+ arch/x86/include/asm/pgtable.h | 3 ++-
			
 
				+ arch/x86/mm/pgtable.c          | 5 ++++-
			
 
				+ include/linux/pgtable.h        | 4 ++--
			
 
				+ 5 files changed, 18 insertions(+), 4 deletions(-)
			
 
				+
			
 
				+--- a/arch/Kconfig
			
 
				++++ b/arch/Kconfig
			
 
				+@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT
			
 
				+ config ARCH_HAS_PARANOID_L1D_FLUSH
			
 
				+ 	bool
			
 
				+ 
			
 
				++config ARCH_HAS_NONLEAF_PMD_YOUNG
			
 
				++	bool
			
 
				++	depends on PGTABLE_LEVELS > 2
			
 
				++	help
			
 
				++	  Architectures that select this are able to set the accessed bit on
			
 
				++	  non-leaf PMD entries in addition to leaf PTE entries where pages are
			
 
				++	  mapped. For them, page table walkers that clear the accessed bit may
			
 
				++	  stop at non-leaf PMD entries if they do not see the accessed bit.
			
 
				++
			
 
				+ source "kernel/gcov/Kconfig"
			
 
				+ 
			
 
				+ source "scripts/gcc-plugins/Kconfig"
			
 
				+--- a/arch/x86/Kconfig
			
 
				++++ b/arch/x86/Kconfig
			
 
				+@@ -84,6 +84,7 @@ config X86
			
 
				+ 	select ARCH_HAS_PMEM_API		if X86_64
			
 
				+ 	select ARCH_HAS_PTE_DEVMAP		if X86_64
			
 
				+ 	select ARCH_HAS_PTE_SPECIAL
			
 
				++	select ARCH_HAS_NONLEAF_PMD_YOUNG	if X86_64
			
 
				+ 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
			
 
				+ 	select ARCH_HAS_COPY_MC			if X86_64
			
 
				+ 	select ARCH_HAS_SET_MEMORY
			
 
				+--- a/arch/x86/include/asm/pgtable.h
			
 
				++++ b/arch/x86/include/asm/pgtable.h
			
 
				+@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad
			
 
				+ 
			
 
				+ static inline int pmd_bad(pmd_t pmd)
			
 
				+ {
			
 
				+-	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
			
 
				++	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
			
 
				++	       (_KERNPG_TABLE & ~_PAGE_ACCESSED);
			
 
				+ }
			
 
				+ 
			
 
				+ static inline unsigned long pages_to_mb(unsigned long npg)
			
 
				+--- a/arch/x86/mm/pgtable.c
			
 
				++++ b/arch/x86/mm/pgtable.c
			
 
				+@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
			
 
				+ 	return ret;
			
 
				+ }
			
 
				+ 
			
 
				+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
			
 
				+ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			
 
				+ 			      unsigned long addr, pmd_t *pmdp)
			
 
				+ {
			
 
				+@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
			
 
				+ 
			
 
				+ 	return ret;
			
 
				+ }
			
 
				++#endif
			
 
				++
			
 
				++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				+ int pudp_test_and_clear_young(struct vm_area_struct *vma,
			
 
				+ 			      unsigned long addr, pud_t *pudp)
			
 
				+ {
			
 
				+--- a/include/linux/pgtable.h
			
 
				++++ b/include/linux/pgtable.h
			
 
				+@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
			
 
				+ #endif
			
 
				+ 
			
 
				+ #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
			
 
				+-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
			
 
				+ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			
 
				+ 					    unsigned long address,
			
 
				+ 					    pmd_t *pmdp)
			
 
				+@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
			
 
				+ 	BUILD_BUG();
			
 
				+ 	return 0;
			
 
				+ }
			
 
				+-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
			
 
				++#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
			
 
				+ #endif
			
 
				+ 
			
 
				+ #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
			
--- a/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch
+++ b/target/linux/generic/pending-5.15/020-02-mm-vmscan.c-refactor-shrink_node.patch
@@ -0,0 +1,224 @@
 
				+From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <yuzhao@google.com>
			
 
				+Date: Sun, 27 Sep 2020 20:49:08 -0600
			
 
				+Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node()
			
 
				+
			
 
				+This patch refactors shrink_node(), moving the scan balance setup into
			
 
				+prepare_scan_count(), to make upcoming mm/vmscan.c changes more readable.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <yuzhao@google.com>
			
 
				+Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
			
 
				+Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
			
 
				+---
			
 
				+ mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
			
 
				+ 1 file changed, 98 insertions(+), 88 deletions(-)
			
 
				+
			
 
				+--- a/mm/vmscan.c
			
 
				++++ b/mm/vmscan.c
			
 
				+@@ -2562,6 +2562,103 @@ enum scan_balance {
			
 
				+ 	SCAN_FILE,
			
 
				+ };
			
 
				+ 
			
 
				++static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
			
 
				++{
			
 
				++	unsigned long file;
			
 
				++	struct lruvec *target_lruvec;
			
 
				++
			
 
				++	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
			
 
				++
			
 
				++	/*
			
 
				++	 * Determine the scan balance between anon and file LRUs.
			
 
				++	 */
			
 
				++	spin_lock_irq(&target_lruvec->lru_lock);
			
 
				++	sc->anon_cost = target_lruvec->anon_cost;
			
 
				++	sc->file_cost = target_lruvec->file_cost;
			
 
				++	spin_unlock_irq(&target_lruvec->lru_lock);
			
 
				++
			
 
				++	/*
			
 
				++	 * Target desirable inactive:active list ratios for the anon
			
 
				++	 * and file LRU lists.
			
 
				++	 */
			
 
				++	if (!sc->force_deactivate) {
			
 
				++		unsigned long refaults;
			
 
				++
			
 
				++		refaults = lruvec_page_state(target_lruvec,
			
 
				++				WORKINGSET_ACTIVATE_ANON);
			
 
				++		if (refaults != target_lruvec->refaults[0] ||
			
 
				++			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			
 
				++			sc->may_deactivate |= DEACTIVATE_ANON;
			
 
				++		else
			
 
				++			sc->may_deactivate &= ~DEACTIVATE_ANON;
			
 
				++
			
 
				++		/*
			
 
				++		 * When refaults are being observed, it means a new
			
 
				++		 * workingset is being established. Deactivate to get
			
 
				++		 * rid of any stale active pages quickly.
			
 
				++		 */
			
 
				++		refaults = lruvec_page_state(target_lruvec,
			
 
				++				WORKINGSET_ACTIVATE_FILE);
			
 
				++		if (refaults != target_lruvec->refaults[1] ||
			
 
				++		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			
 
				++			sc->may_deactivate |= DEACTIVATE_FILE;
			
 
				++		else
			
 
				++			sc->may_deactivate &= ~DEACTIVATE_FILE;
			
 
				++	} else
			
 
				++		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
			
 
				++
			
 
				++	/*
			
 
				++	 * If we have plenty of inactive file pages that aren't
			
 
				++	 * thrashing, try to reclaim those first before touching
			
 
				++	 * anonymous pages.
			
 
				++	 */
			
 
				++	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
			
 
				++	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			
 
				++		sc->cache_trim_mode = 1;
			
 
				++	else
			
 
				++		sc->cache_trim_mode = 0;
			
 
				++
			
 
				++	/*
			
 
				++	 * Prevent the reclaimer from falling into the cache trap: as
			
 
				++	 * cache pages start out inactive, every cache fault will tip
			
 
				++	 * the scan balance towards the file LRU.  And as the file LRU
			
 
				++	 * shrinks, so does the window for rotation from references.
			
 
				++	 * This means we have a runaway feedback loop where a tiny
			
 
				++	 * thrashing file LRU becomes infinitely more attractive than
			
 
				++	 * anon pages.  Try to detect this based on file LRU size.
			
 
				++	 */
			
 
				++	if (!cgroup_reclaim(sc)) {
			
 
				++		unsigned long total_high_wmark = 0;
			
 
				++		unsigned long free, anon;
			
 
				++		int z;
			
 
				++
			
 
				++		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			
 
				++		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			
 
				++			   node_page_state(pgdat, NR_INACTIVE_FILE);
			
 
				++
			
 
				++		for (z = 0; z < MAX_NR_ZONES; z++) {
			
 
				++			struct zone *zone = &pgdat->node_zones[z];
			
 
				++
			
 
				++			if (!managed_zone(zone))
			
 
				++				continue;
			
 
				++
			
 
				++			total_high_wmark += high_wmark_pages(zone);
			
 
				++		}
			
 
				++
			
 
				++		/*
			
 
				++		 * Consider anon: if that's low too, this isn't a
			
 
				++		 * runaway file reclaim problem, but rather just
			
 
				++		 * extreme pressure. Reclaim as per usual then.
			
 
				++		 */
			
 
				++		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
			
 
				++
			
 
				++		sc->file_is_tiny =
			
 
				++			file + free <= total_high_wmark &&
			
 
				++			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			
 
				++			anon >> sc->priority;
			
 
				++	}
			
 
				++}
			
 
				++
			
 
				+ /*
			
 
				+  * Determine how aggressively the anon and file LRU lists should be
			
 
				+  * scanned.  The relative value of each set of LRU lists is determined
			
 
				+@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat
			
 
				+ 	unsigned long nr_reclaimed, nr_scanned;
			
 
				+ 	struct lruvec *target_lruvec;
			
 
				+ 	bool reclaimable = false;
			
 
				+-	unsigned long file;
			
 
				+ 
			
 
				+ 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
			
 
				+ 
			
 
				+@@ -3048,93 +3144,7 @@ again:
			
 
				+ 	nr_reclaimed = sc->nr_reclaimed;
			
 
				+ 	nr_scanned = sc->nr_scanned;
			
 
				+ 
			
 
				+-	/*
			
 
				+-	 * Determine the scan balance between anon and file LRUs.
			
 
				+-	 */
			
 
				+-	spin_lock_irq(&target_lruvec->lru_lock);
			
 
				+-	sc->anon_cost = target_lruvec->anon_cost;
			
 
				+-	sc->file_cost = target_lruvec->file_cost;
			
 
				+-	spin_unlock_irq(&target_lruvec->lru_lock);
			
 
				+-
			
 
				+-	/*
			
 
				+-	 * Target desirable inactive:active list ratios for the anon
			
 
				+-	 * and file LRU lists.
			
 
				+-	 */
			
 
				+-	if (!sc->force_deactivate) {
			
 
				+-		unsigned long refaults;
			
 
				+-
			
 
				+-		refaults = lruvec_page_state(target_lruvec,
			
 
				+-				WORKINGSET_ACTIVATE_ANON);
			
 
				+-		if (refaults != target_lruvec->refaults[0] ||
			
 
				+-			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			
 
				+-			sc->may_deactivate |= DEACTIVATE_ANON;
			
 
				+-		else
			
 
				+-			sc->may_deactivate &= ~DEACTIVATE_ANON;
			
 
				+-
			
 
				+-		/*
			
 
				+-		 * When refaults are being observed, it means a new
			
 
				+-		 * workingset is being established. Deactivate to get
			
 
				+-		 * rid of any stale active pages quickly.
			
 
				+-		 */
			
 
				+-		refaults = lruvec_page_state(target_lruvec,
			
 
				+-				WORKINGSET_ACTIVATE_FILE);
			
 
				+-		if (refaults != target_lruvec->refaults[1] ||
			
 
				+-		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			
 
				+-			sc->may_deactivate |= DEACTIVATE_FILE;
			
 
				+-		else
			
 
				+-			sc->may_deactivate &= ~DEACTIVATE_FILE;
			
 
				+-	} else
			
 
				+-		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
			
 
				+-
			
 
				+-	/*
			
 
				+-	 * If we have plenty of inactive file pages that aren't
			
 
				+-	 * thrashing, try to reclaim those first before touching
			
 
				+-	 * anonymous pages.
			
 
				+-	 */
			
 
				+-	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
			
 
				+-	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			
 
				+-		sc->cache_trim_mode = 1;
			
 
				+-	else
			
 
				+-		sc->cache_trim_mode = 0;
			
 
				+-
			
 
				+-	/*
			
 
				+-	 * Prevent the reclaimer from falling into the cache trap: as
			
 
				+-	 * cache pages start out inactive, every cache fault will tip
			
 
				+-	 * the scan balance towards the file LRU.  And as the file LRU
			
 
				+-	 * shrinks, so does the window for rotation from references.
			
 
				+-	 * This means we have a runaway feedback loop where a tiny
			
 
				+-	 * thrashing file LRU becomes infinitely more attractive than
			
 
				+-	 * anon pages.  Try to detect this based on file LRU size.
			
 
				+-	 */
			
 
				+-	if (!cgroup_reclaim(sc)) {
			
 
				+-		unsigned long total_high_wmark = 0;
			
 
				+-		unsigned long free, anon;
			
 
				+-		int z;
			
 
				+-
			
 
				+-		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			
 
				+-		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			
 
				+-			   node_page_state(pgdat, NR_INACTIVE_FILE);
			
 
				+-
			
 
				+-		for (z = 0; z < MAX_NR_ZONES; z++) {
			
 
				+-			struct zone *zone = &pgdat->node_zones[z];
			
 
				+-			if (!managed_zone(zone))
			
 
				+-				continue;
			
 
				+-
			
 
				+-			total_high_wmark += high_wmark_pages(zone);
			
 
				+-		}
			
 
				+-
			
 
				+-		/*
			
 
				+-		 * Consider anon: if that's low too, this isn't a
			
 
				+-		 * runaway file reclaim problem, but rather just
			
 
				+-		 * extreme pressure. Reclaim as per usual then.
			
 
				+-		 */
			
 
				+-		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
			
 
				+-
			
 
				+-		sc->file_is_tiny =
			
 
				+-			file + free <= total_high_wmark &&
			
 
				+-			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			
 
				+-			anon >> sc->priority;
			
 
				+-	}
			
 
				++	prepare_scan_count(pgdat, sc);
			
 
				+ 
			
 
				+ 	shrink_node_memcgs(pgdat, sc);
			
 
				+ 
			
--- a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch
+++ b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch
@@ -0,0 +1,996 @@
 
				+From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Mon, 25 Jan 2021 21:12:33 -0700
			
 
				+Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
			
 
				+
			
 
				+For each lruvec, evictable pages are divided into multiple
			
 
				+generations. The youngest generation number is stored in
			
 
				+lrugen->max_seq for both anon and file types as they are aged on an
			
 
				+equal footing. The oldest generation numbers are stored in
			
 
				+lrugen->min_seq[] separately for anon and file types as clean file
			
 
				+pages can be evicted regardless of swap constraints. These three
			
 
				+variables are monotonically increasing. Generation numbers are
			
 
				+truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
			
 
				+page->flags. The sliding window technique is used to prevent truncated
			
 
				+generation numbers from overlapping. Each truncated generation number
			
 
				+is an index to
			
 
				+lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
			
 
				+
			
 
				+The framework comprises two conceptually independent components: the
			
 
				+aging, which produces young generations, and the eviction, which
			
 
				+consumes old generations. Both can be invoked independently from user
			
 
				+space for the purpose of working set estimation and proactive reclaim.
			
 
				+
			
 
				+The protection of hot pages and the selection of cold pages are based
			
 
				+on page access types and patterns. There are two access types: one via
			
 
				+page tables and the other via file descriptors. The protection of the
			
 
				+former type is by design stronger because:
			
 
				+  1) The uncertainty in determining the access patterns of the former
			
 
				+  type is higher due to the coalesced nature of the accessed bit.
			
 
				+  2) The cost of evicting the former type is higher due to the TLB
			
 
				+  flushes required and the likelihood of involving I/O.
			
 
				+  3) The penalty of under-protecting the former type is higher because
			
 
				+  applications usually do not prepare themselves for major faults like
			
 
				+  they do for blocked I/O. For example, client applications commonly
			
 
				+  dedicate blocked I/O to separate threads to avoid UI janks that
			
 
				+  negatively affect user experience.
			
 
				+
			
 
				+There are also two access patterns: one with temporal locality and the
			
 
				+other without. The latter pattern, e.g., random and sequential, needs
			
 
				+to be explicitly excluded to avoid weakening the protection of the
			
 
				+former pattern. Generally the former type follows the former pattern
			
 
				+unless MADV_SEQUENTIAL is specified and the latter type follows the
			
 
				+latter pattern unless outlying refaults have been observed.
			
 
				+
			
 
				+Upon faulting, a page is added to the youngest generation, which
			
 
				+provides the strongest protection as the eviction will not consider
			
 
				+this page before the aging has scanned it at least twice. The first
			
 
				+scan clears the accessed bit set during the initial fault. And the
			
 
				+second scan makes sure this page has not been used since the first
			
 
				+scan. A page from any other generation is brought back to the
			
 
				+youngest generation whenever the aging finds the accessed bit set on
			
 
				+any of the PTEs mapping this page.
			
 
				+
			
 
				+Unmapped pages are initially added to the oldest generation and then
			
 
				+conditionally protected by tiers. This is done later [PATCH 07/10].
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
			
 
				+---
			
 
				+ fs/fuse/dev.c                     |   3 +-
			
 
				+ include/linux/cgroup.h            |  15 +-
			
 
				+ include/linux/mm.h                |  36 ++++
			
 
				+ include/linux/mm_inline.h         | 182 ++++++++++++++++++++
			
 
				+ include/linux/mmzone.h            |  70 ++++++++
			
 
				+ include/linux/page-flags-layout.h |  19 ++-
			
 
				+ include/linux/page-flags.h        |   4 +-
			
 
				+ include/linux/sched.h             |   3 +
			
 
				+ kernel/bounds.c                   |   3 +
			
 
				+ kernel/cgroup/cgroup-internal.h   |   1 -
			
 
				+ mm/huge_memory.c                  |   3 +-
			
 
				+ mm/memcontrol.c                   |   1 +
			
 
				+ mm/memory.c                       |   7 +
			
 
				+ mm/mm_init.c                      |   6 +-
			
 
				+ mm/page_alloc.c                   |   1 +
			
 
				+ mm/swap.c                         |   9 +-
			
 
				+ mm/swapfile.c                     |   2 +
			
 
				+ mm/vmscan.c                       | 268 ++++++++++++++++++++++++++++++
			
 
				+ 18 files changed, 618 insertions(+), 15 deletions(-)
			
 
				+
			
 
				+--- a/fs/fuse/dev.c
			
 
				++++ b/fs/fuse/dev.c
			
 
				+@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
			
 
				+ 	       1 << PG_active |
			
 
				+ 	       1 << PG_workingset |
			
 
				+ 	       1 << PG_reclaim |
			
 
				+-	       1 << PG_waiters))) {
			
 
				++	       1 << PG_waiters |
			
 
				++	       LRU_GEN_MASK | LRU_REFS_MASK))) {
			
 
				+ 		dump_page(page, "fuse: trying to steal weird page");
			
 
				+ 		return 1;
			
 
				+ 	}
			
 
				+--- a/include/linux/cgroup.h
			
 
				++++ b/include/linux/cgroup.h
			
 
				+@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
			
 
				+ 	css_put(&cgrp->self);
			
 
				+ }
			
 
				+ 
			
 
				++extern struct mutex cgroup_mutex;
			
 
				++
			
 
				++static inline void cgroup_lock(void)
			
 
				++{
			
 
				++	mutex_lock(&cgroup_mutex);
			
 
				++}
			
 
				++
			
 
				++static inline void cgroup_unlock(void)
			
 
				++{
			
 
				++	mutex_unlock(&cgroup_mutex);
			
 
				++}
			
 
				++
			
 
				+ /**
			
 
				+  * task_css_set_check - obtain a task's css_set with extra access conditions
			
 
				+  * @task: the task to obtain css_set for
			
 
				+@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
			
 
				+  * as locks used during the cgroup_subsys::attach() methods.
			
 
				+  */
			
 
				+ #ifdef CONFIG_PROVE_RCU
			
 
				+-extern struct mutex cgroup_mutex;
			
 
				+ extern spinlock_t css_set_lock;
			
 
				+ #define task_css_set_check(task, __c)					\
			
 
				+ 	rcu_dereference_check((task)->cgroups,				\
			
 
				+@@ -707,6 +718,8 @@ struct cgroup;
			
 
				+ static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
			
 
				+ static inline void css_get(struct cgroup_subsys_state *css) {}
			
 
				+ static inline void css_put(struct cgroup_subsys_state *css) {}
			
 
				++static inline void cgroup_lock(void) {}
			
 
				++static inline void cgroup_unlock(void) {}
			
 
				+ static inline int cgroup_attach_task_all(struct task_struct *from,
			
 
				+ 					 struct task_struct *t) { return 0; }
			
 
				+ static inline int cgroupstats_build(struct cgroupstats *stats,
			
 
				+--- a/include/linux/mm.h
			
 
				++++ b/include/linux/mm.h
			
 
				+@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
			
 
				+ #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
			
 
				+ #define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
			
 
				+ #define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
			
 
				++#define LRU_GEN_PGOFF		(KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
			
 
				++#define LRU_REFS_PGOFF		(LRU_GEN_PGOFF - LRU_REFS_WIDTH)
			
 
				+ 
			
 
				+ /*
			
 
				+  * Define the bit shifts to access each section.  For non-existent
			
 
				+@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
			
 
				+ 		loff_t const holebegin, loff_t const holelen, int even_cows) { }
			
 
				+ #endif
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++static inline void task_enter_nonseq_fault(void)
			
 
				++{
			
 
				++	WARN_ON(current->in_nonseq_fault);
			
 
				++
			
 
				++	current->in_nonseq_fault = 1;
			
 
				++}
			
 
				++
			
 
				++static inline void task_exit_nonseq_fault(void)
			
 
				++{
			
 
				++	WARN_ON(!current->in_nonseq_fault);
			
 
				++
			
 
				++	current->in_nonseq_fault = 0;
			
 
				++}
			
 
				++
			
 
				++static inline bool task_in_nonseq_fault(void)
			
 
				++{
			
 
				++	return current->in_nonseq_fault;
			
 
				++}
			
 
				++#else
			
 
				++static inline void task_enter_nonseq_fault(void)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++static inline void task_exit_nonseq_fault(void)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++static inline bool task_in_nonseq_fault(void)
			
 
				++{
			
 
				++	return false;
			
 
				++}
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ static inline void unmap_shared_mapping_range(struct address_space *mapping,
			
 
				+ 		loff_t const holebegin, loff_t const holelen)
			
 
				+ {
			
 
				+--- a/include/linux/mm_inline.h
			
 
				++++ b/include/linux/mm_inline.h
			
 
				+@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
			
 
				+ 	return lru;
			
 
				+ }
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++
			
 
				++static inline bool lru_gen_enabled(void)
			
 
				++{
			
 
				++#ifdef CONFIG_LRU_GEN_ENABLED
			
 
				++	DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
			
 
				++
			
 
				++	return static_branch_likely(&lru_gen_static_key);
			
 
				++#else
			
 
				++	DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
			
 
				++
			
 
				++	return static_branch_unlikely(&lru_gen_static_key);
			
 
				++#endif
			
 
				++}
			
 
				++
			
 
				++/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
			
 
				++static inline int lru_gen_from_seq(unsigned long seq)
			
 
				++{
			
 
				++	return seq % MAX_NR_GENS;
			
 
				++}
			
 
				++
			
 
				++/* The youngest and the second youngest generations are counted as active. */
			
 
				++static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
			
 
				++{
			
 
				++	unsigned long max_seq = lruvec->evictable.max_seq;
			
 
				++
			
 
				++	VM_BUG_ON(gen >= MAX_NR_GENS);
			
 
				++
			
 
				++	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
			
 
				++}
			
 
				++
			
 
				++/* Update the sizes of the multigenerational lru lists. */
			
 
				++static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
			
 
				++				       int old_gen, int new_gen)
			
 
				++{
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int zone = page_zonenum(page);
			
 
				++	int delta = thp_nr_pages(page);
			
 
				++	enum lru_list lru = type * LRU_FILE;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	lockdep_assert_held(&lruvec->lru_lock);
			
 
				++	VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
			
 
				++	VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
			
 
				++	VM_BUG_ON(old_gen == -1 && new_gen == -1);
			
 
				++
			
 
				++	if (old_gen >= 0)
			
 
				++		WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
			
 
				++			   lrugen->sizes[old_gen][type][zone] - delta);
			
 
				++	if (new_gen >= 0)
			
 
				++		WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
			
 
				++			   lrugen->sizes[new_gen][type][zone] + delta);
			
 
				++
			
 
				++	if (old_gen < 0) {
			
 
				++		if (lru_gen_is_active(lruvec, new_gen))
			
 
				++			lru += LRU_ACTIVE;
			
 
				++		update_lru_size(lruvec, lru, zone, delta);
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				++	if (new_gen < 0) {
			
 
				++		if (lru_gen_is_active(lruvec, old_gen))
			
 
				++			lru += LRU_ACTIVE;
			
 
				++		update_lru_size(lruvec, lru, zone, -delta);
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				++	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
			
 
				++		update_lru_size(lruvec, lru, zone, -delta);
			
 
				++		update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
			
 
				++	}
			
 
				++
			
 
				++	VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
			
 
				++}
			
 
				++
			
 
				++/* Add a page to one of the multigenerational lru lists. Return true on success. */
			
 
				++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
			
 
				++{
			
 
				++	int gen;
			
 
				++	unsigned long old_flags, new_flags;
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int zone = page_zonenum(page);
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	if (PageUnevictable(page) || !lrugen->enabled[type])
			
 
				++		return false;
			
 
				++	/*
			
 
				++	 * If a page shouldn't be considered for eviction, i.e., a page mapped
			
 
				++	 * upon fault during which the accessed bit is set, add it to the
			
 
				++	 * youngest generation.
			
 
				++	 *
			
 
				++	 * If a page can't be evicted immediately, i.e., an anon page not in
			
 
				++	 * swap cache or a dirty page pending writeback, add it to the second
			
 
				++	 * oldest generation.
			
 
				++	 *
			
 
				++	 * If a page could be evicted immediately, e.g., a clean page, add it to
			
 
				++	 * the oldest generation.
			
 
				++	 */
			
 
				++	if (PageActive(page))
			
 
				++		gen = lru_gen_from_seq(lrugen->max_seq);
			
 
				++	else if ((!type && !PageSwapCache(page)) ||
			
 
				++		 (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
			
 
				++		gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
			
 
				++	else
			
 
				++		gen = lru_gen_from_seq(lrugen->min_seq[type]);
			
 
				++
			
 
				++	do {
			
 
				++		new_flags = old_flags = READ_ONCE(page->flags);
			
 
				++		VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
			
 
				++
			
 
				++		new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
			
 
				++		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
			
 
				++	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
			
 
				++
			
 
				++	lru_gen_update_size(page, lruvec, -1, gen);
			
 
				++	/* for rotate_reclaimable_page() */
			
 
				++	if (reclaiming)
			
 
				++		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
			
 
				++	else
			
 
				++		list_add(&page->lru, &lrugen->lists[gen][type][zone]);
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++/* Delete a page from one of the multigenerational lru lists. Return true on success. */
			
 
				++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
			
 
				++{
			
 
				++	int gen;
			
 
				++	unsigned long old_flags, new_flags;
			
 
				++
			
 
				++	do {
			
 
				++		new_flags = old_flags = READ_ONCE(page->flags);
			
 
				++		if (!(new_flags & LRU_GEN_MASK))
			
 
				++			return false;
			
 
				++
			
 
				++		VM_BUG_ON_PAGE(PageActive(page), page);
			
 
				++		VM_BUG_ON_PAGE(PageUnevictable(page), page);
			
 
				++
			
 
				++		gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				++
			
 
				++		new_flags &= ~LRU_GEN_MASK;
			
 
				++		/* for shrink_page_list() */
			
 
				++		if (reclaiming)
			
 
				++			new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
			
 
				++		else if (lru_gen_is_active(lruvec, gen))
			
 
				++			new_flags |= BIT(PG_active);
			
 
				++	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
			
 
				++
			
 
				++	lru_gen_update_size(page, lruvec, gen, -1);
			
 
				++	list_del(&page->lru);
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++#else
			
 
				++
			
 
				++static inline bool lru_gen_enabled(void)
			
 
				++{
			
 
				++	return false;
			
 
				++}
			
 
				++
			
 
				++static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
			
 
				++{
			
 
				++	return false;
			
 
				++}
			
 
				++
			
 
				++static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
			
 
				++{
			
 
				++	return false;
			
 
				++}
			
 
				++
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ static __always_inline void add_page_to_lru_list(struct page *page,
			
 
				+ 				struct lruvec *lruvec)
			
 
				+ {
			
 
				+ 	enum lru_list lru = page_lru(page);
			
 
				+ 
			
 
				++	if (lru_gen_add_page(page, lruvec, false))
			
 
				++		return;
			
 
				++
			
 
				+ 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
			
 
				+ 	list_add(&page->lru, &lruvec->lists[lru]);
			
 
				+ }
			
 
				+@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
			
 
				+ {
			
 
				+ 	enum lru_list lru = page_lru(page);
			
 
				+ 
			
 
				++	if (lru_gen_add_page(page, lruvec, true))
			
 
				++		return;
			
 
				++
			
 
				+ 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
			
 
				+ 	list_add_tail(&page->lru, &lruvec->lists[lru]);
			
 
				+ }
			
 
				+@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
			
 
				+ static __always_inline void del_page_from_lru_list(struct page *page,
			
 
				+ 				struct lruvec *lruvec)
			
 
				+ {
			
 
				++	if (lru_gen_del_page(page, lruvec, false))
			
 
				++		return;
			
 
				++
			
 
				+ 	list_del(&page->lru);
			
 
				+ 	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
			
 
				+ 			-thp_nr_pages(page));
			
 
				+--- a/include/linux/mmzone.h
			
 
				++++ b/include/linux/mmzone.h
			
 
				+@@ -294,6 +294,72 @@ enum lruvec_flags {
			
 
				+ 					 */
			
 
				+ };
			
 
				+ 
			
 
				++struct lruvec;
			
 
				++
			
 
				++#define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
			
 
				++#define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
			
 
				++
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++
			
 
				++/*
			
 
				++ * For each lruvec, evictable pages are divided into multiple generations. The
			
 
				++ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
			
 
				++ * monotonically increasing. The sliding window technique is used to track at
			
 
				++ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
			
 
				++ * window, AKA gen, indexes an array of per-type and per-zone lists for the
			
 
				++ * corresponding generation. The counter in page->flags stores gen+1 while a
			
 
				++ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
			
 
				++ *
			
 
				++ * After a page is faulted in, the aging must check the accessed bit at least
			
 
				++ * twice before the eviction would consider it. The first check clears the
			
 
				++ * accessed bit set during the initial fault. The second check makes sure this
			
 
				++ * page hasn't been used since then.
			
 
				++ */
			
 
				++#define MIN_NR_GENS		2
			
 
				++#define MAX_NR_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
			
 
				++
			
 
				++struct lrugen {
			
 
				++	/* the aging increments the max generation number */
			
 
				++	unsigned long max_seq;
			
 
				++	/* the eviction increments the min generation numbers */
			
 
				++	unsigned long min_seq[ANON_AND_FILE];
			
 
				++	/* the birth time of each generation in jiffies */
			
 
				++	unsigned long timestamps[MAX_NR_GENS];
			
 
				++	/* the multigenerational lru lists */
			
 
				++	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
			
 
				++	/* the sizes of the multigenerational lru lists in pages */
			
 
				++	unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
			
 
				++	/* whether the multigenerational lru is enabled */
			
 
				++	bool enabled[ANON_AND_FILE];
			
 
				++};
			
 
				++
			
 
				++#define MAX_BATCH_SIZE		8192
			
 
				++
			
 
				++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
			
 
				++void lru_gen_change_state(bool enable, bool main, bool swap);
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++void lru_gen_init_memcg(struct mem_cgroup *memcg);
			
 
				++#endif
			
 
				++
			
 
				++#else /* !CONFIG_LRU_GEN */
			
 
				++
			
 
				++static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++static inline void lru_gen_change_state(bool enable, bool main, bool swap)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
			
 
				++{
			
 
				++}
			
 
				++#endif
			
 
				++
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ struct lruvec {
			
 
				+ 	struct list_head		lists[NR_LRU_LISTS];
			
 
				+ 	/* per lruvec lru_lock for memcg */
			
 
				+@@ -311,6 +377,10 @@ struct lruvec {
			
 
				+ 	unsigned long			refaults[ANON_AND_FILE];
			
 
				+ 	/* Various lruvec state flags (enum lruvec_flags) */
			
 
				+ 	unsigned long			flags;
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	/* unevictable pages are on LRU_UNEVICTABLE */
			
 
				++	struct lrugen			evictable;
			
 
				++#endif
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+ 	struct pglist_data *pgdat;
			
 
				+ #endif
			
 
				+--- a/include/linux/page-flags-layout.h
			
 
				++++ b/include/linux/page-flags-layout.h
			
 
				+@@ -26,6 +26,14 @@
			
 
				+ 
			
 
				+ #define ZONES_WIDTH		ZONES_SHIFT
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
			
 
				++#define LRU_REFS_WIDTH		(CONFIG_TIERS_PER_GEN - 2)
			
 
				++#else
			
 
				++#define LRU_GEN_WIDTH		0
			
 
				++#define LRU_REFS_WIDTH		0
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ #ifdef CONFIG_SPARSEMEM
			
 
				+ #include <asm/sparsemem.h>
			
 
				+ #define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
			
 
				+@@ -55,7 +63,8 @@
			
 
				+ #define SECTIONS_WIDTH		0
			
 
				+ #endif
			
 
				+ 
			
 
				+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
			
 
				++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
			
 
				++	<= BITS_PER_LONG - NR_PAGEFLAGS
			
 
				+ #define NODES_WIDTH		NODES_SHIFT
			
 
				+ #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
			
 
				+ #error "Vmemmap: No space for nodes field in page flags"
			
 
				+@@ -89,8 +98,8 @@
			
 
				+ #define LAST_CPUPID_SHIFT 0
			
 
				+ #endif
			
 
				+ 
			
 
				+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
			
 
				+-	<= BITS_PER_LONG - NR_PAGEFLAGS
			
 
				++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
			
 
				++	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
			
 
				+ #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
			
 
				+ #else
			
 
				+ #define LAST_CPUPID_WIDTH 0
			
 
				+@@ -100,8 +109,8 @@
			
 
				+ #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
			
 
				+ #endif
			
 
				+ 
			
 
				+-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
			
 
				+-	> BITS_PER_LONG - NR_PAGEFLAGS
			
 
				++#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
			
 
				++	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
			
 
				+ #error "Not enough bits in page flags"
			
 
				+ #endif
			
 
				+ 
			
 
				+--- a/include/linux/page-flags.h
			
 
				++++ b/include/linux/page-flags.h
			
 
				+@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
			
 
				+ 	 1UL << PG_private	| 1UL << PG_private_2	|	\
			
 
				+ 	 1UL << PG_writeback	| 1UL << PG_reserved	|	\
			
 
				+ 	 1UL << PG_slab		| 1UL << PG_active 	|	\
			
 
				+-	 1UL << PG_unevictable	| __PG_MLOCKED)
			
 
				++	 1UL << PG_unevictable	| __PG_MLOCKED | LRU_GEN_MASK)
			
 
				+ 
			
 
				+ /*
			
 
				+  * Flags checked when a page is prepped for return by the page allocator.
			
 
				+@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
			
 
				+  * alloc-free cycle to prevent from reusing the page.
			
 
				+  */
			
 
				+ #define PAGE_FLAGS_CHECK_AT_PREP	\
			
 
				+-	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
			
 
				++	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
			
 
				+ 
			
 
				+ #define PAGE_FLAGS_PRIVATE				\
			
 
				+ 	(1UL << PG_private | 1UL << PG_private_2)
			
 
				+--- a/include/linux/sched.h
			
 
				++++ b/include/linux/sched.h
			
 
				+@@ -911,6 +911,9 @@ struct task_struct {
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+ 	unsigned			in_user_fault:1;
			
 
				+ #endif
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	unsigned			in_nonseq_fault:1;
			
 
				++#endif
			
 
				+ #ifdef CONFIG_COMPAT_BRK
			
 
				+ 	unsigned			brk_randomized:1;
			
 
				+ #endif
			
 
				+--- a/kernel/bounds.c
			
 
				++++ b/kernel/bounds.c
			
 
				+@@ -22,6 +22,9 @@ int main(void)
			
 
				+ 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
			
 
				+ #endif
			
 
				+ 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
			
 
				++#endif
			
 
				+ 	/* End of constants */
			
 
				+ 
			
 
				+ 	return 0;
			
 
				+--- a/kernel/cgroup/cgroup-internal.h
			
 
				++++ b/kernel/cgroup/cgroup-internal.h
			
 
				+@@ -165,7 +165,6 @@ struct cgroup_mgctx {
			
 
				+ #define DEFINE_CGROUP_MGCTX(name)						\
			
 
				+ 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
			
 
				+ 
			
 
				+-extern struct mutex cgroup_mutex;
			
 
				+ extern spinlock_t css_set_lock;
			
 
				+ extern struct cgroup_subsys *cgroup_subsys[];
			
 
				+ extern struct list_head cgroup_roots;
			
 
				+--- a/mm/huge_memory.c
			
 
				++++ b/mm/huge_memory.c
			
 
				+@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
			
 
				+ #ifdef CONFIG_64BIT
			
 
				+ 			 (1L << PG_arch_2) |
			
 
				+ #endif
			
 
				+-			 (1L << PG_dirty)));
			
 
				++			 (1L << PG_dirty) |
			
 
				++			 LRU_GEN_MASK | LRU_REFS_MASK));
			
 
				+ 
			
 
				+ 	/* ->mapping in first tail page is compound_mapcount */
			
 
				+ 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
			
 
				+--- a/mm/memcontrol.c
			
 
				++++ b/mm/memcontrol.c
			
 
				+@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all
			
 
				+ 	memcg->deferred_split_queue.split_queue_len = 0;
			
 
				+ #endif
			
 
				+ 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
			
 
				++	lru_gen_init_memcg(memcg);
			
 
				+ 	return memcg;
			
 
				+ fail:
			
 
				+ 	mem_cgroup_id_remove(memcg);
			
 
				+--- a/mm/memory.c
			
 
				++++ b/mm/memory.c
			
 
				+@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
			
 
				+ 			   unsigned int flags, struct pt_regs *regs)
			
 
				+ {
			
 
				+ 	vm_fault_t ret;
			
 
				++	bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
			
 
				+ 
			
 
				+ 	__set_current_state(TASK_RUNNING);
			
 
				+ 
			
 
				+@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
			
 
				+ 	if (flags & FAULT_FLAG_USER)
			
 
				+ 		mem_cgroup_enter_user_fault();
			
 
				+ 
			
 
				++	if (nonseq_fault)
			
 
				++		task_enter_nonseq_fault();
			
 
				++
			
 
				+ 	if (unlikely(is_vm_hugetlb_page(vma)))
			
 
				+ 		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
			
 
				+ 	else
			
 
				+ 		ret = __handle_mm_fault(vma, address, flags);
			
 
				+ 
			
 
				++	if (nonseq_fault)
			
 
				++		task_exit_nonseq_fault();
			
 
				++
			
 
				+ 	if (flags & FAULT_FLAG_USER) {
			
 
				+ 		mem_cgroup_exit_user_fault();
			
 
				+ 		/*
			
 
				+--- a/mm/mm_init.c
			
 
				++++ b/mm/mm_init.c
			
 
				+@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
			
 
				+ 
			
 
				+ 	shift = 8 * sizeof(unsigned long);
			
 
				+ 	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
			
 
				+-		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
			
 
				++		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
			
 
				+ 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
			
 
				+-		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
			
 
				++		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
			
 
				+ 		SECTIONS_WIDTH,
			
 
				+ 		NODES_WIDTH,
			
 
				+ 		ZONES_WIDTH,
			
 
				+ 		LAST_CPUPID_WIDTH,
			
 
				+ 		KASAN_TAG_WIDTH,
			
 
				++		LRU_GEN_WIDTH,
			
 
				++		LRU_REFS_WIDTH,
			
 
				+ 		NR_PAGEFLAGS);
			
 
				+ 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
			
 
				+ 		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
			
 
				+--- a/mm/page_alloc.c
			
 
				++++ b/mm/page_alloc.c
			
 
				+@@ -7411,6 +7411,7 @@ static void __meminit pgdat_init_interna
			
 
				+ 
			
 
				+ 	pgdat_page_ext_init(pgdat);
			
 
				+ 	lruvec_init(&pgdat->__lruvec);
			
 
				++	lru_gen_init_state(NULL, &pgdat->__lruvec);
			
 
				+ }
			
 
				+ 
			
 
				+ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
			
 
				+--- a/mm/swap.c
			
 
				++++ b/mm/swap.c
			
 
				+@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
			
 
				+ 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
			
 
				+ 	VM_BUG_ON_PAGE(PageLRU(page), page);
			
 
				+ 
			
 
				++	/* see the comment in lru_gen_add_page() */
			
 
				++	if (lru_gen_enabled() && !PageUnevictable(page) &&
			
 
				++	    task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
			
 
				++		SetPageActive(page);
			
 
				++
			
 
				+ 	get_page(page);
			
 
				+ 	local_lock(&lru_pvecs.lock);
			
 
				+ 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
			
 
				+@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
			
 
				+ 
			
 
				+ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
			
 
				+ {
			
 
				+-	if (PageActive(page) && !PageUnevictable(page)) {
			
 
				++	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
			
 
				+ 		int nr_pages = thp_nr_pages(page);
			
 
				+ 
			
 
				+ 		del_page_from_lru_list(page, lruvec);
			
 
				+@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
			
 
				+  */
			
 
				+ void deactivate_page(struct page *page)
			
 
				+ {
			
 
				+-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
			
 
				++	if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
			
 
				+ 		struct pagevec *pvec;
			
 
				+ 
			
 
				+ 		local_lock(&lru_pvecs.lock);
			
 
				+--- a/mm/swapfile.c
			
 
				++++ b/mm/swapfile.c
			
 
				+@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
			
 
				+ 	err = 0;
			
 
				+ 	atomic_inc(&proc_poll_event);
			
 
				+ 	wake_up_interruptible(&proc_poll_wait);
			
 
				++	lru_gen_change_state(false, false, true);
			
 
				+ 
			
 
				+ out_dput:
			
 
				+ 	filp_close(victim, NULL);
			
 
				+@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use
			
 
				+ 	mutex_unlock(&swapon_mutex);
			
 
				+ 	atomic_inc(&proc_poll_event);
			
 
				+ 	wake_up_interruptible(&proc_poll_wait);
			
 
				++	lru_gen_change_state(true, false, true);
			
 
				+ 
			
 
				+ 	error = 0;
			
 
				+ 	goto out;
			
 
				+--- a/mm/vmscan.c
			
 
				++++ b/mm/vmscan.c
			
 
				+@@ -50,6 +50,7 @@
			
 
				+ #include <linux/printk.h>
			
 
				+ #include <linux/dax.h>
			
 
				+ #include <linux/psi.h>
			
 
				++#include <linux/memory.h>
			
 
				+ 
			
 
				+ #include <asm/tlbflush.h>
			
 
				+ #include <asm/div64.h>
			
 
				+@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg
			
 
				+ 	return can_demote(pgdat->node_id, sc);
			
 
				+ }
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++
			
 
				++/******************************************************************************
			
 
				++ *                          shorthand helpers
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++#define for_each_gen_type_zone(gen, type, zone)				\
			
 
				++	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
			
 
				++		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
			
 
				++			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
			
 
				++
			
 
				++static int page_lru_gen(struct page *page)
			
 
				++{
			
 
				++	unsigned long flags = READ_ONCE(page->flags);
			
 
				++
			
 
				++	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				++}
			
 
				++
			
 
				++static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
			
 
				++{
			
 
				++	struct pglist_data *pgdat = NODE_DATA(nid);
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++	if (memcg) {
			
 
				++		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
			
 
				++
			
 
				++		if (lruvec->pgdat != pgdat)
			
 
				++			lruvec->pgdat = pgdat;
			
 
				++
			
 
				++		return lruvec;
			
 
				++	}
			
 
				++#endif
			
 
				++	return pgdat ? &pgdat->__lruvec : NULL;
			
 
				++}
			
 
				++
			
 
				++static int get_nr_gens(struct lruvec *lruvec, int type)
			
 
				++{
			
 
				++	return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
			
 
				++}
			
 
				++
			
 
				++static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
			
 
				++{
			
 
				++	return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
			
 
				++	       get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
			
 
				++	       get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
			
 
				++}
			
 
				++
			
 
				++/******************************************************************************
			
 
				++ *                          state change
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++#ifdef CONFIG_LRU_GEN_ENABLED
			
 
				++DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
			
 
				++#else
			
 
				++DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
			
 
				++#endif
			
 
				++
			
 
				++static int lru_gen_nr_swapfiles;
			
 
				++
			
 
				++static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
			
 
				++{
			
 
				++	int gen, type, zone;
			
 
				++	enum lru_list lru;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	for_each_evictable_lru(lru) {
			
 
				++		type = is_file_lru(lru);
			
 
				++
			
 
				++		if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
			
 
				++			return false;
			
 
				++	}
			
 
				++
			
 
				++	for_each_gen_type_zone(gen, type, zone) {
			
 
				++		if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
			
 
				++			return false;
			
 
				++
			
 
				++		/* unlikely but not a bug when reset_batch_size() is pending */
			
 
				++		VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
			
 
				++	}
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++static bool fill_lists(struct lruvec *lruvec)
			
 
				++{
			
 
				++	enum lru_list lru;
			
 
				++	int remaining = MAX_BATCH_SIZE;
			
 
				++
			
 
				++	for_each_evictable_lru(lru) {
			
 
				++		int type = is_file_lru(lru);
			
 
				++		bool active = is_active_lru(lru);
			
 
				++		struct list_head *head = &lruvec->lists[lru];
			
 
				++
			
 
				++		if (!lruvec->evictable.enabled[type])
			
 
				++			continue;
			
 
				++
			
 
				++		while (!list_empty(head)) {
			
 
				++			bool success;
			
 
				++			struct page *page = lru_to_page(head);
			
 
				++
			
 
				++			VM_BUG_ON_PAGE(PageTail(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageUnevictable(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageActive(page) != active, page);
			
 
				++			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
			
 
				++			VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
			
 
				++
			
 
				++			prefetchw_prev_lru_page(page, head, flags);
			
 
				++
			
 
				++			del_page_from_lru_list(page, lruvec);
			
 
				++			success = lru_gen_add_page(page, lruvec, false);
			
 
				++			VM_BUG_ON(!success);
			
 
				++
			
 
				++			if (!--remaining)
			
 
				++				return false;
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++static bool drain_lists(struct lruvec *lruvec)
			
 
				++{
			
 
				++	int gen, type, zone;
			
 
				++	int remaining = MAX_BATCH_SIZE;
			
 
				++
			
 
				++	for_each_gen_type_zone(gen, type, zone) {
			
 
				++		struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
			
 
				++
			
 
				++		if (lruvec->evictable.enabled[type])
			
 
				++			continue;
			
 
				++
			
 
				++		while (!list_empty(head)) {
			
 
				++			bool success;
			
 
				++			struct page *page = lru_to_page(head);
			
 
				++
			
 
				++			VM_BUG_ON_PAGE(PageTail(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageUnevictable(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageActive(page), page);
			
 
				++			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
			
 
				++			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
			
 
				++
			
 
				++			prefetchw_prev_lru_page(page, head, flags);
			
 
				++
			
 
				++			success = lru_gen_del_page(page, lruvec, false);
			
 
				++			VM_BUG_ON(!success);
			
 
				++			add_page_to_lru_list(page, lruvec);
			
 
				++
			
 
				++			if (!--remaining)
			
 
				++				return false;
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++/*
			
 
				++ * For file page tracking, we enable/disable it according to the main switch.
			
 
				++ * For anon page tracking, we only enable it when the main switch is on and
			
 
				++ * there is at least one swapfile; we disable it when there are no swapfiles
			
 
				++ * regardless of the value of the main switch. Otherwise, we will eventually
			
 
				++ * reach the max size of the sliding window and have to call inc_min_seq().
			
 
				++ */
			
 
				++void lru_gen_change_state(bool enable, bool main, bool swap)
			
 
				++{
			
 
				++	static DEFINE_MUTEX(state_mutex);
			
 
				++
			
 
				++	struct mem_cgroup *memcg;
			
 
				++
			
 
				++	mem_hotplug_begin();
			
 
				++	cgroup_lock();
			
 
				++	mutex_lock(&state_mutex);
			
 
				++
			
 
				++	if (swap) {
			
 
				++		if (enable)
			
 
				++			swap = !lru_gen_nr_swapfiles++;
			
 
				++		else
			
 
				++			swap = !--lru_gen_nr_swapfiles;
			
 
				++	}
			
 
				++
			
 
				++	if (main && enable != lru_gen_enabled()) {
			
 
				++		if (enable)
			
 
				++			static_branch_enable(&lru_gen_static_key);
			
 
				++		else
			
 
				++			static_branch_disable(&lru_gen_static_key);
			
 
				++	} else if (!swap || !lru_gen_enabled())
			
 
				++		goto unlock;
			
 
				++
			
 
				++	memcg = mem_cgroup_iter(NULL, NULL, NULL);
			
 
				++	do {
			
 
				++		int nid;
			
 
				++
			
 
				++		for_each_node(nid) {
			
 
				++			struct lruvec *lruvec = get_lruvec(nid, memcg);
			
 
				++
			
 
				++			if (!lruvec)
			
 
				++				continue;
			
 
				++
			
 
				++			spin_lock_irq(&lruvec->lru_lock);
			
 
				++
			
 
				++			VM_BUG_ON(!seq_is_valid(lruvec));
			
 
				++			VM_BUG_ON(!state_is_valid(lruvec));
			
 
				++
			
 
				++			lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
			
 
				++			lruvec->evictable.enabled[1] = lru_gen_enabled();
			
 
				++
			
 
				++			while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
			
 
				++				spin_unlock_irq(&lruvec->lru_lock);
			
 
				++				cond_resched();
			
 
				++				spin_lock_irq(&lruvec->lru_lock);
			
 
				++			}
			
 
				++
			
 
				++			spin_unlock_irq(&lruvec->lru_lock);
			
 
				++		}
			
 
				++
			
 
				++		cond_resched();
			
 
				++	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
			
 
				++unlock:
			
 
				++	mutex_unlock(&state_mutex);
			
 
				++	cgroup_unlock();
			
 
				++	mem_hotplug_done();
			
 
				++}
			
 
				++
			
 
				++/******************************************************************************
			
 
				++ *                          initialization
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
			
 
				++{
			
 
				++	int i;
			
 
				++	int gen, type, zone;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	lrugen->max_seq = MIN_NR_GENS + 1;
			
 
				++	lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
			
 
				++	lrugen->enabled[1] = lru_gen_enabled();
			
 
				++
			
 
				++	for (i = 0; i <= MIN_NR_GENS + 1; i++)
			
 
				++		lrugen->timestamps[i] = jiffies;
			
 
				++
			
 
				++	for_each_gen_type_zone(gen, type, zone)
			
 
				++		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
			
 
				++}
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++void lru_gen_init_memcg(struct mem_cgroup *memcg)
			
 
				++{
			
 
				++	int nid;
			
 
				++
			
 
				++	for_each_node(nid) {
			
 
				++		struct lruvec *lruvec = get_lruvec(nid, memcg);
			
 
				++
			
 
				++		lru_gen_init_state(memcg, lruvec);
			
 
				++	}
			
 
				++}
			
 
				++#endif
			
 
				++
			
 
				++static int __init init_lru_gen(void)
			
 
				++{
			
 
				++	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
			
 
				++	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
			
 
				++
			
 
				++	return 0;
			
 
				++};
			
 
				++late_initcall(init_lru_gen);
			
 
				++
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
			
 
				+ {
			
 
				+ 	unsigned long nr[NR_LRU_LISTS];
			
--- a/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch
+++ b/target/linux/generic/pending-5.15/020-04-mm-multigenerational-lru-mm_struct-list.patch
@@ -0,0 +1,760 @@
 
				+From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Mon, 5 Apr 2021 04:17:41 -0600
			
 
				+Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list
			
 
				+
			
 
				+To scan PTEs for accessed pages, a mm_struct list is maintained for
			
 
				+each memcg. When multiple threads traverse the same memcg->mm_list,
			
 
				+each of them gets a unique mm_struct and therefore they can run
			
 
				+walk_page_range() concurrently to reach page tables of all processes
			
 
				+of this memcg.
			
 
				+
			
 
				+This infrastructure also provides the following optimizations:
			
 
				+  1) it allows walkers to skip processes that have been sleeping since
			
 
				+  the last walk by tracking the usage of mm_struct between context
			
 
				+  switches.
			
 
				+  2) it allows walkers to add interesting items they find during a
			
 
				+  walk to a Bloom filter so that they can skip uninteresting items
			
 
				+  during the next walk by testing whether an item is in this Bloom
			
 
				+  filter.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
			
 
				+---
			
 
				+ fs/exec.c                  |   2 +
			
 
				+ include/linux/memcontrol.h |   4 +
			
 
				+ include/linux/mm_inline.h  |   6 +
			
 
				+ include/linux/mm_types.h   |  75 +++++++++
			
 
				+ include/linux/mmzone.h     |  63 +++++++
			
 
				+ kernel/exit.c              |   1 +
			
 
				+ kernel/fork.c              |   9 +
			
 
				+ kernel/sched/core.c        |   1 +
			
 
				+ mm/memcontrol.c            |  25 +++
			
 
				+ mm/vmscan.c                | 331 +++++++++++++++++++++++++++++++++++++
			
 
				+ 10 files changed, 517 insertions(+)
			
 
				+
			
 
				+--- a/fs/exec.c
			
 
				++++ b/fs/exec.c
			
 
				+@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
			
 
				+ 	active_mm = tsk->active_mm;
			
 
				+ 	tsk->active_mm = mm;
			
 
				+ 	tsk->mm = mm;
			
 
				++	lru_gen_add_mm(mm);
			
 
				+ 	/*
			
 
				+ 	 * This prevents preemption while active_mm is being loaded and
			
 
				+ 	 * it and mm are being updated, which could cause problems for
			
 
				+@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
			
 
				+ 	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
			
 
				+ 		local_irq_enable();
			
 
				+ 	activate_mm(active_mm, mm);
			
 
				++	lru_gen_activate_mm(mm);
			
 
				+ 	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
			
 
				+ 		local_irq_enable();
			
 
				+ 	tsk->mm->vmacache_seqnum = 0;
			
 
				+--- a/include/linux/memcontrol.h
			
 
				++++ b/include/linux/memcontrol.h
			
 
				+@@ -348,6 +348,10 @@ struct mem_cgroup {
			
 
				+ 	struct deferred_split deferred_split_queue;
			
 
				+ #endif
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	struct lru_gen_mm_list mm_list;
			
 
				++#endif
			
 
				++
			
 
				+ 	struct mem_cgroup_per_node *nodeinfo[];
			
 
				+ };
			
 
				+ 
			
 
				+--- a/include/linux/mm_inline.h
			
 
				++++ b/include/linux/mm_inline.h
			
 
				+@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
			
 
				+ 	return seq % MAX_NR_GENS;
			
 
				+ }
			
 
				+ 
			
 
				++/* Return a proper index regardless of whether we keep stats for historical generations. */
			
 
				++static inline int lru_hist_from_seq(unsigned long seq)
			
 
				++{
			
 
				++	return seq % NR_HIST_GENS;
			
 
				++}
			
 
				++
			
 
				+ /* The youngest and the second youngest generations are counted as active. */
			
 
				+ static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
			
 
				+ {
			
 
				+--- a/include/linux/mm_types.h
			
 
				++++ b/include/linux/mm_types.h
			
 
				+@@ -3,6 +3,7 @@
			
 
				+ #define _LINUX_MM_TYPES_H
			
 
				+ 
			
 
				+ #include <linux/mm_types_task.h>
			
 
				++#include <linux/sched.h>
			
 
				+ 
			
 
				+ #include <linux/auxvec.h>
			
 
				+ #include <linux/list.h>
			
 
				+@@ -15,6 +16,8 @@
			
 
				+ #include <linux/page-flags-layout.h>
			
 
				+ #include <linux/workqueue.h>
			
 
				+ #include <linux/seqlock.h>
			
 
				++#include <linux/nodemask.h>
			
 
				++#include <linux/mmdebug.h>
			
 
				+ 
			
 
				+ #include <asm/mmu.h>
			
 
				+ 
			
 
				+@@ -580,6 +583,18 @@ struct mm_struct {
			
 
				+ #ifdef CONFIG_IOMMU_SUPPORT
			
 
				+ 		u32 pasid;
			
 
				+ #endif
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++		struct {
			
 
				++			/* the node of a global or per-memcg mm_struct list */
			
 
				++			struct list_head list;
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++			/* points to the memcg of the owner task above */
			
 
				++			struct mem_cgroup *memcg;
			
 
				++#endif
			
 
				++			/* whether this mm_struct has been used since the last walk */
			
 
				++			nodemask_t nodes;
			
 
				++		} lrugen;
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				+ 	} __randomize_layout;
			
 
				+ 
			
 
				+ 	/*
			
 
				+@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
			
 
				+ 	return (struct cpumask *)&mm->cpu_bitmap;
			
 
				+ }
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++
			
 
				++struct lru_gen_mm_list {
			
 
				++	/* a global or per-memcg mm_struct list */
			
 
				++	struct list_head fifo;
			
 
				++	/* protects the list above */
			
 
				++	spinlock_t lock;
			
 
				++};
			
 
				++
			
 
				++void lru_gen_add_mm(struct mm_struct *mm);
			
 
				++void lru_gen_del_mm(struct mm_struct *mm);
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++void lru_gen_migrate_mm(struct mm_struct *mm);
			
 
				++#endif
			
 
				++
			
 
				++static inline void lru_gen_init_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++	INIT_LIST_HEAD(&mm->lrugen.list);
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++	mm->lrugen.memcg = NULL;
			
 
				++#endif
			
 
				++	nodes_clear(mm->lrugen.nodes);
			
 
				++}
			
 
				++
			
 
				++/* Track the usage of each mm_struct so that we can skip inactive ones. */
			
 
				++static inline void lru_gen_activate_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++	/* unlikely but not a bug when racing with lru_gen_migrate_mm() */
			
 
				++	VM_WARN_ON(list_empty(&mm->lrugen.list));
			
 
				++
			
 
				++	if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
			
 
				++		nodes_setall(mm->lrugen.nodes);
			
 
				++}
			
 
				++
			
 
				++#else /* !CONFIG_LRU_GEN */
			
 
				++
			
 
				++static inline void lru_gen_add_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++static inline void lru_gen_del_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++static inline void lru_gen_migrate_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++}
			
 
				++#endif
			
 
				++
			
 
				++static inline void lru_gen_init_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++static inline void lru_gen_activate_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ struct mmu_gather;
			
 
				+ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
			
 
				+ extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
			
 
				+--- a/include/linux/mmzone.h
			
 
				++++ b/include/linux/mmzone.h
			
 
				+@@ -318,6 +318,13 @@ struct lruvec;
			
 
				+ #define MIN_NR_GENS		2
			
 
				+ #define MAX_NR_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
			
 
				+ 
			
 
				++/* Whether to keep stats for historical generations. */
			
 
				++#ifdef CONFIG_LRU_GEN_STATS
			
 
				++#define NR_HIST_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
			
 
				++#else
			
 
				++#define NR_HIST_GENS		1U
			
 
				++#endif
			
 
				++
			
 
				+ struct lrugen {
			
 
				+ 	/* the aging increments the max generation number */
			
 
				+ 	unsigned long max_seq;
			
 
				+@@ -333,13 +340,63 @@ struct lrugen {
			
 
				+ 	bool enabled[ANON_AND_FILE];
			
 
				+ };
			
 
				+ 
			
 
				++enum {
			
 
				++	MM_LEAF_TOTAL,		/* total leaf entries */
			
 
				++	MM_LEAF_OLD,		/* old leaf entries */
			
 
				++	MM_LEAF_YOUNG,		/* young leaf entries */
			
 
				++	MM_NONLEAF_TOTAL,	/* total non-leaf entries */
			
 
				++	MM_NONLEAF_PREV,	/* previously worthy non-leaf entries */
			
 
				++	MM_NONLEAF_CUR,		/* currently worthy non-leaf entries */
			
 
				++	NR_MM_STATS
			
 
				++};
			
 
				++
			
 
				++/* mnemonic codes for the stats above */
			
 
				++#define MM_STAT_CODES		"toydpc"
			
 
				++
			
 
				++/* double buffering bloom filters */
			
 
				++#define NR_BLOOM_FILTERS	2
			
 
				++
			
 
				++struct lru_gen_mm_walk {
			
 
				++	/* set to max_seq after each round of walk */
			
 
				++	unsigned long seq;
			
 
				++	/* the next mm_struct on the list to walk */
			
 
				++	struct list_head *head;
			
 
				++	/* the first mm_struct never walked before */
			
 
				++	struct list_head *tail;
			
 
				++	/* to wait for the last walker to finish */
			
 
				++	struct wait_queue_head wait;
			
 
				++	/* bloom filters flip after each round of walk */
			
 
				++	unsigned long *filters[NR_BLOOM_FILTERS];
			
 
				++	/* page table stats for debugging */
			
 
				++	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
			
 
				++	/* the number of concurrent walkers */
			
 
				++	int nr_walkers;
			
 
				++};
			
 
				++
			
 
				++#define MIN_BATCH_SIZE		64
			
 
				+ #define MAX_BATCH_SIZE		8192
			
 
				+ 
			
 
				++struct mm_walk_args {
			
 
				++	struct mem_cgroup *memcg;
			
 
				++	unsigned long max_seq;
			
 
				++	unsigned long start_pfn;
			
 
				++	unsigned long end_pfn;
			
 
				++	unsigned long next_addr;
			
 
				++	unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
			
 
				++	int node_id;
			
 
				++	int swappiness;
			
 
				++	int batch_size;
			
 
				++	int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
			
 
				++	int mm_stats[NR_MM_STATS];
			
 
				++	bool use_filter;
			
 
				++};
			
 
				++
			
 
				+ void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
			
 
				+ void lru_gen_change_state(bool enable, bool main, bool swap);
			
 
				+ 
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
			
 
				++void lru_gen_free_memcg(struct mem_cgroup *memcg);
			
 
				+ #endif
			
 
				+ 
			
 
				+ #else /* !CONFIG_LRU_GEN */
			
 
				+@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
			
 
				+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
			
 
				+ {
			
 
				+ }
			
 
				++
			
 
				++static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
			
 
				++{
			
 
				++}
			
 
				+ #endif
			
 
				+ 
			
 
				+ #endif /* CONFIG_LRU_GEN */
			
 
				+@@ -380,6 +441,8 @@ struct lruvec {
			
 
				+ #ifdef CONFIG_LRU_GEN
			
 
				+ 	/* unevictable pages are on LRU_UNEVICTABLE */
			
 
				+ 	struct lrugen			evictable;
			
 
				++	/* state for mm list and page table walks */
			
 
				++	struct lru_gen_mm_walk		mm_walk;
			
 
				+ #endif
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+ 	struct pglist_data *pgdat;
			
 
				+--- a/kernel/exit.c
			
 
				++++ b/kernel/exit.c
			
 
				+@@ -422,6 +422,7 @@ assign_new_owner:
			
 
				+ 		goto retry;
			
 
				+ 	}
			
 
				+ 	WRITE_ONCE(mm->owner, c);
			
 
				++	lru_gen_migrate_mm(mm);
			
 
				+ 	task_unlock(c);
			
 
				+ 	put_task_struct(c);
			
 
				+ }
			
 
				+--- a/kernel/fork.c
			
 
				++++ b/kernel/fork.c
			
 
				+@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct
			
 
				+ 		goto fail_nocontext;
			
 
				+ 
			
 
				+ 	mm->user_ns = get_user_ns(user_ns);
			
 
				++	lru_gen_init_mm(mm);
			
 
				+ 	return mm;
			
 
				+ 
			
 
				+ fail_nocontext:
			
 
				+@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str
			
 
				+ 	}
			
 
				+ 	if (mm->binfmt)
			
 
				+ 		module_put(mm->binfmt->module);
			
 
				++	lru_gen_del_mm(mm);
			
 
				+ 	mmdrop(mm);
			
 
				+ }
			
 
				+ 
			
 
				+@@ -2616,6 +2618,13 @@ pid_t kernel_clone(struct kernel_clone_a
			
 
				+ 		get_task_struct(p);
			
 
				+ 	}
			
 
				+ 
			
 
				++	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
			
 
				++		/* lock the task to synchronize with memcg migration */
			
 
				++		task_lock(p);
			
 
				++		lru_gen_add_mm(p->mm);
			
 
				++		task_unlock(p);
			
 
				++	}
			
 
				++
			
 
				+ 	wake_up_new_task(p);
			
 
				+ 
			
 
				+ 	/* forking complete and child started to run, tell ptracer */
			
 
				+--- a/kernel/sched/core.c
			
 
				++++ b/kernel/sched/core.c
			
 
				+@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas
			
 
				+ 		 * finish_task_switch()'s mmdrop().
			
 
				+ 		 */
			
 
				+ 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
			
 
				++		lru_gen_activate_mm(next->mm);
			
 
				+ 
			
 
				+ 		if (!prev->mm) {                        // from kernel
			
 
				+ 			/* will mmdrop() in finish_task_switch(). */
			
 
				+--- a/mm/memcontrol.c
			
 
				++++ b/mm/memcontrol.c
			
 
				+@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem
			
 
				+ 
			
 
				+ static void mem_cgroup_free(struct mem_cgroup *memcg)
			
 
				+ {
			
 
				++	lru_gen_free_memcg(memcg);
			
 
				+ 	memcg_wb_domain_exit(memcg);
			
 
				+ 	__mem_cgroup_free(memcg);
			
 
				+ }
			
 
				+@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void)
			
 
				+ }
			
 
				+ #endif
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++static void mem_cgroup_attach(struct cgroup_taskset *tset)
			
 
				++{
			
 
				++	struct cgroup_subsys_state *css;
			
 
				++	struct task_struct *task = NULL;
			
 
				++
			
 
				++	cgroup_taskset_for_each_leader(task, css, tset)
			
 
				++		break;
			
 
				++
			
 
				++	if (!task)
			
 
				++		return;
			
 
				++
			
 
				++	task_lock(task);
			
 
				++	if (task->mm && task->mm->owner == task)
			
 
				++		lru_gen_migrate_mm(task->mm);
			
 
				++	task_unlock(task);
			
 
				++}
			
 
				++#else
			
 
				++static void mem_cgroup_attach(struct cgroup_taskset *tset)
			
 
				++{
			
 
				++}
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
			
 
				+ {
			
 
				+ 	if (value == PAGE_COUNTER_MAX)
			
 
				+@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys
			
 
				+ 	.css_reset = mem_cgroup_css_reset,
			
 
				+ 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
			
 
				+ 	.can_attach = mem_cgroup_can_attach,
			
 
				++	.attach = mem_cgroup_attach,
			
 
				+ 	.cancel_attach = mem_cgroup_cancel_attach,
			
 
				+ 	.post_attach = mem_cgroup_move_task,
			
 
				+ 	.dfl_cftypes = memory_files,
			
 
				+--- a/mm/vmscan.c
			
 
				++++ b/mm/vmscan.c
			
 
				+@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid(
			
 
				+ }
			
 
				+ 
			
 
				+ /******************************************************************************
			
 
				++ *                          mm_struct list
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
			
 
				++{
			
 
				++	static struct lru_gen_mm_list mm_list = {
			
 
				++		.fifo = LIST_HEAD_INIT(mm_list.fifo),
			
 
				++		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
			
 
				++	};
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++	if (memcg)
			
 
				++		return &memcg->mm_list;
			
 
				++#endif
			
 
				++	return &mm_list;
			
 
				++}
			
 
				++
			
 
				++void lru_gen_add_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++	int nid;
			
 
				++	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
			
 
				++	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
			
 
				++
			
 
				++	VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++	VM_BUG_ON_MM(mm->lrugen.memcg, mm);
			
 
				++	mm->lrugen.memcg = memcg;
			
 
				++#endif
			
 
				++	spin_lock(&mm_list->lock);
			
 
				++
			
 
				++	list_add_tail(&mm->lrugen.list, &mm_list->fifo);
			
 
				++
			
 
				++	for_each_node(nid) {
			
 
				++		struct lruvec *lruvec = get_lruvec(nid, memcg);
			
 
				++
			
 
				++		if (!lruvec)
			
 
				++			continue;
			
 
				++
			
 
				++		if (lruvec->mm_walk.tail == &mm_list->fifo)
			
 
				++			lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
			
 
				++	}
			
 
				++
			
 
				++	spin_unlock(&mm_list->lock);
			
 
				++}
			
 
				++
			
 
				++void lru_gen_del_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++	int nid;
			
 
				++	struct lru_gen_mm_list *mm_list;
			
 
				++	struct mem_cgroup *memcg = NULL;
			
 
				++
			
 
				++	if (list_empty(&mm->lrugen.list))
			
 
				++		return;
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++	memcg = mm->lrugen.memcg;
			
 
				++#endif
			
 
				++	mm_list = get_mm_list(memcg);
			
 
				++
			
 
				++	spin_lock(&mm_list->lock);
			
 
				++
			
 
				++	for_each_node(nid) {
			
 
				++		struct lruvec *lruvec = get_lruvec(nid, memcg);
			
 
				++
			
 
				++		if (!lruvec)
			
 
				++			continue;
			
 
				++
			
 
				++		if (lruvec->mm_walk.tail == &mm->lrugen.list)
			
 
				++			lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
			
 
				++
			
 
				++		if (lruvec->mm_walk.head != &mm->lrugen.list)
			
 
				++			continue;
			
 
				++
			
 
				++		lruvec->mm_walk.head = lruvec->mm_walk.head->next;
			
 
				++		if (lruvec->mm_walk.head == &mm_list->fifo)
			
 
				++			WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
			
 
				++	}
			
 
				++
			
 
				++	list_del_init(&mm->lrugen.list);
			
 
				++
			
 
				++	spin_unlock(&mm_list->lock);
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++	mem_cgroup_put(mm->lrugen.memcg);
			
 
				++	mm->lrugen.memcg = NULL;
			
 
				++#endif
			
 
				++}
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++void lru_gen_migrate_mm(struct mm_struct *mm)
			
 
				++{
			
 
				++	struct mem_cgroup *memcg;
			
 
				++
			
 
				++	lockdep_assert_held(&mm->owner->alloc_lock);
			
 
				++
			
 
				++	if (mem_cgroup_disabled())
			
 
				++		return;
			
 
				++
			
 
				++	rcu_read_lock();
			
 
				++	memcg = mem_cgroup_from_task(mm->owner);
			
 
				++	rcu_read_unlock();
			
 
				++	if (memcg == mm->lrugen.memcg)
			
 
				++		return;
			
 
				++
			
 
				++	VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
			
 
				++	VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
			
 
				++
			
 
				++	lru_gen_del_mm(mm);
			
 
				++	lru_gen_add_mm(mm);
			
 
				++}
			
 
				++#endif
			
 
				++
			
 
				++#define BLOOM_FILTER_SHIFT	15
			
 
				++
			
 
				++static inline int filter_gen_from_seq(unsigned long seq)
			
 
				++{
			
 
				++	return seq % NR_BLOOM_FILTERS;
			
 
				++}
			
 
				++
			
 
				++static void get_item_key(void *item, int *key)
			
 
				++{
			
 
				++	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
			
 
				++
			
 
				++	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
			
 
				++
			
 
				++	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
			
 
				++	key[1] = hash >> BLOOM_FILTER_SHIFT;
			
 
				++}
			
 
				++
			
 
				++static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
			
 
				++{
			
 
				++	unsigned long *filter;
			
 
				++	int gen = filter_gen_from_seq(seq);
			
 
				++
			
 
				++	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
			
 
				++
			
 
				++	filter = lruvec->mm_walk.filters[gen];
			
 
				++	if (filter) {
			
 
				++		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				++	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
			
 
				++	WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
			
 
				++}
			
 
				++
			
 
				++static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
			
 
				++{
			
 
				++	int key[2];
			
 
				++	unsigned long *filter;
			
 
				++	int gen = filter_gen_from_seq(seq);
			
 
				++
			
 
				++	filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
			
 
				++	if (!filter)
			
 
				++		return;
			
 
				++
			
 
				++	get_item_key(item, key);
			
 
				++
			
 
				++	if (!test_bit(key[0], filter))
			
 
				++		set_bit(key[0], filter);
			
 
				++	if (!test_bit(key[1], filter))
			
 
				++		set_bit(key[1], filter);
			
 
				++}
			
 
				++
			
 
				++static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
			
 
				++{
			
 
				++	int key[2];
			
 
				++	unsigned long *filter;
			
 
				++	int gen = filter_gen_from_seq(seq);
			
 
				++
			
 
				++	filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
			
 
				++	if (!filter)
			
 
				++		return false;
			
 
				++
			
 
				++	get_item_key(item, key);
			
 
				++
			
 
				++	return test_bit(key[0], filter) && test_bit(key[1], filter);
			
 
				++}
			
 
				++/* Fold args->mm_stats[] into the lruvec stats slot of max_seq and zero it; optionally clear the next slot. */
			
 
				++static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
			
 
				++{
			
 
				++	int i;
			
 
				++	int hist = lru_hist_from_seq(args->max_seq);
			
 
				++
			
 
				++	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
			
 
				++
			
 
				++	for (i = 0; i < NR_MM_STATS; i++) {
			
 
				++		WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
			
 
				++			   lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
			
 
				++		args->mm_stats[i] = 0;
			
 
				++	}
			
 
				++	/* the slot of max_seq + 1 is cleared only when last is set and NR_HIST_GENS is not 1 */
			
 
				++	if (!last || NR_HIST_GENS == 1)
			
 
				++		return;
			
 
				++
			
 
				++	hist = lru_hist_from_seq(args->max_seq + 1);
			
 
				++	for (i = 0; i < NR_MM_STATS; i++)
			
 
				++		WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
			
 
				++}
			
 
				++/* Return true if mm should not be walked; a false return means mmget_not_zero() succeeded on mm. */
			
 
				++static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
			
 
				++{
			
 
				++	int type;
			
 
				++	unsigned long size = 0;
			
 
				++
			
 
				++	if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
			
 
				++		return true;
			
 
				++
			
 
				++	if (mm_is_oom_victim(mm))
			
 
				++		return true;
			
 
				++	/* sum file pages, plus anon and shmem pages unless swappiness is 0 */
			
 
				++	for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
			
 
				++		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
			
 
				++			       get_mm_counter(mm, MM_ANONPAGES) +
			
 
				++			       get_mm_counter(mm, MM_SHMEMPAGES);
			
 
				++	}
			
 
				++
			
 
				++	if (size < MIN_BATCH_SIZE)
			
 
				++		return true;
			
 
				++
			
 
				++	if (!mmget_not_zero(mm))
			
 
				++		return true;
			
 
				++	/* the mm is going to be walked for this node, so drop the node's bit */
			
 
				++	node_clear(args->node_id, mm->lrugen.nodes);
			
 
				++
			
 
				++	return false;
			
 
				++}
			
 
				++/* Hands the next mm to the caller via *iter (NULL if none); returns true only if no mm is returned and no walker remains. */
			
 
				++/* To support multiple walkers that concurrently walk an mm_struct list. */
			
 
				++static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
			
 
				++			struct mm_struct **iter)
			
 
				++{
			
 
				++	bool first = false;
			
 
				++	bool last = true;
			
 
				++	struct mm_struct *mm = NULL;
			
 
				++	struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
			
 
				++	struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
			
 
				++
			
 
				++	if (*iter)
			
 
				++		mmput_async(*iter);	/* done with the mm handed out by the previous call */
			
 
				++	else if (args->max_seq <= READ_ONCE(mm_walk->seq))
			
 
				++		return false;
			
 
				++
			
 
				++	spin_lock(&mm_list->lock);
			
 
				++
			
 
				++	VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
			
 
				++	VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
			
 
				++	VM_BUG_ON(*iter && !mm_walk->nr_walkers);
			
 
				++	/* mm_walk->seq has already reached max_seq: there is nothing more to hand out */
			
 
				++	if (args->max_seq <= mm_walk->seq) {
			
 
				++		if (!*iter)
			
 
				++			last = false;
			
 
				++		goto done;
			
 
				++	}
			
 
				++	/* head sits on the list anchor: this caller starts a new pass over the list */
			
 
				++	if (mm_walk->head == &mm_list->fifo) {
			
 
				++		VM_BUG_ON(mm_walk->nr_walkers);
			
 
				++		mm_walk->head = mm_walk->head->next;
			
 
				++		first = true;
			
 
				++	}
			
 
				++	/* advance head until should_skip_mm() accepts an mm or the anchor is reached again */
			
 
				++	while (!mm && mm_walk->head != &mm_list->fifo) {
			
 
				++		mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
			
 
				++
			
 
				++		mm_walk->head = mm_walk->head->next;
			
 
				++
			
 
				++		if (mm_walk->tail == &mm->lrugen.list) {
			
 
				++			mm_walk->tail = mm_walk->tail->next;
			
 
				++			args->use_filter = false;
			
 
				++		}
			
 
				++
			
 
				++		if (should_skip_mm(mm, args))
			
 
				++			mm = NULL;
			
 
				++	}
			
 
				++	/* head is back on the anchor: the pass is complete, so bump seq */
			
 
				++	if (mm_walk->head == &mm_list->fifo)
			
 
				++		WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
			
 
				++done:
			
 
				++	if (*iter && !mm)
			
 
				++		mm_walk->nr_walkers--;
			
 
				++	if (!*iter && mm)
			
 
				++		mm_walk->nr_walkers++;
			
 
				++	/* walkers remain (possibly this one): not the last */
			
 
				++	if (mm_walk->nr_walkers)
			
 
				++		last = false;
			
 
				++	/* the caller that started the pass resets the filter used for max_seq + 1 */
			
 
				++	if (mm && first)
			
 
				++		clear_bloom_filter(lruvec, args->max_seq + 1);
			
 
				++	/* flush stats if the caller came in with an mm, or if it is the last one */
			
 
				++	if (*iter || last)
			
 
				++		reset_mm_stats(lruvec, last, args);
			
 
				++
			
 
				++	spin_unlock(&mm_list->lock);
			
 
				++
			
 
				++	*iter = mm;
			
 
				++
			
 
				++	return last;
			
 
				++}
			
 
				++
			
 
				++/******************************************************************************
			
 
				+  *                          state change
			
 
				+  ******************************************************************************/
			
 
				+ 
			
 
				+@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou
			
 
				+ 	int i;
			
 
				+ 	int gen, type, zone;
			
 
				+ 	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
			
 
				+ 
			
 
				+ 	lrugen->max_seq = MIN_NR_GENS + 1;
			
 
				+ 	lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
			
 
				+@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou
			
 
				+ 
			
 
				+ 	for_each_gen_type_zone(gen, type, zone)
			
 
				+ 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
			
 
				++
			
 
				++	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
			
 
				++		spin_lock(&mm_list->lock);
			
 
				++
			
 
				++	lruvec->mm_walk.seq = MIN_NR_GENS;
			
 
				++	lruvec->mm_walk.head = &mm_list->fifo;
			
 
				++	lruvec->mm_walk.tail = &mm_list->fifo;
			
 
				++	init_waitqueue_head(&lruvec->mm_walk.wait);
			
 
				++
			
 
				++	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
			
 
				++		spin_unlock(&mm_list->lock);
			
 
				+ }
			
 
				+ 
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou
			
 
				+ {
			
 
				+ 	int nid;
			
 
				+ 
			
 
				++	INIT_LIST_HEAD(&memcg->mm_list.fifo);
			
 
				++	spin_lock_init(&memcg->mm_list.lock);
			
 
				++
			
 
				+ 	for_each_node(nid) {
			
 
				+ 		struct lruvec *lruvec = get_lruvec(nid, memcg);
			
 
				+ 
			
 
				+ 		lru_gen_init_state(memcg, lruvec);
			
 
				+ 	}
			
 
				+ }
			
 
				++/* Free every Bloom filter bitmap of each node's lruvec in memcg and clear the pointers. */
			
 
				++void lru_gen_free_memcg(struct mem_cgroup *memcg)
			
 
				++{
			
 
				++	int nid;
			
 
				++
			
 
				++	for_each_node(nid) {
			
 
				++		int i;
			
 
				++		struct lruvec *lruvec = get_lruvec(nid, memcg);
			
 
				++
			
 
				++		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
			
 
				++			bitmap_free(lruvec->mm_walk.filters[i]);
			
 
				++			lruvec->mm_walk.filters[i] = NULL;
			
 
				++		}
			
 
				++	}
			
 
				++}
			
 
				+ #endif
			
 
				+ 
			
 
				+ static int __init init_lru_gen(void)
			
 
				+ {
			
 
				+ 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
			
 
				+ 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
			
 
				++	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
			
 
				+ 
			
 
				+ 	return 0;
			
 
				+ };
			
--- a/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch
+++ b/target/linux/generic/pending-5.15/020-05-mm-multigenerational-lru-aging.patch
@@ -0,0 +1,1176 @@
 
				+From 8217cd2238c40cf77208aa27a7cc09879e685890 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Mon, 5 Apr 2021 04:35:07 -0600
			
 
				+Subject: [PATCH 06/10] mm: multigenerational lru: aging
			
 
				+
			
 
				+The aging produces young generations. Given an lruvec, the aging
			
 
				+traverses lruvec_memcg()->mm_list and calls walk_page_range() to scan
			
 
				+PTEs for accessed pages. Upon finding one, the aging updates its
			
 
				+generation number to max_seq (modulo MAX_NR_GENS). After each round of
			
 
				+traversal, the aging increments max_seq. The aging is due when
			
 
				+min_seq[] reaches max_seq-1.
			
 
				+
			
 
				+The aging uses the following optimizations when walking page tables:
			
 
				+  1) It skips non-leaf PMD entries that have the accessed bit cleared
			
 
				+  when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
			
 
				+  2) It does not zigzag between a PGD table and the same PMD or PTE
			
 
				+  table spanning multiple VMAs. In other words, it finishes all the
			
 
				+  VMAs within the range of the same PMD or PTE table before it returns
			
 
				+  to this PGD table. This optimizes workloads that have large numbers
			
 
				+  of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: I3ae8abc3100d023cecb3a699d86020ae6fc10a45
			
 
				+---
			
 
				+ include/linux/memcontrol.h |   3 +
			
 
				+ include/linux/mmzone.h     |   9 +
			
 
				+ include/linux/oom.h        |  16 +
			
 
				+ include/linux/swap.h       |   3 +
			
 
				+ mm/memcontrol.c            |   5 +
			
 
				+ mm/oom_kill.c              |   4 +-
			
 
				+ mm/rmap.c                  |   8 +
			
 
				+ mm/vmscan.c                | 948 +++++++++++++++++++++++++++++++++++++
			
 
				+ 8 files changed, 994 insertions(+), 2 deletions(-)
			
 
				+
			
 
				+--- a/include/linux/memcontrol.h
			
 
				++++ b/include/linux/memcontrol.h
			
 
				+@@ -1367,10 +1367,13 @@ mem_cgroup_print_oom_meminfo(struct mem_
			
 
				+ 
			
 
				+ static inline void lock_page_memcg(struct page *page)
			
 
				+ {
			
 
				++	/* to match page_memcg_rcu() */
			
 
				++	rcu_read_lock();
			
 
				+ }
			
 
				+ 
			
 
				+ static inline void unlock_page_memcg(struct page *page)
			
 
				+ {
			
 
				++	rcu_read_unlock();
			
 
				+ }
			
 
				+ 
			
 
				+ static inline void mem_cgroup_handle_over_high(void)
			
 
				+--- a/include/linux/mmzone.h
			
 
				++++ b/include/linux/mmzone.h
			
 
				+@@ -295,6 +295,7 @@ enum lruvec_flags {
			
 
				+ };
			
 
				+ 
			
 
				+ struct lruvec;
			
 
				++struct page_vma_mapped_walk;
			
 
				+ 
			
 
				+ #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
			
 
				+ #define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
			
 
				+@@ -393,6 +394,7 @@ struct mm_walk_args {
			
 
				+ 
			
 
				+ void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
			
 
				+ void lru_gen_change_state(bool enable, bool main, bool swap);
			
 
				++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
			
 
				+ 
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+ void lru_gen_init_memcg(struct mem_cgroup *memcg);
			
 
				+@@ -409,6 +411,10 @@ static inline void lru_gen_change_state(
			
 
				+ {
			
 
				+ }
			
 
				+ 
			
 
				++static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
			
 
				++{	/* empty stub: does nothing */
			
 
				++}
			
 
				++
			
 
				+ #ifdef CONFIG_MEMCG
			
 
				+ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
			
 
				+ {
			
 
				+@@ -1028,6 +1034,9 @@ typedef struct pglist_data {
			
 
				+ 
			
 
				+ 	unsigned long		flags;
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	struct mm_walk_args	mm_walk_args;
			
 
				++#endif
			
 
				+ 	ZONE_PADDING(_pad2_)
			
 
				+ 
			
 
				+ 	/* Per-node vmstats */
			
 
				+--- a/include/linux/oom.h
			
 
				++++ b/include/linux/oom.h
			
 
				+@@ -57,6 +57,22 @@ struct oom_control {
			
 
				+ extern struct mutex oom_lock;
			
 
				+ extern struct mutex oom_adj_mutex;
			
 
				+ 
			
 
				++#ifdef CONFIG_MMU
			
 
				++extern struct task_struct *oom_reaper_list;
			
 
				++extern struct wait_queue_head oom_reaper_wait;
			
 
				++/* True if oom_reaper_list is non-NULL or no task is waiting on oom_reaper_wait; takes no lock. */
			
 
				++static inline bool oom_reaping_in_progress(void)
			
 
				++{
			
 
				++	/* racy check to see if oom reaping could be in progress */
			
 
				++	return READ_ONCE(oom_reaper_list) || !waitqueue_active(&oom_reaper_wait);
			
 
				++}
			
 
				++#else /* !CONFIG_MMU: always reports false */
			
 
				++static inline bool oom_reaping_in_progress(void)
			
 
				++{
			
 
				++	return false;
			
 
				++}
			
 
				++#endif
			
 
				++
			
 
				+ static inline void set_current_oom_origin(void)
			
 
				+ {
			
 
				+ 	current->signal->oom_flag_origin = true;
			
 
				+--- a/include/linux/swap.h
			
 
				++++ b/include/linux/swap.h
			
 
				+@@ -137,6 +137,9 @@ union swap_header {
			
 
				+  */
			
 
				+ struct reclaim_state {
			
 
				+ 	unsigned long reclaimed_slab;
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	struct mm_walk_args *mm_walk_args;
			
 
				++#endif
			
 
				+ };
			
 
				+ 
			
 
				+ #ifdef __KERNEL__
			
 
				+--- a/mm/memcontrol.c
			
 
				++++ b/mm/memcontrol.c
			
 
				+@@ -1304,12 +1304,17 @@ void mem_cgroup_update_lru_size(struct l
			
 
				+ 		*lru_size += nr_pages;
			
 
				+ 
			
 
				+ 	size = *lru_size;
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++	/* unlikely but not a bug when reset_batch_size() is pending */
			
 
				++	VM_WARN_ON(size + MAX_BATCH_SIZE < 0);
			
 
				++#else
			
 
				+ 	if (WARN_ONCE(size < 0,
			
 
				+ 		"%s(%p, %d, %d): lru_size %ld\n",
			
 
				+ 		__func__, lruvec, lru, nr_pages, size)) {
			
 
				+ 		VM_BUG_ON(1);
			
 
				+ 		*lru_size = 0;
			
 
				+ 	}
			
 
				++#endif
			
 
				+ 
			
 
				+ 	if (nr_pages > 0)
			
 
				+ 		*lru_size += nr_pages;
			
 
				+--- a/mm/oom_kill.c
			
 
				++++ b/mm/oom_kill.c
			
 
				+@@ -508,8 +508,8 @@ bool process_shares_mm(struct task_struc
			
 
				+  * victim (if that is possible) to help the OOM killer to move on.
			
 
				+  */
			
 
				+ static struct task_struct *oom_reaper_th;
			
 
				+-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
			
 
				+-static struct task_struct *oom_reaper_list;
			
 
				++DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
			
 
				++struct task_struct *oom_reaper_list;
			
 
				+ static DEFINE_SPINLOCK(oom_reaper_lock);
			
 
				+ 
			
 
				+ bool __oom_reap_task_mm(struct mm_struct *mm)
			
 
				+--- a/mm/rmap.c
			
 
				++++ b/mm/rmap.c
			
 
				+@@ -73,6 +73,7 @@
			
 
				+ #include <linux/page_idle.h>
			
 
				+ #include <linux/memremap.h>
			
 
				+ #include <linux/userfaultfd_k.h>
			
 
				++#include <linux/mm_inline.h>
			
 
				+ 
			
 
				+ #include <asm/tlbflush.h>
			
 
				+ 
			
 
				+@@ -790,6 +791,13 @@ static bool page_referenced_one(struct p
			
 
				+ 		}
			
 
				+ 
			
 
				+ 		if (pvmw.pte) {
			
 
				++			/* the multigenerational lru exploits the spatial locality */
			
 
				++			if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
			
 
				++			    !(vma->vm_flags & VM_SEQ_READ)) {
			
 
				++				lru_gen_look_around(&pvmw);
			
 
				++				referenced++;
			
 
				++			}
			
 
				++
			
 
				+ 			if (ptep_clear_flush_young_notify(vma, address,
			
 
				+ 						pvmw.pte)) {
			
 
				+ 				/*
			
 
				+--- a/mm/vmscan.c
			
 
				++++ b/mm/vmscan.c
			
 
				+@@ -51,6 +51,8 @@
			
 
				+ #include <linux/dax.h>
			
 
				+ #include <linux/psi.h>
			
 
				+ #include <linux/memory.h>
			
 
				++#include <linux/pagewalk.h>
			
 
				++#include <linux/shmem_fs.h>
			
 
				+ 
			
 
				+ #include <asm/tlbflush.h>
			
 
				+ #include <asm/div64.h>
			
 
				+@@ -2887,6 +2889,15 @@ static bool can_age_anon_pages(struct pg
			
 
				+  *                          shorthand helpers
			
 
				+  ******************************************************************************/
			
 
				+ 
			
 
				++#define DEFINE_MAX_SEQ(lruvec)						\
			
 
				++	unsigned long max_seq = READ_ONCE((lruvec)->evictable.max_seq) /* declares a local snapshot named max_seq */
			
 
				++/* DEFINE_MIN_SEQ declares a local array min_seq[] holding snapshots of both min_seq entries of (lruvec). */
			
 
				++#define DEFINE_MIN_SEQ(lruvec)						\
			
 
				++	unsigned long min_seq[ANON_AND_FILE] = {			\
			
 
				++		READ_ONCE((lruvec)->evictable.min_seq[0]),		\
			
 
				++		READ_ONCE((lruvec)->evictable.min_seq[1]),		\
			
 
				++	}
			
 
				++
			
 
				+ #define for_each_gen_type_zone(gen, type, zone)				\
			
 
				+ 	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
			
 
				+ 		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
			
 
				+@@ -2899,6 +2910,12 @@ static int page_lru_gen(struct page *pag
			
 
				+ 	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				+ }
			
 
				+ 
			
 
				++static int get_swappiness(struct mem_cgroup *memcg)
			
 
				++{	/* memcg swappiness, forced to 0 when fewer than MIN_BATCH_SIZE swap pages are available */
			
 
				++	return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
			
 
				++	       mem_cgroup_swappiness(memcg) : 0;
			
 
				++}
			
 
				++
			
 
				+ static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
			
 
				+ {
			
 
				+ 	struct pglist_data *pgdat = NODE_DATA(nid);
			
 
				+@@ -3229,6 +3246,926 @@ done:
			
 
				+ }
			
 
				+ 
			
 
				+ /******************************************************************************
			
 
				++ *                          the aging
			
 
				++ ******************************************************************************/
			
 
				++/* Set the page's generation to gen if it has one, else set PG_referenced; returns the old generation or -1. */
			
 
				++static int page_update_gen(struct page *page, int gen)
			
 
				++{
			
 
				++	unsigned long old_flags, new_flags;
			
 
				++
			
 
				++	VM_BUG_ON(gen >= MAX_NR_GENS);
			
 
				++
			
 
				++	do {
			
 
				++		new_flags = old_flags = READ_ONCE(page->flags);
			
 
				++		/* no generation recorded in the flags: only set PG_referenced */
			
 
				++		if (!(new_flags & LRU_GEN_MASK)) {
			
 
				++			new_flags |= BIT(PG_referenced);
			
 
				++			continue;	/* jumps to the loop condition, so cmpxchg still installs the flag */
			
 
				++		}
			
 
				++		/* the flags field stores gen + 1, so a zero field means no generation */
			
 
				++		new_flags &= ~LRU_GEN_MASK;
			
 
				++		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
			
 
				++	} while (new_flags != old_flags &&
			
 
				++		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
			
 
				++
			
 
				++	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				++}
			
 
				++/* Move a page of the oldest generation of its type to the next one, or just re-list it if its generation already changed. */
			
 
				++static void page_inc_gen(struct page *page, struct lruvec *lruvec, bool reclaiming)
			
 
				++{
			
 
				++	int old_gen, new_gen;
			
 
				++	unsigned long old_flags, new_flags;
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int zone = page_zonenum(page);
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
			
 
				++
			
 
				++	do {
			
 
				++		new_flags = old_flags = READ_ONCE(page->flags);
			
 
				++		VM_BUG_ON_PAGE(!(new_flags & LRU_GEN_MASK), page);
			
 
				++
			
 
				++		new_gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				++		/* page_update_gen() has updated this page? */
			
 
				++		if (new_gen >= 0 && new_gen != old_gen) {
			
 
				++			list_move(&page->lru, &lrugen->lists[new_gen][type][zone]);
			
 
				++			return;
			
 
				++		}
			
 
				++		/* otherwise advance it by one generation, wrapping at MAX_NR_GENS */
			
 
				++		new_gen = (old_gen + 1) % MAX_NR_GENS;
			
 
				++
			
 
				++		new_flags &= ~LRU_GEN_MASK;
			
 
				++		new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
			
 
				++		/* for end_page_writeback() */
			
 
				++		if (reclaiming)
			
 
				++			new_flags |= BIT(PG_reclaim);
			
 
				++	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
			
 
				++	/* account the move, then add at the head of the new list when reclaiming, else at its tail */
			
 
				++	lru_gen_update_size(page, lruvec, old_gen, new_gen);
			
 
				++	if (reclaiming)
			
 
				++		list_move(&page->lru, &lrugen->lists[new_gen][type][zone]);
			
 
				++	else
			
 
				++		list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
			
 
				++}
			
 
				++/* Account a page's move from old_gen to new_gen in the batch counters kept in args. */
			
 
				++static void update_batch_size(struct page *page, int old_gen, int new_gen,
			
 
				++			      struct mm_walk_args *args)
			
 
				++{
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int zone = page_zonenum(page);
			
 
				++	int delta = thp_nr_pages(page);
			
 
				++
			
 
				++	VM_BUG_ON(old_gen >= MAX_NR_GENS);
			
 
				++	VM_BUG_ON(new_gen >= MAX_NR_GENS);
			
 
				++
			
 
				++	args->batch_size++;
			
 
				++	/* batch_size counts calls, while nr_pages[] moves thp_nr_pages(page) between generations */
			
 
				++	args->nr_pages[old_gen][type][zone] -= delta;
			
 
				++	args->nr_pages[new_gen][type][zone] += delta;
			
 
				++}
			
 
				++/* Apply the batched per-generation page deltas held in args to lruvec and zero the batch. */
			
 
				++static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args)
			
 
				++{
			
 
				++	int gen, type, zone;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	args->batch_size = 0;
			
 
				++
			
 
				++	for_each_gen_type_zone(gen, type, zone) {
			
 
				++		enum lru_list lru = type * LRU_FILE;
			
 
				++		int delta = args->nr_pages[gen][type][zone];
			
 
				++
			
 
				++		if (!delta)
			
 
				++			continue;
			
 
				++
			
 
				++		args->nr_pages[gen][type][zone] = 0;
			
 
				++		WRITE_ONCE(lrugen->sizes[gen][type][zone],
			
 
				++			   lrugen->sizes[gen][type][zone] + delta);
			
 
				++		/* mirror the delta into the active or inactive LRU size of this type */
			
 
				++		if (lru_gen_is_active(lruvec, gen))
			
 
				++			lru += LRU_ACTIVE;
			
 
				++		update_lru_size(lruvec, lru, zone, delta);
			
 
				++	}
			
 
				++}
			
 
				++/* Decide whether the VMA in walk->vma should be skipped; a nonzero return means skip it. */
			
 
				++static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *walk)
			
 
				++{
			
 
				++	struct address_space *mapping;
			
 
				++	struct vm_area_struct *vma = walk->vma;
			
 
				++	struct mm_walk_args *args = walk->private;
			
 
				++
			
 
				++	if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
			
 
				++	    (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ)))
			
 
				++		return true;
			
 
				++	/* anonymous VMAs are skipped only when swappiness is 0 */
			
 
				++	if (vma_is_anonymous(vma))
			
 
				++		return !args->swappiness;
			
 
				++
			
 
				++	if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
			
 
				++		return true;
			
 
				++
			
 
				++	mapping = vma->vm_file->f_mapping;
			
 
				++	if (!mapping->a_ops->writepage)
			
 
				++		return true;
			
 
				++	/* shmem mappings also need nonzero swappiness; unevictable mappings are always skipped */
			
 
				++	return (shmem_mapping(mapping) && !args->swappiness) || mapping_unevictable(mapping);
			
 
				++}
			
 
				++/* Find the next walkable VMA under the same table (per mask); on success update *start and *end and return true. */
			
 
				++/*
			
 
				++ * Some userspace memory allocators create many single-page VMAs. So instead of
			
 
				++ * returning to the PGD table for each such VMA, we finish at least an
			
 
				++ * entire PMD table and therefore avoid many zigzags.
			
 
				++ */
			
 
				++static bool get_next_vma(struct mm_walk *walk, unsigned long mask, unsigned long size,
			
 
				++			 unsigned long *start, unsigned long *end)
			
 
				++{
			
 
				++	unsigned long next = round_up(*end, size);
			
 
				++
			
 
				++	VM_BUG_ON(mask & size);
			
 
				++	VM_BUG_ON(*start >= *end);
			
 
				++	VM_BUG_ON((next & mask) != (*start & mask));
			
 
				++
			
 
				++	while (walk->vma) {
			
 
				++		if (next >= walk->vma->vm_end) {
			
 
				++			walk->vma = walk->vma->vm_next;
			
 
				++			continue;
			
 
				++		}
			
 
				++		/* this VMA starts in a region that differs under mask: give up */
			
 
				++		if ((next & mask) != (walk->vma->vm_start & mask))
			
 
				++			return false;
			
 
				++
			
 
				++		if (should_skip_vma(walk->vma->vm_start, walk->vma->vm_end, walk)) {
			
 
				++			walk->vma = walk->vma->vm_next;
			
 
				++			continue;
			
 
				++		}
			
 
				++		/* clamp the new range to this VMA and to the end of the region selected by mask */
			
 
				++		*start = max(next, walk->vma->vm_start);
			
 
				++		next = (next | ~mask) + 1;
			
 
				++		/* rounded-up boundaries can wrap to 0 */
			
 
				++		*end = next && next < walk->vma->vm_end ? next : walk->vma->vm_end;
			
 
				++
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	return false;
			
 
				++}
			
 
				++/* Scan PTEs from start to end under the PTE lock, aging young pages; true if at least MIN_BATCH_SIZE / 2 were aged. */
			
 
				++static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
			
 
				++			   struct mm_walk *walk)
			
 
				++{
			
 
				++	int i;
			
 
				++	pte_t *pte;
			
 
				++	spinlock_t *ptl;
			
 
				++	unsigned long addr;
			
 
				++	int worth = 0;
			
 
				++	struct mm_walk_args *args = walk->private;
			
 
				++	int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
			
 
				++
			
 
				++	VM_BUG_ON(pmd_leaf(*pmd));
			
 
				++
			
 
				++	pte = pte_offset_map_lock(walk->mm, pmd, start & PMD_MASK, &ptl);
			
 
				++	arch_enter_lazy_mmu_mode();
			
 
				++restart:
			
 
				++	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
			
 
				++		struct page *page;
			
 
				++		unsigned long pfn = pte_pfn(pte[i]);
			
 
				++
			
 
				++		args->mm_stats[MM_LEAF_TOTAL]++;
			
 
				++
			
 
				++		if (!pte_present(pte[i]) || is_zero_pfn(pfn))
			
 
				++			continue;
			
 
				++
			
 
				++		if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
			
 
				++			continue;
			
 
				++
			
 
				++		if (!pte_young(pte[i])) {
			
 
				++			args->mm_stats[MM_LEAF_OLD]++;
			
 
				++			continue;
			
 
				++		}
			
 
				++
			
 
				++		VM_BUG_ON(!pfn_valid(pfn));
			
 
				++		if (pfn < args->start_pfn || pfn >= args->end_pfn)
			
 
				++			continue;
			
 
				++		/* only pages on the node and in the memcg given by args are processed */
			
 
				++		page = compound_head(pfn_to_page(pfn));
			
 
				++		if (page_to_nid(page) != args->node_id)
			
 
				++			continue;
			
 
				++
			
 
				++		if (page_memcg_rcu(page) != args->memcg)
			
 
				++			continue;
			
 
				++
			
 
				++		VM_BUG_ON(addr < walk->vma->vm_start || addr >= walk->vma->vm_end);
			
 
				++		if (!ptep_test_and_clear_young(walk->vma, addr, pte + i))
			
 
				++			continue;
			
 
				++
			
 
				++		args->mm_stats[MM_LEAF_YOUNG]++;
			
 
				++		/* propagate the PTE dirty bit to the page, except for swap-backed anon pages not in the swap cache */
			
 
				++		if (pte_dirty(pte[i]) && !PageDirty(page) &&
			
 
				++		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
			
 
				++			set_page_dirty(page);
			
 
				++
			
 
				++		old_gen = page_update_gen(page, new_gen);
			
 
				++		if (old_gen >= 0 && old_gen != new_gen)
			
 
				++			update_batch_size(page, old_gen, new_gen, args);
			
 
				++
			
 
				++		worth++;
			
 
				++	}
			
 
				++	/* table not finished: continue with the next VMA mapped by this same PTE table, if any */
			
 
				++	if (i < PTRS_PER_PTE && get_next_vma(walk, PMD_MASK, PAGE_SIZE, &start, &end))
			
 
				++		goto restart;
			
 
				++
			
 
				++	arch_leave_lazy_mmu_mode();
			
 
				++	pte_unmap_unlock(pte, ptl);
			
 
				++
			
 
				++	return worth >= MIN_BATCH_SIZE / 2;
			
 
				++}
			
 
				++/* walk_pmd_range_locked(): under the PMD lock, process the PMD entries flagged in args->bitmap, then zero the bitmap. */
			
 
				++/*
			
 
				++ * We scan PMD entries in two passes. The first pass reaches to PTE tables and
			
 
				++ * doesn't take the PMD lock. The second pass clears the accessed bit on PMD
			
 
				++ * entries and needs to take the PMD lock.
			
 
				++ */
			
 
				++#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
			
 
				++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset,
			
 
				++				  struct vm_area_struct *vma, struct mm_walk *walk)
			
 
				++{
			
 
				++	int i;
			
 
				++	pmd_t *pmd;
			
 
				++	spinlock_t *ptl;
			
 
				++	struct mm_walk_args *args = walk->private;
			
 
				++	int old_gen, new_gen = lru_gen_from_seq(args->max_seq);
			
 
				++
			
 
				++	VM_BUG_ON(pud_leaf(*pud));
			
 
				++	/* bit i of args->bitmap refers to the PMD entry at index offset + i of this table */
			
 
				++	start = (start & PUD_MASK) + offset * PMD_SIZE;
			
 
				++	pmd = pmd_offset(pud, start);
			
 
				++	ptl = pmd_lock(walk->mm, pmd);
			
 
				++	arch_enter_lazy_mmu_mode();
			
 
				++
			
 
				++	for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE) {
			
 
				++		struct page *page;
			
 
				++		unsigned long pfn = pmd_pfn(pmd[i]);
			
 
				++		unsigned long addr = start + i * PMD_SIZE;
			
 
				++
			
 
				++		if (!pmd_present(pmd[i]) || is_huge_zero_pmd(pmd[i]))
			
 
				++			continue;
			
 
				++
			
 
				++		if (WARN_ON_ONCE(pmd_devmap(pmd[i])))
			
 
				++			continue;
			
 
				++		/* not a huge entry: only clear its accessed bit, and only with CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
			
 
				++		if (!pmd_trans_huge(pmd[i])) {
			
 
				++			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
			
 
				++				pmdp_test_and_clear_young(vma, addr, pmd + i);
			
 
				++			continue;
			
 
				++		}
			
 
				++
			
 
				++		VM_BUG_ON(!pfn_valid(pfn));
			
 
				++		if (pfn < args->start_pfn || pfn >= args->end_pfn)
			
 
				++			continue;
			
 
				++
			
 
				++		page = pfn_to_page(pfn);
			
 
				++		VM_BUG_ON_PAGE(PageTail(page), page);
			
 
				++		if (page_to_nid(page) != args->node_id)
			
 
				++			continue;
			
 
				++
			
 
				++		if (page_memcg_rcu(page) != args->memcg)
			
 
				++			continue;
			
 
				++
			
 
				++		VM_BUG_ON(addr < vma->vm_start || addr >= vma->vm_end);
			
 
				++		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
			
 
				++			continue;
			
 
				++
			
 
				++		args->mm_stats[MM_LEAF_YOUNG]++;
			
 
				++
			
 
				++		if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
			
 
				++		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
			
 
				++			set_page_dirty(page);
			
 
				++
			
 
				++		old_gen = page_update_gen(page, new_gen);
			
 
				++		if (old_gen >= 0 && old_gen != new_gen)
			
 
				++			update_batch_size(page, old_gen, new_gen, args);
			
 
				++	}
			
 
				++
			
 
				++	arch_leave_lazy_mmu_mode();
			
 
				++	spin_unlock(ptl);
			
 
				++	/* the batch has been consumed */
			
 
				++	bitmap_zero(args->bitmap, MIN_BATCH_SIZE);
			
 
				++}
			
 
				++#else /* neither CONFIG_TRANSPARENT_HUGEPAGE nor CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG: empty stub */
			
 
				++static void walk_pmd_range_locked(pud_t *pud, unsigned long start, int offset,
			
 
				++				  struct vm_area_struct *vma, struct mm_walk *walk)
			
 
				++{
			
 
				++}
			
 
				++#endif
			
 
				++/* Lockless pass over a PMD table: scan PTE tables and batch entries into args->bitmap for walk_pmd_range_locked(). */
			
 
				++static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
			
 
				++			   struct mm_walk *walk)
			
 
				++{
			
 
				++	int i;
			
 
				++	pmd_t *pmd;
			
 
				++	unsigned long next;
			
 
				++	unsigned long addr;
			
 
				++	struct vm_area_struct *vma;
			
 
				++	int offset = -1;
			
 
				++	bool reset = false;
			
 
				++	struct mm_walk_args *args = walk->private;
			
 
				++	struct lruvec *lruvec = get_lruvec(args->node_id, args->memcg);
			
 
				++
			
 
				++	VM_BUG_ON(pud_leaf(*pud));
			
 
				++
			
 
				++	pmd = pmd_offset(pud, start & PUD_MASK);
			
 
				++restart:
			
 
				++	vma = walk->vma;
			
 
				++	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
			
 
				++		pmd_t val = pmd_read_atomic(pmd + i);
			
 
				++
			
 
				++		/* for pmd_read_atomic() */
			
 
				++		barrier();
			
 
				++
			
 
				++		next = pmd_addr_end(addr, end);
			
 
				++
			
 
				++		if (!pmd_present(val)) {
			
 
				++			args->mm_stats[MM_LEAF_TOTAL]++;
			
 
				++			continue;
			
 
				++		}
			
 
				++
			
 
				++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				++		if (pmd_trans_huge(val)) {
			
 
				++			unsigned long pfn = pmd_pfn(val);
			
 
				++
			
 
				++			args->mm_stats[MM_LEAF_TOTAL]++;
			
 
				++
			
 
				++			if (is_huge_zero_pmd(val))
			
 
				++				continue;
			
 
				++
			
 
				++			if (!pmd_young(val)) {
			
 
				++				args->mm_stats[MM_LEAF_OLD]++;
			
 
				++				continue;
			
 
				++			}
			
 
				++
			
 
				++			if (pfn < args->start_pfn || pfn >= args->end_pfn)
			
 
				++				continue;
			
 
				++			/* start a new batch, or flush the current one once it spans MIN_BATCH_SIZE entries */
			
 
				++			if (offset < 0)
			
 
				++				offset = i;
			
 
				++			else if (i - offset >= MIN_BATCH_SIZE) {
			
 
				++				walk_pmd_range_locked(pud, start, offset, vma, walk);
			
 
				++				offset = i;
			
 
				++			}
			
 
				++			__set_bit(i - offset, args->bitmap);
			
 
				++			reset = true;
			
 
				++			continue;
			
 
				++		}
			
 
				++#endif
			
 
				++		args->mm_stats[MM_NONLEAF_TOTAL]++;
			
 
				++
			
 
				++#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
			
 
				++		if (!pmd_young(val))
			
 
				++			continue;
			
 
				++
			
 
				++		if (offset < 0)
			
 
				++			offset = i;
			
 
				++		else if (i - offset >= MIN_BATCH_SIZE) {
			
 
				++			walk_pmd_range_locked(pud, start, offset, vma, walk);
			
 
				++			offset = i;
			
 
				++			reset = false;
			
 
				++		}
			
 
				++		__set_bit(i - offset, args->bitmap);
			
 
				++#endif
			
 
				++		if (args->use_filter && !test_bloom_filter(lruvec, args->max_seq, pmd + i))
			
 
				++			continue;
			
 
				++
			
 
				++		args->mm_stats[MM_NONLEAF_PREV]++;
			
 
				++
			
 
				++		if (!walk_pte_range(&val, addr, next, walk))
			
 
				++			continue;
			
 
				++
			
 
				++		args->mm_stats[MM_NONLEAF_CUR]++;
			
 
				++		/* walk_pte_range() returned true: record this entry in the filter for max_seq + 1 */
			
 
				++		set_bloom_filter(lruvec, args->max_seq + 1, pmd + i);
			
 
				++	}
			
 
				++	/* flush entries batched with reset set, using the VMA they were found under */
			
 
				++	if (reset) {
			
 
				++		walk_pmd_range_locked(pud, start, offset, vma, walk);
			
 
				++		offset = -1;
			
 
				++		reset = false;
			
 
				++	}
			
 
				++
			
 
				++	if (i < PTRS_PER_PMD && get_next_vma(walk, PUD_MASK, PMD_SIZE, &start, &end))
			
 
				++		goto restart;
			
 
				++	/* flush whatever is still batched */
			
 
				++	if (offset >= 0)
			
 
				++		walk_pmd_range_locked(pud, start, offset, vma, walk);
			
 
				++}
			
 
				++/* Walk the PUD entries from start to end, record the resume address in args->next_addr and return -EAGAIN. */
			
 
				++static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
			
 
				++			  struct mm_walk *walk)
			
 
				++{
			
 
				++	int i;
			
 
				++	pud_t *pud;
			
 
				++	unsigned long addr;
			
 
				++	unsigned long next;
			
 
				++	struct mm_walk_args *args = walk->private;
			
 
				++
			
 
				++	VM_BUG_ON(p4d_leaf(*p4d));
			
 
				++
			
 
				++	pud = pud_offset(p4d, start & P4D_MASK);
			
 
				++restart:
			
 
				++	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
			
 
				++		pud_t val = READ_ONCE(pud[i]);
			
 
				++
			
 
				++		next = pud_addr_end(addr, end);
			
 
				++
			
 
				++		if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
			
 
				++			continue;
			
 
				++
			
 
				++		walk_pmd_range(&val, addr, next, walk);
			
 
				++		/* batch is full: stop here and resume after this PUD entry */
			
 
				++		if (args->batch_size >= MAX_BATCH_SIZE) {
			
 
				++			end = (addr | ~PUD_MASK) + 1;
			
 
				++			goto done;
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	if (i < PTRS_PER_PUD && get_next_vma(walk, P4D_MASK, PUD_SIZE, &start, &end))
			
 
				++		goto restart;
			
 
				++
			
 
				++	end = round_up(end, P4D_SIZE);
			
 
				++done:
			
 
				++	/* rounded-up boundaries can wrap to 0 */
			
 
				++	args->next_addr = end && walk->vma ? max(end, walk->vma->vm_start) : 0;
			
 
				++	/* always -EAGAIN; next_addr is 0 when end wrapped to 0 or no VMA is left */
			
 
				++	return -EAGAIN;
			
 
				++}
			
 
				++
			
 
				++static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct mm_walk_args *args)
			
 
				++{
			
 
				++	static const struct mm_walk_ops mm_walk_ops = {
			
 
				++		.test_walk = should_skip_vma,
			
 
				++		.p4d_entry = walk_pud_range,
			
 
				++	};
			
 
				++
			
 
				++	int err;
			
 
				++
			
 
				++	args->next_addr = FIRST_USER_ADDRESS;
			
 
				++
			
 
				++	do {
			
 
				++		unsigned long start = args->next_addr;
			
 
				++		unsigned long end = mm->highest_vm_end;
			
 
				++
			
 
				++		err = -EBUSY;
			
 
				++
			
 
				++		rcu_read_lock();
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++		if (args->memcg && atomic_read(&args->memcg->moving_account))
			
 
				++			goto contended;
			
 
				++#endif
			
 
				++		if (!mmap_read_trylock(mm))
			
 
				++			goto contended;
			
 
				++
			
 
				++		err = walk_page_range(mm, start, end, &mm_walk_ops, args);
			
 
				++
			
 
				++		mmap_read_unlock(mm);
			
 
				++
			
 
				++		if (args->batch_size) {
			
 
				++			spin_lock_irq(&lruvec->lru_lock);
			
 
				++			reset_batch_size(lruvec, args);
			
 
				++			spin_unlock_irq(&lruvec->lru_lock);
			
 
				++		}
			
 
				++contended:
			
 
				++		rcu_read_unlock();
			
 
				++
			
 
				++		cond_resched();
			
 
				++	} while (err == -EAGAIN && args->next_addr && !mm_is_oom_victim(mm));
			
 
				++}
			
 
				++
			
 
				++static struct mm_walk_args *alloc_mm_walk_args(void)
			
 
				++{
			
 
				++	if (!current->reclaim_state || !current->reclaim_state->mm_walk_args)
			
 
				++		return kvzalloc(sizeof(struct mm_walk_args), GFP_KERNEL);
			
 
				++
			
 
				++	return current->reclaim_state->mm_walk_args;
			
 
				++}
			
 
				++
			
 
				++static void free_mm_walk_args(struct mm_walk_args *args)
			
 
				++{
			
 
				++	if (!current->reclaim_state || !current->reclaim_state->mm_walk_args)
			
 
				++		kvfree(args);
			
 
				++}
			
 
				++
			
 
				++static bool inc_min_seq(struct lruvec *lruvec, int type)
			
 
				++{
			
 
				++	int gen, zone;
			
 
				++	int remaining = MAX_BATCH_SIZE;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	VM_BUG_ON(!seq_is_valid(lruvec));
			
 
				++
			
 
				++	if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
			
 
				++		return true;
			
 
				++
			
 
				++	gen = lru_gen_from_seq(lrugen->min_seq[type]);
			
 
				++
			
 
				++	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			
 
				++		struct list_head *head = &lrugen->lists[gen][type][zone];
			
 
				++
			
 
				++		while (!list_empty(head)) {
			
 
				++			struct page *page = lru_to_page(head);
			
 
				++
			
 
				++			VM_BUG_ON_PAGE(PageTail(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageUnevictable(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageActive(page), page);
			
 
				++			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
			
 
				++			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
			
 
				++
			
 
				++			prefetchw_prev_lru_page(page, head, flags);
			
 
				++
			
 
				++			page_inc_gen(page, lruvec, false);
			
 
				++
			
 
				++			if (!--remaining)
			
 
				++				return false;
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness)
			
 
				++{
			
 
				++	int gen, type, zone;
			
 
				++	bool success = false;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	DEFINE_MIN_SEQ(lruvec);
			
 
				++
			
 
				++	VM_BUG_ON(!seq_is_valid(lruvec));
			
 
				++
			
 
				++	for (type = 0; type < ANON_AND_FILE; type++) {
			
 
				++		while (lrugen->max_seq - min_seq[type] >= MIN_NR_GENS) {
			
 
				++			gen = lru_gen_from_seq(min_seq[type]);
			
 
				++
			
 
				++			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			
 
				++				if (!list_empty(&lrugen->lists[gen][type][zone]))
			
 
				++					goto next;
			
 
				++			}
			
 
				++
			
 
				++			min_seq[type]++;
			
 
				++		}
			
 
				++next:
			
 
				++		;
			
 
				++	}
			
 
				++
			
 
				++	min_seq[0] = min(min_seq[0], min_seq[1]);
			
 
				++	if (swappiness)
			
 
				++		min_seq[1] = max(min_seq[0], lrugen->min_seq[1]);
			
 
				++
			
 
				++	for (type = 0; type < ANON_AND_FILE; type++) {
			
 
				++		if (min_seq[type] == lrugen->min_seq[type])
			
 
				++			continue;
			
 
				++
			
 
				++		WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
			
 
				++		success = true;
			
 
				++	}
			
 
				++
			
 
				++	return success;
			
 
				++}
			
 
				++
			
 
				++static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq)
			
 
				++{
			
 
				++	int gen, type, zone;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	spin_lock_irq(&lruvec->lru_lock);
			
 
				++
			
 
				++	VM_BUG_ON(!seq_is_valid(lruvec));
			
 
				++
			
 
				++	if (max_seq != lrugen->max_seq)
			
 
				++		goto unlock;
			
 
				++
			
 
				++	if (!try_to_inc_min_seq(lruvec, true)) {
			
 
				++		for (type = ANON_AND_FILE - 1; type >= 0; type--) {
			
 
				++			while (!inc_min_seq(lruvec, type)) {
			
 
				++				spin_unlock_irq(&lruvec->lru_lock);
			
 
				++				cond_resched();
			
 
				++				spin_lock_irq(&lruvec->lru_lock);
			
 
				++			}
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	gen = lru_gen_from_seq(lrugen->max_seq - 1);
			
 
				++	for (type = 0; type < ANON_AND_FILE; type++) {
			
 
				++		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			
 
				++			enum lru_list lru = type * LRU_FILE;
			
 
				++			long delta = lrugen->sizes[gen][type][zone];
			
 
				++
			
 
				++			if (!delta)
			
 
				++				continue;
			
 
				++
			
 
				++			WARN_ON_ONCE(delta != (int)delta);
			
 
				++
			
 
				++			update_lru_size(lruvec, lru, zone, delta);
			
 
				++			update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	gen = lru_gen_from_seq(lrugen->max_seq + 1);
			
 
				++	for (type = 0; type < ANON_AND_FILE; type++) {
			
 
				++		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			
 
				++			enum lru_list lru = type * LRU_FILE;
			
 
				++			long delta = lrugen->sizes[gen][type][zone];
			
 
				++
			
 
				++			if (!delta)
			
 
				++				continue;
			
 
				++
			
 
				++			WARN_ON_ONCE(delta != (int)delta);
			
 
				++
			
 
				++			update_lru_size(lruvec, lru, zone, -delta);
			
 
				++			update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	WRITE_ONCE(lrugen->timestamps[gen], jiffies);
			
 
				++	/* make sure all preceding modifications appear first */
			
 
				++	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
			
 
				++unlock:
			
 
				++	spin_unlock_irq(&lruvec->lru_lock);
			
 
				++}
			
 
				++
			
 
				++/* Main function used by the foreground, the background and the user-triggered aging. */
			
 
				++static bool try_to_inc_max_seq(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
			
 
				++			       unsigned long max_seq, bool use_filter)
			
 
				++{
			
 
				++	bool last;
			
 
				++	struct mm_walk_args *args;
			
 
				++	struct mm_struct *mm = NULL;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
			
 
				++	int nid = pgdat->node_id;
			
 
				++
			
 
				++	VM_BUG_ON(max_seq > READ_ONCE(lrugen->max_seq));
			
 
				++
			
 
				++	/*
			
 
				++	 * If we are not from run_aging() and clearing the accessed bit may
			
 
				++	 * trigger page faults, then don't proceed to clearing all accessed
			
 
				++	 * PTEs. Instead, fall back to lru_gen_look_around(), which only clears a
			
 
				++	 * handful of accessed PTEs. This is less efficient but causes fewer
			
 
				++	 * page faults on CPUs that don't have the capability.
			
 
				++	 */
			
 
				++	if ((current->flags & PF_MEMALLOC) && !arch_has_hw_pte_young(false)) {
			
 
				++		inc_max_seq(lruvec, max_seq);
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	args = alloc_mm_walk_args();
			
 
				++	if (!args)
			
 
				++		return false;
			
 
				++
			
 
				++	args->memcg = memcg;
			
 
				++	args->max_seq = max_seq;
			
 
				++	args->start_pfn = pgdat->node_start_pfn;
			
 
				++	args->end_pfn = pgdat_end_pfn(pgdat);
			
 
				++	args->node_id = nid;
			
 
				++	args->swappiness = swappiness;
			
 
				++	args->use_filter = use_filter;
			
 
				++
			
 
				++	do {
			
 
				++		last = get_next_mm(lruvec, args, &mm);
			
 
				++		if (mm)
			
 
				++			walk_mm(lruvec, mm, args);
			
 
				++
			
 
				++		cond_resched();
			
 
				++	} while (mm);
			
 
				++
			
 
				++	free_mm_walk_args(args);
			
 
				++
			
 
				++	if (!last) {
			
 
				++		/* don't wait unless we may have trouble reclaiming */
			
 
				++		if (!current_is_kswapd() && sc->priority < DEF_PRIORITY - 2)
			
 
				++			wait_event_killable(lruvec->mm_walk.wait,
			
 
				++					    max_seq < READ_ONCE(lrugen->max_seq));
			
 
				++
			
 
				++		return max_seq < READ_ONCE(lrugen->max_seq);
			
 
				++	}
			
 
				++
			
 
				++	VM_BUG_ON(max_seq != READ_ONCE(lrugen->max_seq));
			
 
				++
			
 
				++	inc_max_seq(lruvec, max_seq);
			
 
				++	/* either we see any waiters or they will see updated max_seq */
			
 
				++	if (wq_has_sleeper(&lruvec->mm_walk.wait))
			
 
				++		wake_up_all(&lruvec->mm_walk.wait);
			
 
				++
			
 
				++	wakeup_flusher_threads(WB_REASON_VMSCAN);
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++static long get_nr_evictable(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
			
 
				++			     unsigned long max_seq, unsigned long *min_seq, bool *low)
			
 
				++{
			
 
				++	int gen, type, zone;
			
 
				++	long max = 0;
			
 
				++	long min = 0;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	for (type = !swappiness; type < ANON_AND_FILE; type++) {
			
 
				++		unsigned long seq;
			
 
				++
			
 
				++		for (seq = min_seq[type]; seq <= max_seq; seq++) {
			
 
				++			long size = 0;
			
 
				++
			
 
				++			gen = lru_gen_from_seq(seq);
			
 
				++
			
 
				++			for (zone = 0; zone <= sc->reclaim_idx; zone++)
			
 
				++				size += READ_ONCE(lrugen->sizes[gen][type][zone]);
			
 
				++
			
 
				++			max += size;
			
 
				++			if (type && max_seq - seq >= MIN_NR_GENS)
			
 
				++				min += size;
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	*low = max_seq - min_seq[1] <= MIN_NR_GENS && min < MIN_BATCH_SIZE;
			
 
				++
			
 
				++	return max > 0 ? max : 0;
			
 
				++}
			
 
				++
			
 
				++static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc,
			
 
				++		       unsigned long min_ttl)
			
 
				++{
			
 
				++	bool low;
			
 
				++	long nr_to_scan;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++	int swappiness = get_swappiness(memcg);
			
 
				++	DEFINE_MAX_SEQ(lruvec);
			
 
				++	DEFINE_MIN_SEQ(lruvec);
			
 
				++
			
 
				++	if (mem_cgroup_below_min(memcg))
			
 
				++		return false;
			
 
				++
			
 
				++	if (min_ttl) {
			
 
				++		int gen = lru_gen_from_seq(min_seq[1]);
			
 
				++		unsigned long birth = READ_ONCE(lruvec->evictable.timestamps[gen]);
			
 
				++
			
 
				++		if (time_is_after_jiffies(birth + min_ttl))
			
 
				++			return false;
			
 
				++	}
			
 
				++
			
 
				++	nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
			
 
				++	if (!nr_to_scan)
			
 
				++		return false;
			
 
				++
			
 
				++	nr_to_scan >>= sc->priority;
			
 
				++
			
 
				++	if (!mem_cgroup_online(memcg))
			
 
				++		nr_to_scan++;
			
 
				++
			
 
				++	if (nr_to_scan && low && (!mem_cgroup_below_low(memcg) || sc->memcg_low_reclaim))
			
 
				++		try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true);
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++/* Protect the working set accessed within the last N milliseconds. */
			
 
				++static unsigned long lru_gen_min_ttl __read_mostly;
			
 
				++
			
 
				++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
			
 
				++{
			
 
				++	struct mem_cgroup *memcg;
			
 
				++	bool success = false;
			
 
				++	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
			
 
				++
			
 
				++	VM_BUG_ON(!current_is_kswapd());
			
 
				++
			
 
				++	if (!sc->force_deactivate) {
			
 
				++		sc->force_deactivate = 1;
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				++	current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
			
 
				++
			
 
				++	memcg = mem_cgroup_iter(NULL, NULL, NULL);
			
 
				++	do {
			
 
				++		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
			
 
				++
			
 
				++		if (age_lruvec(lruvec, sc, min_ttl))
			
 
				++			success = true;
			
 
				++
			
 
				++		cond_resched();
			
 
				++	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
			
 
				++
			
 
				++	if (!success && mutex_trylock(&oom_lock)) {
			
 
				++		struct oom_control oc = {
			
 
				++			.gfp_mask = sc->gfp_mask,
			
 
				++			.order = sc->order,
			
 
				++		};
			
 
				++
			
 
				++		/* to avoid overkilling */
			
 
				++		if (!oom_reaping_in_progress())
			
 
				++			out_of_memory(&oc);
			
 
				++
			
 
				++		mutex_unlock(&oom_lock);
			
 
				++	}
			
 
				++
			
 
				++	current->reclaim_state->mm_walk_args = NULL;
			
 
				++}
			
 
				++
			
 
				++/* Scan the vicinity of an accessed PTE when shrink_page_list() uses the rmap. */
			
 
				++void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
			
 
				++{
			
 
				++	int i;
			
 
				++	pte_t *pte;
			
 
				++	struct page *page;
			
 
				++	int old_gen, new_gen;
			
 
				++	unsigned long start;
			
 
				++	unsigned long end;
			
 
				++	unsigned long addr;
			
 
				++	struct mm_walk_args *args;
			
 
				++	int worth = 0;
			
 
				++	struct mem_cgroup *memcg = page_memcg(pvmw->page);
			
 
				++	struct pglist_data *pgdat = page_pgdat(pvmw->page);
			
 
				++	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
			
 
				++	DEFINE_MAX_SEQ(lruvec);
			
 
				++
			
 
				++	lockdep_assert_held(pvmw->ptl);
			
 
				++	VM_BUG_ON_PAGE(PageLRU(pvmw->page), pvmw->page);
			
 
				++
			
 
				++	args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
			
 
				++	if (!args)
			
 
				++		return;
			
 
				++
			
 
				++	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
			
 
				++	end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
			
 
				++
			
 
				++	if (end - start > MIN_BATCH_SIZE * PAGE_SIZE) {
			
 
				++		if (pvmw->address - start < MIN_BATCH_SIZE * PAGE_SIZE / 2)
			
 
				++			end = start + MIN_BATCH_SIZE * PAGE_SIZE;
			
 
				++		else if (end - pvmw->address < MIN_BATCH_SIZE * PAGE_SIZE / 2)
			
 
				++			start = end - MIN_BATCH_SIZE * PAGE_SIZE;
			
 
				++		else {
			
 
				++			start = pvmw->address - MIN_BATCH_SIZE * PAGE_SIZE / 2;
			
 
				++			end = pvmw->address + MIN_BATCH_SIZE * PAGE_SIZE / 2;
			
 
				++		}
			
 
				++	}
			
 
				++
			
 
				++	pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
			
 
				++	new_gen = lru_gen_from_seq(max_seq);
			
 
				++
			
 
				++	lock_page_memcg(pvmw->page);
			
 
				++	arch_enter_lazy_mmu_mode();
			
 
				++
			
 
				++	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
			
 
				++		unsigned long pfn = pte_pfn(pte[i]);
			
 
				++
			
 
				++		if (!pte_present(pte[i]) || is_zero_pfn(pfn))
			
 
				++			continue;
			
 
				++
			
 
				++		if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
			
 
				++			continue;
			
 
				++
			
 
				++		VM_BUG_ON(!pfn_valid(pfn));
			
 
				++		if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
			
 
				++			continue;
			
 
				++
			
 
				++		worth++;
			
 
				++
			
 
				++		if (!pte_young(pte[i]))
			
 
				++			continue;
			
 
				++
			
 
				++		page = compound_head(pfn_to_page(pfn));
			
 
				++		if (page_to_nid(page) != pgdat->node_id)
			
 
				++			continue;
			
 
				++
			
 
				++		if (page_memcg_rcu(page) != memcg)
			
 
				++			continue;
			
 
				++
			
 
				++		VM_BUG_ON(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end);
			
 
				++		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
			
 
				++			continue;
			
 
				++
			
 
				++		if (pte_dirty(pte[i]) && !PageDirty(page) &&
			
 
				++		    !(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page)))
			
 
				++			__set_bit(i, args->bitmap);
			
 
				++
			
 
				++		old_gen = page_update_gen(page, new_gen);
			
 
				++		if (old_gen >= 0 && old_gen != new_gen)
			
 
				++			update_batch_size(page, old_gen, new_gen, args);
			
 
				++	}
			
 
				++
			
 
				++	arch_leave_lazy_mmu_mode();
			
 
				++	unlock_page_memcg(pvmw->page);
			
 
				++
			
 
				++	if (worth >= MIN_BATCH_SIZE / 2)
			
 
				++		set_bloom_filter(lruvec, max_seq, pvmw->pmd);
			
 
				++
			
 
				++	for_each_set_bit(i, args->bitmap, MIN_BATCH_SIZE)
			
 
				++		set_page_dirty(pte_page(pte[i]));
			
 
				++
			
 
				++	bitmap_zero(args->bitmap, MIN_BATCH_SIZE);
			
 
				++}
			
 
				++
			
 
				++/******************************************************************************
			
 
				+  *                          state change
			
 
				+  ******************************************************************************/
			
 
				+ 
			
 
				+@@ -3477,6 +4414,12 @@ static int __init init_lru_gen(void)
			
 
				+ };
			
 
				+ late_initcall(init_lru_gen);
			
 
				+ 
			
 
				++#else
			
 
				++
			
 
				++static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				+ #endif /* CONFIG_LRU_GEN */
			
 
				+ 
			
 
				+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
			
 
				+@@ -4333,6 +5276,11 @@ static void age_active_anon(struct pglis
			
 
				+ 	struct mem_cgroup *memcg;
			
 
				+ 	struct lruvec *lruvec;
			
 
				+ 
			
 
				++	if (lru_gen_enabled()) {
			
 
				++		lru_gen_age_node(pgdat, sc);
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				+ 	if (!can_age_anon_pages(pgdat, sc))
			
 
				+ 		return;
			
 
				+ 
			
--- a/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch
+++ b/target/linux/generic/pending-5.15/020-06-mm-multigenerational-lru-eviction.patch
@@ -0,0 +1,1002 @@
 
				+From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Tue, 29 Jun 2021 20:46:47 -0600
			
 
				+Subject: [PATCH 07/10] mm: multigenerational lru: eviction
			
 
				+
			
 
				+The eviction consumes old generations. Given an lruvec, the eviction
			
 
				+scans pages on lrugen->lists indexed by anon and file min_seq[]
			
 
				+(modulo MAX_NR_GENS). It first tries to select a type based on the
			
 
				+values of min_seq[]. If they are equal, it selects the type that has
			
 
				+a lower refaulted %. The eviction sorts a page according to its
			
 
				+updated generation number if the aging has found this page accessed.
			
 
				+It also moves a page to the next generation if this page is from an
			
 
				+upper tier that has a higher refaulted % than the base tier. The
			
 
				+eviction increments min_seq[] of a selected type when it finds
			
 
				+lrugen->lists indexed by min_seq[] of this selected type are empty.
			
 
				+
			
 
				+Each generation is divided into multiple tiers. Tiers represent
			
 
				+different ranges of numbers of accesses from file descriptors only.
			
 
				+Pages accessed N times via file descriptors belong to tier
			
 
				+order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers,
			
 
				+and they require additional MAX_NR_TIERS-2 bits in page->flags. In
			
 
				+contrast to moving between generations which requires list operations,
			
 
				+moving between tiers only involves operations on page->flags and
			
 
				+therefore has a negligible cost. A feedback loop modeled after the PID
			
 
				+controller monitors refaulted % across all tiers and decides when to
			
 
				+protect pages from which tiers.
			
 
				+
			
 
				+Unmapped pages are initially added to the oldest generation and then
			
 
				+conditionally protected by tiers. Each tier keeps track of how many
			
 
				+pages from it have refaulted. Tier 0 is the base tier and pages from
			
 
				+it are evicted unconditionally because there are no better candidates.
			
 
				+Pages from an upper tier are either evicted or moved to the next
			
 
				+generation, depending on whether this upper tier has a higher
			
 
				+refaulted % than the base tier. This model has the following
			
 
				+advantages:
			
 
				+  1) It removes the cost in the buffered access path and reduces the
			
 
				+  overall cost of protection because pages are conditionally protected
			
 
				+  in the reclaim path.
			
 
				+  2) It takes mapped pages into account and avoids overprotecting
			
 
				+  pages accessed multiple times via file descriptors.
			
 
				+  3) Additional tiers improve the protection of pages accessed more
			
 
				+  than twice.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
			
 
				+---
			
 
				+ include/linux/mm_inline.h |  10 +
			
 
				+ include/linux/mmzone.h    |  33 +++
			
 
				+ mm/swap.c                 |  42 +++
			
 
				+ mm/vmscan.c               | 555 +++++++++++++++++++++++++++++++++++++-
			
 
				+ mm/workingset.c           | 120 ++++++++-
			
 
				+ 5 files changed, 757 insertions(+), 3 deletions(-)
			
 
				+
			
 
				+--- a/include/linux/mm_inline.h
			
 
				++++ b/include/linux/mm_inline.h
			
 
				+@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi
			
 
				+ 	return seq % NR_HIST_GENS;
			
 
				+ }
			
 
				+ 
			
 
				++/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */
			
 
				++static inline int lru_tier_from_refs(int refs)
			
 
				++{
			
 
				++	VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
			
 
				++
			
 
				++	return order_base_2(refs + 1);
			
 
				++}
			
 
				++
			
 
				+ /* The youngest and the second youngest generations are counted as active. */
			
 
				+ static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
			
 
				+ {
			
 
				+@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru
			
 
				+ 		gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				+ 
			
 
				+ 		new_flags &= ~LRU_GEN_MASK;
			
 
				++		if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS)
			
 
				++			new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
			
 
				+ 		/* for shrink_page_list() */
			
 
				+ 		if (reclaiming)
			
 
				+ 			new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
			
 
				+--- a/include/linux/mmzone.h
			
 
				++++ b/include/linux/mmzone.h
			
 
				+@@ -319,6 +319,30 @@ struct page_vma_mapped_walk;
			
 
				+ #define MIN_NR_GENS		2
			
 
				+ #define MAX_NR_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
			
 
				+ 
			
 
				++/*
			
 
				++ * Each generation is divided into multiple tiers. Tiers represent different
			
 
				++ * ranges of numbers of accesses from file descriptors, i.e.,
			
 
				++ * mark_page_accessed(). In contrast to moving between generations which
			
 
				++ * requires the lru lock, moving between tiers only involves an atomic
			
 
				++ * operation on page->flags and therefore has a negligible cost.
			
 
				++ *
			
 
				++ * The purposes of tiers are to:
			
 
				++ *   1) estimate whether pages accessed multiple times via file descriptors are
			
 
				++ *   more active than pages accessed only via page tables by separating the two
			
 
				++ *   access types into upper tiers and the base tier, and comparing refaulted %
			
 
				++ *   across all tiers.
			
 
				++ *   2) improve buffered io performance by deferring the protection of pages
			
 
				++ *   accessed multiple times until the eviction. That is, the protection happens
			
 
				++ *   in the reclaim path, not the access path.
			
 
				++ *
			
 
				++ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
			
 
				++ * The base tier may be marked by PageReferenced(). All upper tiers are marked
			
 
				++ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
			
 
				++ * used to support more than one upper tier.
			
 
				++ */
			
 
				++#define MAX_NR_TIERS		((unsigned int)CONFIG_TIERS_PER_GEN)
			
 
				++#define LRU_REFS_FLAGS		(BIT(PG_referenced) | BIT(PG_workingset))
			
 
				++
			
 
				+ /* Whether to keep stats for historical generations. */
			
 
				+ #ifdef CONFIG_LRU_GEN_STATS
			
 
				+ #define NR_HIST_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
			
 
				+@@ -337,6 +361,15 @@ struct lrugen {
			
 
				+ 	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
			
 
				+ 	/* the sizes of the multigenerational lru lists in pages */
			
 
				+ 	unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
			
 
				++	/* the exponential moving average of refaulted */
			
 
				++	unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
			
 
				++	/* the exponential moving average of protected+evicted */
			
 
				++	unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
			
 
				++	/* the base tier isn't protected, hence the minus one */
			
 
				++	unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
			
 
				++	/* incremented without holding the lru lock */
			
 
				++	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
			
 
				++	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
			
 
				+ 	/* whether the multigenerational lru is enabled */
			
 
				+ 	bool enabled[ANON_AND_FILE];
			
 
				+ };
			
 
				+--- a/mm/swap.c
			
 
				++++ b/mm/swap.c
			
 
				+@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st
			
 
				+ 	local_unlock(&lru_pvecs.lock);
			
 
				+ }
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++static void page_inc_refs(struct page *page)
			
 
				++{
			
 
				++	unsigned long refs;
			
 
				++	unsigned long old_flags, new_flags;
			
 
				++
			
 
				++	if (PageUnevictable(page))
			
 
				++		return;
			
 
				++
			
 
				++	/* see the comment on MAX_NR_TIERS */
			
 
				++	do {
			
 
				++		new_flags = old_flags = READ_ONCE(page->flags);
			
 
				++
			
 
				++		if (!(new_flags & BIT(PG_referenced))) {
			
 
				++			new_flags |= BIT(PG_referenced);
			
 
				++			continue;
			
 
				++		}
			
 
				++
			
 
				++		if (!(new_flags & BIT(PG_workingset))) {
			
 
				++			new_flags |= BIT(PG_workingset);
			
 
				++			continue;
			
 
				++		}
			
 
				++
			
 
				++		refs = new_flags & LRU_REFS_MASK;
			
 
				++		refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
			
 
				++
			
 
				++		new_flags &= ~LRU_REFS_MASK;
			
 
				++		new_flags |= refs;
			
 
				++	} while (new_flags != old_flags &&
			
 
				++		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
			
 
				++}
			
 
				++#else
			
 
				++static void page_inc_refs(struct page *page)
			
 
				++{
			
 
				++}
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ /*
			
 
				+  * Mark a page as having seen activity.
			
 
				+  *
			
 
				+@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag
			
 
				+ {
			
 
				+ 	page = compound_head(page);
			
 
				+ 
			
 
				++	if (lru_gen_enabled()) {
			
 
				++		page_inc_refs(page);
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				+ 	if (!PageReferenced(page)) {
			
 
				+ 		SetPageReferenced(page);
			
 
				+ 	} else if (PageUnevictable(page)) {
			
 
				+--- a/mm/vmscan.c
			
 
				++++ b/mm/vmscan.c
			
 
				+@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre
			
 
				+ 
			
 
				+ 	if (PageSwapCache(page)) {
			
 
				+ 		swp_entry_t swap = { .val = page_private(page) };
			
 
				+-		mem_cgroup_swapout(page, swap);
			
 
				++
			
 
				++		/* get a shadow entry before page_memcg() is cleared */
			
 
				+ 		if (reclaimed && !mapping_exiting(mapping))
			
 
				+ 			shadow = workingset_eviction(page, target_memcg);
			
 
				++		mem_cgroup_swapout(page, swap);
			
 
				+ 		__delete_from_swap_cache(page, swap, shadow);
			
 
				+ 		xa_unlock_irq(&mapping->i_pages);
			
 
				+ 		put_swap_page(page, swap);
			
 
				+@@ -1410,6 +1412,11 @@ retry:
			
 
				+ 		if (!sc->may_unmap && page_mapped(page))
			
 
				+ 			goto keep_locked;
			
 
				+ 
			
 
				++		/* lru_gen_look_around() has updated this page? */
			
 
				++		if (lru_gen_enabled() && !ignore_references &&
			
 
				++		    page_mapped(page) && PageReferenced(page))
			
 
				++			goto keep_locked;
			
 
				++
			
 
				+ 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			
 
				+ 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
			
 
				+ 
			
 
				+@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t
			
 
				+ 	unsigned long file;
			
 
				+ 	struct lruvec *target_lruvec;
			
 
				+ 
			
 
				++	if (lru_gen_enabled())
			
 
				++		return;
			
 
				++
			
 
				+ 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
			
 
				+ 
			
 
				+ 	/*
			
 
				+@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag
			
 
				+ 	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
			
 
				+ }
			
 
				+ 
			
 
				++static int page_lru_tier(struct page *page)
			
 
				++{
			
 
				++	int refs;
			
 
				++	unsigned long flags = READ_ONCE(page->flags);
			
 
				++
			
 
				++	refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
			
 
				++	       ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
			
 
				++
			
 
				++	return lru_tier_from_refs(refs);
			
 
				++}
			
 
				++
			
 
				+ static int get_swappiness(struct mem_cgroup *memcg)
			
 
				+ {
			
 
				+ 	return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
			
 
				+@@ -3246,6 +3267,91 @@ done:
			
 
				+ }
			
 
				+ 
			
 
				+ /******************************************************************************
			
 
				++ *                          refault feedback loop
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++/*
			
 
				++ * A feedback loop modeled after the PID controller. Currently supports the
			
 
				++ * proportional (P) and the integral (I) terms; the derivative (D) term can be
			
 
				++ * added if necessary. The setpoint (SP) is the desired position; the process
			
 
				++ * variable (PV) is the measured position. The error is the difference between
			
 
				++ * the SP and the PV. A positive error results in a positive control output
			
 
				++ * correction, which, in our case, is to allow eviction.
			
 
				++ *
			
 
				++ * The P term is refaulted % of the current generation being evicted. The I
			
 
				++ * term is the exponential moving average of refaulted % of previously evicted
			
 
				++ * generations, using the smoothing factor 1/2.
			
 
				++ *
			
 
				++ * Our goal is to maintain proportional refaulted % across all tiers.
			
 
				++ */
			
 
				++struct ctrl_pos {
			
 
				++	unsigned long refaulted;
			
 
				++	unsigned long total;
			
 
				++	int gain;
			
 
				++};
			
 
				++
			
 
				++static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
			
 
				++			  struct ctrl_pos *pos)
			
 
				++{
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
			
 
				++
			
 
				++	pos->refaulted = lrugen->avg_refaulted[type][tier] +
			
 
				++			 atomic_long_read(&lrugen->refaulted[hist][type][tier]);
			
 
				++	pos->total = lrugen->avg_total[type][tier] +
			
 
				++		     atomic_long_read(&lrugen->evicted[hist][type][tier]);
			
 
				++	if (tier)
			
 
				++		pos->total += lrugen->protected[hist][type][tier - 1];
			
 
				++	pos->gain = gain;
			
 
				++}
			
 
				++
			
 
				++static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type)
			
 
				++{
			
 
				++	int tier;
			
 
				++	int hist = lru_hist_from_seq(gen);
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
			
 
				++	bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
			
 
				++
			
 
				++	if (!carryover && !clear)
			
 
				++		return;
			
 
				++
			
 
				++	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
			
 
				++		if (carryover) {
			
 
				++			unsigned long sum;
			
 
				++
			
 
				++			sum = lrugen->avg_refaulted[type][tier] +
			
 
				++			      atomic_long_read(&lrugen->refaulted[hist][type][tier]);
			
 
				++			WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
			
 
				++
			
 
				++			sum = lrugen->avg_total[type][tier] +
			
 
				++			      atomic_long_read(&lrugen->evicted[hist][type][tier]);
			
 
				++			if (tier)
			
 
				++				sum += lrugen->protected[hist][type][tier - 1];
			
 
				++			WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
			
 
				++		}
			
 
				++
			
 
				++		if (clear) {
			
 
				++			atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
			
 
				++			atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
			
 
				++			if (tier)
			
 
				++				WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
			
 
				++		}
			
 
				++	}
			
 
				++}
			
 
				++
			
 
				++static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
			
 
				++{
			
 
				++	/*
			
 
				++	 * Allow eviction if the PV has a limited number of refaulted pages or a
			
 
				++	 * lower refaulted % than the SP.
			
 
				++	 */
			
 
				++	return pv->refaulted < MIN_BATCH_SIZE ||
			
 
				++	       pv->refaulted * max(sp->total, 1UL) * sp->gain <=
			
 
				++	       sp->refaulted * max(pv->total, 1UL) * pv->gain;
			
 
				++}
			
 
				++
			
 
				++/******************************************************************************
			
 
				+  *                          the aging
			
 
				+  ******************************************************************************/
			
 
				+ 
			
 
				+@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page *
			
 
				+ 
			
 
				+ 		new_flags &= ~LRU_GEN_MASK;
			
 
				+ 		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
			
 
				++		new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
			
 
				+ 	} while (new_flags != old_flags &&
			
 
				+ 		 cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
			
 
				+ 
			
 
				+@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa
			
 
				+ 
			
 
				+ 		new_flags &= ~LRU_GEN_MASK;
			
 
				+ 		new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
			
 
				++		new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
			
 
				+ 		/* for end_page_writeback() */
			
 
				+ 		if (reclaiming)
			
 
				+ 			new_flags |= BIT(PG_reclaim);
			
 
				+@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l
			
 
				+ 		}
			
 
				+ 	}
			
 
				+ 
			
 
				++	reset_ctrl_pos(lruvec, gen, type);
			
 
				+ 	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
			
 
				+ 
			
 
				+ 	return true;
			
 
				+@@ -3824,6 +3933,8 @@ next:
			
 
				+ 		if (min_seq[type] == lrugen->min_seq[type])
			
 
				+ 			continue;
			
 
				+ 
			
 
				++		gen = lru_gen_from_seq(lrugen->min_seq[type]);
			
 
				++		reset_ctrl_pos(lruvec, gen, type);
			
 
				+ 		WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
			
 
				+ 		success = true;
			
 
				+ 	}
			
 
				+@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l
			
 
				+ 		}
			
 
				+ 	}
			
 
				+ 
			
 
				++	for (type = 0; type < ANON_AND_FILE; type++)
			
 
				++		reset_ctrl_pos(lruvec, gen, type);
			
 
				++
			
 
				+ 	WRITE_ONCE(lrugen->timestamps[gen], jiffies);
			
 
				+ 	/* make sure all preceding modifications appear first */
			
 
				+ 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
			
 
				+@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma
			
 
				+ }
			
 
				+ 
			
 
				+ /******************************************************************************
			
 
				++ *                          the eviction
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx)
			
 
				++{
			
 
				++	bool success;
			
 
				++	int gen = page_lru_gen(page);
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int zone = page_zonenum(page);
			
 
				++	int tier = page_lru_tier(page);
			
 
				++	int delta = thp_nr_pages(page);
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
			
 
				++
			
 
				++	/* an mlocked page? */
			
 
				++	if (!page_evictable(page)) {
			
 
				++		success = lru_gen_del_page(page, lruvec, true);
			
 
				++		VM_BUG_ON_PAGE(!success, page);
			
 
				++		SetPageUnevictable(page);
			
 
				++		add_page_to_lru_list(page, lruvec);
			
 
				++		__count_vm_events(UNEVICTABLE_PGCULLED, delta);
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	/* a lazy-free page that has been written into? */
			
 
				++	if (type && PageDirty(page) && PageAnon(page)) {
			
 
				++		success = lru_gen_del_page(page, lruvec, true);
			
 
				++		VM_BUG_ON_PAGE(!success, page);
			
 
				++		SetPageSwapBacked(page);
			
 
				++		add_page_to_lru_list_tail(page, lruvec);
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	/* page_update_gen() has updated this page? */
			
 
				++	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
			
 
				++		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	/* protect this page if its tier has a higher refaulted % */
			
 
				++	if (tier > tier_idx) {
			
 
				++		int hist = lru_hist_from_seq(gen);
			
 
				++
			
 
				++		page_inc_gen(page, lruvec, false);
			
 
				++		WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
			
 
				++			   lrugen->protected[hist][type][tier - 1] + delta);
			
 
				++		__mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	/* mark this page for reclaim if it's pending writeback */
			
 
				++	if (PageWriteback(page) || (type && PageDirty(page))) {
			
 
				++		page_inc_gen(page, lruvec, true);
			
 
				++		return true;
			
 
				++	}
			
 
				++
			
 
				++	return false;
			
 
				++}
			
 
				++
			
 
				++static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc)
			
 
				++{
			
 
				++	bool success;
			
 
				++
			
 
				++	if (!sc->may_unmap && page_mapped(page))
			
 
				++		return false;
			
 
				++
			
 
				++	if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
			
 
				++	    (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
			
 
				++		return false;
			
 
				++
			
 
				++	if (!get_page_unless_zero(page))
			
 
				++		return false;
			
 
				++
			
 
				++	if (!TestClearPageLRU(page)) {
			
 
				++		put_page(page);
			
 
				++		return false;
			
 
				++	}
			
 
				++
			
 
				++	success = lru_gen_del_page(page, lruvec, true);
			
 
				++	VM_BUG_ON_PAGE(!success, page);
			
 
				++
			
 
				++	return true;
			
 
				++}
			
 
				++
			
 
				++static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
			
 
				++		      int type, int tier, struct list_head *list)
			
 
				++{
			
 
				++	int gen, zone;
			
 
				++	enum vm_event_item item;
			
 
				++	int sorted = 0;
			
 
				++	int scanned = 0;
			
 
				++	int isolated = 0;
			
 
				++	int remaining = MAX_BATCH_SIZE;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++
			
 
				++	VM_BUG_ON(!list_empty(list));
			
 
				++
			
 
				++	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
			
 
				++		return 0;
			
 
				++
			
 
				++	gen = lru_gen_from_seq(lrugen->min_seq[type]);
			
 
				++
			
 
				++	for (zone = sc->reclaim_idx; zone >= 0; zone--) {
			
 
				++		LIST_HEAD(moved);
			
 
				++		int skipped = 0;
			
 
				++		struct list_head *head = &lrugen->lists[gen][type][zone];
			
 
				++
			
 
				++		while (!list_empty(head)) {
			
 
				++			struct page *page = lru_to_page(head);
			
 
				++			int delta = thp_nr_pages(page);
			
 
				++
			
 
				++			VM_BUG_ON_PAGE(PageTail(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageUnevictable(page), page);
			
 
				++			VM_BUG_ON_PAGE(PageActive(page), page);
			
 
				++			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
			
 
				++			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
			
 
				++
			
 
				++			prefetchw_prev_lru_page(page, head, flags);
			
 
				++
			
 
				++			scanned += delta;
			
 
				++
			
 
				++			if (sort_page(page, lruvec, tier))
			
 
				++				sorted += delta;
			
 
				++			else if (isolate_page(page, lruvec, sc)) {
			
 
				++				list_add(&page->lru, list);
			
 
				++				isolated += delta;
			
 
				++			} else {
			
 
				++				list_move(&page->lru, &moved);
			
 
				++				skipped += delta;
			
 
				++			}
			
 
				++
			
 
				++			if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE)
			
 
				++				break;
			
 
				++		}
			
 
				++
			
 
				++		if (skipped) {
			
 
				++			list_splice(&moved, head);
			
 
				++			__count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
			
 
				++		}
			
 
				++
			
 
				++		if (!remaining || isolated >= MIN_BATCH_SIZE)
			
 
				++			break;
			
 
				++	}
			
 
				++
			
 
				++	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
			
 
				++	if (!cgroup_reclaim(sc)) {
			
 
				++		__count_vm_events(item, isolated);
			
 
				++		__count_vm_events(PGREFILL, sorted);
			
 
				++	}
			
 
				++	__count_memcg_events(memcg, item, isolated);
			
 
				++	__count_memcg_events(memcg, PGREFILL, sorted);
			
 
				++	__count_vm_events(PGSCAN_ANON + type, isolated);
			
 
				++
			
 
				++	/*
			
 
				++	 * We may have trouble finding eligible pages due to reclaim_idx,
			
 
				++	 * may_unmap and may_writepage. Check `remaining` to make sure we won't
			
 
				++	 * be stuck if we aren't making enough progress.
			
 
				++	 */
			
 
				++	return isolated || !remaining ? scanned : 0;
			
 
				++}
			
 
				++
			
 
				++static int get_tier_idx(struct lruvec *lruvec, int type)
			
 
				++{
			
 
				++	int tier;
			
 
				++	struct ctrl_pos sp, pv;
			
 
				++
			
 
				++	/*
			
 
				++	 * Ideally we don't want to evict upper tiers that have higher refaulted
			
 
				++	 * %. However, we need to leave a margin for the fluctuation in
			
 
				++	 * refaulted %. So we use a larger gain factor to make sure upper tiers
			
 
				++	 * are indeed more active. We choose 2 because the lowest upper tier
			
 
				++	 * would have twice of refaulted % of the base tier, according to their
			
 
				++	 * numbers of accesses.
			
 
				++	 */
			
 
				++	read_ctrl_pos(lruvec, type, 0, 1, &sp);
			
 
				++	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
			
 
				++		read_ctrl_pos(lruvec, type, tier, 2, &pv);
			
 
				++		if (!positive_ctrl_err(&sp, &pv))
			
 
				++			break;
			
 
				++	}
			
 
				++
			
 
				++	return tier - 1;
			
 
				++}
			
 
				++
			
 
				++static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
			
 
				++{
			
 
				++	int type, tier;
			
 
				++	struct ctrl_pos sp, pv;
			
 
				++	int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
			
 
				++
			
 
				++	/*
			
 
				++	 * Compare refaulted % between the base tiers of anon and file to
			
 
				++	 * determine which type to evict. Also need to compare refaulted % of
			
 
				++	 * the upper tiers of the selected type with that of the base tier of
			
 
				++	 * the other type to determine which tier of the selected type to evict.
			
 
				++	 */
			
 
				++	read_ctrl_pos(lruvec, 0, 0, gain[0], &sp);
			
 
				++	read_ctrl_pos(lruvec, 1, 0, gain[1], &pv);
			
 
				++	type = positive_ctrl_err(&sp, &pv);
			
 
				++
			
 
				++	read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
			
 
				++	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
			
 
				++		read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
			
 
				++		if (!positive_ctrl_err(&sp, &pv))
			
 
				++			break;
			
 
				++	}
			
 
				++
			
 
				++	*tier_idx = tier - 1;
			
 
				++
			
 
				++	return type;
			
 
				++}
			
 
				++
			
 
				++static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
			
 
				++			 int *type_scanned, struct list_head *list)
			
 
				++{
			
 
				++	int i;
			
 
				++	int type;
			
 
				++	int scanned;
			
 
				++	int tier = -1;
			
 
				++	DEFINE_MIN_SEQ(lruvec);
			
 
				++
			
 
				++	VM_BUG_ON(!seq_is_valid(lruvec));
			
 
				++
			
 
				++	/*
			
 
				++	 * Try to select a type based on generations and swappiness, and if that
			
 
				++	 * fails, fall back to get_type_to_scan(). When anon and file are both
			
 
				++	 * available from the same generation, swappiness 200 is interpreted as
			
 
				++	 * anon first and swappiness 1 is interpreted as file first.
			
 
				++	 */
			
 
				++	if (!swappiness)
			
 
				++		type = 1;
			
 
				++	else if (min_seq[0] < min_seq[1])
			
 
				++		type = 0;
			
 
				++	else if (swappiness == 1)
			
 
				++		type = 1;
			
 
				++	else if (swappiness == 200)
			
 
				++		type = 0;
			
 
				++	else
			
 
				++		type = get_type_to_scan(lruvec, swappiness, &tier);
			
 
				++
			
 
				++	for (i = !swappiness; i < ANON_AND_FILE; i++) {
			
 
				++		if (tier < 0)
			
 
				++			tier = get_tier_idx(lruvec, type);
			
 
				++
			
 
				++		scanned = scan_pages(lruvec, sc, type, tier, list);
			
 
				++		if (scanned)
			
 
				++			break;
			
 
				++
			
 
				++		type = !type;
			
 
				++		tier = -1;
			
 
				++	}
			
 
				++
			
 
				++	*type_scanned = type;
			
 
				++
			
 
				++	return scanned;
			
 
				++}
			
 
				++
			
 
				++/* Main function used by the foreground, the background and the user-triggered eviction. */
			
 
				++static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
			
 
				++{
			
 
				++	int type;
			
 
				++	int scanned;
			
 
				++	int reclaimed;
			
 
				++	LIST_HEAD(list);
			
 
				++	struct page *page;
			
 
				++	enum vm_event_item item;
			
 
				++	struct reclaim_stat stat;
			
 
				++	struct mm_walk_args *args;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
			
 
				++
			
 
				++	spin_lock_irq(&lruvec->lru_lock);
			
 
				++
			
 
				++	scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
			
 
				++
			
 
				++	if (try_to_inc_min_seq(lruvec, swappiness))
			
 
				++		scanned++;
			
 
				++
			
 
				++	if (get_nr_gens(lruvec, 1) == MIN_NR_GENS)
			
 
				++		scanned = 0;
			
 
				++
			
 
				++	spin_unlock_irq(&lruvec->lru_lock);
			
 
				++
			
 
				++	if (list_empty(&list))
			
 
				++		return scanned;
			
 
				++
			
 
				++	reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
			
 
				++	/*
			
 
				++	 * We need to prevent rejected pages from being added back to the same
			
 
				++	 * lists they were isolated from. Otherwise we may risk looping on them
			
 
				++	 * forever.
			
 
				++	 */
			
 
				++	list_for_each_entry(page, &list, lru) {
			
 
				++		if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page)))
			
 
				++			SetPageActive(page);
			
 
				++
			
 
				++		ClearPageReferenced(page);
			
 
				++		ClearPageWorkingset(page);
			
 
				++	}
			
 
				++
			
 
				++	spin_lock_irq(&lruvec->lru_lock);
			
 
				++
			
 
				++	move_pages_to_lru(lruvec, &list);
			
 
				++
			
 
				++	args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
			
 
				++	if (args && args->batch_size)
			
 
				++		reset_batch_size(lruvec, args);
			
 
				++
			
 
				++	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
			
 
				++	if (!cgroup_reclaim(sc))
			
 
				++		__count_vm_events(item, reclaimed);
			
 
				++	__count_memcg_events(memcg, item, reclaimed);
			
 
				++	__count_vm_events(PGSTEAL_ANON + type, reclaimed);
			
 
				++
			
 
				++	spin_unlock_irq(&lruvec->lru_lock);
			
 
				++
			
 
				++	mem_cgroup_uncharge_list(&list);
			
 
				++	free_unref_page_list(&list);
			
 
				++
			
 
				++	sc->nr_reclaimed += reclaimed;
			
 
				++
			
 
				++	return scanned;
			
 
				++}
			
 
				++
			
 
				++static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
			
 
				++{
			
 
				++	bool low;
			
 
				++	long nr_to_scan;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++	int priority = sc->priority;
			
 
				++	DEFINE_MAX_SEQ(lruvec);
			
 
				++	DEFINE_MIN_SEQ(lruvec);
			
 
				++
			
 
				++	if (mem_cgroup_below_min(memcg) ||
			
 
				++	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
			
 
				++		return 0;
			
 
				++
			
 
				++	if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			
 
				++		priority = DEF_PRIORITY;
			
 
				++		sc->force_deactivate = 0;
			
 
				++	}
			
 
				++
			
 
				++	nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
			
 
				++	if (!nr_to_scan)
			
 
				++		return 0;
			
 
				++
			
 
				++	nr_to_scan >>= priority;
			
 
				++
			
 
				++	if (!mem_cgroup_online(memcg))
			
 
				++		nr_to_scan++;
			
 
				++
			
 
				++	if (!nr_to_scan)
			
 
				++		return 0;
			
 
				++
			
 
				++	if (current_is_kswapd()) {
			
 
				++		/* leave the work to lru_gen_age_node() */
			
 
				++		if (max_seq - min_seq[1] < MIN_NR_GENS)
			
 
				++			return 0;
			
 
				++
			
 
				++		if (!low)
			
 
				++			sc->force_deactivate = 0;
			
 
				++
			
 
				++		return nr_to_scan;
			
 
				++	}
			
 
				++
			
 
				++	if (max_seq - min_seq[1] >= MIN_NR_GENS)
			
 
				++		return nr_to_scan;
			
 
				++
			
 
				++	/* move onto slab and other memcgs if we haven't tried them all */
			
 
				++	if (!sc->force_deactivate) {
			
 
				++		sc->skipped_deactivate = 1;
			
 
				++		return 0;
			
 
				++	}
			
 
				++
			
 
				++	return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0;
			
 
				++}
			
 
				++
			
 
				++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
			
 
				++{
			
 
				++	struct blk_plug plug;
			
 
				++	long scanned = 0;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
			
 
				++
			
 
				++	lru_add_drain();
			
 
				++
			
 
				++	if (current_is_kswapd())
			
 
				++		current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
			
 
				++
			
 
				++	blk_start_plug(&plug);
			
 
				++
			
 
				++	while (true) {
			
 
				++		int delta;
			
 
				++		int swappiness;
			
 
				++		long nr_to_scan;
			
 
				++
			
 
				++		if (sc->may_swap)
			
 
				++			swappiness = get_swappiness(memcg);
			
 
				++		else if (!cgroup_reclaim(sc) && get_swappiness(memcg))
			
 
				++			swappiness = 1;
			
 
				++		else
			
 
				++			swappiness = 0;
			
 
				++
			
 
				++		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
			
 
				++		if (!nr_to_scan)
			
 
				++			break;
			
 
				++
			
 
				++		delta = evict_pages(lruvec, sc, swappiness);
			
 
				++		if (!delta)
			
 
				++			break;
			
 
				++
			
 
				++		scanned += delta;
			
 
				++		if (scanned >= nr_to_scan)
			
 
				++			break;
			
 
				++
			
 
				++		cond_resched();
			
 
				++	}
			
 
				++
			
 
				++	blk_finish_plug(&plug);
			
 
				++
			
 
				++	if (current_is_kswapd())
			
 
				++		current->reclaim_state->mm_walk_args = NULL;
			
 
				++}
			
 
				++
			
 
				++/******************************************************************************
			
 
				+  *                          state change
			
 
				+  ******************************************************************************/
			
 
				+ 
			
 
				+@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli
			
 
				+ {
			
 
				+ }
			
 
				+ 
			
 
				++static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				+ #endif /* CONFIG_LRU_GEN */
			
 
				+ 
			
 
				+ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
			
 
				+@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec
			
 
				+ 	struct blk_plug plug;
			
 
				+ 	bool scan_adjusted;
			
 
				+ 
			
 
				++	if (lru_gen_enabled()) {
			
 
				++		lru_gen_shrink_lruvec(lruvec, sc);
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				+ 	get_scan_count(lruvec, sc, nr);
			
 
				+ 
			
 
				+ 	/* Record the original scan target for proportional adjustments later */
			
 
				+@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem
			
 
				+ 	struct lruvec *target_lruvec;
			
 
				+ 	unsigned long refaults;
			
 
				+ 
			
 
				++	if (lru_gen_enabled())
			
 
				++		return;
			
 
				++
			
 
				+ 	target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
			
 
				+ 	refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
			
 
				+ 	target_lruvec->refaults[0] = refaults;
			
 
				+--- a/mm/workingset.c
			
 
				++++ b/mm/workingset.c
			
 
				+@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
			
 
				+ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
			
 
				+ 			 bool workingset)
			
 
				+ {
			
 
				+-	eviction >>= bucket_order;
			
 
				+ 	eviction &= EVICTION_MASK;
			
 
				+ 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
			
 
				+ 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
			
 
				+@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow,
			
 
				+ 
			
 
				+ 	*memcgidp = memcgid;
			
 
				+ 	*pgdat = NODE_DATA(nid);
			
 
				+-	*evictionp = entry << bucket_order;
			
 
				++	*evictionp = entry;
			
 
				+ 	*workingsetp = workingset;
			
 
				+ }
			
 
				+ 
			
 
				++#ifdef CONFIG_LRU_GEN
			
 
				++
			
 
				++static int page_lru_refs(struct page *page)
			
 
				++{
			
 
				++	unsigned long flags = READ_ONCE(page->flags);
			
 
				++
			
 
				++	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
			
 
				++
			
 
				++	/* see the comment on MAX_NR_TIERS */
			
 
				++	return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
			
 
				++}
			
 
				++
			
 
				++/* Return a token to be stored in the shadow entry of a page being evicted. */
			
 
				++static void *lru_gen_eviction(struct page *page)
			
 
				++{
			
 
				++	int hist, tier;
			
 
				++	unsigned long token;
			
 
				++	unsigned long min_seq;
			
 
				++	struct lruvec *lruvec;
			
 
				++	struct lrugen *lrugen;
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int refs = page_lru_refs(page);
			
 
				++	int delta = thp_nr_pages(page);
			
 
				++	bool workingset = PageWorkingset(page);
			
 
				++	struct mem_cgroup *memcg = page_memcg(page);
			
 
				++	struct pglist_data *pgdat = page_pgdat(page);
			
 
				++
			
 
				++	lruvec = mem_cgroup_lruvec(memcg, pgdat);
			
 
				++	lrugen = &lruvec->evictable;
			
 
				++	min_seq = READ_ONCE(lrugen->min_seq[type]);
			
 
				++	token = (min_seq << LRU_REFS_WIDTH) | refs;
			
 
				++
			
 
				++	hist = lru_hist_from_seq(min_seq);
			
 
				++	tier = lru_tier_from_refs(refs + workingset);
			
 
				++	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
			
 
				++
			
 
				++	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
			
 
				++}
			
 
				++
			
 
				++/* Count a refaulted page based on the token stored in its shadow entry. */
			
 
				++static void lru_gen_refault(struct page *page, void *shadow)
			
 
				++{
			
 
				++	int hist, tier, refs;
			
 
				++	int memcg_id;
			
 
				++	bool workingset;
			
 
				++	unsigned long token;
			
 
				++	unsigned long min_seq;
			
 
				++	struct lruvec *lruvec;
			
 
				++	struct lrugen *lrugen;
			
 
				++	struct mem_cgroup *memcg;
			
 
				++	struct pglist_data *pgdat;
			
 
				++	int type = page_is_file_lru(page);
			
 
				++	int delta = thp_nr_pages(page);
			
 
				++
			
 
				++	unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
			
 
				++	if (page_pgdat(page) != pgdat)
			
 
				++		return;
			
 
				++
			
 
				++	rcu_read_lock();
			
 
				++	memcg = page_memcg_rcu(page);
			
 
				++	if (mem_cgroup_id(memcg) != memcg_id)
			
 
				++		goto unlock;
			
 
				++
			
 
				++	refs = token & (BIT(LRU_REFS_WIDTH) - 1);
			
 
				++	if (refs && !workingset)
			
 
				++		goto unlock;
			
 
				++
			
 
				++	token >>= LRU_REFS_WIDTH;
			
 
				++	lruvec = mem_cgroup_lruvec(memcg, pgdat);
			
 
				++	lrugen = &lruvec->evictable;
			
 
				++	min_seq = READ_ONCE(lrugen->min_seq[type]);
			
 
				++	if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
			
 
				++		goto unlock;
			
 
				++
			
 
				++	hist = lru_hist_from_seq(min_seq);
			
 
				++	tier = lru_tier_from_refs(refs + workingset);
			
 
				++	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
			
 
				++	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
			
 
				++
			
 
				++	/*
			
 
				++	 * Tiers don't offer any protection to pages accessed via page tables.
			
 
				++	 * That's what generations do. Tiers can't fully protect pages after
			
 
				++	 * their numbers of accesses has exceeded the max value. Conservatively
			
 
				++	 * count these two conditions as stalls even though they might not
			
 
				++	 * indicate any real memory pressure.
			
 
				++	 */
			
 
				++	if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
			
 
				++		SetPageWorkingset(page);
			
 
				++		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
			
 
				++	}
			
 
				++unlock:
			
 
				++	rcu_read_unlock();
			
 
				++}
			
 
				++
			
 
				++#else
			
 
				++
			
 
				++static void *lru_gen_eviction(struct page *page)
			
 
				++{
			
 
				++	return NULL;
			
 
				++}
			
 
				++
			
 
				++static void lru_gen_refault(struct page *page, void *shadow)
			
 
				++{
			
 
				++}
			
 
				++
			
 
				++#endif /* CONFIG_LRU_GEN */
			
 
				++
			
 
				+ /**
			
 
				+  * workingset_age_nonresident - age non-resident entries as LRU ages
			
 
				+  * @lruvec: the lruvec that was aged
			
 
				+@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p
			
 
				+ 	VM_BUG_ON_PAGE(page_count(page), page);
			
 
				+ 	VM_BUG_ON_PAGE(!PageLocked(page), page);
			
 
				+ 
			
 
				++	if (lru_gen_enabled())
			
 
				++		return lru_gen_eviction(page);
			
 
				++
			
 
				+ 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
			
 
				+ 	/* XXX: target_memcg can be NULL, go through lruvec */
			
 
				+ 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
			
 
				+ 	eviction = atomic_long_read(&lruvec->nonresident_age);
			
 
				++	eviction >>= bucket_order;
			
 
				+ 	workingset_age_nonresident(lruvec, thp_nr_pages(page));
			
 
				+ 	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
			
 
				+ }
			
 
				+@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag
			
 
				+ 	bool workingset;
			
 
				+ 	int memcgid;
			
 
				+ 
			
 
				++	if (lru_gen_enabled()) {
			
 
				++		lru_gen_refault(page, shadow);
			
 
				++		return;
			
 
				++	}
			
 
				++
			
 
				+ 	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
			
 
				++	eviction <<= bucket_order;
			
 
				+ 
			
 
				+ 	rcu_read_lock();
			
 
				+ 	/*
			
--- a/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch
+++ b/target/linux/generic/pending-5.15/020-07-mm-multigenerational-lru-user-interface.patch
@@ -0,0 +1,496 @@
 
				+From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Mon, 25 Jan 2021 21:38:02 -0700
			
 
				+Subject: [PATCH 08/10] mm: multigenerational lru: user interface
			
 
				+
			
 
				+Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
			
 
				+multigenerational lru at runtime.
			
 
				+
			
 
				+Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a
			
 
				+given number of milliseconds. The OOM killer is invoked if this
			
 
				+working set cannot be kept in memory.
			
 
				+
			
 
				+Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
			
 
				+invoke the aging and the eviction. This file has the following output:
			
 
				+  memcg  memcg_id  memcg_path
			
 
				+    node  node_id
			
 
				+      min_gen  birth_time  anon_size  file_size
			
 
				+      ...
			
 
				+      max_gen  birth_time  anon_size  file_size
			
 
				+
			
 
				+min_gen is the oldest generation number and max_gen is the youngest
			
 
				+generation number. birth_time is in milliseconds. anon_size and
			
 
				+file_size are in pages.
			
 
				+
			
 
				+This file takes the following input:
			
 
				+  + memcg_id node_id max_gen [swappiness] [use_bloom_filter]
			
 
				+  - memcg_id node_id min_gen [swappiness] [nr_to_reclaim]
			
 
				+
			
 
				+The first command line invokes the aging, which scans PTEs for
			
 
				+accessed pages and then creates the next generation max_gen+1. A swap
			
 
				+file and a non-zero swappiness, which overrides vm.swappiness, are
			
 
				+required to scan PTEs mapping anon pages. The second command line
			
 
				+invokes the eviction, which evicts generations less than or equal to
			
 
				+min_gen. min_gen should be less than max_gen-1 as max_gen and
			
 
				+max_gen-1 are not fully aged and therefore cannot be evicted.
			
 
				+Setting nr_to_reclaim to N limits the number of pages to evict.
			
 
				+Setting use_bloom_filter to 0 overrides the default behavior which
			
 
				+only scans PTE tables found populated. Multiple command lines are
			
 
				+supported, as is concatenation with delimiters "," and ";".
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
			
 
				+---
			
 
				+ include/linux/nodemask.h |   1 +
			
 
				+ mm/vmscan.c              | 415 +++++++++++++++++++++++++++++++++++++++
			
 
				+ 2 files changed, 416 insertions(+)
			
 
				+
			
 
				+--- a/include/linux/nodemask.h
			
 
				++++ b/include/linux/nodemask.h
			
 
				+@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
			
 
				+ #define first_online_node	0
			
 
				+ #define first_memory_node	0
			
 
				+ #define next_online_node(nid)	(MAX_NUMNODES)
			
 
				++#define next_memory_node(nid)	(MAX_NUMNODES)
			
 
				+ #define nr_node_ids		1U
			
 
				+ #define nr_online_nodes		1U
			
 
				+ 
			
 
				+--- a/mm/vmscan.c
			
 
				++++ b/mm/vmscan.c
			
 
				+@@ -53,6 +53,8 @@
			
 
				+ #include <linux/memory.h>
			
 
				+ #include <linux/pagewalk.h>
			
 
				+ #include <linux/shmem_fs.h>
			
 
				++#include <linux/ctype.h>
			
 
				++#include <linux/debugfs.h>
			
 
				+ 
			
 
				+ #include <asm/tlbflush.h>
			
 
				+ #include <asm/div64.h>
			
 
				+@@ -4882,6 +4884,413 @@ unlock:
			
 
				+ }
			
 
				+ 
			
 
				+ /******************************************************************************
			
 
				++ *                          sysfs interface
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
			
 
				++{
			
 
				++	return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
			
 
				++}
			
 
				++
			
 
				++static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
			
 
				++			     const char *buf, size_t len)
			
 
				++{
			
 
				++	unsigned int msecs;
			
 
				++
			
 
				++	if (kstrtouint(buf, 10, &msecs))
			
 
				++		return -EINVAL;
			
 
				++
			
 
				++	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
			
 
				++
			
 
				++	return len;
			
 
				++}
			
 
				++
			
 
				++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
			
 
				++	min_ttl_ms, 0644, show_min_ttl, store_min_ttl
			
 
				++);
			
 
				++
			
 
				++static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
			
 
				++{
			
 
				++	return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
			
 
				++}
			
 
				++
			
 
				++static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
			
 
				++			    const char *buf, size_t len)
			
 
				++{
			
 
				++	bool enable;
			
 
				++
			
 
				++	if (kstrtobool(buf, &enable))
			
 
				++		return -EINVAL;
			
 
				++
			
 
				++	lru_gen_change_state(enable, true, false);
			
 
				++
			
 
				++	return len;
			
 
				++}
			
 
				++
			
 
				++static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
			
 
				++	enabled, 0644, show_enable, store_enable
			
 
				++);
			
 
				++
			
 
				++static struct attribute *lru_gen_attrs[] = {
			
 
				++	&lru_gen_min_ttl_attr.attr,
			
 
				++	&lru_gen_enabled_attr.attr,
			
 
				++	NULL
			
 
				++};
			
 
				++
			
 
				++static struct attribute_group lru_gen_attr_group = {
			
 
				++	.name = "lru_gen",
			
 
				++	.attrs = lru_gen_attrs,
			
 
				++};
			
 
				++
			
 
				++/******************************************************************************
			
 
				++ *                          debugfs interface
			
 
				++ ******************************************************************************/
			
 
				++
			
 
				++static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
			
 
				++{
			
 
				++	struct mem_cgroup *memcg;
			
 
				++	loff_t nr_to_skip = *pos;
			
 
				++
			
 
				++	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
			
 
				++	if (!m->private)
			
 
				++		return ERR_PTR(-ENOMEM);
			
 
				++
			
 
				++	memcg = mem_cgroup_iter(NULL, NULL, NULL);
			
 
				++	do {
			
 
				++		int nid;
			
 
				++
			
 
				++		for_each_node_state(nid, N_MEMORY) {
			
 
				++			if (!nr_to_skip--)
			
 
				++				return get_lruvec(nid, memcg);
			
 
				++		}
			
 
				++	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
			
 
				++
			
 
				++	return NULL;
			
 
				++}
			
 
				++
			
 
				++static void lru_gen_seq_stop(struct seq_file *m, void *v)
			
 
				++{
			
 
				++	if (!IS_ERR_OR_NULL(v))
			
 
				++		mem_cgroup_iter_break(NULL, lruvec_memcg(v));
			
 
				++
			
 
				++	kvfree(m->private);
			
 
				++	m->private = NULL;
			
 
				++}
			
 
				++
			
 
				++static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
			
 
				++{
			
 
				++	int nid = lruvec_pgdat(v)->node_id;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(v);
			
 
				++
			
 
				++	++*pos;
			
 
				++
			
 
				++	nid = next_memory_node(nid);
			
 
				++	if (nid == MAX_NUMNODES) {
			
 
				++		memcg = mem_cgroup_iter(NULL, memcg, NULL);
			
 
				++		if (!memcg)
			
 
				++			return NULL;
			
 
				++
			
 
				++		nid = first_memory_node;
			
 
				++	}
			
 
				++
			
 
				++	return get_lruvec(nid, memcg);
			
 
				++}
			
 
				++
			
 
				++static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
			
 
				++				  unsigned long max_seq, unsigned long *min_seq,
			
 
				++				  unsigned long seq)
			
 
				++{
			
 
				++	int i;
			
 
				++	int type, tier;
			
 
				++	int hist = lru_hist_from_seq(seq);
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++
			
 
				++	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
			
 
				++		seq_printf(m, "            %10d", tier);
			
 
				++		for (type = 0; type < ANON_AND_FILE; type++) {
			
 
				++			unsigned long n[3] = {};
			
 
				++
			
 
				++			if (seq == max_seq) {
			
 
				++				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
			
 
				++				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
			
 
				++
			
 
				++				seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
			
 
				++			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
			
 
				++				n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
			
 
				++				n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
			
 
				++				if (tier)
			
 
				++					n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
			
 
				++
			
 
				++				seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
			
 
				++			} else
			
 
				++				seq_puts(m, "          0           0           0 ");
			
 
				++		}
			
 
				++		seq_putc(m, '\n');
			
 
				++	}
			
 
				++
			
 
				++	seq_puts(m, "                      ");
			
 
				++	for (i = 0; i < NR_MM_STATS; i++) {
			
 
				++		if (seq == max_seq && NR_HIST_GENS == 1)
			
 
				++			seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
			
 
				++				   toupper(MM_STAT_CODES[i]));
			
 
				++		else if (seq != max_seq && NR_HIST_GENS > 1)
			
 
				++			seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
			
 
				++				   MM_STAT_CODES[i]);
			
 
				++		else
			
 
				++			seq_puts(m, "          0 ");
			
 
				++	}
			
 
				++	seq_putc(m, '\n');
			
 
				++}
			
 
				++
			
 
				++static int lru_gen_seq_show(struct seq_file *m, void *v)
			
 
				++{
			
 
				++	unsigned long seq;
			
 
				++	bool full = !debugfs_real_fops(m->file)->write;
			
 
				++	struct lruvec *lruvec = v;
			
 
				++	struct lrugen *lrugen = &lruvec->evictable;
			
 
				++	int nid = lruvec_pgdat(lruvec)->node_id;
			
 
				++	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
			
 
				++	DEFINE_MAX_SEQ(lruvec);
			
 
				++	DEFINE_MIN_SEQ(lruvec);
			
 
				++
			
 
				++	if (nid == first_memory_node) {
			
 
				++		const char *path = memcg ? m->private : "";
			
 
				++
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++		if (memcg)
			
 
				++			cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
			
 
				++#endif
			
 
				++		seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
			
 
				++	}
			
 
				++
			
 
				++	seq_printf(m, " node %5d\n", nid);
			
 
				++
			
 
				++	if (!full)
			
 
				++		seq = min_seq[0];
			
 
				++	else if (max_seq >= MAX_NR_GENS)
			
 
				++		seq = max_seq - MAX_NR_GENS + 1;
			
 
				++	else
			
 
				++		seq = 0;
			
 
				++
			
 
				++	for (; seq <= max_seq; seq++) {
			
 
				++		int gen, type, zone;
			
 
				++		unsigned int msecs;
			
 
				++
			
 
				++		gen = lru_gen_from_seq(seq);
			
 
				++		msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
			
 
				++
			
 
				++		seq_printf(m, " %10lu %10u", seq, msecs);
			
 
				++
			
 
				++		for (type = 0; type < ANON_AND_FILE; type++) {
			
 
				++			long size = 0;
			
 
				++
			
 
				++			if (seq < min_seq[type]) {
			
 
				++				seq_puts(m, "         -0 ");
			
 
				++				continue;
			
 
				++			}
			
 
				++
			
 
				++			for (zone = 0; zone < MAX_NR_ZONES; zone++)
			
 
				++				size += READ_ONCE(lrugen->sizes[gen][type][zone]);
			
 
				++
			
 
				++			seq_printf(m, " %10lu ", max(size, 0L));
			
 
				++		}
			
 
				++
			
 
				++		seq_putc(m, '\n');
			
 
				++
			
 
				++		if (full)
			
 
				++			lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
			
 
				++	}
			
 
				++
			
 
				++	return 0;
			
 
				++}
			
 
				++
			
 
				++static const struct seq_operations lru_gen_seq_ops = {
			
 
				++	.start = lru_gen_seq_start,
			
 
				++	.stop = lru_gen_seq_stop,
			
 
				++	.next = lru_gen_seq_next,
			
 
				++	.show = lru_gen_seq_show,
			
 
				++};
			
 
				++
			
 
				++static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
			
 
				++		     unsigned long seq, bool use_filter)
			
 
				++{
			
 
				++	DEFINE_MAX_SEQ(lruvec);
			
 
				++
			
 
				++	if (seq == max_seq)
			
 
				++		try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
			
 
				++
			
 
				++	return seq > max_seq ? -EINVAL : 0;
			
 
				++}
			
 
				++
			
 
				++static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
			
 
				++			unsigned long seq, unsigned long nr_to_reclaim)
			
 
				++{
			
 
				++	struct blk_plug plug;
			
 
				++	int err = -EINTR;
			
 
				++	DEFINE_MAX_SEQ(lruvec);
			
 
				++
			
 
				++	if (seq >= max_seq - 1)
			
 
				++		return -EINVAL;
			
 
				++
			
 
				++	sc->nr_reclaimed = 0;
			
 
				++
			
 
				++	blk_start_plug(&plug);
			
 
				++
			
 
				++	while (!signal_pending(current)) {
			
 
				++		DEFINE_MIN_SEQ(lruvec);
			
 
				++
			
 
				++		if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
			
 
				++		    !evict_pages(lruvec, sc, swappiness)) {
			
 
				++			err = 0;
			
 
				++			break;
			
 
				++		}
			
 
				++
			
 
				++		cond_resched();
			
 
				++	}
			
 
				++
			
 
				++	blk_finish_plug(&plug);
			
 
				++
			
 
				++	return err;
			
 
				++}
			
 
				++
			
 
				++static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
			
 
				++		   int swappiness, unsigned long seq, unsigned long opt)
			
 
				++{
			
 
				++	struct lruvec *lruvec;
			
 
				++	int err = -EINVAL;
			
 
				++	struct mem_cgroup *memcg = NULL;
			
 
				++
			
 
				++	if (!mem_cgroup_disabled()) {
			
 
				++		rcu_read_lock();
			
 
				++		memcg = mem_cgroup_from_id(memcg_id);
			
 
				++#ifdef CONFIG_MEMCG
			
 
				++		if (memcg && !css_tryget(&memcg->css))
			
 
				++			memcg = NULL;
			
 
				++#endif
			
 
				++		rcu_read_unlock();
			
 
				++
			
 
				++		if (!memcg)
			
 
				++			goto done;
			
 
				++	}
			
 
				++	if (memcg_id != mem_cgroup_id(memcg))
			
 
				++		goto done;
			
 
				++
			
 
				++	if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
			
 
				++		goto done;
			
 
				++
			
 
				++	lruvec = get_lruvec(nid, memcg);
			
 
				++
			
 
				++	if (swappiness < 0)
			
 
				++		swappiness = get_swappiness(memcg);
			
 
				++	else if (swappiness > 200)
			
 
				++		goto done;
			
 
				++
			
 
				++	switch (cmd) {
			
 
				++	case '+':
			
 
				++		err = run_aging(lruvec, sc, swappiness, seq, opt);
			
 
				++		break;
			
 
				++	case '-':
			
 
				++		err = run_eviction(lruvec, sc, swappiness, seq, opt);
			
 
				++		break;
			
 
				++	}
			
 
				++done:
			
 
				++	mem_cgroup_put(memcg);
			
 
				++
			
 
				++	return err;
			
 
				++}
			
 
				++
			
 
				++static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
			
 
				++				 size_t len, loff_t *pos)
			
 
				++{
			
 
				++	void *buf;
			
 
				++	char *cur, *next;
			
 
				++	unsigned int flags;
			
 
				++	int err = 0;
			
 
				++	struct scan_control sc = {
			
 
				++		.may_writepage = 1,
			
 
				++		.may_unmap = 1,
			
 
				++		.may_swap = 1,
			
 
				++		.reclaim_idx = MAX_NR_ZONES - 1,
			
 
				++		.gfp_mask = GFP_KERNEL,
			
 
				++	};
			
 
				++
			
 
				++	buf = kvmalloc(len + 1, GFP_KERNEL);
			
 
				++	if (!buf)
			
 
				++		return -ENOMEM;
			
 
				++
			
 
				++	if (copy_from_user(buf, src, len)) {
			
 
				++		kvfree(buf);
			
 
				++		return -EFAULT;
			
 
				++	}
			
 
				++
			
 
				++	next = buf;
			
 
				++	next[len] = '\0';
			
 
				++
			
 
				++	sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
			
 
				++	if (!sc.reclaim_state.mm_walk_args) {
			
 
				++		kvfree(buf);
			
 
				++		return -ENOMEM;
			
 
				++	}
			
 
				++
			
 
				++	flags = memalloc_noreclaim_save();
			
 
				++	set_task_reclaim_state(current, &sc.reclaim_state);
			
 
				++
			
 
				++	while ((cur = strsep(&next, ",;\n"))) {
			
 
				++		int n;
			
 
				++		int end;
			
 
				++		char cmd;
			
 
				++		unsigned int memcg_id;
			
 
				++		unsigned int nid;
			
 
				++		unsigned long seq;
			
 
				++		unsigned int swappiness = -1;
			
 
				++		unsigned long opt = -1;
			
 
				++
			
 
				++		cur = skip_spaces(cur);
			
 
				++		if (!*cur)
			
 
				++			continue;
			
 
				++
			
 
				++		n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
			
 
				++			   &seq, &end, &swappiness, &end, &opt, &end);
			
 
				++		if (n < 4 || cur[end]) {
			
 
				++			err = -EINVAL;
			
 
				++			break;
			
 
				++		}
			
 
				++
			
 
				++		err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
			
 
				++		if (err)
			
 
				++			break;
			
 
				++	}
			
 
				++
			
 
				++	set_task_reclaim_state(current, NULL);
			
 
				++	memalloc_noreclaim_restore(flags);
			
 
				++
			
 
				++	free_mm_walk_args(sc.reclaim_state.mm_walk_args);
			
 
				++	kvfree(buf);
			
 
				++
			
 
				++	return err ? : len;
			
 
				++}
			
 
				++
			
 
				++static int lru_gen_seq_open(struct inode *inode, struct file *file)
			
 
				++{
			
 
				++	return seq_open(file, &lru_gen_seq_ops);
			
 
				++}
			
 
				++
			
 
				++static const struct file_operations lru_gen_rw_fops = {
			
 
				++	.open = lru_gen_seq_open,
			
 
				++	.read = seq_read,
			
 
				++	.write = lru_gen_seq_write,
			
 
				++	.llseek = seq_lseek,
			
 
				++	.release = seq_release,
			
 
				++};
			
 
				++
			
 
				++static const struct file_operations lru_gen_ro_fops = {
			
 
				++	.open = lru_gen_seq_open,
			
 
				++	.read = seq_read,
			
 
				++	.llseek = seq_lseek,
			
 
				++	.release = seq_release,
			
 
				++};
			
 
				++
			
 
				++/******************************************************************************
			
 
				+  *                          initialization
			
 
				+  ******************************************************************************/
			
 
				+ 
			
 
				+@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void)
			
 
				+ 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
			
 
				+ 	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
			
 
				+ 
			
 
				++	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
			
 
				++		pr_err("lru_gen: failed to create sysfs group\n");
			
 
				++
			
 
				++	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
			
 
				++	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
			
 
				++
			
 
				+ 	return 0;
			
 
				+ };
			
 
				+ late_initcall(init_lru_gen);
			
--- a/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch
+++ b/target/linux/generic/pending-5.15/020-08-mm-multigenerational-lru-Kconfig.patch
@@ -0,0 +1,80 @@
 
				+From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Mon, 25 Jan 2021 21:47:24 -0700
			
 
				+Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig
			
 
				+
			
 
				+Add configuration options for the multigenerational lru.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635
			
 
				+---
			
 
				+ mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
			
 
				+ 1 file changed, 59 insertions(+)
			
 
				+
			
 
				+--- a/mm/Kconfig
			
 
				++++ b/mm/Kconfig
			
 
				+@@ -899,4 +899,63 @@ config SECRETMEM
			
 
				+ 
			
 
				+ source "mm/damon/Kconfig"
			
 
				+ 
			
 
				++# the multigenerational lru {
			
 
				++config LRU_GEN
			
 
				++	bool "Multigenerational LRU"
			
 
				++	depends on MMU
			
 
				++	# the following options may leave not enough spare bits in page->flags
			
 
				++	depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
			
 
				++	help
			
 
				++	  A high performance LRU implementation to heavily overcommit workloads
			
 
				++	  that are not IO bound. See Documentation/vm/multigen_lru.rst for
			
 
				++	  details.
			
 
				++
			
 
				++	  Warning: do not enable this option unless you plan to use it because
			
 
				++	  it introduces a small per-process and per-memcg and per-node memory
			
 
				++	  overhead.
			
 
				++
			
 
				++config LRU_GEN_ENABLED
			
 
				++	bool "Turn on by default"
			
 
				++	depends on LRU_GEN
			
 
				++	help
			
 
				++	  The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
			
 
				++	  changes it to 1.
			
 
				++
			
 
				++	  Warning: the default value is the fast path. See
			
 
				++	  Documentation/staging/static-keys.rst for details.
			
 
				++
			
 
				++config LRU_GEN_STATS
			
 
				++	bool "Full stats for debugging"
			
 
				++	depends on LRU_GEN
			
 
				++	help
			
 
				++	  This option keeps full stats for each generation, which can be read
			
 
				++	  from /sys/kernel/debug/lru_gen_full.
			
 
				++
			
 
				++	  Warning: do not enable this option unless you plan to use it because
			
 
				++	  it introduces an additional small per-process and per-memcg and
			
 
				++	  per-node memory overhead.
			
 
				++
			
 
				++config NR_LRU_GENS
			
 
				++	int "Max number of generations"
			
 
				++	depends on LRU_GEN
			
 
				++	range 4 31
			
 
				++	default 7
			
 
				++	help
			
 
				++	  This will use order_base_2(N+1) spare bits from page flags.
			
 
				++
			
 
				++	  Warning: do not use numbers larger than necessary because each
			
 
				++	  generation introduces a small per-node and per-memcg memory overhead.
			
 
				++
			
 
				++config TIERS_PER_GEN
			
 
				++	int "Number of tiers per generation"
			
 
				++	depends on LRU_GEN
			
 
				++	range 2 5
			
 
				++	default 4
			
 
				++	help
			
 
				++	  This will use N-2 spare bits from page flags.
			
 
				++
			
 
				++	  Larger values generally offer better protection to active pages under
			
 
				++	  heavy buffered I/O workloads.
			
 
				++# }
			
 
				++
			
 
				+ endmenu
			
--- a/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch
+++ b/target/linux/generic/pending-5.15/020-09-mm-multigenerational-lru-documentation.patch
@@ -0,0 +1,161 @@
 
				+From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001
			
 
				+From: Yu Zhao <[email protected]>
			
 
				+Date: Tue, 2 Feb 2021 01:27:45 -0700
			
 
				+Subject: [PATCH 10/10] mm: multigenerational lru: documentation
			
 
				+
			
 
				+Add Documentation/vm/multigen_lru.rst.
			
 
				+
			
 
				+Signed-off-by: Yu Zhao <[email protected]>
			
 
				+Tested-by: Konstantin Kharlamov <[email protected]>
			
 
				+Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
			
 
				+---
			
 
				+ Documentation/vm/index.rst        |   1 +
			
 
				+ Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++
			
 
				+ 2 files changed, 133 insertions(+)
			
 
				+ create mode 100644 Documentation/vm/multigen_lru.rst
			
 
				+
			
 
				+--- a/Documentation/vm/index.rst
			
 
				++++ b/Documentation/vm/index.rst
			
 
				+@@ -17,6 +17,7 @@ various features of the Linux memory man
			
 
				+ 
			
 
				+    swap_numa
			
 
				+    zswap
			
 
				++   multigen_lru
			
 
				+ 
			
 
				+ Kernel developers MM documentation
			
 
				+ ==================================
			
 
				+--- /dev/null
			
 
				++++ b/Documentation/vm/multigen_lru.rst
			
 
				+@@ -0,0 +1,132 @@
			
 
				++.. SPDX-License-Identifier: GPL-2.0
			
 
				++
			
 
				++=====================
			
 
				++Multigenerational LRU
			
 
				++=====================
			
 
				++
			
 
				++Quick Start
			
 
				++===========
			
 
				++Build Configurations
			
 
				++--------------------
			
 
				++:Required: Set ``CONFIG_LRU_GEN=y``.
			
 
				++
			
 
				++:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
			
 
				++ default.
			
 
				++
			
 
				++Runtime Configurations
			
 
				++----------------------
			
 
				++:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enabled`` if the
			
 
				++ feature was not turned on by default.
			
 
				++
			
 
				++:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to
			
 
				++ protect the working set of ``N`` milliseconds. The OOM killer is
			
 
				++ invoked if this working set cannot be kept in memory.
			
 
				++
			
 
				++:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature
			
 
				++ is turned on. This file has the following output:
			
 
				++
			
 
				++::
			
 
				++
			
 
				++  memcg  memcg_id  memcg_path
			
 
				++    node  node_id
			
 
				++      min_gen  birth_time  anon_size  file_size
			
 
				++      ...
			
 
				++      max_gen  birth_time  anon_size  file_size
			
 
				++
			
 
				++``min_gen`` is the oldest generation number and ``max_gen`` is the
			
 
				++youngest generation number. ``birth_time`` is in milliseconds.
			
 
				++``anon_size`` and ``file_size`` are in pages.
			
 
				++
			
 
				++Phones/Laptops/Workstations
			
 
				++---------------------------
			
 
				++No additional configurations required.
			
 
				++
			
 
				++Servers/Data Centers
			
 
				++--------------------
			
 
				++:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a
			
 
				++ larger number.
			
 
				++
			
 
				++:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger
			
 
				++ number.
			
 
				++
			
 
				++:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``.
			
 
				++
			
 
				++:Working set estimation: Write ``+ memcg_id node_id max_gen
			
 
				++ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to
			
 
				++ invoke the aging, which scans PTEs for accessed pages and then
			
 
				++ creates the next generation ``max_gen+1``. A swap file and a non-zero
			
 
				++ ``swappiness``, which overrides ``vm.swappiness``, are required to
			
 
				++ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to
			
 
				++ override the default behavior which only scans PTE tables found
			
 
				++ populated.
			
 
				++
			
 
				++:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness]
			
 
				++ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the
			
 
				++ eviction, which evicts generations less than or equal to ``min_gen``.
			
 
				++ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and
			
 
				++ ``max_gen-1`` are not fully aged and therefore cannot be evicted.
			
 
				++ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
			
 
				++ command lines are supported, as is concatenation with delimiters
			
 
				++ ``,`` and ``;``.
			
 
				++
			
 
				++Framework
			
 
				++=========
			
 
				++For each ``lruvec``, evictable pages are divided into multiple
			
 
				++generations. The youngest generation number is stored in
			
 
				++``lrugen->max_seq`` for both anon and file types as they are aged on
			
 
				++an equal footing. The oldest generation numbers are stored in
			
 
				++``lrugen->min_seq[]`` separately for anon and file types as clean
			
 
				++file pages can be evicted regardless of swap and writeback
			
 
				++constraints. These three variables are monotonically increasing.
			
 
				++Generation numbers are truncated into
			
 
				++``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
			
 
				++``page->flags``. The sliding window technique is used to prevent
			
 
				++truncated generation numbers from overlapping. Each truncated
			
 
				++generation number is an index to an array of per-type and per-zone
			
 
				++lists ``lrugen->lists``.
			
 
				++
			
 
				++Each generation is divided into multiple tiers. Tiers represent
			
 
				++different ranges of numbers of accesses from file descriptors only.
			
 
				++Pages accessed ``N`` times via file descriptors belong to tier
			
 
				++``order_base_2(N)``. Each generation contains at most
			
 
				++``CONFIG_TIERS_PER_GEN`` tiers, and they require additional
			
 
				++``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to
			
 
				++moving between generations which requires list operations, moving
			
 
				++between tiers only involves operations on ``page->flags`` and
			
 
				++therefore has a negligible cost. A feedback loop modeled after the PID
			
 
				++controller monitors refaulted % across all tiers and decides when to
			
 
				++protect pages from which tiers.
			
 
				++
			
 
				++The framework comprises two conceptually independent components: the
			
 
				++aging and the eviction, which can be invoked separately from user
			
 
				++space for the purpose of working set estimation and proactive reclaim.
			
 
				++
			
 
				++Aging
			
 
				++-----
			
 
				++The aging produces young generations. Given an ``lruvec``, the aging
			
 
				++traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()``
			
 
				++to scan PTEs for accessed pages (a ``mm_struct`` list is maintained
			
 
				++for each ``memcg``). Upon finding one, the aging updates its
			
 
				++generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``).
			
 
				++After each round of traversal, the aging increments ``max_seq``. The
			
 
				++aging is due when ``min_seq[]`` reaches ``max_seq-1``.
			
 
				++
			
 
				++Eviction
			
 
				++--------
			
 
				++The eviction consumes old generations. Given an ``lruvec``, the
			
 
				++eviction scans pages on the per-zone lists indexed by anon and file
			
 
				++``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to
			
 
				++select a type based on the values of ``min_seq[]``. If they are
			
 
				++equal, it selects the type that has a lower refaulted %. The eviction
			
 
				++sorts a page according to its updated generation number if the aging
			
 
				++has found this page accessed. It also moves a page to the next
			
 
				++generation if this page is from an upper tier that has a higher
			
 
				++refaulted % than the base tier. The eviction increments ``min_seq[]``
			
 
				++of a selected type when it finds all the per-zone lists indexed by
			
 
				++``min_seq[]`` of this selected type are empty.
			
 
				++
			
 
				++To-do List
			
 
				++==========
			
 
				++KVM Optimization
			
 
				++----------------
			
 
				++Support shadow page table walk.