- From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001
- From: Yu Zhao <[email protected]>
- Date: Sun, 18 Sep 2022 02:00:05 -0600
- Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- To further exploit spatial locality, the aging prefers to walk page tables
- to search for young PTEs and promote hot pages. A kill switch will be
- added in the next patch to disable this behavior. When disabled, the
- aging relies on the rmap only.
- NB: this behavior has nothing in common with the page table scanning in the
- 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
- to swapcache and unmaps them.
- To avoid confusion, the term "iteration" specifically means the traversal
- of an entire mm_struct list; the term "walk" will be applied to page
- tables and the rmap, as usual.
- An mm_struct list is maintained for each memcg, and an mm_struct follows
- its owner task to the new memcg when this task is migrated. Given an
- lruvec, the aging iterates lruvec_memcg()->mm_list and calls
- walk_page_range() with each mm_struct on this list to promote hot pages
- before it increments max_seq.
- When multiple page table walkers iterate the same list, each of them gets
- a unique mm_struct; therefore they can run concurrently. Page table
- walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
- pages it left in the previous memcg will not be promoted when its current
- memcg is under reclaim. Similarly, page table walkers will not promote
- pages from nodes other than the one under reclaim.
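
To make the iteration scheme above concrete, here is a minimal userspace sketch of a per-memcg mm_struct list. The names (struct mm, struct mm_list, mm_iter_next) are illustrative, not the kernel API: a single lock protects a FIFO, each call hands the next entry to exactly one walker so concurrent walkers never process the same mm_struct, and a sequence counter is bumped once the head reaches the end of the list, loosely mirroring iterate_mm_list() further down in this patch.

#include <pthread.h>
#include <stddef.h>

struct mm {                      /* stand-in for struct mm_struct */
	struct mm *next;
	int id;
};

struct mm_list {                 /* stand-in for struct lru_gen_mm_list */
	pthread_mutex_t lock;    /* protects the fields below */
	struct mm *fifo;         /* first entry of the FIFO */
	struct mm *head;         /* where the current iteration continues */
	unsigned long seq;       /* completed iterations, cf. mm_state->seq */
};

/* start a new iteration over the whole FIFO, if none is in progress */
static void mm_iter_start(struct mm_list *list)
{
	pthread_mutex_lock(&list->lock);
	if (!list->head)
		list->head = list->fifo;
	pthread_mutex_unlock(&list->lock);
}

/* hand out the next mm_struct, or NULL once the iteration is done */
static struct mm *mm_iter_next(struct mm_list *list)
{
	struct mm *mm = NULL;

	pthread_mutex_lock(&list->lock);
	if (list->head) {
		mm = list->head;
		list->head = mm->next;
		if (!list->head)         /* reached the end of the FIFO */
			list->seq++;     /* the iteration is done */
	}
	pthread_mutex_unlock(&list->lock);

	return mm;
}

A walker would then loop on mm_iter_next() and walk the page tables of each mm_struct it receives before max_seq is incremented; in the patch itself this corresponds to iterate_mm_list() plus walk_mm(), with the skip heuristics and mm stats layered on top.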
- This patch uses the following optimizations when walking page tables:
- 1. It tracks the usage of mm_struct's between context switches so that
- page table walkers can skip processes that have been sleeping since
- the last iteration.
- 2. It uses generational Bloom filters to record populated branches so
- that page table walkers can reduce their search space based on the
- query results, e.g., to skip page tables containing mostly holes or
- misplaced pages (a minimal sketch of such a filter follows this list).
- 3. It takes advantage of the accessed bit in non-leaf PMD entries when
- CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
- 4. It does not zigzag between a PGD table and the same PMD table
- spanning multiple VMAs. IOW, it finishes all the VMAs within the
- range of the same PMD table before it returns to a PGD table. This
- improves the cache performance for workloads that have large
- numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
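
The generational Bloom filters from item 2 above can be modeled with a small self-contained sketch. This is only a userspace illustration under the same parameters the patch uses, m = 1<<15 bits and k = 2 keys split out of a single hash, not the kernel implementation; with these parameters the usual approximation (1 - e^(-kn/m))^k gives a false positive rate of roughly 0.21 for n = 10,000 inserted items and roughly 0.50 for n = 20,000, matching the ~1/5 and ~1/2 quoted in the added code comment in mm/vmscan.c. The hash function below is an arbitrary 64-bit mixer standing in for the kernel's hash_ptr().

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define FILTER_SHIFT 15                       /* m = 1 << 15 bits */
#define FILTER_BITS  (1UL << FILTER_SHIFT)
#define FILTER_WORDS (FILTER_BITS / 64)
#define NR_FILTERS   2                        /* double buffering */

static uint64_t filters[NR_FILTERS][FILTER_WORDS];

/* stand-in for the kernel's hash_ptr(); any decent 64-bit mixer works */
static uint32_t hash_item(const void *item)
{
	uint64_t x = (uint64_t)(uintptr_t)item;

	x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
	x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
	x ^= x >> 33;
	return (uint32_t)x;
}

/* k = 2: split one hash into two 15-bit keys, as get_item_key() does */
static void get_keys(const void *item, uint32_t key[2])
{
	uint32_t hash = hash_item(item);

	key[0] = hash & (FILTER_BITS - 1);
	key[1] = (hash >> FILTER_SHIFT) & (FILTER_BITS - 1);
}

static void filter_add(unsigned long seq, const void *item)
{
	uint32_t key[2];
	uint64_t *filter = filters[seq % NR_FILTERS];

	get_keys(item, key);
	filter[key[0] / 64] |= 1ULL << (key[0] % 64);
	filter[key[1] / 64] |= 1ULL << (key[1] % 64);
}

static bool filter_test(unsigned long seq, const void *item)
{
	uint32_t key[2];
	const uint64_t *filter = filters[seq % NR_FILTERS];

	get_keys(item, key);
	return ((filter[key[0] / 64] >> (key[0] % 64)) & 1) &&
	       ((filter[key[1] / 64] >> (key[1] % 64)) & 1);
}

/* flipping to the other filter starts the next generation from scratch */
static void filter_reset(unsigned long seq)
{
	memset(filters[seq % NR_FILTERS], 0, sizeof(filters[0]));
}

The aging alternates between the two buffers (seq % 2) each time it produces a new generation, so PMD entries that are not re-added simply age out when the spare filter is reset, which is what keeps the filters from accumulating stale branches.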
- Server benchmark results:
- Single workload:
- fio (buffered I/O): no change
- Single workload:
- memcached (anon): +[8, 10]%
- Ops/sec KB/sec
- patch1-7: 1147696.57 44640.29
- patch1-8: 1245274.91 48435.66
- Configurations:
- no change
- Client benchmark results:
- kswapd profiles:
- patch1-7
- 48.16% lzo1x_1_do_compress (real work)
- 8.20% page_vma_mapped_walk (overhead)
- 7.06% _raw_spin_unlock_irq
- 2.92% ptep_clear_flush
- 2.53% __zram_bvec_write
- 2.11% do_raw_spin_lock
- 2.02% memmove
- 1.93% lru_gen_look_around
- 1.56% free_unref_page_list
- 1.40% memset
- patch1-8
- 49.44% lzo1x_1_do_compress (real work)
- 6.19% page_vma_mapped_walk (overhead)
- 5.97% _raw_spin_unlock_irq
- 3.13% get_pfn_page
- 2.85% ptep_clear_flush
- 2.42% __zram_bvec_write
- 2.08% do_raw_spin_lock
- 1.92% memmove
- 1.44% alloc_zspage
- 1.36% memset
- Configurations:
- no change
- Thanks to the following developers for their efforts [3].
- kernel test robot <[email protected]>
- [1] https://lwn.net/Articles/23732/
- [2] https://llvm.org/docs/ScudoHardenedAllocator.html
- [3] https://lore.kernel.org/r/[email protected]/
- Link: https://lkml.kernel.org/r/[email protected]
- Signed-off-by: Yu Zhao <[email protected]>
- Acked-by: Brian Geffon <[email protected]>
- Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
- Acked-by: Oleksandr Natalenko <[email protected]>
- Acked-by: Steven Barrett <[email protected]>
- Acked-by: Suleiman Souhlal <[email protected]>
- Tested-by: Daniel Byrne <[email protected]>
- Tested-by: Donald Carr <[email protected]>
- Tested-by: Holger Hoffstätte <[email protected]>
- Tested-by: Konstantin Kharlamov <[email protected]>
- Tested-by: Shuang Zhai <[email protected]>
- Tested-by: Sofia Trinh <[email protected]>
- Tested-by: Vaibhav Jain <[email protected]>
- Cc: Andi Kleen <[email protected]>
- Cc: Aneesh Kumar K.V <[email protected]>
- Cc: Barry Song <[email protected]>
- Cc: Catalin Marinas <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: Hillf Danton <[email protected]>
- Cc: Jens Axboe <[email protected]>
- Cc: Johannes Weiner <[email protected]>
- Cc: Jonathan Corbet <[email protected]>
- Cc: Linus Torvalds <[email protected]>
- Cc: Matthew Wilcox <[email protected]>
- Cc: Mel Gorman <[email protected]>
- Cc: Miaohe Lin <[email protected]>
- Cc: Michael Larabel <[email protected]>
- Cc: Michal Hocko <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Peter Zijlstra <[email protected]>
- Cc: Qi Zheng <[email protected]>
- Cc: Tejun Heo <[email protected]>
- Cc: Vlastimil Babka <[email protected]>
- Cc: Will Deacon <[email protected]>
- Signed-off-by: Andrew Morton <[email protected]>
- ---
- fs/exec.c | 2 +
- include/linux/memcontrol.h | 5 +
- include/linux/mm_types.h | 76 +++
- include/linux/mmzone.h | 56 +-
- include/linux/swap.h | 4 +
- kernel/exit.c | 1 +
- kernel/fork.c | 9 +
- kernel/sched/core.c | 1 +
- mm/memcontrol.c | 25 +
- mm/vmscan.c | 1010 +++++++++++++++++++++++++++++++++++-
- 10 files changed, 1172 insertions(+), 17 deletions(-)
- --- a/fs/exec.c
- +++ b/fs/exec.c
- @@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
- active_mm = tsk->active_mm;
- tsk->active_mm = mm;
- tsk->mm = mm;
- + lru_gen_add_mm(mm);
- /*
- * This prevents preemption while active_mm is being loaded and
- * it and mm are being updated, which could cause problems for
- @@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *m
- tsk->mm->vmacache_seqnum = 0;
- vmacache_flush(tsk);
- task_unlock(tsk);
- + lru_gen_use_mm(mm);
- if (old_mm) {
- mmap_read_unlock(old_mm);
- BUG_ON(active_mm != old_mm);
- --- a/include/linux/memcontrol.h
- +++ b/include/linux/memcontrol.h
- @@ -348,6 +348,11 @@ struct mem_cgroup {
- struct deferred_split deferred_split_queue;
- #endif
-
- +#ifdef CONFIG_LRU_GEN
- + /* per-memcg mm_struct list */
- + struct lru_gen_mm_list mm_list;
- +#endif
- +
- struct mem_cgroup_per_node *nodeinfo[];
- };
-
- --- a/include/linux/mm_types.h
- +++ b/include/linux/mm_types.h
- @@ -580,6 +580,22 @@ struct mm_struct {
- #ifdef CONFIG_IOMMU_SUPPORT
- u32 pasid;
- #endif
- +#ifdef CONFIG_LRU_GEN
- + struct {
- + /* this mm_struct is on lru_gen_mm_list */
- + struct list_head list;
- + /*
- + * Set when switching to this mm_struct, as a hint of
- + * whether it has been used since the last time per-node
- + * page table walkers cleared the corresponding bits.
- + */
- + unsigned long bitmap;
- +#ifdef CONFIG_MEMCG
- + /* points to the memcg of "owner" above */
- + struct mem_cgroup *memcg;
- +#endif
- + } lru_gen;
- +#endif /* CONFIG_LRU_GEN */
- } __randomize_layout;
-
- /*
- @@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(stru
- return (struct cpumask *)&mm->cpu_bitmap;
- }
-
- +#ifdef CONFIG_LRU_GEN
- +
- +struct lru_gen_mm_list {
- + /* mm_struct list for page table walkers */
- + struct list_head fifo;
- + /* protects the list above */
- + spinlock_t lock;
- +};
- +
- +void lru_gen_add_mm(struct mm_struct *mm);
- +void lru_gen_del_mm(struct mm_struct *mm);
- +#ifdef CONFIG_MEMCG
- +void lru_gen_migrate_mm(struct mm_struct *mm);
- +#endif
- +
- +static inline void lru_gen_init_mm(struct mm_struct *mm)
- +{
- + INIT_LIST_HEAD(&mm->lru_gen.list);
- + mm->lru_gen.bitmap = 0;
- +#ifdef CONFIG_MEMCG
- + mm->lru_gen.memcg = NULL;
- +#endif
- +}
- +
- +static inline void lru_gen_use_mm(struct mm_struct *mm)
- +{
- + /*
- + * When the bitmap is set, page reclaim knows this mm_struct has been
- + * used since the last time it cleared the bitmap. So it might be worth
- + * walking the page tables of this mm_struct to clear the accessed bit.
- + */
- + WRITE_ONCE(mm->lru_gen.bitmap, -1);
- +}
- +
- +#else /* !CONFIG_LRU_GEN */
- +
- +static inline void lru_gen_add_mm(struct mm_struct *mm)
- +{
- +}
- +
- +static inline void lru_gen_del_mm(struct mm_struct *mm)
- +{
- +}
- +
- +#ifdef CONFIG_MEMCG
- +static inline void lru_gen_migrate_mm(struct mm_struct *mm)
- +{
- +}
- +#endif
- +
- +static inline void lru_gen_init_mm(struct mm_struct *mm)
- +{
- +}
- +
- +static inline void lru_gen_use_mm(struct mm_struct *mm)
- +{
- +}
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- struct mmu_gather;
- extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
- extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
- --- a/include/linux/mmzone.h
- +++ b/include/linux/mmzone.h
- @@ -385,7 +385,7 @@ enum {
- * min_seq behind.
- *
- * The number of pages in each generation is eventually consistent and therefore
- - * can be transiently negative.
- + * can be transiently negative when reset_batch_size() is pending.
- */
- struct lru_gen_struct {
- /* the aging increments the youngest generation number */
- @@ -407,6 +407,53 @@ struct lru_gen_struct {
- atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
- };
-
- +enum {
- + MM_LEAF_TOTAL, /* total leaf entries */
- + MM_LEAF_OLD, /* old leaf entries */
- + MM_LEAF_YOUNG, /* young leaf entries */
- + MM_NONLEAF_TOTAL, /* total non-leaf entries */
- + MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
- + MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
- + NR_MM_STATS
- +};
- +
- +/* double-buffering Bloom filters */
- +#define NR_BLOOM_FILTERS 2
- +
- +struct lru_gen_mm_state {
- + /* set to max_seq after each iteration */
- + unsigned long seq;
- + /* where the current iteration continues (inclusive) */
- + struct list_head *head;
- + /* where the last iteration ended (exclusive) */
- + struct list_head *tail;
- + /* to wait for the last page table walker to finish */
- + struct wait_queue_head wait;
- + /* Bloom filters flip after each iteration */
- + unsigned long *filters[NR_BLOOM_FILTERS];
- + /* the mm stats for debugging */
- + unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
- + /* the number of concurrent page table walkers */
- + int nr_walkers;
- +};
- +
- +struct lru_gen_mm_walk {
- + /* the lruvec under reclaim */
- + struct lruvec *lruvec;
- + /* unstable max_seq from lru_gen_struct */
- + unsigned long max_seq;
- + /* the next address within an mm to scan */
- + unsigned long next_addr;
- + /* to batch promoted pages */
- + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- + /* to batch the mm stats */
- + int mm_stats[NR_MM_STATS];
- + /* total batched items */
- + int batched;
- + bool can_swap;
- + bool force_scan;
- +};
- +
- void lru_gen_init_lruvec(struct lruvec *lruvec);
- void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
-
- @@ -457,6 +504,8 @@ struct lruvec {
- #ifdef CONFIG_LRU_GEN
- /* evictable pages divided into generations */
- struct lru_gen_struct lrugen;
- + /* to concurrently iterate lru_gen_mm_list */
- + struct lru_gen_mm_state mm_state;
- #endif
- #ifdef CONFIG_MEMCG
- struct pglist_data *pgdat;
- @@ -1042,6 +1091,11 @@ typedef struct pglist_data {
-
- unsigned long flags;
-
- +#ifdef CONFIG_LRU_GEN
- + /* kswap mm walk data */
- + struct lru_gen_mm_walk mm_walk;
- +#endif
- +
- ZONE_PADDING(_pad2_)
-
- /* Per-node vmstats */
- --- a/include/linux/swap.h
- +++ b/include/linux/swap.h
- @@ -137,6 +137,10 @@ union swap_header {
- */
- struct reclaim_state {
- unsigned long reclaimed_slab;
- +#ifdef CONFIG_LRU_GEN
- + /* per-thread mm walk data */
- + struct lru_gen_mm_walk *mm_walk;
- +#endif
- };
-
- #ifdef __KERNEL__
- --- a/kernel/exit.c
- +++ b/kernel/exit.c
- @@ -469,6 +469,7 @@ assign_new_owner:
- goto retry;
- }
- WRITE_ONCE(mm->owner, c);
- + lru_gen_migrate_mm(mm);
- task_unlock(c);
- put_task_struct(c);
- }
- --- a/kernel/fork.c
- +++ b/kernel/fork.c
- @@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct
- goto fail_nocontext;
-
- mm->user_ns = get_user_ns(user_ns);
- + lru_gen_init_mm(mm);
- return mm;
-
- fail_nocontext:
- @@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
- + lru_gen_del_mm(mm);
- mmdrop(mm);
- }
-
- @@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_a
- get_task_struct(p);
- }
-
- + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
- + /* lock the task to synchronize with memcg migration */
- + task_lock(p);
- + lru_gen_add_mm(p->mm);
- + task_unlock(p);
- + }
- +
- wake_up_new_task(p);
-
- /* forking complete and child started to run, tell ptracer */
- --- a/kernel/sched/core.c
- +++ b/kernel/sched/core.c
- @@ -5010,6 +5010,7 @@ context_switch(struct rq *rq, struct tas
- * finish_task_switch()'s mmdrop().
- */
- switch_mm_irqs_off(prev->active_mm, next->mm, next);
- + lru_gen_use_mm(next->mm);
-
- if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
- --- a/mm/memcontrol.c
- +++ b/mm/memcontrol.c
- @@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void)
- }
- #endif
-
- +#ifdef CONFIG_LRU_GEN
- +static void mem_cgroup_attach(struct cgroup_taskset *tset)
- +{
- + struct task_struct *task;
- + struct cgroup_subsys_state *css;
- +
- + /* find the first leader if there is any */
- + cgroup_taskset_for_each_leader(task, css, tset)
- + break;
- +
- + if (!task)
- + return;
- +
- + task_lock(task);
- + if (task->mm && READ_ONCE(task->mm->owner) == task)
- + lru_gen_migrate_mm(task->mm);
- + task_unlock(task);
- +}
- +#else
- +static void mem_cgroup_attach(struct cgroup_taskset *tset)
- +{
- +}
- +#endif /* CONFIG_LRU_GEN */
- +
- static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
- {
- if (value == PAGE_COUNTER_MAX)
- @@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys
- .css_reset = mem_cgroup_css_reset,
- .css_rstat_flush = mem_cgroup_css_rstat_flush,
- .can_attach = mem_cgroup_can_attach,
- + .attach = mem_cgroup_attach,
- .cancel_attach = mem_cgroup_cancel_attach,
- .post_attach = mem_cgroup_move_task,
- .dfl_cftypes = memory_files,
- --- a/mm/vmscan.c
- +++ b/mm/vmscan.c
- @@ -50,6 +50,8 @@
- #include <linux/printk.h>
- #include <linux/dax.h>
- #include <linux/psi.h>
- +#include <linux/pagewalk.h>
- +#include <linux/shmem_fs.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
- @@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pg
- for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
- for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
-
- -static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
- +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
- {
- struct pglist_data *pgdat = NODE_DATA(nid);
-
- @@ -2899,6 +2901,371 @@ static bool __maybe_unused seq_is_valid(
- }
-
- /******************************************************************************
- + * mm_struct list
- + ******************************************************************************/
- +
- +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
- +{
- + static struct lru_gen_mm_list mm_list = {
- + .fifo = LIST_HEAD_INIT(mm_list.fifo),
- + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
- + };
- +
- +#ifdef CONFIG_MEMCG
- + if (memcg)
- + return &memcg->mm_list;
- +#endif
- + VM_WARN_ON_ONCE(!mem_cgroup_disabled());
- +
- + return &mm_list;
- +}
- +
- +void lru_gen_add_mm(struct mm_struct *mm)
- +{
- + int nid;
- + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
- + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- +
- + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
- +#ifdef CONFIG_MEMCG
- + VM_WARN_ON_ONCE(mm->lru_gen.memcg);
- + mm->lru_gen.memcg = memcg;
- +#endif
- + spin_lock(&mm_list->lock);
- +
- + for_each_node_state(nid, N_MEMORY) {
- + struct lruvec *lruvec = get_lruvec(memcg, nid);
- +
- + if (!lruvec)
- + continue;
- +
- + /* the first addition since the last iteration */
- + if (lruvec->mm_state.tail == &mm_list->fifo)
- + lruvec->mm_state.tail = &mm->lru_gen.list;
- + }
- +
- + list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
- +
- + spin_unlock(&mm_list->lock);
- +}
- +
- +void lru_gen_del_mm(struct mm_struct *mm)
- +{
- + int nid;
- + struct lru_gen_mm_list *mm_list;
- + struct mem_cgroup *memcg = NULL;
- +
- + if (list_empty(&mm->lru_gen.list))
- + return;
- +
- +#ifdef CONFIG_MEMCG
- + memcg = mm->lru_gen.memcg;
- +#endif
- + mm_list = get_mm_list(memcg);
- +
- + spin_lock(&mm_list->lock);
- +
- + for_each_node(nid) {
- + struct lruvec *lruvec = get_lruvec(memcg, nid);
- +
- + if (!lruvec)
- + continue;
- +
- + /* where the last iteration ended (exclusive) */
- + if (lruvec->mm_state.tail == &mm->lru_gen.list)
- + lruvec->mm_state.tail = lruvec->mm_state.tail->next;
- +
- + /* where the current iteration continues (inclusive) */
- + if (lruvec->mm_state.head != &mm->lru_gen.list)
- + continue;
- +
- + lruvec->mm_state.head = lruvec->mm_state.head->next;
- + /* the deletion ends the current iteration */
- + if (lruvec->mm_state.head == &mm_list->fifo)
- + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
- + }
- +
- + list_del_init(&mm->lru_gen.list);
- +
- + spin_unlock(&mm_list->lock);
- +
- +#ifdef CONFIG_MEMCG
- + mem_cgroup_put(mm->lru_gen.memcg);
- + mm->lru_gen.memcg = NULL;
- +#endif
- +}
- +
- +#ifdef CONFIG_MEMCG
- +void lru_gen_migrate_mm(struct mm_struct *mm)
- +{
- + struct mem_cgroup *memcg;
- + struct task_struct *task = rcu_dereference_protected(mm->owner, true);
- +
- + VM_WARN_ON_ONCE(task->mm != mm);
- + lockdep_assert_held(&task->alloc_lock);
- +
- + /* for mm_update_next_owner() */
- + if (mem_cgroup_disabled())
- + return;
- +
- + rcu_read_lock();
- + memcg = mem_cgroup_from_task(task);
- + rcu_read_unlock();
- + if (memcg == mm->lru_gen.memcg)
- + return;
- +
- + VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
- + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
- +
- + lru_gen_del_mm(mm);
- + lru_gen_add_mm(mm);
- +}
- +#endif
- +
- +/*
- + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
- + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
- + * bits in a bitmap, k is the number of hash functions and n is the number of
- + * inserted items.
- + *
- + * Page table walkers use one of the two filters to reduce their search space.
- + * To get rid of non-leaf entries that no longer have enough leaf entries, the
- + * aging uses the double-buffering technique to flip to the other filter each
- + * time it produces a new generation. For non-leaf entries that have enough
- + * leaf entries, the aging carries them over to the next generation in
- + * walk_pmd_range(); the eviction also reports them when walking the rmap
- + * in lru_gen_look_around().
- + *
- + * For future optimizations:
- + * 1. It's not necessary to keep both filters all the time. The spare one can be
- + * freed after the RCU grace period and reallocated if needed again.
- + * 2. And when reallocating, it's worth scaling its size according to the number
- + * of inserted entries in the other filter, to reduce the memory overhead on
- + * small systems and false positives on large systems.
- + * 3. Jenkins' hash function is an alternative to Knuth's.
- + */
- +#define BLOOM_FILTER_SHIFT 15
- +
- +static inline int filter_gen_from_seq(unsigned long seq)
- +{
- + return seq % NR_BLOOM_FILTERS;
- +}
- +
- +static void get_item_key(void *item, int *key)
- +{
- + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
- +
- + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
- +
- + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
- + key[1] = hash >> BLOOM_FILTER_SHIFT;
- +}
- +
- +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
- +{
- + unsigned long *filter;
- + int gen = filter_gen_from_seq(seq);
- +
- + filter = lruvec->mm_state.filters[gen];
- + if (filter) {
- + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
- + return;
- + }
- +
- + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
- + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
- + WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
- +}
- +
- +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
- +{
- + int key[2];
- + unsigned long *filter;
- + int gen = filter_gen_from_seq(seq);
- +
- + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
- + if (!filter)
- + return;
- +
- + get_item_key(item, key);
- +
- + if (!test_bit(key[0], filter))
- + set_bit(key[0], filter);
- + if (!test_bit(key[1], filter))
- + set_bit(key[1], filter);
- +}
- +
- +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
- +{
- + int key[2];
- + unsigned long *filter;
- + int gen = filter_gen_from_seq(seq);
- +
- + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
- + if (!filter)
- + return true;
- +
- + get_item_key(item, key);
- +
- + return test_bit(key[0], filter) && test_bit(key[1], filter);
- +}
- +
- +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
- +{
- + int i;
- + int hist;
- +
- + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
- +
- + if (walk) {
- + hist = lru_hist_from_seq(walk->max_seq);
- +
- + for (i = 0; i < NR_MM_STATS; i++) {
- + WRITE_ONCE(lruvec->mm_state.stats[hist][i],
- + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
- + walk->mm_stats[i] = 0;
- + }
- + }
- +
- + if (NR_HIST_GENS > 1 && last) {
- + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
- +
- + for (i = 0; i < NR_MM_STATS; i++)
- + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
- + }
- +}
- +
- +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
- +{
- + int type;
- + unsigned long size = 0;
- + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
- +
- + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
- + return true;
- +
- + clear_bit(key, &mm->lru_gen.bitmap);
- +
- + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
- + size += type ? get_mm_counter(mm, MM_FILEPAGES) :
- + get_mm_counter(mm, MM_ANONPAGES) +
- + get_mm_counter(mm, MM_SHMEMPAGES);
- + }
- +
- + if (size < MIN_LRU_BATCH)
- + return true;
- +
- + return !mmget_not_zero(mm);
- +}
- +
- +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
- + struct mm_struct **iter)
- +{
- + bool first = false;
- + bool last = true;
- + struct mm_struct *mm = NULL;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- + struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
- +
- + /*
- + * There are four interesting cases for this page table walker:
- + * 1. It tries to start a new iteration of mm_list with a stale max_seq;
- + * there is nothing left to do.
- + * 2. It's the first of the current generation, and it needs to reset
- + * the Bloom filter for the next generation.
- + * 3. It reaches the end of mm_list, and it needs to increment
- + * mm_state->seq; the iteration is done.
- + * 4. It's the last of the current generation, and it needs to reset the
- + * mm stats counters for the next generation.
- + */
- + spin_lock(&mm_list->lock);
- +
- + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
- + VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
- + VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
- +
- + if (walk->max_seq <= mm_state->seq) {
- + if (!*iter)
- + last = false;
- + goto done;
- + }
- +
- + if (!mm_state->nr_walkers) {
- + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
- +
- + mm_state->head = mm_list->fifo.next;
- + first = true;
- + }
- +
- + while (!mm && mm_state->head != &mm_list->fifo) {
- + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
- +
- + mm_state->head = mm_state->head->next;
- +
- + /* force scan for those added after the last iteration */
- + if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
- + mm_state->tail = mm_state->head;
- + walk->force_scan = true;
- + }
- +
- + if (should_skip_mm(mm, walk))
- + mm = NULL;
- + }
- +
- + if (mm_state->head == &mm_list->fifo)
- + WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
- +done:
- + if (*iter && !mm)
- + mm_state->nr_walkers--;
- + if (!*iter && mm)
- + mm_state->nr_walkers++;
- +
- + if (mm_state->nr_walkers)
- + last = false;
- +
- + if (*iter || last)
- + reset_mm_stats(lruvec, walk, last);
- +
- + spin_unlock(&mm_list->lock);
- +
- + if (mm && first)
- + reset_bloom_filter(lruvec, walk->max_seq + 1);
- +
- + if (*iter)
- + mmput_async(*iter);
- +
- + *iter = mm;
- +
- + return last;
- +}
- +
- +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
- +{
- + bool success = false;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
- + struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
- +
- + spin_lock(&mm_list->lock);
- +
- + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
- +
- + if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
- + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
- +
- + WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
- + reset_mm_stats(lruvec, NULL, true);
- + success = true;
- + }
- +
- + spin_unlock(&mm_list->lock);
- +
- + return success;
- +}
- +
- +/******************************************************************************
- * refault feedback loop
- ******************************************************************************/
-
- @@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *l
- return new_gen;
- }
-
- +static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
- + int old_gen, int new_gen)
- +{
- + int type = page_is_file_lru(page);
- + int zone = page_zonenum(page);
- + int delta = thp_nr_pages(page);
- +
- + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
- + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
- +
- + walk->batched++;
- +
- + walk->nr_pages[old_gen][type][zone] -= delta;
- + walk->nr_pages[new_gen][type][zone] += delta;
- +}
- +
- +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
- +{
- + int gen, type, zone;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + walk->batched = 0;
- +
- + for_each_gen_type_zone(gen, type, zone) {
- + enum lru_list lru = type * LRU_INACTIVE_FILE;
- + int delta = walk->nr_pages[gen][type][zone];
- +
- + if (!delta)
- + continue;
- +
- + walk->nr_pages[gen][type][zone] = 0;
- + WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
- + lrugen->nr_pages[gen][type][zone] + delta);
- +
- + if (lru_gen_is_active(lruvec, gen))
- + lru += LRU_ACTIVE;
- + __update_lru_size(lruvec, lru, zone, delta);
- + }
- +}
- +
- +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
- +{
- + struct address_space *mapping;
- + struct vm_area_struct *vma = args->vma;
- + struct lru_gen_mm_walk *walk = args->private;
- +
- + if (!vma_is_accessible(vma))
- + return true;
- +
- + if (is_vm_hugetlb_page(vma))
- + return true;
- +
- + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
- + return true;
- +
- + if (vma == get_gate_vma(vma->vm_mm))
- + return true;
- +
- + if (vma_is_anonymous(vma))
- + return !walk->can_swap;
- +
- + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
- + return true;
- +
- + mapping = vma->vm_file->f_mapping;
- + if (mapping_unevictable(mapping))
- + return true;
- +
- + if (shmem_mapping(mapping))
- + return !walk->can_swap;
- +
- + /* to exclude special mappings like dax, etc. */
- + return !mapping->a_ops->readpage;
- +}
- +
- +/*
- + * Some userspace memory allocators map many single-page VMAs. Instead of
- + * returning to the PGD table for each such VMA, finish an entire PMD
- + * table to reduce zigzags and improve cache performance.
- + */
- +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
- + unsigned long *vm_start, unsigned long *vm_end)
- +{
- + unsigned long start = round_up(*vm_end, size);
- + unsigned long end = (start | ~mask) + 1;
- +
- + VM_WARN_ON_ONCE(mask & size);
- + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
- +
- + while (args->vma) {
- + if (start >= args->vma->vm_end) {
- + args->vma = args->vma->vm_next;
- + continue;
- + }
- +
- + if (end && end <= args->vma->vm_start)
- + return false;
- +
- + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
- + args->vma = args->vma->vm_next;
- + continue;
- + }
- +
- + *vm_start = max(start, args->vma->vm_start);
- + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
- +
- + return true;
- + }
- +
- + return false;
- +}
- +
- static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
- {
- unsigned long pfn = pte_pfn(pte);
- @@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t p
- return pfn;
- }
-
- +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
- +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
- +{
- + unsigned long pfn = pmd_pfn(pmd);
- +
- + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
- +
- + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
- + return -1;
- +
- + if (WARN_ON_ONCE(pmd_devmap(pmd)))
- + return -1;
- +
- + if (WARN_ON_ONCE(!pfn_valid(pfn)))
- + return -1;
- +
- + return pfn;
- +}
- +#endif
- +
- static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
- - struct pglist_data *pgdat)
- + struct pglist_data *pgdat, bool can_swap)
- {
- struct page *page;
-
- @@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigne
- if (page_memcg_rcu(page) != memcg)
- return NULL;
-
- + /* file VMAs can contain anon pages from COW */
- + if (!page_is_file_lru(page) && !can_swap)
- + return NULL;
- +
- return page;
- }
-
- +static bool suitable_to_scan(int total, int young)
- +{
- + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
- +
- + /* suitable if the average number of young PTEs per cacheline is >=1 */
- + return young * n >= total;
- +}
- +
- +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
- + struct mm_walk *args)
- +{
- + int i;
- + pte_t *pte;
- + spinlock_t *ptl;
- + unsigned long addr;
- + int total = 0;
- + int young = 0;
- + struct lru_gen_mm_walk *walk = args->private;
- + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
- + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
- +
- + VM_WARN_ON_ONCE(pmd_leaf(*pmd));
- +
- + ptl = pte_lockptr(args->mm, pmd);
- + if (!spin_trylock(ptl))
- + return false;
- +
- + arch_enter_lazy_mmu_mode();
- +
- + pte = pte_offset_map(pmd, start & PMD_MASK);
- +restart:
- + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
- + unsigned long pfn;
- + struct page *page;
- +
- + total++;
- + walk->mm_stats[MM_LEAF_TOTAL]++;
- +
- + pfn = get_pte_pfn(pte[i], args->vma, addr);
- + if (pfn == -1)
- + continue;
- +
- + if (!pte_young(pte[i])) {
- + walk->mm_stats[MM_LEAF_OLD]++;
- + continue;
- + }
- +
- + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
- + if (!page)
- + continue;
- +
- + if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
- + VM_WARN_ON_ONCE(true);
- +
- + young++;
- + walk->mm_stats[MM_LEAF_YOUNG]++;
- +
- + if (pte_dirty(pte[i]) && !PageDirty(page) &&
- + !(PageAnon(page) && PageSwapBacked(page) &&
- + !PageSwapCache(page)))
- + set_page_dirty(page);
- +
- + old_gen = page_update_gen(page, new_gen);
- + if (old_gen >= 0 && old_gen != new_gen)
- + update_batch_size(walk, page, old_gen, new_gen);
- + }
- +
- + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
- + goto restart;
- +
- + pte_unmap(pte);
- +
- + arch_leave_lazy_mmu_mode();
- + spin_unlock(ptl);
- +
- + return suitable_to_scan(total, young);
- +}
- +
- +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
- +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
- + struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
- +{
- + int i;
- + pmd_t *pmd;
- + spinlock_t *ptl;
- + struct lru_gen_mm_walk *walk = args->private;
- + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
- + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
- +
- + VM_WARN_ON_ONCE(pud_leaf(*pud));
- +
- + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
- + if (*start == -1) {
- + *start = next;
- + return;
- + }
- +
- + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
- + if (i && i <= MIN_LRU_BATCH) {
- + __set_bit(i - 1, bitmap);
- + return;
- + }
- +
- + pmd = pmd_offset(pud, *start);
- +
- + ptl = pmd_lockptr(args->mm, pmd);
- + if (!spin_trylock(ptl))
- + goto done;
- +
- + arch_enter_lazy_mmu_mode();
- +
- + do {
- + unsigned long pfn;
- + struct page *page;
- + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
- +
- + pfn = get_pmd_pfn(pmd[i], vma, addr);
- + if (pfn == -1)
- + goto next;
- +
- + if (!pmd_trans_huge(pmd[i])) {
- + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
- + pmdp_test_and_clear_young(vma, addr, pmd + i);
- + goto next;
- + }
- +
- + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
- + if (!page)
- + goto next;
- +
- + if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
- + goto next;
- +
- + walk->mm_stats[MM_LEAF_YOUNG]++;
- +
- + if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
- + !(PageAnon(page) && PageSwapBacked(page) &&
- + !PageSwapCache(page)))
- + set_page_dirty(page);
- +
- + old_gen = page_update_gen(page, new_gen);
- + if (old_gen >= 0 && old_gen != new_gen)
- + update_batch_size(walk, page, old_gen, new_gen);
- +next:
- + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
- + } while (i <= MIN_LRU_BATCH);
- +
- + arch_leave_lazy_mmu_mode();
- + spin_unlock(ptl);
- +done:
- + *start = -1;
- + bitmap_zero(bitmap, MIN_LRU_BATCH);
- +}
- +#else
- +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
- + struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
- +{
- +}
- +#endif
- +
- +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
- + struct mm_walk *args)
- +{
- + int i;
- + pmd_t *pmd;
- + unsigned long next;
- + unsigned long addr;
- + struct vm_area_struct *vma;
- + unsigned long pos = -1;
- + struct lru_gen_mm_walk *walk = args->private;
- + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
- +
- + VM_WARN_ON_ONCE(pud_leaf(*pud));
- +
- + /*
- + * Finish an entire PMD in two passes: the first only reaches to PTE
- + * tables to avoid taking the PMD lock; the second, if necessary, takes
- + * the PMD lock to clear the accessed bit in PMD entries.
- + */
- + pmd = pmd_offset(pud, start & PUD_MASK);
- +restart:
- + /* walk_pte_range() may call get_next_vma() */
- + vma = args->vma;
- + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
- + pmd_t val = pmd_read_atomic(pmd + i);
- +
- + /* for pmd_read_atomic() */
- + barrier();
- +
- + next = pmd_addr_end(addr, end);
- +
- + if (!pmd_present(val) || is_huge_zero_pmd(val)) {
- + walk->mm_stats[MM_LEAF_TOTAL]++;
- + continue;
- + }
- +
- +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- + if (pmd_trans_huge(val)) {
- + unsigned long pfn = pmd_pfn(val);
- + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
- +
- + walk->mm_stats[MM_LEAF_TOTAL]++;
- +
- + if (!pmd_young(val)) {
- + walk->mm_stats[MM_LEAF_OLD]++;
- + continue;
- + }
- +
- + /* try to avoid unnecessary memory loads */
- + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- + continue;
- +
- + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
- + continue;
- + }
- +#endif
- + walk->mm_stats[MM_NONLEAF_TOTAL]++;
- +
- +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- + if (!pmd_young(val))
- + continue;
- +
- + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
- +#endif
- + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
- + continue;
- +
- + walk->mm_stats[MM_NONLEAF_FOUND]++;
- +
- + if (!walk_pte_range(&val, addr, next, args))
- + continue;
- +
- + walk->mm_stats[MM_NONLEAF_ADDED]++;
- +
- + /* carry over to the next generation */
- + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
- + }
- +
- + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
- +
- + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
- + goto restart;
- +}
- +
- +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
- + struct mm_walk *args)
- +{
- + int i;
- + pud_t *pud;
- + unsigned long addr;
- + unsigned long next;
- + struct lru_gen_mm_walk *walk = args->private;
- +
- + VM_WARN_ON_ONCE(p4d_leaf(*p4d));
- +
- + pud = pud_offset(p4d, start & P4D_MASK);
- +restart:
- + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
- + pud_t val = READ_ONCE(pud[i]);
- +
- + next = pud_addr_end(addr, end);
- +
- + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
- + continue;
- +
- + walk_pmd_range(&val, addr, next, args);
- +
- + /* a racy check to curtail the waiting time */
- + if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
- + return 1;
- +
- + if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
- + end = (addr | ~PUD_MASK) + 1;
- + goto done;
- + }
- + }
- +
- + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
- + goto restart;
- +
- + end = round_up(end, P4D_SIZE);
- +done:
- + if (!end || !args->vma)
- + return 1;
- +
- + walk->next_addr = max(end, args->vma->vm_start);
- +
- + return -EAGAIN;
- +}
- +
- +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
- +{
- + static const struct mm_walk_ops mm_walk_ops = {
- + .test_walk = should_skip_vma,
- + .p4d_entry = walk_pud_range,
- + };
- +
- + int err;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- +
- + walk->next_addr = FIRST_USER_ADDRESS;
- +
- + do {
- + err = -EBUSY;
- +
- + /* page_update_gen() requires stable page_memcg() */
- + if (!mem_cgroup_trylock_pages(memcg))
- + break;
- +
- + /* the caller might be holding the lock for write */
- + if (mmap_read_trylock(mm)) {
- + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
- +
- + mmap_read_unlock(mm);
- + }
- +
- + mem_cgroup_unlock_pages();
- +
- + if (walk->batched) {
- + spin_lock_irq(&lruvec->lru_lock);
- + reset_batch_size(lruvec, walk);
- + spin_unlock_irq(&lruvec->lru_lock);
- + }
- +
- + cond_resched();
- + } while (err == -EAGAIN);
- +}
- +
- +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
- +{
- + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
- +
- + if (pgdat && current_is_kswapd()) {
- + VM_WARN_ON_ONCE(walk);
- +
- + walk = &pgdat->mm_walk;
- + } else if (!pgdat && !walk) {
- + VM_WARN_ON_ONCE(current_is_kswapd());
- +
- + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
- + }
- +
- + current->reclaim_state->mm_walk = walk;
- +
- + return walk;
- +}
- +
- +static void clear_mm_walk(void)
- +{
- + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
- +
- + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
- + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
- +
- + current->reclaim_state->mm_walk = NULL;
- +
- + if (!current_is_kswapd())
- + kfree(walk);
- +}
- +
- static void inc_min_seq(struct lruvec *lruvec, int type)
- {
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
- @@ -3136,7 +4001,7 @@ next:
- return success;
- }
-
- -static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
- +static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
- {
- int prev, next;
- int type, zone;
- @@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *l
-
- VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
-
- - if (max_seq != lrugen->max_seq)
- - goto unlock;
- -
- for (type = ANON_AND_FILE - 1; type >= 0; type--) {
- if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
- continue;
- @@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *l
-
- /* make sure preceding modifications appear */
- smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
- -unlock:
- +
- spin_unlock_irq(&lruvec->lru_lock);
- }
-
- +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- + struct scan_control *sc, bool can_swap)
- +{
- + bool success;
- + struct lru_gen_mm_walk *walk;
- + struct mm_struct *mm = NULL;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
- +
- + /* see the comment in iterate_mm_list() */
- + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
- + success = false;
- + goto done;
- + }
- +
- + /*
- + * If the hardware doesn't automatically set the accessed bit, fall back
- + * to lru_gen_look_around(), which only clears the accessed bit in a
- + * handful of PTEs. Spreading the work out over a period of time usually
- + * is less efficient, but it avoids bursty page faults.
- + */
- + if (!arch_has_hw_pte_young()) {
- + success = iterate_mm_list_nowalk(lruvec, max_seq);
- + goto done;
- + }
- +
- + walk = set_mm_walk(NULL);
- + if (!walk) {
- + success = iterate_mm_list_nowalk(lruvec, max_seq);
- + goto done;
- + }
- +
- + walk->lruvec = lruvec;
- + walk->max_seq = max_seq;
- + walk->can_swap = can_swap;
- + walk->force_scan = false;
- +
- + do {
- + success = iterate_mm_list(lruvec, walk, &mm);
- + if (mm)
- + walk_mm(lruvec, mm, walk);
- +
- + cond_resched();
- + } while (mm);
- +done:
- + if (!success) {
- + if (sc->priority <= DEF_PRIORITY - 2)
- + wait_event_killable(lruvec->mm_state.wait,
- + max_seq < READ_ONCE(lrugen->max_seq));
- +
- + return max_seq < READ_ONCE(lrugen->max_seq);
- + }
- +
- + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
- +
- + inc_max_seq(lruvec, can_swap);
- + /* either this sees any waiters or they will see updated max_seq */
- + if (wq_has_sleeper(&lruvec->mm_state.wait))
- + wake_up_all(&lruvec->mm_state.wait);
- +
- + wakeup_flusher_threads(WB_REASON_VMSCAN);
- +
- + return true;
- +}
- +
- static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
- {
- @@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lr
-
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
- if (need_aging)
- - inc_max_seq(lruvec, max_seq, swappiness);
- + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
- }
-
- static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
- @@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pgli
-
- VM_WARN_ON_ONCE(!current_is_kswapd());
-
- + set_mm_walk(pgdat);
- +
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- @@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pgli
-
- cond_resched();
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- +
- + clear_mm_walk();
- }
-
- /*
- * This function exploits spatial locality when shrink_page_list() walks the
- - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
- + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
- + * the scan was done cacheline efficiently, it adds the PMD entry pointing to
- + * the PTE table to the Bloom filter. This forms a feedback loop between the
- + * eviction and the aging.
- */
- void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
- {
- @@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma
- unsigned long start;
- unsigned long end;
- unsigned long addr;
- + struct lru_gen_mm_walk *walk;
- + int young = 0;
- unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
- struct page *page = pvmw->page;
- struct mem_cgroup *memcg = page_memcg(page);
- @@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma
- if (spin_is_contended(pvmw->ptl))
- return;
-
- + /* avoid taking the LRU lock under the PTL when possible */
- + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
- +
- start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
- end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
-
- @@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma
- if (!pte_young(pte[i]))
- continue;
-
- - page = get_pfn_page(pfn, memcg, pgdat);
- + page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
- if (!page)
- continue;
-
- if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
- VM_WARN_ON_ONCE(true);
-
- + young++;
- +
- if (pte_dirty(pte[i]) && !PageDirty(page) &&
- !(PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page)))
- @@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma
- arch_leave_lazy_mmu_mode();
- rcu_read_unlock();
-
- - if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
- + /* feedback from rmap walkers to page table walkers */
- + if (suitable_to_scan(i, young))
- + update_bloom_filter(lruvec, max_seq, pvmw->pmd);
- +
- + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
- page = pte_page(pte[i]);
- activate_page(page);
- @@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma
- if (!mem_cgroup_trylock_pages(memcg))
- return;
-
- - spin_lock_irq(&lruvec->lru_lock);
- - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
- + if (!walk) {
- + spin_lock_irq(&lruvec->lru_lock);
- + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
- + }
-
- for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
- page = compound_head(pte_page(pte[i]));
- @@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma
- if (old_gen < 0 || old_gen == new_gen)
- continue;
-
- - lru_gen_update_size(lruvec, page, old_gen, new_gen);
- + if (walk)
- + update_batch_size(walk, page, old_gen, new_gen);
- + else
- + lru_gen_update_size(lruvec, page, old_gen, new_gen);
- }
-
- - spin_unlock_irq(&lruvec->lru_lock);
- + if (!walk)
- + spin_unlock_irq(&lruvec->lru_lock);
-
- mem_cgroup_unlock_pages();
- }
- @@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lr
- struct page *page;
- enum vm_event_item item;
- struct reclaim_stat stat;
- + struct lru_gen_mm_walk *walk;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
- @@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lr
-
- move_pages_to_lru(lruvec, &list);
-
- + walk = current->reclaim_state->mm_walk;
- + if (walk && walk->batched)
- + reset_batch_size(lruvec, walk);
- +
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
- if (!cgroup_reclaim(sc))
- __count_vm_events(item, reclaimed);
- @@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lr
- return scanned;
- }
-
- +/*
- + * For future optimizations:
- + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
- + * reclaim.
- + */
- static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
- {
- @@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(stru
- if (current_is_kswapd())
- return 0;
-
- - inc_max_seq(lruvec, max_seq, can_swap);
- + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
- + return nr_to_scan;
- done:
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
- }
- @@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct
-
- blk_start_plug(&plug);
-
- + set_mm_walk(lruvec_pgdat(lruvec));
- +
- while (true) {
- int delta;
- int swappiness;
- @@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct
- cond_resched();
- }
-
- + clear_mm_walk();
- +
- blk_finish_plug(&plug);
- }
-
- @@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *
-
- for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
- +
- + lruvec->mm_state.seq = MIN_NR_GENS;
- + init_waitqueue_head(&lruvec->mm_state.wait);
- }
-
- #ifdef CONFIG_MEMCG
- void lru_gen_init_memcg(struct mem_cgroup *memcg)
- {
- + INIT_LIST_HEAD(&memcg->mm_list.fifo);
- + spin_lock_init(&memcg->mm_list.lock);
- }
-
- void lru_gen_exit_memcg(struct mem_cgroup *memcg)
- {
- + int i;
- int nid;
-
- for_each_node(nid) {
- @@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgrou
-
- VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
- sizeof(lruvec->lrugen.nr_pages)));
- +
- + for (i = 0; i < NR_BLOOM_FILTERS; i++) {
- + bitmap_free(lruvec->mm_state.filters[i]);
- + lruvec->mm_state.filters[i] = NULL;
- + }
- }
- }
- #endif