- From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
- From: Yu Zhao <[email protected]>
- Date: Sun, 18 Sep 2022 02:00:02 -0600
- Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- Evictable pages are divided into multiple generations for each lruvec.
- The youngest generation number is stored in lrugen->max_seq for both
- anon and file types as they are aged on an equal footing. The oldest
- generation numbers are stored in lrugen->min_seq[] separately for anon
- and file types as clean file pages can be evicted regardless of swap
- constraints. These three variables are monotonically increasing.
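- Concretely, the size of the window for a given type follows directly from
- these counters; a later patch in this series adds a helper along these
- lines (reproduced here only as an illustration):
-
- 	/* the window holds max_seq - min_seq[type] + 1 generations */
- 	static int get_nr_gens(struct lruvec *lruvec, int type)
- 	{
- 		return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
- 	}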
- Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
- in order to fit into the gen counter in page->flags. Each truncated
- generation number is an index into lrugen->lists[]. The sliding window
- technique is used to track at least MIN_NR_GENS and at most
- MAX_NR_GENS generations. The gen counter stores a value within [1,
- MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
- stores 0.
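- For illustration, the truncation and the encoding into page->flags amount
- to the following sketch of what lru_gen_from_seq() and lru_gen_add_page()
- below do:
-
- 	int gen = seq % MAX_NR_GENS;	/* index into lrugen->lists[] */
- 	/* store gen+1 so that 0 means "not on lrugen->lists[]" */
- 	set_mask_bits(&page->flags, LRU_GEN_MASK, (gen + 1UL) << LRU_GEN_PGOFF);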
- There are two conceptually independent procedures: "the aging", which
- produces young generations, and "the eviction", which consumes old
- generations. They form a closed-loop system, i.e., "the page reclaim".
- Both procedures can be invoked from userspace for the purposes of working
- set estimation and proactive reclaim. These techniques are commonly used
- to optimize job scheduling (bin packing) in data centers [1][2].
- To avoid confusion, the terms "hot" and "cold" will be applied to the
- multi-gen LRU, as a new convention; the terms "active" and "inactive" will
- be applied to the active/inactive LRU, as usual.
- The protection of hot pages and the selection of cold pages are based
- on page access channels and patterns. There are two access channels:
- one through page tables and the other through file descriptors. The
- protection of the former channel is by design stronger because:
- 1. The uncertainty in determining the access patterns of the former
- channel is higher due to the approximation of the accessed bit.
- 2. The cost of evicting the former channel is higher due to the TLB
- flushes required and the likelihood of encountering the dirty bit.
- 3. The penalty of underprotecting the former channel is higher because
- applications usually do not prepare themselves for major page
- faults like they do for blocked I/O. E.g., GUI applications
- commonly use dedicated I/O threads to avoid blocking rendering
- threads.
- There are also two access patterns: one with temporal locality and the
- other without. For the reasons listed above, accesses through page tables
- are assumed to exhibit temporal locality unless VM_SEQ_READ or VM_RAND_READ
- is present; accesses through file descriptors are assumed not to unless
- outlying refaults have been observed [3][4].
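- In code, the page-table half of this assumption is the gate added in
- lru_gen_enter_fault() by the mm/memory.c hunk below:
-
- 	/* the LRU algorithm doesn't apply to sequential or random reads */
- 	current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));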
- The next patch will address the "outlying refaults". Three macros, i.e.,
- LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
- this patch to make the entire patchset less diffy.
- A page is added to the youngest generation on faulting. The aging needs
- to check the accessed bit at least twice before handing this page over to
- the eviction. The first check takes care of the accessed bit set on the
- initial fault; the second check makes sure this page has not been used
- since then. This protocol, AKA second chance, requires a minimum of two
- generations, hence MIN_NR_GENS.
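- For clarity, second chance can be reduced to the following freestanding
- sketch, with a hypothetical accessed-bit history standing in for the real
- aging (not code from this series):
-
- 	#include <stdbool.h>
-
- 	/* evictable only after two consecutive clean accessed-bit checks */
- 	static bool second_chance_evictable(const bool accessed[], int passes)
- 	{
- 		int clean = 0;
-
- 		for (int i = 0; i < passes; i++)
- 			clean = accessed[i] ? 0 : clean + 1;
-
- 		return clean >= 2;	/* hence MIN_NR_GENS */
- 	}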
- [1] https://dl.acm.org/doi/10.1145/3297858.3304053
- [2] https://dl.acm.org/doi/10.1145/3503222.3507731
- [3] https://lwn.net/Articles/495543/
- [4] https://lwn.net/Articles/815342/
- Link: https://lkml.kernel.org/r/[email protected]
- Signed-off-by: Yu Zhao <[email protected]>
- Acked-by: Brian Geffon <[email protected]>
- Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
- Acked-by: Oleksandr Natalenko <[email protected]>
- Acked-by: Steven Barrett <[email protected]>
- Acked-by: Suleiman Souhlal <[email protected]>
- Tested-by: Daniel Byrne <[email protected]>
- Tested-by: Donald Carr <[email protected]>
- Tested-by: Holger Hoffstätte <[email protected]>
- Tested-by: Konstantin Kharlamov <[email protected]>
- Tested-by: Shuang Zhai <[email protected]>
- Tested-by: Sofia Trinh <[email protected]>
- Tested-by: Vaibhav Jain <[email protected]>
- Cc: Andi Kleen <[email protected]>
- Cc: Aneesh Kumar K.V <[email protected]>
- Cc: Barry Song <[email protected]>
- Cc: Catalin Marinas <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: Hillf Danton <[email protected]>
- Cc: Jens Axboe <[email protected]>
- Cc: Johannes Weiner <[email protected]>
- Cc: Jonathan Corbet <[email protected]>
- Cc: Linus Torvalds <[email protected]>
- Cc: Matthew Wilcox <[email protected]>
- Cc: Mel Gorman <[email protected]>
- Cc: Miaohe Lin <[email protected]>
- Cc: Michael Larabel <[email protected]>
- Cc: Michal Hocko <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Peter Zijlstra <[email protected]>
- Cc: Qi Zheng <[email protected]>
- Cc: Tejun Heo <[email protected]>
- Cc: Vlastimil Babka <[email protected]>
- Cc: Will Deacon <[email protected]>
- Signed-off-by: Andrew Morton <[email protected]>
- ---
- fs/fuse/dev.c | 3 +-
- include/linux/mm.h | 2 +
- include/linux/mm_inline.h | 177 +++++++++++++++++++++++++++++-
- include/linux/mmzone.h | 100 +++++++++++++++++
- include/linux/page-flags-layout.h | 13 ++-
- include/linux/page-flags.h | 4 +-
- include/linux/sched.h | 4 +
- kernel/bounds.c | 5 +
- mm/Kconfig | 8 ++
- mm/huge_memory.c | 3 +-
- mm/memcontrol.c | 2 +
- mm/memory.c | 25 +++++
- mm/mm_init.c | 6 +-
- mm/mmzone.c | 2 +
- mm/swap.c | 10 +-
- mm/vmscan.c | 75 +++++++++++++
- 16 files changed, 425 insertions(+), 14 deletions(-)
- --- a/fs/fuse/dev.c
- +++ b/fs/fuse/dev.c
- @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
- 1 << PG_active |
- 1 << PG_workingset |
- 1 << PG_reclaim |
- - 1 << PG_waiters))) {
- + 1 << PG_waiters |
- + LRU_GEN_MASK | LRU_REFS_MASK))) {
- dump_page(page, "fuse: trying to steal weird page");
- return 1;
- }
- --- a/include/linux/mm.h
- +++ b/include/linux/mm.h
- @@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
- #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
- #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
- #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
- +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
- +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
-
- /*
- * Define the bit shifts to access each section. For non-existent
- --- a/include/linux/mm_inline.h
- +++ b/include/linux/mm_inline.h
- @@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc
-
- static __always_inline void __update_lru_size(struct lruvec *lruvec,
- enum lru_list lru, enum zone_type zid,
- - int nr_pages)
- + long nr_pages)
- {
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
- + lockdep_assert_held(&lruvec->lru_lock);
- + WARN_ON_ONCE(nr_pages != (int)nr_pages);
- +
- __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
- __mod_zone_page_state(&pgdat->node_zones[zid],
- NR_ZONE_LRU_BASE + lru, nr_pages);
- @@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
- return lru;
- }
-
- +#ifdef CONFIG_LRU_GEN
- +
- +static inline bool lru_gen_enabled(void)
- +{
- + return true;
- +}
- +
- +static inline bool lru_gen_in_fault(void)
- +{
- + return current->in_lru_fault;
- +}
- +
- +static inline int lru_gen_from_seq(unsigned long seq)
- +{
- + return seq % MAX_NR_GENS;
- +}
- +
- +static inline int page_lru_gen(struct page *page)
- +{
- + unsigned long flags = READ_ONCE(page->flags);
- +
- + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- +}
- +
- +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
- +{
- + unsigned long max_seq = lruvec->lrugen.max_seq;
- +
- + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- +
- + /* see the comment on MIN_NR_GENS */
- + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
- +}
- +
- +static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
- + int old_gen, int new_gen)
- +{
- + int type = page_is_file_lru(page);
- + int zone = page_zonenum(page);
- + int delta = thp_nr_pages(page);
- + enum lru_list lru = type * LRU_INACTIVE_FILE;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
- + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
- + VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
- +
- + if (old_gen >= 0)
- + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
- + lrugen->nr_pages[old_gen][type][zone] - delta);
- + if (new_gen >= 0)
- + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
- + lrugen->nr_pages[new_gen][type][zone] + delta);
- +
- + /* addition */
- + if (old_gen < 0) {
- + if (lru_gen_is_active(lruvec, new_gen))
- + lru += LRU_ACTIVE;
- + __update_lru_size(lruvec, lru, zone, delta);
- + return;
- + }
- +
- + /* deletion */
- + if (new_gen < 0) {
- + if (lru_gen_is_active(lruvec, old_gen))
- + lru += LRU_ACTIVE;
- + __update_lru_size(lruvec, lru, zone, -delta);
- + return;
- + }
- +}
- +
- +static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
- +{
- + unsigned long seq;
- + unsigned long flags;
- + int gen = page_lru_gen(page);
- + int type = page_is_file_lru(page);
- + int zone = page_zonenum(page);
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + VM_WARN_ON_ONCE_PAGE(gen != -1, page);
- +
- + if (PageUnevictable(page))
- + return false;
- + /*
- + * There are three common cases for this page:
- + * 1. If it's hot, e.g., freshly faulted in or previously hot and
- + * migrated, add it to the youngest generation.
- + * 2. If it's cold but can't be evicted immediately, i.e., an anon page
- + * not in swapcache or a dirty page pending writeback, add it to the
- + * second oldest generation.
- + * 3. Everything else (clean, cold) is added to the oldest generation.
- + */
- + if (PageActive(page))
- + seq = lrugen->max_seq;
- + else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
- + (PageReclaim(page) &&
- + (PageDirty(page) || PageWriteback(page))))
- + seq = lrugen->min_seq[type] + 1;
- + else
- + seq = lrugen->min_seq[type];
- +
- + gen = lru_gen_from_seq(seq);
- + flags = (gen + 1UL) << LRU_GEN_PGOFF;
- + /* see the comment on MIN_NR_GENS about PG_active */
- + set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
- +
- + lru_gen_update_size(lruvec, page, -1, gen);
- + /* for rotate_reclaimable_page() */
- + if (reclaiming)
- + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
- + else
- + list_add(&page->lru, &lrugen->lists[gen][type][zone]);
- +
- + return true;
- +}
- +
- +static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
- +{
- + unsigned long flags;
- + int gen = page_lru_gen(page);
- +
- + if (gen < 0)
- + return false;
- +
- + VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
- + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
- +
- + /* for migrate_page_states() */
- + flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
- + flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
- + gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- +
- + lru_gen_update_size(lruvec, page, gen, -1);
- + list_del(&page->lru);
- +
- + return true;
- +}
- +
- +#else /* !CONFIG_LRU_GEN */
- +
- +static inline bool lru_gen_enabled(void)
- +{
- + return false;
- +}
- +
- +static inline bool lru_gen_in_fault(void)
- +{
- + return false;
- +}
- +
- +static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
- +{
- + return false;
- +}
- +
- +static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
- +{
- + return false;
- +}
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- static __always_inline void add_page_to_lru_list(struct page *page,
- struct lruvec *lruvec)
- {
- enum lru_list lru = page_lru(page);
-
- + if (lru_gen_add_page(lruvec, page, false))
- + return;
- +
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add(&page->lru, &lruvec->lists[lru]);
- }
- @@ -100,6 +269,9 @@ static __always_inline void add_page_to_
- {
- enum lru_list lru = page_lru(page);
-
- + if (lru_gen_add_page(lruvec, page, true))
- + return;
- +
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add_tail(&page->lru, &lruvec->lists[lru]);
- }
- @@ -107,6 +279,9 @@ static __always_inline void add_page_to_
- static __always_inline void del_page_from_lru_list(struct page *page,
- struct lruvec *lruvec)
- {
- + if (lru_gen_del_page(lruvec, page, false))
- + return;
- +
- list_del(&page->lru);
- update_lru_size(lruvec, page_lru(page), page_zonenum(page),
- -thp_nr_pages(page));
- --- a/include/linux/mmzone.h
- +++ b/include/linux/mmzone.h
- @@ -294,6 +294,102 @@ enum lruvec_flags {
- */
- };
-
- +#endif /* !__GENERATING_BOUNDS_H */
- +
- +/*
- + * Evictable pages are divided into multiple generations. The youngest and the
- + * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
- + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
- + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
- + * corresponding generation. The gen counter in page->flags stores gen+1 while
- + * a page is on one of lrugen->lists[]. Otherwise it stores 0.
- + *
- + * A page is added to the youngest generation on faulting. The aging needs to
- + * check the accessed bit at least twice before handing this page over to the
- + * eviction. The first check takes care of the accessed bit set on the initial
- + * fault; the second check makes sure this page hasn't been used since then.
- + * This process, AKA second chance, requires a minimum of two generations,
- + * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
- + * LRU, e.g., /proc/vmstat, these two generations are considered active; the
- + * rest of generations, if they exist, are considered inactive. See
- + * lru_gen_is_active().
- + *
- + * PG_active is always cleared while a page is on one of lrugen->lists[] so that
- + * the aging needs not to worry about it. And it's set again when a page
- + * considered active is isolated for non-reclaiming purposes, e.g., migration.
- + * See lru_gen_add_page() and lru_gen_del_page().
- + *
- + * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
- + * number of categories of the active/inactive LRU when keeping track of
- + * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
- + * in page->flags.
- + */
- +#define MIN_NR_GENS 2U
- +#define MAX_NR_GENS 4U
- +
- +#ifndef __GENERATING_BOUNDS_H
- +
- +struct lruvec;
- +
- +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
- +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
- +
- +#ifdef CONFIG_LRU_GEN
- +
- +enum {
- + LRU_GEN_ANON,
- + LRU_GEN_FILE,
- +};
- +
- +/*
- + * The youngest generation number is stored in max_seq for both anon and file
- + * types as they are aged on an equal footing. The oldest generation numbers are
- + * stored in min_seq[] separately for anon and file types as clean file pages
- + * can be evicted regardless of swap constraints.
- + *
- + * Normally anon and file min_seq are in sync. But if swapping is constrained,
- + * e.g., out of swap space, file min_seq is allowed to advance and leave anon
- + * min_seq behind.
- + *
- + * The number of pages in each generation is eventually consistent and therefore
- + * can be transiently negative.
- + */
- +struct lru_gen_struct {
- + /* the aging increments the youngest generation number */
- + unsigned long max_seq;
- + /* the eviction increments the oldest generation numbers */
- + unsigned long min_seq[ANON_AND_FILE];
- + /* the multi-gen LRU lists, lazily sorted on eviction */
- + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- + /* the multi-gen LRU sizes, eventually consistent */
- + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- +};
- +
- +void lru_gen_init_lruvec(struct lruvec *lruvec);
- +
- +#ifdef CONFIG_MEMCG
- +void lru_gen_init_memcg(struct mem_cgroup *memcg);
- +void lru_gen_exit_memcg(struct mem_cgroup *memcg);
- +#endif
- +
- +#else /* !CONFIG_LRU_GEN */
- +
- +static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
- +{
- +}
- +
- +#ifdef CONFIG_MEMCG
- +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
- +{
- +}
- +
- +static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
- +{
- +}
- +#endif
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- /* per lruvec lru_lock for memcg */
- @@ -311,6 +407,10 @@ struct lruvec {
- unsigned long refaults[ANON_AND_FILE];
- /* Various lruvec state flags (enum lruvec_flags) */
- unsigned long flags;
- +#ifdef CONFIG_LRU_GEN
- + /* evictable pages divided into generations */
- + struct lru_gen_struct lrugen;
- +#endif
- #ifdef CONFIG_MEMCG
- struct pglist_data *pgdat;
- #endif
- --- a/include/linux/page-flags-layout.h
- +++ b/include/linux/page-flags-layout.h
- @@ -55,7 +55,8 @@
- #define SECTIONS_WIDTH 0
- #endif
-
- -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
- +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
- + <= BITS_PER_LONG - NR_PAGEFLAGS
- #define NODES_WIDTH NODES_SHIFT
- #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
- #error "Vmemmap: No space for nodes field in page flags"
- @@ -89,8 +90,8 @@
- #define LAST_CPUPID_SHIFT 0
- #endif
-
- -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- - <= BITS_PER_LONG - NR_PAGEFLAGS
- +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
- + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
- #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
- #else
- #define LAST_CPUPID_WIDTH 0
- @@ -100,10 +101,12 @@
- #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
- #endif
-
- -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- - > BITS_PER_LONG - NR_PAGEFLAGS
- +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
- + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
- #error "Not enough bits in page flags"
- #endif
-
- +#define LRU_REFS_WIDTH 0
- +
- #endif
- #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
- --- a/include/linux/page-flags.h
- +++ b/include/linux/page-flags.h
- @@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
- 1UL << PG_private | 1UL << PG_private_2 | \
- 1UL << PG_writeback | 1UL << PG_reserved | \
- 1UL << PG_slab | 1UL << PG_active | \
- - 1UL << PG_unevictable | __PG_MLOCKED)
- + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
-
- /*
- * Flags checked when a page is prepped for return by the page allocator.
- @@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
- * alloc-free cycle to prevent from reusing the page.
- */
- #define PAGE_FLAGS_CHECK_AT_PREP \
- - (PAGEFLAGS_MASK & ~__PG_HWPOISON)
- + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
-
- #define PAGE_FLAGS_PRIVATE \
- (1UL << PG_private | 1UL << PG_private_2)
- --- a/include/linux/sched.h
- +++ b/include/linux/sched.h
- @@ -911,6 +911,10 @@ struct task_struct {
- #ifdef CONFIG_MEMCG
- unsigned in_user_fault:1;
- #endif
- +#ifdef CONFIG_LRU_GEN
- + /* whether the LRU algorithm may apply to this access */
- + unsigned in_lru_fault:1;
- +#endif
- #ifdef CONFIG_COMPAT_BRK
- unsigned brk_randomized:1;
- #endif
- --- a/kernel/bounds.c
- +++ b/kernel/bounds.c
- @@ -22,6 +22,11 @@ int main(void)
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
- #endif
- DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
- +#ifdef CONFIG_LRU_GEN
- + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
- +#else
- + DEFINE(LRU_GEN_WIDTH, 0);
- +#endif
- /* End of constants */
-
- return 0;
- --- a/mm/Kconfig
- +++ b/mm/Kconfig
- @@ -897,6 +897,14 @@ config IO_MAPPING
- config SECRETMEM
- def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
-
- +config LRU_GEN
- + bool "Multi-Gen LRU"
- + depends on MMU
- + # make sure page->flags has enough spare bits
- + depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
- + help
- + A high performance LRU implementation to overcommit memory.
- +
- source "mm/damon/Kconfig"
-
- endmenu
- --- a/mm/huge_memory.c
- +++ b/mm/huge_memory.c
- @@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
- #ifdef CONFIG_64BIT
- (1L << PG_arch_2) |
- #endif
- - (1L << PG_dirty)));
- + (1L << PG_dirty) |
- + LRU_GEN_MASK | LRU_REFS_MASK));
-
- /* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- --- a/mm/memcontrol.c
- +++ b/mm/memcontrol.c
- @@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
-
- static void mem_cgroup_free(struct mem_cgroup *memcg)
- {
- + lru_gen_exit_memcg(memcg);
- memcg_wb_domain_exit(memcg);
- __mem_cgroup_free(memcg);
- }
- @@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_all
- memcg->deferred_split_queue.split_queue_len = 0;
- #endif
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
- + lru_gen_init_memcg(memcg);
- return memcg;
- fail:
- mem_cgroup_id_remove(memcg);
- --- a/mm/memory.c
- +++ b/mm/memory.c
- @@ -4792,6 +4792,27 @@ static inline void mm_account_fault(stru
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
- }
-
- +#ifdef CONFIG_LRU_GEN
- +static void lru_gen_enter_fault(struct vm_area_struct *vma)
- +{
- + /* the LRU algorithm doesn't apply to sequential or random reads */
- + current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
- +}
- +
- +static void lru_gen_exit_fault(void)
- +{
- + current->in_lru_fault = false;
- +}
- +#else
- +static void lru_gen_enter_fault(struct vm_area_struct *vma)
- +{
- +}
- +
- +static void lru_gen_exit_fault(void)
- +{
- +}
- +#endif /* CONFIG_LRU_GEN */
- +
- /*
- * By the time we get here, we already hold the mm semaphore
- *
- @@ -4823,11 +4844,15 @@ vm_fault_t handle_mm_fault(struct vm_are
- if (flags & FAULT_FLAG_USER)
- mem_cgroup_enter_user_fault();
-
- + lru_gen_enter_fault(vma);
- +
- if (unlikely(is_vm_hugetlb_page(vma)))
- ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
- else
- ret = __handle_mm_fault(vma, address, flags);
-
- + lru_gen_exit_fault();
- +
- if (flags & FAULT_FLAG_USER) {
- mem_cgroup_exit_user_fault();
- /*
- --- a/mm/mm_init.c
- +++ b/mm/mm_init.c
- @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
-
- shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
- + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
- + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
- SECTIONS_WIDTH,
- NODES_WIDTH,
- ZONES_WIDTH,
- LAST_CPUPID_WIDTH,
- KASAN_TAG_WIDTH,
- + LRU_GEN_WIDTH,
- + LRU_REFS_WIDTH,
- NR_PAGEFLAGS);
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
- --- a/mm/mmzone.c
- +++ b/mm/mmzone.c
- @@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
-
- for_each_lru(lru)
- INIT_LIST_HEAD(&lruvec->lists[lru]);
- +
- + lru_gen_init_lruvec(lruvec);
- }
-
- #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
- --- a/mm/swap.c
- +++ b/mm/swap.c
- @@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
- VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
- + /* see the comment in lru_gen_add_page() */
- + if (lru_gen_enabled() && !PageUnevictable(page) &&
- + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
- + SetPageActive(page);
- +
- get_page(page);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- @@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
-
- static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
- {
- - if (PageActive(page) && !PageUnevictable(page)) {
- + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
- int nr_pages = thp_nr_pages(page);
-
- del_page_from_lru_list(page, lruvec);
- @@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
- */
- void deactivate_page(struct page *page)
- {
- - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- + if (PageLRU(page) && !PageUnevictable(page) &&
- + (PageActive(page) || lru_gen_enabled())) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- --- a/mm/vmscan.c
- +++ b/mm/vmscan.c
- @@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
- return can_demote(pgdat->node_id, sc);
- }
-
- +#ifdef CONFIG_LRU_GEN
- +
- +/******************************************************************************
- + * shorthand helpers
- + ******************************************************************************/
- +
- +#define for_each_gen_type_zone(gen, type, zone) \
- + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
- + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
- + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
- +
- +static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
- +{
- + struct pglist_data *pgdat = NODE_DATA(nid);
- +
- +#ifdef CONFIG_MEMCG
- + if (memcg) {
- + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
- +
- + /* for hotadd_new_pgdat() */
- + if (!lruvec->pgdat)
- + lruvec->pgdat = pgdat;
- +
- + return lruvec;
- + }
- +#endif
- + VM_WARN_ON_ONCE(!mem_cgroup_disabled());
- +
- + return pgdat ? &pgdat->__lruvec : NULL;
- +}
- +
- +/******************************************************************************
- + * initialization
- + ******************************************************************************/
- +
- +void lru_gen_init_lruvec(struct lruvec *lruvec)
- +{
- + int gen, type, zone;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + lrugen->max_seq = MIN_NR_GENS + 1;
- +
- + for_each_gen_type_zone(gen, type, zone)
- + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
- +}
- +
- +#ifdef CONFIG_MEMCG
- +void lru_gen_init_memcg(struct mem_cgroup *memcg)
- +{
- +}
- +
- +void lru_gen_exit_memcg(struct mem_cgroup *memcg)
- +{
- + int nid;
- +
- + for_each_node(nid) {
- + struct lruvec *lruvec = get_lruvec(memcg, nid);
- +
- + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
- + sizeof(lruvec->lrugen.nr_pages)));
- + }
- +}
- +#endif
- +
- +static int __init init_lru_gen(void)
- +{
- + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
- + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
- +
- + return 0;
- +};
- +late_initcall(init_lru_gen);
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- {
- unsigned long nr[NR_LRU_LISTS];
|