- From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
- From: Yu Zhao <[email protected]>
- Date: Mon, 25 Jan 2021 21:12:33 -0700
- Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
- For each lruvec, evictable pages are divided into multiple
- generations. The youngest generation number is stored in
- lrugen->max_seq for both anon and file types as they are aged on an
- equal footing. The oldest generation numbers are stored in
- lrugen->min_seq[] separately for anon and file types as clean file
- pages can be evicted regardless of swap constraints. These three
- variables are monotonically increasing. Generation numbers are
- truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
- page->flags. The sliding window technique is used to prevent truncated
- generation numbers from overlapping. Each truncated generation number
- is an index into
- lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
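- As a minimal illustration (not part of the patch text itself), the sketch
- below shows how a sequence number is truncated into a list index and how
- the counter is stored in page->flags; lru_gen_from_seq() mirrors the
- helper added to include/linux/mm_inline.h, while set_page_gen() is a
- hypothetical, non-atomic simplification of the cmpxchg() loop used by
- lru_gen_add_page():
-
-   /* truncate a sequence number into an index within the sliding window */
-   static inline int lru_gen_from_seq(unsigned long seq)
-   {
-           return seq % MAX_NR_GENS;
-   }
-
-   /* hypothetical helper: page->flags stores gen+1; 0 means not on a gen list */
-   static inline void set_page_gen(struct page *page, int gen)
-   {
-           page->flags &= ~LRU_GEN_MASK;
-           page->flags |= (gen + 1UL) << LRU_GEN_PGOFF;
-   }
-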
- The framework comprises two conceptually independent components: the
- aging, which produces young generations, and the eviction, which
- consumes old generations. Both can be invoked independently from user
- space for the purpose of working set estimation and proactive reclaim.
- The protection of hot pages and the selection of cold pages are based
- on page access types and patterns. There are two access types: one via
- page tables and the other via file descriptors. The protection of the
- former type is by design stronger because:
- 1) The uncertainty in determining the access patterns of the former
- type is higher due to the coalesced nature of the accessed bit.
- 2) The cost of evicting the former type is higher due to the TLB
- flushes required and the likelihood of involving I/O.
- 3) The penalty of under-protecting the former type is higher because
- applications usually do not prepare themselves for major faults like
- they do for blocked I/O. For example, client applications commonly
- dedicate blocked I/O to separate threads to avoid UI janks that
- negatively affect user experience.
- There are also two access patterns: one with temporal locality and the
- other without. The latter pattern, e.g., random and sequential accesses,
- needs to be explicitly excluded to avoid weakening the protection of the
- former pattern. Generally the former type follows the former pattern
- unless MADV_SEQUENTIAL is specified, and the latter type follows the
- latter pattern unless outlying refaults have been observed.
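- For the mapped case, this exclusion boils down to a VM_SEQ_READ check at
- fault time; the sketch below paraphrases the mm/memory.c hunk in this
- patch (handle_fault_sketch() is a made-up wrapper name, and the real code
- also handles hugetlb and memcg accounting):
-
-   /* paraphrase of the fault-path change; handle_fault_sketch() is hypothetical */
-   vm_fault_t handle_fault_sketch(struct vm_area_struct *vma,
-                                  unsigned long address, unsigned int flags)
-   {
-           vm_fault_t ret;
-           /* MADV_SEQUENTIAL sets VM_SEQ_READ; such faults are not tracked */
-           bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
-
-           if (nonseq_fault)
-                   task_enter_nonseq_fault();      /* sets current->in_nonseq_fault */
-
-           ret = __handle_mm_fault(vma, address, flags);
-
-           if (nonseq_fault)
-                   task_exit_nonseq_fault();
-
-           return ret;
-   }
-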
- Upon faulting, a page is added to the youngest generation, which
- provides the strongest protection as the eviction will not consider
- this page before the aging has scanned it at least twice. The first
- scan clears the accessed bit set during the initial fault, and the
- second scan makes sure this page has not been used since then. A page
- from any other generation is brought back to the youngest generation
- whenever the aging finds the accessed bit set on any of the PTEs
- mapping this page.
- Unmapped pages are initially added to the oldest generation and then
- conditionally protected by tiers; tiers are introduced in [PATCH 07/10].
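- To summarize the placement policy described above, the hypothetical
- helper below restates which generation a page starts in; it follows the
- logic of lru_gen_add_page() added by this patch (starting_gen() itself is
- illustration only):
-
-   /* illustration only: restates the policy implemented by lru_gen_add_page() */
-   static int starting_gen(struct page *page, struct lrugen *lrugen, int type)
-   {
-           /* faulted in with the accessed bit set: strongest protection */
-           if (PageActive(page))
-                   return lru_gen_from_seq(lrugen->max_seq);
-
-           /* not evictable right now: anon without swap cache, or writeback pending */
-           if ((!type && !PageSwapCache(page)) ||
-               (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
-                   return lru_gen_from_seq(lrugen->min_seq[type] + 1);
-
-           /* clean and immediately evictable */
-           return lru_gen_from_seq(lrugen->min_seq[type]);
-   }
-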
- Signed-off-by: Yu Zhao <[email protected]>
- Tested-by: Konstantin Kharlamov <[email protected]>
- Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
- ---
- fs/fuse/dev.c | 3 +-
- include/linux/cgroup.h | 15 +-
- include/linux/mm.h | 36 ++++
- include/linux/mm_inline.h | 182 ++++++++++++++++++++
- include/linux/mmzone.h | 70 ++++++++
- include/linux/page-flags-layout.h | 19 ++-
- include/linux/page-flags.h | 4 +-
- include/linux/sched.h | 3 +
- kernel/bounds.c | 3 +
- kernel/cgroup/cgroup-internal.h | 1 -
- mm/huge_memory.c | 3 +-
- mm/memcontrol.c | 1 +
- mm/memory.c | 7 +
- mm/mm_init.c | 6 +-
- mm/page_alloc.c | 1 +
- mm/swap.c | 9 +-
- mm/swapfile.c | 2 +
- mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
- 18 files changed, 618 insertions(+), 15 deletions(-)
- --- a/fs/fuse/dev.c
- +++ b/fs/fuse/dev.c
- @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
- 1 << PG_active |
- 1 << PG_workingset |
- 1 << PG_reclaim |
- - 1 << PG_waiters))) {
- + 1 << PG_waiters |
- + LRU_GEN_MASK | LRU_REFS_MASK))) {
- dump_page(page, "fuse: trying to steal weird page");
- return 1;
- }
- --- a/include/linux/cgroup.h
- +++ b/include/linux/cgroup.h
- @@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
- css_put(&cgrp->self);
- }
-
- +extern struct mutex cgroup_mutex;
- +
- +static inline void cgroup_lock(void)
- +{
- + mutex_lock(&cgroup_mutex);
- +}
- +
- +static inline void cgroup_unlock(void)
- +{
- + mutex_unlock(&cgroup_mutex);
- +}
- +
- /**
- * task_css_set_check - obtain a task's css_set with extra access conditions
- * @task: the task to obtain css_set for
- @@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
- * as locks used during the cgroup_subsys::attach() methods.
- */
- #ifdef CONFIG_PROVE_RCU
- -extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- #define task_css_set_check(task, __c) \
- rcu_dereference_check((task)->cgroups, \
- @@ -708,6 +719,8 @@ struct cgroup;
- static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
- static inline void css_get(struct cgroup_subsys_state *css) {}
- static inline void css_put(struct cgroup_subsys_state *css) {}
- +static inline void cgroup_lock(void) {}
- +static inline void cgroup_unlock(void) {}
- static inline int cgroup_attach_task_all(struct task_struct *from,
- struct task_struct *t) { return 0; }
- static inline int cgroupstats_build(struct cgroupstats *stats,
- --- a/include/linux/mm.h
- +++ b/include/linux/mm.h
- @@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
- #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
- #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
- #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
- +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
- +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
-
- /*
- * Define the bit shifts to access each section. For non-existent
- @@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
- loff_t const holebegin, loff_t const holelen, int even_cows) { }
- #endif
-
- +#ifdef CONFIG_LRU_GEN
- +static inline void task_enter_nonseq_fault(void)
- +{
- + WARN_ON(current->in_nonseq_fault);
- +
- + current->in_nonseq_fault = 1;
- +}
- +
- +static inline void task_exit_nonseq_fault(void)
- +{
- + WARN_ON(!current->in_nonseq_fault);
- +
- + current->in_nonseq_fault = 0;
- +}
- +
- +static inline bool task_in_nonseq_fault(void)
- +{
- + return current->in_nonseq_fault;
- +}
- +#else
- +static inline void task_enter_nonseq_fault(void)
- +{
- +}
- +
- +static inline void task_exit_nonseq_fault(void)
- +{
- +}
- +
- +static inline bool task_in_nonseq_fault(void)
- +{
- + return false;
- +}
- +#endif /* CONFIG_LRU_GEN */
- +
- static inline void unmap_shared_mapping_range(struct address_space *mapping,
- loff_t const holebegin, loff_t const holelen)
- {
- --- a/include/linux/mm_inline.h
- +++ b/include/linux/mm_inline.h
- @@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
- return lru;
- }
-
- +#ifdef CONFIG_LRU_GEN
- +
- +static inline bool lru_gen_enabled(void)
- +{
- +#ifdef CONFIG_LRU_GEN_ENABLED
- + DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
- +
- + return static_branch_likely(&lru_gen_static_key);
- +#else
- + DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
- +
- + return static_branch_unlikely(&lru_gen_static_key);
- +#endif
- +}
- +
- +/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
- +static inline int lru_gen_from_seq(unsigned long seq)
- +{
- + return seq % MAX_NR_GENS;
- +}
- +
- +/* The youngest and the second youngest generations are counted as active. */
- +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
- +{
- + unsigned long max_seq = lruvec->evictable.max_seq;
- +
- + VM_BUG_ON(gen >= MAX_NR_GENS);
- +
- + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
- +}
- +
- +/* Update the sizes of the multigenerational lru lists. */
- +static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
- + int old_gen, int new_gen)
- +{
- + int type = page_is_file_lru(page);
- + int zone = page_zonenum(page);
- + int delta = thp_nr_pages(page);
- + enum lru_list lru = type * LRU_FILE;
- + struct lrugen *lrugen = &lruvec->evictable;
- +
- + lockdep_assert_held(&lruvec->lru_lock);
- + VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
- + VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
- + VM_BUG_ON(old_gen == -1 && new_gen == -1);
- +
- + if (old_gen >= 0)
- + WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
- + lrugen->sizes[old_gen][type][zone] - delta);
- + if (new_gen >= 0)
- + WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
- + lrugen->sizes[new_gen][type][zone] + delta);
- +
- + if (old_gen < 0) {
- + if (lru_gen_is_active(lruvec, new_gen))
- + lru += LRU_ACTIVE;
- + update_lru_size(lruvec, lru, zone, delta);
- + return;
- + }
- +
- + if (new_gen < 0) {
- + if (lru_gen_is_active(lruvec, old_gen))
- + lru += LRU_ACTIVE;
- + update_lru_size(lruvec, lru, zone, -delta);
- + return;
- + }
- +
- + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
- + update_lru_size(lruvec, lru, zone, -delta);
- + update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
- + }
- +
- + VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
- +}
- +
- +/* Add a page to one of the multigenerational lru lists. Return true on success. */
- +static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
- +{
- + int gen;
- + unsigned long old_flags, new_flags;
- + int type = page_is_file_lru(page);
- + int zone = page_zonenum(page);
- + struct lrugen *lrugen = &lruvec->evictable;
- +
- + if (PageUnevictable(page) || !lrugen->enabled[type])
- + return false;
- + /*
- + * If a page shouldn't be considered for eviction, i.e., a page mapped
- + * upon fault during which the accessed bit is set, add it to the
- + * youngest generation.
- + *
- + * If a page can't be evicted immediately, i.e., an anon page not in
- + * swap cache or a dirty page pending writeback, add it to the second
- + * oldest generation.
- + *
- + * If a page could be evicted immediately, e.g., a clean page, add it to
- + * the oldest generation.
- + */
- + if (PageActive(page))
- + gen = lru_gen_from_seq(lrugen->max_seq);
- + else if ((!type && !PageSwapCache(page)) ||
- + (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
- + gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
- + else
- + gen = lru_gen_from_seq(lrugen->min_seq[type]);
- +
- + do {
- + new_flags = old_flags = READ_ONCE(page->flags);
- + VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
- +
- + new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
- + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
- + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
- +
- + lru_gen_update_size(page, lruvec, -1, gen);
- + /* for rotate_reclaimable_page() */
- + if (reclaiming)
- + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
- + else
- + list_add(&page->lru, &lrugen->lists[gen][type][zone]);
- +
- + return true;
- +}
- +
- +/* Delete a page from one of the multigenerational lru lists. Return true on success. */
- +static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
- +{
- + int gen;
- + unsigned long old_flags, new_flags;
- +
- + do {
- + new_flags = old_flags = READ_ONCE(page->flags);
- + if (!(new_flags & LRU_GEN_MASK))
- + return false;
- +
- + VM_BUG_ON_PAGE(PageActive(page), page);
- + VM_BUG_ON_PAGE(PageUnevictable(page), page);
- +
- + gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- +
- + new_flags &= ~LRU_GEN_MASK;
- + /* for shrink_page_list() */
- + if (reclaiming)
- + new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
- + else if (lru_gen_is_active(lruvec, gen))
- + new_flags |= BIT(PG_active);
- + } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
- +
- + lru_gen_update_size(page, lruvec, gen, -1);
- + list_del(&page->lru);
- +
- + return true;
- +}
- +
- +#else
- +
- +static inline bool lru_gen_enabled(void)
- +{
- + return false;
- +}
- +
- +static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
- +{
- + return false;
- +}
- +
- +static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
- +{
- + return false;
- +}
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- static __always_inline void add_page_to_lru_list(struct page *page,
- struct lruvec *lruvec)
- {
- enum lru_list lru = page_lru(page);
-
- + if (lru_gen_add_page(page, lruvec, false))
- + return;
- +
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add(&page->lru, &lruvec->lists[lru]);
- }
- @@ -93,6 +269,9 @@ static __always_inline void add_page_to_
- {
- enum lru_list lru = page_lru(page);
-
- + if (lru_gen_add_page(page, lruvec, true))
- + return;
- +
- update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
- list_add_tail(&page->lru, &lruvec->lists[lru]);
- }
- @@ -100,6 +279,9 @@ static __always_inline void add_page_to_
- static __always_inline void del_page_from_lru_list(struct page *page,
- struct lruvec *lruvec)
- {
- + if (lru_gen_del_page(page, lruvec, false))
- + return;
- +
- list_del(&page->lru);
- update_lru_size(lruvec, page_lru(page), page_zonenum(page),
- -thp_nr_pages(page));
- --- a/include/linux/mmzone.h
- +++ b/include/linux/mmzone.h
- @@ -294,6 +294,72 @@ enum lruvec_flags {
- */
- };
-
- +struct lruvec;
- +
- +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
- +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
- +
- +#ifdef CONFIG_LRU_GEN
- +
- +/*
- + * For each lruvec, evictable pages are divided into multiple generations. The
- + * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
- + * monotonically increasing. The sliding window technique is used to track at
- + * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
- + * window, AKA gen, indexes an array of per-type and per-zone lists for the
- + * corresponding generation. The counter in page->flags stores gen+1 while a
- + * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
- + *
- + * After a page is faulted in, the aging must check the accessed bit at least
- + * twice before the eviction would consider it. The first check clears the
- + * accessed bit set during the initial fault. The second check makes sure this
- + * page hasn't been used since then.
- + */
- +#define MIN_NR_GENS 2
- +#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
- +
- +struct lrugen {
- + /* the aging increments the max generation number */
- + unsigned long max_seq;
- + /* the eviction increments the min generation numbers */
- + unsigned long min_seq[ANON_AND_FILE];
- + /* the birth time of each generation in jiffies */
- + unsigned long timestamps[MAX_NR_GENS];
- + /* the multigenerational lru lists */
- + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- + /* the sizes of the multigenerational lru lists in pages */
- + unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- + /* whether the multigenerational lru is enabled */
- + bool enabled[ANON_AND_FILE];
- +};
- +
- +#define MAX_BATCH_SIZE 8192
- +
- +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
- +void lru_gen_change_state(bool enable, bool main, bool swap);
- +
- +#ifdef CONFIG_MEMCG
- +void lru_gen_init_memcg(struct mem_cgroup *memcg);
- +#endif
- +
- +#else /* !CONFIG_LRU_GEN */
- +
- +static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
- +{
- +}
- +
- +static inline void lru_gen_change_state(bool enable, bool main, bool swap)
- +{
- +}
- +
- +#ifdef CONFIG_MEMCG
- +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
- +{
- +}
- +#endif
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- struct lruvec {
- struct list_head lists[NR_LRU_LISTS];
- /* per lruvec lru_lock for memcg */
- @@ -311,6 +377,10 @@ struct lruvec {
- unsigned long refaults[ANON_AND_FILE];
- /* Various lruvec state flags (enum lruvec_flags) */
- unsigned long flags;
- +#ifdef CONFIG_LRU_GEN
- + /* unevictable pages are on LRU_UNEVICTABLE */
- + struct lrugen evictable;
- +#endif
- #ifdef CONFIG_MEMCG
- struct pglist_data *pgdat;
- #endif
- --- a/include/linux/page-flags-layout.h
- +++ b/include/linux/page-flags-layout.h
- @@ -26,6 +26,14 @@
-
- #define ZONES_WIDTH ZONES_SHIFT
-
- +#ifdef CONFIG_LRU_GEN
- +/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
- +#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
- +#else
- +#define LRU_GEN_WIDTH 0
- +#define LRU_REFS_WIDTH 0
- +#endif /* CONFIG_LRU_GEN */
- +
- #ifdef CONFIG_SPARSEMEM
- #include <asm/sparsemem.h>
- #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
- @@ -55,7 +63,8 @@
- #define SECTIONS_WIDTH 0
- #endif
-
- -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
- +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
- + <= BITS_PER_LONG - NR_PAGEFLAGS
- #define NODES_WIDTH NODES_SHIFT
- #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
- #error "Vmemmap: No space for nodes field in page flags"
- @@ -89,8 +98,8 @@
- #define LAST_CPUPID_SHIFT 0
- #endif
-
- -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- - <= BITS_PER_LONG - NR_PAGEFLAGS
- +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
- + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
- #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
- #else
- #define LAST_CPUPID_WIDTH 0
- @@ -100,8 +109,8 @@
- #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
- #endif
-
- -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- - > BITS_PER_LONG - NR_PAGEFLAGS
- +#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
- + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
- #error "Not enough bits in page flags"
- #endif
-
- --- a/include/linux/page-flags.h
- +++ b/include/linux/page-flags.h
- @@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
- 1UL << PG_private | 1UL << PG_private_2 | \
- 1UL << PG_writeback | 1UL << PG_reserved | \
- 1UL << PG_slab | 1UL << PG_active | \
- - 1UL << PG_unevictable | __PG_MLOCKED)
- + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
-
- /*
- * Flags checked when a page is prepped for return by the page allocator.
- @@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
- * alloc-free cycle to prevent from reusing the page.
- */
- #define PAGE_FLAGS_CHECK_AT_PREP \
- - (PAGEFLAGS_MASK & ~__PG_HWPOISON)
- + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
-
- #define PAGE_FLAGS_PRIVATE \
- (1UL << PG_private | 1UL << PG_private_2)
- --- a/include/linux/sched.h
- +++ b/include/linux/sched.h
- @@ -911,6 +911,9 @@ struct task_struct {
- #ifdef CONFIG_MEMCG
- unsigned in_user_fault:1;
- #endif
- +#ifdef CONFIG_LRU_GEN
- + unsigned in_nonseq_fault:1;
- +#endif
- #ifdef CONFIG_COMPAT_BRK
- unsigned brk_randomized:1;
- #endif
- --- a/kernel/bounds.c
- +++ b/kernel/bounds.c
- @@ -22,6 +22,9 @@ int main(void)
- DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
- #endif
- DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
- +#ifdef CONFIG_LRU_GEN
- + DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
- +#endif
- /* End of constants */
-
- return 0;
- --- a/kernel/cgroup/cgroup-internal.h
- +++ b/kernel/cgroup/cgroup-internal.h
- @@ -165,7 +165,6 @@ struct cgroup_mgctx {
- #define DEFINE_CGROUP_MGCTX(name) \
- struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-
- -extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- extern struct cgroup_subsys *cgroup_subsys[];
- extern struct list_head cgroup_roots;
- --- a/mm/huge_memory.c
- +++ b/mm/huge_memory.c
- @@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
- #ifdef CONFIG_64BIT
- (1L << PG_arch_2) |
- #endif
- - (1L << PG_dirty)));
- + (1L << PG_dirty) |
- + LRU_GEN_MASK | LRU_REFS_MASK));
-
- /* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- --- a/mm/memcontrol.c
- +++ b/mm/memcontrol.c
- @@ -5241,6 +5241,7 @@ static struct mem_cgroup *mem_cgroup_all
- memcg->deferred_split_queue.split_queue_len = 0;
- #endif
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
- + lru_gen_init_memcg(memcg);
- return memcg;
- fail:
- mem_cgroup_id_remove(memcg);
- --- a/mm/memory.c
- +++ b/mm/memory.c
- @@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
- unsigned int flags, struct pt_regs *regs)
- {
- vm_fault_t ret;
- + bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
-
- __set_current_state(TASK_RUNNING);
-
- @@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
- if (flags & FAULT_FLAG_USER)
- mem_cgroup_enter_user_fault();
-
- + if (nonseq_fault)
- + task_enter_nonseq_fault();
- +
- if (unlikely(is_vm_hugetlb_page(vma)))
- ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
- else
- ret = __handle_mm_fault(vma, address, flags);
-
- + if (nonseq_fault)
- + task_exit_nonseq_fault();
- +
- if (flags & FAULT_FLAG_USER) {
- mem_cgroup_exit_user_fault();
- /*
- --- a/mm/mm_init.c
- +++ b/mm/mm_init.c
- @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
-
- shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
- + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
- + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
- SECTIONS_WIDTH,
- NODES_WIDTH,
- ZONES_WIDTH,
- LAST_CPUPID_WIDTH,
- KASAN_TAG_WIDTH,
- + LRU_GEN_WIDTH,
- + LRU_REFS_WIDTH,
- NR_PAGEFLAGS);
- mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
- --- a/mm/page_alloc.c
- +++ b/mm/page_alloc.c
- @@ -7459,6 +7459,7 @@ static void __meminit pgdat_init_interna
-
- pgdat_page_ext_init(pgdat);
- lruvec_init(&pgdat->__lruvec);
- + lru_gen_init_state(NULL, &pgdat->__lruvec);
- }
-
- static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
- --- a/mm/swap.c
- +++ b/mm/swap.c
- @@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
- VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
-
- + /* see the comment in lru_gen_add_page() */
- + if (lru_gen_enabled() && !PageUnevictable(page) &&
- + task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
- + SetPageActive(page);
- +
- get_page(page);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- @@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
-
- static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
- {
- - if (PageActive(page) && !PageUnevictable(page)) {
- + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
- int nr_pages = thp_nr_pages(page);
-
- del_page_from_lru_list(page, lruvec);
- @@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
- */
- void deactivate_page(struct page *page)
- {
- - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- + if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- --- a/mm/swapfile.c
- +++ b/mm/swapfile.c
- @@ -2689,6 +2689,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
- err = 0;
- atomic_inc(&proc_poll_event);
- wake_up_interruptible(&proc_poll_wait);
- + lru_gen_change_state(false, false, true);
-
- out_dput:
- filp_close(victim, NULL);
- @@ -3350,6 +3351,7 @@ SYSCALL_DEFINE2(swapon, const char __use
- mutex_unlock(&swapon_mutex);
- atomic_inc(&proc_poll_event);
- wake_up_interruptible(&proc_poll_wait);
- + lru_gen_change_state(true, false, true);
-
- error = 0;
- goto out;
- --- a/mm/vmscan.c
- +++ b/mm/vmscan.c
- @@ -50,6 +50,7 @@
- #include <linux/printk.h>
- #include <linux/dax.h>
- #include <linux/psi.h>
- +#include <linux/memory.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
- @@ -2815,6 +2816,273 @@ static bool can_age_anon_pages(struct pg
- return can_demote(pgdat->node_id, sc);
- }
-
- +#ifdef CONFIG_LRU_GEN
- +
- +/******************************************************************************
- + * shorthand helpers
- + ******************************************************************************/
- +
- +#define for_each_gen_type_zone(gen, type, zone) \
- + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
- + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
- + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
- +
- +static int page_lru_gen(struct page *page)
- +{
- + unsigned long flags = READ_ONCE(page->flags);
- +
- + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- +}
- +
- +static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
- +{
- + struct pglist_data *pgdat = NODE_DATA(nid);
- +
- +#ifdef CONFIG_MEMCG
- + if (memcg) {
- + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
- +
- + if (lruvec->pgdat != pgdat)
- + lruvec->pgdat = pgdat;
- +
- + return lruvec;
- + }
- +#endif
- + return pgdat ? &pgdat->__lruvec : NULL;
- +}
- +
- +static int get_nr_gens(struct lruvec *lruvec, int type)
- +{
- + return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
- +}
- +
- +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
- +{
- + return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
- + get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
- + get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
- +}
- +
- +/******************************************************************************
- + * state change
- + ******************************************************************************/
- +
- +#ifdef CONFIG_LRU_GEN_ENABLED
- +DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
- +#else
- +DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
- +#endif
- +
- +static int lru_gen_nr_swapfiles;
- +
- +static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
- +{
- + int gen, type, zone;
- + enum lru_list lru;
- + struct lrugen *lrugen = &lruvec->evictable;
- +
- + for_each_evictable_lru(lru) {
- + type = is_file_lru(lru);
- +
- + if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
- + return false;
- + }
- +
- + for_each_gen_type_zone(gen, type, zone) {
- + if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
- + return false;
- +
- + /* unlikely but not a bug when reset_batch_size() is pending */
- + VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
- + }
- +
- + return true;
- +}
- +
- +static bool fill_lists(struct lruvec *lruvec)
- +{
- + enum lru_list lru;
- + int remaining = MAX_BATCH_SIZE;
- +
- + for_each_evictable_lru(lru) {
- + int type = is_file_lru(lru);
- + bool active = is_active_lru(lru);
- + struct list_head *head = &lruvec->lists[lru];
- +
- + if (!lruvec->evictable.enabled[type])
- + continue;
- +
- + while (!list_empty(head)) {
- + bool success;
- + struct page *page = lru_to_page(head);
- +
- + VM_BUG_ON_PAGE(PageTail(page), page);
- + VM_BUG_ON_PAGE(PageUnevictable(page), page);
- + VM_BUG_ON_PAGE(PageActive(page) != active, page);
- + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
- + VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page);
- +
- + prefetchw_prev_lru_page(page, head, flags);
- +
- + del_page_from_lru_list(page, lruvec);
- + success = lru_gen_add_page(page, lruvec, false);
- + VM_BUG_ON(!success);
- +
- + if (!--remaining)
- + return false;
- + }
- + }
- +
- + return true;
- +}
- +
- +static bool drain_lists(struct lruvec *lruvec)
- +{
- + int gen, type, zone;
- + int remaining = MAX_BATCH_SIZE;
- +
- + for_each_gen_type_zone(gen, type, zone) {
- + struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
- +
- + if (lruvec->evictable.enabled[type])
- + continue;
- +
- + while (!list_empty(head)) {
- + bool success;
- + struct page *page = lru_to_page(head);
- +
- + VM_BUG_ON_PAGE(PageTail(page), page);
- + VM_BUG_ON_PAGE(PageUnevictable(page), page);
- + VM_BUG_ON_PAGE(PageActive(page), page);
- + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
- + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
- +
- + prefetchw_prev_lru_page(page, head, flags);
- +
- + success = lru_gen_del_page(page, lruvec, false);
- + VM_BUG_ON(!success);
- + add_page_to_lru_list(page, lruvec);
- +
- + if (!--remaining)
- + return false;
- + }
- + }
- +
- + return true;
- +}
- +
- +/*
- + * For file page tracking, we enable/disable it according to the main switch.
- + * For anon page tracking, we only enable it when the main switch is on and
- + * there is at least one swapfile; we disable it when there are no swapfiles
- + * regardless of the value of the main switch. Otherwise, we will eventually
- + * reach the max size of the sliding window and have to call inc_min_seq().
- + */
- +void lru_gen_change_state(bool enable, bool main, bool swap)
- +{
- + static DEFINE_MUTEX(state_mutex);
- +
- + struct mem_cgroup *memcg;
- +
- + mem_hotplug_begin();
- + cgroup_lock();
- + mutex_lock(&state_mutex);
- +
- + if (swap) {
- + if (enable)
- + swap = !lru_gen_nr_swapfiles++;
- + else
- + swap = !--lru_gen_nr_swapfiles;
- + }
- +
- + if (main && enable != lru_gen_enabled()) {
- + if (enable)
- + static_branch_enable(&lru_gen_static_key);
- + else
- + static_branch_disable(&lru_gen_static_key);
- + } else if (!swap || !lru_gen_enabled())
- + goto unlock;
- +
- + memcg = mem_cgroup_iter(NULL, NULL, NULL);
- + do {
- + int nid;
- +
- + for_each_node(nid) {
- + struct lruvec *lruvec = get_lruvec(nid, memcg);
- +
- + if (!lruvec)
- + continue;
- +
- + spin_lock_irq(&lruvec->lru_lock);
- +
- + VM_BUG_ON(!seq_is_valid(lruvec));
- + VM_BUG_ON(!state_is_valid(lruvec));
- +
- + lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
- + lruvec->evictable.enabled[1] = lru_gen_enabled();
- +
- + while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
- + spin_unlock_irq(&lruvec->lru_lock);
- + cond_resched();
- + spin_lock_irq(&lruvec->lru_lock);
- + }
- +
- + spin_unlock_irq(&lruvec->lru_lock);
- + }
- +
- + cond_resched();
- + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- +unlock:
- + mutex_unlock(&state_mutex);
- + cgroup_unlock();
- + mem_hotplug_done();
- +}
- +
- +/******************************************************************************
- + * initialization
- + ******************************************************************************/
- +
- +void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
- +{
- + int i;
- + int gen, type, zone;
- + struct lrugen *lrugen = &lruvec->evictable;
- +
- + lrugen->max_seq = MIN_NR_GENS + 1;
- + lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
- + lrugen->enabled[1] = lru_gen_enabled();
- +
- + for (i = 0; i <= MIN_NR_GENS + 1; i++)
- + lrugen->timestamps[i] = jiffies;
- +
- + for_each_gen_type_zone(gen, type, zone)
- + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
- +}
- +
- +#ifdef CONFIG_MEMCG
- +void lru_gen_init_memcg(struct mem_cgroup *memcg)
- +{
- + int nid;
- +
- + for_each_node(nid) {
- + struct lruvec *lruvec = get_lruvec(nid, memcg);
- +
- + lru_gen_init_state(memcg, lruvec);
- + }
- +}
- +#endif
- +
- +static int __init init_lru_gen(void)
- +{
- + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
- + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
- +
- + return 0;
- +};
- +late_initcall(init_lru_gen);
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- {
- unsigned long nr[NR_LRU_LISTS];