- From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001
- From: Yu Zhao <[email protected]>
- Date: Sun, 18 Sep 2022 02:00:04 -0600
- Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- Searching the rmap for PTEs mapping each page on an LRU list (to test and
- clear the accessed bit) can be expensive because pages from different VMAs
- (PA space) are not cache friendly to the rmap (VA space). For workloads
- mostly using mapped pages, searching the rmap can incur the highest CPU
- cost in the reclaim path.
- This patch exploits spatial locality to reduce the trips into the rmap.
- When shrink_page_list() walks the rmap and finds a young PTE, a new
- function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent
- PTEs. On finding another young PTE, it clears the accessed bit and
- updates the gen counter of the page mapped by this PTE to
- (max_seq%MAX_NR_GENS)+1.
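
To make the mechanism above concrete, here is a minimal userspace sketch of the
look-around idea only; it is not the kernel code added by this patch (that code
is in the mm/vmscan.c hunk below), and sim_pte, look_around() and BATCH are
hypothetical names used purely for illustration:

    /*
     * Sketch: once one PTE in a window is found young, scan at most BATCH-1
     * neighbours, clear their accessed bits and tag the mapped pages with the
     * youngest generation, (max_seq % MAX_NR_GENS) + 1.
     */
    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_NR_GENS 4
    #define BATCH       64      /* stand-in for BITS_PER_LONG */

    struct sim_pte {
            bool young;         /* simulated accessed bit */
            int gen;            /* generation of the mapped page */
    };

    static int look_around(struct sim_pte *ptes, int nr, int hit,
                           unsigned long max_seq)
    {
            int new_gen = (int)(max_seq % MAX_NR_GENS) + 1;
            int start = hit - BATCH / 2 > 0 ? hit - BATCH / 2 : 0;
            int end = start + BATCH < nr ? start + BATCH : nr;
            int promoted = 0;

            for (int i = start; i < end; i++) {
                    if (i == hit || !ptes[i].young)
                            continue;

                    ptes[i].young = false;  /* clear the accessed bit */
                    ptes[i].gen = new_gen;  /* promote to the youngest gen */
                    promoted++;
            }
            return promoted;
    }

    int main(void)
    {
            struct sim_pte ptes[128] = { 0 };

            ptes[40].young = ptes[41].young = ptes[70].young = true;
            printf("promoted %d pages near PTE 40\n",
                   look_around(ptes, 128, 40, 7));
            return 0;
    }

Bounding the window to one batch of PTEs caps the extra work a single rmap hit
can trigger; the kswapd profiles below show the payoff, with the
page_vma_mapped_walk() overhead dropping while lru_gen_look_around() itself
stays under 2%.
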
- Server benchmark results:
- Single workload:
- fio (buffered I/O): no change
- Single workload:
- memcached (anon): +[3, 5]%
-             Ops/sec      KB/sec
-   patch1-6: 1106168.46   43025.04
-   patch1-7: 1147696.57   44640.29
- Configurations:
- no change
- Client benchmark results:
- kswapd profiles:
- patch1-6
- 39.03% lzo1x_1_do_compress (real work)
- 18.47% page_vma_mapped_walk (overhead)
- 6.74% _raw_spin_unlock_irq
- 3.97% do_raw_spin_lock
- 2.49% ptep_clear_flush
- 2.48% anon_vma_interval_tree_iter_first
- 1.92% page_referenced_one
- 1.88% __zram_bvec_write
- 1.48% memmove
- 1.31% vma_interval_tree_iter_next
- patch1-7
- 48.16% lzo1x_1_do_compress (real work)
- 8.20% page_vma_mapped_walk (overhead)
- 7.06% _raw_spin_unlock_irq
- 2.92% ptep_clear_flush
- 2.53% __zram_bvec_write
- 2.11% do_raw_spin_lock
- 2.02% memmove
- 1.93% lru_gen_look_around
- 1.56% free_unref_page_list
- 1.40% memset
- Configurations:
- no change
- Link: https://lkml.kernel.org/r/[email protected]
- Signed-off-by: Yu Zhao <[email protected]>
- Acked-by: Barry Song <[email protected]>
- Acked-by: Brian Geffon <[email protected]>
- Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
- Acked-by: Oleksandr Natalenko <[email protected]>
- Acked-by: Steven Barrett <[email protected]>
- Acked-by: Suleiman Souhlal <[email protected]>
- Tested-by: Daniel Byrne <[email protected]>
- Tested-by: Donald Carr <[email protected]>
- Tested-by: Holger Hoffstätte <[email protected]>
- Tested-by: Konstantin Kharlamov <[email protected]>
- Tested-by: Shuang Zhai <[email protected]>
- Tested-by: Sofia Trinh <[email protected]>
- Tested-by: Vaibhav Jain <[email protected]>
- Cc: Andi Kleen <[email protected]>
- Cc: Aneesh Kumar K.V <[email protected]>
- Cc: Catalin Marinas <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: Hillf Danton <[email protected]>
- Cc: Jens Axboe <[email protected]>
- Cc: Johannes Weiner <[email protected]>
- Cc: Jonathan Corbet <[email protected]>
- Cc: Linus Torvalds <[email protected]>
- Cc: Matthew Wilcox <[email protected]>
- Cc: Mel Gorman <[email protected]>
- Cc: Miaohe Lin <[email protected]>
- Cc: Michael Larabel <[email protected]>
- Cc: Michal Hocko <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Peter Zijlstra <[email protected]>
- Cc: Qi Zheng <[email protected]>
- Cc: Tejun Heo <[email protected]>
- Cc: Vlastimil Babka <[email protected]>
- Cc: Will Deacon <[email protected]>
- Signed-off-by: Andrew Morton <[email protected]>
- ---
- include/linux/memcontrol.h | 31 +++++++
- include/linux/mmzone.h | 6 ++
- mm/internal.h | 1 +
- mm/memcontrol.c | 1 +
- mm/rmap.c | 7 ++
- mm/swap.c | 4 +-
- mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
- 7 files changed, 232 insertions(+), 2 deletions(-)
- --- a/include/linux/memcontrol.h
- +++ b/include/linux/memcontrol.h
- @@ -447,6 +447,7 @@ static inline struct obj_cgroup *__page_
- * - LRU isolation
- * - lock_page_memcg()
- * - exclusive reference
- + * - mem_cgroup_trylock_pages()
- *
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
- @@ -502,6 +503,7 @@ static inline struct mem_cgroup *page_me
- * - LRU isolation
- * - lock_page_memcg()
- * - exclusive reference
- + * - mem_cgroup_trylock_pages()
- *
- * For a kmem page a caller should hold an rcu read lock to protect memcg
- * associated with a kmem page from being released.
- @@ -958,6 +960,23 @@ void unlock_page_memcg(struct page *page
-
- void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
-
- +/* try to stabilize page_memcg() for all the pages in a memcg */
- +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
- +{
- + rcu_read_lock();
- +
- + if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
- + return true;
- +
- + rcu_read_unlock();
- + return false;
- +}
- +
- +static inline void mem_cgroup_unlock_pages(void)
- +{
- + rcu_read_unlock();
- +}
- +
- /* idx can be of type enum memcg_stat_item or node_stat_item */
- static inline void mod_memcg_state(struct mem_cgroup *memcg,
- int idx, int val)
- @@ -1374,6 +1393,18 @@ static inline void unlock_page_memcg(str
- {
- }
-
- +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
- +{
- + /* to match page_memcg_rcu() */
- + rcu_read_lock();
- + return true;
- +}
- +
- +static inline void mem_cgroup_unlock_pages(void)
- +{
- + rcu_read_unlock();
- +}
- +
- static inline void mem_cgroup_handle_over_high(void)
- {
- }
- --- a/include/linux/mmzone.h
- +++ b/include/linux/mmzone.h
- @@ -352,6 +352,7 @@ enum lruvec_flags {
- #ifndef __GENERATING_BOUNDS_H
-
- struct lruvec;
- +struct page_vma_mapped_walk;
-
- #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
- #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
- @@ -407,6 +408,7 @@ struct lru_gen_struct {
- };
-
- void lru_gen_init_lruvec(struct lruvec *lruvec);
- +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
-
- #ifdef CONFIG_MEMCG
- void lru_gen_init_memcg(struct mem_cgroup *memcg);
- @@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(s
- {
- }
-
- +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
- +{
- +}
- +
- #ifdef CONFIG_MEMCG
- static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
- {
- --- a/mm/internal.h
- +++ b/mm/internal.h
- @@ -35,6 +35,7 @@
- void page_writeback_init(void);
-
- vm_fault_t do_swap_page(struct vm_fault *vmf);
- +void activate_page(struct page *page);
-
- void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
- --- a/mm/memcontrol.c
- +++ b/mm/memcontrol.c
- @@ -2798,6 +2798,7 @@ static void commit_charge(struct page *p
- * - LRU isolation
- * - lock_page_memcg()
- * - exclusive reference
- + * - mem_cgroup_trylock_pages()
- */
- page->memcg_data = (unsigned long)memcg;
- }
- --- a/mm/rmap.c
- +++ b/mm/rmap.c
- @@ -73,6 +73,7 @@
- #include <linux/page_idle.h>
- #include <linux/memremap.h>
- #include <linux/userfaultfd_k.h>
- +#include <linux/mm_inline.h>
-
- #include <asm/tlbflush.h>
-
- @@ -793,6 +794,12 @@ static bool page_referenced_one(struct p
- }
-
- if (pvmw.pte) {
- + if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
- + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
- + lru_gen_look_around(&pvmw);
- + referenced++;
- + }
- +
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- --- a/mm/swap.c
- +++ b/mm/swap.c
- @@ -325,7 +325,7 @@ static bool need_activate_page_drain(int
- return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
- }
-
- -static void activate_page(struct page *page)
- +void activate_page(struct page *page)
- {
- page = compound_head(page);
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- @@ -345,7 +345,7 @@ static inline void activate_page_drain(i
- {
- }
-
- -static void activate_page(struct page *page)
- +void activate_page(struct page *page)
- {
- struct lruvec *lruvec;
-
- --- a/mm/vmscan.c
- +++ b/mm/vmscan.c
- @@ -1409,6 +1409,11 @@ retry:
- if (!sc->may_unmap && page_mapped(page))
- goto keep_locked;
-
- + /* page_update_gen() tried to promote this page? */
- + if (lru_gen_enabled() && !ignore_references &&
- + page_mapped(page) && PageReferenced(page))
- + goto keep_locked;
- +
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-
- @@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctr
- * the aging
- ******************************************************************************/
-
- +/* promote pages accessed through page tables */
- +static int page_update_gen(struct page *page, int gen)
- +{
- + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
- +
- + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
- + VM_WARN_ON_ONCE(!rcu_read_lock_held());
- +
- + do {
- + /* lru_gen_del_page() has isolated this page? */
- + if (!(old_flags & LRU_GEN_MASK)) {
- + /* for shrink_page_list() */
- + new_flags = old_flags | BIT(PG_referenced);
- + continue;
- + }
- +
- + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
- + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
- +
- + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- +}
- +
- /* protect pages accessed multiple times through file descriptors */
- static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
- {
- @@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *l
- VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
-
- do {
- + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
- + /* page_update_gen() has promoted this page? */
- + if (new_gen >= 0 && new_gen != old_gen)
- + return new_gen;
- +
- new_gen = (old_gen + 1) % MAX_NR_GENS;
-
- new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- @@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *l
- return new_gen;
- }
-
- +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
- +{
- + unsigned long pfn = pte_pfn(pte);
- +
- + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
- +
- + if (!pte_present(pte) || is_zero_pfn(pfn))
- + return -1;
- +
- + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
- + return -1;
- +
- + if (WARN_ON_ONCE(!pfn_valid(pfn)))
- + return -1;
- +
- + return pfn;
- +}
- +
- +static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
- + struct pglist_data *pgdat)
- +{
- + struct page *page;
- +
- + /* try to avoid unnecessary memory loads */
- + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
- + return NULL;
- +
- + page = compound_head(pfn_to_page(pfn));
- + if (page_to_nid(page) != pgdat->node_id)
- + return NULL;
- +
- + if (page_memcg_rcu(page) != memcg)
- + return NULL;
- +
- + return page;
- +}
- +
- static void inc_min_seq(struct lruvec *lruvec, int type)
- {
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
- @@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pgli
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- }
-
- +/*
- + * This function exploits spatial locality when shrink_page_list() walks the
- + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
- + */
- +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
- +{
- + int i;
- + pte_t *pte;
- + unsigned long start;
- + unsigned long end;
- + unsigned long addr;
- + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
- + struct page *page = pvmw->page;
- + struct mem_cgroup *memcg = page_memcg(page);
- + struct pglist_data *pgdat = page_pgdat(page);
- + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- + DEFINE_MAX_SEQ(lruvec);
- + int old_gen, new_gen = lru_gen_from_seq(max_seq);
- +
- + lockdep_assert_held(pvmw->ptl);
- + VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
- +
- + if (spin_is_contended(pvmw->ptl))
- + return;
- +
- + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
- + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
- +
- + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
- + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
- + end = start + MIN_LRU_BATCH * PAGE_SIZE;
- + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
- + start = end - MIN_LRU_BATCH * PAGE_SIZE;
- + else {
- + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
- + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
- + }
- + }
- +
- + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
- +
- + rcu_read_lock();
- + arch_enter_lazy_mmu_mode();
- +
- + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
- + unsigned long pfn;
- +
- + pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
- + if (pfn == -1)
- + continue;
- +
- + if (!pte_young(pte[i]))
- + continue;
- +
- + page = get_pfn_page(pfn, memcg, pgdat);
- + if (!page)
- + continue;
- +
- + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
- + VM_WARN_ON_ONCE(true);
- +
- + if (pte_dirty(pte[i]) && !PageDirty(page) &&
- + !(PageAnon(page) && PageSwapBacked(page) &&
- + !PageSwapCache(page)))
- + set_page_dirty(page);
- +
- + old_gen = page_lru_gen(page);
- + if (old_gen < 0)
- + SetPageReferenced(page);
- + else if (old_gen != new_gen)
- + __set_bit(i, bitmap);
- + }
- +
- + arch_leave_lazy_mmu_mode();
- + rcu_read_unlock();
- +
- + if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
- + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
- + page = pte_page(pte[i]);
- + activate_page(page);
- + }
- + return;
- + }
- +
- + /* page_update_gen() requires stable page_memcg() */
- + if (!mem_cgroup_trylock_pages(memcg))
- + return;
- +
- + spin_lock_irq(&lruvec->lru_lock);
- + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
- +
- + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
- + page = compound_head(pte_page(pte[i]));
- + if (page_memcg_rcu(page) != memcg)
- + continue;
- +
- + old_gen = page_update_gen(page, new_gen);
- + if (old_gen < 0 || old_gen == new_gen)
- + continue;
- +
- + lru_gen_update_size(lruvec, page, old_gen, new_gen);
- + }
- +
- + spin_unlock_irq(&lruvec->lru_lock);
- +
- + mem_cgroup_unlock_pages();
- +}
- +
- /******************************************************************************
- * the eviction
- ******************************************************************************/
- @@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lru
- return true;
- }
-
- + /* promoted */
- + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
- + list_move(&page->lru, &lrugen->lists[gen][type][zone]);
- + return true;
- + }
- +
- /* protected */
- if (tier > tier_idx) {
- int hist = lru_hist_from_seq(lrugen->min_seq[type]);