- From 48c916b812652f9453be5bd45a703728926d41ca Mon Sep 17 00:00:00 2001
- From: "T.J. Alumbaugh" <[email protected]>
- Date: Wed, 18 Jan 2023 00:18:24 +0000
- Subject: [PATCH 15/19] UPSTREAM: mm: multi-gen LRU: section for memcg LRU
- Move memcg LRU code into a dedicated section. Improve the design doc to
- outline its architecture.
- Link: https://lkml.kernel.org/r/[email protected]
- Change-Id: Id252e420cff7a858acb098cf2b3642da5c40f602
- Signed-off-by: T.J. Alumbaugh <[email protected]>
- Cc: Yu Zhao <[email protected]>
- Signed-off-by: Andrew Morton <[email protected]>
- (cherry picked from commit 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842)
- Bug: 274865848
- Signed-off-by: T.J. Mercier <[email protected]>
- ---
- Documentation/mm/multigen_lru.rst | 33 +++-
- include/linux/mm_inline.h | 17 --
- include/linux/mmzone.h | 13 +-
- mm/memcontrol.c | 8 +-
- mm/vmscan.c | 250 +++++++++++++++++-------------
- 5 files changed, 178 insertions(+), 143 deletions(-)
- --- a/Documentation/mm/multigen_lru.rst
- +++ b/Documentation/mm/multigen_lru.rst
- @@ -186,9 +186,40 @@ is false positive, the cost is an additi
- which may yield hot pages anyway. Parameters of the filter itself can
- control the false positive rate in the limit.
-
- +Memcg LRU
- +---------
- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
- +since each node and memcg combination has an LRU of folios (see
- +``mem_cgroup_lruvec()``). Its goal is to improve the scalability of
- +global reclaim, which is critical to system-wide memory overcommit in
- +data centers. Note that memcg LRU only applies to global reclaim.
- +
- +The basic structure of an memcg LRU can be understood by an analogy to
- +the active/inactive LRU (of folios):
- +
- +1. It has the young and the old (generations), i.e., the counterparts
- + to the active and the inactive;
- +2. The increment of ``max_seq`` triggers promotion, i.e., the
- + counterpart to activation;
- +3. Other events trigger similar operations, e.g., offlining an memcg
- + triggers demotion, i.e., the counterpart to deactivation.
- +
- +In terms of global reclaim, it has two distinct features:
- +
- +1. Sharding, which allows each thread to start at a random memcg (in
- + the old generation) and improves parallelism;
- +2. Eventual fairness, which allows direct reclaim to bail out at will
- + and reduces latency without affecting fairness over some time.
- +
- +In terms of traversing memcgs during global reclaim, it improves the
- +best-case complexity from O(n) to O(1) and does not affect the
- +worst-case complexity O(n). Therefore, on average, it has a sublinear
- +complexity.
- +
- Summary
- -------
- -The multi-gen LRU can be disassembled into the following parts:
- +The multi-gen LRU (of folios) can be disassembled into the following
- +parts:
-
- * Generations
- * Rmap walks
- --- a/include/linux/mm_inline.h
- +++ b/include/linux/mm_inline.h
- @@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void
- return current->in_lru_fault;
- }
-
- -#ifdef CONFIG_MEMCG
- -static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
- -{
- - return READ_ONCE(lruvec->lrugen.seg);
- -}
- -#else
- -static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
- -{
- - return 0;
- -}
- -#endif
- -
- static inline int lru_gen_from_seq(unsigned long seq)
- {
- return seq % MAX_NR_GENS;
- @@ -314,11 +302,6 @@ static inline bool lru_gen_in_fault(void
- return false;
- }
-
- -static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
- -{
- - return 0;
- -}
- -
- static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
- {
- return false;
- --- a/include/linux/mmzone.h
- +++ b/include/linux/mmzone.h
- @@ -368,15 +368,6 @@ struct page_vma_mapped_walk;
- #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
- #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
-
- -/* see the comment on MEMCG_NR_GENS */
- -enum {
- - MEMCG_LRU_NOP,
- - MEMCG_LRU_HEAD,
- - MEMCG_LRU_TAIL,
- - MEMCG_LRU_OLD,
- - MEMCG_LRU_YOUNG,
- -};
- -
- #ifdef CONFIG_LRU_GEN
-
- enum {
- @@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgrou
- void lru_gen_online_memcg(struct mem_cgroup *memcg);
- void lru_gen_offline_memcg(struct mem_cgroup *memcg);
- void lru_gen_release_memcg(struct mem_cgroup *memcg);
- -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
- +void lru_gen_soft_reclaim(struct lruvec *lruvec);
-
- #else /* !CONFIG_MEMCG */
-
- @@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg
- {
- }
-
- -static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
- +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
- {
- }
-
- --- a/mm/memcontrol.c
- +++ b/mm/memcontrol.c
- @@ -478,12 +478,8 @@ static void mem_cgroup_update_tree(struc
- struct mem_cgroup_tree_per_node *mctz;
-
- if (lru_gen_enabled()) {
- - struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
- -
- - /* see the comment on MEMCG_NR_GENS */
- - if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
- - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
- -
- + if (soft_limit_excess(memcg))
- + lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
- return;
- }
-
- --- a/mm/vmscan.c
- +++ b/mm/vmscan.c
- @@ -4692,6 +4692,148 @@ void lru_gen_look_around(struct page_vma
- }
-
- /******************************************************************************
- + * memcg LRU
- + ******************************************************************************/
- +
- +/* see the comment on MEMCG_NR_GENS */
- +enum {
- + MEMCG_LRU_NOP,
- + MEMCG_LRU_HEAD,
- + MEMCG_LRU_TAIL,
- + MEMCG_LRU_OLD,
- + MEMCG_LRU_YOUNG,
- +};
- +
- +#ifdef CONFIG_MEMCG
- +
- +static int lru_gen_memcg_seg(struct lruvec *lruvec)
- +{
- + return READ_ONCE(lruvec->lrugen.seg);
- +}
- +
- +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
- +{
- + int seg;
- + int old, new;
- + int bin = get_random_u32_below(MEMCG_NR_BINS);
- + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- +
- + spin_lock(&pgdat->memcg_lru.lock);
- +
- + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
- +
- + seg = 0;
- + new = old = lruvec->lrugen.gen;
- +
- + /* see the comment on MEMCG_NR_GENS */
- + if (op == MEMCG_LRU_HEAD)
- + seg = MEMCG_LRU_HEAD;
- + else if (op == MEMCG_LRU_TAIL)
- + seg = MEMCG_LRU_TAIL;
- + else if (op == MEMCG_LRU_OLD)
- + new = get_memcg_gen(pgdat->memcg_lru.seq);
- + else if (op == MEMCG_LRU_YOUNG)
- + new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
- + else
- + VM_WARN_ON_ONCE(true);
- +
- + hlist_nulls_del_rcu(&lruvec->lrugen.list);
- +
- + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
- + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
- + else
- + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
- +
- + pgdat->memcg_lru.nr_memcgs[old]--;
- + pgdat->memcg_lru.nr_memcgs[new]++;
- +
- + lruvec->lrugen.gen = new;
- + WRITE_ONCE(lruvec->lrugen.seg, seg);
- +
- + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
- + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- +
- + spin_unlock(&pgdat->memcg_lru.lock);
- +}
- +
- +void lru_gen_online_memcg(struct mem_cgroup *memcg)
- +{
- + int gen;
- + int nid;
- + int bin = get_random_u32_below(MEMCG_NR_BINS);
- +
- + for_each_node(nid) {
- + struct pglist_data *pgdat = NODE_DATA(nid);
- + struct lruvec *lruvec = get_lruvec(memcg, nid);
- +
- + spin_lock(&pgdat->memcg_lru.lock);
- +
- + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
- +
- + gen = get_memcg_gen(pgdat->memcg_lru.seq);
- +
- + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
- + pgdat->memcg_lru.nr_memcgs[gen]++;
- +
- + lruvec->lrugen.gen = gen;
- +
- + spin_unlock(&pgdat->memcg_lru.lock);
- + }
- +}
- +
- +void lru_gen_offline_memcg(struct mem_cgroup *memcg)
- +{
- + int nid;
- +
- + for_each_node(nid) {
- + struct lruvec *lruvec = get_lruvec(memcg, nid);
- +
- + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
- + }
- +}
- +
- +void lru_gen_release_memcg(struct mem_cgroup *memcg)
- +{
- + int gen;
- + int nid;
- +
- + for_each_node(nid) {
- + struct pglist_data *pgdat = NODE_DATA(nid);
- + struct lruvec *lruvec = get_lruvec(memcg, nid);
- +
- + spin_lock(&pgdat->memcg_lru.lock);
- +
- + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
- +
- + gen = lruvec->lrugen.gen;
- +
- + hlist_nulls_del_rcu(&lruvec->lrugen.list);
- + pgdat->memcg_lru.nr_memcgs[gen]--;
- +
- + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
- + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- +
- + spin_unlock(&pgdat->memcg_lru.lock);
- + }
- +}
- +
- +void lru_gen_soft_reclaim(struct lruvec *lruvec)
- +{
- + /* see the comment on MEMCG_NR_GENS */
- + if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
- + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
- +}
- +
- +#else /* !CONFIG_MEMCG */
- +
- +static int lru_gen_memcg_seg(struct lruvec *lruvec)
- +{
- + return 0;
- +}
- +
- +#endif
- +
- +/******************************************************************************
- * the eviction
- ******************************************************************************/
-
- @@ -5398,53 +5540,6 @@ done:
- pgdat->kswapd_failures = 0;
- }
-
- -#ifdef CONFIG_MEMCG
- -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
- -{
- - int seg;
- - int old, new;
- - int bin = get_random_u32_below(MEMCG_NR_BINS);
- - struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- -
- - spin_lock(&pgdat->memcg_lru.lock);
- -
- - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
- -
- - seg = 0;
- - new = old = lruvec->lrugen.gen;
- -
- - /* see the comment on MEMCG_NR_GENS */
- - if (op == MEMCG_LRU_HEAD)
- - seg = MEMCG_LRU_HEAD;
- - else if (op == MEMCG_LRU_TAIL)
- - seg = MEMCG_LRU_TAIL;
- - else if (op == MEMCG_LRU_OLD)
- - new = get_memcg_gen(pgdat->memcg_lru.seq);
- - else if (op == MEMCG_LRU_YOUNG)
- - new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
- - else
- - VM_WARN_ON_ONCE(true);
- -
- - hlist_nulls_del_rcu(&lruvec->lrugen.list);
- -
- - if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
- - hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
- - else
- - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
- -
- - pgdat->memcg_lru.nr_memcgs[old]--;
- - pgdat->memcg_lru.nr_memcgs[new]++;
- -
- - lruvec->lrugen.gen = new;
- - WRITE_ONCE(lruvec->lrugen.seg, seg);
- -
- - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
- - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- -
- - spin_unlock(&pgdat->memcg_lru.lock);
- -}
- -#endif
- -
- /******************************************************************************
- * state change
- ******************************************************************************/
- @@ -6090,67 +6185,6 @@ void lru_gen_exit_memcg(struct mem_cgrou
- }
- }
-
- -void lru_gen_online_memcg(struct mem_cgroup *memcg)
- -{
- - int gen;
- - int nid;
- - int bin = get_random_u32_below(MEMCG_NR_BINS);
- -
- - for_each_node(nid) {
- - struct pglist_data *pgdat = NODE_DATA(nid);
- - struct lruvec *lruvec = get_lruvec(memcg, nid);
- -
- - spin_lock(&pgdat->memcg_lru.lock);
- -
- - VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
- -
- - gen = get_memcg_gen(pgdat->memcg_lru.seq);
- -
- - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
- - pgdat->memcg_lru.nr_memcgs[gen]++;
- -
- - lruvec->lrugen.gen = gen;
- -
- - spin_unlock(&pgdat->memcg_lru.lock);
- - }
- -}
- -
- -void lru_gen_offline_memcg(struct mem_cgroup *memcg)
- -{
- - int nid;
- -
- - for_each_node(nid) {
- - struct lruvec *lruvec = get_lruvec(memcg, nid);
- -
- - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
- - }
- -}
- -
- -void lru_gen_release_memcg(struct mem_cgroup *memcg)
- -{
- - int gen;
- - int nid;
- -
- - for_each_node(nid) {
- - struct pglist_data *pgdat = NODE_DATA(nid);
- - struct lruvec *lruvec = get_lruvec(memcg, nid);
- -
- - spin_lock(&pgdat->memcg_lru.lock);
- -
- - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
- -
- - gen = lruvec->lrugen.gen;
- -
- - hlist_nulls_del_rcu(&lruvec->lrugen.list);
- - pgdat->memcg_lru.nr_memcgs[gen]--;
- -
- - if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
- - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
- -
- - spin_unlock(&pgdat->memcg_lru.lock);
- - }
- -}
- -
- #endif /* CONFIG_MEMCG */
-
- static int __init init_lru_gen(void)