- From b564b9471cd60ef1ee3961a224898ce4a9620d84 Mon Sep 17 00:00:00 2001
- From: Yu Zhao <[email protected]>
- Date: Sun, 18 Sep 2022 02:00:03 -0600
- Subject: [PATCH 06/29] mm: multi-gen LRU: minimal implementation
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- To avoid confusion, the terms "promotion" and "demotion" will be applied
- to the multi-gen LRU, as a new convention; the terms "activation" and
- "deactivation" will be applied to the active/inactive LRU, as usual.
- The aging produces young generations. Given an lruvec, it increments
- max_seq when max_seq-min_seq+1 approaches MIN_NR_GENS. The aging promotes
- hot pages to the youngest generation when it finds them accessed through
- page tables; the demotion of cold pages happens consequently when it
- increments max_seq. Promotion in the aging path does not involve any LRU
- list operations, only the updates of the gen counter and
- lrugen->nr_pages[]; demotion, unless it is the result of the increment of
- max_seq, requires LRU list operations, e.g., lru_deactivate_fn(). The
- aging has the complexity O(nr_hot_pages), since it is only interested in
- hot pages.
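-
- For illustration only, not part of the patch: a minimal userspace model of
- the generation window described above, assuming MAX_NR_GENS=4 and
- MIN_NR_GENS=2 (the values this patch adds to mmzone.h below). It shows how
- max_seq-min_seq+1 bounds the window and how a sequence number maps to a
- ring index.
-
- #include <stdio.h>
-
- #define MAX_NR_GENS 4UL
- #define MIN_NR_GENS 2UL
-
- /* ring index of a generation, as in lru_gen_from_seq() */
- static unsigned long lru_gen_from_seq(unsigned long seq)
- {
- 	return seq % MAX_NR_GENS;
- }
-
- int main(void)
- {
- 	unsigned long min_seq = 0, max_seq = 1;
-
- 	while (max_seq < 8) {
- 		unsigned long nr_gens = max_seq - min_seq + 1;
-
- 		if (nr_gens <= MIN_NR_GENS)
- 			max_seq++;	/* the aging produces a younger generation */
- 		else
- 			min_seq++;	/* the eviction consumes the oldest one */
-
- 		printf("min_seq=%lu max_seq=%lu oldest ring index=%lu\n",
- 		       min_seq, max_seq, lru_gen_from_seq(min_seq));
- 	}
- 	return 0;
- }
-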
- The eviction consumes old generations. Given an lruvec, it increments
- min_seq when lrugen->lists[] indexed by min_seq%MAX_NR_GENS becomes empty.
- A feedback loop modeled after the PID controller monitors refaults over
- anon and file types and decides which type to evict when both types are
- available from the same generation.
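-
- For illustration only, not part of the patch: the comparison this feedback
- loop performs, modeled on positive_ctrl_err() defined later in this patch;
- the refault counts below are made up.
-
- #include <stdbool.h>
- #include <stdio.h>
-
- #define MIN_LRU_BATCH 64	/* BITS_PER_LONG on 64-bit */
-
- struct ctrl_pos {
- 	unsigned long refaulted;	/* the P term numerator */
- 	unsigned long total;		/* evicted + protected */
- 	int gain;
- };
-
- /* the same comparison as positive_ctrl_err() below */
- static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
- {
- 	return pv->refaulted < MIN_LRU_BATCH ||
- 	       pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
- 	       (sp->refaulted + 1) * pv->total * pv->gain;
- }
-
- int main(void)
- {
- 	/* made-up samples: file pages refault less often than anon pages */
- 	struct ctrl_pos anon = { .refaulted = 500, .total = 1000, .gain = 100 };
- 	struct ctrl_pos file = { .refaulted = 100, .total = 1000, .gain = 100 };
-
- 	/* SP=anon, PV=file: a positive error means evicting file is cheaper */
- 	printf("evict file first: %d\n", positive_ctrl_err(&anon, &file));
- 	return 0;
- }
-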
- The protection of pages accessed multiple times through file descriptors
- takes place in the eviction path. Each generation is divided into
- multiple tiers. A page accessed N times through file descriptors is in
- tier order_base_2(N), as sketched after the list below. Tiers do not have
- dedicated lrugen->lists[], only
- bits in page->flags. The aforementioned feedback loop also monitors
- refaults over all tiers and decides when to protect pages in which tiers
- (N>1), using the first tier (N=0,1) as a baseline. The first tier
- contains single-use unmapped clean pages, which are most likely the best
- choices. In contrast to promotion in the aging path, the protection of a
- page in the eviction path is achieved by moving this page to the next
- generation, i.e., min_seq+1, if the feedback loop decides so. This
- approach has the following advantages:
- 1. It removes the cost of activation in the buffered access path by
- inferring whether pages accessed multiple times through file
- descriptors are statistically hot and thus worth protecting in the
- eviction path.
- 2. It takes pages accessed through page tables into account and avoids
- overprotecting pages accessed multiple times through file
- descriptors. (Pages accessed through page tables are in the first
- tier, since N=0.)
- 3. More tiers provide better protection for pages accessed more than
- twice through file descriptors, when under heavy buffered I/O
- workloads.
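-
- For illustration only, not part of the patch: the tier mapping sketched
- below mirrors lru_tier_from_refs() added by this patch, where refs counts
- the accesses beyond the first, i.e., refs = N-1 for N>=1.
-
- #include <stdio.h>
-
- /* smallest order with (1 << order) >= n; matches order_base_2() for n >= 1 */
- static int order_base_2(unsigned int n)
- {
- 	int order = 0;
-
- 	while ((1U << order) < n)
- 		order++;
- 	return order;
- }
-
- static int lru_tier_from_refs(int refs)
- {
- 	return order_base_2(refs + 1);
- }
-
- int main(void)
- {
- 	for (int n = 0; n <= 8; n++) {
- 		int refs = n > 1 ? n - 1 : 0;	/* N=0,1 share the first tier */
-
- 		printf("N=%d accesses -> tier %d\n", n, lru_tier_from_refs(refs));
- 	}
- 	return 0;
- }
-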
- Server benchmark results:
- Single workload:
- fio (buffered I/O): +[30, 32]%
- IOPS BW
- 5.19-rc1: 2673k 10.2GiB/s
- patch1-6: 3491k 13.3GiB/s
- Single workload:
- memcached (anon): -[4, 6]%
- Ops/sec KB/sec
- 5.19-rc1: 1161501.04 45177.25
- patch1-6: 1106168.46 43025.04
- Configurations:
- CPU: two Xeon 6154
- Mem: total 256G
- Node 1 was only used as a ram disk to reduce the variance in the
- results.
- patch drivers/block/brd.c <<EOF
- 99,100c99,100
- < gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
- < page = alloc_page(gfp_flags);
- ---
- > gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM | __GFP_THISNODE;
- > page = alloc_pages_node(1, gfp_flags, 0);
- EOF
- cat >>/etc/systemd/system.conf <<EOF
- CPUAffinity=numa
- NUMAPolicy=bind
- NUMAMask=0
- EOF
- cat >>/etc/memcached.conf <<EOF
- -m 184320
- -s /var/run/memcached/memcached.sock
- -a 0766
- -t 36
- -B binary
- EOF
- cat fio.sh
- modprobe brd rd_nr=1 rd_size=113246208
- swapoff -a
- mkfs.ext4 /dev/ram0
- mount -t ext4 /dev/ram0 /mnt
- mkdir /sys/fs/cgroup/user.slice/test
- echo 38654705664 >/sys/fs/cgroup/user.slice/test/memory.max
- echo $$ >/sys/fs/cgroup/user.slice/test/cgroup.procs
- fio -name=mglru --numjobs=72 --directory=/mnt --size=1408m \
- --buffered=1 --ioengine=io_uring --iodepth=128 \
- --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
- --rw=randread --random_distribution=random --norandommap \
- --time_based --ramp_time=10m --runtime=5m --group_reporting
- cat memcached.sh
- modprobe brd rd_nr=1 rd_size=113246208
- swapoff -a
- mkswap /dev/ram0
- swapon /dev/ram0
- memtier_benchmark -S /var/run/memcached/memcached.sock \
- -P memcache_binary -n allkeys --key-minimum=1 \
- --key-maximum=65000000 --key-pattern=P:P -c 1 -t 36 \
- --ratio 1:0 --pipeline 8 -d 2000
- memtier_benchmark -S /var/run/memcached/memcached.sock \
- -P memcache_binary -n allkeys --key-minimum=1 \
- --key-maximum=65000000 --key-pattern=R:R -c 1 -t 36 \
- --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
- Client benchmark results:
- kswapd profiles:
- 5.19-rc1
- 40.33% page_vma_mapped_walk (overhead)
- 21.80% lzo1x_1_do_compress (real work)
- 7.53% do_raw_spin_lock
- 3.95% _raw_spin_unlock_irq
- 2.52% vma_interval_tree_iter_next
- 2.37% page_referenced_one
- 2.28% vma_interval_tree_subtree_search
- 1.97% anon_vma_interval_tree_iter_first
- 1.60% ptep_clear_flush
- 1.06% __zram_bvec_write
- patch1-6
- 39.03% lzo1x_1_do_compress (real work)
- 18.47% page_vma_mapped_walk (overhead)
- 6.74% _raw_spin_unlock_irq
- 3.97% do_raw_spin_lock
- 2.49% ptep_clear_flush
- 2.48% anon_vma_interval_tree_iter_first
- 1.92% page_referenced_one
- 1.88% __zram_bvec_write
- 1.48% memmove
- 1.31% vma_interval_tree_iter_next
- Configurations:
- CPU: single Snapdragon 7c
- Mem: total 4G
- ChromeOS MemoryPressure [1]
- [1] https://chromium.googlesource.com/chromiumos/platform/tast-tests/
- Link: https://lkml.kernel.org/r/[email protected]
- Signed-off-by: Yu Zhao <[email protected]>
- Acked-by: Brian Geffon <[email protected]>
- Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
- Acked-by: Oleksandr Natalenko <[email protected]>
- Acked-by: Steven Barrett <[email protected]>
- Acked-by: Suleiman Souhlal <[email protected]>
- Tested-by: Daniel Byrne <[email protected]>
- Tested-by: Donald Carr <[email protected]>
- Tested-by: Holger Hoffstätte <[email protected]>
- Tested-by: Konstantin Kharlamov <[email protected]>
- Tested-by: Shuang Zhai <[email protected]>
- Tested-by: Sofia Trinh <[email protected]>
- Tested-by: Vaibhav Jain <[email protected]>
- Cc: Andi Kleen <[email protected]>
- Cc: Aneesh Kumar K.V <[email protected]>
- Cc: Barry Song <[email protected]>
- Cc: Catalin Marinas <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: Hillf Danton <[email protected]>
- Cc: Jens Axboe <[email protected]>
- Cc: Johannes Weiner <[email protected]>
- Cc: Jonathan Corbet <[email protected]>
- Cc: Linus Torvalds <[email protected]>
- Cc: Matthew Wilcox <[email protected]>
- Cc: Mel Gorman <[email protected]>
- Cc: Miaohe Lin <[email protected]>
- Cc: Michael Larabel <[email protected]>
- Cc: Michal Hocko <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Mike Rapoport <[email protected]>
- Cc: Peter Zijlstra <[email protected]>
- Cc: Qi Zheng <[email protected]>
- Cc: Tejun Heo <[email protected]>
- Cc: Vlastimil Babka <[email protected]>
- Cc: Will Deacon <[email protected]>
- Signed-off-by: Andrew Morton <[email protected]>
- ---
- include/linux/mm_inline.h | 36 ++
- include/linux/mmzone.h | 41 ++
- include/linux/page-flags-layout.h | 5 +-
- kernel/bounds.c | 2 +
- mm/Kconfig | 11 +
- mm/swap.c | 39 ++
- mm/vmscan.c | 792 +++++++++++++++++++++++++++++-
- mm/workingset.c | 110 ++++-
- 8 files changed, 1025 insertions(+), 11 deletions(-)
- --- a/include/linux/mm_inline.h
- +++ b/include/linux/mm_inline.h
- @@ -106,6 +106,33 @@ static inline int lru_gen_from_seq(unsig
- return seq % MAX_NR_GENS;
- }
-
- +static inline int lru_hist_from_seq(unsigned long seq)
- +{
- + return seq % NR_HIST_GENS;
- +}
- +
- +static inline int lru_tier_from_refs(int refs)
- +{
- + VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));
- +
- + /* see the comment in page_lru_refs() */
- + return order_base_2(refs + 1);
- +}
- +
- +static inline int page_lru_refs(struct page *page)
- +{
- + unsigned long flags = READ_ONCE(page->flags);
- + bool workingset = flags & BIT(PG_workingset);
- +
- + /*
- + * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
- + * total number of accesses is N>1, since N=0,1 both map to the first
- + * tier. lru_tier_from_refs() will account for this off-by-one. Also see
- + * the comment on MAX_NR_TIERS.
- + */
- + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
- +}
- +
- static inline int page_lru_gen(struct page *page)
- {
- unsigned long flags = READ_ONCE(page->flags);
- @@ -158,6 +185,15 @@ static inline void lru_gen_update_size(s
- __update_lru_size(lruvec, lru, zone, -delta);
- return;
- }
- +
- + /* promotion */
- + if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
- + __update_lru_size(lruvec, lru, zone, -delta);
- + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
- + }
- +
- + /* demotion requires isolation, e.g., lru_deactivate_fn() */
- + VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
- }
-
- static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
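-
- For illustration only, not part of the patch: a userspace model of the
- promotion branch in lru_gen_update_size() above, assuming the two youngest
- generations count as "active" in the compat view, which is how
- lru_gen_is_active() treats them.
-
- #include <stdio.h>
-
- #define MAX_NR_GENS 4UL
-
- /* compat view for one type/zone: [0] inactive, [1] active */
- static long lru_size[2];
-
- static int gen_is_active(int gen, unsigned long max_seq)
- {
- 	return gen == (int)(max_seq % MAX_NR_GENS) ||
- 	       gen == (int)((max_seq - 1) % MAX_NR_GENS);
- }
-
- /* mirrors the promotion branch of lru_gen_update_size() */
- static void promote(int old_gen, int new_gen, unsigned long max_seq, long delta)
- {
- 	if (!gen_is_active(old_gen, max_seq) && gen_is_active(new_gen, max_seq)) {
- 		lru_size[0] -= delta;
- 		lru_size[1] += delta;
- 	}
- }
-
- int main(void)
- {
- 	unsigned long max_seq = 5;
-
- 	lru_size[0] = 100;
- 	/* one page moves from the oldest generation (seq 2) to the youngest */
- 	promote(2 % MAX_NR_GENS, max_seq % MAX_NR_GENS, max_seq, 1);
- 	printf("inactive=%ld active=%ld\n", lru_size[0], lru_size[1]);
- 	return 0;
- }
-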
- --- a/include/linux/mmzone.h
- +++ b/include/linux/mmzone.h
- @@ -327,6 +327,28 @@ enum lruvec_flags {
- #define MIN_NR_GENS 2U
- #define MAX_NR_GENS 4U
-
- +/*
- + * Each generation is divided into multiple tiers. A page accessed N times
- + * through file descriptors is in tier order_base_2(N). A page in the first tier
- + * (N=0,1) is marked by PG_referenced unless it was faulted in through page
- + * tables or read ahead. A page in any other tier (N>1) is marked by
- + * PG_referenced and PG_workingset. This implies a minimum of two tiers is
- + * supported without using additional bits in page->flags.
- + *
- + * In contrast to moving across generations which requires the LRU lock, moving
- + * across tiers only involves atomic operations on page->flags and therefore
- + * has a negligible cost in the buffered access path. In the eviction path,
- + * comparisons of refaulted/(evicted+protected) from the first tier and the
- + * rest infer whether pages accessed multiple times through file descriptors
- + * are statistically hot and thus worth protecting.
- + *
- + * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
- + * number of categories of the active/inactive LRU when keeping track of
- + * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
- + * page->flags.
- + */
- +#define MAX_NR_TIERS 4U
- +
- #ifndef __GENERATING_BOUNDS_H
-
- struct lruvec;
- @@ -341,6 +363,16 @@ enum {
- LRU_GEN_FILE,
- };
-
- +#define MIN_LRU_BATCH BITS_PER_LONG
- +#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
- +
- +/* whether to keep historical stats from evicted generations */
- +#ifdef CONFIG_LRU_GEN_STATS
- +#define NR_HIST_GENS MAX_NR_GENS
- +#else
- +#define NR_HIST_GENS 1U
- +#endif
- +
- /*
- * The youngest generation number is stored in max_seq for both anon and file
- * types as they are aged on an equal footing. The oldest generation numbers are
- @@ -363,6 +395,15 @@ struct lru_gen_struct {
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- /* the multi-gen LRU sizes, eventually consistent */
- long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
- + /* the exponential moving average of refaulted */
- + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
- + /* the exponential moving average of evicted+protected */
- + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
- + /* the first tier doesn't need protection, hence the minus one */
- + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
- + /* can be modified without holding the LRU lock */
- + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
- + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
- };
-
- void lru_gen_init_lruvec(struct lruvec *lruvec);
- --- a/include/linux/page-flags-layout.h
- +++ b/include/linux/page-flags-layout.h
- @@ -106,7 +106,10 @@
- #error "Not enough bits in page flags"
- #endif
-
- -#define LRU_REFS_WIDTH 0
- +/* see the comment on MAX_NR_TIERS */
- +#define LRU_REFS_WIDTH min(__LRU_REFS_WIDTH, BITS_PER_LONG - NR_PAGEFLAGS - \
- + ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \
- + NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH)
-
- #endif
- #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
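-
- For illustration only, not part of the patch: how the bit budget above
- works out, using example widths for one possible 64-bit configuration;
- the actual widths depend on the kernel config.
-
- #include <stdio.h>
-
- /* example widths; config-dependent in a real kernel */
- #define BITS_PER_LONG		64
- #define NR_PAGEFLAGS		24
- #define ZONES_WIDTH		3
- #define LRU_GEN_WIDTH		3	/* order_base_2(MAX_NR_GENS + 1) */
- #define SECTIONS_WIDTH		0
- #define NODES_WIDTH		10
- #define KASAN_TAG_WIDTH		0
- #define LAST_CPUPID_WIDTH	0
- #define __LRU_REFS_WIDTH	2	/* MAX_NR_TIERS - 2 */
-
- #define MIN(a, b)		((a) < (b) ? (a) : (b))
-
- int main(void)
- {
- 	int spare = BITS_PER_LONG - NR_PAGEFLAGS - ZONES_WIDTH -
- 		    LRU_GEN_WIDTH - SECTIONS_WIDTH - NODES_WIDTH -
- 		    KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH;
-
- 	printf("spare bits=%d LRU_REFS_WIDTH=%d\n",
- 	       spare, MIN(__LRU_REFS_WIDTH, spare));
- 	return 0;
- }
-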
- --- a/kernel/bounds.c
- +++ b/kernel/bounds.c
- @@ -24,8 +24,10 @@ int main(void)
- DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
- #ifdef CONFIG_LRU_GEN
- DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
- + DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2);
- #else
- DEFINE(LRU_GEN_WIDTH, 0);
- + DEFINE(__LRU_REFS_WIDTH, 0);
- #endif
- /* End of constants */
-
- --- a/mm/Kconfig
- +++ b/mm/Kconfig
- @@ -897,6 +897,7 @@ config IO_MAPPING
- config SECRETMEM
- def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
-
- +# multi-gen LRU {
- config LRU_GEN
- bool "Multi-Gen LRU"
- depends on MMU
- @@ -905,6 +906,16 @@ config LRU_GEN
- help
- A high performance LRU implementation to overcommit memory.
-
- +config LRU_GEN_STATS
- + bool "Full stats for debugging"
- + depends on LRU_GEN
- + help
- + Do not enable this option unless you plan to look at historical stats
- + from evicted generations for debugging purposes.
- +
- + This option has a per-memcg and per-node memory overhead.
- +# }
- +
- source "mm/damon/Kconfig"
-
- endmenu
- --- a/mm/swap.c
- +++ b/mm/swap.c
- @@ -389,6 +389,40 @@ static void __lru_cache_activate_page(st
- local_unlock(&lru_pvecs.lock);
- }
-
- +#ifdef CONFIG_LRU_GEN
- +static void page_inc_refs(struct page *page)
- +{
- + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
- +
- + if (PageUnevictable(page))
- + return;
- +
- + if (!PageReferenced(page)) {
- + SetPageReferenced(page);
- + return;
- + }
- +
- + if (!PageWorkingset(page)) {
- + SetPageWorkingset(page);
- + return;
- + }
- +
- + /* see the comment on MAX_NR_TIERS */
- + do {
- + new_flags = old_flags & LRU_REFS_MASK;
- + if (new_flags == LRU_REFS_MASK)
- + break;
- +
- + new_flags += BIT(LRU_REFS_PGOFF);
- + new_flags |= old_flags & ~LRU_REFS_MASK;
- + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
- +}
- +#else
- +static void page_inc_refs(struct page *page)
- +{
- +}
- +#endif /* CONFIG_LRU_GEN */
- +
- /*
- * Mark a page as having seen activity.
- *
- @@ -403,6 +437,11 @@ void mark_page_accessed(struct page *pag
- {
- page = compound_head(page);
-
- + if (lru_gen_enabled()) {
- + page_inc_refs(page);
- + return;
- + }
- +
- if (!PageReferenced(page)) {
- SetPageReferenced(page);
- } else if (PageUnevictable(page)) {
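-
- For illustration only, not part of the patch: a userspace model of the
- saturating cmpxchg loop in page_inc_refs() above, using C11 atomics and
- example bit positions for the flag fields.
-
- #include <stdatomic.h>
- #include <stdio.h>
-
- #define PG_referenced	(1UL << 0)	/* example bit positions */
- #define PG_workingset	(1UL << 1)
- #define LRU_REFS_PGOFF	2
- #define LRU_REFS_MASK	(3UL << LRU_REFS_PGOFF)	/* a 2-bit refs field */
-
- static _Atomic unsigned long flags;
-
- /* mirrors page_inc_refs(): referenced, then workingset, then saturate */
- static void page_inc_refs(void)
- {
- 	unsigned long old_flags = atomic_load(&flags);
- 	unsigned long new_flags;
-
- 	if (!(old_flags & PG_referenced)) {
- 		atomic_fetch_or(&flags, PG_referenced);
- 		return;
- 	}
- 	if (!(old_flags & PG_workingset)) {
- 		atomic_fetch_or(&flags, PG_workingset);
- 		return;
- 	}
- 	do {
- 		new_flags = old_flags & LRU_REFS_MASK;
- 		if (new_flags == LRU_REFS_MASK)	/* saturated */
- 			break;
- 		new_flags += 1UL << LRU_REFS_PGOFF;
- 		new_flags |= old_flags & ~LRU_REFS_MASK;
- 	} while (!atomic_compare_exchange_weak(&flags, &old_flags, new_flags));
- }
-
- int main(void)
- {
- 	for (int i = 0; i < 8; i++)
- 		page_inc_refs();
- 	printf("refs field=%lu\n",
- 	       (atomic_load(&flags) & LRU_REFS_MASK) >> LRU_REFS_PGOFF);
- 	return 0;
- }
-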
- --- a/mm/vmscan.c
- +++ b/mm/vmscan.c
- @@ -1142,9 +1142,11 @@ static int __remove_mapping(struct addre
-
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
- - mem_cgroup_swapout(page, swap);
- +
- + /* get a shadow entry before mem_cgroup_swapout() clears page_memcg() */
- if (reclaimed && !mapping_exiting(mapping))
- shadow = workingset_eviction(page, target_memcg);
- + mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page, swap, shadow);
- xa_unlock_irq(&mapping->i_pages);
- put_swap_page(page, swap);
- @@ -2502,6 +2504,9 @@ static void prepare_scan_count(pg_data_t
- unsigned long file;
- struct lruvec *target_lruvec;
-
- + if (lru_gen_enabled())
- + return;
- +
- target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
-
- /*
- @@ -2827,6 +2832,17 @@ static bool can_age_anon_pages(struct pg
- * shorthand helpers
- ******************************************************************************/
-
- +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
- +
- +#define DEFINE_MAX_SEQ(lruvec) \
- + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
- +
- +#define DEFINE_MIN_SEQ(lruvec) \
- + unsigned long min_seq[ANON_AND_FILE] = { \
- + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
- + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
- + }
- +
- #define for_each_gen_type_zone(gen, type, zone) \
- for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
- for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
- @@ -2852,6 +2868,745 @@ static struct lruvec __maybe_unused *get
- return pgdat ? &pgdat->__lruvec : NULL;
- }
-
- +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
- +{
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- +
- + if (!can_demote(pgdat->node_id, sc) &&
- + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
- + return 0;
- +
- + return mem_cgroup_swappiness(memcg);
- +}
- +
- +static int get_nr_gens(struct lruvec *lruvec, int type)
- +{
- + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
- +}
- +
- +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
- +{
- + /* see the comment on lru_gen_struct */
- + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
- + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
- + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
- +}
- +
- +/******************************************************************************
- + * refault feedback loop
- + ******************************************************************************/
- +
- +/*
- + * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
- + *
- + * The P term is refaulted/(evicted+protected) from a tier in the generation
- + * currently being evicted; the I term is the exponential moving average of the
- + * P term over the generations previously evicted, using the smoothing factor
- + * 1/2; the D term isn't supported.
- + *
- + * The setpoint (SP) is always the first tier of one type; the process variable
- + * (PV) is either any tier of the other type or any other tier of the same
- + * type.
- + *
- + * The error is the difference between the SP and the PV; the correction is to
- + * turn off protection when SP>PV or turn on protection when SP<PV.
- + *
- + * For future optimizations:
- + * 1. The D term may discount the other two terms over time so that long-lived
- + * generations can resist stale information.
- + */
- +struct ctrl_pos {
- + unsigned long refaulted;
- + unsigned long total;
- + int gain;
- +};
- +
- +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
- + struct ctrl_pos *pos)
- +{
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- +
- + pos->refaulted = lrugen->avg_refaulted[type][tier] +
- + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- + pos->total = lrugen->avg_total[type][tier] +
- + atomic_long_read(&lrugen->evicted[hist][type][tier]);
- + if (tier)
- + pos->total += lrugen->protected[hist][type][tier - 1];
- + pos->gain = gain;
- +}
- +
- +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
- +{
- + int hist, tier;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
- + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
- +
- + lockdep_assert_held(&lruvec->lru_lock);
- +
- + if (!carryover && !clear)
- + return;
- +
- + hist = lru_hist_from_seq(seq);
- +
- + for (tier = 0; tier < MAX_NR_TIERS; tier++) {
- + if (carryover) {
- + unsigned long sum;
- +
- + sum = lrugen->avg_refaulted[type][tier] +
- + atomic_long_read(&lrugen->refaulted[hist][type][tier]);
- + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
- +
- + sum = lrugen->avg_total[type][tier] +
- + atomic_long_read(&lrugen->evicted[hist][type][tier]);
- + if (tier)
- + sum += lrugen->protected[hist][type][tier - 1];
- + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
- + }
- +
- + if (clear) {
- + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
- + atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
- + if (tier)
- + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
- + }
- + }
- +}
- +
- +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
- +{
- + /*
- + * Return true if the PV has a limited number of refaults or a lower
- + * refaulted/total than the SP.
- + */
- + return pv->refaulted < MIN_LRU_BATCH ||
- + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
- + (sp->refaulted + 1) * pv->total * pv->gain;
- +}
- +
- +/******************************************************************************
- + * the aging
- + ******************************************************************************/
- +
- +/* protect pages accessed multiple times through file descriptors */
- +static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
- +{
- + int type = page_is_file_lru(page);
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
- + unsigned long new_flags, old_flags = READ_ONCE(page->flags);
- +
- + VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
- +
- + do {
- + new_gen = (old_gen + 1) % MAX_NR_GENS;
- +
- + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
- + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
- + /* for end_page_writeback() */
- + if (reclaiming)
- + new_flags |= BIT(PG_reclaim);
- + } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
- +
- + lru_gen_update_size(lruvec, page, old_gen, new_gen);
- +
- + return new_gen;
- +}
- +
- +static void inc_min_seq(struct lruvec *lruvec, int type)
- +{
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + reset_ctrl_pos(lruvec, type, true);
- + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
- +}
- +
- +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
- +{
- + int gen, type, zone;
- + bool success = false;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- + DEFINE_MIN_SEQ(lruvec);
- +
- + VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
- +
- + /* find the oldest populated generation */
- + for (type = !can_swap; type < ANON_AND_FILE; type++) {
- + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
- + gen = lru_gen_from_seq(min_seq[type]);
- +
- + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- + if (!list_empty(&lrugen->lists[gen][type][zone]))
- + goto next;
- + }
- +
- + min_seq[type]++;
- + }
- +next:
- + ;
- + }
- +
- + /* see the comment on lru_gen_struct */
- + if (can_swap) {
- + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
- + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
- + }
- +
- + for (type = !can_swap; type < ANON_AND_FILE; type++) {
- + if (min_seq[type] == lrugen->min_seq[type])
- + continue;
- +
- + reset_ctrl_pos(lruvec, type, true);
- + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
- + success = true;
- + }
- +
- + return success;
- +}
- +
- +static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
- +{
- + int prev, next;
- + int type, zone;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + spin_lock_irq(&lruvec->lru_lock);
- +
- + VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
- +
- + if (max_seq != lrugen->max_seq)
- + goto unlock;
- +
- + for (type = ANON_AND_FILE - 1; type >= 0; type--) {
- + if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
- + continue;
- +
- + VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
- +
- + inc_min_seq(lruvec, type);
- + }
- +
- + /*
- + * Update the active/inactive LRU sizes for compatibility. Both sides of
- + * the current max_seq need to be covered, since max_seq+1 can overlap
- + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
- + * overlap, cold/hot inversion happens.
- + */
- + prev = lru_gen_from_seq(lrugen->max_seq - 1);
- + next = lru_gen_from_seq(lrugen->max_seq + 1);
- +
- + for (type = 0; type < ANON_AND_FILE; type++) {
- + for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- + enum lru_list lru = type * LRU_INACTIVE_FILE;
- + long delta = lrugen->nr_pages[prev][type][zone] -
- + lrugen->nr_pages[next][type][zone];
- +
- + if (!delta)
- + continue;
- +
- + __update_lru_size(lruvec, lru, zone, delta);
- + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
- + }
- + }
- +
- + for (type = 0; type < ANON_AND_FILE; type++)
- + reset_ctrl_pos(lruvec, type, false);
- +
- + /* make sure preceding modifications appear */
- + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
- +unlock:
- + spin_unlock_irq(&lruvec->lru_lock);
- +}
- +
- +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
- + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
- +{
- + int gen, type, zone;
- + unsigned long old = 0;
- + unsigned long young = 0;
- + unsigned long total = 0;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- +
- + for (type = !can_swap; type < ANON_AND_FILE; type++) {
- + unsigned long seq;
- +
- + for (seq = min_seq[type]; seq <= max_seq; seq++) {
- + unsigned long size = 0;
- +
- + gen = lru_gen_from_seq(seq);
- +
- + for (zone = 0; zone < MAX_NR_ZONES; zone++)
- + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
- +
- + total += size;
- + if (seq == max_seq)
- + young += size;
- + else if (seq + MIN_NR_GENS == max_seq)
- + old += size;
- + }
- + }
- +
- + /* try to scrape all its memory if this memcg was deleted */
- + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
- +
- + /*
- + * The aging tries to be lazy to reduce the overhead, while the eviction
- + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- + * ideal number of generations is MIN_NR_GENS+1.
- + */
- + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
- + return true;
- + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- + return false;
- +
- + /*
- + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- + * of the total number of pages for each generation. A reasonable range
- + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- + * aging cares about the upper bound of hot pages, while the eviction
- + * cares about the lower bound of cold pages.
- + */
- + if (young * MIN_NR_GENS > total)
- + return true;
- + if (old * (MIN_NR_GENS + 2) < total)
- + return true;
- +
- + return false;
- +}
- +
- +static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- +{
- + bool need_aging;
- + unsigned long nr_to_scan;
- + int swappiness = get_swappiness(lruvec, sc);
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- + DEFINE_MAX_SEQ(lruvec);
- + DEFINE_MIN_SEQ(lruvec);
- +
- + VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
- +
- + mem_cgroup_calculate_protection(NULL, memcg);
- +
- + if (mem_cgroup_below_min(memcg))
- + return;
- +
- + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
- + if (need_aging)
- + inc_max_seq(lruvec, max_seq, swappiness);
- +}
- +
- +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
- +{
- + struct mem_cgroup *memcg;
- +
- + VM_WARN_ON_ONCE(!current_is_kswapd());
- +
- + memcg = mem_cgroup_iter(NULL, NULL, NULL);
- + do {
- + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- +
- + age_lruvec(lruvec, sc);
- +
- + cond_resched();
- + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- +}
- +
- +/******************************************************************************
- + * the eviction
- + ******************************************************************************/
- +
- +static bool sort_page(struct lruvec *lruvec, struct page *page, int tier_idx)
- +{
- + bool success;
- + int gen = page_lru_gen(page);
- + int type = page_is_file_lru(page);
- + int zone = page_zonenum(page);
- + int delta = thp_nr_pages(page);
- + int refs = page_lru_refs(page);
- + int tier = lru_tier_from_refs(refs);
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- +
- + VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);
- +
- + /* unevictable */
- + if (!page_evictable(page)) {
- + success = lru_gen_del_page(lruvec, page, true);
- + VM_WARN_ON_ONCE_PAGE(!success, page);
- + SetPageUnevictable(page);
- + add_page_to_lru_list(page, lruvec);
- + __count_vm_events(UNEVICTABLE_PGCULLED, delta);
- + return true;
- + }
- +
- + /* dirty lazyfree */
- + if (type == LRU_GEN_FILE && PageAnon(page) && PageDirty(page)) {
- + success = lru_gen_del_page(lruvec, page, true);
- + VM_WARN_ON_ONCE_PAGE(!success, page);
- + SetPageSwapBacked(page);
- + add_page_to_lru_list_tail(page, lruvec);
- + return true;
- + }
- +
- + /* protected */
- + if (tier > tier_idx) {
- + int hist = lru_hist_from_seq(lrugen->min_seq[type]);
- +
- + gen = page_inc_gen(lruvec, page, false);
- + list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
- +
- + WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
- + lrugen->protected[hist][type][tier - 1] + delta);
- + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
- + return true;
- + }
- +
- + /* waiting for writeback */
- + if (PageLocked(page) || PageWriteback(page) ||
- + (type == LRU_GEN_FILE && PageDirty(page))) {
- + gen = page_inc_gen(lruvec, page, true);
- + list_move(&page->lru, &lrugen->lists[gen][type][zone]);
- + return true;
- + }
- +
- + return false;
- +}
- +
- +static bool isolate_page(struct lruvec *lruvec, struct page *page, struct scan_control *sc)
- +{
- + bool success;
- +
- + /* unmapping inhibited */
- + if (!sc->may_unmap && page_mapped(page))
- + return false;
- +
- + /* swapping inhibited */
- + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
- + (PageDirty(page) ||
- + (PageAnon(page) && !PageSwapCache(page))))
- + return false;
- +
- + /* raced with release_pages() */
- + if (!get_page_unless_zero(page))
- + return false;
- +
- + /* raced with another isolation */
- + if (!TestClearPageLRU(page)) {
- + put_page(page);
- + return false;
- + }
- +
- + /* see the comment on MAX_NR_TIERS */
- + if (!PageReferenced(page))
- + set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
- +
- + /* for shrink_page_list() */
- + ClearPageReclaim(page);
- + ClearPageReferenced(page);
- +
- + success = lru_gen_del_page(lruvec, page, true);
- + VM_WARN_ON_ONCE_PAGE(!success, page);
- +
- + return true;
- +}
- +
- +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
- + int type, int tier, struct list_head *list)
- +{
- + int gen, zone;
- + enum vm_event_item item;
- + int sorted = 0;
- + int scanned = 0;
- + int isolated = 0;
- + int remaining = MAX_LRU_BATCH;
- + struct lru_gen_struct *lrugen = &lruvec->lrugen;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- +
- + VM_WARN_ON_ONCE(!list_empty(list));
- +
- + if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
- + return 0;
- +
- + gen = lru_gen_from_seq(lrugen->min_seq[type]);
- +
- + for (zone = sc->reclaim_idx; zone >= 0; zone--) {
- + LIST_HEAD(moved);
- + int skipped = 0;
- + struct list_head *head = &lrugen->lists[gen][type][zone];
- +
- + while (!list_empty(head)) {
- + struct page *page = lru_to_page(head);
- + int delta = thp_nr_pages(page);
- +
- + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
- + VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
- + VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
- + VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
- +
- + scanned += delta;
- +
- + if (sort_page(lruvec, page, tier))
- + sorted += delta;
- + else if (isolate_page(lruvec, page, sc)) {
- + list_add(&page->lru, list);
- + isolated += delta;
- + } else {
- + list_move(&page->lru, &moved);
- + skipped += delta;
- + }
- +
- + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
- + break;
- + }
- +
- + if (skipped) {
- + list_splice(&moved, head);
- + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
- + }
- +
- + if (!remaining || isolated >= MIN_LRU_BATCH)
- + break;
- + }
- +
- + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
- + if (!cgroup_reclaim(sc)) {
- + __count_vm_events(item, isolated);
- + __count_vm_events(PGREFILL, sorted);
- + }
- + __count_memcg_events(memcg, item, isolated);
- + __count_memcg_events(memcg, PGREFILL, sorted);
- + __count_vm_events(PGSCAN_ANON + type, isolated);
- +
- + /*
- + * There might not be eligible pages due to reclaim_idx, may_unmap and
- + * may_writepage. Check the remaining to prevent livelock if it's not
- + * making progress.
- + */
- + return isolated || !remaining ? scanned : 0;
- +}
- +
- +static int get_tier_idx(struct lruvec *lruvec, int type)
- +{
- + int tier;
- + struct ctrl_pos sp, pv;
- +
- + /*
- + * To leave a margin for fluctuations, use a larger gain factor (1:2).
- + * This value is chosen because any other tier would have at least twice
- + * as many refaults as the first tier.
- + */
- + read_ctrl_pos(lruvec, type, 0, 1, &sp);
- + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- + read_ctrl_pos(lruvec, type, tier, 2, &pv);
- + if (!positive_ctrl_err(&sp, &pv))
- + break;
- + }
- +
- + return tier - 1;
- +}
- +
- +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
- +{
- + int type, tier;
- + struct ctrl_pos sp, pv;
- + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
- +
- + /*
- + * Compare the first tier of anon with that of file to determine which
- + * type to scan. Also need to compare other tiers of the selected type
- + * with the first tier of the other type to determine the last tier (of
- + * the selected type) to evict.
- + */
- + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
- + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
- + type = positive_ctrl_err(&sp, &pv);
- +
- + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
- + for (tier = 1; tier < MAX_NR_TIERS; tier++) {
- + read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
- + if (!positive_ctrl_err(&sp, &pv))
- + break;
- + }
- +
- + *tier_idx = tier - 1;
- +
- + return type;
- +}
- +
- +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
- + int *type_scanned, struct list_head *list)
- +{
- + int i;
- + int type;
- + int scanned;
- + int tier = -1;
- + DEFINE_MIN_SEQ(lruvec);
- +
- + /*
- + * Try to make the obvious choice first. When anon and file are both
- + * available from the same generation, interpret swappiness 1 as file
- + * first and 200 as anon first.
- + */
- + if (!swappiness)
- + type = LRU_GEN_FILE;
- + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
- + type = LRU_GEN_ANON;
- + else if (swappiness == 1)
- + type = LRU_GEN_FILE;
- + else if (swappiness == 200)
- + type = LRU_GEN_ANON;
- + else
- + type = get_type_to_scan(lruvec, swappiness, &tier);
- +
- + for (i = !swappiness; i < ANON_AND_FILE; i++) {
- + if (tier < 0)
- + tier = get_tier_idx(lruvec, type);
- +
- + scanned = scan_pages(lruvec, sc, type, tier, list);
- + if (scanned)
- + break;
- +
- + type = !type;
- + tier = -1;
- + }
- +
- + *type_scanned = type;
- +
- + return scanned;
- +}
- +
- +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
- +{
- + int type;
- + int scanned;
- + int reclaimed;
- + LIST_HEAD(list);
- + struct page *page;
- + enum vm_event_item item;
- + struct reclaim_stat stat;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- +
- + spin_lock_irq(&lruvec->lru_lock);
- +
- + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
- +
- + scanned += try_to_inc_min_seq(lruvec, swappiness);
- +
- + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
- + scanned = 0;
- +
- + spin_unlock_irq(&lruvec->lru_lock);
- +
- + if (list_empty(&list))
- + return scanned;
- +
- + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
- +
- + list_for_each_entry(page, &list, lru) {
- + /* restore LRU_REFS_FLAGS cleared by isolate_page() */
- + if (PageWorkingset(page))
- + SetPageReferenced(page);
- +
- + /* don't add rejected pages to the oldest generation */
- + if (PageReclaim(page) &&
- + (PageDirty(page) || PageWriteback(page)))
- + ClearPageActive(page);
- + else
- + SetPageActive(page);
- + }
- +
- + spin_lock_irq(&lruvec->lru_lock);
- +
- + move_pages_to_lru(lruvec, &list);
- +
- + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
- + if (!cgroup_reclaim(sc))
- + __count_vm_events(item, reclaimed);
- + __count_memcg_events(memcg, item, reclaimed);
- + __count_vm_events(PGSTEAL_ANON + type, reclaimed);
- +
- + spin_unlock_irq(&lruvec->lru_lock);
- +
- + mem_cgroup_uncharge_list(&list);
- + free_unref_page_list(&list);
- +
- + sc->nr_reclaimed += reclaimed;
- +
- + return scanned;
- +}
- +
- +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- + bool can_swap)
- +{
- + bool need_aging;
- + unsigned long nr_to_scan;
- + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- + DEFINE_MAX_SEQ(lruvec);
- + DEFINE_MIN_SEQ(lruvec);
- +
- + if (mem_cgroup_below_min(memcg) ||
- + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
- + return 0;
- +
- + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- + if (!need_aging)
- + return nr_to_scan;
- +
- + /* skip the aging path at the default priority */
- + if (sc->priority == DEF_PRIORITY)
- + goto done;
- +
- + /* leave the work to lru_gen_age_node() */
- + if (current_is_kswapd())
- + return 0;
- +
- + inc_max_seq(lruvec, max_seq, can_swap);
- +done:
- + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
- +}
- +
- +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- +{
- + struct blk_plug plug;
- + unsigned long scanned = 0;
- +
- + lru_add_drain();
- +
- + blk_start_plug(&plug);
- +
- + while (true) {
- + int delta;
- + int swappiness;
- + unsigned long nr_to_scan;
- +
- + if (sc->may_swap)
- + swappiness = get_swappiness(lruvec, sc);
- + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
- + swappiness = 1;
- + else
- + swappiness = 0;
- +
- + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- + if (!nr_to_scan)
- + break;
- +
- + delta = evict_pages(lruvec, sc, swappiness);
- + if (!delta)
- + break;
- +
- + scanned += delta;
- + if (scanned >= nr_to_scan)
- + break;
- +
- + cond_resched();
- + }
- +
- + blk_finish_plug(&plug);
- +}
- +
- /******************************************************************************
- * initialization
- ******************************************************************************/
- @@ -2894,6 +3649,16 @@ static int __init init_lru_gen(void)
- };
- late_initcall(init_lru_gen);
-
- +#else /* !CONFIG_LRU_GEN */
- +
- +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
- +{
- +}
- +
- +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- +{
- +}
- +
- #endif /* CONFIG_LRU_GEN */
-
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- @@ -2907,6 +3672,11 @@ static void shrink_lruvec(struct lruvec
- bool proportional_reclaim;
- struct blk_plug plug;
-
- + if (lru_gen_enabled()) {
- + lru_gen_shrink_lruvec(lruvec, sc);
- + return;
- + }
- +
- get_scan_count(lruvec, sc, nr);
-
- /* Record the original scan target for proportional adjustments later */
- @@ -3375,6 +4145,9 @@ static void snapshot_refaults(struct mem
- struct lruvec *target_lruvec;
- unsigned long refaults;
-
- + if (lru_gen_enabled())
- + return;
- +
- target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
- @@ -3739,12 +4512,16 @@ unsigned long try_to_free_mem_cgroup_pag
- }
- #endif
-
- -static void age_active_anon(struct pglist_data *pgdat,
- - struct scan_control *sc)
- +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
- {
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- + if (lru_gen_enabled()) {
- + lru_gen_age_node(pgdat, sc);
- + return;
- + }
- +
- if (!can_age_anon_pages(pgdat, sc))
- return;
-
- @@ -4061,12 +4838,11 @@ restart:
- sc.may_swap = !nr_boost_reclaim;
-
- /*
- - * Do some background aging of the anon list, to give
- - * pages a chance to be referenced before reclaiming. All
- - * pages are rotated regardless of classzone as this is
- - * about consistent aging.
- + * Do some background aging, to give pages a chance to be
- + * referenced before reclaiming. All pages are rotated
- + * regardless of classzone as this is about consistent aging.
- */
- - age_active_anon(pgdat, &sc);
- + kswapd_age_node(pgdat, &sc);
-
- /*
- * If we're getting trouble reclaiming, start doing writepage
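-
- For illustration only, not part of the patch: the aging-decision heuristics
- from should_run_aging() above, replayed on made-up page counts.
-
- #include <stdbool.h>
- #include <stdio.h>
-
- #define MIN_NR_GENS 2UL
-
- static bool should_run_aging(unsigned long min_seq, unsigned long max_seq,
- 			     unsigned long young, unsigned long old,
- 			     unsigned long total)
- {
- 	if (min_seq + MIN_NR_GENS > max_seq)
- 		return true;	/* too few generations; the eviction would stall */
- 	if (min_seq + MIN_NR_GENS < max_seq)
- 		return false;	/* plenty of generations; stay lazy */
-
- 	/* exactly MIN_NR_GENS+1 generations: check the distribution */
- 	if (young * MIN_NR_GENS > total)
- 		return true;	/* too many hot pages in the youngest generation */
- 	if (old * (MIN_NR_GENS + 2) < total)
- 		return true;	/* too few cold pages in the oldest generation */
-
- 	return false;
- }
-
- int main(void)
- {
- 	/* three generations, 60% of the pages still in the youngest one */
- 	printf("age? %d\n", should_run_aging(3, 5, 600, 100, 1000));
- 	return 0;
- }
-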
- --- a/mm/workingset.c
- +++ b/mm/workingset.c
- @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
- static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
- bool workingset)
- {
- - eviction >>= bucket_order;
- eviction &= EVICTION_MASK;
- eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
- eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- @@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow,
-
- *memcgidp = memcgid;
- *pgdat = NODE_DATA(nid);
- - *evictionp = entry << bucket_order;
- + *evictionp = entry;
- *workingsetp = workingset;
- }
-
- +#ifdef CONFIG_LRU_GEN
- +
- +static void *lru_gen_eviction(struct page *page)
- +{
- + int hist;
- + unsigned long token;
- + unsigned long min_seq;
- + struct lruvec *lruvec;
- + struct lru_gen_struct *lrugen;
- + int type = page_is_file_lru(page);
- + int delta = thp_nr_pages(page);
- + int refs = page_lru_refs(page);
- + int tier = lru_tier_from_refs(refs);
- + struct mem_cgroup *memcg = page_memcg(page);
- + struct pglist_data *pgdat = page_pgdat(page);
- +
- + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
- +
- + lruvec = mem_cgroup_lruvec(memcg, pgdat);
- + lrugen = &lruvec->lrugen;
- + min_seq = READ_ONCE(lrugen->min_seq[type]);
- + token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
- +
- + hist = lru_hist_from_seq(min_seq);
- + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
- +
- + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
- +}
- +
- +static void lru_gen_refault(struct page *page, void *shadow)
- +{
- + int hist, tier, refs;
- + int memcg_id;
- + bool workingset;
- + unsigned long token;
- + unsigned long min_seq;
- + struct lruvec *lruvec;
- + struct lru_gen_struct *lrugen;
- + struct mem_cgroup *memcg;
- + struct pglist_data *pgdat;
- + int type = page_is_file_lru(page);
- + int delta = thp_nr_pages(page);
- +
- + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
- +
- + if (pgdat != page_pgdat(page))
- + return;
- +
- + rcu_read_lock();
- +
- + memcg = page_memcg_rcu(page);
- + if (memcg_id != mem_cgroup_id(memcg))
- + goto unlock;
- +
- + lruvec = mem_cgroup_lruvec(memcg, pgdat);
- + lrugen = &lruvec->lrugen;
- +
- + min_seq = READ_ONCE(lrugen->min_seq[type]);
- + if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
- + goto unlock;
- +
- + hist = lru_hist_from_seq(min_seq);
- + /* see the comment in page_lru_refs() */
- + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
- + tier = lru_tier_from_refs(refs);
- +
- + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
- + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
- +
- + /*
- + * Count the following two cases as stalls:
- + * 1. For pages accessed through page tables, hotter pages pushed out
- + * hot pages which refaulted immediately.
- + * 2. For pages accessed multiple times through file descriptors,
- + * numbers of accesses might have been out of the range.
- + */
- + if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
- + SetPageWorkingset(page);
- + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
- + }
- +unlock:
- + rcu_read_unlock();
- +}
- +
- +#else /* !CONFIG_LRU_GEN */
- +
- +static void *lru_gen_eviction(struct page *page)
- +{
- + return NULL;
- +}
- +
- +static void lru_gen_refault(struct page *page, void *shadow)
- +{
- +}
- +
- +#endif /* CONFIG_LRU_GEN */
- +
- /**
- * workingset_age_nonresident - age non-resident entries as LRU ages
- * @lruvec: the lruvec that was aged
- @@ -264,10 +360,14 @@ void *workingset_eviction(struct page *p
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
-
- + if (lru_gen_enabled())
- + return lru_gen_eviction(page);
- +
- lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- /* XXX: target_memcg can be NULL, go through lruvec */
- memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
- eviction = atomic_long_read(&lruvec->nonresident_age);
- + eviction >>= bucket_order;
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
- }
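-
- For illustration only, not part of the patch: how lru_gen_eviction() above
- packs min_seq and the refs count into a shadow token, using the example
- width LRU_REFS_WIDTH=2.
-
- #include <stdio.h>
-
- #define LRU_REFS_WIDTH	2UL	/* example width; see page-flags-layout.h */
-
- /* token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0) */
- static unsigned long pack_token(unsigned long min_seq, int refs)
- {
- 	return (min_seq << LRU_REFS_WIDTH) |
- 	       (unsigned long)(refs > 0 ? refs - 1 : 0);
- }
-
- int main(void)
- {
- 	unsigned long token = pack_token(7, 3);	/* evicted at min_seq=7, refs=3 */
-
- 	/* on refault, the upper bits must still match the current min_seq */
- 	printf("token=%#lx seq=%lu refs-1=%lu\n", token,
- 	       token >> LRU_REFS_WIDTH, token & ((1UL << LRU_REFS_WIDTH) - 1));
- 	return 0;
- }
-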
- @@ -296,7 +396,13 @@ void workingset_refault(struct page *pag
- bool workingset;
- int memcgid;
-
- + if (lru_gen_enabled()) {
- + lru_gen_refault(page, shadow);
- + return;
- + }
- +
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
- + eviction <<= bucket_order;
-
- rcu_read_lock();
- /*