020-v6.1-08-mm-multi-gen-LRU-support-page-table-walks.patch 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687
  1. From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001
  2. From: Yu Zhao <[email protected]>
  3. Date: Sun, 18 Sep 2022 02:00:05 -0600
  4. Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks
  5. MIME-Version: 1.0
  6. Content-Type: text/plain; charset=UTF-8
  7. Content-Transfer-Encoding: 8bit
  8. To further exploit spatial locality, the aging prefers to walk page tables
  9. to search for young PTEs and promote hot pages. A kill switch will be
  10. added in the next patch to disable this behavior. When disabled, the
  11. aging relies on the rmap only.
  12. NB: this behavior has nothing in common with the page table scanning in the
  13. 2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
  14. to swapcache and unmaps them.
  15. To avoid confusion, the term "iteration" specifically means the traversal
  16. of an entire mm_struct list; the term "walk" will be applied to page
  17. tables and the rmap, as usual.
  18. An mm_struct list is maintained for each memcg, and an mm_struct follows
  19. its owner task to the new memcg when this task is migrated. Given an
  20. lruvec, the aging iterates lruvec_memcg()->mm_list and calls
  21. walk_page_range() with each mm_struct on this list to promote hot pages
  22. before it increments max_seq.
  23. When multiple page table walkers iterate the same list, each of them gets
  24. a unique mm_struct; therefore they can run concurrently. Page table
  25. walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
  26. pages it left in the previous memcg will not be promoted when its current
  27. memcg is under reclaim. Similarly, page table walkers will not promote
  28. pages from nodes other than the one under reclaim.
  29. This patch uses the following optimizations when walking page tables:
  30. 1. It tracks the usage of mm_struct's between context switches so that
  31. page table walkers can skip processes that have been sleeping since
  32. the last iteration.
  33. 2. It uses generational Bloom filters to record populated branches so
  34. that page table walkers can reduce their search space based on the
  35. query results, e.g., to skip page tables containing mostly holes or
  36. misplaced pages.
  37. 3. It takes advantage of the accessed bit in non-leaf PMD entries when
  38. CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
  39. 4. It does not zigzag between a PGD table and the same PMD table
  40. spanning multiple VMAs. IOW, it finishes all the VMAs within the
  41. range of the same PMD table before it returns to a PGD table. This
  42. improves the cache performance for workloads that have large
  43. numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
  44. Server benchmark results:
  45. Single workload:
  46. fio (buffered I/O): no change
  47. Single workload:
  48. memcached (anon): +[8, 10]%
  49. Ops/sec KB/sec
  50. patch1-7: 1147696.57 44640.29
  51. patch1-8: 1245274.91 48435.66
  52. Configurations:
  53. no change
  54. Client benchmark results:
  55. kswapd profiles:
  56. patch1-7
  57. 48.16% lzo1x_1_do_compress (real work)
  58. 8.20% page_vma_mapped_walk (overhead)
  59. 7.06% _raw_spin_unlock_irq
  60. 2.92% ptep_clear_flush
  61. 2.53% __zram_bvec_write
  62. 2.11% do_raw_spin_lock
  63. 2.02% memmove
  64. 1.93% lru_gen_look_around
  65. 1.56% free_unref_page_list
  66. 1.40% memset
  67. patch1-8
  68. 49.44% lzo1x_1_do_compress (real work)
  69. 6.19% page_vma_mapped_walk (overhead)
  70. 5.97% _raw_spin_unlock_irq
  71. 3.13% get_pfn_page
  72. 2.85% ptep_clear_flush
  73. 2.42% __zram_bvec_write
  74. 2.08% do_raw_spin_lock
  75. 1.92% memmove
  76. 1.44% alloc_zspage
  77. 1.36% memset
  78. Configurations:
  79. no change
  80. Thanks to the following developers for their efforts [3].
  81. kernel test robot <[email protected]>
  82. [1] https://lwn.net/Articles/23732/
  83. [2] https://llvm.org/docs/ScudoHardenedAllocator.html
  84. [3] https://lore.kernel.org/r/[email protected]/
  85. Link: https://lkml.kernel.org/r/[email protected]
  86. Signed-off-by: Yu Zhao <[email protected]>
  87. Acked-by: Brian Geffon <[email protected]>
  88. Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
  89. Acked-by: Oleksandr Natalenko <[email protected]>
  90. Acked-by: Steven Barrett <[email protected]>
  91. Acked-by: Suleiman Souhlal <[email protected]>
  92. Tested-by: Daniel Byrne <[email protected]>
  93. Tested-by: Donald Carr <[email protected]>
  94. Tested-by: Holger Hoffstätte <[email protected]>
  95. Tested-by: Konstantin Kharlamov <[email protected]>
  96. Tested-by: Shuang Zhai <[email protected]>
  97. Tested-by: Sofia Trinh <[email protected]>
  98. Tested-by: Vaibhav Jain <[email protected]>
  99. Cc: Andi Kleen <[email protected]>
  100. Cc: Aneesh Kumar K.V <[email protected]>
  101. Cc: Barry Song <[email protected]>
  102. Cc: Catalin Marinas <[email protected]>
  103. Cc: Dave Hansen <[email protected]>
  104. Cc: Hillf Danton <[email protected]>
  105. Cc: Jens Axboe <[email protected]>
  106. Cc: Johannes Weiner <[email protected]>
  107. Cc: Jonathan Corbet <[email protected]>
  108. Cc: Linus Torvalds <[email protected]>
  109. Cc: Matthew Wilcox <[email protected]>
  110. Cc: Mel Gorman <[email protected]>
  111. Cc: Miaohe Lin <[email protected]>
  112. Cc: Michael Larabel <[email protected]>
  113. Cc: Michal Hocko <[email protected]>
  114. Cc: Mike Rapoport <[email protected]>
  115. Cc: Mike Rapoport <[email protected]>
  116. Cc: Peter Zijlstra <[email protected]>
  117. Cc: Qi Zheng <[email protected]>
  118. Cc: Tejun Heo <[email protected]>
  119. Cc: Vlastimil Babka <[email protected]>
  120. Cc: Will Deacon <[email protected]>
  121. Signed-off-by: Andrew Morton <[email protected]>
  122. ---
  123. fs/exec.c | 2 +
  124. include/linux/memcontrol.h | 5 +
  125. include/linux/mm_types.h | 76 +++
  126. include/linux/mmzone.h | 56 +-
  127. include/linux/swap.h | 4 +
  128. kernel/exit.c | 1 +
  129. kernel/fork.c | 9 +
  130. kernel/sched/core.c | 1 +
  131. mm/memcontrol.c | 25 +
  132. mm/vmscan.c | 1010 +++++++++++++++++++++++++++++++++++-
  133. 10 files changed, 1172 insertions(+), 17 deletions(-)
  134. --- a/fs/exec.c
  135. +++ b/fs/exec.c
  136. @@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
  137. active_mm = tsk->active_mm;
  138. tsk->active_mm = mm;
  139. tsk->mm = mm;
  140. + lru_gen_add_mm(mm);
  141. /*
  142. * This prevents preemption while active_mm is being loaded and
  143. * it and mm are being updated, which could cause problems for
  144. @@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *m
  145. tsk->mm->vmacache_seqnum = 0;
  146. vmacache_flush(tsk);
  147. task_unlock(tsk);
  148. + lru_gen_use_mm(mm);
  149. if (old_mm) {
  150. mmap_read_unlock(old_mm);
  151. BUG_ON(active_mm != old_mm);
  152. --- a/include/linux/memcontrol.h
  153. +++ b/include/linux/memcontrol.h
  154. @@ -348,6 +348,11 @@ struct mem_cgroup {
  155. struct deferred_split deferred_split_queue;
  156. #endif
  157. +#ifdef CONFIG_LRU_GEN
  158. + /* per-memcg mm_struct list */
  159. + struct lru_gen_mm_list mm_list;
  160. +#endif
  161. +
  162. struct mem_cgroup_per_node *nodeinfo[];
  163. };
  164. --- a/include/linux/mm_types.h
  165. +++ b/include/linux/mm_types.h
  166. @@ -580,6 +580,22 @@ struct mm_struct {
  167. #ifdef CONFIG_IOMMU_SUPPORT
  168. u32 pasid;
  169. #endif
  170. +#ifdef CONFIG_LRU_GEN
  171. + struct {
  172. + /* this mm_struct is on lru_gen_mm_list */
  173. + struct list_head list;
  174. + /*
  175. + * Set when switching to this mm_struct, as a hint of
  176. + * whether it has been used since the last time per-node
  177. + * page table walkers cleared the corresponding bits.
  178. + */
  179. + unsigned long bitmap;
  180. +#ifdef CONFIG_MEMCG
  181. + /* points to the memcg of "owner" above */
  182. + struct mem_cgroup *memcg;
  183. +#endif
  184. + } lru_gen;
  185. +#endif /* CONFIG_LRU_GEN */
  186. } __randomize_layout;
  187. /*
  188. @@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(stru
  189. return (struct cpumask *)&mm->cpu_bitmap;
  190. }
  191. +#ifdef CONFIG_LRU_GEN
  192. +
  193. +struct lru_gen_mm_list {
  194. + /* mm_struct list for page table walkers */
  195. + struct list_head fifo;
  196. + /* protects the list above */
  197. + spinlock_t lock;
  198. +};
  199. +
  200. +void lru_gen_add_mm(struct mm_struct *mm);
  201. +void lru_gen_del_mm(struct mm_struct *mm);
  202. +#ifdef CONFIG_MEMCG
  203. +void lru_gen_migrate_mm(struct mm_struct *mm);
  204. +#endif
  205. +
  206. +static inline void lru_gen_init_mm(struct mm_struct *mm)
  207. +{
  208. + INIT_LIST_HEAD(&mm->lru_gen.list);
  209. + mm->lru_gen.bitmap = 0;
  210. +#ifdef CONFIG_MEMCG
  211. + mm->lru_gen.memcg = NULL;
  212. +#endif
  213. +}
  214. +
  215. +static inline void lru_gen_use_mm(struct mm_struct *mm)
  216. +{
  217. + /*
  218. + * When the bitmap is set, page reclaim knows this mm_struct has been
  219. + * used since the last time it cleared the bitmap. So it might be worth
  220. + * walking the page tables of this mm_struct to clear the accessed bit.
  221. + */
  222. + WRITE_ONCE(mm->lru_gen.bitmap, -1);
  223. +}
  224. +
  225. +#else /* !CONFIG_LRU_GEN */
  226. +
  227. +static inline void lru_gen_add_mm(struct mm_struct *mm)
  228. +{
  229. +}
  230. +
  231. +static inline void lru_gen_del_mm(struct mm_struct *mm)
  232. +{
  233. +}
  234. +
  235. +#ifdef CONFIG_MEMCG
  236. +static inline void lru_gen_migrate_mm(struct mm_struct *mm)
  237. +{
  238. +}
  239. +#endif
  240. +
  241. +static inline void lru_gen_init_mm(struct mm_struct *mm)
  242. +{
  243. +}
  244. +
  245. +static inline void lru_gen_use_mm(struct mm_struct *mm)
  246. +{
  247. +}
  248. +
  249. +#endif /* CONFIG_LRU_GEN */
  250. +
  251. struct mmu_gather;
  252. extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
  253. extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
  254. --- a/include/linux/mmzone.h
  255. +++ b/include/linux/mmzone.h
  256. @@ -385,7 +385,7 @@ enum {
  257. * min_seq behind.
  258. *
  259. * The number of pages in each generation is eventually consistent and therefore
  260. - * can be transiently negative.
  261. + * can be transiently negative when reset_batch_size() is pending.
  262. */
  263. struct lru_gen_struct {
  264. /* the aging increments the youngest generation number */
  265. @@ -407,6 +407,53 @@ struct lru_gen_struct {
  266. atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
  267. };
  268. +enum {
  269. + MM_LEAF_TOTAL, /* total leaf entries */
  270. + MM_LEAF_OLD, /* old leaf entries */
  271. + MM_LEAF_YOUNG, /* young leaf entries */
  272. + MM_NONLEAF_TOTAL, /* total non-leaf entries */
  273. + MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
  274. + MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
  275. + NR_MM_STATS
  276. +};
  277. +
  278. +/* double-buffering Bloom filters */
  279. +#define NR_BLOOM_FILTERS 2
  280. +
  281. +struct lru_gen_mm_state {
  282. + /* set to max_seq after each iteration */
  283. + unsigned long seq;
  284. + /* where the current iteration continues (inclusive) */
  285. + struct list_head *head;
  286. + /* where the last iteration ended (exclusive) */
  287. + struct list_head *tail;
  288. + /* to wait for the last page table walker to finish */
  289. + struct wait_queue_head wait;
  290. + /* Bloom filters flip after each iteration */
  291. + unsigned long *filters[NR_BLOOM_FILTERS];
  292. + /* the mm stats for debugging */
  293. + unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
  294. + /* the number of concurrent page table walkers */
  295. + int nr_walkers;
  296. +};
  297. +
  298. +struct lru_gen_mm_walk {
  299. + /* the lruvec under reclaim */
  300. + struct lruvec *lruvec;
  301. + /* unstable max_seq from lru_gen_struct */
  302. + unsigned long max_seq;
  303. + /* the next address within an mm to scan */
  304. + unsigned long next_addr;
  305. + /* to batch promoted pages */
  306. + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
  307. + /* to batch the mm stats */
  308. + int mm_stats[NR_MM_STATS];
  309. + /* total batched items */
  310. + int batched;
  311. + bool can_swap;
  312. + bool force_scan;
  313. +};
  314. +
  315. void lru_gen_init_lruvec(struct lruvec *lruvec);
  316. void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
  317. @@ -457,6 +504,8 @@ struct lruvec {
  318. #ifdef CONFIG_LRU_GEN
  319. /* evictable pages divided into generations */
  320. struct lru_gen_struct lrugen;
  321. + /* to concurrently iterate lru_gen_mm_list */
  322. + struct lru_gen_mm_state mm_state;
  323. #endif
  324. #ifdef CONFIG_MEMCG
  325. struct pglist_data *pgdat;
  326. @@ -1042,6 +1091,11 @@ typedef struct pglist_data {
  327. unsigned long flags;
  328. +#ifdef CONFIG_LRU_GEN
  329. + /* kswapd mm walk data */
  330. + struct lru_gen_mm_walk mm_walk;
  331. +#endif
  332. +
  333. ZONE_PADDING(_pad2_)
  334. /* Per-node vmstats */
  335. --- a/include/linux/swap.h
  336. +++ b/include/linux/swap.h
  337. @@ -137,6 +137,10 @@ union swap_header {
  338. */
  339. struct reclaim_state {
  340. unsigned long reclaimed_slab;
  341. +#ifdef CONFIG_LRU_GEN
  342. + /* per-thread mm walk data */
  343. + struct lru_gen_mm_walk *mm_walk;
  344. +#endif
  345. };
  346. #ifdef __KERNEL__
  347. --- a/kernel/exit.c
  348. +++ b/kernel/exit.c
  349. @@ -469,6 +469,7 @@ assign_new_owner:
  350. goto retry;
  351. }
  352. WRITE_ONCE(mm->owner, c);
  353. + lru_gen_migrate_mm(mm);
  354. task_unlock(c);
  355. put_task_struct(c);
  356. }
  357. --- a/kernel/fork.c
  358. +++ b/kernel/fork.c
  359. @@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct
  360. goto fail_nocontext;
  361. mm->user_ns = get_user_ns(user_ns);
  362. + lru_gen_init_mm(mm);
  363. return mm;
  364. fail_nocontext:
  365. @@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str
  366. }
  367. if (mm->binfmt)
  368. module_put(mm->binfmt->module);
  369. + lru_gen_del_mm(mm);
  370. mmdrop(mm);
  371. }
  372. @@ -2622,6 +2624,13 @@ pid_t kernel_clone(struct kernel_clone_a
  373. get_task_struct(p);
  374. }
  375. + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
  376. + /* lock the task to synchronize with memcg migration */
  377. + task_lock(p);
  378. + lru_gen_add_mm(p->mm);
  379. + task_unlock(p);
  380. + }
  381. +
  382. wake_up_new_task(p);
  383. /* forking complete and child started to run, tell ptracer */
  384. --- a/kernel/sched/core.c
  385. +++ b/kernel/sched/core.c
  386. @@ -5010,6 +5010,7 @@ context_switch(struct rq *rq, struct tas
  387. * finish_task_switch()'s mmdrop().
  388. */
  389. switch_mm_irqs_off(prev->active_mm, next->mm, next);
  390. + lru_gen_use_mm(next->mm);
  391. if (!prev->mm) { // from kernel
  392. /* will mmdrop() in finish_task_switch(). */
  393. --- a/mm/memcontrol.c
  394. +++ b/mm/memcontrol.c
  395. @@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void)
  396. }
  397. #endif
  398. +#ifdef CONFIG_LRU_GEN
  399. +static void mem_cgroup_attach(struct cgroup_taskset *tset)
  400. +{
  401. + struct task_struct *task;
  402. + struct cgroup_subsys_state *css;
  403. +
  404. + /* find the first leader if there is any */
  405. + cgroup_taskset_for_each_leader(task, css, tset)
  406. + break;
  407. +
  408. + if (!task)
  409. + return;
  410. +
  411. + task_lock(task);
  412. + if (task->mm && READ_ONCE(task->mm->owner) == task)
  413. + lru_gen_migrate_mm(task->mm);
  414. + task_unlock(task);
  415. +}
  416. +#else
  417. +static void mem_cgroup_attach(struct cgroup_taskset *tset)
  418. +{
  419. +}
  420. +#endif /* CONFIG_LRU_GEN */
  421. +
  422. static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
  423. {
  424. if (value == PAGE_COUNTER_MAX)
  425. @@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys
  426. .css_reset = mem_cgroup_css_reset,
  427. .css_rstat_flush = mem_cgroup_css_rstat_flush,
  428. .can_attach = mem_cgroup_can_attach,
  429. + .attach = mem_cgroup_attach,
  430. .cancel_attach = mem_cgroup_cancel_attach,
  431. .post_attach = mem_cgroup_move_task,
  432. .dfl_cftypes = memory_files,
  433. --- a/mm/vmscan.c
  434. +++ b/mm/vmscan.c
  435. @@ -50,6 +50,8 @@
  436. #include <linux/printk.h>
  437. #include <linux/dax.h>
  438. #include <linux/psi.h>
  439. +#include <linux/pagewalk.h>
  440. +#include <linux/shmem_fs.h>
  441. #include <asm/tlbflush.h>
  442. #include <asm/div64.h>
  443. @@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pg
  444. for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
  445. for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
  446. -static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
  447. +static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
  448. {
  449. struct pglist_data *pgdat = NODE_DATA(nid);
  450. @@ -2899,6 +2901,371 @@ static bool __maybe_unused seq_is_valid(
  451. }
  452. /******************************************************************************
  453. + * mm_struct list
  454. + ******************************************************************************/
  455. +
  456. +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
  457. +{
  458. + static struct lru_gen_mm_list mm_list = {
  459. + .fifo = LIST_HEAD_INIT(mm_list.fifo),
  460. + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
  461. + };
  462. +
  463. +#ifdef CONFIG_MEMCG
  464. + if (memcg)
  465. + return &memcg->mm_list;
  466. +#endif
  467. + VM_WARN_ON_ONCE(!mem_cgroup_disabled());
  468. +
  469. + return &mm_list;
  470. +}
  471. +
  472. +void lru_gen_add_mm(struct mm_struct *mm)
  473. +{
  474. + int nid;
  475. + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
  476. + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
  477. +
  478. + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
  479. +#ifdef CONFIG_MEMCG
  480. + VM_WARN_ON_ONCE(mm->lru_gen.memcg);
  481. + mm->lru_gen.memcg = memcg;
  482. +#endif
  483. + spin_lock(&mm_list->lock);
  484. +
  485. + for_each_node_state(nid, N_MEMORY) {
  486. + struct lruvec *lruvec = get_lruvec(memcg, nid);
  487. +
  488. + if (!lruvec)
  489. + continue;
  490. +
  491. + /* the first addition since the last iteration */
  492. + if (lruvec->mm_state.tail == &mm_list->fifo)
  493. + lruvec->mm_state.tail = &mm->lru_gen.list;
  494. + }
  495. +
  496. + list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
  497. +
  498. + spin_unlock(&mm_list->lock);
  499. +}
  500. +
  501. +void lru_gen_del_mm(struct mm_struct *mm)
  502. +{
  503. + int nid;
  504. + struct lru_gen_mm_list *mm_list;
  505. + struct mem_cgroup *memcg = NULL;
  506. +
  507. + if (list_empty(&mm->lru_gen.list))
  508. + return;
  509. +
  510. +#ifdef CONFIG_MEMCG
  511. + memcg = mm->lru_gen.memcg;
  512. +#endif
  513. + mm_list = get_mm_list(memcg);
  514. +
  515. + spin_lock(&mm_list->lock);
  516. +
  517. + for_each_node(nid) {
  518. + struct lruvec *lruvec = get_lruvec(memcg, nid);
  519. +
  520. + if (!lruvec)
  521. + continue;
  522. +
  523. + /* where the last iteration ended (exclusive) */
  524. + if (lruvec->mm_state.tail == &mm->lru_gen.list)
  525. + lruvec->mm_state.tail = lruvec->mm_state.tail->next;
  526. +
  527. + /* where the current iteration continues (inclusive) */
  528. + if (lruvec->mm_state.head != &mm->lru_gen.list)
  529. + continue;
  530. +
  531. + lruvec->mm_state.head = lruvec->mm_state.head->next;
  532. + /* the deletion ends the current iteration */
  533. + if (lruvec->mm_state.head == &mm_list->fifo)
  534. + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
  535. + }
  536. +
  537. + list_del_init(&mm->lru_gen.list);
  538. +
  539. + spin_unlock(&mm_list->lock);
  540. +
  541. +#ifdef CONFIG_MEMCG
  542. + mem_cgroup_put(mm->lru_gen.memcg);
  543. + mm->lru_gen.memcg = NULL;
  544. +#endif
  545. +}
  546. +
  547. +#ifdef CONFIG_MEMCG
  548. +void lru_gen_migrate_mm(struct mm_struct *mm)
  549. +{
  550. + struct mem_cgroup *memcg;
  551. + struct task_struct *task = rcu_dereference_protected(mm->owner, true);
  552. +
  553. + VM_WARN_ON_ONCE(task->mm != mm);
  554. + lockdep_assert_held(&task->alloc_lock);
  555. +
  556. + /* for mm_update_next_owner() */
  557. + if (mem_cgroup_disabled())
  558. + return;
  559. +
  560. + rcu_read_lock();
  561. + memcg = mem_cgroup_from_task(task);
  562. + rcu_read_unlock();
  563. + if (memcg == mm->lru_gen.memcg)
  564. + return;
  565. +
  566. + VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
  567. + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
  568. +
  569. + lru_gen_del_mm(mm);
  570. + lru_gen_add_mm(mm);
  571. +}
  572. +#endif
  573. +
  574. +/*
  575. + * Bloom filters with m=1<<15, k=2 and false positive rates of ~1/5 when
  576. + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
  577. + * bits in a bitmap, k is the number of hash functions and n is the number of
  578. + * inserted items.
  579. + *
  580. + * Page table walkers use one of the two filters to reduce their search space.
  581. + * To get rid of non-leaf entries that no longer have enough leaf entries, the
  582. + * aging uses the double-buffering technique to flip to the other filter each
  583. + * time it produces a new generation. For non-leaf entries that have enough
  584. + * leaf entries, the aging carries them over to the next generation in
  585. + * walk_pmd_range(); the eviction also reports them when walking the rmap
  586. + * in lru_gen_look_around().
  587. + *
  588. + * For future optimizations:
  589. + * 1. It's not necessary to keep both filters all the time. The spare one can be
  590. + * freed after the RCU grace period and reallocated if needed again.
  591. + * 2. And when reallocating, it's worth scaling its size according to the number
  592. + * of inserted entries in the other filter, to reduce the memory overhead on
  593. + * small systems and false positives on large systems.
  594. + * 3. Jenkins' hash function is an alternative to Knuth's.
  595. + */
  596. +#define BLOOM_FILTER_SHIFT 15
  597. +
  598. +static inline int filter_gen_from_seq(unsigned long seq)
  599. +{
  600. + return seq % NR_BLOOM_FILTERS;
  601. +}
  602. +
  603. +static void get_item_key(void *item, int *key)
  604. +{
  605. + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
  606. +
  607. + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
  608. +
  609. + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
  610. + key[1] = hash >> BLOOM_FILTER_SHIFT;
  611. +}
  612. +
  613. +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
  614. +{
  615. + unsigned long *filter;
  616. + int gen = filter_gen_from_seq(seq);
  617. +
  618. + filter = lruvec->mm_state.filters[gen];
  619. + if (filter) {
  620. + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
  621. + return;
  622. + }
  623. +
  624. + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
  625. + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
  626. + WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
  627. +}
  628. +
  629. +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
  630. +{
  631. + int key[2];
  632. + unsigned long *filter;
  633. + int gen = filter_gen_from_seq(seq);
  634. +
  635. + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
  636. + if (!filter)
  637. + return;
  638. +
  639. + get_item_key(item, key);
  640. +
  641. + if (!test_bit(key[0], filter))
  642. + set_bit(key[0], filter);
  643. + if (!test_bit(key[1], filter))
  644. + set_bit(key[1], filter);
  645. +}
  646. +
  647. +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
  648. +{
  649. + int key[2];
  650. + unsigned long *filter;
  651. + int gen = filter_gen_from_seq(seq);
  652. +
  653. + filter = READ_ONCE(lruvec->mm_state.filters[gen]);
  654. + if (!filter)
  655. + return true;
  656. +
  657. + get_item_key(item, key);
  658. +
  659. + return test_bit(key[0], filter) && test_bit(key[1], filter);
  660. +}
  661. +
  662. +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
  663. +{
  664. + int i;
  665. + int hist;
  666. +
  667. + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
  668. +
  669. + if (walk) {
  670. + hist = lru_hist_from_seq(walk->max_seq);
  671. +
  672. + for (i = 0; i < NR_MM_STATS; i++) {
  673. + WRITE_ONCE(lruvec->mm_state.stats[hist][i],
  674. + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
  675. + walk->mm_stats[i] = 0;
  676. + }
  677. + }
  678. +
  679. + if (NR_HIST_GENS > 1 && last) {
  680. + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
  681. +
  682. + for (i = 0; i < NR_MM_STATS; i++)
  683. + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
  684. + }
  685. +}
  686. +
  687. +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
  688. +{
  689. + int type;
  690. + unsigned long size = 0;
  691. + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  692. + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
  693. +
  694. + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
  695. + return true;
  696. +
  697. + clear_bit(key, &mm->lru_gen.bitmap);
  698. +
  699. + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
  700. + size += type ? get_mm_counter(mm, MM_FILEPAGES) :
  701. + get_mm_counter(mm, MM_ANONPAGES) +
  702. + get_mm_counter(mm, MM_SHMEMPAGES);
  703. + }
  704. +
  705. + if (size < MIN_LRU_BATCH)
  706. + return true;
  707. +
  708. + return !mmget_not_zero(mm);
  709. +}
  710. +
  711. +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
  712. + struct mm_struct **iter)
  713. +{
  714. + bool first = false;
  715. + bool last = true;
  716. + struct mm_struct *mm = NULL;
  717. + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  718. + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
  719. + struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
  720. +
  721. + /*
  722. + * There are four interesting cases for this page table walker:
  723. + * 1. It tries to start a new iteration of mm_list with a stale max_seq;
  724. + * there is nothing left to do.
  725. + * 2. It's the first of the current generation, and it needs to reset
  726. + * the Bloom filter for the next generation.
  727. + * 3. It reaches the end of mm_list, and it needs to increment
  728. + * mm_state->seq; the iteration is done.
  729. + * 4. It's the last of the current generation, and it needs to reset the
  730. + * mm stats counters for the next generation.
  731. + */
  732. + spin_lock(&mm_list->lock);
  733. +
  734. + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
  735. + VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
  736. + VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
  737. +
  738. + if (walk->max_seq <= mm_state->seq) {
  739. + if (!*iter)
  740. + last = false;
  741. + goto done;
  742. + }
  743. +
  744. + if (!mm_state->nr_walkers) {
  745. + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
  746. +
  747. + mm_state->head = mm_list->fifo.next;
  748. + first = true;
  749. + }
  750. +
  751. + while (!mm && mm_state->head != &mm_list->fifo) {
  752. + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
  753. +
  754. + mm_state->head = mm_state->head->next;
  755. +
  756. + /* force scan for those added after the last iteration */
  757. + if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
  758. + mm_state->tail = mm_state->head;
  759. + walk->force_scan = true;
  760. + }
  761. +
  762. + if (should_skip_mm(mm, walk))
  763. + mm = NULL;
  764. + }
  765. +
  766. + if (mm_state->head == &mm_list->fifo)
  767. + WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
  768. +done:
  769. + if (*iter && !mm)
  770. + mm_state->nr_walkers--;
  771. + if (!*iter && mm)
  772. + mm_state->nr_walkers++;
  773. +
  774. + if (mm_state->nr_walkers)
  775. + last = false;
  776. +
  777. + if (*iter || last)
  778. + reset_mm_stats(lruvec, walk, last);
  779. +
  780. + spin_unlock(&mm_list->lock);
  781. +
  782. + if (mm && first)
  783. + reset_bloom_filter(lruvec, walk->max_seq + 1);
  784. +
  785. + if (*iter)
  786. + mmput_async(*iter);
  787. +
  788. + *iter = mm;
  789. +
  790. + return last;
  791. +}
  792. +
  793. +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
  794. +{
  795. + bool success = false;
  796. + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  797. + struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
  798. + struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
  799. +
  800. + spin_lock(&mm_list->lock);
  801. +
  802. + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
  803. +
  804. + if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
  805. + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
  806. +
  807. + WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
  808. + reset_mm_stats(lruvec, NULL, true);
  809. + success = true;
  810. + }
  811. +
  812. + spin_unlock(&mm_list->lock);
  813. +
  814. + return success;
  815. +}
  816. +
  817. +/******************************************************************************
  818. * refault feedback loop
  819. ******************************************************************************/
  820. @@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *l
  821. return new_gen;
  822. }
  823. +static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
  824. + int old_gen, int new_gen)
  825. +{
  826. + int type = page_is_file_lru(page);
  827. + int zone = page_zonenum(page);
  828. + int delta = thp_nr_pages(page);
  829. +
  830. + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
  831. + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
  832. +
  833. + walk->batched++;
  834. +
  835. + walk->nr_pages[old_gen][type][zone] -= delta;
  836. + walk->nr_pages[new_gen][type][zone] += delta;
  837. +}
  838. +
  839. +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
  840. +{
  841. + int gen, type, zone;
  842. + struct lru_gen_struct *lrugen = &lruvec->lrugen;
  843. +
  844. + walk->batched = 0;
  845. +
  846. + for_each_gen_type_zone(gen, type, zone) {
  847. + enum lru_list lru = type * LRU_INACTIVE_FILE;
  848. + int delta = walk->nr_pages[gen][type][zone];
  849. +
  850. + if (!delta)
  851. + continue;
  852. +
  853. + walk->nr_pages[gen][type][zone] = 0;
  854. + WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
  855. + lrugen->nr_pages[gen][type][zone] + delta);
  856. +
  857. + if (lru_gen_is_active(lruvec, gen))
  858. + lru += LRU_ACTIVE;
  859. + __update_lru_size(lruvec, lru, zone, delta);
  860. + }
  861. +}
  862. +
  863. +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
  864. +{
  865. + struct address_space *mapping;
  866. + struct vm_area_struct *vma = args->vma;
  867. + struct lru_gen_mm_walk *walk = args->private;
  868. +
  869. + if (!vma_is_accessible(vma))
  870. + return true;
  871. +
  872. + if (is_vm_hugetlb_page(vma))
  873. + return true;
  874. +
  875. + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
  876. + return true;
  877. +
  878. + if (vma == get_gate_vma(vma->vm_mm))
  879. + return true;
  880. +
  881. + if (vma_is_anonymous(vma))
  882. + return !walk->can_swap;
  883. +
  884. + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
  885. + return true;
  886. +
  887. + mapping = vma->vm_file->f_mapping;
  888. + if (mapping_unevictable(mapping))
  889. + return true;
  890. +
  891. + if (shmem_mapping(mapping))
  892. + return !walk->can_swap;
  893. +
  894. + /* to exclude special mappings like dax, etc. */
  895. + return !mapping->a_ops->readpage;
  896. +}
  897. +
  898. +/*
  899. + * Some userspace memory allocators map many single-page VMAs. Instead of
  900. + * returning to the PGD table for each such VMA, finish an entire PMD
  901. + * table to reduce zigzags and improve cache performance.
  902. + */
  903. +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
  904. + unsigned long *vm_start, unsigned long *vm_end)
  905. +{
  906. + unsigned long start = round_up(*vm_end, size);
  907. + unsigned long end = (start | ~mask) + 1;
  908. +
  909. + VM_WARN_ON_ONCE(mask & size);
  910. + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
  911. +
  912. + while (args->vma) {
  913. + if (start >= args->vma->vm_end) {
  914. + args->vma = args->vma->vm_next;
  915. + continue;
  916. + }
  917. +
  918. + if (end && end <= args->vma->vm_start)
  919. + return false;
  920. +
  921. + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
  922. + args->vma = args->vma->vm_next;
  923. + continue;
  924. + }
  925. +
  926. + *vm_start = max(start, args->vma->vm_start);
  927. + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
  928. +
  929. + return true;
  930. + }
  931. +
  932. + return false;
  933. +}
  934. +
  935. static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
  936. {
  937. unsigned long pfn = pte_pfn(pte);
  938. @@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t p
  939. return pfn;
  940. }
  941. +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
  942. +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
  943. +{
  944. + unsigned long pfn = pmd_pfn(pmd);
  945. +
  946. + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
  947. +
  948. + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
  949. + return -1;
  950. +
  951. + if (WARN_ON_ONCE(pmd_devmap(pmd)))
  952. + return -1;
  953. +
  954. + if (WARN_ON_ONCE(!pfn_valid(pfn)))
  955. + return -1;
  956. +
  957. + return pfn;
  958. +}
  959. +#endif
  960. +
  961. static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
  962. - struct pglist_data *pgdat)
  963. + struct pglist_data *pgdat, bool can_swap)
  964. {
  965. struct page *page;
  966. @@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigne
  967. if (page_memcg_rcu(page) != memcg)
  968. return NULL;
  969. + /* file VMAs can contain anon pages from COW */
  970. + if (!page_is_file_lru(page) && !can_swap)
  971. + return NULL;
  972. +
  973. return page;
  974. }
  975. +static bool suitable_to_scan(int total, int young)
  976. +{
  977. + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
  978. +
  979. + /* suitable if the average number of young PTEs per cacheline is >=1 */
  980. + return young * n >= total;
  981. +}
  982. +
  983. +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
  984. + struct mm_walk *args)
  985. +{
  986. + int i;
  987. + pte_t *pte;
  988. + spinlock_t *ptl;
  989. + unsigned long addr;
  990. + int total = 0;
  991. + int young = 0;
  992. + struct lru_gen_mm_walk *walk = args->private;
  993. + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
  994. + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  995. + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
  996. +
  997. + VM_WARN_ON_ONCE(pmd_leaf(*pmd));
  998. +
  999. + ptl = pte_lockptr(args->mm, pmd);
  1000. + if (!spin_trylock(ptl))
  1001. + return false;
  1002. +
  1003. + arch_enter_lazy_mmu_mode();
  1004. +
  1005. + pte = pte_offset_map(pmd, start & PMD_MASK);
  1006. +restart:
  1007. + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
  1008. + unsigned long pfn;
  1009. + struct page *page;
  1010. +
  1011. + total++;
  1012. + walk->mm_stats[MM_LEAF_TOTAL]++;
  1013. +
  1014. + pfn = get_pte_pfn(pte[i], args->vma, addr);
  1015. + if (pfn == -1)
  1016. + continue;
  1017. +
  1018. + if (!pte_young(pte[i])) {
  1019. + walk->mm_stats[MM_LEAF_OLD]++;
  1020. + continue;
  1021. + }
  1022. +
  1023. + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
  1024. + if (!page)
  1025. + continue;
  1026. +
  1027. + if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
  1028. + VM_WARN_ON_ONCE(true);
  1029. +
  1030. + young++;
  1031. + walk->mm_stats[MM_LEAF_YOUNG]++;
  1032. +
  1033. + if (pte_dirty(pte[i]) && !PageDirty(page) &&
  1034. + !(PageAnon(page) && PageSwapBacked(page) &&
  1035. + !PageSwapCache(page)))
  1036. + set_page_dirty(page);
  1037. +
  1038. + old_gen = page_update_gen(page, new_gen);
  1039. + if (old_gen >= 0 && old_gen != new_gen)
  1040. + update_batch_size(walk, page, old_gen, new_gen);
  1041. + }
  1042. +
  1043. + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
  1044. + goto restart;
  1045. +
  1046. + pte_unmap(pte);
  1047. +
  1048. + arch_leave_lazy_mmu_mode();
  1049. + spin_unlock(ptl);
  1050. +
  1051. + return suitable_to_scan(total, young);
  1052. +}
  1053. +
  1054. +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
  1055. +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
  1056. + struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
  1057. +{
  1058. + int i;
  1059. + pmd_t *pmd;
  1060. + spinlock_t *ptl;
  1061. + struct lru_gen_mm_walk *walk = args->private;
  1062. + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
  1063. + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  1064. + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
  1065. +
  1066. + VM_WARN_ON_ONCE(pud_leaf(*pud));
  1067. +
  1068. + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
  1069. + if (*start == -1) {
  1070. + *start = next;
  1071. + return;
  1072. + }
  1073. +
  1074. + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
  1075. + if (i && i <= MIN_LRU_BATCH) {
  1076. + __set_bit(i - 1, bitmap);
  1077. + return;
  1078. + }
  1079. +
  1080. + pmd = pmd_offset(pud, *start);
  1081. +
  1082. + ptl = pmd_lockptr(args->mm, pmd);
  1083. + if (!spin_trylock(ptl))
  1084. + goto done;
  1085. +
  1086. + arch_enter_lazy_mmu_mode();
  1087. +
  1088. + do {
  1089. + unsigned long pfn;
  1090. + struct page *page;
  1091. + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
  1092. +
  1093. + pfn = get_pmd_pfn(pmd[i], vma, addr);
  1094. + if (pfn == -1)
  1095. + goto next;
  1096. +
  1097. + if (!pmd_trans_huge(pmd[i])) {
  1098. + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
  1099. + pmdp_test_and_clear_young(vma, addr, pmd + i);
  1100. + goto next;
  1101. + }
  1102. +
  1103. + page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
  1104. + if (!page)
  1105. + goto next;
  1106. +
  1107. + if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
  1108. + goto next;
  1109. +
  1110. + walk->mm_stats[MM_LEAF_YOUNG]++;
  1111. +
  1112. + if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
  1113. + !(PageAnon(page) && PageSwapBacked(page) &&
  1114. + !PageSwapCache(page)))
  1115. + set_page_dirty(page);
  1116. +
  1117. + old_gen = page_update_gen(page, new_gen);
  1118. + if (old_gen >= 0 && old_gen != new_gen)
  1119. + update_batch_size(walk, page, old_gen, new_gen);
  1120. +next:
  1121. + i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
  1122. + } while (i <= MIN_LRU_BATCH);
  1123. +
  1124. + arch_leave_lazy_mmu_mode();
  1125. + spin_unlock(ptl);
  1126. +done:
  1127. + *start = -1;
  1128. + bitmap_zero(bitmap, MIN_LRU_BATCH);
  1129. +}
  1130. +#else
  1131. +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
  1132. + struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
  1133. +{
  1134. +}
  1135. +#endif
  1136. +
  1137. +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
  1138. + struct mm_walk *args)
  1139. +{
  1140. + int i;
  1141. + pmd_t *pmd;
  1142. + unsigned long next;
  1143. + unsigned long addr;
  1144. + struct vm_area_struct *vma;
  1145. + unsigned long pos = -1;
  1146. + struct lru_gen_mm_walk *walk = args->private;
  1147. + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
  1148. +
  1149. + VM_WARN_ON_ONCE(pud_leaf(*pud));
  1150. +
  1151. + /*
  1152. + * Finish an entire PMD in two passes: the first only reaches to PTE
  1153. + * tables to avoid taking the PMD lock; the second, if necessary, takes
  1154. + * the PMD lock to clear the accessed bit in PMD entries.
  1155. + */
  1156. + pmd = pmd_offset(pud, start & PUD_MASK);
  1157. +restart:
  1158. + /* walk_pte_range() may call get_next_vma() */
  1159. + vma = args->vma;
  1160. + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
  1161. + pmd_t val = pmd_read_atomic(pmd + i);
  1162. +
  1163. + /* for pmd_read_atomic() */
  1164. + barrier();
  1165. +
  1166. + next = pmd_addr_end(addr, end);
  1167. +
  1168. + if (!pmd_present(val) || is_huge_zero_pmd(val)) {
  1169. + walk->mm_stats[MM_LEAF_TOTAL]++;
  1170. + continue;
  1171. + }
  1172. +
  1173. +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1174. + if (pmd_trans_huge(val)) {
  1175. + unsigned long pfn = pmd_pfn(val);
  1176. + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  1177. +
  1178. + walk->mm_stats[MM_LEAF_TOTAL]++;
  1179. +
  1180. + if (!pmd_young(val)) {
  1181. + walk->mm_stats[MM_LEAF_OLD]++;
  1182. + continue;
  1183. + }
  1184. +
  1185. + /* try to avoid unnecessary memory loads */
  1186. + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
  1187. + continue;
  1188. +
  1189. + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
  1190. + continue;
  1191. + }
  1192. +#endif
  1193. + walk->mm_stats[MM_NONLEAF_TOTAL]++;
  1194. +
  1195. +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
  1196. + if (!pmd_young(val))
  1197. + continue;
  1198. +
  1199. + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
  1200. +#endif
  1201. + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
  1202. + continue;
  1203. +
  1204. + walk->mm_stats[MM_NONLEAF_FOUND]++;
  1205. +
  1206. + if (!walk_pte_range(&val, addr, next, args))
  1207. + continue;
  1208. +
  1209. + walk->mm_stats[MM_NONLEAF_ADDED]++;
  1210. +
  1211. + /* carry over to the next generation */
  1212. + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
  1213. + }
  1214. +
  1215. + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
  1216. +
  1217. + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
  1218. + goto restart;
  1219. +}
  1220. +
  1221. +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
  1222. + struct mm_walk *args)
  1223. +{
  1224. + int i;
  1225. + pud_t *pud;
  1226. + unsigned long addr;
  1227. + unsigned long next;
  1228. + struct lru_gen_mm_walk *walk = args->private;
  1229. +
  1230. + VM_WARN_ON_ONCE(p4d_leaf(*p4d));
  1231. +
  1232. + pud = pud_offset(p4d, start & P4D_MASK);
  1233. +restart:
  1234. + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
  1235. + pud_t val = READ_ONCE(pud[i]);
  1236. +
  1237. + next = pud_addr_end(addr, end);
  1238. +
  1239. + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
  1240. + continue;
  1241. +
  1242. + walk_pmd_range(&val, addr, next, args);
  1243. +
  1244. + /* a racy check to curtail the waiting time */
  1245. + if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
  1246. + return 1;
  1247. +
  1248. + if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
  1249. + end = (addr | ~PUD_MASK) + 1;
  1250. + goto done;
  1251. + }
  1252. + }
  1253. +
  1254. + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
  1255. + goto restart;
  1256. +
  1257. + end = round_up(end, P4D_SIZE);
  1258. +done:
  1259. + if (!end || !args->vma)
  1260. + return 1;
  1261. +
  1262. + walk->next_addr = max(end, args->vma->vm_start);
  1263. +
  1264. + return -EAGAIN;
  1265. +}
  1266. +
  1267. +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
  1268. +{
  1269. + static const struct mm_walk_ops mm_walk_ops = {
  1270. + .test_walk = should_skip_vma,
  1271. + .p4d_entry = walk_pud_range,
  1272. + };
  1273. +
  1274. + int err;
  1275. + struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  1276. +
  1277. + walk->next_addr = FIRST_USER_ADDRESS;
  1278. +
  1279. + do {
  1280. + err = -EBUSY;
  1281. +
  1282. + /* page_update_gen() requires stable page_memcg() */
  1283. + if (!mem_cgroup_trylock_pages(memcg))
  1284. + break;
  1285. +
  1286. + /* the caller might be holding the lock for write */
  1287. + if (mmap_read_trylock(mm)) {
  1288. + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
  1289. +
  1290. + mmap_read_unlock(mm);
  1291. + }
  1292. +
  1293. + mem_cgroup_unlock_pages();
  1294. +
  1295. + if (walk->batched) {
  1296. + spin_lock_irq(&lruvec->lru_lock);
  1297. + reset_batch_size(lruvec, walk);
  1298. + spin_unlock_irq(&lruvec->lru_lock);
  1299. + }
  1300. +
  1301. + cond_resched();
  1302. + } while (err == -EAGAIN);
  1303. +}
  1304. +
  1305. +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
  1306. +{
  1307. + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
  1308. +
  1309. + if (pgdat && current_is_kswapd()) {
  1310. + VM_WARN_ON_ONCE(walk);
  1311. +
  1312. + walk = &pgdat->mm_walk;
  1313. + } else if (!pgdat && !walk) {
  1314. + VM_WARN_ON_ONCE(current_is_kswapd());
  1315. +
  1316. + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
  1317. + }
  1318. +
  1319. + current->reclaim_state->mm_walk = walk;
  1320. +
  1321. + return walk;
  1322. +}
  1323. +
  1324. +static void clear_mm_walk(void)
  1325. +{
  1326. + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
  1327. +
  1328. + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
  1329. + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
  1330. +
  1331. + current->reclaim_state->mm_walk = NULL;
  1332. +
  1333. + if (!current_is_kswapd())
  1334. + kfree(walk);
  1335. +}
  1336. +
  1337. static void inc_min_seq(struct lruvec *lruvec, int type)
  1338. {
  1339. struct lru_gen_struct *lrugen = &lruvec->lrugen;
  1340. @@ -3136,7 +4001,7 @@ next:
  1341. return success;
  1342. }
  1343. -static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
  1344. +static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
  1345. {
  1346. int prev, next;
  1347. int type, zone;
  1348. @@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *l
  1349. VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
  1350. - if (max_seq != lrugen->max_seq)
  1351. - goto unlock;
  1352. -
  1353. for (type = ANON_AND_FILE - 1; type >= 0; type--) {
  1354. if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
  1355. continue;
  1356. @@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *l
  1357. /* make sure preceding modifications appear */
  1358. smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
  1359. -unlock:
  1360. +
  1361. spin_unlock_irq(&lruvec->lru_lock);
  1362. }
  1363. +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
  1364. + struct scan_control *sc, bool can_swap)
  1365. +{
  1366. + bool success;
  1367. + struct lru_gen_mm_walk *walk;
  1368. + struct mm_struct *mm = NULL;
  1369. + struct lru_gen_struct *lrugen = &lruvec->lrugen;
  1370. +
  1371. + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
  1372. +
  1373. + /* see the comment in iterate_mm_list() */
  1374. + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
  1375. + success = false;
  1376. + goto done;
  1377. + }
  1378. +
  1379. + /*
  1380. + * If the hardware doesn't automatically set the accessed bit, fall back
  1381. + * to lru_gen_look_around(), which only clears the accessed bit in a
  1382. + * handful of PTEs. Spreading the work out over a period of time usually
  1383. + * is less efficient, but it avoids bursty page faults.
  1384. + */
  1385. + if (!arch_has_hw_pte_young()) {
  1386. + success = iterate_mm_list_nowalk(lruvec, max_seq);
  1387. + goto done;
  1388. + }
  1389. +
  1390. + walk = set_mm_walk(NULL);
  1391. + if (!walk) {
  1392. + success = iterate_mm_list_nowalk(lruvec, max_seq);
  1393. + goto done;
  1394. + }
  1395. +
  1396. + walk->lruvec = lruvec;
  1397. + walk->max_seq = max_seq;
  1398. + walk->can_swap = can_swap;
  1399. + walk->force_scan = false;
  1400. +
  1401. + do {
  1402. + success = iterate_mm_list(lruvec, walk, &mm);
  1403. + if (mm)
  1404. + walk_mm(lruvec, mm, walk);
  1405. +
  1406. + cond_resched();
  1407. + } while (mm);
  1408. +done:
  1409. + if (!success) {
  1410. + if (sc->priority <= DEF_PRIORITY - 2)
  1411. + wait_event_killable(lruvec->mm_state.wait,
  1412. + max_seq < READ_ONCE(lrugen->max_seq));
  1413. +
  1414. + return max_seq < READ_ONCE(lrugen->max_seq);
  1415. + }
  1416. +
  1417. + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
  1418. +
  1419. + inc_max_seq(lruvec, can_swap);
  1420. + /* either this sees any waiters or they will see updated max_seq */
  1421. + if (wq_has_sleeper(&lruvec->mm_state.wait))
  1422. + wake_up_all(&lruvec->mm_state.wait);
  1423. +
  1424. + wakeup_flusher_threads(WB_REASON_VMSCAN);
  1425. +
  1426. + return true;
  1427. +}
  1428. +
  1429. static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
  1430. struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
  1431. {
  1432. @@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lr
  1433. need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
  1434. if (need_aging)
  1435. - inc_max_seq(lruvec, max_seq, swappiness);
  1436. + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
  1437. }
  1438. static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  1439. @@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pgli
  1440. VM_WARN_ON_ONCE(!current_is_kswapd());
  1441. + set_mm_walk(pgdat);
  1442. +
  1443. memcg = mem_cgroup_iter(NULL, NULL, NULL);
  1444. do {
  1445. struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  1446. @@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pgli
  1447. cond_resched();
  1448. } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
  1449. +
  1450. + clear_mm_walk();
  1451. }
  1452. /*
  1453. * This function exploits spatial locality when shrink_page_list() walks the
  1454. - * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
  1455. + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
  1456. + * the scan was done cacheline efficiently, it adds the PMD entry pointing to
  1457. + * the PTE table to the Bloom filter. This forms a feedback loop between the
  1458. + * eviction and the aging.
  1459. */
  1460. void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
  1461. {
  1462. @@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma
  1463. unsigned long start;
  1464. unsigned long end;
  1465. unsigned long addr;
  1466. + struct lru_gen_mm_walk *walk;
  1467. + int young = 0;
  1468. unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
  1469. struct page *page = pvmw->page;
  1470. struct mem_cgroup *memcg = page_memcg(page);
  1471. @@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma
  1472. if (spin_is_contended(pvmw->ptl))
  1473. return;
  1474. + /* avoid taking the LRU lock under the PTL when possible */
  1475. + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
  1476. +
  1477. start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
  1478. end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
  1479. @@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma
  1480. if (!pte_young(pte[i]))
  1481. continue;
  1482. - page = get_pfn_page(pfn, memcg, pgdat);
  1483. + page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
  1484. if (!page)
  1485. continue;
  1486. if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
  1487. VM_WARN_ON_ONCE(true);
  1488. + young++;
  1489. +
  1490. if (pte_dirty(pte[i]) && !PageDirty(page) &&
  1491. !(PageAnon(page) && PageSwapBacked(page) &&
  1492. !PageSwapCache(page)))
  1493. @@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma
  1494. arch_leave_lazy_mmu_mode();
  1495. rcu_read_unlock();
  1496. - if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
  1497. + /* feedback from rmap walkers to page table walkers */
  1498. + if (suitable_to_scan(i, young))
  1499. + update_bloom_filter(lruvec, max_seq, pvmw->pmd);
  1500. +
  1501. + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
  1502. for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
  1503. page = pte_page(pte[i]);
  1504. activate_page(page);
  1505. @@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma
  1506. if (!mem_cgroup_trylock_pages(memcg))
  1507. return;
  1508. - spin_lock_irq(&lruvec->lru_lock);
  1509. - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
  1510. + if (!walk) {
  1511. + spin_lock_irq(&lruvec->lru_lock);
  1512. + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
  1513. + }
  1514. for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
  1515. page = compound_head(pte_page(pte[i]));
  1516. @@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma
  1517. if (old_gen < 0 || old_gen == new_gen)
  1518. continue;
  1519. - lru_gen_update_size(lruvec, page, old_gen, new_gen);
  1520. + if (walk)
  1521. + update_batch_size(walk, page, old_gen, new_gen);
  1522. + else
  1523. + lru_gen_update_size(lruvec, page, old_gen, new_gen);
  1524. }
  1525. - spin_unlock_irq(&lruvec->lru_lock);
  1526. + if (!walk)
  1527. + spin_unlock_irq(&lruvec->lru_lock);
  1528. mem_cgroup_unlock_pages();
  1529. }
  1530. @@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lr
  1531. struct page *page;
  1532. enum vm_event_item item;
  1533. struct reclaim_stat stat;
  1534. + struct lru_gen_mm_walk *walk;
  1535. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  1536. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  1537. @@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lr
  1538. move_pages_to_lru(lruvec, &list);
  1539. + walk = current->reclaim_state->mm_walk;
  1540. + if (walk && walk->batched)
  1541. + reset_batch_size(lruvec, walk);
  1542. +
  1543. item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
  1544. if (!cgroup_reclaim(sc))
  1545. __count_vm_events(item, reclaimed);
  1546. @@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lr
  1547. return scanned;
  1548. }
  1549. +/*
  1550. + * For future optimizations:
  1551. + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
  1552. + * reclaim.
  1553. + */
  1554. static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
  1555. bool can_swap)
  1556. {
  1557. @@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(stru
  1558. if (current_is_kswapd())
  1559. return 0;
  1560. - inc_max_seq(lruvec, max_seq, can_swap);
  1561. + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
  1562. + return nr_to_scan;
  1563. done:
  1564. return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
  1565. }
  1566. @@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct
  1567. blk_start_plug(&plug);
  1568. + set_mm_walk(lruvec_pgdat(lruvec));
  1569. +
  1570. while (true) {
  1571. int delta;
  1572. int swappiness;
  1573. @@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct
  1574. cond_resched();
  1575. }
  1576. + clear_mm_walk();
  1577. +
  1578. blk_finish_plug(&plug);
  1579. }
  1580. @@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *
  1581. for_each_gen_type_zone(gen, type, zone)
  1582. INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
  1583. +
  1584. + lruvec->mm_state.seq = MIN_NR_GENS;
  1585. + init_waitqueue_head(&lruvec->mm_state.wait);
  1586. }
  1587. #ifdef CONFIG_MEMCG
  1588. void lru_gen_init_memcg(struct mem_cgroup *memcg)
  1589. {
  1590. + INIT_LIST_HEAD(&memcg->mm_list.fifo);
  1591. + spin_lock_init(&memcg->mm_list.lock);
  1592. }
  1593. void lru_gen_exit_memcg(struct mem_cgroup *memcg)
  1594. {
  1595. + int i;
  1596. int nid;
  1597. for_each_node(nid) {
  1598. @@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgrou
  1599. VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
  1600. sizeof(lruvec->lrugen.nr_pages)));
  1601. +
  1602. + for (i = 0; i < NR_BLOOM_FILTERS; i++) {
  1603. + bitmap_free(lruvec->mm_state.filters[i]);
  1604. + lruvec->mm_state.filters[i] = NULL;
  1605. + }
  1606. }
  1607. }
  1608. #endif