020-v6.1-05-mm-multi-gen-LRU-groundwork.patch 26 KB

From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
From: Yu Zhao <[email protected]>
Date: Sun, 18 Sep 2022 02:00:02 -0600
Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Evictable pages are divided into multiple generations for each lruvec.
The youngest generation number is stored in lrugen->max_seq for both
anon and file types as they are aged on an equal footing. The oldest
generation numbers are stored in lrugen->min_seq[] separately for anon
and file types as clean file pages can be evicted regardless of swap
constraints. These three variables are monotonically increasing.

Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
in order to fit into the gen counter in page->flags. Each truncated
generation number is an index to lrugen->lists[]. The sliding window
technique is used to track at least MIN_NR_GENS and at most
MAX_NR_GENS generations. The gen counter stores a value within [1,
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
stores 0.

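For reference, a minimal standalone C sketch of this encoding (the field
width and bit offset below are chosen arbitrarily for illustration; the
kernel derives them from order_base_2(MAX_NR_GENS+1) and LRU_GEN_PGOFF,
and the real logic lives in lru_gen_from_seq(), page_lru_gen() and
lru_gen_add_page() in the diff below):

  #include <assert.h>

  #define MAX_NR_GENS   4UL
  #define LRU_GEN_PGOFF 0                                   /* placement is arbitrary here */
  #define LRU_GEN_MASK  (((1UL << 3) - 1) << LRU_GEN_PGOFF) /* 3 == order_base_2(4 + 1) */

  /* a truncated generation number is an index into lrugen->lists[] */
  static int gen_from_seq(unsigned long seq)
  {
          return seq % MAX_NR_GENS;
  }

  /* while on a list, the counter holds gen + 1 in [1, MAX_NR_GENS]; 0 means off-list */
  static unsigned long encode_gen(unsigned long flags, int gen)
  {
          return (flags & ~LRU_GEN_MASK) | ((gen + 1UL) << LRU_GEN_PGOFF);
  }

  static int decode_gen(unsigned long flags)
  {
          return (int)((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
  }

  int main(void)
  {
          unsigned long flags = 0;

          assert(decode_gen(flags) == -1);                  /* not on any lrugen list */
          flags = encode_gen(flags, gen_from_seq(5));       /* seq 5 -> gen 1 */
          assert(decode_gen(flags) == 1);
          return 0;
  }
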
There are two conceptually independent procedures: "the aging", which
produces young generations, and "the eviction", which consumes old
generations. They form a closed-loop system, i.e., "the page reclaim".
Both procedures can be invoked from userspace for the purposes of working
set estimation and proactive reclaim. These techniques are commonly used
to optimize job scheduling (bin packing) in data centers [1][2].

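A minimal sketch of that closed loop, seeded the way lru_gen_init_lruvec()
seeds max_seq in this patch; the window bounds come from the
MIN_NR_GENS/MAX_NR_GENS comment in the mmzone.h hunk, while the trigger
policy in main() is invented purely for illustration:

  #include <stdio.h>

  #define MIN_NR_GENS 2UL
  #define MAX_NR_GENS 4UL

  static unsigned long max_seq = MIN_NR_GENS + 1;  /* as seeded by lru_gen_init_lruvec() */
  static unsigned long min_seq;

  static unsigned long nr_gens(void)
  {
          return max_seq - min_seq + 1;
  }

  static void aging(void)          /* produces a younger generation */
  {
          if (nr_gens() < MAX_NR_GENS)
                  max_seq++;
  }

  static void eviction(void)       /* consumes the oldest generation */
  {
          if (nr_gens() > MIN_NR_GENS)
                  min_seq++;
  }

  int main(void)
  {
          for (int i = 0; i < 4; i++) {
                  eviction();
                  aging();
                  printf("min_seq %lu, max_seq %lu, %lu generations\n",
                         min_seq, max_seq, nr_gens());
          }
          return 0;
  }
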
To avoid confusion, the terms "hot" and "cold" will be applied to the
multi-gen LRU, as a new convention; the terms "active" and "inactive" will
be applied to the active/inactive LRU, as usual.

The protection of hot pages and the selection of cold pages are based
on page access channels and patterns. There are two access channels:
one through page tables and the other through file descriptors. The
protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former
   channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB
   flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because
   applications usually do not prepare themselves for major page
   faults like they do for blocked I/O. E.g., GUI applications
   commonly use dedicated I/O threads to avoid blocking rendering
   threads.

There are also two access patterns: one with temporal locality and the
other without. For the reasons listed above, the former channel is
assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is
present; the latter channel is assumed to follow the latter pattern unless
outlying refaults have been observed [3][4].

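This patch records the page-table-channel assumption per fault in
lru_gen_enter_fault() (see the mm/memory.c hunk below); a standalone
sketch of just that predicate, with illustrative VM_* values rather than
the kernel's:

  #include <stdbool.h>
  #include <stdio.h>

  #define VM_SEQ_READ  0x00008000UL  /* set by madvise(MADV_SEQUENTIAL) */
  #define VM_RAND_READ 0x00010000UL  /* set by madvise(MADV_RANDOM) */

  /* the LRU algorithm doesn't apply to sequential or random reads */
  static bool in_lru_fault(unsigned long vm_flags)
  {
          return !(vm_flags & (VM_SEQ_READ | VM_RAND_READ));
  }

  int main(void)
  {
          printf("default mapping: %d\n", in_lru_fault(0));            /* 1 */
          printf("MADV_SEQUENTIAL: %d\n", in_lru_fault(VM_SEQ_READ));  /* 0 */
          return 0;
  }
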
The next patch will address the "outlying refaults". Three macros, i.e.,
LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
this patch to make the entire patchset less diffy.

A page is added to the youngest generation on faulting. The aging needs
to check the accessed bit at least twice before handing this page over to
the eviction. The first check takes care of the accessed bit set on the
initial fault; the second check makes sure this page has not been used
since then. This protocol, AKA second chance, requires a minimum of two
generations, hence MIN_NR_GENS.

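A toy model of the second-chance protocol; the struct and helper below are
hypothetical, only the two-check rule mirrors the patch:

  #include <stdbool.h>
  #include <stdio.h>

  static unsigned long max_seq = 1;  /* the youngest generation */

  struct toy_page {
          bool accessed;             /* stand-in for the accessed bit */
          unsigned long seq;         /* generation the page currently belongs to */
  };

  /* one aging pass: a page found accessed is kept with the youngest generation */
  static void check_accessed(struct toy_page *p)
  {
          if (p->accessed) {
                  p->accessed = false;
                  p->seq = max_seq;
          }
  }

  int main(void)
  {
          /* faulted in: youngest generation, accessed bit set by the fault */
          struct toy_page p = { .accessed = true, .seq = max_seq };

          max_seq++;
          check_accessed(&p);        /* check #1 absorbs the fault-time accessed bit */
          max_seq++;
          check_accessed(&p);        /* check #2: not used since, so the page falls behind */

          /* two checks -> at least two generations must coexist, hence MIN_NR_GENS */
          printf("page seq %lu, max_seq %lu\n", p.seq, max_seq);
          return 0;
  }
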
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
[3] https://lwn.net/Articles/495543/
[4] https://lwn.net/Articles/815342/

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Yu Zhao <[email protected]>
Acked-by: Brian Geffon <[email protected]>
Acked-by: Jan Alexander Steffens (heftig) <[email protected]>
Acked-by: Oleksandr Natalenko <[email protected]>
Acked-by: Steven Barrett <[email protected]>
Acked-by: Suleiman Souhlal <[email protected]>
Tested-by: Daniel Byrne <[email protected]>
Tested-by: Donald Carr <[email protected]>
Tested-by: Holger Hoffstätte <[email protected]>
Tested-by: Konstantin Kharlamov <[email protected]>
Tested-by: Shuang Zhai <[email protected]>
Tested-by: Sofia Trinh <[email protected]>
Tested-by: Vaibhav Jain <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Aneesh Kumar K.V <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Hillf Danton <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Miaohe Lin <[email protected]>
Cc: Michael Larabel <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Will Deacon <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
 fs/fuse/dev.c                     |   3 +-
 include/linux/mm.h                |   2 +
 include/linux/mm_inline.h         | 177 +++++++++++++++++++++++++++++-
 include/linux/mmzone.h            | 100 +++++++++++++++++
 include/linux/page-flags-layout.h |  13 ++-
 include/linux/page-flags.h        |   4 +-
 include/linux/sched.h             |   4 +
 kernel/bounds.c                   |   5 +
 mm/Kconfig                        |   8 ++
 mm/huge_memory.c                  |   3 +-
 mm/memcontrol.c                   |   2 +
 mm/memory.c                       |  25 +++++
 mm/mm_init.c                      |   6 +-
 mm/mmzone.c                       |   2 +
 mm/swap.c                         |  10 +-
 mm/vmscan.c                       |  75 +++++++++++++
 16 files changed, 425 insertions(+), 14 deletions(-)

  116. --- a/fs/fuse/dev.c
  117. +++ b/fs/fuse/dev.c
  118. @@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
  119. 1 << PG_active |
  120. 1 << PG_workingset |
  121. 1 << PG_reclaim |
  122. - 1 << PG_waiters))) {
  123. + 1 << PG_waiters |
  124. + LRU_GEN_MASK | LRU_REFS_MASK))) {
  125. dump_page(page, "fuse: trying to steal weird page");
  126. return 1;
  127. }
  128. --- a/include/linux/mm.h
  129. +++ b/include/linux/mm.h
  130. @@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
  131. #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
  132. #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
  133. #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
  134. +#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
  135. +#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
  136. /*
  137. * Define the bit shifts to access each section. For non-existent
  138. --- a/include/linux/mm_inline.h
  139. +++ b/include/linux/mm_inline.h
  140. @@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc
  141. static __always_inline void __update_lru_size(struct lruvec *lruvec,
  142. enum lru_list lru, enum zone_type zid,
  143. - int nr_pages)
  144. + long nr_pages)
  145. {
  146. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  147. + lockdep_assert_held(&lruvec->lru_lock);
  148. + WARN_ON_ONCE(nr_pages != (int)nr_pages);
  149. +
  150. __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
  151. __mod_zone_page_state(&pgdat->node_zones[zid],
  152. NR_ZONE_LRU_BASE + lru, nr_pages);
  153. @@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
  154. return lru;
  155. }
  156. +#ifdef CONFIG_LRU_GEN
  157. +
  158. +static inline bool lru_gen_enabled(void)
  159. +{
  160. + return true;
  161. +}
  162. +
  163. +static inline bool lru_gen_in_fault(void)
  164. +{
  165. + return current->in_lru_fault;
  166. +}
  167. +
  168. +static inline int lru_gen_from_seq(unsigned long seq)
  169. +{
  170. + return seq % MAX_NR_GENS;
  171. +}
  172. +
  173. +static inline int page_lru_gen(struct page *page)
  174. +{
  175. + unsigned long flags = READ_ONCE(page->flags);
  176. +
  177. + return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
  178. +}
  179. +
  180. +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
  181. +{
  182. + unsigned long max_seq = lruvec->lrugen.max_seq;
  183. +
  184. + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
  185. +
  186. + /* see the comment on MIN_NR_GENS */
  187. + return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
  188. +}
  189. +
  190. +static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
  191. + int old_gen, int new_gen)
  192. +{
  193. + int type = page_is_file_lru(page);
  194. + int zone = page_zonenum(page);
  195. + int delta = thp_nr_pages(page);
  196. + enum lru_list lru = type * LRU_INACTIVE_FILE;
  197. + struct lru_gen_struct *lrugen = &lruvec->lrugen;
  198. +
  199. + VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
  200. + VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
  201. + VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
  202. +
  203. + if (old_gen >= 0)
  204. + WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
  205. + lrugen->nr_pages[old_gen][type][zone] - delta);
  206. + if (new_gen >= 0)
  207. + WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
  208. + lrugen->nr_pages[new_gen][type][zone] + delta);
  209. +
  210. + /* addition */
  211. + if (old_gen < 0) {
  212. + if (lru_gen_is_active(lruvec, new_gen))
  213. + lru += LRU_ACTIVE;
  214. + __update_lru_size(lruvec, lru, zone, delta);
  215. + return;
  216. + }
  217. +
  218. + /* deletion */
  219. + if (new_gen < 0) {
  220. + if (lru_gen_is_active(lruvec, old_gen))
  221. + lru += LRU_ACTIVE;
  222. + __update_lru_size(lruvec, lru, zone, -delta);
  223. + return;
  224. + }
  225. +}
  226. +
  227. +static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
  228. +{
  229. + unsigned long seq;
  230. + unsigned long flags;
  231. + int gen = page_lru_gen(page);
  232. + int type = page_is_file_lru(page);
  233. + int zone = page_zonenum(page);
  234. + struct lru_gen_struct *lrugen = &lruvec->lrugen;
  235. +
  236. + VM_WARN_ON_ONCE_PAGE(gen != -1, page);
  237. +
  238. + if (PageUnevictable(page))
  239. + return false;
  240. + /*
  241. + * There are three common cases for this page:
  242. + * 1. If it's hot, e.g., freshly faulted in or previously hot and
  243. + * migrated, add it to the youngest generation.
  244. + * 2. If it's cold but can't be evicted immediately, i.e., an anon page
  245. + * not in swapcache or a dirty page pending writeback, add it to the
  246. + * second oldest generation.
  247. + * 3. Everything else (clean, cold) is added to the oldest generation.
  248. + */
  249. + if (PageActive(page))
  250. + seq = lrugen->max_seq;
  251. + else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
  252. + (PageReclaim(page) &&
  253. + (PageDirty(page) || PageWriteback(page))))
  254. + seq = lrugen->min_seq[type] + 1;
  255. + else
  256. + seq = lrugen->min_seq[type];
  257. +
  258. + gen = lru_gen_from_seq(seq);
  259. + flags = (gen + 1UL) << LRU_GEN_PGOFF;
  260. + /* see the comment on MIN_NR_GENS about PG_active */
  261. + set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
  262. +
  263. + lru_gen_update_size(lruvec, page, -1, gen);
  264. + /* for rotate_reclaimable_page() */
  265. + if (reclaiming)
  266. + list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
  267. + else
  268. + list_add(&page->lru, &lrugen->lists[gen][type][zone]);
  269. +
  270. + return true;
  271. +}
  272. +
  273. +static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
  274. +{
  275. + unsigned long flags;
  276. + int gen = page_lru_gen(page);
  277. +
  278. + if (gen < 0)
  279. + return false;
  280. +
  281. + VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
  282. + VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
  283. +
  284. + /* for migrate_page_states() */
  285. + flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
  286. + flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
  287. + gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
  288. +
  289. + lru_gen_update_size(lruvec, page, gen, -1);
  290. + list_del(&page->lru);
  291. +
  292. + return true;
  293. +}
  294. +
  295. +#else /* !CONFIG_LRU_GEN */
  296. +
  297. +static inline bool lru_gen_enabled(void)
  298. +{
  299. + return false;
  300. +}
  301. +
  302. +static inline bool lru_gen_in_fault(void)
  303. +{
  304. + return false;
  305. +}
  306. +
  307. +static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
  308. +{
  309. + return false;
  310. +}
  311. +
  312. +static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
  313. +{
  314. + return false;
  315. +}
  316. +
  317. +#endif /* CONFIG_LRU_GEN */
  318. +
  319. static __always_inline void add_page_to_lru_list(struct page *page,
  320. struct lruvec *lruvec)
  321. {
  322. enum lru_list lru = page_lru(page);
  323. + if (lru_gen_add_page(lruvec, page, false))
  324. + return;
  325. +
  326. update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
  327. list_add(&page->lru, &lruvec->lists[lru]);
  328. }
  329. @@ -100,6 +269,9 @@ static __always_inline void add_page_to_
  330. {
  331. enum lru_list lru = page_lru(page);
  332. + if (lru_gen_add_page(lruvec, page, true))
  333. + return;
  334. +
  335. update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
  336. list_add_tail(&page->lru, &lruvec->lists[lru]);
  337. }
  338. @@ -107,6 +279,9 @@ static __always_inline void add_page_to_
  339. static __always_inline void del_page_from_lru_list(struct page *page,
  340. struct lruvec *lruvec)
  341. {
  342. + if (lru_gen_del_page(lruvec, page, false))
  343. + return;
  344. +
  345. list_del(&page->lru);
  346. update_lru_size(lruvec, page_lru(page), page_zonenum(page),
  347. -thp_nr_pages(page));
  348. --- a/include/linux/mmzone.h
  349. +++ b/include/linux/mmzone.h
  350. @@ -294,6 +294,102 @@ enum lruvec_flags {
  351. */
  352. };
  353. +#endif /* !__GENERATING_BOUNDS_H */
  354. +
  355. +/*
  356. + * Evictable pages are divided into multiple generations. The youngest and the
  357. + * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
  358. + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
  359. + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
  360. + * corresponding generation. The gen counter in page->flags stores gen+1 while
  361. + * a page is on one of lrugen->lists[]. Otherwise it stores 0.
  362. + *
  363. + * A page is added to the youngest generation on faulting. The aging needs to
  364. + * check the accessed bit at least twice before handing this page over to the
  365. + * eviction. The first check takes care of the accessed bit set on the initial
  366. + * fault; the second check makes sure this page hasn't been used since then.
  367. + * This process, AKA second chance, requires a minimum of two generations,
  368. + * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
  369. + * LRU, e.g., /proc/vmstat, these two generations are considered active; the
  370. + * rest of generations, if they exist, are considered inactive. See
  371. + * lru_gen_is_active().
  372. + *
  373. + * PG_active is always cleared while a page is on one of lrugen->lists[] so that
  374. + * the aging needs not to worry about it. And it's set again when a page
  375. + * considered active is isolated for non-reclaiming purposes, e.g., migration.
  376. + * See lru_gen_add_page() and lru_gen_del_page().
  377. + *
  378. + * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
  379. + * number of categories of the active/inactive LRU when keeping track of
  380. + * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
  381. + * in page->flags.
  382. + */
  383. +#define MIN_NR_GENS 2U
  384. +#define MAX_NR_GENS 4U
  385. +
  386. +#ifndef __GENERATING_BOUNDS_H
  387. +
  388. +struct lruvec;
  389. +
  390. +#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
  391. +#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
  392. +
  393. +#ifdef CONFIG_LRU_GEN
  394. +
  395. +enum {
  396. + LRU_GEN_ANON,
  397. + LRU_GEN_FILE,
  398. +};
  399. +
  400. +/*
  401. + * The youngest generation number is stored in max_seq for both anon and file
  402. + * types as they are aged on an equal footing. The oldest generation numbers are
  403. + * stored in min_seq[] separately for anon and file types as clean file pages
  404. + * can be evicted regardless of swap constraints.
  405. + *
  406. + * Normally anon and file min_seq are in sync. But if swapping is constrained,
  407. + * e.g., out of swap space, file min_seq is allowed to advance and leave anon
  408. + * min_seq behind.
  409. + *
  410. + * The number of pages in each generation is eventually consistent and therefore
  411. + * can be transiently negative.
  412. + */
  413. +struct lru_gen_struct {
  414. + /* the aging increments the youngest generation number */
  415. + unsigned long max_seq;
  416. + /* the eviction increments the oldest generation numbers */
  417. + unsigned long min_seq[ANON_AND_FILE];
  418. + /* the multi-gen LRU lists, lazily sorted on eviction */
  419. + struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
  420. + /* the multi-gen LRU sizes, eventually consistent */
  421. + long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
  422. +};
  423. +
  424. +void lru_gen_init_lruvec(struct lruvec *lruvec);
  425. +
  426. +#ifdef CONFIG_MEMCG
  427. +void lru_gen_init_memcg(struct mem_cgroup *memcg);
  428. +void lru_gen_exit_memcg(struct mem_cgroup *memcg);
  429. +#endif
  430. +
  431. +#else /* !CONFIG_LRU_GEN */
  432. +
  433. +static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
  434. +{
  435. +}
  436. +
  437. +#ifdef CONFIG_MEMCG
  438. +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
  439. +{
  440. +}
  441. +
  442. +static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
  443. +{
  444. +}
  445. +#endif
  446. +
  447. +#endif /* CONFIG_LRU_GEN */
  448. +
  449. struct lruvec {
  450. struct list_head lists[NR_LRU_LISTS];
  451. /* per lruvec lru_lock for memcg */
  452. @@ -311,6 +407,10 @@ struct lruvec {
  453. unsigned long refaults[ANON_AND_FILE];
  454. /* Various lruvec state flags (enum lruvec_flags) */
  455. unsigned long flags;
  456. +#ifdef CONFIG_LRU_GEN
  457. + /* evictable pages divided into generations */
  458. + struct lru_gen_struct lrugen;
  459. +#endif
  460. #ifdef CONFIG_MEMCG
  461. struct pglist_data *pgdat;
  462. #endif
  463. --- a/include/linux/page-flags-layout.h
  464. +++ b/include/linux/page-flags-layout.h
  465. @@ -55,7 +55,8 @@
  466. #define SECTIONS_WIDTH 0
  467. #endif
  468. -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
  469. +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
  470. + <= BITS_PER_LONG - NR_PAGEFLAGS
  471. #define NODES_WIDTH NODES_SHIFT
  472. #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
  473. #error "Vmemmap: No space for nodes field in page flags"
  474. @@ -89,8 +90,8 @@
  475. #define LAST_CPUPID_SHIFT 0
  476. #endif
  477. -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
  478. - <= BITS_PER_LONG - NR_PAGEFLAGS
  479. +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
  480. + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
  481. #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
  482. #else
  483. #define LAST_CPUPID_WIDTH 0
  484. @@ -100,10 +101,12 @@
  485. #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
  486. #endif
  487. -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
  488. - > BITS_PER_LONG - NR_PAGEFLAGS
  489. +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
  490. + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
  491. #error "Not enough bits in page flags"
  492. #endif
  493. +#define LRU_REFS_WIDTH 0
  494. +
  495. #endif
  496. #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
  497. --- a/include/linux/page-flags.h
  498. +++ b/include/linux/page-flags.h
  499. @@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
  500. 1UL << PG_private | 1UL << PG_private_2 | \
  501. 1UL << PG_writeback | 1UL << PG_reserved | \
  502. 1UL << PG_slab | 1UL << PG_active | \
  503. - 1UL << PG_unevictable | __PG_MLOCKED)
  504. + 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
  505. /*
  506. * Flags checked when a page is prepped for return by the page allocator.
  507. @@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
  508. * alloc-free cycle to prevent from reusing the page.
  509. */
  510. #define PAGE_FLAGS_CHECK_AT_PREP \
  511. - (PAGEFLAGS_MASK & ~__PG_HWPOISON)
  512. + ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
  513. #define PAGE_FLAGS_PRIVATE \
  514. (1UL << PG_private | 1UL << PG_private_2)
  515. --- a/include/linux/sched.h
  516. +++ b/include/linux/sched.h
  517. @@ -911,6 +911,10 @@ struct task_struct {
  518. #ifdef CONFIG_MEMCG
  519. unsigned in_user_fault:1;
  520. #endif
  521. +#ifdef CONFIG_LRU_GEN
  522. + /* whether the LRU algorithm may apply to this access */
  523. + unsigned in_lru_fault:1;
  524. +#endif
  525. #ifdef CONFIG_COMPAT_BRK
  526. unsigned brk_randomized:1;
  527. #endif
  528. --- a/kernel/bounds.c
  529. +++ b/kernel/bounds.c
  530. @@ -22,6 +22,11 @@ int main(void)
  531. DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
  532. #endif
  533. DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
  534. +#ifdef CONFIG_LRU_GEN
  535. + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
  536. +#else
  537. + DEFINE(LRU_GEN_WIDTH, 0);
  538. +#endif
  539. /* End of constants */
  540. return 0;
  541. --- a/mm/Kconfig
  542. +++ b/mm/Kconfig
  543. @@ -897,6 +897,14 @@ config IO_MAPPING
  544. config SECRETMEM
  545. def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
  546. +config LRU_GEN
  547. + bool "Multi-Gen LRU"
  548. + depends on MMU
  549. + # make sure page->flags has enough spare bits
  550. + depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
  551. + help
  552. + A high performance LRU implementation to overcommit memory.
  553. +
  554. source "mm/damon/Kconfig"
  555. endmenu
  556. --- a/mm/huge_memory.c
  557. +++ b/mm/huge_memory.c
  558. @@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
  559. #ifdef CONFIG_64BIT
  560. (1L << PG_arch_2) |
  561. #endif
  562. - (1L << PG_dirty)));
  563. + (1L << PG_dirty) |
  564. + LRU_GEN_MASK | LRU_REFS_MASK));
  565. /* ->mapping in first tail page is compound_mapcount */
  566. VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
  567. --- a/mm/memcontrol.c
  568. +++ b/mm/memcontrol.c
  569. @@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
  570. static void mem_cgroup_free(struct mem_cgroup *memcg)
  571. {
  572. + lru_gen_exit_memcg(memcg);
  573. memcg_wb_domain_exit(memcg);
  574. __mem_cgroup_free(memcg);
  575. }
  576. @@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_all
  577. memcg->deferred_split_queue.split_queue_len = 0;
  578. #endif
  579. idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
  580. + lru_gen_init_memcg(memcg);
  581. return memcg;
  582. fail:
  583. mem_cgroup_id_remove(memcg);
  584. --- a/mm/memory.c
  585. +++ b/mm/memory.c
  586. @@ -4792,6 +4792,27 @@ static inline void mm_account_fault(stru
  587. perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
  588. }
  589. +#ifdef CONFIG_LRU_GEN
  590. +static void lru_gen_enter_fault(struct vm_area_struct *vma)
  591. +{
  592. + /* the LRU algorithm doesn't apply to sequential or random reads */
  593. + current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
  594. +}
  595. +
  596. +static void lru_gen_exit_fault(void)
  597. +{
  598. + current->in_lru_fault = false;
  599. +}
  600. +#else
  601. +static void lru_gen_enter_fault(struct vm_area_struct *vma)
  602. +{
  603. +}
  604. +
  605. +static void lru_gen_exit_fault(void)
  606. +{
  607. +}
  608. +#endif /* CONFIG_LRU_GEN */
  609. +
  610. /*
  611. * By the time we get here, we already hold the mm semaphore
  612. *
  613. @@ -4823,11 +4844,15 @@ vm_fault_t handle_mm_fault(struct vm_are
  614. if (flags & FAULT_FLAG_USER)
  615. mem_cgroup_enter_user_fault();
  616. + lru_gen_enter_fault(vma);
  617. +
  618. if (unlikely(is_vm_hugetlb_page(vma)))
  619. ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
  620. else
  621. ret = __handle_mm_fault(vma, address, flags);
  622. + lru_gen_exit_fault();
  623. +
  624. if (flags & FAULT_FLAG_USER) {
  625. mem_cgroup_exit_user_fault();
  626. /*
  627. --- a/mm/mm_init.c
  628. +++ b/mm/mm_init.c
  629. @@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
  630. shift = 8 * sizeof(unsigned long);
  631. width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
  632. - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
  633. + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
  634. mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
  635. - "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
  636. + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
  637. SECTIONS_WIDTH,
  638. NODES_WIDTH,
  639. ZONES_WIDTH,
  640. LAST_CPUPID_WIDTH,
  641. KASAN_TAG_WIDTH,
  642. + LRU_GEN_WIDTH,
  643. + LRU_REFS_WIDTH,
  644. NR_PAGEFLAGS);
  645. mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
  646. "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
  647. --- a/mm/mmzone.c
  648. +++ b/mm/mmzone.c
  649. @@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
  650. for_each_lru(lru)
  651. INIT_LIST_HEAD(&lruvec->lists[lru]);
  652. +
  653. + lru_gen_init_lruvec(lruvec);
  654. }
  655. #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
  656. --- a/mm/swap.c
  657. +++ b/mm/swap.c
  658. @@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
  659. VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
  660. VM_BUG_ON_PAGE(PageLRU(page), page);
  661. + /* see the comment in lru_gen_add_page() */
  662. + if (lru_gen_enabled() && !PageUnevictable(page) &&
  663. + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
  664. + SetPageActive(page);
  665. +
  666. get_page(page);
  667. local_lock(&lru_pvecs.lock);
  668. pvec = this_cpu_ptr(&lru_pvecs.lru_add);
  669. @@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
  670. static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
  671. {
  672. - if (PageActive(page) && !PageUnevictable(page)) {
  673. + if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
  674. int nr_pages = thp_nr_pages(page);
  675. del_page_from_lru_list(page, lruvec);
  676. @@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
  677. */
  678. void deactivate_page(struct page *page)
  679. {
  680. - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
  681. + if (PageLRU(page) && !PageUnevictable(page) &&
  682. + (PageActive(page) || lru_gen_enabled())) {
  683. struct pagevec *pvec;
  684. local_lock(&lru_pvecs.lock);
  685. --- a/mm/vmscan.c
  686. +++ b/mm/vmscan.c
  687. @@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
  688. return can_demote(pgdat->node_id, sc);
  689. }
  690. +#ifdef CONFIG_LRU_GEN
  691. +
  692. +/******************************************************************************
  693. + * shorthand helpers
  694. + ******************************************************************************/
  695. +
  696. +#define for_each_gen_type_zone(gen, type, zone) \
  697. + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
  698. + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
  699. + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
  700. +
  701. +static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
  702. +{
  703. + struct pglist_data *pgdat = NODE_DATA(nid);
  704. +
  705. +#ifdef CONFIG_MEMCG
  706. + if (memcg) {
  707. + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
  708. +
  709. + /* for hotadd_new_pgdat() */
  710. + if (!lruvec->pgdat)
  711. + lruvec->pgdat = pgdat;
  712. +
  713. + return lruvec;
  714. + }
  715. +#endif
  716. + VM_WARN_ON_ONCE(!mem_cgroup_disabled());
  717. +
  718. + return pgdat ? &pgdat->__lruvec : NULL;
  719. +}
  720. +
  721. +/******************************************************************************
  722. + * initialization
  723. + ******************************************************************************/
  724. +
  725. +void lru_gen_init_lruvec(struct lruvec *lruvec)
  726. +{
  727. + int gen, type, zone;
  728. + struct lru_gen_struct *lrugen = &lruvec->lrugen;
  729. +
  730. + lrugen->max_seq = MIN_NR_GENS + 1;
  731. +
  732. + for_each_gen_type_zone(gen, type, zone)
  733. + INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
  734. +}
  735. +
  736. +#ifdef CONFIG_MEMCG
  737. +void lru_gen_init_memcg(struct mem_cgroup *memcg)
  738. +{
  739. +}
  740. +
  741. +void lru_gen_exit_memcg(struct mem_cgroup *memcg)
  742. +{
  743. + int nid;
  744. +
  745. + for_each_node(nid) {
  746. + struct lruvec *lruvec = get_lruvec(memcg, nid);
  747. +
  748. + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
  749. + sizeof(lruvec->lrugen.nr_pages)));
  750. + }
  751. +}
  752. +#endif
  753. +
  754. +static int __init init_lru_gen(void)
  755. +{
  756. + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
  757. + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
  758. +
  759. + return 0;
  760. +};
  761. +late_initcall(init_lru_gen);
  762. +
  763. +#endif /* CONFIG_LRU_GEN */
  764. +
  765. static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  766. {
  767. unsigned long nr[NR_LRU_LISTS];