020-v6.1-04-mm-multigenerational-lru-groundwork.patch

From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
From: Yu Zhao <[email protected]>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 04/10] mm: multigenerational lru: groundwork

For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[] separately for anon and file types as clean file
pages can be evicted regardless of swap constraints. These three
variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index into lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
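
As an illustration, the mapping from a monotonically increasing
sequence number to a window offset is simply seq % MAX_NR_GENS,
mirroring lru_gen_from_seq() added below (a standalone userspace
sketch, not part of this patch, assuming CONFIG_NR_LRU_GENS=4):

	#include <stdio.h>

	#define MAX_NR_GENS 4UL	/* assumes CONFIG_NR_LRU_GENS=4 */

	/* mirrors lru_gen_from_seq(): reduce a seq to a window offset */
	static unsigned long gen_from_seq(unsigned long seq)
	{
		return seq % MAX_NR_GENS;
	}

	int main(void)
	{
		unsigned long min_seq = 5, max_seq = 7;

		/* only seqs in [min_seq, max_seq] are live, so truncated
		   values cannot collide within the window */
		for (unsigned long seq = min_seq; seq <= max_seq; seq++)
			printf("seq %lu -> gen %lu\n", seq, gen_from_seq(seq));
		return 0;
	}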
The framework comprises two conceptually independent components: the
aging, which produces young generations, and the eviction, which
consumes old generations. Both can be invoked independently from user
space for the purpose of working set estimation and proactive reclaim.

The protection of hot pages and the selection of cold pages are based
on page access types and patterns. There are two access types: one via
page tables and the other via file descriptors. The protection of the
former type is by design stronger because:
1) The uncertainty in determining the access patterns of the former
type is higher due to the coalesced nature of the accessed bit.
2) The cost of evicting the former type is higher due to the TLB
flushes required and the likelihood of involving I/O.
3) The penalty of under-protecting the former type is higher because
applications usually do not prepare themselves for major faults as
they do for blocked I/O. For example, client applications commonly
dedicate blocked I/O to separate threads to avoid UI jank that
negatively affects the user experience.

There are also two access patterns: one with temporal locality and the
other without. The latter pattern, e.g., random and sequential, needs
to be explicitly excluded to avoid weakening the protection of the
former pattern. Generally the former type follows the former pattern
unless MADV_SEQUENTIAL is specified, and the latter type follows the
latter pattern unless outlying refaults have been observed.
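
For example, a streaming reader can declare its access pattern via the
standard madvise(2) API, which sets VM_SEQ_READ on the mapping so that
faults in it bypass the stronger protection (a userspace sketch with a
hypothetical input file; error handling omitted):

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("data.bin", O_RDONLY);	/* hypothetical file */
		off_t len = lseek(fd, 0, SEEK_END);
		char *buf = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
		long sum = 0;

		/* sets VM_SEQ_READ on the mapping */
		madvise(buf, len, MADV_SEQUENTIAL);

		for (off_t i = 0; i < len; i++)	/* one sequential pass */
			sum += buf[i];

		munmap(buf, len);
		close(fd);
		return sum != 0;
	}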
Upon faulting, a page is added to the youngest generation, which
provides the strongest protection, as the eviction will not consider
this page before the aging has scanned it at least twice. The first
scan clears the accessed bit set during the initial fault, and the
second scan makes sure this page has not been used since the first
scan. A page from any other generation is brought back to the
youngest generation whenever the aging finds the accessed bit set on
any of the PTEs mapping this page.
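
A toy model of this two-scan rule (a userspace sketch with assumed
names; the real aging is introduced by a later patch in this series):

	#include <stdbool.h>
	#include <stdio.h>

	struct toy_page {
		bool accessed;		/* stands in for the PTE accessed bit */
		unsigned long seq;	/* generation this page belongs to */
	};

	/* one aging pass: a used page moves to the youngest generation
	   and its accessed bit is cleared; an unused page stays put */
	static void age(struct toy_page *page, unsigned long *max_seq)
	{
		(*max_seq)++;
		if (page->accessed) {
			page->seq = *max_seq;
			page->accessed = false;
		}
	}

	int main(void)
	{
		unsigned long max_seq = 3;
		/* faulted in: youngest generation, accessed bit set */
		struct toy_page page = { .accessed = true, .seq = max_seq };

		age(&page, &max_seq);	/* 1st scan clears the fault-time bit */
		age(&page, &max_seq);	/* 2nd scan: unused, so it falls behind */
		printf("page seq %lu vs max_seq %lu\n", page.seq, max_seq);
		return 0;
	}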
Unmapped pages are initially added to the oldest generation and then
conditionally protected by tiers. This is done later [PATCH 07/10].

Signed-off-by: Yu Zhao <[email protected]>
Tested-by: Konstantin Kharlamov <[email protected]>
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
---
 fs/fuse/dev.c                     |   3 +-
 include/linux/cgroup.h            |  15 +-
 include/linux/mm.h                |  36 ++++
 include/linux/mm_inline.h         | 182 ++++++++++++++++++++
 include/linux/mmzone.h            |  70 ++++++++
 include/linux/page-flags-layout.h |  19 ++-
 include/linux/page-flags.h        |   4 +-
 include/linux/sched.h             |   3 +
 kernel/bounds.c                   |   3 +
 kernel/cgroup/cgroup-internal.h   |   1 -
 mm/huge_memory.c                  |   3 +-
 mm/memcontrol.c                   |   1 +
 mm/memory.c                       |   7 +
 mm/mm_init.c                      |   6 +-
 mm/page_alloc.c                   |   1 +
 mm/swap.c                         |   9 +-
 mm/swapfile.c                     |   2 +
 mm/vmscan.c                       | 268 ++++++++++++++++++++++++++++++
 18 files changed, 618 insertions(+), 15 deletions(-)
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
 	       1 << PG_active |
 	       1 << PG_workingset |
 	       1 << PG_reclaim |
-	       1 << PG_waiters))) {
+	       1 << PG_waiters |
+	       LRU_GEN_MASK | LRU_REFS_MASK))) {
 		dump_page(page, "fuse: trying to steal weird page");
 		return 1;
 	}
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
 	css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+	mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
@@ -708,6 +719,8 @@ struct cgroup;
 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 					 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
 #define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
 #define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF		(KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF		(LRU_GEN_PGOFF - LRU_REFS_WIDTH)
 
 /*
  * Define the bit shifts to access each section.  For non-existent
@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
 		loff_t const holebegin, loff_t const holelen, int even_cows) { }
 #endif
 
+#ifdef CONFIG_LRU_GEN
+static inline void task_enter_nonseq_fault(void)
+{
+	WARN_ON(current->in_nonseq_fault);
+
+	current->in_nonseq_fault = 1;
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+	WARN_ON(!current->in_nonseq_fault);
+
+	current->in_nonseq_fault = 0;
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+	return current->in_nonseq_fault;
+}
+#else
+static inline void task_enter_nonseq_fault(void)
+{
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+	return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
 static inline void unmap_shared_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen)
 {
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
 	return lru;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+#ifdef CONFIG_LRU_GEN_ENABLED
+	DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
+
+	return static_branch_likely(&lru_gen_static_key);
+#else
+	DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
+
+	return static_branch_unlikely(&lru_gen_static_key);
+#endif
+}
+
+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+	return seq % MAX_NR_GENS;
+}
+
+/* The youngest and the second youngest generations are counted as active. */
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+	unsigned long max_seq = lruvec->evictable.max_seq;
+
+	VM_BUG_ON(gen >= MAX_NR_GENS);
+
+	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+/* Update the sizes of the multigenerational lru lists. */
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
+				       int old_gen, int new_gen)
+{
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	int delta = thp_nr_pages(page);
+	enum lru_list lru = type * LRU_FILE;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
+	VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
+	VM_BUG_ON(old_gen == -1 && new_gen == -1);
+
+	if (old_gen >= 0)
+		WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
+			   lrugen->sizes[old_gen][type][zone] - delta);
+	if (new_gen >= 0)
+		WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
+			   lrugen->sizes[new_gen][type][zone] + delta);
+
+	if (old_gen < 0) {
+		if (lru_gen_is_active(lruvec, new_gen))
+			lru += LRU_ACTIVE;
+		update_lru_size(lruvec, lru, zone, delta);
+		return;
+	}
+
+	if (new_gen < 0) {
+		if (lru_gen_is_active(lruvec, old_gen))
+			lru += LRU_ACTIVE;
+		update_lru_size(lruvec, lru, zone, -delta);
+		return;
+	}
+
+	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+		update_lru_size(lruvec, lru, zone, -delta);
+		update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+	}
+
+	VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+/* Add a page to one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	int gen;
+	unsigned long old_flags, new_flags;
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	if (PageUnevictable(page) || !lrugen->enabled[type])
+		return false;
+	/*
+	 * If a page shouldn't be considered for eviction, i.e., a page mapped
+	 * upon fault during which the accessed bit is set, add it to the
+	 * youngest generation.
+	 *
+	 * If a page can't be evicted immediately, i.e., an anon page not in
+	 * swap cache or a dirty page pending writeback, add it to the second
+	 * oldest generation.
+	 *
+	 * If a page could be evicted immediately, e.g., a clean page, add it to
+	 * the oldest generation.
+	 */
+	if (PageActive(page))
+		gen = lru_gen_from_seq(lrugen->max_seq);
+	else if ((!type && !PageSwapCache(page)) ||
+		 (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
+		gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
+	else
+		gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+	do {
+		new_flags = old_flags = READ_ONCE(page->flags);
+		VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
+
+		new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
+		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	lru_gen_update_size(page, lruvec, -1, gen);
+	/* for rotate_reclaimable_page() */
+	if (reclaiming)
+		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+	else
+		list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+
+	return true;
+}
+
+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	int gen;
+	unsigned long old_flags, new_flags;
+
+	do {
+		new_flags = old_flags = READ_ONCE(page->flags);
+		if (!(new_flags & LRU_GEN_MASK))
+			return false;
+
+		VM_BUG_ON_PAGE(PageActive(page), page);
+		VM_BUG_ON_PAGE(PageUnevictable(page), page);
+
+		gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+		new_flags &= ~LRU_GEN_MASK;
+		/* for shrink_page_list() */
+		if (reclaiming)
+			new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
+		else if (lru_gen_is_active(lruvec, gen))
+			new_flags |= BIT(PG_active);
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	lru_gen_update_size(page, lruvec, gen, -1);
+	list_del(&page->lru);
+
+	return true;
+}
+
+#else
+
+static inline bool lru_gen_enabled(void)
+{
+	return false;
+}
+
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	return false;
+}
+
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 static __always_inline void add_page_to_lru_list(struct page *page,
 						 struct lruvec *lruvec)
 {
 	enum lru_list lru = page_lru(page);
 
+	if (lru_gen_add_page(page, lruvec, false))
+		return;
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
 {
 	enum lru_list lru = page_lru(page);
 
+	if (lru_gen_add_page(page, lruvec, true))
+		return;
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add_tail(&page->lru, &lruvec->lists[lru]);
 }
@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
 static __always_inline void del_page_from_lru_list(struct page *page,
 						   struct lruvec *lruvec)
 {
+	if (lru_gen_del_page(page, lruvec, false))
+		return;
+
 	list_del(&page->lru);
 	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
 			-thp_nr_pages(page));
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -294,6 +294,72 @@ enum lruvec_flags {
 					 */
 };
 
+struct lruvec;
+
+#define LRU_GEN_MASK	((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK	((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * For each lruvec, evictable pages are divided into multiple generations. The
+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
+ * monotonically increasing. The sliding window technique is used to track at
+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
+ * corresponding generation. The counter in page->flags stores gen+1 while a
+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
+ *
+ * After a page is faulted in, the aging must check the accessed bit at least
+ * twice before the eviction would consider it. The first check clears the
+ * accessed bit set during the initial fault. The second check makes sure this
+ * page hasn't been used since then.
+ */
+#define MIN_NR_GENS	2
+#define MAX_NR_GENS	((unsigned int)CONFIG_NR_LRU_GENS)
+
+struct lrugen {
+	/* the aging increments the max generation number */
+	unsigned long max_seq;
+	/* the eviction increments the min generation numbers */
+	unsigned long min_seq[ANON_AND_FILE];
+	/* the birth time of each generation in jiffies */
+	unsigned long timestamps[MAX_NR_GENS];
+	/* the multigenerational lru lists */
+	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	/* the sizes of the multigenerational lru lists in pages */
+	unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	/* whether the multigenerational lru is enabled */
+	bool enabled[ANON_AND_FILE];
+};
+
+#define MAX_BATCH_SIZE	8192
+
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
+void lru_gen_change_state(bool enable, bool main, bool swap);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_change_state(bool enable, bool main, bool swap)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
 struct lruvec {
 	struct list_head		lists[NR_LRU_LISTS];
 	/* per lruvec lru_lock for memcg */
@@ -311,6 +377,10 @@ struct lruvec {
 	unsigned long			refaults[ANON_AND_FILE];
 	/* Various lruvec state flags (enum lruvec_flags) */
 	unsigned long			flags;
+#ifdef CONFIG_LRU_GEN
+	/* unevictable pages are on LRU_UNEVICTABLE */
+	struct lrugen			evictable;
+#endif
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -26,6 +26,14 @@
 
 #define ZONES_WIDTH		ZONES_SHIFT
 
+#ifdef CONFIG_LRU_GEN
+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
+#define LRU_REFS_WIDTH	(CONFIG_TIERS_PER_GEN - 2)
+#else
+#define LRU_GEN_WIDTH	0
+#define LRU_REFS_WIDTH	0
+#endif /* CONFIG_LRU_GEN */
+
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
 #define SECTIONS_SHIFT	(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
@@ -55,7 +63,8 @@
 #define SECTIONS_WIDTH		0
 #endif
 
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+	<= BITS_PER_LONG - NR_PAGEFLAGS
 #define NODES_WIDTH		NODES_SHIFT
 #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
 #error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +98,8 @@
 #define LAST_CPUPID_SHIFT 0
 #endif
 
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
-	<= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
 #define LAST_CPUPID_WIDTH 0
@@ -100,8 +109,8 @@
 #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #endif
 
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
-	> BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
 #error "Not enough bits in page flags"
 #endif
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
 	 1UL << PG_private	| 1UL << PG_private_2	|	\
 	 1UL << PG_writeback	| 1UL << PG_reserved	|	\
 	 1UL << PG_slab		| 1UL << PG_active	|	\
-	 1UL << PG_unevictable	| __PG_MLOCKED)
+	 1UL << PG_unevictable	| __PG_MLOCKED | LRU_GEN_MASK)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
  * alloc-free cycle to prevent from reusing the page.
  */
 #define PAGE_FLAGS_CHECK_AT_PREP	\
-	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
+	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
 
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,9 @@ struct task_struct {
 #ifdef CONFIG_MEMCG
 	unsigned			in_user_fault:1;
 #endif
+#ifdef CONFIG_LRU_GEN
+	unsigned			in_nonseq_fault:1;
+#endif
 #ifdef CONFIG_COMPAT_BRK
 	unsigned			brk_randomized:1;
 #endif
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,9 @@ int main(void)
 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+	DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
+#endif
 	/* End of constants */
 
 	return 0;
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)					\
 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
 #ifdef CONFIG_64BIT
 			 (1L << PG_arch_2) |
 #endif
-			 (1L << PG_dirty)));
+			 (1L << PG_dirty) |
+			 LRU_GEN_MASK | LRU_REFS_MASK));
 
 	/* ->mapping in first tail page is compound_mapcount */
 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5241,6 +5241,7 @@ static struct mem_cgroup *mem_cgroup_all
 	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+	lru_gen_init_memcg(memcg);
 	return memcg;
 fail:
 	mem_cgroup_id_remove(memcg);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
 			   unsigned int flags, struct pt_regs *regs)
 {
 	vm_fault_t ret;
+	bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
 
 	__set_current_state(TASK_RUNNING);
 
@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
 	if (flags & FAULT_FLAG_USER)
 		mem_cgroup_enter_user_fault();
 
+	if (nonseq_fault)
+		task_enter_nonseq_fault();
+
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
 	else
 		ret = __handle_mm_fault(vma, address, flags);
 
+	if (nonseq_fault)
+		task_exit_nonseq_fault();
+
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
 		/*
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
 
 	shift = 8 * sizeof(unsigned long);
 	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
-		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
 		LAST_CPUPID_WIDTH,
 		KASAN_TAG_WIDTH,
+		LRU_GEN_WIDTH,
+		LRU_REFS_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
 		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7459,6 +7459,7 @@ static void __meminit pgdat_init_interna
 	pgdat_page_ext_init(pgdat);
 	lruvec_init(&pgdat->__lruvec);
+	lru_gen_init_state(NULL, &pgdat->__lruvec);
 }
 
 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
+	/* see the comment in lru_gen_add_page() */
+	if (lru_gen_enabled() && !PageUnevictable(page) &&
+	    task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
+		SetPageActive(page);
+
 	get_page(page);
 	local_lock(&lru_pvecs.lock);
 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
 
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
-	if (PageActive(page) && !PageUnevictable(page)) {
+	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
 		int nr_pages = thp_nr_pages(page);
 
 		del_page_from_lru_list(page, lruvec);
@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
  */
 void deactivate_page(struct page *page)
 {
-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+	if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
 		struct pagevec *pvec;
 
 		local_lock(&lru_pvecs.lock);
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2689,6 +2689,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
 	err = 0;
 	atomic_inc(&proc_poll_event);
 	wake_up_interruptible(&proc_poll_wait);
+	lru_gen_change_state(false, false, true);
 
 out_dput:
 	filp_close(victim, NULL);
@@ -3350,6 +3351,7 @@ SYSCALL_DEFINE2(swapon, const char __use
 	mutex_unlock(&swapon_mutex);
 	atomic_inc(&proc_poll_event);
 	wake_up_interruptible(&proc_poll_wait);
+	lru_gen_change_state(true, false, true);
 
 	error = 0;
 	goto out;
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 #include <linux/printk.h>
 #include <linux/dax.h>
 #include <linux/psi.h>
+#include <linux/memory.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2815,6 +2816,273 @@ static bool can_age_anon_pages(struct pg
 	return can_demote(pgdat->node_id, sc);
 }
 
+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ *                          shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone)				\
+	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
+		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
+			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static int page_lru_gen(struct page *page)
+{
+	unsigned long flags = READ_ONCE(page->flags);
+
+	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+	if (memcg) {
+		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+		if (lruvec->pgdat != pgdat)
+			lruvec->pgdat = pgdat;
+
+		return lruvec;
+	}
+#endif
+	return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+	return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+	return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
+	       get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
+	       get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ *                          state change
+ ******************************************************************************/
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
+#else
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
+#endif
+
+static int lru_gen_nr_swapfiles;
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	enum lru_list lru;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	for_each_evictable_lru(lru) {
+		type = is_file_lru(lru);
+
+		if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
+			return false;
+	}
+
+	for_each_gen_type_zone(gen, type, zone) {
+		if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
+			return false;
+
+		/* unlikely but not a bug when reset_batch_size() is pending */
+		VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
+	}
+
+	return true;
+}
+
+static bool fill_lists(struct lruvec *lruvec)
+{
+	enum lru_list lru;
+	int remaining = MAX_BATCH_SIZE;
+
+	for_each_evictable_lru(lru) {
+		int type = is_file_lru(lru);
+		bool active = is_active_lru(lru);
+		struct list_head *head = &lruvec->lists[lru];
+
+		if (!lruvec->evictable.enabled[type])
+			continue;
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page) != active, page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+			VM_BUG_ON_PAGE(page_lru_gen(page) != -1, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			del_page_from_lru_list(page, lruvec);
+			success = lru_gen_add_page(page, lruvec, false);
+			VM_BUG_ON(!success);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static bool drain_lists(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	int remaining = MAX_BATCH_SIZE;
+
+	for_each_gen_type_zone(gen, type, zone) {
+		struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
+
+		if (lruvec->evictable.enabled[type])
+			continue;
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			success = lru_gen_del_page(page, lruvec, false);
+			VM_BUG_ON(!success);
+			add_page_to_lru_list(page, lruvec);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * For file page tracking, we enable/disable it according to the main switch.
+ * For anon page tracking, we only enable it when the main switch is on and
+ * there is at least one swapfile; we disable it when there are no swapfiles
+ * regardless of the value of the main switch. Otherwise, we will eventually
+ * reach the max size of the sliding window and have to call inc_min_seq().
+ */
+void lru_gen_change_state(bool enable, bool main, bool swap)
+{
+	static DEFINE_MUTEX(state_mutex);
+
+	struct mem_cgroup *memcg;
+
+	mem_hotplug_begin();
+	cgroup_lock();
+	mutex_lock(&state_mutex);
+
+	if (swap) {
+		if (enable)
+			swap = !lru_gen_nr_swapfiles++;
+		else
+			swap = !--lru_gen_nr_swapfiles;
+	}
+
+	if (main && enable != lru_gen_enabled()) {
+		if (enable)
+			static_branch_enable(&lru_gen_static_key);
+		else
+			static_branch_disable(&lru_gen_static_key);
+	} else if (!swap || !lru_gen_enabled())
+		goto unlock;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		for_each_node(nid) {
+			struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+			if (!lruvec)
+				continue;
+
+			spin_lock_irq(&lruvec->lru_lock);
+
+			VM_BUG_ON(!seq_is_valid(lruvec));
+			VM_BUG_ON(!state_is_valid(lruvec));
+
+			lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+			lruvec->evictable.enabled[1] = lru_gen_enabled();
+
+			while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
+				spin_unlock_irq(&lruvec->lru_lock);
+				cond_resched();
+				spin_lock_irq(&lruvec->lru_lock);
+			}
+
+			spin_unlock_irq(&lruvec->lru_lock);
+		}
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+	mutex_unlock(&state_mutex);
+	cgroup_unlock();
+	mem_hotplug_done();
+}
+
+/******************************************************************************
+ *                          initialization
+ ******************************************************************************/
+
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
+{
+	int i;
+	int gen, type, zone;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	lrugen->max_seq = MIN_NR_GENS + 1;
+	lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+	lrugen->enabled[1] = lru_gen_enabled();
+
+	for (i = 0; i <= MIN_NR_GENS + 1; i++)
+		lrugen->timestamps[i] = jiffies;
+
+	for_each_gen_type_zone(gen, type, zone)
+		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+		lru_gen_init_state(memcg, lruvec);
+	}
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+	return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];