020-v6.3-26-mm-multi-gen-LRU-per-node-lru_gen_page-lists.patch 24 KB

From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
From: Yu Zhao <[email protected]>
Date: Wed, 21 Dec 2022 21:19:04 -0700
Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists

For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder (mod
2) indexes the old generation, is incremented when all its bins become
empty.

There are four operations:
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
   its current generation (old or young) and updates its "seg" to
   "head";
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
   its current generation (old or young) and updates its "seg" to
   "tail";
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
   the old generation, updates its "gen" to "old" and resets its "seg"
   to "default";
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
   in the young generation, updates its "gen" to "young" and resets
   its "seg" to "default".

The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim an memcg below low, which triggers
   MEMCG_LRU_TAIL;
3. The first attempt to reclaim an memcg below reclaimable size
   threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim an memcg below reclaimable size
   threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim an memcg below min, which triggers
   MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
   MEMCG_LRU_YOUNG;
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.

Note that memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().
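
As a rough, userspace-only sketch of the indexing described above: MEMCG_NR_GENS,
MEMCG_NR_BINS, get_memcg_gen() and get_memcg_bin() mirror the hunks below, while
the scaffolding around them (memcg_seq, rand(), main()) is illustrative and not
part of the patch.

/*
 * Toy model of the per-node memcg LRU indexing. Illustrative only: it mirrors
 * the "seq mod MEMCG_NR_GENS" and random-bin arithmetic from the hunks below,
 * but uses plain prints instead of RCU hlist_nulls and takes no locks.
 */
#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS	2
#define MEMCG_NR_BINS	8

static unsigned long memcg_seq;	/* stand-in for pgdat->memcg_lru.seq */

static int get_memcg_gen(unsigned long seq)
{
	return seq % MEMCG_NR_GENS;	/* the remainder indexes the old generation */
}

static int get_memcg_bin(int bin)
{
	return bin % MEMCG_NR_BINS;
}

int main(void)
{
	int old = get_memcg_gen(memcg_seq);
	int young = get_memcg_gen(memcg_seq + 1);
	int bin = get_memcg_bin(rand());	/* stand-in for prandom_u32_max() */

	printf("onlining memcg -> tail of fifo[%d][%d] (old gen)\n", old, bin);
	printf("MEMCG_LRU_YOUNG -> tail of fifo[%d][%d] (young gen)\n", young, bin);

	/* when every bin of the old generation drains, the counter advances */
	memcg_seq++;
	printf("old generation is now %d\n", get_memcg_gen(memcg_seq));
	return 0;
}

Running it simply prints which fifo[gen][bin] slot an onlining memcg and a
MEMCG_LRU_YOUNG rotation would land in, and how incrementing seq swaps the
meaning of old and young.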
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Yu Zhao <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Michael Larabel <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
 include/linux/memcontrol.h |  10 +
 include/linux/mm_inline.h  |  17 ++
 include/linux/mmzone.h     | 117 +++++++++++-
 mm/memcontrol.c            |  16 ++
 mm/page_alloc.c            |   1 +
 mm/vmscan.c                | 373 +++++++++++++++++++++++++++++++++----
 6 files changed, 499 insertions(+), 35 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct
percpu_ref_put(&objcg->refcnt);
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
return NULL;
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void
return current->in_lru_fault;
}
+#ifdef CONFIG_MEMCG
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+#else
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+#endif
+
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void
return false;
}
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
#ifdef CONFIG_LRU_GEN
enum {
@@ -416,6 +426,14 @@ struct lru_gen_page {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_page belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_page belongs to */
+ u8 seg;
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};
enum {
@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_page per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#else /* !CONFIG_LRU_GEN */
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(s
}
#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(st
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#endif /* CONFIG_LRU_GEN */
@@ -1105,6 +1216,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_page list */
+ struct lru_gen_memcg memcg_lru;
#endif
ZONE_PADDING(_pad2_)
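
The fifo[][] heads declared above are hlist_nulls lists: lru_gen_init_pgdat()
in the mm/vmscan.c hunks below stores the generation index in each list's nulls
terminator, which is how the lockless walk in shrink_many() detects that it
raced with lru_gen_rotate_memcg() and restarts. A minimal userspace model of
that tagged-terminator idea follows; the encoding and every name in it
(struct node, make_nulls(), ...) are illustrative stand-ins, not the
<linux/list_nulls.h> implementation.

/*
 * Toy model of a "nulls" terminated list: the terminator is not plain NULL
 * but a tagged value encoding the generation that owns the list, so a walker
 * that ends up on the wrong terminator knows it must restart.
 */
#include <stdio.h>
#include <stdint.h>

struct node { struct node *next; int id; };

static struct node *make_nulls(unsigned long gen)
{
	return (struct node *)((gen << 1) | 1UL);	/* bit 0 marks a terminator */
}

static int is_a_nulls(const struct node *p)
{
	return (uintptr_t)p & 1UL;
}

static unsigned long get_nulls_value(const struct node *p)
{
	return (uintptr_t)p >> 1;
}

int main(void)
{
	struct node a = { .id = 0 }, b = { .id = 1 };
	unsigned long gen = 0;

	/* a bin of the old generation: a -> b -> nulls(0) */
	a.next = &b;
	b.next = make_nulls(gen);

	/* walker starts at 'a'; meanwhile 'b' is retagged for the young list */
	b.next = make_nulls(1);

	struct node *pos = &a;
	while (!is_a_nulls(pos->next))
		pos = pos->next;

	if (get_nulls_value(pos->next) != gen)
		printf("raced with a rotation, restart the walk\n");
	return 0;
}

Ending a traversal on a terminator that belongs to a different generation is
the hint, under RCU, that the chain being walked was spliced onto another list
in the meantime.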
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struc
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree_from_page(page);
if (!mctz)
return;
@@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_recl
unsigned long excess;
unsigned long nr_scanned;
+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;
@@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
}
@@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struc
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(stru
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7661,6 +7661,7 @@ static void __init free_area_init_node(i
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}
void __init free_area_init_memoryless_node(int nid)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,8 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -129,11 +131,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;
@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -4169,8 +4169,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pgli
VM_WARN_ON_ONCE(!current_is_kswapd());
- sc->last_reclaimed = sc->nr_reclaimed;
-
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruv
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(stru
if (sc->priority == DEF_PRIORITY)
return nr_to_scan;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
-
/* skip this lruvec as it's low on cold pages */
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(s
if (!global_reclaim(sc))
return -1;
- /* discount the previous progress for kswapd */
- if (current_is_kswapd())
- return sc->nr_to_reclaim + sc->last_reclaimed;
-
return max(sc->nr_to_reclaim, compact_gap(sc->order));
}
-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- struct blk_plug plug;
+ long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
- lru_add_drain();
-
- blk_start_plug(&plug);
-
- set_mm_walk(lruvec_pgdat(lruvec));
-
while (true) {
int delta;
int swappiness;
- unsigned long nr_to_scan;
if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct
swappiness = 0;
nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- if (!nr_to_scan)
+ if (nr_to_scan <= 0)
break;
delta = evict_pages(lruvec, sc, swappiness);
@@ -4912,10 +4895,250 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}
+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_page *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
+ continue;
+ }
+
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
+
clear_mm_walk();
blk_finish_plug(&plug);
+
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+#endif
/******************************************************************************
* state change
@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_i
if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();
if (!memcg)
@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *
}
#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+
+#endif /* CONFIG_MEMCG */
static int __init init_lru_gen(void)
{
@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct
{
}
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat
struct lruvec *target_lruvec;
bool reclaimable = false;
+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
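
For reference, the priority picked by set_initial_priority() above solves
((reclaimable / MEMCG_NR_GENS) >> priority) ~= nr_to_reclaim using fls_long().
A small userspace rerun of that arithmetic, with fls_long() emulated via
__builtin_clzl() and the sample numbers invented:

/*
 * Worked example of the set_initial_priority() arithmetic: pick a priority
 * such that (reclaimable >> priority) is roughly nr_to_reclaim.  DEF_PRIORITY
 * and MEMCG_NR_GENS mirror the kernel values; everything else is a stand-in.
 */
#include <stdio.h>

#define DEF_PRIORITY 12
#define MEMCG_NR_GENS 2

static int fls_long(unsigned long x)
{
	/* position of the most significant set bit, 1-based; 0 for x == 0 */
	return x ? 8 * (int)sizeof(x) - __builtin_clzl(x) : 0;
}

static int clamp(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	unsigned long reclaimable = 1 << 20;	/* sample: ~4 GiB of inactive pages */
	unsigned long nr_to_reclaim = 1 << 9;	/* sample: a 512-page target */
	int priority;

	reclaimable /= MEMCG_NR_GENS;

	/* round down reclaimable and round up nr_to_reclaim */
	priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);
	priority = clamp(priority, 0, DEF_PRIORITY);

	printf("priority %d -> scan window %lu pages per generation\n",
	       priority, reclaimable >> priority);
	return 0;
}

With these sample numbers the result is priority 10, i.e. a scan window of 512
pages per generation, instead of starting at DEF_PRIORITY (a 128-page window
here) and ratcheting down.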