020-v6.3-06-BACKPORT-mm-multi-gen-LRU-per-node-lru_gen_folio-lis.patch

From 8ee8571e47aa75221e5fbd4c9c7802fc4244c346 Mon Sep 17 00:00:00 2001
From: Yu Zhao <[email protected]>
Date: Wed, 21 Dec 2022 21:19:04 -0700
Subject: [PATCH 06/19] BACKPORT: mm: multi-gen LRU: per-node lru_gen_folio
 lists

For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder
(mod 2) indexes the old generation, is incremented when all its bins
become empty.

There are four operations:
 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in
    its current generation (old or young) and updates its "seg" to
    "head";
 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in
    its current generation (old or young) and updates its "seg" to
    "tail";
 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in
    the old generation, updates its "gen" to "old" and resets its "seg"
    to "default";
 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in
    the young generation, updates its "gen" to "young" and resets its
    "seg" to "default".

The events that trigger the above operations are:
 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
 2. The first attempt to reclaim a memcg below low, which triggers
    MEMCG_LRU_TAIL;
 3. The first attempt to reclaim a memcg below the reclaimable size
    threshold, which triggers MEMCG_LRU_TAIL;
 4. The second attempt to reclaim a memcg below the reclaimable size
    threshold, which triggers MEMCG_LRU_YOUNG;
 5. Attempting to reclaim a memcg below min, which triggers
    MEMCG_LRU_YOUNG;
 6. Finishing the aging on the eviction path, which triggers
    MEMCG_LRU_YOUNG;
 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.

Note that the memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures eventual
fairness to all eligible memcgs. For memcg reclaim, it still relies on
mem_cgroup_iter().
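
To illustrate the bookkeeping above, here is a minimal user-space
sketch (illustration only, not part of this patch; MODEL_NR_GENS,
struct memcg_model and memcg_model_rotate() are made-up names, and the
in-tree implementation is lru_gen_rotate_memcg() in mm/vmscan.c) of how
each operation updates a memcg's "gen" and "seg":

  /*
   * Illustration only; not part of this patch. The names below are
   * made up; see lru_gen_rotate_memcg() in mm/vmscan.c for the real code.
   */
  #define MODEL_NR_GENS	2	/* mirrors MEMCG_NR_GENS: old and young */

  enum { OP_HEAD = 1, OP_TAIL, OP_OLD, OP_YOUNG };	/* mirrors MEMCG_LRU_* */

  struct memcg_model {
  	unsigned int gen;	/* which generation's bins the memcg sits in */
  	unsigned int seg;	/* 0 (default), OP_HEAD or OP_TAIL */
  };

  /* seq is the per-node counter; seq % MODEL_NR_GENS indexes the old generation */
  static void memcg_model_rotate(struct memcg_model *m, unsigned long seq, int op)
  {
  	switch (op) {
  	case OP_HEAD:		/* e.g. the soft limit is exceeded */
  	case OP_TAIL:		/* e.g. the first attempt below low */
  		m->seg = op;	/* the generation is unchanged */
  		break;
  	case OP_OLD:		/* e.g. the memcg is offlined */
  		m->gen = seq % MODEL_NR_GENS;		/* old generation */
  		m->seg = 0;
  		break;
  	case OP_YOUNG:		/* e.g. the aging finished, or below min */
  		m->gen = (seq + 1) % MODEL_NR_GENS;	/* young generation */
  		m->seg = 0;
  		break;
  	}
  }

MEMCG_LRU_HEAD and MEMCG_LRU_OLD additionally place the memcg at the
head of the chosen bin, while MEMCG_LRU_TAIL and MEMCG_LRU_YOUNG place
it at the tail.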

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Yu Zhao <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Michael Larabel <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Suren Baghdasaryan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Bug: 274865848
(cherry picked from commit e4dde56cd208674ce899b47589f263499e5b8cdc)
[TJ: Resolved conflicts with older function signatures for
 min_cgroup_below_min / min_cgroup_below_low and includes]
Change-Id: Idc8a0f635e035d72dd911f807d1224cb47cbd655
Signed-off-by: T.J. Mercier <[email protected]>
---
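
One way to read the new set_initial_priority() in the mm/vmscan.c hunk
below: it picks a starting priority such that (reclaimable >> priority)
is roughly sc->nr_to_reclaim, where reclaimable is the node's inactive
pages divided by MEMCG_NR_GENS, computed as
fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1) and clamped
to [0, DEF_PRIORITY]. A worked example with assumed numbers (the
numbers are for illustration, not taken from this patch):

  inactive pages  = 2^20 (4 GiB with 4 KiB pages)
  reclaimable     = 2^20 / MEMCG_NR_GENS = 2^19
  nr_to_reclaim   = 1024 = 2^10
  priority        = (fls_long(2^19) - 1) - fls_long(2^10 - 1)
                  = 19 - 10 = 9
  reclaimable >> priority = 2^10 = nr_to_reclaim
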
 include/linux/memcontrol.h |  10 +
 include/linux/mm_inline.h  |  17 ++
 include/linux/mmzone.h     | 117 +++++++++++-
 mm/memcontrol.c            |  16 ++
 mm/page_alloc.c            |   1 +
 mm/vmscan.c                | 374 +++++++++++++++++++++++++++++++++----
 6 files changed, 500 insertions(+), 35 deletions(-)
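
One subtlety worth noting before the diff: lru_gen_init_pgdat()
initializes each memcg_lru.fifo[gen][bin] head with its generation
index as the hlist_nulls "nulls" value, and shrink_many() uses that
value to detect that an RCU walk was moved onto another generation's
list by a concurrent lru_gen_rotate_memcg(). A condensed sketch of the
pattern (same calls as the mm/vmscan.c hunk below, with the reclaim
logic stripped out; pgdat is the node being reclaimed):

  int bin = get_random_u32_below(MEMCG_NR_BINS);
  int gen;
  struct lru_gen_folio *lrugen;
  const struct hlist_nulls_node *pos;

  restart:
  	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));

  	rcu_read_lock();
  	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
  		/* shrink one memcg, possibly dropping and retaking the RCU lock */
  	}
  	rcu_read_unlock();

  	/*
  	 * The walk ended on a list head whose nulls value encodes its
  	 * generation (see lru_gen_init_pgdat()); if that is not the
  	 * generation this walk started from, a concurrent
  	 * lru_gen_rotate_memcg() moved the walk onto another
  	 * generation's list, so start over.
  	 */
  	if (gen != get_nulls_value(pos))
  		goto restart;
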
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -795,6 +795,11 @@ static inline void obj_cgroup_put(struct
percpu_ref_put(&objcg->refcnt);
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1295,6 +1300,11 @@ static inline void obj_cgroup_put(struct
{
}
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -122,6 +122,18 @@ static inline bool lru_gen_in_fault(void
return current->in_lru_fault;
}
+#ifdef CONFIG_MEMCG
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+#else
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+#endif
+
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -302,6 +314,11 @@ static inline bool lru_gen_in_fault(void
return false;
}
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -367,6 +368,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
#ifdef CONFIG_LRU_GEN
enum {
@@ -426,6 +436,14 @@ struct lru_gen_folio {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_folio belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_folio belongs to */
+ u8 seg;
+ /* per-node lru_gen_folio list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};
enum {
@@ -479,12 +497,87 @@ void lru_gen_init_lruvec(struct lruvec *
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_folio per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_folio list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#else /* !CONFIG_LRU_GEN */
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -494,6 +587,7 @@ static inline void lru_gen_look_around(s
}
#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -501,7 +595,24 @@ static inline void lru_gen_init_memcg(st
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+}
+
+#endif /* CONFIG_MEMCG */
#endif /* CONFIG_LRU_GEN */
@@ -1219,6 +1330,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_folio list */
+ struct lru_gen_memcg memcg_lru;
#endif
CACHELINE_PADDING(_pad2_);
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -477,6 +477,16 @@ static void mem_cgroup_update_tree(struc
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
@@ -3524,6 +3534,9 @@ unsigned long mem_cgroup_soft_limit_recl
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;
@@ -5387,6 +5400,7 @@ static int mem_cgroup_css_online(struct
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
@@ -5418,6 +5432,7 @@ static void mem_cgroup_css_offline(struc
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5429,6 +5444,7 @@ static void mem_cgroup_css_released(stru
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(i
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}
static void __init free_area_init_memoryless_node(int nid)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,8 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -134,11 +136,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;
@@ -3160,6 +3157,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -4442,8 +4442,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4516,8 +4515,6 @@ static void lru_gen_age_node(struct pgli
VM_WARN_ON_ONCE(!current_is_kswapd());
- sc->last_reclaimed = sc->nr_reclaimed;
-
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
@@ -5116,8 +5113,7 @@ static bool should_run_aging(struct lruv
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -5134,10 +5130,8 @@ static unsigned long get_nr_to_scan(stru
if (sc->priority == DEF_PRIORITY)
return nr_to_scan;
- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
-
/* skip this lruvec as it's low on cold folios */
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}
static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -5146,29 +5140,18 @@ static unsigned long get_nr_to_reclaim(s
if (!global_reclaim(sc))
return -1;
- /* discount the previous progress for kswapd */
- if (current_is_kswapd())
- return sc->nr_to_reclaim + sc->last_reclaimed;
-
return max(sc->nr_to_reclaim, compact_gap(sc->order));
}
-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- struct blk_plug plug;
+ long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
- lru_add_drain();
-
- blk_start_plug(&plug);
-
- set_mm_walk(lruvec_pgdat(lruvec));
-
while (true) {
int delta;
int swappiness;
- unsigned long nr_to_scan;
if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
@@ -5178,7 +5161,7 @@ static void lru_gen_shrink_lruvec(struct
swappiness = 0;
nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- if (!nr_to_scan)
+ if (nr_to_scan <= 0)
break;
delta = evict_folios(lruvec, sc, swappiness);
@@ -5195,10 +5178,251 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}
+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
+ continue;
+ }
+
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
+
clear_mm_walk();
blk_finish_plug(&plug);
+
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+#endif
/******************************************************************************
* state change
@@ -5656,11 +5880,11 @@ static int run_cmd(char cmd, int memcg_i
if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();
if (!memcg)
@@ -5808,6 +6032,19 @@ void lru_gen_init_lruvec(struct lruvec *
}
#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5831,7 +6068,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+#endif /* CONFIG_MEMCG */
static int __init init_lru_gen(void)
{
@@ -5858,6 +6157,10 @@ static void lru_gen_shrink_lruvec(struct
{
}
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5871,7 +6174,7 @@ static void shrink_lruvec(struct lruvec
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6114,6 +6417,11 @@ static void shrink_node(pg_data_t *pgdat
struct lruvec *target_lruvec;
bool reclaimable = false;
+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again: