020-v6.3-15-UPSTREAM-mm-multi-gen-LRU-section-for-memcg-LRU.patch

From 48c916b812652f9453be5bd45a703728926d41ca Mon Sep 17 00:00:00 2001
From: "T.J. Alumbaugh" <[email protected]>
Date: Wed, 18 Jan 2023 00:18:24 +0000
Subject: [PATCH 15/19] UPSTREAM: mm: multi-gen LRU: section for memcg LRU

Move memcg LRU code into a dedicated section. Improve the design doc to
outline its architecture.

Link: https://lkml.kernel.org/r/[email protected]
Change-Id: Id252e420cff7a858acb098cf2b3642da5c40f602
Signed-off-by: T.J. Alumbaugh <[email protected]>
Cc: Yu Zhao <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
(cherry picked from commit 36c7b4db7c942ae9e1b111f0c6b468c8b2e33842)
Bug: 274865848
Signed-off-by: T.J. Mercier <[email protected]>
---
 Documentation/mm/multigen_lru.rst |  33 +++-
 include/linux/mm_inline.h         |  17 --
 include/linux/mmzone.h            |  13 +-
 mm/memcontrol.c                   |   8 +-
 mm/vmscan.c                       | 250 +++++++++++++++++-------------
 5 files changed, 178 insertions(+), 143 deletions(-)

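As a reading aid, the net interface change made by the hunks below can be summarized as follows (drawn from the include/linux/mmzone.h hunk; the sketch itself is not part of the diff):

/* before this patch (declared in include/linux/mmzone.h, MEMCG_LRU_* ops exposed to callers): */
void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);

/* after this patch (rotation and the MEMCG_LRU_* ops become internal to mm/vmscan.c): */
void lru_gen_soft_reclaim(struct lruvec *lruvec);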
--- a/Documentation/mm/multigen_lru.rst
+++ b/Documentation/mm/multigen_lru.rst
@@ -186,9 +186,40 @@ is false positive, the cost is an additi
 which may yield hot pages anyway. Parameters of the filter itself can
 control the false positive rate in the limit.
 
+Memcg LRU
+---------
+An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
+since each node and memcg combination has an LRU of folios (see
+``mem_cgroup_lruvec()``). Its goal is to improve the scalability of
+global reclaim, which is critical to system-wide memory overcommit in
+data centers. Note that memcg LRU only applies to global reclaim.
+
+The basic structure of an memcg LRU can be understood by an analogy to
+the active/inactive LRU (of folios):
+
+1. It has the young and the old (generations), i.e., the counterparts
+   to the active and the inactive;
+2. The increment of ``max_seq`` triggers promotion, i.e., the
+   counterpart to activation;
+3. Other events trigger similar operations, e.g., offlining an memcg
+   triggers demotion, i.e., the counterpart to deactivation.
+
+In terms of global reclaim, it has two distinct features:
+
+1. Sharding, which allows each thread to start at a random memcg (in
+   the old generation) and improves parallelism;
+2. Eventual fairness, which allows direct reclaim to bail out at will
+   and reduces latency without affecting fairness over some time.
+
+In terms of traversing memcgs during global reclaim, it improves the
+best-case complexity from O(n) to O(1) and does not affect the
+worst-case complexity O(n). Therefore, on average, it has a sublinear
+complexity.
+
 Summary
 -------
-The multi-gen LRU can be disassembled into the following parts:
+The multi-gen LRU (of folios) can be disassembled into the following
+parts:
 
 * Generations
 * Rmap walks
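For orientation only, the per-node bookkeeping described above can be pictured with the sketch below. It is not part of this patch: the field names follow the accesses the code in this patch makes (pgdat->memcg_lru.seq, .nr_memcgs[], .fifo[gen][bin], .lock and lruvec->lrugen.list, .gen, .seg); the actual declarations live in include/linux/mmzone.h and may differ in detail.

#include <linux/list_nulls.h>
#include <linux/spinlock.h>

/* Illustrative sketch of the per-node "LRU of lruvecs"; assumes the kernel's
 * MEMCG_NR_GENS and MEMCG_NR_BINS constants from include/linux/mmzone.h. */
struct memcg_lru_sketch {
	unsigned long seq;				/* per-node memcg generation counter */
	unsigned long nr_memcgs[MEMCG_NR_GENS];		/* lruvecs in each generation */
	struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];	/* generation x bin lists */
	spinlock_t lock;				/* protects all of the above */
};

/* Each memcg's per-node lruvec is linked into one fifo[gen][bin] list via
 * lruvec->lrugen.list; lruvec->lrugen.gen records which generation it is in,
 * and lruvec->lrugen.seg records a MEMCG_LRU_HEAD/TAIL position hint. */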
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void
 	return current->in_lru_fault;
 }
 
-#ifdef CONFIG_MEMCG
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return READ_ONCE(lruvec->lrugen.seg);
-}
-#else
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-#endif
-
 static inline int lru_gen_from_seq(unsigned long seq)
 {
 	return seq % MAX_NR_GENS;
@@ -314,11 +302,6 @@ static inline bool lru_gen_in_fault(void
 	return false;
 }
 
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -368,15 +368,6 @@ struct page_vma_mapped_walk;
 #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
 
-/* see the comment on MEMCG_NR_GENS */
-enum {
-	MEMCG_LRU_NOP,
-	MEMCG_LRU_HEAD,
-	MEMCG_LRU_TAIL,
-	MEMCG_LRU_OLD,
-	MEMCG_LRU_YOUNG,
-};
-
 #ifdef CONFIG_LRU_GEN
 
 enum {
@@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgrou
 void lru_gen_online_memcg(struct mem_cgroup *memcg);
 void lru_gen_offline_memcg(struct mem_cgroup *memcg);
 void lru_gen_release_memcg(struct mem_cgroup *memcg);
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+void lru_gen_soft_reclaim(struct lruvec *lruvec);
 
 #else /* !CONFIG_MEMCG */
 
@@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg
 {
 }
 
-static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
 {
 }
 
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -478,12 +478,8 @@ static void mem_cgroup_update_tree(struc
 	struct mem_cgroup_tree_per_node *mctz;
 
 	if (lru_gen_enabled()) {
-		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
-
-		/* see the comment on MEMCG_NR_GENS */
-		if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
-			lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
-
+		if (soft_limit_excess(memcg))
+			lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
 		return;
 	}
 
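The seg check that used to be open-coded here does not go away; it moves behind the new helper that this patch adds to mm/vmscan.c. For reading convenience, that helper (reproduced from the vmscan.c hunk below) is:

void lru_gen_soft_reclaim(struct lruvec *lruvec)
{
	/* see the comment on MEMCG_NR_GENS */
	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
}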
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4692,6 +4692,148 @@ void lru_gen_look_around(struct page_vma
 }
 
 /******************************************************************************
+ * memcg LRU
+ ******************************************************************************/
+
+/* see the comment on MEMCG_NR_GENS */
+enum {
+	MEMCG_LRU_NOP,
+	MEMCG_LRU_HEAD,
+	MEMCG_LRU_TAIL,
+	MEMCG_LRU_OLD,
+	MEMCG_LRU_YOUNG,
+};
+
+#ifdef CONFIG_MEMCG
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return READ_ONCE(lruvec->lrugen.seg);
+}
+
+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+	int seg;
+	int old, new;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	spin_lock(&pgdat->memcg_lru.lock);
+
+	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+	seg = 0;
+	new = old = lruvec->lrugen.gen;
+
+	/* see the comment on MEMCG_NR_GENS */
+	if (op == MEMCG_LRU_HEAD)
+		seg = MEMCG_LRU_HEAD;
+	else if (op == MEMCG_LRU_TAIL)
+		seg = MEMCG_LRU_TAIL;
+	else if (op == MEMCG_LRU_OLD)
+		new = get_memcg_gen(pgdat->memcg_lru.seq);
+	else if (op == MEMCG_LRU_YOUNG)
+		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+	else
+		VM_WARN_ON_ONCE(true);
+
+	hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+	else
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+	pgdat->memcg_lru.nr_memcgs[old]--;
+	pgdat->memcg_lru.nr_memcgs[new]++;
+
+	lruvec->lrugen.gen = new;
+	WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+	spin_unlock(&pgdat->memcg_lru.lock);
+}
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+		pgdat->memcg_lru.nr_memcgs[gen]++;
+
+		lruvec->lrugen.gen = gen;
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+	}
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = lruvec->lrugen.gen;
+
+		hlist_nulls_del_rcu(&lruvec->lrugen.list);
+		pgdat->memcg_lru.nr_memcgs[gen]--;
+
+		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_soft_reclaim(struct lruvec *lruvec)
+{
+	/* see the comment on MEMCG_NR_GENS */
+	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return 0;
+}
+
+#endif
+
+/******************************************************************************
  * the eviction
  ******************************************************************************/
 
@@ -5398,53 +5540,6 @@ done:
 	pgdat->kswapd_failures = 0;
 }
 
-#ifdef CONFIG_MEMCG
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
-{
-	int seg;
-	int old, new;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
-	spin_lock(&pgdat->memcg_lru.lock);
-
-	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-	seg = 0;
-	new = old = lruvec->lrugen.gen;
-
-	/* see the comment on MEMCG_NR_GENS */
-	if (op == MEMCG_LRU_HEAD)
-		seg = MEMCG_LRU_HEAD;
-	else if (op == MEMCG_LRU_TAIL)
-		seg = MEMCG_LRU_TAIL;
-	else if (op == MEMCG_LRU_OLD)
-		new = get_memcg_gen(pgdat->memcg_lru.seq);
-	else if (op == MEMCG_LRU_YOUNG)
-		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
-	else
-		VM_WARN_ON_ONCE(true);
-
-	hlist_nulls_del_rcu(&lruvec->lrugen.list);
-
-	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
-		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-	else
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-
-	pgdat->memcg_lru.nr_memcgs[old]--;
-	pgdat->memcg_lru.nr_memcgs[new]++;
-
-	lruvec->lrugen.gen = new;
-	WRITE_ONCE(lruvec->lrugen.seg, seg);
-
-	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
-		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-	spin_unlock(&pgdat->memcg_lru.lock);
-}
-#endif
-
 /******************************************************************************
  * state change
  ******************************************************************************/
@@ -6090,67 +6185,6 @@ void lru_gen_exit_memcg(struct mem_cgrou
 	}
 }
 
-void lru_gen_online_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = get_memcg_gen(pgdat->memcg_lru.seq);
-
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
-		pgdat->memcg_lru.nr_memcgs[gen]++;
-
-		lruvec->lrugen.gen = gen;
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
-void lru_gen_offline_memcg(struct mem_cgroup *memcg)
-{
-	int nid;
-
-	for_each_node(nid) {
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
-	}
-}
-
-void lru_gen_release_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = lruvec->lrugen.gen;
-
-		hlist_nulls_del_rcu(&lruvec->lrugen.list);
-		pgdat->memcg_lru.nr_memcgs[gen]--;
-
-		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
-			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
 #endif /* CONFIG_MEMCG */
 
 static int __init init_lru_gen(void)
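
To make the op semantics of lru_gen_rotate_memcg() above easy to check at a glance, here is a small stand-alone user-space model of how each MEMCG_LRU_* op selects the destination generation and list position. It is illustrative only: MEMCG_NR_GENS is set to 2 for the model, and get_memcg_gen() is assumed to reduce a sequence number modulo MEMCG_NR_GENS, which this patch does not show.

#include <stdio.h>

/* Stand-alone model of the op handling in lru_gen_rotate_memcg() above.
 * Assumptions (not shown in this patch): MEMCG_NR_GENS is 2 and
 * get_memcg_gen() reduces a sequence number modulo MEMCG_NR_GENS. */
enum { MEMCG_LRU_NOP, MEMCG_LRU_HEAD, MEMCG_LRU_TAIL, MEMCG_LRU_OLD, MEMCG_LRU_YOUNG };

#define MEMCG_NR_GENS 2

static unsigned long get_memcg_gen(unsigned long seq)
{
	return seq % MEMCG_NR_GENS;
}

static void model_rotate(int op, unsigned long seq, int gen)
{
	int seg = 0;		/* lruvec->lrugen.seg after the rotation */
	int new_gen = gen;	/* lruvec->lrugen.gen after the rotation */
	int to_head;

	if (op == MEMCG_LRU_HEAD)
		seg = MEMCG_LRU_HEAD;			/* stay in place, mark "reclaim me first" */
	else if (op == MEMCG_LRU_TAIL)
		seg = MEMCG_LRU_TAIL;			/* stay in place, mark "reclaim me last" */
	else if (op == MEMCG_LRU_OLD)
		new_gen = get_memcg_gen(seq);		/* demote to the oldest generation */
	else if (op == MEMCG_LRU_YOUNG)
		new_gen = get_memcg_gen(seq + 1);	/* promote to the youngest generation */

	to_head = (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD);
	printf("op=%d: gen %d -> %d, seg=%d, requeued at the %s\n",
	       op, gen, new_gen, seg, to_head ? "head" : "tail");
}

int main(void)
{
	/* e.g. what lru_gen_offline_memcg() and lru_gen_soft_reclaim() trigger */
	model_rotate(MEMCG_LRU_OLD, 0, 1);
	model_rotate(MEMCG_LRU_HEAD, 0, 1);
	return 0;
}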