0008-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
  2. From: Changwei Ge <[email protected]>
  3. Date: Wed, 31 Jan 2018 16:15:06 -0800
  4. Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
  5. meta_alloc
  6. MIME-Version: 1.0
  7. Content-Type: text/plain; charset=UTF-8
  8. Content-Transfer-Encoding: 8bit
  9. A crash issue was reported by John Lightsey with a call trace as follows:
  10. ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
  11. ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
  12. ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
  13. ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
  14. dio_complete+0x19a/0x1a0
  15. do_blockdev_direct_IO+0x19dd/0x1eb0
  16. __blockdev_direct_IO+0x43/0x50
  17. ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
  18. generic_file_direct_write+0xb2/0x170
  19. __generic_file_write_iter+0xc3/0x1b0
  20. ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
  21. __vfs_write+0xae/0xf0
  22. vfs_write+0xb8/0x1b0
  23. SyS_write+0x4f/0xb0
  24. system_call_fastpath+0x16/0x75
  25. The BUG code told that extent tree wants to grow but no metadata was
  26. reserved ahead of time. From my investigation into this issue, the root
  27. cause is that although enough metadata is not reserved, there should be
  28. enough for following use. The rightmost extent is merged into its left one
  29. after marking extents written a certain number of times, because during
  30. marking extents written we get many physically contiguous extents. At
  31. last, an empty extent showed up and the rightmost path is removed from
  32. extent tree.
  33. Add a new mechanism to reuse extent blocks cached in dealloc which were
  34. just unlinked from extent tree to solve this crash issue.
  35. Criteria is that during marking extents *written*, if extent rotation
  36. and merging results in unlinking extent with growing extent tree later
  37. without any metadata reserved ahead of time, try to reuse those extents
  38. in dealloc in which deleted extents are cached.
  39. Also, this patch addresses the issue John reported that ::dw_zero_count
  40. is not calculated properly.
  41. After applying this patch, the issue John reported was gone. Thanks for
  42. the reproducer provided by John. And this patch has passed
  43. ocfs2-test(29 cases) suite running by New H3C Group.
  44. [[email protected]: fix static checker warning]
  45. Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
  46. [[email protected]: brelse(NULL) is legal]
  47. Link: http://lkml.kernel.org/r/[email protected]
  48. Signed-off-by: Changwei Ge <[email protected]>
  49. Reported-by: John Lightsey <[email protected]>
  50. Tested-by: John Lightsey <[email protected]>
  51. Cc: Joel Becker <[email protected]>
  52. Cc: Joseph Qi <[email protected]>
  53. Cc: Junxiao Bi <[email protected]>
  54. Cc: Dan Carpenter <[email protected]>
  55. Cc: Mark Fasheh <[email protected]>
  56. Signed-off-by: Andrew Morton <[email protected]>
  57. Signed-off-by: Linus Torvalds <[email protected]>
  58. (cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
  59. Signed-off-by: Fabian Grünbichler <[email protected]>
  60. ---
  61. fs/ocfs2/alloc.h | 1 +
  62. fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
  63. fs/ocfs2/aops.c | 6 ++
  64. 3 files changed, 203 insertions(+), 10 deletions(-)
  65. diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
  66. index 27b75cf32cfa..250bcacdf9e9 100644
  67. --- a/fs/ocfs2/alloc.h
  68. +++ b/fs/ocfs2/alloc.h
  69. @@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
  70. ocfs2_journal_access_func et_root_journal_access;
  71. void *et_object;
  72. unsigned int et_max_leaf_clusters;
  73. + struct ocfs2_cached_dealloc_ctxt *et_dealloc;
  74. };
  75. /*
  76. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
  77. index ab5105f9767e..2f2c76193f54 100644
  78. --- a/fs/ocfs2/alloc.c
  79. +++ b/fs/ocfs2/alloc.c
  80. @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
  81. struct ocfs2_extent_rec *rec);
  82. static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
  83. static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
  84. +
  85. +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
  86. + struct ocfs2_extent_tree *et,
  87. + struct buffer_head **new_eb_bh,
  88. + int blk_wanted, int *blk_given);
  89. +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
  90. +
  91. static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
  92. .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
  93. .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
  94. @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
  95. if (!obj)
  96. obj = (void *)bh->b_data;
  97. et->et_object = obj;
  98. + et->et_dealloc = NULL;
  99. et->et_ops->eo_fill_root_el(et);
  100. if (!et->et_ops->eo_fill_max_leaf_clusters)
  101. @@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
  102. struct buffer_head **last_eb_bh,
  103. struct ocfs2_alloc_context *meta_ac)
  104. {
  105. - int status, new_blocks, i;
  106. + int status, new_blocks, i, block_given = 0;
  107. u64 next_blkno, new_last_eb_blk;
  108. struct buffer_head *bh;
  109. struct buffer_head **new_eb_bhs = NULL;
  110. @@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle,
  111. goto bail;
  112. }
  113. - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
  114. - meta_ac, new_eb_bhs);
  115. - if (status < 0) {
  116. - mlog_errno(status);
  117. - goto bail;
  118. + /* Firstly, try to reuse dealloc since we have already estimated how
  119. + * many extent blocks we may use.
  120. + */
  121. + if (!ocfs2_is_dealloc_empty(et)) {
  122. + status = ocfs2_reuse_blk_from_dealloc(handle, et,
  123. + new_eb_bhs, new_blocks,
  124. + &block_given);
  125. + if (status < 0) {
  126. + mlog_errno(status);
  127. + goto bail;
  128. + }
  129. + }
  130. +
  131. + BUG_ON(block_given > new_blocks);
  132. +
  133. + if (block_given < new_blocks) {
  134. + BUG_ON(!meta_ac);
  135. + status = ocfs2_create_new_meta_bhs(handle, et,
  136. + new_blocks - block_given,
  137. + meta_ac,
  138. + &new_eb_bhs[block_given]);
  139. + if (status < 0) {
  140. + mlog_errno(status);
  141. + goto bail;
  142. + }
  143. }
  144. /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
  145. @@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
  146. struct ocfs2_alloc_context *meta_ac,
  147. struct buffer_head **ret_new_eb_bh)
  148. {
  149. - int status, i;
  150. + int status, i, block_given = 0;
  151. u32 new_clusters;
  152. struct buffer_head *new_eb_bh = NULL;
  153. struct ocfs2_extent_block *eb;
  154. struct ocfs2_extent_list *root_el;
  155. struct ocfs2_extent_list *eb_el;
  156. - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
  157. - &new_eb_bh);
  158. + if (!ocfs2_is_dealloc_empty(et)) {
  159. + status = ocfs2_reuse_blk_from_dealloc(handle, et,
  160. + &new_eb_bh, 1,
  161. + &block_given);
  162. + } else if (meta_ac) {
  163. + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
  164. + &new_eb_bh);
  165. +
  166. + } else {
  167. + BUG();
  168. + }
  169. +
  170. if (status < 0) {
  171. mlog_errno(status);
  172. goto bail;
  173. @@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
  174. int depth = le16_to_cpu(el->l_tree_depth);
  175. struct buffer_head *bh = NULL;
  176. - BUG_ON(meta_ac == NULL);
  177. + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
  178. shift = ocfs2_find_branch_target(et, &bh);
  179. if (shift < 0) {
  180. @@ -6585,6 +6623,154 @@ ocfs2_find_per_slot_free_list(int type,
  181. return fl;
  182. }
  183. +static struct ocfs2_per_slot_free_list *
  184. +ocfs2_find_preferred_free_list(int type,
  185. + int preferred_slot,
  186. + int *real_slot,
  187. + struct ocfs2_cached_dealloc_ctxt *ctxt)
  188. +{
  189. + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
  190. +
  191. + while (fl) {
  192. + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
  193. + *real_slot = fl->f_slot;
  194. + return fl;
  195. + }
  196. +
  197. + fl = fl->f_next_suballocator;
  198. + }
  199. +
  200. + /* If we can't find any free list matching preferred slot, just use
  201. + * the first one.
  202. + */
  203. + fl = ctxt->c_first_suballocator;
  204. + *real_slot = fl->f_slot;
  205. +
  206. + return fl;
  207. +}
  208. +
  209. +/* Return Value 1 indicates empty */
  210. +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
  211. +{
  212. + struct ocfs2_per_slot_free_list *fl = NULL;
  213. +
  214. + if (!et->et_dealloc)
  215. + return 1;
  216. +
  217. + fl = et->et_dealloc->c_first_suballocator;
  218. + if (!fl)
  219. + return 1;
  220. +
  221. + if (!fl->f_first)
  222. + return 1;
  223. +
  224. + return 0;
  225. +}
  226. +
  227. +/* If extent was deleted from tree due to extent rotation and merging, and
  228. + * no metadata is reserved ahead of time. Try to reuse some extents
  229. + * just deleted. This is only used to reuse extent blocks.
  230. + * It is supposed to find enough extent blocks in dealloc if our estimation
  231. + * on metadata is accurate.
  232. + */
  233. +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
  234. + struct ocfs2_extent_tree *et,
  235. + struct buffer_head **new_eb_bh,
  236. + int blk_wanted, int *blk_given)
  237. +{
  238. + int i, status = 0, real_slot;
  239. + struct ocfs2_cached_dealloc_ctxt *dealloc;
  240. + struct ocfs2_per_slot_free_list *fl;
  241. + struct ocfs2_cached_block_free *bf;
  242. + struct ocfs2_extent_block *eb;
  243. + struct ocfs2_super *osb =
  244. + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
  245. +
  246. + *blk_given = 0;
  247. +
  248. + /* If extent tree doesn't have a dealloc, this is not faulty. Just
  249. + * tell upper caller dealloc can't provide any block and it should
  250. + * ask for alloc to claim more space.
  251. + */
  252. + dealloc = et->et_dealloc;
  253. + if (!dealloc)
  254. + goto bail;
  255. +
  256. + for (i = 0; i < blk_wanted; i++) {
  257. + /* Prefer to use local slot */
  258. + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
  259. + osb->slot_num, &real_slot,
  260. + dealloc);
  261. + /* If no more block can be reused, we should claim more
  262. + * from alloc. Just return here normally.
  263. + */
  264. + if (!fl) {
  265. + status = 0;
  266. + break;
  267. + }
  268. +
  269. + bf = fl->f_first;
  270. + fl->f_first = bf->free_next;
  271. +
  272. + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
  273. + if (new_eb_bh[i] == NULL) {
  274. + status = -ENOMEM;
  275. + mlog_errno(status);
  276. + goto bail;
  277. + }
  278. +
  279. + mlog(0, "Reusing block(%llu) from "
  280. + "dealloc(local slot:%d, real slot:%d)\n",
  281. + bf->free_blk, osb->slot_num, real_slot);
  282. +
  283. + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
  284. +
  285. + status = ocfs2_journal_access_eb(handle, et->et_ci,
  286. + new_eb_bh[i],
  287. + OCFS2_JOURNAL_ACCESS_CREATE);
  288. + if (status < 0) {
  289. + mlog_errno(status);
  290. + goto bail;
  291. + }
  292. +
  293. + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
  294. + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
  295. +
  296. + /* We can't guarantee that buffer head is still cached, so
  297. + * populate the extent block again.
  298. + */
  299. + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
  300. + eb->h_blkno = cpu_to_le64(bf->free_blk);
  301. + eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
  302. + eb->h_suballoc_slot = cpu_to_le16(real_slot);
  303. + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
  304. + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
  305. + eb->h_list.l_count =
  306. + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
  307. +
  308. + /* We'll also be dirtied by the caller, so
  309. + * this isn't absolutely necessary.
  310. + */
  311. + ocfs2_journal_dirty(handle, new_eb_bh[i]);
  312. +
  313. + if (!fl->f_first) {
  314. + dealloc->c_first_suballocator = fl->f_next_suballocator;
  315. + kfree(fl);
  316. + }
  317. + kfree(bf);
  318. + }
  319. +
  320. + *blk_given = i;
  321. +
  322. +bail:
  323. + if (unlikely(status < 0)) {
  324. + for (i = 0; i < blk_wanted; i++)
  325. + brelse(new_eb_bh[i]);
  326. + }
  327. +
  328. + return status;
  329. +}
  330. +
  331. int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
  332. int type, int slot, u64 suballoc,
  333. u64 blkno, unsigned int bit)
  334. diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
  335. index 256986aca8df..e8e205bf2e41 100644
  336. --- a/fs/ocfs2/aops.c
  337. +++ b/fs/ocfs2/aops.c
  338. @@ -2332,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
  339. ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
  340. + /* Attach dealloc with extent tree in case that we may reuse extents
  341. + * which are already unlinked from current extent tree due to extent
  342. + * rotation and merging.
  343. + */
  344. + et.et_dealloc = &dealloc;
  345. +
  346. ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
  347. &data_ac, &meta_ac);
  348. if (ret) {
  349. --
  350. 2.14.2