Browse Source

add cherry-picks for OCFS2 bug

see https://forum.proxmox.com/threads/ocfs2-kernel-bug.39163/
Fabian Grünbichler 7 years ago
parent
commit
3323a8b78c

+ 61 - 0
patches/kernel/0028-ocfs2-make-metadata-estimation-accurate-and-clear.patch

@@ -0,0 +1,61 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Changwei Ge <[email protected]>
+Date: Wed, 31 Jan 2018 16:15:02 -0800
+Subject: [PATCH] ocfs2: make metadata estimation accurate and clear
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Current code assume that ::w_unwritten_list always has only one item on.
+This is not right and hard to get understood.  So improve how to count
+unwritten item.
+
+Link: http://lkml.kernel.org/r/[email protected]
+Signed-off-by: Changwei Ge <[email protected]>
+Reported-by: John Lightsey <[email protected]>
+Tested-by: John Lightsey <[email protected]>
+Cc: Mark Fasheh <[email protected]>
+Cc: Joseph Qi <[email protected]>
+Cc: Junxiao Bi <[email protected]>
+Cc: Joel Becker <[email protected]>
+Cc: Changwei Ge <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+Signed-off-by: Linus Torvalds <[email protected]>
+(cherry picked from commit 63de8bd9328bf2a778fc277503da163ae3defa3c)
+Signed-off-by: Fabian Grünbichler <[email protected]>
+---
+ fs/ocfs2/aops.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index 88a31e9340a0..77ec9b495027 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -784,6 +784,7 @@ struct ocfs2_write_ctxt {
+ 	struct ocfs2_cached_dealloc_ctxt w_dealloc;
+ 
+ 	struct list_head		w_unwritten_list;
++	unsigned int			w_unwritten_count;
+ };
+ 
+ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
+@@ -1373,6 +1374,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
+ 	desc->c_clear_unwritten = 0;
+ 	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+ 	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
++	wc->w_unwritten_count++;
+ 	new = NULL;
+ unlock:
+ 	spin_unlock(&oi->ip_lock);
+@@ -2246,7 +2248,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ 		ue->ue_phys = desc->c_phys;
+ 
+ 		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+-		dwc->dw_zero_count++;
++		dwc->dw_zero_count += wc->w_unwritten_count;
+ 	}
+ 
+ 	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
+-- 
+2.14.2
+

+ 370 - 0
patches/kernel/0029-ocfs2-try-to-reuse-extent-block-in-dealloc-without-m.patch

@@ -0,0 +1,370 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Changwei Ge <[email protected]>
+Date: Wed, 31 Jan 2018 16:15:06 -0800
+Subject: [PATCH] ocfs2: try to reuse extent block in dealloc without
+ meta_alloc
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+A crash issue was reported by John Lightsey with a call trace as follows:
+
+  ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
+  ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
+  ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
+  ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
+  dio_complete+0x19a/0x1a0
+  do_blockdev_direct_IO+0x19dd/0x1eb0
+  __blockdev_direct_IO+0x43/0x50
+  ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
+  generic_file_direct_write+0xb2/0x170
+  __generic_file_write_iter+0xc3/0x1b0
+  ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
+  __vfs_write+0xae/0xf0
+  vfs_write+0xb8/0x1b0
+  SyS_write+0x4f/0xb0
+  system_call_fastpath+0x16/0x75
+
+The BUG code told that extent tree wants to grow but no metadata was
+reserved ahead of time.  From my investigation into this issue, the root
+cause it that although enough metadata is not reserved, there should be
+enough for following use.  Rightmost extent is merged into its left one
+due to a certain times of marking extent written.  Because during
+marking extent written, we got many physically continuous extents.  At
+last, an empty extent showed up and the rightmost path is removed from
+extent tree.
+
+Add a new mechanism to reuse extent block cached in dealloc which were
+just unlinked from extent tree to solve this crash issue.
+
+Criteria is that during marking extents *written*, if extent rotation
+and merging results in unlinking extent with growing extent tree later
+without any metadata reserved ahead of time, try to reuse those extents
+in dealloc in which deleted extents are cached.
+
+Also, this patch addresses the issue John reported that ::dw_zero_count
+is not calculated properly.
+
+After applying this patch, the issue John reported was gone.  Thanks for
+the reproducer provided by John.  And this patch has passed
+ocfs2-test(29 cases) suite running by New H3C Group.
+
+[[email protected]: fix static checker warnning]
+  Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com
+[[email protected]: brelse(NULL) is legal]
+Link: http://lkml.kernel.org/r/[email protected]
+Signed-off-by: Changwei Ge <[email protected]>
+Reported-by: John Lightsey <[email protected]>
+Tested-by: John Lightsey <[email protected]>
+Cc: Joel Becker <[email protected]>
+Cc: Joseph Qi <[email protected]>
+Cc: Junxiao Bi <[email protected]>
+Cc: Dan Carpenter <[email protected]>
+Cc: Mark Fasheh <[email protected]>
+Signed-off-by: Andrew Morton <[email protected]>
+Signed-off-by: Linus Torvalds <[email protected]>
+(cherry picked from commit 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42)
+Signed-off-by: Fabian Grünbichler <[email protected]>
+---
+ fs/ocfs2/alloc.h |   1 +
+ fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
+ fs/ocfs2/aops.c  |   6 ++
+ 3 files changed, 203 insertions(+), 10 deletions(-)
+
+diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
+index 4a5152ec88a3..571692171dd1 100644
+--- a/fs/ocfs2/alloc.h
++++ b/fs/ocfs2/alloc.h
+@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
+ 	ocfs2_journal_access_func		et_root_journal_access;
+ 	void					*et_object;
+ 	unsigned int				et_max_leaf_clusters;
++	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
+ };
+ 
+ /*
+diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
+index 386aecce881d..9b5e7d8ba710 100644
+--- a/fs/ocfs2/alloc.c
++++ b/fs/ocfs2/alloc.c
+@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
+ 				     struct ocfs2_extent_rec *rec);
+ static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
+ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
++
++static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
++					struct ocfs2_extent_tree *et,
++					struct buffer_head **new_eb_bh,
++					int blk_wanted, int *blk_given);
++static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
++
+ static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+ 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+ 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+ 	if (!obj)
+ 		obj = (void *)bh->b_data;
+ 	et->et_object = obj;
++	et->et_dealloc = NULL;
+ 
+ 	et->et_ops->eo_fill_root_el(et);
+ 	if (!et->et_ops->eo_fill_max_leaf_clusters)
+@@ -1159,7 +1167,7 @@ static int ocfs2_add_branch(handle_t *handle,
+ 			    struct buffer_head **last_eb_bh,
+ 			    struct ocfs2_alloc_context *meta_ac)
+ {
+-	int status, new_blocks, i;
++	int status, new_blocks, i, block_given = 0;
+ 	u64 next_blkno, new_last_eb_blk;
+ 	struct buffer_head *bh;
+ 	struct buffer_head **new_eb_bhs = NULL;
+@@ -1214,11 +1222,31 @@ static int ocfs2_add_branch(handle_t *handle,
+ 		goto bail;
+ 	}
+ 
+-	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
+-					   meta_ac, new_eb_bhs);
+-	if (status < 0) {
+-		mlog_errno(status);
+-		goto bail;
++	/* Firstyly, try to reuse dealloc since we have already estimated how
++	 * many extent blocks we may use.
++	 */
++	if (!ocfs2_is_dealloc_empty(et)) {
++		status = ocfs2_reuse_blk_from_dealloc(handle, et,
++						      new_eb_bhs, new_blocks,
++						      &block_given);
++		if (status < 0) {
++			mlog_errno(status);
++			goto bail;
++		}
++	}
++
++	BUG_ON(block_given > new_blocks);
++
++	if (block_given < new_blocks) {
++		BUG_ON(!meta_ac);
++		status = ocfs2_create_new_meta_bhs(handle, et,
++						   new_blocks - block_given,
++						   meta_ac,
++						   &new_eb_bhs[block_given]);
++		if (status < 0) {
++			mlog_errno(status);
++			goto bail;
++		}
+ 	}
+ 
+ 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+@@ -1341,15 +1369,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
+ 				  struct ocfs2_alloc_context *meta_ac,
+ 				  struct buffer_head **ret_new_eb_bh)
+ {
+-	int status, i;
++	int status, i, block_given = 0;
+ 	u32 new_clusters;
+ 	struct buffer_head *new_eb_bh = NULL;
+ 	struct ocfs2_extent_block *eb;
+ 	struct ocfs2_extent_list  *root_el;
+ 	struct ocfs2_extent_list  *eb_el;
+ 
+-	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+-					   &new_eb_bh);
++	if (!ocfs2_is_dealloc_empty(et)) {
++		status = ocfs2_reuse_blk_from_dealloc(handle, et,
++						      &new_eb_bh, 1,
++						      &block_given);
++	} else if (meta_ac) {
++		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
++						   &new_eb_bh);
++
++	} else {
++		BUG();
++	}
++
+ 	if (status < 0) {
+ 		mlog_errno(status);
+ 		goto bail;
+@@ -1512,7 +1550,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
+ 	int depth = le16_to_cpu(el->l_tree_depth);
+ 	struct buffer_head *bh = NULL;
+ 
+-	BUG_ON(meta_ac == NULL);
++	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
+ 
+ 	shift = ocfs2_find_branch_target(et, &bh);
+ 	if (shift < 0) {
+@@ -6593,6 +6631,154 @@ ocfs2_find_per_slot_free_list(int type,
+ 	return fl;
+ }
+ 
++static struct ocfs2_per_slot_free_list *
++ocfs2_find_preferred_free_list(int type,
++			       int preferred_slot,
++			       int *real_slot,
++			       struct ocfs2_cached_dealloc_ctxt *ctxt)
++{
++	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
++
++	while (fl) {
++		if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
++			*real_slot = fl->f_slot;
++			return fl;
++		}
++
++		fl = fl->f_next_suballocator;
++	}
++
++	/* If we can't find any free list matching preferred slot, just use
++	 * the first one.
++	 */
++	fl = ctxt->c_first_suballocator;
++	*real_slot = fl->f_slot;
++
++	return fl;
++}
++
++/* Return Value 1 indicates empty */
++static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
++{
++	struct ocfs2_per_slot_free_list *fl = NULL;
++
++	if (!et->et_dealloc)
++		return 1;
++
++	fl = et->et_dealloc->c_first_suballocator;
++	if (!fl)
++		return 1;
++
++	if (!fl->f_first)
++		return 1;
++
++	return 0;
++}
++
++/* If extent was deleted from tree due to extent rotation and merging, and
++ * no metadata is reserved ahead of time. Try to reuse some extents
++ * just deleted. This is only used to reuse extent blocks.
++ * It is supposed to find enough extent blocks in dealloc if our estimation
++ * on metadata is accurate.
++ */
++static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
++					struct ocfs2_extent_tree *et,
++					struct buffer_head **new_eb_bh,
++					int blk_wanted, int *blk_given)
++{
++	int i, status = 0, real_slot;
++	struct ocfs2_cached_dealloc_ctxt *dealloc;
++	struct ocfs2_per_slot_free_list *fl;
++	struct ocfs2_cached_block_free *bf;
++	struct ocfs2_extent_block *eb;
++	struct ocfs2_super *osb =
++		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
++
++	*blk_given = 0;
++
++	/* If extent tree doesn't have a dealloc, this is not faulty. Just
++	 * tell upper caller dealloc can't provide any block and it should
++	 * ask for alloc to claim more space.
++	 */
++	dealloc = et->et_dealloc;
++	if (!dealloc)
++		goto bail;
++
++	for (i = 0; i < blk_wanted; i++) {
++		/* Prefer to use local slot */
++		fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
++						    osb->slot_num, &real_slot,
++						    dealloc);
++		/* If no more block can be reused, we should claim more
++		 * from alloc. Just return here normally.
++		 */
++		if (!fl) {
++			status = 0;
++			break;
++		}
++
++		bf = fl->f_first;
++		fl->f_first = bf->free_next;
++
++		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
++		if (new_eb_bh[i] == NULL) {
++			status = -ENOMEM;
++			mlog_errno(status);
++			goto bail;
++		}
++
++		mlog(0, "Reusing block(%llu) from "
++		     "dealloc(local slot:%d, real slot:%d)\n",
++		     bf->free_blk, osb->slot_num, real_slot);
++
++		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
++
++		status = ocfs2_journal_access_eb(handle, et->et_ci,
++						 new_eb_bh[i],
++						 OCFS2_JOURNAL_ACCESS_CREATE);
++		if (status < 0) {
++			mlog_errno(status);
++			goto bail;
++		}
++
++		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
++		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
++
++		/* We can't guarantee that buffer head is still cached, so
++		 * polutlate the extent block again.
++		 */
++		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
++		eb->h_blkno = cpu_to_le64(bf->free_blk);
++		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
++		eb->h_suballoc_slot = cpu_to_le16(real_slot);
++		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
++		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
++		eb->h_list.l_count =
++			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
++
++		/* We'll also be dirtied by the caller, so
++		 * this isn't absolutely necessary.
++		 */
++		ocfs2_journal_dirty(handle, new_eb_bh[i]);
++
++		if (!fl->f_first) {
++			dealloc->c_first_suballocator = fl->f_next_suballocator;
++			kfree(fl);
++		}
++		kfree(bf);
++	}
++
++	*blk_given = i;
++
++bail:
++	if (unlikely(status < 0)) {
++		for (i = 0; i < blk_wanted; i++)
++			brelse(new_eb_bh[i]);
++	}
++
++	return status;
++}
++
+ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ 			      int type, int slot, u64 suballoc,
+ 			      u64 blkno, unsigned int bit)
+diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
+index 77ec9b495027..2ff02dda97d8 100644
+--- a/fs/ocfs2/aops.c
++++ b/fs/ocfs2/aops.c
+@@ -2322,6 +2322,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
+ 
+ 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+ 
++	/* Attach dealloc with extent tree in case that we may reuse extents
++	 * which are already unlinked from current extent tree due to extent
++	 * rotation and merging.
++	 */
++	et.et_dealloc = &dealloc;
++
+ 	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+ 				    &data_ac, &meta_ac);
+ 	if (ret) {
+-- 
+2.14.2
+