Blame SOURCES/xfsprogs-5.7.0-xfs_repair-fix-rebuilding-btree-block-less-than-minr.patch

d13294
From 6df28d12d7760701c9d11e659e374665c5ffd0b9 Mon Sep 17 00:00:00 2001
d13294
From: Gao Xiang <hsiangkao@redhat.com>
d13294
Date: Fri, 10 Jul 2020 15:32:36 -0400
d13294
Subject: [PATCH] xfs_repair: fix rebuilding btree block less than minrecs
d13294
d13294
In production, we found that sometimes xfs_repair phase 5
d13294
rebuilds freespace node block with pointers less than minrecs
d13294
and if we trigger xfs_repair again, it would report
d13294
the following message:
d13294
d13294
bad btree nrecs (39, min=40, max=80) in btbno block 0/7882
d13294
d13294
The background is that xfs_repair starts to rebuild AGFL
d13294
after the freespace btree is settled in phase 5 so we may
d13294
need to leave necessary room in advance for each btree
d13294
leaves in order to avoid freespace btree split and then
d13294
result in AGFL rebuild failures. The old mathematics uses
d13294
ceil(num_extents / maxrecs) to decide the number of node
d13294
blocks. That would be fine without leaving extra space
d13294
since minrecs = maxrecs / 2 but if some slack was decreased
d13294
from maxrecs, the result would be larger than what is
d13294
expected and cause num_recs_pb to be less than minrecs, i.e.:
d13294
d13294
num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78
d13294
d13294
so we'd get
d13294
d13294
num_blocks = ceil(79 / 78) = 2,
d13294
num_recs_pb = 79 / 2 = 39, which is less than
d13294
minrecs = 80 / 2 = 40
d13294
d13294
OTOH, btree bulk loading code behaves in a different way.
d13294
As in xfs_btree_bload_level_geometry it wrote
d13294
d13294
num_blocks = floor(num_extents / maxrecs)
d13294
d13294
which will never go below minrecs. And when it goes above
d13294
maxrecs, just increment num_blocks and recalculate so we
d13294
can get the reasonable results.
d13294
d13294
Later, btree bulk loader will replace the current repair code.
d13294
But we may still want to look for a backportable solution
d13294
for stable versions. Hence, keep the same logic to avoid
d13294
the freespace as well as rmap btree minrecs underflow for now.
d13294
d13294
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
d13294
Cc: Dave Chinner <dchinner@redhat.com>
d13294
Cc: Eric Sandeen <sandeen@sandeen.net>
d13294
Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
d13294
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
d13294
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
d13294
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
d13294
---
d13294
 repair/phase5.c | 152 ++++++++++++++++++++----------------------------
d13294
 1 file changed, 63 insertions(+), 89 deletions(-)
d13294
d13294
Index: xfsprogs-5.0.0/repair/phase5.c
d13294
===================================================================
d13294
--- xfsprogs-5.0.0.orig/repair/phase5.c
d13294
+++ xfsprogs-5.0.0/repair/phase5.c
d13294
@@ -346,11 +346,32 @@ finish_cursor(bt_status_t *curs)
d13294
  * failure at runtime. Hence leave a couple of records slack space in
d13294
  * each block to allow immediate modification of the tree without
d13294
  * requiring splits to be done.
d13294
- *
d13294
- * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
d13294
  */
d13294
-#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
d13294
-	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
d13294
+static void
d13294
+compute_level_geometry(
d13294
+	struct xfs_mount	*mp,
d13294
+	struct bt_stat_level	*lptr,
d13294
+	uint64_t		nr_this_level,
d13294
+	int			slack,
d13294
+	bool			leaf)
d13294
+{
d13294
+	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
d13294
+	unsigned int		desired_npb;
d13294
+
d13294
+	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
d13294
+	lptr->num_recs_tot = nr_this_level;
d13294
+	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
d13294
+
d13294
+	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
d13294
+	lptr->modulo = nr_this_level % lptr->num_blocks;
d13294
+	if (lptr->num_recs_pb > maxrecs ||
d13294
+	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
d13294
+		lptr->num_blocks++;
d13294
+
d13294
+		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
d13294
+		lptr->modulo = nr_this_level % lptr->num_blocks;
d13294
+	}
d13294
+}
d13294
 
d13294
 /*
d13294
  * this calculates a freespace cursor for an ag.
d13294
@@ -368,6 +389,7 @@ calculate_freespace_cursor(xfs_mount_t *
d13294
 	int			i;
d13294
 	int			extents_used;
d13294
 	int			extra_blocks;
d13294
+	uint64_t		old_blocks;
d13294
 	bt_stat_level_t		*lptr;
d13294
 	bt_stat_level_t		*p_lptr;
d13294
 	extent_tree_node_t	*ext_ptr;
d13294
@@ -386,10 +408,7 @@ calculate_freespace_cursor(xfs_mount_t *
d13294
 	 * of the tree and set up the cursor for the leaf level
d13294
 	 * (note that the same code is duplicated further down)
d13294
 	 */
d13294
-	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
d13294
-	lptr->num_recs_pb = num_extents / lptr->num_blocks;
d13294
-	lptr->modulo = num_extents % lptr->num_blocks;
d13294
-	lptr->num_recs_tot = num_extents;
d13294
+	compute_level_geometry(mp, lptr, num_extents, 2, true);
d13294
 	level = 1;
d13294
 
d13294
 #ifdef XR_BLD_FREE_TRACE
d13294
@@ -403,30 +422,23 @@ calculate_freespace_cursor(xfs_mount_t *
d13294
 	 * if we need more levels, set them up.  # of records
d13294
 	 * per level is the # of blocks in the level below it
d13294
 	 */
d13294
-	if (lptr->num_blocks > 1)  {
d13294
-		for (; btree_curs->level[level - 1].num_blocks > 1
d13294
-				&& level < XFS_BTREE_MAXLEVELS;
d13294
-				level++)  {
d13294
-			lptr = &btree_curs->level[level];
d13294
-			p_lptr = &btree_curs->level[level - 1];
d13294
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
d13294
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
d13294
-			lptr->modulo = p_lptr->num_blocks
d13294
-					% lptr->num_blocks;
d13294
-			lptr->num_recs_pb = p_lptr->num_blocks
d13294
-					/ lptr->num_blocks;
d13294
-			lptr->num_recs_tot = p_lptr->num_blocks;
d13294
+	while (lptr->num_blocks > 1) {
d13294
+		p_lptr = lptr;
d13294
+		lptr = &btree_curs->level[level];
d13294
+
d13294
+		compute_level_geometry(mp, lptr,
d13294
+				p_lptr->num_blocks, 0, false);
d13294
 #ifdef XR_BLD_FREE_TRACE
d13294
-			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
d13294
-					level,
d13294
-					lptr->num_blocks,
d13294
-					lptr->num_recs_pb,
d13294
-					lptr->modulo,
d13294
-					lptr->num_recs_tot);
d13294
+		fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
d13294
+				level,
d13294
+				lptr->num_blocks,
d13294
+				lptr->num_recs_pb,
d13294
+				lptr->modulo,
d13294
+				lptr->num_recs_tot);
d13294
 #endif
d13294
-		}
d13294
+		level++;
d13294
 	}
d13294
-
d13294
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
d13294
 	ASSERT(lptr->num_blocks == 1);
d13294
 	btree_curs->num_levels = level;
d13294
 
d13294
@@ -494,8 +506,11 @@ calculate_freespace_cursor(xfs_mount_t *
d13294
 	 * see if the number of leaf blocks will change as a result
d13294
 	 * of the number of extents changing
d13294
 	 */
d13294
-	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
d13294
-			!= btree_curs->level[0].num_blocks)  {
d13294
+	old_blocks = btree_curs->level[0].num_blocks;
d13294
+	compute_level_geometry(mp, &btree_curs->level[0], num_extents, 2, true);
d13294
+	extra_blocks = 0;
d13294
+
d13294
+	if (old_blocks != btree_curs->level[0].num_blocks)  {
d13294
 		/*
d13294
 		 * yes -- recalculate the cursor.  If the number of
d13294
 		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
d13294
@@ -551,31 +566,19 @@ calculate_freespace_cursor(xfs_mount_t *
d13294
 		}
d13294
 
d13294
 		lptr = &btree_curs->level[0];
d13294
-		lptr->num_blocks = howmany(num_extents,
d13294
-					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
d13294
-		lptr->num_recs_pb = num_extents / lptr->num_blocks;
d13294
-		lptr->modulo = num_extents % lptr->num_blocks;
d13294
-		lptr->num_recs_tot = num_extents;
d13294
 		level = 1;
d13294
 
d13294
 		/*
d13294
 		 * if we need more levels, set them up
d13294
 		 */
d13294
-		if (lptr->num_blocks > 1)  {
d13294
-			for (level = 1; btree_curs->level[level-1].num_blocks
d13294
-					> 1 && level < XFS_BTREE_MAXLEVELS;
d13294
-					level++)  {
d13294
-				lptr = &btree_curs->level[level];
d13294
-				p_lptr = &btree_curs->level[level-1];
d13294
-				lptr->num_blocks = howmany(p_lptr->num_blocks,
d13294
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
d13294
-				lptr->modulo = p_lptr->num_blocks
d13294
-						% lptr->num_blocks;
d13294
-				lptr->num_recs_pb = p_lptr->num_blocks
d13294
-						/ lptr->num_blocks;
d13294
-				lptr->num_recs_tot = p_lptr->num_blocks;
d13294
-			}
d13294
+		while (lptr->num_blocks > 1) {
d13294
+			p_lptr = lptr;
d13294
+			lptr = &btree_curs->level[level++];
d13294
+
d13294
+			compute_level_geometry(mp, lptr,
d13294
+					p_lptr->num_blocks, 0, false);
d13294
 		}
d13294
+		ASSERT(level < XFS_BTREE_MAXLEVELS);
d13294
 		ASSERT(lptr->num_blocks == 1);
d13294
 		btree_curs->num_levels = level;
d13294
 
d13294
@@ -589,22 +592,6 @@ calculate_freespace_cursor(xfs_mount_t *
d13294
 
d13294
 		ASSERT(blocks_allocated_total >= blocks_needed);
d13294
 		extra_blocks = blocks_allocated_total - blocks_needed;
d13294
-	} else  {
d13294
-		if (extents_used > 0) {
d13294
-			/*
d13294
-			 * reset the leaf level geometry to account
d13294
-			 * for consumed extents.  we can leave the
d13294
-			 * rest of the cursor alone since the number
d13294
-			 * of leaf blocks hasn't changed.
d13294
-			 */
d13294
-			lptr = &btree_curs->level[0];
d13294
-
d13294
-			lptr->num_recs_pb = num_extents / lptr->num_blocks;
d13294
-			lptr->modulo = num_extents % lptr->num_blocks;
d13294
-			lptr->num_recs_tot = num_extents;
d13294
-		}
d13294
-
d13294
-		extra_blocks = 0;
d13294
 	}
d13294
 
d13294
 	btree_curs->num_tot_blocks = blocks_allocated_pt;
d13294
@@ -1335,7 +1322,6 @@ init_rmapbt_cursor(
d13294
 	struct bt_stat_level	*lptr;
d13294
 	struct bt_stat_level	*p_lptr;
d13294
 	xfs_extlen_t		blocks_allocated;
d13294
-	int			maxrecs;
d13294
 
d13294
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) {
d13294
 		memset(btree_curs, 0, sizeof(struct bt_status));
d13294
@@ -1371,32 +1357,20 @@ init_rmapbt_cursor(
d13294
 	 * Leave enough slack in the rmapbt that we can insert the
d13294
 	 * metadata AG entries without too many splits.
d13294
 	 */
d13294
-	maxrecs = mp->m_rmap_mxr[0];
d13294
-	if (num_recs > maxrecs)
d13294
-		maxrecs -= 10;
d13294
-	blocks_allocated = lptr->num_blocks = howmany(num_recs, maxrecs);
d13294
-
d13294
-	lptr->modulo = num_recs % lptr->num_blocks;
d13294
-	lptr->num_recs_pb = num_recs / lptr->num_blocks;
d13294
-	lptr->num_recs_tot = num_recs;
d13294
+	compute_level_geometry(mp, lptr, num_recs,
d13294
+			num_recs > mp->m_rmap_mxr[0] ? 10 : 0, true);
d13294
+	blocks_allocated = lptr->num_blocks;
d13294
 	level = 1;
d13294
 
d13294
-	if (lptr->num_blocks > 1)  {
d13294
-		for (; btree_curs->level[level-1].num_blocks > 1
d13294
-				&& level < XFS_BTREE_MAXLEVELS;
d13294
-				level++)  {
d13294
-			lptr = &btree_curs->level[level];
d13294
-			p_lptr = &btree_curs->level[level - 1];
d13294
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
d13294
-				mp->m_rmap_mxr[1]);
d13294
-			lptr->modulo = p_lptr->num_blocks % lptr->num_blocks;
d13294
-			lptr->num_recs_pb = p_lptr->num_blocks
d13294
-					/ lptr->num_blocks;
d13294
-			lptr->num_recs_tot = p_lptr->num_blocks;
d13294
-
d13294
-			blocks_allocated += lptr->num_blocks;
d13294
-		}
d13294
+	while (lptr->num_blocks > 1) {
d13294
+		p_lptr = lptr;
d13294
+		lptr = &btree_curs->level[level++];
d13294
+
d13294
+		compute_level_geometry(mp, lptr,
d13294
+				p_lptr->num_blocks, 0, false);
d13294
+		blocks_allocated += lptr->num_blocks;
d13294
 	}
d13294
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
d13294
 	ASSERT(lptr->num_blocks == 1);
d13294
 	btree_curs->num_levels = level;
d13294