From 6df28d12d7760701c9d11e659e374665c5ffd0b9 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@redhat.com>
Date: Fri, 10 Jul 2020 15:32:36 -0400
Subject: [PATCH] xfs_repair: fix rebuilding btree block less than minrecs

In production, we found that sometimes xfs_repair phase 5 rebuilds a
freespace btree block with fewer pointers than minrecs, and if
xfs_repair is run again it reports the following message:

bad btree nrecs (39, min=40, max=80) in btbno block 0/7882

The background is that xfs_repair rebuilds the AGFL after the
freespace btrees are settled in phase 5, so we need to leave some
room in each btree leaf in advance to avoid a freespace btree split,
which would make the AGFL rebuild fail. The old math used
ceil(num_extents / maxrecs) to decide the number of blocks at each
level. That is fine when no extra space is left, since
minrecs = maxrecs / 2, but once some slack is subtracted from maxrecs
the result can be larger than expected and leave num_recs_pb below
minrecs, for example:

num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78

so we'd get

num_blocks = ceil(79 / 78) = 2,
num_recs_pb = 79 / 2 = 39, which is less than
minrecs = 80 / 2 = 40

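As a standalone sketch (not part of the patch), the old ceil-based
math can be reproduced like this, using the values from the example
above:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t	num_extents = 79;
	unsigned int	maxrecs = 80;			/* leaf maxrecs */
	unsigned int	minrecs = maxrecs / 2;		/* 40 */
	unsigned int	adj_maxrecs = maxrecs - 2;	/* 2 records of slack */

	/* old math: howmany() is a ceiling division */
	uint64_t	num_blocks = (num_extents + adj_maxrecs - 1) / adj_maxrecs;
	uint64_t	num_recs_pb = num_extents / num_blocks;

	printf("num_blocks=%llu num_recs_pb=%llu minrecs=%u\n",
			(unsigned long long)num_blocks,
			(unsigned long long)num_recs_pb, minrecs);

	/* 39 < 40: exactly the nrecs underflow xfs_repair complains about */
	assert(num_recs_pb < minrecs);
	return 0;
}
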
OTOH, the btree bulk loading code behaves differently:
xfs_btree_bload_level_geometry computes

num_blocks = floor(num_extents / maxrecs)

so the number of records per block never drops below minrecs. When it
would go above maxrecs, it just increments num_blocks and recalculates,
which gives a reasonable result.

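Here is a simplified, standalone sketch of that floor-based geometry
(the real code is xfs_btree_bload_level_geometry in the kernel and the
new compute_level_geometry below; the helper name and values here are
only illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void
level_geometry(
	uint64_t	nr_this_level,
	unsigned int	maxrecs,
	unsigned int	minrecs,
	unsigned int	slack,
	uint64_t	*num_blocks,
	uint64_t	*num_recs_pb)
{
	unsigned int	desired_npb;
	uint64_t	blocks, recs_pb;

	/* aim for maxrecs - slack records per block, but never below minrecs */
	desired_npb = maxrecs - slack > minrecs ? maxrecs - slack : minrecs;

	/* floor division, at least one block */
	blocks = nr_this_level / desired_npb;
	if (blocks == 0)
		blocks = 1;

	recs_pb = nr_this_level / blocks;
	if (recs_pb > maxrecs ||
	    (recs_pb == maxrecs && nr_this_level % blocks)) {
		/* would overflow a block: use one more block and recalculate */
		blocks++;
		recs_pb = nr_this_level / blocks;
	}

	*num_blocks = blocks;
	*num_recs_pb = recs_pb;
}

int
main(void)
{
	uint64_t	blocks, recs_pb;

	/* the 79-extent example: one block of 79 records, >= minrecs */
	level_geometry(79, 80, 40, 2, &blocks, &recs_pb);
	printf("num_blocks=%llu num_recs_pb=%llu\n",
			(unsigned long long)blocks,
			(unsigned long long)recs_pb);
	assert(blocks == 1 && recs_pb == 79);
	return 0;
}
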
Later, the btree bulk loader will replace the current repair code, but
we still want a backportable solution for stable versions. Hence, keep
the same logic here for now to avoid minrecs underflow in both the
freespace and rmap btrees.

Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
48cf7c
Cc: Dave Chinner <dchinner@redhat.com>
48cf7c
Cc: Eric Sandeen <sandeen@sandeen.net>
48cf7c
Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
48cf7c
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
48cf7c
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
48cf7c
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
48cf7c
---
48cf7c
 repair/phase5.c | 152 ++++++++++++++++++++----------------------------
48cf7c
 1 file changed, 63 insertions(+), 89 deletions(-)
48cf7c
48cf7c
Index: xfsprogs-5.0.0/repair/phase5.c
===================================================================
--- xfsprogs-5.0.0.orig/repair/phase5.c
+++ xfsprogs-5.0.0/repair/phase5.c
@@ -346,11 +346,32 @@ finish_cursor(bt_status_t *curs)
  * failure at runtime. Hence leave a couple of records slack space in
  * each block to allow immediate modification of the tree without
  * requiring splits to be done.
- *
- * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
  */
-#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
-	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
+static void
+compute_level_geometry(
+	struct xfs_mount	*mp,
+	struct bt_stat_level	*lptr,
+	uint64_t		nr_this_level,
+	int			slack,
+	bool			leaf)
+{
+	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
+	unsigned int		desired_npb;
+
+	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
+	lptr->num_recs_tot = nr_this_level;
+	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
+
+	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+	lptr->modulo = nr_this_level % lptr->num_blocks;
+	if (lptr->num_recs_pb > maxrecs ||
+	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
+		lptr->num_blocks++;
+
+		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+		lptr->modulo = nr_this_level % lptr->num_blocks;
+	}
+}
 
 /*
  * this calculates a freespace cursor for an ag.
@@ -368,6 +389,7 @@ calculate_freespace_cursor(xfs_mount_t *
 	int			i;
 	int			extents_used;
 	int			extra_blocks;
+	uint64_t		old_blocks;
 	bt_stat_level_t		*lptr;
 	bt_stat_level_t		*p_lptr;
 	extent_tree_node_t	*ext_ptr;
@@ -386,10 +408,7 @@ calculate_freespace_cursor(xfs_mount_t *
 	 * of the tree and set up the cursor for the leaf level
 	 * (note that the same code is duplicated further down)
 	 */
-	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-	lptr->num_recs_pb = num_extents / lptr->num_blocks;
-	lptr->modulo = num_extents % lptr->num_blocks;
-	lptr->num_recs_tot = num_extents;
+	compute_level_geometry(mp, lptr, num_extents, 2, true);
 	level = 1;
 
 #ifdef XR_BLD_FREE_TRACE
@@ -403,30 +422,23 @@ calculate_freespace_cursor(xfs_mount_t *
 	 * if we need more levels, set them up.  # of records
 	 * per level is the # of blocks in the level below it
 	 */
-	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level - 1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
-			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-			lptr->modulo = p_lptr->num_blocks
-					% lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+	while (lptr->num_blocks > 1) {
+		p_lptr = lptr;
+		lptr = &btree_curs->level[level];
+
+		compute_level_geometry(mp, lptr,
+				p_lptr->num_blocks, 0, false);
 #ifdef XR_BLD_FREE_TRACE
-			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
-					level,
-					lptr->num_blocks,
-					lptr->num_recs_pb,
-					lptr->modulo,
-					lptr->num_recs_tot);
+		fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
+				level,
+				lptr->num_blocks,
+				lptr->num_recs_pb,
+				lptr->modulo,
+				lptr->num_recs_tot);
 #endif
-		}
+		level++;
 	}
-
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
 	ASSERT(lptr->num_blocks == 1);
 	btree_curs->num_levels = level;
 
@@ -494,8 +506,11 @@ calculate_freespace_cursor(xfs_mount_t *
 	 * see if the number of leaf blocks will change as a result
 	 * of the number of extents changing
 	 */
-	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
-			!= btree_curs->level[0].num_blocks)  {
+	old_blocks = btree_curs->level[0].num_blocks;
+	compute_level_geometry(mp, &btree_curs->level[0], num_extents, 2, true);
+	extra_blocks = 0;
+
+	if (old_blocks != btree_curs->level[0].num_blocks)  {
 		/*
 		 * yes -- recalculate the cursor.  If the number of
 		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
@@ -551,31 +566,19 @@ calculate_freespace_cursor(xfs_mount_t *
 		}
 
 		lptr = &btree_curs->level[0];
-		lptr->num_blocks = howmany(num_extents,
-					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-		lptr->num_recs_pb = num_extents / lptr->num_blocks;
-		lptr->modulo = num_extents % lptr->num_blocks;
-		lptr->num_recs_tot = num_extents;
 		level = 1;
 
 		/*
 		 * if we need more levels, set them up
 		 */
-		if (lptr->num_blocks > 1)  {
-			for (level = 1; btree_curs->level[level-1].num_blocks
-					> 1 && level < XFS_BTREE_MAXLEVELS;
-					level++)  {
-				lptr = &btree_curs->level[level];
-				p_lptr = &btree_curs->level[level-1];
-				lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-				lptr->modulo = p_lptr->num_blocks
-						% lptr->num_blocks;
-				lptr->num_recs_pb = p_lptr->num_blocks
-						/ lptr->num_blocks;
-				lptr->num_recs_tot = p_lptr->num_blocks;
-			}
+		while (lptr->num_blocks > 1) {
+			p_lptr = lptr;
+			lptr = &btree_curs->level[level++];
+
+			compute_level_geometry(mp, lptr,
+					p_lptr->num_blocks, 0, false);
 		}
+		ASSERT(level < XFS_BTREE_MAXLEVELS);
 		ASSERT(lptr->num_blocks == 1);
 		btree_curs->num_levels = level;
 
@@ -589,22 +592,6 @@ calculate_freespace_cursor(xfs_mount_t *
 
 		ASSERT(blocks_allocated_total >= blocks_needed);
 		extra_blocks = blocks_allocated_total - blocks_needed;
-	} else  {
-		if (extents_used > 0) {
-			/*
-			 * reset the leaf level geometry to account
-			 * for consumed extents.  we can leave the
-			 * rest of the cursor alone since the number
-			 * of leaf blocks hasn't changed.
-			 */
-			lptr = &btree_curs->level[0];
-
-			lptr->num_recs_pb = num_extents / lptr->num_blocks;
-			lptr->modulo = num_extents % lptr->num_blocks;
-			lptr->num_recs_tot = num_extents;
-		}
-
-		extra_blocks = 0;
 	}
 
 	btree_curs->num_tot_blocks = blocks_allocated_pt;
@@ -1335,7 +1322,6 @@ init_rmapbt_cursor(
 	struct bt_stat_level	*lptr;
 	struct bt_stat_level	*p_lptr;
 	xfs_extlen_t		blocks_allocated;
-	int			maxrecs;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) {
 		memset(btree_curs, 0, sizeof(struct bt_status));
@@ -1371,32 +1357,20 @@ init_rmapbt_cursor(
 	 * Leave enough slack in the rmapbt that we can insert the
 	 * metadata AG entries without too many splits.
 	 */
-	maxrecs = mp->m_rmap_mxr[0];
-	if (num_recs > maxrecs)
-		maxrecs -= 10;
-	blocks_allocated = lptr->num_blocks = howmany(num_recs, maxrecs);
-
-	lptr->modulo = num_recs % lptr->num_blocks;
-	lptr->num_recs_pb = num_recs / lptr->num_blocks;
-	lptr->num_recs_tot = num_recs;
+	compute_level_geometry(mp, lptr, num_recs,
+			num_recs > mp->m_rmap_mxr[0] ? 10 : 0, true);
+	blocks_allocated = lptr->num_blocks;
 	level = 1;
 
-	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level-1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
-			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-				mp->m_rmap_mxr[1]);
-			lptr->modulo = p_lptr->num_blocks % lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
-
-			blocks_allocated += lptr->num_blocks;
-		}
+	while (lptr->num_blocks > 1) {
+		p_lptr = lptr;
+		lptr = &btree_curs->level[level++];
+
+		compute_level_geometry(mp, lptr,
+				p_lptr->num_blocks, 0, false);
+		blocks_allocated += lptr->num_blocks;
 	}
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
 	ASSERT(lptr->num_blocks == 1);
 	btree_curs->num_levels = level;
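
For reference, a standalone sketch (again, not part of the patch) of how
the rewritten while loops stack this geometry level by level: the record
count fed to each upper level is the number of blocks in the level below,
built with no slack, until a level fits in a single root block. All
numbers here (80 records per leaf, 60 per node, 100000 extents) are
made-up illustrative values, not xfsprogs constants.

#include <stdint.h>
#include <stdio.h>

/* simplified per-level geometry, floor-based as in the patch */
static uint64_t
level_blocks(uint64_t nr, unsigned int maxrecs, unsigned int slack)
{
	unsigned int	minrecs = maxrecs / 2;
	unsigned int	desired = maxrecs - slack > minrecs ?
					maxrecs - slack : minrecs;
	uint64_t	blocks = nr / desired ? nr / desired : 1;

	/* would overflow a block: spread the records across one more */
	if (nr / blocks > maxrecs ||
	    (nr / blocks == maxrecs && nr % blocks))
		blocks++;
	return blocks;
}

int
main(void)
{
	uint64_t	nr = 100000;	/* e.g. free space extents in an AG */
	unsigned int	leaf_maxrecs = 80, node_maxrecs = 60;
	unsigned int	level = 0;
	uint64_t	blocks = level_blocks(nr, leaf_maxrecs, 2);

	printf("level 0: %llu blocks\n", (unsigned long long)blocks);
	while (blocks > 1) {
		level++;
		/* records at this level == blocks in the level below */
		blocks = level_blocks(blocks, node_maxrecs, 0);
		printf("level %u: %llu blocks\n", level,
				(unsigned long long)blocks);
	}
	return 0;
}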