xfs
[Top] [All Lists]

[PATCH 5/6] xfs: do not classify freed allocation btree blocks as busy

To: xfs@xxxxxxxxxxx
Subject: [PATCH 5/6] xfs: do not classify freed allocation btree blocks as busy
From: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Date: Fri, 21 Jan 2011 04:22:32 -0500
Cc: Dave Chinner <david@xxxxxxxxxxxxx>
References: <20110121092227.115815324@xxxxxxxxxxxxxxxxxxxxxx>
User-agent: quilt/0.48-1
During an allocation or freeing of an extent, we occasionally have an
interesting situation happen during a btree update.  We may merge a block
as part of a record delete, then allocate it again immediately afterwards
when inserting a modified extent.  xfs_alloc_fixup_trees() does this sort
of manipulation to the btrees, and so can merge then immediately split
the tree, resulting in the same busy block being used from the
freelist.

Previously, this would trigger a synchronous log force of the entire log
(as the current transaction had an LSN of 0 until committed), but continue
to allow the extent to be used because if it is not on disk now then it
must have been freed and marked busy in this transaction.

In the case of allocbt blocks, we log the fact that they get moved to the
freelist and we also log them when they move off the free list back into
the free space trees.  When we move the blocks off the freelist back to
the trees, we mark the extent busy at that point so that it doesn't get
reused until the transaction is committed.

This means that as long as the block is on the AGFL and is only used
for allocbt blocks, we don't have to force the log to ensure prior
manipulating transactions are on disk as the process of log recovery
will replay the transactions in the correct order.

However, we still need to protect against the fact that as we approach
ENOSPC in an AG we can allocate data and metadata blocks direct from the
AGFL. In this case, we need to treat btree blocks just like any other
busy extent. Hence we still need to track btree blocks in the busy extent
tree, but we need to distinguish them from normal extents so we can apply
the necessary exceptions for btree block allocation.

Signed-off-by: Dave Chinner <david@xxxxxxxxxxxxx>
Signed-off-by: Christoph Hellwig <hch@xxxxxx>

Index: xfs/fs/xfs/xfs_ag.h
===================================================================
--- xfs.orig/fs/xfs/xfs_ag.h    2011-01-17 22:03:15.000000000 +0100
+++ xfs/fs/xfs/xfs_ag.h 2011-01-17 22:32:06.541004201 +0100
@@ -188,6 +188,11 @@ struct xfs_busy_extent {
        xfs_agblock_t   bno;
        xfs_extlen_t    length;
        xlog_tid_t      tid;            /* transaction that created this */
+       int             flags;
+};
+
+enum {
+       XFS_BUSY_FREELIST       = 0x0001, /* busy extent on freelist from abt */
 };
 
 /*
Index: xfs/fs/xfs/xfs_alloc.c
===================================================================
--- xfs.orig/fs/xfs/xfs_alloc.c 2011-01-17 22:30:54.000000000 +0100
+++ xfs/fs/xfs/xfs_alloc.c      2011-01-17 22:39:29.021005529 +0100
@@ -46,8 +46,12 @@ STATIC int xfs_alloc_ag_vextent_near(xfs
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
                xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_insert(struct xfs_trans *, xfs_agnumber_t,
+               xfs_agblock_t, xfs_extlen_t, int);
 STATIC int xfs_alloc_busy_search(struct xfs_mount *, xfs_agnumber_t,
                xfs_agblock_t, xfs_extlen_t);
+STATIC void xfs_alloc_busy_force(struct xfs_mount *, xfs_agnumber_t,
+               xfs_agblock_t, xfs_extlen_t, int);
 
 /*
  * Internal functions.
@@ -1691,7 +1695,7 @@ xfs_free_ag_extent(
                 * Mark the extent busy unless it comes from the freelist,
                 * in which case it has already been marked busy.
                 */
-               xfs_alloc_busy_insert(tp, agno, bno, len);
+               xfs_alloc_busy_insert(tp, agno, bno, len, 0);
        }
 
        XFS_STATS_INC(xs_freex);
@@ -1996,22 +2000,23 @@ xfs_alloc_get_freelist(
        xfs_alloc_log_agf(tp, agbp, logflags);
        *bnop = bno;
 
-       /*
-        * As blocks are freed, they are added to the per-ag busy list and
-        * remain there until the freeing transaction is committed to disk.
-        * Now that we have allocated blocks, this list must be searched to see
-        * if a block is being reused.  If one is, then the freeing transaction
-        * must be pushed to disk before this transaction.
-        *
-        * We do this by setting the current transaction to a sync transaction
-        * which guarantees that the freeing transaction is on disk before this
-        * transaction. This is done instead of a synchronous log force here so
-        * that we don't sit and wait with the AGF locked in the transaction
-        * during the log force.
-        */
        if (type != XFS_FREELIST_BALANCE) {
-               if (xfs_alloc_busy_search(mp, agno, bno, 1))
-                       xfs_trans_set_sync(tp);
+               /*
+                * If this block is marked busy we may have to force out the
+                * log to prevent reuse until the freeing transaction has been
+                * committed.
+                *
+                * If we're just allocating the block to rebalance the
+                * freelist size we can skip this exercise as the block
+                * just keeps its busy marking while migrating to the
+                * allocation btree.
+                *
+                * If the block was freed from a btree and gets reallocated
+                * to it we may skip the log force, but details are handled
+                * by xfs_alloc_busy_force.
+                */
+               xfs_alloc_busy_force(mp, agno, bno, 1,
+                                    type == XFS_FREELIST_BTREE);
        }
 
        return 0;
@@ -2081,26 +2086,27 @@ xfs_alloc_put_freelist(
        xfs_agblock_t           bno,    /* block being freed */
        int                     btreeblk) /* block came from a AGF btree */
 {
-       xfs_agf_t               *agf;   /* a.g. freespace structure */
+       xfs_mount_t             *mp = tp->t_mountp;
+       xfs_agf_t               *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agnumber_t          agno = be32_to_cpu(agf->agf_seqno);
        xfs_agfl_t              *agfl;  /* a.g. free block array */
        __be32                  *blockp;/* pointer to array entry */
        int                     error;
        int                     logflags;
-       xfs_mount_t             *mp;    /* mount structure */
        xfs_perag_t             *pag;   /* per allocation group data */
 
-       agf = XFS_BUF_TO_AGF(agbp);
-       mp = tp->t_mountp;
+       if (!agflbp) {
+               error = xfs_alloc_read_agfl(mp, tp, agno, &agflbp);
+               if (error)
+                       return error;
+       }
 
-       if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
-                       be32_to_cpu(agf->agf_seqno), &agflbp)))
-               return error;
        agfl = XFS_BUF_TO_AGFL(agflbp);
        be32_add_cpu(&agf->agf_fllast, 1);
        if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
                agf->agf_fllast = 0;
 
-       pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+       pag = xfs_perag_get(mp, agno);
        be32_add_cpu(&agf->agf_flcount, 1);
        xfs_trans_agflist_delta(tp, 1);
        pag->pagf_flcount++;
@@ -2123,6 +2129,14 @@ xfs_alloc_put_freelist(
                (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
                (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
                        sizeof(xfs_agblock_t) - 1));
+
+       /*
+        * If it's a btree block, mark it busy as it has just been freed.
+        * Otherwise we are just replenishing the free list with extents
+        * that are already free so we don't need to track them.
+        */
+       if (btreeblk)
+               xfs_alloc_busy_insert(tp, agno, bno, 1, XFS_BUSY_FREELIST);
        return 0;
 }
 
@@ -2582,12 +2596,13 @@ error0:
  * logically into the one checkpoint. If the checkpoint sequences are
  * different, however, we still need to wait on a log force.
  */
-void
+STATIC void
 xfs_alloc_busy_insert(
        struct xfs_trans        *tp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           bno,
-       xfs_extlen_t            len)
+       xfs_extlen_t            len,
+       int                     flags)
 {
        struct xfs_busy_extent  *new;
        struct xfs_busy_extent  *busyp;
@@ -2612,6 +2627,7 @@ xfs_alloc_busy_insert(
        new->agno = agno;
        new->bno = bno;
        new->length = len;
+       new->flags = flags;
        new->tid = xfs_log_get_trans_ident(tp);
 
        INIT_LIST_HEAD(&new->list);
@@ -2733,6 +2749,62 @@ xfs_alloc_busy_search(
        return match;
 }
 
+STATIC void
+xfs_alloc_busy_force(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_agblock_t           bno,
+       xfs_extlen_t            len,
+       int                     btreeblk)
+{
+       struct xfs_perag        *pag;
+       struct rb_node          *rbp;
+       struct xfs_busy_extent  *busyp;
+       int                     match = 0;
+
+       pag = xfs_perag_get(mp, agno);
+       spin_lock(&pag->pagb_lock);
+
+       rbp = pag->pagb_tree.rb_node;
+
+       /* find closest start bno overlap */
+       while (rbp) {
+               busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+               if (bno < busyp->bno) {
+                       /* may overlap, but exact start block is lower */
+                       if (bno + len > busyp->bno) {
+                               match = 1;
+                               break;
+                       }
+                       rbp = rbp->rb_left;
+               } else if (bno > busyp->bno) {
+                       /* may overlap, but exact start block is higher */
+                       if (bno < busyp->bno + busyp->length) {
+                               match = 1;
+                               break;
+                       }
+                       rbp = rbp->rb_right;
+               } else {
+                       /* bno matches busyp, length determines exact match */
+                       match = 1;
+                       break;
+               }
+       }
+
+       if (match && btreeblk && (busyp->flags & XFS_BUSY_FREELIST)) {
+               list_del_init(&busyp->list);
+               rb_erase(&busyp->rb_node, &pag->pagb_tree);
+               kmem_free(busyp);
+               match = 0;
+       }
+       spin_unlock(&pag->pagb_lock);
+       trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
+       xfs_perag_put(pag);
+
+       if (match)
+               xfs_log_force(mp, XFS_LOG_SYNC);
+}
+
 /*
  * For a given extent [fbno, flen], search the busy extent list
  * to find a subset of the extent that is not busy.
Index: xfs/fs/xfs/xfs_alloc_btree.c
===================================================================
--- xfs.orig/fs/xfs/xfs_alloc_btree.c   2011-01-17 22:24:35.000000000 +0100
+++ xfs/fs/xfs/xfs_alloc_btree.c        2011-01-17 22:32:06.550048202 +0100
@@ -109,7 +109,6 @@ xfs_allocbt_free_block(
        struct xfs_buf          *bp)
 {
        struct xfs_buf          *agbp = cur->bc_private.a.agbp;
-       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
        xfs_agblock_t           bno;
        int                     error;
 
@@ -118,18 +117,6 @@ xfs_allocbt_free_block(
        if (error)
                return error;
 
-       /*
-        * Since blocks move to the free list without the coordination used in
-        * xfs_bmap_finish, we can't allow block to be available for
-        * reallocation and non-transaction writing (user data) until we know
-        * that the transaction that moved it to the free list is permanently
-        * on disk. We track the blocks by declaring these blocks as "busy";
-        * the busy list is maintained on a per-ag basis and each transaction
-        * records which entries should be removed when the iclog commits to
-        * disk. If a busy block is allocated, the iclog is pushed up to the
-        * LSN that freed the block.
-        */
-       xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
        xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
 }
Index: xfs/fs/xfs/xfs_alloc.h
===================================================================
--- xfs.orig/fs/xfs/xfs_alloc.h 2011-01-17 22:30:06.456005528 +0100
+++ xfs/fs/xfs/xfs_alloc.h      2011-01-17 22:40:14.542034513 +0100
@@ -120,10 +120,6 @@ xfs_alloc_longest_free_extent(struct xfs
 
 #ifdef __KERNEL__
 void
-xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
-       xfs_agblock_t bno, xfs_extlen_t len);
-
-void
 xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
 
 void
@@ -140,7 +136,8 @@ xfs_alloc_compute_maxlevels(
        struct xfs_mount        *mp);   /* file system mount structure */
 
 /*
- * Destination of blocks allocated by xfs_alloc_get_freelist.
+ * Destination of blocks allocated by xfs_alloc_get_freelist  /
+ * source of blocks freed by xfs_alloc_put_freelist.
  */
 #define XFS_FREELIST_ALLOC     0
 #define XFS_FREELIST_BTREE     1

<Prev in Thread] Current Thread [Next in Thread>