xfs
[Top] [All Lists]

Re: [PATCH] Implement shrink of empty AGs

To: xfs@xxxxxxxxxxx
Subject: Re: [PATCH] Implement shrink of empty AGs
From: David Chinner <dgc@xxxxxxx>
Date: Tue, 12 Jun 2007 12:40:25 +1000
Cc: iusty@xxxxxxxxx
In-reply-to: <20070610164014.GA10936@xxxxxxxxxxxxxxxxx>
References: <20070610164014.GA10936@xxxxxxxxxxxxxxxxx>
Sender: xfs-bounce@xxxxxxxxxxx
User-agent: Mutt/1.4.2.1i
On Sun, Jun 10, 2007 at 06:40:14PM +0200, Iustin Pop wrote:
> The attached patch implements shrinking of completely empty allocation
> groups. The patch is against current CVS and modifies two files:
>   - xfs_trans.c to remove two asserts in which prevent lowering the
>     number of AGs or filesystem blocks;
>   - xfs_fsops.c where it does:
>     - modify xfs_growfs_data() to branch to either
>       xfs_growfs_data_private or xfs_shrinkfs_data private depending on
>       the new size of the fs
>     - abstract the last part of xfs_growfs_data_private (the modify of
>       all the superblocks) into a separate function, xfs_update_sb(),
>       which is called both from shrink and grow
>     - add the new xfs_shrinkfs_data_private function, mostly based on
>       the growfs function

comments are all inline....

> 
> There are many printk()'s left in the patch, I left them as they show
> where I compute some important values. There are also many FIXMEs in the
> comments showing what parts I didn't understand or was not sure about
> (not that these are the only ones...). Probably for a real patch,
> xfs-specific debug hooks need to be added and the printk()s removed.
> 
> The patch works on UML and QEMU virtual machines, both in UP and SMP. I
> just tested many shrink/grow operations and verified with xfs_repair
> that the fs is not corrupted. The free space counters seem to be correct
> after shrink.
> 
> Note that you also need to remove the check from xfs_growfs.c of not
> allowing to shrink the filesystem.
> 
> regards,
> iustin

> diff -X ignore -urN linux-2.6-xfs.cvs-orig/fs/xfs/xfs_fsops.c 
> linux-2.6-xfs.shrink/fs/xfs/xfs_fsops.c
> --- linux-2.6-xfs.cvs-orig/fs/xfs/xfs_fsops.c 2007-06-09 18:56:21.509308225 
> +0200
> +++ linux-2.6-xfs.shrink/fs/xfs/xfs_fsops.c   2007-06-10 18:32:36.074856477 
> +0200
> @@ -112,6 +112,53 @@
>       return 0;
>  }
>  
> +static void xfs_update_sb(

STATIC void
xfs_growfs_update_sb(

> +     xfs_mount_t             *mp,            /* mount point for filesystem */
> +     xfs_agnumber_t          nagimax,
> +     xfs_agnumber_t          nagcount)       /* new number of a.g. */

tabs, not spaces (and tabs are 8 spaces).

> +{
> +     xfs_agnumber_t          agno;
> +     xfs_buf_t               *bp;
> +     xfs_sb_t                *sbp;
> +     int                     error;
> +
> +     /* New allocation groups fully initialized, so update mount struct */
> +     if (nagimax)
> +             mp->m_maxagi = nagimax;
> +     if (mp->m_sb.sb_imax_pct) {
> +             __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;

I'd prefer to have long lines like this split.

> +             do_div(icount, 100);
> +             mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
> +     } else
> +             mp->m_maxicount = 0;

Insert empty line.

> +     for (agno = 1; agno < nagcount; agno++) {
> +             error = xfs_read_buf(mp, mp->m_ddev_targp,
> +                               XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
> +                               XFS_FSS_TO_BB(mp, 1), 0, &bp);
> +             if (error) {
> +                     xfs_fs_cmn_err(CE_WARN, mp,
> +                     "error %d reading secondary superblock for ag %d",
> +                             error, agno);
> +                     break;
> +             }
> +             sbp = XFS_BUF_TO_SBP(bp);
> +             xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);

Insert empty line

> +             /*
> +              * If we get an error writing out the alternate superblocks,
> +              * just issue a warning and continue.  The real work is
> +              * already done and committed.
> +              */
> +             if (!(error = xfs_bwrite(mp, bp))) {
> +                     continue;
> +             } else {
> +                     xfs_fs_cmn_err(CE_WARN, mp,
> +             "write error %d updating secondary superblock for ag %d",
> +                             error, agno);
> +                     break; /* no point in continuing */
> +             }
> +     }

                error = xfs_bwrite(mp, bp);
                if (error) {
                        xfs_fs_cmn_err(...)
                        break;
                }
        }
> +}
> +
>  static int
>  xfs_growfs_data_private(
>       xfs_mount_t             *mp,            /* mount point for filesystem */
> @@ -135,7 +182,6 @@
>       xfs_rfsblock_t          nfree;
>       xfs_agnumber_t          oagcount;
>       int                     pct;
> -     xfs_sb_t                *sbp;
>       xfs_trans_t             *tp;
>  
>       nb = in->newblocks;
> @@ -356,44 +402,228 @@
>       if (error) {
>               return error;
>       }
> -     /* New allocation groups fully initialized, so update mount struct */
> -     if (nagimax)
> -             mp->m_maxagi = nagimax;
> -     if (mp->m_sb.sb_imax_pct) {
> -             __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
> -             do_div(icount, 100);
> -             mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
> -     } else
> -             mp->m_maxicount = 0;
> -     for (agno = 1; agno < nagcount; agno++) {
> -             error = xfs_read_buf(mp, mp->m_ddev_targp,
> -                               XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
> -                               XFS_FSS_TO_BB(mp, 1), 0, &bp);
> +     xfs_update_sb(mp, nagimax, nagcount);
> +     return 0;
> +
> + error0:
> +     xfs_trans_cancel(tp, XFS_TRANS_ABORT);
> +     return error;
> +}
> +
> +static int

STATIC int

> +xfs_shrinkfs_data_private(
> +                       xfs_mount_t           *mp,            /* mount point 
> for filesystem */
> +                       xfs_growfs_data_t     *in)            /* growfs data 
> input struct */

whitespace issues

> +{
> +     xfs_agf_t               *agf;
> +     xfs_agnumber_t          agno;
> +     xfs_buf_t               *bp;
> +     int                     dpct;
> +     int                     error;
> +     xfs_agnumber_t          nagcount;       /* new AG count */
> +     xfs_agnumber_t          oagcount;       /* old AG count */
> +     xfs_agnumber_t          nagimax = 0;
> +     xfs_rfsblock_t          nb, nb_mod;
> +     xfs_rfsblock_t          dbdelta;        /* will be used as a
> +                                                check that we
> +                                                shrink the fs by
> +                                                the correct number
> +                                                of blocks */
> +     xfs_rfsblock_t          fdbdelta;       /* will keep track of
> +                                                how many ag blocks
> +                                                we need to
> +                                                remove */

Long comments like this don't go on the declaration. Put them where
the variable is initialised or first used.

> +     int                     pct;
> +     xfs_trans_t             *tp;
> +
> +     nb = in->newblocks;
> +     pct = in->imaxpct;
> +     if (nb >= mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
> +             return XFS_ERROR(EINVAL);
> +     dpct = pct - mp->m_sb.sb_imax_pct;

This next bit:

> +     error = xfs_read_buf(mp, mp->m_ddev_targp,
> +                          XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
> +                          XFS_FSS_TO_BB(mp, 1), 0, &bp);
> +     if (error)
> +             return error;
> +     ASSERT(bp);
> +     /* FIXME: we release the buffer here manually because we are
> +      * outside of a transaction? The other buffers read using the
> +      * functions which take a tp parameter are not released in
> +      * growfs
> +      */
> +     xfs_buf_relse(bp);

Should not be necessary - we don't need to check if the new last
filesystem block is beyond EOF because we are shrinking....

To answer the FIXME - xfs_trans_commit() releases locked buffers and
inodes that have been joined ot the transaction unless they have
also been held. So if you are outside a transaction, you do have to
ensure you release any buffers you read.

> +     /* Do basic checks (at the fs level) */
> +     oagcount = mp->m_sb.sb_agcount;
> +     nagcount = nb;
> +     nb_mod = do_div(nagcount, mp->m_sb.sb_agblocks);
> +     if(nb_mod) {
> +             printk("not shrinking on an AG boundary (diff=%d)\n", nb_mod);
> +             return XFS_ERROR(ENOSPC);

EINVAL, I think.

> +     }
> +     if(nagcount < 2) {
> +             printk("refusing to shrink below 2 AGs\n");
> +             return XFS_ERROR(ENOSPC);

EINVAL.

> +     }
> +     if(nagcount >= oagcount) {
> +             printk("number of AGs will not decrease\n");
> +             return XFS_ERROR(EINVAL);
> +     }
> +     printk("Cur ag=%d, cur blocks=%llu\n",
> +            mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
> +     printk("New ag=%d, new blocks=%d\n", nagcount, nb);
> +
> +     printk("Will resize from %llu to %d, delta is %llu\n",
> +               mp->m_sb.sb_dblocks, nb, mp->m_sb.sb_dblocks - nb);
> +     /* Check to see if we trip over the log section */
> +     printk("logstart=%llu logblocks=%u\n",
> +               mp->m_sb.sb_logstart, mp->m_sb.sb_logblocks);
> +     if (nb < mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks)
> +             return XFS_ERROR(EINVAL);

Insert empty line

> +     /* dbdelta starts at the diff and must become zero */
> +     dbdelta = mp->m_sb.sb_dblocks - nb;
> +     tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
> +     printk("reserving %d\n", XFS_GROWFS_SPACE_RES(mp) + dbdelta);
> +     if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp) + dbdelta,
> +                                    XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
> +             xfs_trans_cancel(tp, 0);
> +             return error;
> +     }

What's the dbdelta part of the reservation for? That's reserving dbdelta
blocks for *allocations*, so I don't think this is right....

> +
> +     fdbdelta = 0;
> +
> +     /* Per-AG checks */
> +     /* FIXME: do we need to hold m_peraglock while doing this? */

Yes.

> +     /* I think that since we do read and write to the m_perag
> +      * stuff, we should be holding the lock for the entire walk &
> +      * modify of the fs
> +      */

Deadlock warning! holding the m_peraglock in write mode will cause
allocation deadlocks if you are not careful as all allocation/free
operations take the m_peraglock in read mode. (And yes, growing
an active, loaded filesystem can deadlock because of this.)

> +     /* Note that because we hold the lock, on any error+early
> +      *  return, we must either release manually and return, or
> +      *  jump to error0
> +      */

whitespace.

> +     down_write(&mp->m_peraglock);
> +     for(agno = oagcount - 1; agno >= nagcount; agno--) {
> +             xfs_extlen_t    usedblks; /* total used blocks in this a.g. */
> +             xfs_extlen_t    freeblks; /* free blocks in this a.g. */
> +             xfs_agblock_t   aglen;    /* this ag's len */
> +             struct xfs_perag *pag;  /* the m_perag structure */
> +
> +             printk("doing agno=%d\n", agno);
> +
> +             pag = &mp->m_perag[agno];
> +
> +             error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
>               if (error) {
> -                     xfs_fs_cmn_err(CE_WARN, mp,
> -                     "error %d reading secondary superblock for ag %d",
> -                             error, agno);
> -                     break;
> +                     goto error0;
>               }
> -             sbp = XFS_BUF_TO_SBP(bp);
> -             xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
> +             ASSERT(bp);
> +             agf = XFS_BUF_TO_AGF(bp);
> +             aglen = INT_GET(agf->agf_length, ARCH_CONVERT);
> +
> +             /* read the pagf/pagi if not already initialized */
> +             /* agf should be initialized because of the ablove read_agf */
> +             ASSERT(pag->pagf_init);
> +             if (!pag->pagi_init) {
> +                     if ((error = xfs_ialloc_read_agi(mp, tp, agno, &bp)))
> +                             goto error0;

I don't think you should be overwriting bp here....

> +                     ASSERT(pag->pagi_init);
> +             }
> +

Because now you have bp potentially pointing to two different buffers.

>               /*
> -              * If we get an error writing out the alternate superblocks,
> -              * just issue a warning and continue.  The real work is
> -              * already done and committed.
> +              * Check the inodes: as long as we have pagi_count ==
> +              * pagi_freecount == 0, then: a) we don't have to
> +              * update any global inode counters, and b) there are
> +              * no extra blocks in inode btrees
>                */
> -             if (!(error = xfs_bwrite(mp, bp))) {
> -                     continue;
> -             } else {
> -                     xfs_fs_cmn_err(CE_WARN, mp,
> -             "write error %d updating secondary superblock for ag %d",
> -                             error, agno);
> -                     break; /* no point in continuing */
> +             if(pag->pagi_count > 0 ||
> +                pag->pagi_freecount > 0) {
> +                     printk("agi %d has %d inodes in total and %d free\n",
> +                            agno, pag->pagi_count, pag->pagi_freecount);
> +                     error = XFS_ERROR(ENOSPC);
> +                     goto error0;
> +             }
> +
> +             /* Check the AGF: if levels[] == 1, then there should
> +              *  be no extra blocks in the btrees beyond the ones
> +              *  at the beggining of the AG
> +              */
> +             if(pag->pagf_levels[XFS_BTNUM_BNOi] > 1 ||
> +                pag->pagf_levels[XFS_BTNUM_CNTi] > 1) {
> +                     printk("agf %d has level %d bt and %d cnt\n",
> +                            agno,
> +                            pag->pagf_levels[XFS_BTNUM_BNOi],
> +                            pag->pagf_levels[XFS_BTNUM_CNTi]);
> +                     error = XFS_ERROR(ENOSPC);
> +                     goto error0;
>               }

ok, so we have empty ag's here. You might want to check that the
inode btree is empty and that the AGI unlinked list is empty.

> +
> +             freeblks = pag->pagf_freeblks;
> +             printk("Usage: %d prealloc, %d flcount\n",
> +                    XFS_PREALLOC_BLOCKS(mp), pag->pagf_flcount);
> +
> +             /* Done gathering data, check sizes */
> +             usedblks = XFS_PREALLOC_BLOCKS(mp) + pag->pagf_flcount;
> +             printk("agno=%d agf_length=%d computed used=%d"
> +                    " known free=%d\n", agno, aglen, usedblks, freeblks);
> +
> +             if(usedblks + freeblks != aglen) {
> +                     printk("agno %d is not free (%d blocks allocated)\n",
> +                            agno, aglen-usedblks-freeblks);
> +                     error = XFS_ERROR(ENOSPC);
> +                     goto error0;
> +             }
> +             dbdelta -= aglen;
> +             printk("will lower with %d\n",
> +                    aglen - XFS_PREALLOC_BLOCKS(mp));
> +             fdbdelta += aglen - XFS_PREALLOC_BLOCKS(mp);

Ok, so why not just

        fdbdelta += mp->m_sb.sb_agblocks - XFS_PREALLOC_BLOCKS(mp);

> +     }
> +     /*
> +      * Check that we removed all blocks
> +      */
> +     ASSERT(!dbdelta);
> +     ASSERT(nagcount < oagcount);

Error out, not assert, because at this point we have not changed anything.

> +
> +     printk("to free: %d, oagcount=%d, nagcount=%d\n",
> +            fdbdelta, oagcount, nagcount);
> +
> +     xfs_trans_agblocks_delta(tp, -((long)fdbdelta));
> +     xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
> +     xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, nb - mp->m_sb.sb_dblocks);
> +     xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -((int64_t)fdbdelta));
> +
> +     if (dpct)
> +             xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
> +     error = xfs_trans_commit(tp, 0);
> +     if (error) {
> +             up_write(&mp->m_peraglock);
> +             return error;
>       }
> +     /* Free memory as the number of AG has changed */
> +     for (agno = nagcount; agno < oagcount; agno++)
> +             if (mp->m_perag[agno].pagb_list)
> +                     kmem_free(mp->m_perag[agno].pagb_list,
> +                               sizeof(xfs_perag_busy_t) *
> +                               XFS_PAGB_NUM_SLOTS);
> +
> +     mp->m_perag = kmem_realloc(mp->m_perag,
> +                                sizeof(xfs_perag_t) * nagcount,
> +                                sizeof(xfs_perag_t) * oagcount,
> +                                KM_SLEEP);

This is not really safe - how do we know if all the users of the
higher AGs have gone away? I think we should simply just zero out
the unused AGs and don't worry about a realloc().

> +     /* FIXME: here we could instead just lower
> +      * nagimax to nagcount; is it better this way?
> +      */

Not really.

> +     /* FIXME: why is this flag unconditionally set in growfs? */
> +     mp->m_flags |= XFS_MOUNT_32BITINODES;

good question. I don't think it should be there but I'll have to
do some digging....

> +     nagimax = xfs_initialize_perag(XFS_MTOVFS(mp), mp, nagcount);
> +     up_write(&mp->m_peraglock);
> +
> +     xfs_update_sb(mp, nagimax, nagcount);
>       return 0;
>  
>   error0:
> +     up_write(&mp->m_peraglock);
>       xfs_trans_cancel(tp, XFS_TRANS_ABORT);
>       return error;
>  }
> @@ -435,7 +665,10 @@
>       int error;
>       if (!cpsema(&mp->m_growlock))
>               return XFS_ERROR(EWOULDBLOCK);
> -     error = xfs_growfs_data_private(mp, in);
> +     if(in->newblocks < mp->m_sb.sb_dblocks)
> +             error = xfs_shrinkfs_data_private(mp, in);
> +     else
> +             error = xfs_growfs_data_private(mp, in);

Hmmm - that's using the one ioctl to do both grow and shrink. I'd
prefer a new shrink ioctl rather than changing the behaviour of an
existing ioctl.

Looks like a good start ;)

Cheers,

Dave.
-- 
Dave Chinner
Principal Engineer
SGI Australian Software Group


<Prev in Thread] Current Thread [Next in Thread>