On Wed, Jul 01, 2015 at 10:13:06AM +1000, Dave Chinner wrote:
> On Thu, Jun 25, 2015 at 04:39:16PM -0700, Darrick J. Wong wrote:
> > Create a per-AG btree to track the reference counts of physical blocks
> > to support reflink.
>
> Few things from a quick glance...
>
> > --- a/fs/xfs/libxfs/xfs_alloc.c
> > +++ b/fs/xfs/libxfs/xfs_alloc.c
> > @@ -54,6 +54,8 @@ xfs_extlen_t
> > xfs_prealloc_blocks(
> > struct xfs_mount *mp)
> > {
> > + if (xfs_sb_version_hasreflink(&mp->m_sb))
> > + return XFS_RL_BLOCK(mp) + 1;
>
> Should introduce the sb version stuff as a separate patch perhaps
> with the basic infrastructure defines (see how I did the first rmap
> btree patch).
Ok.
> > @@ -1117,6 +1118,9 @@ xfs_btree_set_refs(
> > case XFS_BTNUM_RMAP:
> > xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
> > break;
> > + case XFS_BTNUM_RL:
>
> Probably better to call it XFS_BTNUM_REFLINK
I was thinking about renaming the whole thing to 'refcount', i.e.
XFS_BTNUM_REFCOUNT since it /is/ a btree of reference counts.
> > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
> > index 48ab2b1..a3f8661 100644
> > --- a/fs/xfs/libxfs/xfs_btree.h
> > +++ b/fs/xfs/libxfs/xfs_btree.h
> > @@ -43,6 +43,7 @@ union xfs_btree_key {
> > xfs_alloc_key_t alloc;
> > struct xfs_inobt_key inobt;
> > struct xfs_rmap_key rmap;
> > + xfs_reflink_key_t reflink;
>
> No new typedefs. struct xfs_reflink_key...
>
> (only say this once, but applies many times ;)
Yeah, sorry about that.
> > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
> > index 9cff517..e4954ab 100644
> > --- a/fs/xfs/libxfs/xfs_format.h
> > +++ b/fs/xfs/libxfs/xfs_format.h
> > @@ -446,9 +446,11 @@ xfs_sb_has_compat_feature(
> >
> > #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode
> > btree */
> > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map
> > btree */
> > +#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflink
> > btree */
> > #define XFS_SB_FEAT_RO_COMPAT_ALL \
> > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \
> > - XFS_SB_FEAT_RO_COMPAT_RMAPBT)
> > + XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
> > + XFS_SB_FEAT_RO_COMPAT_REFLINK)
>
> The XFS_SB_FEAT_RO_COMPAT_REFLINK flag shoul dbe added as a separate
> patch and put last in the series so it is only enabled once
> everything is complete.
What if I define XFS_SB_FEAT_RO_COMPAT_REFLINK at the beginning but omit it
from the XFS_SB_FEAT_RO_COMPAT_ALL definition until the final patch? That
should prohibit anyone from using the half-baked feature during a bisect.
> > #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
> > static inline bool
> > xfs_sb_has_ro_compat_feature(
> > @@ -522,6 +524,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct
> > xfs_sb *sbp)
> > (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
> > }
> >
> > +static inline int xfs_sb_version_hasreflink(xfs_sb_t *sbp)
>
> bool.
>
> > @@ -1338,6 +1349,50 @@ typedef __be32 xfs_rmap_ptr_t;
> > XFS_IBT_BLOCK(mp) + 1)
> >
> > /*
> > + * reflink Btree format definitions
> > + *
> > + */
> > +#define XFS_RLBT_CRC_MAGIC 0x524C4233 /* 'RLB3' */
>
> #define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */
>
> Only one bit difference in the magic numbers, which means they are
> too similar. "RFL3" might be better or maybe "R3FL"...
"RFC3" ?
> > +/*
> > + * Data record/key structure
> > + */
> > +typedef struct xfs_reflink_rec {
> > + __be32 rr_startblock; /* starting block number */
> > + __be32 rr_blockcount; /* count of blocks */
> > + __be32 rr_nlinks; /* number of inodes linked here */
> > +} xfs_reflink_rec_t;
> > +
> > +typedef struct xfs_reflink_key {
> > + __be32 rr_startblock; /* starting block number */
> > +} xfs_reflink_key_t;
> > +
> > +typedef struct xfs_reflink_rec_incore {
> > + xfs_agblock_t rr_startblock; /* starting block number */
> > + xfs_extlen_t rr_blockcount; /* count of free blocks */
> > + xfs_nlink_t rr_nlinks; /* number of inodes linked here */
> > +} xfs_reflink_rec_incore_t;
>
> We have being using "irec" as shorthand for "in-core record". i.e:
> struct xfs_reflink_irec.
Noted.
> (kill typedefs)
>
> > +
> > +/*
> > + * When a block hits MAXRLCOUNT references, it becomes permanently
> > + * stuck in CoW mode, because who knows how many times it's really
> > + * referenced.
> > + */
> > +#define MAXRLCOUNT ((xfs_nlink_t)~0U)
> > +#define MAXRLEXTLEN ((xfs_extlen_t)~0U)
>
> I'd suggest that if we hit the maximum count, we just abort the
> reflink operation.
<nod>
> > +/* btree pointer type */
> > +typedef __be32 xfs_reflink_ptr_t;
> > +
> > +#define XFS_RL_BLOCK(mp) \
> > + (xfs_sb_version_hasrmapbt(&((mp)->m_sb)) ? \
> > + XFS_RMAP_BLOCK(mp) + 1 : \
> > + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
> > + XFS_FIBT_BLOCK(mp) + 1 : \
> > + XFS_IBT_BLOCK(mp) + 1))
>
> That's getting unwieldy. It's large enough for a function....
Ok.
> > +#ifdef REFLINK_DEBUG
> > +# define dbg_printk(f, a...) do {printk(KERN_ERR f, ## a); } while (0)
> > +#else
> > +# define dbg_printk(f, a...)
> > +#endif
>
> xfs_debug() is your friend.
>
> > +#define CHECK_AG_NUMBER(mp, agno) \
> > + do { \
> > + ASSERT((agno) != NULLAGNUMBER); \
> > + ASSERT((agno) < (mp)->m_sb.sb_agcount); \
> > + } while(0);
>
> Ugh. Used once, just open code.
>
> > +#define CHECK_AG_EXTENT(mp, agbno, len) \
> > + do { \
> > + ASSERT((agbno) != NULLAGBLOCK); \
> > + ASSERT((len) > 0); \
> > + ASSERT((unsigned long long)(agbno) + (len) <= \
> > + (mp)->m_sb.sb_agblocks); \
> > + } while(0);
>
> These are really used in places where corruption checks are
> warranted, or the extent has already been checked....
>
> > +#define XFS_WANT_CORRUPTED_RLEXT_GOTO(mp, have, agbno, len, nr, label) \
> > + do { \
> > + XFS_WANT_CORRUPTED_GOTO((mp), (have) == 1, label); \
> > + XFS_WANT_CORRUPTED_GOTO((mp), (len) > 0, label); \
> > + XFS_WANT_CORRUPTED_GOTO((mp), (nr) >= 2, label); \
> > + XFS_WANT_CORRUPTED_GOTO((mp), (unsigned long long)(agbno) + \
> > + (len) <= (mp)->m_sb.sb_agblocks, label); \
> > + } while(0);
>
> Unused.
>
> > +
> > +STATIC int
> > +xfs_reflinkbt_alloc_block(
> > + struct xfs_btree_cur *cur,
> > + union xfs_btree_ptr *start,
> > + union xfs_btree_ptr *new,
> > + int *stat)
> > +{
> > + int error;
> > + xfs_agblock_t bno;
> > +
> > + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
> > +
> > + /* Allocate the new block from the freelist. If we can't, give up. */
> > + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
> > + &bno, 1);
> > + if (error) {
> > + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
> > + return error;
> > + }
>
> Why does the reflink btree use the free list? Why can't it use
> block allocation like the BMBT tree?
I'm confused about the intended usage of the AGFL -- the XFS FS structure doc
says that it's for growing the free space btrees and can't be used for anything
else, but the rmap btree uses it.
Originally it /did/ use xfs_alloc_vextent(), though it won't be difficult to
revert.
>
> > +/*
> > + * Allocate a new reflink btree cursor.
> > + */
> > +struct xfs_btree_cur * /* new reflink btree cursor */
> > +xfs_reflinkbt_init_cursor(
> > + struct xfs_mount *mp, /* file system mount point */
> > + struct xfs_trans *tp, /* transaction pointer */
> > + struct xfs_buf *agbp, /* buffer for agf structure */
> > + xfs_agnumber_t agno) /* allocation group number */
>
> No real need for these comments on the variables. They are redundant
> as the code documents what they are just fine.
I was playing monkey-see monkey-do. Some of the other functions had
commented args. :)
>
> > +{
> > + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
> > + struct xfs_btree_cur *cur;
> > +
> > + CHECK_AG_NUMBER(mp, agno);
> > + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
> > +
> > + cur->bc_tp = tp;
> > + cur->bc_mp = mp;
> > + cur->bc_btnum = XFS_BTNUM_RL;
> > + cur->bc_blocklog = mp->m_sb.sb_blocklog;
> > + cur->bc_ops = &xfs_reflinkbt_ops;
> > +
> > + cur->bc_nlevels = be32_to_cpu(agf->agf_reflink_level);
> > +
> > + cur->bc_private.a.agbp = agbp;
> > + cur->bc_private.a.agno = agno;
> > +
> > + if (xfs_sb_version_hascrc(&mp->m_sb))
> > + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
>
> Can be set unconditionally.
>
> The next set of functions normally go into a different file. i.e the
> "xfs_foo_btree.c" file contains the functions required by the
> generic btree abstraction to implement the "foo" btree format. The
> file "xfs_foo.c" then contains the code/logic that provides the
> external foo API, manages the information inthe foo btree, and calls
> the generic btree functions to manage the btree. This logic isn't
> present in the patch, so really it shoul dbe added by the patch that
> starts implementing the reflink API....
Ok, I'll split this stuff out into smaller files.
> > +/*
> > + * Get the data from the pointed-to record.
> > + */
> > +int /* error */
> > +xfs_reflink_get_rec(
> > + struct xfs_btree_cur *cur, /* btree cursor */
> > + xfs_agblock_t *bno, /* output: starting block of extent */
> > + xfs_extlen_t *len, /* output: length of extent */
> > + xfs_nlink_t *nlink, /* output: number of links */
> > + int *stat) /* output: success/failure */
> > +{
> > + union xfs_btree_rec *rec;
> > + int error;
> > +
> > + error = xfs_btree_get_rec(cur, &rec, stat);
> > + if (!error && *stat == 1) {
> > + CHECK_AG_EXTENT(cur->bc_mp,
> > + be32_to_cpu(rec->reflink.rr_startblock),
> > + be32_to_cpu(rec->reflink.rr_blockcount));
> > + *bno = be32_to_cpu(rec->reflink.rr_startblock);
> > + *len = be32_to_cpu(rec->reflink.rr_blockcount);
> > + *nlink = be32_to_cpu(rec->reflink.rr_nlinks);
> > + }
> > + return error;
>
> if (error || !*stat)
> return error;
> .....
> return 0;
>
> > + error = xfs_reflink_get_rec(cur, &bno, &len, &nr, &x);
> > + if (error)
> > + return error;
> > + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, x == 1, error0);
> > + error = xfs_btree_delete(cur, i);
> > + if (error)
> > + return error;
> > + error = xfs_reflink_lookup_ge(cur, bno, &x);
> > +error0:
>
> New code should use sane jump labels. e.g. "out_error" is a pretty
> standard jump label name for this...
>
> > diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
> > index 88efbb4..d1de74e 100644
> > --- a/fs/xfs/libxfs/xfs_shared.h
> > +++ b/fs/xfs/libxfs/xfs_shared.h
> > @@ -216,6 +216,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
> > #define XFS_INO_REF 2
> > #define XFS_ATTR_BTREE_REF 1
> > #define XFS_DQUOT_REF 1
> > +#define XFS_REFLINK_BTREE_REF 1
>
> whitespace.
>
> > @@ -315,6 +317,9 @@ typedef struct xfs_perag {
> > /* for rcu-safe freeing */
> > struct rcu_head rcu_head;
> > int pagb_count; /* pagb slots in use */
> > +
> > + /* reflink */
> > + __uint8_t pagf_reflink_level;
>
> May as well just make it the same as what is on disk (i.e.
> uint32_t).
>
> > +++ b/fs/xfs/xfs_stats.c
> > @@ -61,6 +61,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
> > { "ibt2", XFSSTAT_END_IBT_V2 },
> > { "fibt2", XFSSTAT_END_FIBT_V2 },
> > { "rmapbt", XFSSTAT_END_RMAP_V2 },
> > + { "rlbt2", XFSSTAT_END_RLBT_V2 },
>
> "reflinkbt". No need for the "2", as there is only one set of
> reflink btree stats.
Ok, thanks for the review!
--D
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@xxxxxxxxxxxxx
>
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs
|