[PATCH v3 10/18] xfs: allocate sparse inode chunks on full chunk allocation failure
Brian Foster
bfoster at redhat.com
Fri Feb 6 13:52:57 CST 2015
xfs_ialloc_ag_alloc() makes several attempts to allocate a full inode
chunk. If all else fails, reduce the allocation to the minimum sparse
granularity and attempt to allocate a sparse inode chunk.
If sparse chunk allocation succeeds, check whether an inobt record
already exists that can track the chunk. If so, inherit and update the
existing record. Otherwise, insert a new record for the sparse chunk.
Update xfs_inobt_insert_rec() to take the holemask and inode count as
parameters and set the associated fields on disk. Create the
xfs_inobt_update_insert() helper to handle the sparse chunk allocation
case: update the existing record if one is found, otherwise insert a
new record.
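To illustrate the encoding this relies on (a standalone sketch, not part of
the patch): an XFS inode chunk holds 64 inodes and the on-disk holemask is
16 bits, so each holemask bit covers 4 inodes and a set bit marks a hole.
A sparse allocation of 32 inodes at the low end of a chunk works out as:

  /* Standalone sketch of the sparse holemask encoding (illustrative). */
  #include <stdint.h>
  #include <stdio.h>

  #define INODES_PER_CHUNK        64
  #define INODES_PER_HOLEMASK_BIT (INODES_PER_CHUNK / 16)  /* 4 */

  int main(void)
  {
          unsigned int newlen = 32;       /* inodes actually allocated */
          /* low-order bits cover the allocated inodes... */
          uint16_t allocmask = (1 << (newlen / INODES_PER_HOLEMASK_BIT)) - 1;
          /* ...and the holemask is its inverse: set bits are holes */
          uint16_t holemask = ~allocmask;

          printf("allocmask 0x%04x holemask 0x%04x\n", allocmask, holemask);
          /* prints: allocmask 0x00ff holemask 0xff00 */
          return 0;
  }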
Signed-off-by: Brian Foster <bfoster at redhat.com>
---
fs/xfs/libxfs/xfs_ialloc.c | 397 +++++++++++++++++++++++++++++++++++++++++++--
fs/xfs/xfs_trace.h | 47 ++++++
2 files changed, 426 insertions(+), 18 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index fc001d9..090d114 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -122,12 +122,16 @@ xfs_inobt_get_rec(
STATIC int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
+ __uint16_t holemask,
+ __uint8_t count,
__int32_t freecount,
xfs_inofree_t free,
int *stat)
{
- cur->bc_rec.i.ir_holemask = 0;
- cur->bc_rec.i.ir_count = 0; /* zero for backwards compatibility */
+ ASSERT(count == 0 || xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb));
+
+ cur->bc_rec.i.ir_holemask = holemask;
+ cur->bc_rec.i.ir_count = count;
cur->bc_rec.i.ir_freecount = freecount;
cur->bc_rec.i.ir_free = free;
return xfs_btree_insert(cur, stat);
@@ -151,6 +155,19 @@ xfs_inobt_insert(
xfs_agino_t thisino;
int i;
int error;
+ uint8_t count;
+
+ /*
+ * Only set ir_count in the inobt record if the sparse inodes feature is
+ * enabled. If disabled, we must maintain backwards compatibility with
+ * the older inobt record format where the current count and holemask
+ * fields map to the higher order bytes of freecount and thus must be
+ * zeroed.
+ */
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ count = XFS_INODES_PER_CHUNK;
+ else
+ count = 0;
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
@@ -164,7 +181,7 @@ xfs_inobt_insert(
}
ASSERT(i == 0);
- error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ error = xfs_inobt_insert_rec(cur, 0, count, XFS_INODES_PER_CHUNK,
XFS_INOBT_ALL_FREE, &i);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -174,8 +191,58 @@ xfs_inobt_insert(
}
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+}
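For context on the backwards-compatibility note above: in the sparse format
the holemask and count occupy what were the high-order bytes of the old
32-bit freecount. A simplified, illustrative sketch of the on-disk record
(all fields are big-endian on disk):

  #include <stdint.h>

  /*
   * Simplified sketch of the on-disk inobt record. Keeping ir_holemask
   * and ir_count zero on non-sparse filesystems leaves the union
   * byte-identical to the old 32-bit freecount encoding.
   */
  struct inobt_rec_sketch {
          uint32_t        ir_startino;
          union {
                  uint32_t        ir_freecount;   /* old format */
                  struct {
                          uint16_t        ir_holemask;    /* sparse format */
                          uint8_t         ir_count;
                          uint8_t         ir_freecount;
                  } sp;
          } ir_u;
          uint64_t        ir_free;
  };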
+/*
+ * Update or insert a new record based on a sparse inode chunk allocation.
+ *
+ * If a record already exists, the new record is an updated version of that
+ * record based on a merge of sparse inode chunks. Update the record in place.
+ * Otherwise, insert a new record in the tree. Note that the record to insert
+ * must already have been aligned and merged, if necessary.
+ */
+STATIC int
+xfs_inobt_update_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ struct xfs_inobt_rec_incore *rec,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ error = xfs_inobt_lookup(cur, rec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 1) {
+ /* found a record, update it with the merged record */
+ error = xfs_inobt_update(cur, rec);
+ if (error)
+ goto error;
+ goto out;
+ }
+
+ /* no existing record, insert a new one */
+ error = xfs_inobt_insert_rec(cur, rec->ir_holemask, rec->ir_count,
+ rec->ir_freecount, rec->ir_free, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
}
/*
@@ -215,8 +282,36 @@ xfs_check_agi_freecount(
}
return 0;
}
+
+/*
+ * Verify that an inode record has a valid inode count. With sparse inode chunk
+ * support enabled, the count must be consistent with the holemask. Otherwise,
+ * the count must be zero.
+ */
+STATIC int
+xfs_inobt_rec_check_count(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int inocount;
+ DECLARE_BITMAP(allocbmap, XFS_INODES_PER_CHUNK);
+
+ if (!xfs_sb_version_hassparseinodes(&mp->m_sb)) {
+ if (rec->ir_count)
+ return -EFSCORRUPTED;
+ return 0;
+ }
+
+ xfs_inobt_ialloc_bitmap(allocbmap, rec);
+ inocount = bitmap_weight(allocbmap, XFS_INODES_PER_CHUNK);
+ if (inocount != rec->ir_count)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
#else
#define xfs_check_agi_freecount(cur, agi) 0
+#define xfs_inobt_rec_check_count(mp, rec) 0
#endif
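A hedged illustration of the consistency rule enforced above: the inode
count implied by the holemask must match ir_count. Assuming 4 inodes per
holemask bit, as in the earlier sketch:

  #include <stdint.h>
  #include <stdio.h>

  /* inodes implied by a 16-bit holemask; each clear bit covers 4 inodes */
  static int holemask_inode_count(uint16_t holemask)
  {
          return (16 - __builtin_popcount(holemask)) * 4;
  }

  int main(void)
  {
          /* holemask 0xff00: the upper half is holes, so ir_count must be 32 */
          printf("%d\n", holemask_inode_count(0xff00));   /* prints 32 */
          return 0;
  }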
/*
@@ -358,6 +453,183 @@ xfs_ialloc_inode_init(
}
/*
+ * Align a record for a recently allocated sparse chunk. The input is a record
+ * that describes the unaligned chunk. The record is aligned such that it is fit
+ * for insertion (or merge) into the on-disk inode btrees.
+ */
+STATIC void
+xfs_align_sparse_rec(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ xfs_agblock_t agbno;
+ xfs_agblock_t mod;
+ int offset;
+ uint16_t allocmask;
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+ mod = agbno % mp->m_sb.sb_inoalignmt;
+ if (!mod)
+ return;
+
+ /* calculate the inode offset and align startino */
+ offset = mod << mp->m_sb.sb_inopblog;
+ rec->ir_startino -= offset;
+
+ /*
+ * Since startino has been aligned down, we have to left shift
+ * ir_holemask such that it continues to represent the same physical
+ * inodes as the unaligned record. The unaligned record by definition
+ * tracks the allocated inodes with the lowest order bits.
+ *
+ * ir_holemask is inverted before the shift such that set bits represent
+ * allocated inodes. This makes it safe for the bit-shift to introduce
+ * zeroes in the lower order bits without corrupting the record.
+ *
+ * Note that no change is required for ir_count, ir_freecount or
+ * ir_free. The count values are not affected by alignment and ir_free
+ * is initialized to 1s for all inodes, sparse or otherwise.
+ */
+ allocmask = ~rec->ir_holemask;
+ allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+ rec->ir_holemask = ~allocmask;
+}
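A worked example of the invert/shift/invert sequence above (illustrative,
reusing the 64-inode chunk assumption): take a record tracking 32 low-order
inodes whose startino is aligned down by 32 inodes (offset = 32):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint16_t holemask = 0xff00;     /* unaligned rec: low half allocated */
          int offset = 32;                /* inodes startino was shifted down by */

          /* invert so set bits mean "allocated", shift, invert back */
          uint16_t allocmask = ~holemask;         /* 0x00ff */
          allocmask <<= offset / 4;               /* 4 inodes per holemask bit */
          holemask = ~allocmask;                  /* low half is now the hole */

          printf("holemask 0x%04x\n", holemask);  /* prints 0x00ff */
          return 0;
  }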
+
+/*
+ * Determine whether two sparse inode records can be merged. The inode ranges
+ * must match and there must be no allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+ struct xfs_inobt_rec_incore *trec, /* tgt record */
+ struct xfs_inobt_rec_incore *srec) /* src record */
+{
+ DECLARE_BITMAP(talloc, 64);
+ DECLARE_BITMAP(salloc, 64);
+ DECLARE_BITMAP(tmp, 64);
+
+ /* records must cover the same inode range */
+ if (trec->ir_startino != srec->ir_startino)
+ return false;
+
+ /* both records must be sparse */
+ if (!xfs_inobt_issparse(trec->ir_holemask) ||
+ !xfs_inobt_issparse(srec->ir_holemask))
+ return false;
+
+ /* can't exceed capacity of a full record */
+ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+ return false;
+
+ /* verify there is no allocation overlap */
+ xfs_inobt_ialloc_bitmap(talloc, trec);
+ xfs_inobt_ialloc_bitmap(salloc, srec);
+
+ bitmap_and(tmp, salloc, talloc, 64);
+ if (!bitmap_empty(tmp, 64))
+ return false;
+
+ return true;
+}
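The overlap test restated with plain 64-bit masks (an illustrative userspace
equivalent; the kernel code uses the bitmap API since the allocation bitmap
spans all 64 inodes of a chunk):

  #include <stdbool.h>
  #include <stdint.h>

  /* expand a 16-bit holemask into a 64-bit allocated-inode bitmap */
  static uint64_t ialloc_bitmap(uint16_t holemask)
  {
          uint64_t map = 0;
          int i;

          for (i = 0; i < 16; i++)
                  if (!(holemask & (1u << i)))      /* clear bit = allocated */
                          map |= 0xfULL << (i * 4); /* 4 inodes per bit */
          return map;
  }

  /* records may merge only if their allocated regions are disjoint */
  static bool no_alloc_overlap(uint16_t tholemask, uint16_t sholemask)
  {
          return (ialloc_bitmap(tholemask) & ialloc_bitmap(sholemask)) == 0;
  }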
+
+/*
+ * Merge two sparse inode records. The caller must call __xfs_inobt_can_merge()
+ * to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+ struct xfs_inobt_rec_incore *trec, /* target */
+ struct xfs_inobt_rec_incore *srec) /* src */
+{
+ ASSERT(trec->ir_startino == srec->ir_startino);
+
+ /* combine the counts */
+ trec->ir_count += srec->ir_count;
+ trec->ir_freecount += srec->ir_freecount;
+
+ /* merge the holemask */
+ trec->ir_holemask &= srec->ir_holemask;
+
+ /* merge the free mask */
+ trec->ir_free &= srec->ir_free;
+}
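And a worked merge under the same assumptions: two complementary 32-inode
records combine into one full chunk.

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint16_t tholemask = 0xff00, sholemask = 0x00ff;
          uint8_t tcount = 32, scount = 32;

          tcount += scount;               /* 64: a complete chunk */
          tholemask &= sholemask;         /* 0x0000: no holes remain */

          printf("count %u holemask 0x%04x\n", tcount, tholemask);
          return 0;
  }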
+
+/*
+ * Determine whether a newly allocated sparse inode chunk record overlaps with
+ * an existing sparse record in the inobt. When sparse inode chunks are enabled,
+ * all inode chunk alignment is increased from cluster size to physical inode
+ * chunk size. This means that the smallest, non-zero gap between two inode
+ * chunks is at least one full inode chunk. When a sparse inode chunk is
+ * allocated, the containing record is also aligned in this manner such that
+ * future sparse allocations within that same range all align to the same record
+ * startino. This alignment policy supports the ability to merge sparse chunks
+ * into complete chunks over time.
+ *
+ * Given a newly allocated/aligned sparse inode record, look up whether a
+ * sparse record already exists at this startino. If so, merge the two records
+ * and return the merged record in nrec.
+ *
+ * An error is returned if records overlap but a merge is not possible. Given
+ * the alignment constraints described above, this should never happen and thus
+ * is treated as fs corruption.
+ */
+STATIC int
+xfs_inobt_rec_merge(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_btnum_t btnum,
+ struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ int error;
+ int i;
+ struct xfs_inobt_rec_incore rec;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, we're done */
+ if (i == 0)
+ goto out;
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+ ASSERT(rec.ir_startino == nrec->ir_startino);
+
+ /*
+ * This should never happen. If we have coexisting records that cannot
+ * merge, something is seriously wrong.
+ */
+ if (!__xfs_inobt_can_merge(nrec, &rec)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+
+ trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, rec.ir_holemask,
+ nrec->ir_startino, nrec->ir_holemask);
+
+ __xfs_inobt_rec_merge(nrec, &rec);
+
+ trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+ nrec->ir_holemask);
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -375,6 +647,9 @@ xfs_ialloc_ag_alloc(
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
+ uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
+ struct xfs_inobt_rec_incore rec;
+
struct xfs_perag *pag;
memset(&args, 0, sizeof(args));
@@ -490,6 +765,45 @@ xfs_ialloc_ag_alloc(
return error;
}
+ /*
+ * Finally, try a sparse allocation if the filesystem supports it and
+ * the sparse allocation length is smaller than a full chunk.
+ */
+ if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+ args.fsbno == NULLFSBLOCK) {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = args.mp->m_sb.sb_spinoalignmt;
+ args.prod = 1;
+
+ args.minlen = args.mp->m_ialloc_min_blks;
+ args.maxlen = args.minlen;
+
+ /*
+ * The inode record will be aligned to full chunk size. We must
+ * prevent sparse allocations at AG boundaries that would result in
+ * invalid inode records, such as records that start at agbno 0
+ * or extend beyond the AG.
+ *
+ * Set min agbno to the first aligned, non-zero agbno and max to
+ * the last aligned agbno that is at least one full chunk from
+ * the end of the AG.
+ */
+ args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+ args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.mp->m_sb.sb_inoalignmt) -
+ args.mp->m_ialloc_blks;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ newlen = args.len << args.mp->m_sb.sb_inopblog;
+ allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+ }
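To make the agbno clamping above concrete (illustrative numbers, not real
geometry): with an inode alignment and chunk size of 8 blocks and a
1000-block AG:

  #include <stdio.h>

  #define round_down(x, y)        (((x) / (y)) * (y))

  int main(void)
  {
          unsigned int inoalignmt = 8, ialloc_blks = 8, agblocks = 1000;

          unsigned int min_agbno = inoalignmt;    /* skip agbno 0 */
          unsigned int max_agbno = round_down(agblocks, inoalignmt) -
                                   ialloc_blks;   /* 992 - 8 = 984 */

          printf("min %u max %u\n", min_agbno, max_agbno); /* min 8 max 984 */
          return 0;
  }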
+
if (args.fsbno == NULLFSBLOCK) {
*alloc = 0;
return 0;
@@ -514,6 +828,65 @@ xfs_ialloc_ag_alloc(
* Convert the results.
*/
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+ if (xfs_inobt_issparse(~allocmask)) {
+ /*
+ * We've allocated a sparse chunk...
+ */
+ rec.ir_startino = newino;
+ rec.ir_holemask = ~allocmask;
+ rec.ir_count = newlen;
+ rec.ir_freecount = newlen;
+ rec.ir_free = XFS_INOBT_ALL_FREE;
+
+ /* align record and update newino for agi_newino */
+ xfs_align_sparse_rec(args.mp, &rec);
+ newino = rec.ir_startino;
+
+ error = xfs_inobt_rec_merge(args.mp, tp, agbp, XFS_BTNUM_INO,
+ &rec);
+ if (!error)
+ error = xfs_inobt_rec_check_count(args.mp, &rec);
+ if (error == -EFSCORRUPTED) {
+ xfs_alert(args.mp,
+ "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+ XFS_AGINO_TO_INO(args.mp, agno,
+ rec.ir_startino),
+ rec.ir_holemask, rec.ir_count);
+ xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ if (error)
+ return error;
+
+ error = xfs_inobt_update_insert(args.mp, tp, agbp, &rec,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_update_insert(args.mp, tp, agbp, &rec,
+ XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ } else {
+ /* full chunk: insert new records into both btrees */
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+ newlen, XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Update AGI counts and newino.
+ */
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
pag = xfs_perag_get(args.mp, agno);
@@ -522,20 +895,6 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btrees.
- */
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
- if (error)
- return error;
-
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_FINO);
- if (error)
- return error;
- }
- /*
* Log allocation group header fields
*/
xfs_ialloc_log_agi(tp, agbp,
@@ -1672,7 +2031,9 @@ xfs_difree_finobt(
*/
XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
- error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+ ibtrec->ir_count,
+ ibtrec->ir_freecount,
ibtrec->ir_free, &i);
if (error)
goto error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e3..12a4bf4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -734,6 +734,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
__entry->blocks, __entry->shift, __entry->writeio_blocks)
)
+TRACE_EVENT(xfs_irec_merge_pre,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+ TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ __field(xfs_agino_t, nagino)
+ __field(uint16_t, nholemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ __entry->nagino = nagino;
+ __entry->nholemask = nholemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->agino, __entry->holemask, __entry->nagino,
+ __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask),
+ TP_ARGS(mp, agno, agino, holemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->agno, __entry->agino,
+ __entry->holemask)
+)
+
#define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
--
1.8.3.1