Hi SGI guys,
As I previously reported, XFS has a deadlock problem on an ENOSPC
device.
http://oss.sgi.com/archives/linux-xfs/2005-07/msg00107.html
I've noticed that XFS in the 2.6.14 kernel has the same problem. The
problem is that XFS may lock allocation groups (AGs) out of order if
there isn't enough free space. XFS must lock AGs in ascending order
when a transaction locks more than one of them.
In my inspection of the XFS allocation code, the following flaws were
found.
(1) xfs_alloc_fix_freelist() and xfs_alloc_ag_vextent() may touch
an AGF and make it busy, even though there is no available block in
the AG.
(2) In xfs_alloc_vextent(), the first loop of XFS_ALLOCTYPE_FIRST_AG
starts at fsbno, but the second starts at AG#0.
(3) xfs_bmap_alloc() calls xfs_alloc_vextent() repeatedly without
checking whether some AGs have already been locked.
I tried to fix behavior (1), but I couldn't. So I made a patch
to fix (2) and (3).
fs/xfs/xfs_alloc.c:
recalculate agno at the beginning of the second loop of
XFS_ALLOCTYPE_FIRST_AG.
fs/xfs/xfs_alloc.h, fs/xfs/xfs_alloc.c,
fs/xfs/xfs_trans.h, fs/xfs/xfs_trans_buf.c:
release the AGF buffer if it is non-dirty.
fs/xfs/xfs_bmap.c:
quit if the AGF is locked, so that a new transaction is started.
fs/xfs/xfs_iomap.c:
yield the CPU.
It has survived 24 hours of stress testing.
Any comments are welcome.
Signed-off-by: ASANO Masahiro <masano@xxxxxxxxxxxxxx>
---
--- linux-2.6.14/fs/xfs/xfs_alloc.h.orig 2005-11-07 17:43:39.000000000
+0900
+++ linux-2.6.14/fs/xfs/xfs_alloc.h 2005-11-14 18:01:25.000000000 +0900
@@ -65,6 +65,7 @@ typedef struct xfs_alloc_arg {
struct xfs_trans *tp; /* transaction pointer */
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
+ struct xfs_buf *agflbp; /* buffer for agfl block pointer */
struct xfs_perag *pag; /* per-ag struct for this agno */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
--- linux-2.6.14/fs/xfs/xfs_alloc.c.orig 2005-11-07 12:07:29.000000000
+0900
+++ linux-2.6.14/fs/xfs/xfs_alloc.c 2005-11-14 18:15:58.000000000 +0900
@@ -1863,6 +1863,8 @@ xfs_alloc_fix_freelist(
} else
agbp = NULL;
+ args->agflbp = NULL;
+
/* If this is a metadata prefered pag and we are user data
* then try somewhere else if we are not being asked to
* try harder at this point
@@ -1981,6 +1983,7 @@ xfs_alloc_fix_freelist(
}
}
args->agbp = agbp;
+ args->agflbp = agflbp;
return 0;
}
@@ -2393,6 +2396,10 @@ xfs_alloc_vextent(
}
if (flags == 0) {
no_min = 1;
+ if (type == XFS_ALLOCTYPE_FIRST_AG) {
+ args->agno = XFS_FSB_TO_AGNO(mp,
+ args->fsbno);
+ }
} else {
flags = 0;
if (type == XFS_ALLOCTYPE_START_BNO) {
@@ -2417,9 +2424,17 @@ xfs_alloc_vextent(
ASSERT(0);
/* NOTREACHED */
}
- if (args->agbno == NULLAGBLOCK)
+ if (args->agbno == NULLAGBLOCK) {
+ if (args->agbp && !xfs_trans_buf_is_dirty(args->tp,
args->agbp)) {
+ xfs_trans_brelse(args->tp, args->agbp);
+ if (args->agflbp) {
+ xfs_trans_brelse(args->tp, args->agflbp);
+ }
+ args->agbp = NULL;
+ args->agflbp = NULL;
+ }
args->fsbno = NULLFSBLOCK;
- else {
+ } else {
args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
#ifdef DEBUG
ASSERT(args->len >= args->minlen);
--- linux-2.6.14/fs/xfs/xfs_trans.h.orig 2005-11-14 19:12:32.000000000
+0900
+++ linux-2.6.14/fs/xfs/xfs_trans.h 2005-11-14 18:27:24.000000000 +0900
@@ -996,6 +996,7 @@ int xfs_trans_read_buf(struct xfs_mount
struct xfs_buf **);
struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+int xfs_trans_buf_is_dirty(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
--- linux-2.6.14/fs/xfs/xfs_trans_buf.c.orig 2005-11-14 18:00:26.000000000
+0900
+++ linux-2.6.14/fs/xfs/xfs_trans_buf.c 2005-11-14 18:31:05.000000000 +0900
@@ -497,6 +497,20 @@ shutdown_abort:
return XFS_ERROR(EIO);
}
+/*
+ * Check if the buffer is dirty within this transaction.
+ */
+int
+xfs_trans_buf_is_dirty(xfs_trans_t *tp,
+ xfs_buf_t *bp)
+{
+ xfs_buf_log_item_t *bip;
+ xfs_log_item_desc_t *lidp;
+
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+ return (lidp->lid_flags & XFS_LID_DIRTY) != 0;
+}
/*
* Release the buffer bp which was previously acquired with one of the
--- linux-2.6.14/fs/xfs/xfs_bmap.c.orig 2005-11-11 22:42:43.000000000 +0900
+++ linux-2.6.14/fs/xfs/xfs_bmap.c 2005-11-11 23:05:42.000000000 +0900
@@ -2674,9 +2674,10 @@ xfs_bmap_alloc(
args.wasdel = ap->wasdel;
args.isfl = 0;
args.userdata = ap->userdata;
+ args.agbp = NULL;
if ((error = xfs_alloc_vextent(&args)))
return error;
- if (tryagain && args.fsbno == NULLFSBLOCK) {
+ if (tryagain && args.fsbno == NULLFSBLOCK && args.agbp == NULL)
{
/*
* Exact allocation failed. Now try with alignment
* turned on.
@@ -2690,7 +2691,7 @@ xfs_bmap_alloc(
if ((error = xfs_alloc_vextent(&args)))
return error;
}
- if (isaligned && args.fsbno == NULLFSBLOCK) {
+ if (isaligned && args.fsbno == NULLFSBLOCK && args.agbp ==
NULL) {
/*
* allocation failed, so turn off alignment and
* try again.
@@ -2702,14 +2703,14 @@ xfs_bmap_alloc(
return error;
}
if (args.fsbno == NULLFSBLOCK && nullfb &&
- args.minlen > ap->minlen) {
+ args.minlen > ap->minlen && args.agbp == NULL) {
args.minlen = ap->minlen;
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = ap->rval;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
- if (args.fsbno == NULLFSBLOCK && nullfb) {
+ if (args.fsbno == NULLFSBLOCK && nullfb && args.agbp == NULL) {
args.fsbno = 0;
args.type = XFS_ALLOCTYPE_FIRST_AG;
args.total = ap->minlen;
--- linux-2.6.14/fs/xfs/xfs_iomap.c.orig 2005-11-07 13:01:22.000000000
+0900
+++ linux-2.6.14/fs/xfs/xfs_iomap.c 2005-11-11 23:07:35.000000000 +0900
@@ -840,6 +840,9 @@ xfs_iomap_write_allocate(
goto error0;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (nimaps == 0) {
+ yield(); /* to prevent long CPU loop */
+ }
}
/*
|