Hallo,
I've been tracking a deadlock on out of space conditions. It's easy to
reproduce:
Use a XFS file systems with about 30MB free and run fsstress -d/xfs -p30 -n50000
on it and run in parallel while true ; do cp -a /usr /xfs ; done
After some time all fsstress processes and up in D and don't make any progress
anymore. Looking at sysrq output they seem to be mostly deadlocked in grabbing
the semaphore of a pagebuf. Two or three are deep in allocation (usually
beyind xfs_alloc_read_agf()), the others are in sync and wait_on_inode
(e.g. http://www.firstfloor.org/~andi/deadlock3 has a full example)
I see it with 2.6.7rc3-bk3 and earlier kernels.
My theory is that one of the ENOSPC error paths forgets to brelse a buffer
which
causes the semaphore to get lost and causes the deadlocks later. This is likely
a AGF buffer, since xfs_alloc_read_agf() and children are blocking too.
While looking at the code I found some missing brelses in xfs_alloc errors
paths that could explain some of the problems. With this patch applied
I unfortunately get still deadlocks with the same test case eventually
(lost pagebuf sem again) but they take soemwhat longer.
Most likely more audit of error paths is needed (i wish there was an automatic
tool for that).
-Andi
diff -u linux/fs/xfs/xfs_alloc.c-o linux/fs/xfs/xfs_alloc.c
--- linux/fs/xfs/xfs_alloc.c-o 1970-01-01 01:12:51.000000000 +0100
+++ linux/fs/xfs/xfs_alloc.c 2004-06-12 03:54:18.000000000 +0200
@@ -1868,6 +1868,7 @@
* try harder at this point
*/
if (pag->pagf_metadata && args->userdata && flags) {
+ xfs_trans_brelse(tp, agbp);
args->agbp = NULL;
return 0;
}
@@ -1886,8 +1887,7 @@
(int)(pag->pagf_freeblks + pag->pagf_flcount -
need - args->total) <
(int)args->minleft)) {
- if (agbp)
- xfs_trans_brelse(tp, agbp);
+ xfs_trans_brelse(tp, agbp);
args->agbp = NULL;
return 0;
}
@@ -1932,10 +1932,16 @@
while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) {
xfs_buf_t *bp;
- if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
+ if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) {
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
return error;
- if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
1)))
+ }
+ if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
1))) {
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
return error;
+ }
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}
@@ -1951,8 +1957,11 @@
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+ if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) {
+ xfs_trans_brelse(tp, agbp);
+ args->agbp = NULL;
return error;
+ }
/*
* Make the freelist longer if it's too short.
*/
@@ -1962,8 +1971,12 @@
/*
* Allocate as many blocks as possible at once.
*/
- if ((error = xfs_alloc_ag_vextent(&targs)))
+ if ((error = xfs_alloc_ag_vextent(&targs))) {
+ xfs_trans_brelse(tp, agbp);
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = NULL;
return error;
+ }
/*
* Stop if we run out. Won't happen if callers are obeying
* the restrictions correctly. Can happen for free calls
@@ -1976,8 +1989,12 @@
*/
for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
- bno)))
+ bno))) {
+ xfs_trans_brelse(tp, agbp);
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = NULL;
return error;
+ }
}
}
args->agbp = agbp;
diff -u linux/fs/xfs/xfs_trans_buf.c-o linux/fs/xfs/xfs_trans_buf.c
--- linux/fs/xfs/xfs_trans_buf.c-o 2004-06-11 16:32:53.000000000 +0200
+++ linux/fs/xfs/xfs_trans_buf.c 2004-06-12 03:45:11.000000000 +0200
@@ -521,6 +521,9 @@
xfs_log_item_t *lip;
xfs_log_item_desc_t *lidp;
+ if (!bp)
+ return;
+
/*
* Default to a normal brelse() call if the tp is NULL.
*/
|