Make the inode tree reclaim walk exclusive to avoid races with
concurrent sync walkers and lookups. This is a version of
a patch posted by Christoph Hellwig that avoids all the code
duplication.
Signed-off-by: Dave Chinner <david@xxxxxxxxxxxxx>
---
fs/xfs/linux-2.6/xfs_sync.c | 90 +++++++++++++++++++--------------------
fs/xfs/linux-2.6/xfs_sync.h | 2 +-
fs/xfs/quota/xfs_qm_syscalls.c | 2 +-
3 files changed, 46 insertions(+), 48 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f974d1a..6d1cd6e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -55,7 +55,8 @@ xfs_inode_ag_lookup(
struct xfs_mount *mp,
struct xfs_perag *pag,
uint32_t *first_index,
- int tag)
+ int tag,
+ int write_lock)
{
int nr_found;
struct xfs_inode *ip;
@@ -65,7 +66,10 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find
* the number of objects requested.
*/
- read_lock(&pag->pag_ici_lock);
+ if (write_lock)
+ write_lock(&pag->pag_ici_lock);
+ else
+ read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1);
@@ -89,7 +93,10 @@ xfs_inode_ag_lookup(
return ip;
unlock:
- read_unlock(&pag->pag_ici_lock);
+ if (write_lock)
+ write_unlock(&pag->pag_ici_lock);
+ else
+ read_unlock(&pag->pag_ici_lock);
return NULL;
}
@@ -100,7 +107,8 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int write_lock)
{
uint32_t first_index;
int last_error = 0;
@@ -113,7 +121,8 @@ restart:
int error = 0;
xfs_inode_t *ip;
- ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+ ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag,
+ write_lock);
if (!ip)
break;
@@ -145,7 +154,8 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags),
int flags,
- int tag)
+ int tag,
+ int write_lock)
{
int error = 0;
int last_error = 0;
@@ -159,7 +169,8 @@ xfs_inode_ag_iterator(
xfs_perag_put(pag);
continue;
}
- error = xfs_inode_ag_walk(mp, pag, execute, flags, tag);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
+ write_lock);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -184,18 +195,20 @@ xfs_sync_inode_valid(
return EFSCORRUPTED;
}
- /*
- * If we can't get a reference on the inode, it must be in reclaim.
- * Leave it for the reclaim code to flush. Also avoid inodes that
- * haven't been fully initialised.
- */
+ /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+ if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) {
+ read_unlock(&pag->pag_ici_lock);
+ return ENOENT;
+ }
+
+ /* If we can't get a reference on the inode, it must be in reclaim. */
if (!igrab(inode)) {
read_unlock(&pag->pag_ici_lock);
return ENOENT;
}
read_unlock(&pag->pag_ici_lock);
- if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+ if (is_bad_inode(inode)) {
IRELE(ip);
return ENOENT;
}
@@ -285,7 +298,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
if (error)
return XFS_ERROR(error);
@@ -307,7 +320,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
- XFS_ICI_NO_TAG);
+ XFS_ICI_NO_TAG, 0);
}
STATIC int
@@ -669,33 +682,8 @@ xfs_reclaim_inode(
xfs_inode_t *ip,
int sync_mode)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
-
-
/*
- * The radix tree lock here protects a thread in xfs_iget_core from
- * racing with us on linking the inode back with a vnode.
- * Once we have the XFS_IRECLAIM flag set it will not touch
- * us.
- */
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- write_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
- !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
- return EAGAIN;
- }
- __xfs_iflags_set(ip, XFS_IRECLAIM);
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- xfs_perag_put(pag);
-
- /*
- * The inode is flushed delayed write. That means the flush lock
+ * Inodes are flushed delayed write. That means the flush lock
* may be held here and we will block for some time on it. Further,
* if we hold the inode lock, we prevent the AIL from locking and
* therefore being able to push the buffer. This means that we'll end
@@ -791,12 +779,22 @@ xfs_reclaim_inode_now(
struct xfs_perag *pag,
int flags)
{
- /* ignore if already under reclaim */
- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
- read_unlock(&pag->pag_ici_lock);
+ /*
+ * The radix tree lock here protects a thread in xfs_iget from racing
+ * with us starting reclaim on the inode. Once we have the
+ * XFS_IRECLAIM flag set it will not touch us.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ /* ignore as it is already under reclaim */
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
return 0;
}
- read_unlock(&pag->pag_ici_lock);
+ __xfs_iflags_set(ip, XFS_IRECLAIM);
+ spin_unlock(&ip->i_flags_lock);
+ write_unlock(&pag->pag_ici_lock);
return xfs_reclaim_inode(ip, flags);
}
@@ -807,5 +805,5 @@ xfs_reclaim_inodes(
int mode)
{
return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
- XFS_ICI_RECLAIM_TAG);
+ XFS_ICI_RECLAIM_TAG, 1);
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d..ea932b4 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags, int tag);
+ int flags, int tag, int write_lock);
#endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76f..873e07e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
}
/*------------------------------------------------------------------------*/
--
1.6.5
|