[PATCH 1/2] xfs: reclaim inodes under a write lock

Dave Chinner david at fromorbit.com
Wed Jan 6 17:05:24 CST 2010


Make the inode tree reclaim walk exclusive to avoid races with
concurrent sync walkers and lookups. This is a version of
a patch posted by Christoph Hellwig that avoids all the code
duplication.

Signed-off-by: Dave Chinner <david at fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_sync.c    |   90 +++++++++++++++++++--------------------
 fs/xfs/linux-2.6/xfs_sync.h    |    2 +-
 fs/xfs/quota/xfs_qm_syscalls.c |    2 +-
 3 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f974d1a..6d1cd6e 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -55,7 +55,8 @@ xfs_inode_ag_lookup(
 	struct xfs_mount	*mp,
 	struct xfs_perag	*pag,
 	uint32_t		*first_index,
-	int			tag)
+	int			tag,
+	int			write_lock)
 {
 	int			nr_found;
 	struct xfs_inode	*ip;
@@ -65,7 +66,10 @@ xfs_inode_ag_lookup(
 	 * as the tree is sparse and a gang lookup walks to find
 	 * the number of objects requested.
 	 */
-	read_lock(&pag->pag_ici_lock);
+	if (write_lock)
+		write_lock(&pag->pag_ici_lock);
+	else
+		read_lock(&pag->pag_ici_lock);
 	if (tag == XFS_ICI_NO_TAG) {
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 				(void **)&ip, *first_index, 1);
@@ -89,7 +93,10 @@ xfs_inode_ag_lookup(
 	return ip;
 
 unlock:
-	read_unlock(&pag->pag_ici_lock);
+	if (write_lock)
+		write_unlock(&pag->pag_ici_lock);
+	else
+		read_unlock(&pag->pag_ici_lock);
 	return NULL;
 }
 
@@ -100,7 +107,8 @@ xfs_inode_ag_walk(
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			write_lock)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
@@ -113,7 +121,8 @@ restart:
 		int		error = 0;
 		xfs_inode_t	*ip;
 
-		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
+		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag,
+						write_lock);
 		if (!ip)
 			break;
 
@@ -145,7 +154,8 @@ xfs_inode_ag_iterator(
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
-	int			tag)
+	int			tag,
+	int			write_lock)
 {
 	int			error = 0;
 	int			last_error = 0;
@@ -159,7 +169,8 @@ xfs_inode_ag_iterator(
 			xfs_perag_put(pag);
 			continue;
 		}
-		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag);
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
+						write_lock);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
@@ -184,18 +195,20 @@ xfs_sync_inode_valid(
 		return EFSCORRUPTED;
 	}
 
-	/*
-	 * If we can't get a reference on the inode, it must be in reclaim.
-	 * Leave it for the reclaim code to flush. Also avoid inodes that
-	 * haven't been fully initialised.
-	 */
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) {
+		read_unlock(&pag->pag_ici_lock);
+		return ENOENT;
+	}
+
+	/* If we can't get a reference on the inode, it must be in reclaim. */
 	if (!igrab(inode)) {
 		read_unlock(&pag->pag_ici_lock);
 		return ENOENT;
 	}
 	read_unlock(&pag->pag_ici_lock);
 
-	if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+	if (is_bad_inode(inode)) {
 		IRELE(ip);
 		return ENOENT;
 	}
@@ -285,7 +298,7 @@ xfs_sync_data(
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG);
+				      XFS_ICI_NO_TAG, 0);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -307,7 +320,7 @@ xfs_sync_attr(
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG);
+				     XFS_ICI_NO_TAG, 0);
 }
 
 STATIC int
@@ -669,33 +682,8 @@ xfs_reclaim_inode(
 	xfs_inode_t	*ip,
 	int		sync_mode)
 {
-	struct xfs_mount *mp = ip->i_mount;
-	struct xfs_perag *pag;
-
-
 	/*
-	 * The radix tree lock here protects a thread in xfs_iget_core from
-	 * racing with us on linking the inode back with a vnode.
-	 * Once we have the XFS_IRECLAIM flag set it will not touch
-	 * us.
-	 */
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	write_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-		spin_unlock(&ip->i_flags_lock);
-		write_unlock(&pag->pag_ici_lock);
-		xfs_perag_put(pag);
-		return EAGAIN;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	write_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-
-	/*
-	 * The inode is flushed delayed write. That means the flush lock
+	 * Inodes are flushed delayed write. That means the flush lock
 	 * may be held here and we will block for some time on it. Further,
 	 * if we hold the inode lock, we prevent the AIL from locking and
 	 * therefore being able to push the buffer. This means that we'll end
@@ -791,12 +779,22 @@ xfs_reclaim_inode_now(
 	struct xfs_perag	*pag,
 	int			flags)
 {
-	/* ignore if already under reclaim */
-	if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		read_unlock(&pag->pag_ici_lock);
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* ignore as it is already under reclaim */
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
 		return 0;
 	}
-	read_unlock(&pag->pag_ici_lock);
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
 
 	return xfs_reclaim_inode(ip, flags);
 }
@@ -807,5 +805,5 @@ xfs_reclaim_inodes(
 	int		mode)
 {
 	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
-					XFS_ICI_RECLAIM_TAG);
+					XFS_ICI_RECLAIM_TAG, 1);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index a500b4d..ea932b4 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag);
+	int flags, int tag, int write_lock);
 
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 71af76f..873e07e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
 }
 
 /*------------------------------------------------------------------------*/
-- 
1.6.5




More information about the xfs mailing list