xfs
[Top] [All Lists]

[PATCH 1/2] xfs: convert inode cache lookups to use RCU locking

To: xfs@xxxxxxxxxxx
Subject: [PATCH 1/2] xfs: convert inode cache lookups to use RCU locking
From: Dave Chinner <david@xxxxxxxxxxxxx>
Date: Mon, 29 Nov 2010 11:50:25 +1100
Cc: paulmck@xxxxxxxxxxxxxxxxxx, eric.dumazet@xxxxxxxxx
In-reply-to: <1290991826-20714-1-git-send-email-david@xxxxxxxxxxxxx>
References: <1290991826-20714-1-git-send-email-david@xxxxxxxxxxxxx>
From: Dave Chinner <dchinner@xxxxxxxxxx>

With delayed logging greatly increasing the sustained parallelism of inode
operations, the inode cache locking is showing significant read vs write
contention when inode reclaim runs at the same time as lookups. There are
also a lot more write lock acquisitions than there are read locks (4:1 ratio)
so the read locking is not really buying us much in the way of parallelism.

To avoid the read vs write contention, change the cache to use RCU locking on
the read side. To avoid needing to RCU free every single inode, use the built-in
slab RCU freeing mechanism. This requires us to be able to detect lookups of
freed inodes, so ensure that every freed inode has an inode number of zero and
the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache
hit lookup path, so add a check for a zero inode number there as well.

We can then convert all the read-locked lookups to use RCU read side locking
and hence remove all read side acquisitions of the pag_ici_lock.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
Reviewed-by: Alex Elder <aelder@xxxxxxx>
---
 fs/xfs/linux-2.6/xfs_sync.c |   27 ++++++++++++++++-----
 fs/xfs/xfs_iget.c           |   43 +++++++++++++++++++++++++++--------
 fs/xfs/xfs_inode.c          |   52 +++++++++++++++++++++++++++++++++----------
 3 files changed, 93 insertions(+), 29 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7c..5ee02d7 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,20 @@ xfs_inode_ag_walk_grab(
 {
        struct inode            *inode = VFS_I(ip);
 
+       /* check for stale RCU freed inode */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
        /* nothing to sync during shutdown */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return EFSCORRUPTED;
 
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               return ENOENT;
-
        /* If we can't grab the inode, it must on it's way to reclaim. */
        if (!igrab(inode))
                return ENOENT;
@@ -72,6 +78,10 @@ xfs_inode_ag_walk_grab(
 
        /* inode is valid */
        return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
 }
 
 STATIC int
@@ -98,12 +108,12 @@ restart:
                int             error = 0;
                int             i;
 
-               read_lock(&pag->pag_ici_lock);
+               rcu_read_lock();
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
                                        (void **)batch, first_index,
                                        XFS_LOOKUP_BATCH);
                if (!nr_found) {
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
                        break;
                }
 
@@ -129,7 +139,7 @@ restart:
                }
 
                /* unlock now we've grabbed the inodes. */
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                for (i = 0; i < nr_found; i++) {
                        if (!batch[i])
@@ -639,6 +649,9 @@ xfs_reclaim_inode_grab(
        struct xfs_inode        *ip,
        int                     flags)
 {
+       /* check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
 
        /*
         * do some unlocked checks first to avoid unnecceary lock traffic.
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 18991a9..0d2b6d2 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -69,6 +69,7 @@ xfs_inode_alloc(
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
+       ASSERT(ip->i_ino == 0);
 
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 
@@ -86,9 +87,6 @@ xfs_inode_alloc(
        ip->i_new_size = 0;
        ip->i_dirty_releases = 0;
 
-       /* prevent anyone from using this yet */
-       VFS_I(ip)->i_state = I_NEW;
-
        return ip;
 }
 
@@ -135,6 +133,16 @@ xfs_inode_free(
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
 
+       /*
+        * because we use SLAB_DESTROY_BY_RCU freeing, ensure the inode
+        * always appears to be reclaimed with an invalid inode number
+        * when in the free state. The ip->i_flags_lock provides the barrier
+        * against lookup races.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
        kmem_zone_free(xfs_inode_zone, ip);
 }
 
@@ -146,13 +154,28 @@ xfs_iget_cache_hit(
        struct xfs_perag        *pag,
        struct xfs_inode        *ip,
        int                     flags,
-       int                     lock_flags) __releases(pag->pag_ici_lock)
+       int                     lock_flags) __releases(RCU)
 {
        struct inode            *inode = VFS_I(ip);
        struct xfs_mount        *mp = ip->i_mount;
        int                     error;
 
+       /*
+        * check for re-use of an inode within an RCU grace period due to the
+        * radix tree nodes not being updated yet. We monitor for this by
+        * setting the inode number to zero before freeing the inode structure.
+        */
        spin_lock(&ip->i_flags_lock);
+       if (ip->i_ino == 0) {
+               trace_xfs_iget_skip(ip);
+               XFS_STATS_INC(xs_ig_frecycle);
+               spin_unlock(&ip->i_flags_lock);
+               rcu_read_unlock();
+               /* Expire the grace period so we don't trip over it again. */
+               synchronize_rcu();
+               return EAGAIN;
+       }
+
 
        /*
         * If we are racing with another cache hit that is currently
@@ -195,7 +218,7 @@ xfs_iget_cache_hit(
                ip->i_flags |= XFS_IRECLAIM;
 
                spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
 
                error = -inode_init_always(mp->m_super, inode);
                if (error) {
@@ -203,7 +226,7 @@ xfs_iget_cache_hit(
                         * Re-initializing the inode failed, and we are in deep
                         * trouble.  Try to re-add it to the reclaim list.
                         */
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        spin_lock(&ip->i_flags_lock);
 
                        ip->i_flags &= ~XFS_INEW;
@@ -231,7 +254,7 @@ xfs_iget_cache_hit(
 
                /* We've got a live one. */
                spin_unlock(&ip->i_flags_lock);
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                trace_xfs_iget_hit(ip);
        }
 
@@ -245,7 +268,7 @@ xfs_iget_cache_hit(
 
 out_error:
        spin_unlock(&ip->i_flags_lock);
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        return error;
 }
 
@@ -376,7 +399,7 @@ xfs_iget(
 
 again:
        error = 0;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
        if (ip) {
@@ -384,7 +407,7 @@ again:
                if (error)
                        goto out_error_or_again;
        } else {
-               read_unlock(&pag->pag_ici_lock);
+               rcu_read_unlock();
                XFS_STATS_INC(xs_ig_missed);
 
                error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a0..43ffd90 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
                 */
                for (i = 0; i < ninodes; i++) {
 retry:
-                       read_lock(&pag->pag_ici_lock);
+                       rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
 
-                       /* Inode not in memory or stale, nothing to do */
-                       if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-                               read_unlock(&pag->pag_ici_lock);
+                       /* Inode not in memory, nothing to do */
+                       if (!ip) {
+                               rcu_read_unlock();
                                continue;
                        }
 
                        /*
+                        * because this is an RCU protected lookup, we could
+                        * find a recently freed or even reallocated inode
+                        * during the lookup. We need to check under the
+                        * i_flags_lock for a valid inode here. Skip it if it
+                        * is not valid, the wrong inode or stale.
+                        */
+                       spin_lock(&ip->i_flags_lock);
+                       if (ip->i_ino != inum + i ||
+                           __xfs_iflags_test(ip, XFS_ISTALE)) {
+                               spin_unlock(&ip->i_flags_lock);
+                               rcu_read_unlock();
+                               continue;
+                       }
+                       spin_unlock(&ip->i_flags_lock);
+
+                       /*
                         * Don't try to lock/unlock the current inode, but we
                         * _cannot_ skip the other inodes that we did not find
                         * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                               read_unlock(&pag->pag_ici_lock);
+                               rcu_read_unlock();
                                delay(1);
                                goto retry;
                        }
-                       read_unlock(&pag->pag_ici_lock);
+                       rcu_read_unlock();
 
                        xfs_iflock(ip);
                        xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
 
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-       read_lock(&pag->pag_ici_lock);
+       rcu_read_lock();
        /* really need a gang lookup range call here */
        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                        first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
                iq = ilist[i];
                if (iq == ip)
                        continue;
-               /* if the inode lies outside this cluster, we're done. */
-               if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-                       break;
+
+               /*
+                * because this is an RCU protected lookup, we could find a
+                * recently freed or even reallocated inode during the lookup.
+                * We need to check under the i_flags_lock for a valid inode
+                * here. Skip it if it is not valid or the wrong inode.
+                */
+               spin_lock(&ip->i_flags_lock);
+               if (!ip->i_ino ||
+                   (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                       spin_unlock(&ip->i_flags_lock);
+                       continue;
+               }
+               spin_unlock(&ip->i_flags_lock);
+
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
        }
 
 out_free:
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        kmem_free(ilist);
 out_put:
        xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
         * Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
-       read_unlock(&pag->pag_ici_lock);
+       rcu_read_unlock();
        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the
-- 
1.7.2.3

<Prev in Thread] Current Thread [Next in Thread>