xfs
[Top] [All Lists]

Re: [PATCH] xfs: add a shrinker to background inode reclaim

To: Dave Chinner <david@xxxxxxxxxxxxx>
Subject: Re: [PATCH] xfs: add a shrinker to background inode reclaim
From: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Date: Wed, 28 Apr 2010 07:31:11 -0400
Cc: xfs@xxxxxxxxxxx, npiggin@xxxxxxx, linux-mm@xxxxxxxxx
In-reply-to: <1272429248-5269-1-git-send-email-david@xxxxxxxxxxxxx>
References: <1272429248-5269-1-git-send-email-david@xxxxxxxxxxxxx>
User-agent: Mutt/1.5.19 (2009-01-05)
This is quite ugly compared to the previous version but looks correct
enough to me.  One problem is that the first filesystem registered will
get an far over-proportional number of shrink requests, which the simple
patch to pass private data to the shrinker would get around easily.

Anyway, we need a fix, so:


Reviewed-by: Christoph Hellwig <hch@xxxxxx>

On Wed, Apr 28, 2010 at 02:34:08PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> On low memory boxes or those with highmem, kernel can OOM before the
> background reclaims inodes via xfssyncd. Add a shrinker to run inode
> reclaim so that it inode reclaim is expedited when memory is low.
> 
> This is more complex than it needs to be because the VM folk don't
> want a context added to the shrinker infrastructure. Hence we need
> to add a global list of XFS mount structures so the shrinker can
> traverse them.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/linux-2.6/xfs_super.c   |    5 ++
>  fs/xfs/linux-2.6/xfs_sync.c    |  112 
> +++++++++++++++++++++++++++++++++++++---
>  fs/xfs/linux-2.6/xfs_sync.h    |    7 ++-
>  fs/xfs/quota/xfs_qm_syscalls.c |    3 +-
>  fs/xfs/xfs_ag.h                |    1 +
>  fs/xfs/xfs_mount.h             |    1 +
>  6 files changed, 120 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
> index 1abcacc..a43d09e 100644
> --- a/fs/xfs/linux-2.6/xfs_super.c
> +++ b/fs/xfs/linux-2.6/xfs_super.c
> @@ -1210,6 +1210,7 @@ xfs_fs_put_super(
>  
>       xfs_unmountfs(mp);
>       xfs_freesb(mp);
> +     xfs_inode_shrinker_unregister(mp);
>       xfs_icsb_destroy_counters(mp);
>       xfs_close_devices(mp);
>       xfs_dmops_put(mp);
> @@ -1623,6 +1624,8 @@ xfs_fs_fill_super(
>       if (error)
>               goto fail_vnrele;
>  
> +     xfs_inode_shrinker_register(mp);
> +
>       kfree(mtpt);
>       return 0;
>  
> @@ -1868,6 +1871,7 @@ init_xfs_fs(void)
>               goto out_cleanup_procfs;
>  
>       vfs_initquota();
> +     xfs_inode_shrinker_init();
>  
>       error = register_filesystem(&xfs_fs_type);
>       if (error)
> @@ -1895,6 +1899,7 @@ exit_xfs_fs(void)
>  {
>       vfs_exitquota();
>       unregister_filesystem(&xfs_fs_type);
> +     xfs_inode_shrinker_destroy();
>       xfs_sysctl_unregister();
>       xfs_cleanup_procfs();
>       xfs_buf_terminate();
> diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
> index 3a64179..3884e20 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.c
> +++ b/fs/xfs/linux-2.6/xfs_sync.c
> @@ -95,7 +95,8 @@ xfs_inode_ag_walk(
>                                          struct xfs_perag *pag, int flags),
>       int                     flags,
>       int                     tag,
> -     int                     exclusive)
> +     int                     exclusive,
> +     int                     *nr_to_scan)
>  {
>       uint32_t                first_index;
>       int                     last_error = 0;
> @@ -134,7 +135,7 @@ restart:
>               if (error == EFSCORRUPTED)
>                       break;
>  
> -     } while (1);
> +     } while ((*nr_to_scan)--);
>  
>       if (skipped) {
>               delay(1);
> @@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
>                                          struct xfs_perag *pag, int flags),
>       int                     flags,
>       int                     tag,
> -     int                     exclusive)
> +     int                     exclusive,
> +     int                     *nr_to_scan)
>  {
>       int                     error = 0;
>       int                     last_error = 0;
>       xfs_agnumber_t          ag;
> +     int                     nr;
>  
> +     nr = nr_to_scan ? *nr_to_scan : INT_MAX;
>       for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
>               struct xfs_perag        *pag;
>  
> @@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
>                       continue;
>               }
>               error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
> -                                             exclusive);
> +                                             exclusive, &nr);
>               xfs_perag_put(pag);
>               if (error) {
>                       last_error = error;
>                       if (error == EFSCORRUPTED)
>                               break;
>               }
> +             if (nr <= 0)
> +                     break;
>       }
> +     if (nr_to_scan)
> +             *nr_to_scan = nr;
>       return XFS_ERROR(last_error);
>  }
>  
> @@ -291,7 +299,7 @@ xfs_sync_data(
>       ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
>  
>       error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
> -                                   XFS_ICI_NO_TAG, 0);
> +                                   XFS_ICI_NO_TAG, 0, NULL);
>       if (error)
>               return XFS_ERROR(error);
>  
> @@ -310,7 +318,7 @@ xfs_sync_attr(
>       ASSERT((flags & ~SYNC_WAIT) == 0);
>  
>       return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
> -                                  XFS_ICI_NO_TAG, 0);
> +                                  XFS_ICI_NO_TAG, 0, NULL);
>  }
>  
>  STATIC int
> @@ -636,6 +644,7 @@ __xfs_inode_set_reclaim_tag(
>       radix_tree_tag_set(&pag->pag_ici_root,
>                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
>                          XFS_ICI_RECLAIM_TAG);
> +     pag->pag_ici_reclaimable++;
>  }
>  
>  /*
> @@ -668,6 +677,7 @@ __xfs_inode_clear_reclaim_tag(
>  {
>       radix_tree_tag_clear(&pag->pag_ici_root,
>                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
> +     pag->pag_ici_reclaimable--;
>  }
>  
>  /*
> @@ -817,5 +827,93 @@ xfs_reclaim_inodes(
>       int             mode)
>  {
>       return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
> -                                     XFS_ICI_RECLAIM_TAG, 1);
> +                                     XFS_ICI_RECLAIM_TAG, 1, NULL);
> +}
> +
> +/*
> + * Shrinker infrastructure.
> + *
> + * This is all far more complex than it needs to be. It adds a global list of
> + * mounts because the shrinkers can only call a global context. We need to 
> make
> + * the shrinkers pass a context to avoid the need for global state.
> + */
> +static LIST_HEAD(xfs_mount_list);
> +static struct rw_semaphore xfs_mount_list_lock;
> +
> +static int
> +xfs_reclaim_inode_shrink(
> +     int             nr_to_scan,
> +     gfp_t           gfp_mask)
> +{
> +     struct xfs_mount *mp;
> +     struct xfs_perag *pag;
> +     xfs_agnumber_t  ag;
> +     int             reclaimable = 0;
> +
> +     if (nr_to_scan) {
> +             if (!(gfp_mask & __GFP_FS))
> +                     return -1;
> +
> +             down_read(&xfs_mount_list_lock);
> +             list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +                     xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> +                                     XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
> +                     if (nr_to_scan <= 0)
> +                             break;
> +             }
> +             up_read(&xfs_mount_list_lock);
> +     }
> +
> +     down_read(&xfs_mount_list_lock);
> +     list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +             for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
> +
> +                     pag = xfs_perag_get(mp, ag);
> +                     if (!pag->pag_ici_init) {
> +                             xfs_perag_put(pag);
> +                             continue;
> +                     }
> +                     reclaimable += pag->pag_ici_reclaimable;
> +                     xfs_perag_put(pag);
> +             }
> +     }
> +     up_read(&xfs_mount_list_lock);
> +     return reclaimable;
> +}
> +
> +static struct shrinker xfs_inode_shrinker = {
> +     .shrink = xfs_reclaim_inode_shrink,
> +     .seeks = DEFAULT_SEEKS,
> +};
> +
> +void __init
> +xfs_inode_shrinker_init(void)
> +{
> +     init_rwsem(&xfs_mount_list_lock);
> +     register_shrinker(&xfs_inode_shrinker);
> +}
> +
> +void
> +xfs_inode_shrinker_destroy(void)
> +{
> +     ASSERT(list_empty(&xfs_mount_list));
> +     unregister_shrinker(&xfs_inode_shrinker);
> +}
> +
> +void
> +xfs_inode_shrinker_register(
> +     struct xfs_mount        *mp)
> +{
> +     down_write(&xfs_mount_list_lock);
> +     list_add_tail(&mp->m_mplist, &xfs_mount_list);
> +     up_write(&xfs_mount_list_lock);
> +}
> +
> +void
> +xfs_inode_shrinker_unregister(
> +     struct xfs_mount        *mp)
> +{
> +     down_write(&xfs_mount_list_lock);
> +     list_del(&mp->m_mplist);
> +     up_write(&xfs_mount_list_lock);
>  }
> diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
> index d480c34..cdcbaac 100644
> --- a/fs/xfs/linux-2.6/xfs_sync.h
> +++ b/fs/xfs/linux-2.6/xfs_sync.h
> @@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, 
> struct xfs_perag *pag,
>  int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
>  int xfs_inode_ag_iterator(struct xfs_mount *mp,
>       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
> -     int flags, int tag, int write_lock);
> +     int flags, int tag, int write_lock, int *nr_to_scan);
> +
> +void xfs_inode_shrinker_init(void);
> +void xfs_inode_shrinker_destroy(void);
> +void xfs_inode_shrinker_register(struct xfs_mount *mp);
> +void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
>  
>  #endif
> diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
> index f14408e..26fa431 100644
> --- a/fs/xfs/quota/xfs_qm_syscalls.c
> +++ b/fs/xfs/quota/xfs_qm_syscalls.c
> @@ -889,7 +889,8 @@ xfs_qm_dqrele_all_inodes(
>       uint             flags)
>  {
>       ASSERT(mp->m_quotainfo);
> -     xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
> +     xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
> +                             XFS_ICI_NO_TAG, 0, NULL);
>  }
>  
>  /*------------------------------------------------------------------------*/
> diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
> index b1a5a1f..abb8222 100644
> --- a/fs/xfs/xfs_ag.h
> +++ b/fs/xfs/xfs_ag.h
> @@ -223,6 +223,7 @@ typedef struct xfs_perag {
>       int             pag_ici_init;   /* incore inode cache initialised */
>       rwlock_t        pag_ici_lock;   /* incore inode lock */
>       struct radix_tree_root pag_ici_root;    /* incore inode cache root */
> +     int             pag_ici_reclaimable;    /* reclaimable inodes */
>  #endif
>       int             pagb_count;     /* pagb slots in use */
>       xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 4fa0bc7..9ff48a1 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -259,6 +259,7 @@ typedef struct xfs_mount {
>       wait_queue_head_t       m_wait_single_sync_task;
>       __int64_t               m_update_flags; /* sb flags we need to update
>                                                  on the next remount,rw */
> +     struct list_head        m_mplist;       /* inode shrinker mount list */
>  } xfs_mount_t;
>  
>  /*
> -- 
> 1.5.6.5
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs
---end quoted text---

<Prev in Thread] Current Thread [Next in Thread>