
Re: [PATCH] xfs: Do background CIL flushes via a workqueue

To: Dave Chinner <david@xxxxxxxxxxxxx>
Subject: Re: [PATCH] xfs: Do background CIL flushes via a workqueue
From: Christoph Hellwig <hch@xxxxxxxxxxxxx>
Date: Tue, 27 Mar 2012 10:31:27 -0400
Cc: xfs@xxxxxxxxxxx, Vivek Goyal <vgoyal@xxxxxxxxxx>
In-reply-to: <1332841605-3538-1-git-send-email-david@xxxxxxxxxxxxx>
References: <1332841605-3538-1-git-send-email-david@xxxxxxxxxxxxx>
User-agent: Mutt/1.5.21 (2010-09-15)
Vivek, does CFQ still need any hints for this sort of handoff?

On Tue, Mar 27, 2012 at 08:46:45PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Doing background CIL flushes adds significant latency to whatever
> async transaction triggers it. To avoid blocking async
> transactions on things like waiting for log buffer IO to complete,
> move the CIL push off into a workqueue.  By moving the push work
> into a workqueue, we remove all the latency that the commit adds
> from the foreground transaction commit path. This also means that
> single threaded workloads won't do the CIL push processing, leaving
> them more CPU to do more async transactions.
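A minimal sketch of the deferral pattern the patch introduces follows. The
example_* names and simplified types are illustrative stand-ins, not the real
XFS structures (those live in fs/xfs/xfs_log_priv.h):

#include <linux/workqueue.h>
#include <linux/spinlock.h>

/* Simplified stand-in for struct xfs_cil. */
struct example_cil {
        spinlock_t              lock;          /* protects push_seq */
        unsigned long           push_seq;      /* last sequence queued for push */
        unsigned long           current_seq;   /* sequence currently being built */
        struct work_struct      push_work;     /* INIT_WORK()ed at init time,
                                                * runs the actual CIL push */
};

extern struct workqueue_struct *example_cil_wq;

/*
 * Called from transaction commit once the CIL has grown past its space
 * limit: record the sequence to push and hand the work to the workqueue,
 * so the committing task returns without ever waiting on log IO.
 */
static void example_push_background(struct example_cil *cil)
{
        spin_lock(&cil->lock);
        cil->push_seq = cil->current_seq;
        queue_work(example_cil_wq, &cil->push_work);
        spin_unlock(&cil->lock);
}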
> 
> To do this, we need to keep track of the sequence number we have
> pushed work for. This avoids having many transaction commits
> attempting to schedule work for the same sequence, and ensures that
> we only ever have one push (background or forced) in progress at a
> time. It also means that we don't need to take the CIL lock in write
> mode to check for potential background push races, which reduces
> lock contention.
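A sketch of that sequence-tracking guard, continuing the illustrative
example_cil structure from above (again, not the real XFS code):

/*
 * Return true if this sequence still needs a push. push_seq (xc_push_seq
 * in the patch) records the last sequence a push was scheduled for, so
 * racing commits and forces collapse into a single push. queue_work()
 * is itself a no-op while the work item is still pending, and this
 * check catches the rest.
 */
static bool example_should_push(struct example_cil *cil, unsigned long seq)
{
        bool push = false;

        spin_lock(&cil->lock);
        if (seq > cil->push_seq) {
                cil->push_seq = seq;
                push = true;
        }
        spin_unlock(&cil->lock);
        return push;
}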
> 
> To avoid potential issues with "smart" IO schedulers, don't use the
> workqueue for log force triggered flushes. Instead, do them directly
> so that the log IO is issued by the process doing the log force and
> hence doesn't get stuck behind IO elevator queue idling that would
> incorrectly delay log IO issued from the workqueue.
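The force path in the patch (xlog_cil_push_foreground) therefore looks
roughly like the following sketch, reusing the illustrative helpers above:

void example_cil_push(struct example_cil *cil);  /* does the real push */

/*
 * Log force: wait for any queued background push to complete, then push
 * directly from the forcing task's context so the log IO is submitted by
 * the process that needs it, not by a workqueue thread that an idling IO
 * scheduler might decide to delay.
 */
static void example_push_foreground(struct example_cil *cil,
                                    unsigned long seq)
{
        /* don't race with a pending background push */
        flush_work(&cil->push_work);

        if (example_should_push(cil, seq))
                example_cil_push(cil);  /* direct push, no workqueue */
}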
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_log_cil.c  |  241 ++++++++++++++++++++++++++++++-------------------
>  fs/xfs/xfs_log_priv.h |    4 +
>  fs/xfs/xfs_super.c    |    6 ++
>  3 files changed, 158 insertions(+), 93 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index d4fadbe..6a5a7ba 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -31,57 +31,7 @@
>  #include "xfs_alloc.h"
>  #include "xfs_discard.h"
>  
> -/*
> - * Perform initial CIL structure initialisation.
> - */
> -int
> -xlog_cil_init(
> -     struct log      *log)
> -{
> -     struct xfs_cil  *cil;
> -     struct xfs_cil_ctx *ctx;
> -
> -     cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
> -     if (!cil)
> -             return ENOMEM;
> -
> -     ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
> -     if (!ctx) {
> -             kmem_free(cil);
> -             return ENOMEM;
> -     }
> -
> -     INIT_LIST_HEAD(&cil->xc_cil);
> -     INIT_LIST_HEAD(&cil->xc_committing);
> -     spin_lock_init(&cil->xc_cil_lock);
> -     init_rwsem(&cil->xc_ctx_lock);
> -     init_waitqueue_head(&cil->xc_commit_wait);
> -
> -     INIT_LIST_HEAD(&ctx->committing);
> -     INIT_LIST_HEAD(&ctx->busy_extents);
> -     ctx->sequence = 1;
> -     ctx->cil = cil;
> -     cil->xc_ctx = ctx;
> -     cil->xc_current_sequence = ctx->sequence;
> -
> -     cil->xc_log = log;
> -     log->l_cilp = cil;
> -     return 0;
> -}
> -
> -void
> -xlog_cil_destroy(
> -     struct log      *log)
> -{
> -     if (log->l_cilp->xc_ctx) {
> -             if (log->l_cilp->xc_ctx->ticket)
> -                     xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
> -             kmem_free(log->l_cilp->xc_ctx);
> -     }
> -
> -     ASSERT(list_empty(&log->l_cilp->xc_cil));
> -     kmem_free(log->l_cilp);
> -}
> +struct workqueue_struct *xfs_cil_wq;
>  
>  /*
>   * Allocate a new ticket. Failing to get a new ticket makes it really hard to
> @@ -426,8 +376,7 @@ xlog_cil_committed(
>   */
>  STATIC int
>  xlog_cil_push(
> -     struct log              *log,
> -     xfs_lsn_t               push_seq)
> +     struct log              *log)
>  {
>       struct xfs_cil          *cil = log->l_cilp;
>       struct xfs_log_vec      *lv;
> @@ -443,39 +392,35 @@ xlog_cil_push(
>       struct xfs_log_iovec    lhdr;
>       struct xfs_log_vec      lvhdr = { NULL };
>       xfs_lsn_t               commit_lsn;
> +     xfs_lsn_t               push_seq;
>  
>       if (!cil)
>               return 0;
>  
> -     ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
> -
>       new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
>       new_ctx->ticket = xlog_cil_ticket_alloc(log);
>  
> -     /*
> -      * Lock out transaction commit, but don't block for background pushes
> -      * unless we are well over the CIL space limit. See the definition of
> -      * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
> -      * used here.
> -      */
> -     if (!down_write_trylock(&cil->xc_ctx_lock)) {
> -             if (!push_seq &&
> -                 cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
> -                     goto out_free_ticket;
> -             down_write(&cil->xc_ctx_lock);
> -     }
> +     down_write(&cil->xc_ctx_lock);
>       ctx = cil->xc_ctx;
>  
> -     /* check if we've anything to push */
> -     if (list_empty(&cil->xc_cil))
> -             goto out_skip;
> +     spin_lock(&cil->xc_cil_lock);
> +     push_seq = cil->xc_push_seq;
> +     ASSERT(push_seq > 0 && push_seq <= ctx->sequence);
>  
> -     /* check for spurious background flush */
> -     if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> +     /*
> +      * Check if we've anything to push. If there is nothing, then we don't
> +      * move on to a new sequence number and so we have to be able to push
> +      * this sequence again later.
> +      */
> +     if (list_empty(&cil->xc_cil)) {
> +             cil->xc_push_seq = 0;
> +             spin_unlock(&cil->xc_cil_lock);
>               goto out_skip;
> +     }
> +     spin_unlock(&cil->xc_cil_lock);
>  
>       /* check for a previously pushed sequence */
> -     if (push_seq && push_seq < cil->xc_ctx->sequence)
> +     if (push_seq < cil->xc_ctx->sequence)
>               goto out_skip;
>  
>       /*
> @@ -629,7 +574,6 @@ restart:
>  
>  out_skip:
>       up_write(&cil->xc_ctx_lock);
> -out_free_ticket:
>       xfs_log_ticket_put(new_ctx->ticket);
>       kmem_free(new_ctx);
>       return 0;
> @@ -641,6 +585,80 @@ out_abort:
>       return XFS_ERROR(EIO);
>  }
>  
> +static void
> +xlog_cil_push_work(
> +     struct work_struct      *work)
> +{
> +     struct xfs_cil          *cil = container_of(work, struct xfs_cil,
> +                                                     xc_push_work);
> +     xlog_cil_push(cil->xc_log);
> +}
> +
> +/*
> + * We need to push the CIL every so often so we don't cache more than we can fit in
> + * the log. The limit really is that a checkpoint can't be more than half the
> + * log (the current checkpoint is not allowed to overwrite the previous
> + * checkpoint), but commit latency and memory usage limit this to a smaller
> + * size.
> + */
> +static void
> +xlog_cil_push_background(
> +     struct log      *log)
> +{
> +     struct xfs_cil  *cil = log->l_cilp;
> +
> +     /*
> +      * The cil won't be empty because we are called while holding the
> +      * context lock so whatever we added to the CIL will still be there
> +      */
> +     ASSERT(!list_empty(&cil->xc_cil));
> +
> +     /*
> +      * don't do a background push if we haven't used up all the
> +      * space available yet.
> +      */
> +     if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> +             return;
> +
> +     spin_lock(&cil->xc_cil_lock);
> +     cil->xc_push_seq = cil->xc_current_sequence;
> +     queue_work(xfs_cil_wq, &cil->xc_push_work);
> +     spin_unlock(&cil->xc_cil_lock);
> +
> +}
> +
> +static void
> +xlog_cil_push_foreground(
> +     struct log      *log,
> +     xfs_lsn_t       push_seq)
> +{
> +     struct xfs_cil  *cil = log->l_cilp;
> +
> +     if (!cil)
> +             return;
> +
> +     ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
> +
> +     /* start on any pending background push to minimise wait time on it */
> +     flush_work(&cil->xc_push_work);
> +
> +     /*
> +      * If the CIL is empty or we've already pushed the sequence then
> +      * there's no work we need to do.
> +      */
> +     spin_lock(&cil->xc_cil_lock);
> +     if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
> +             spin_unlock(&cil->xc_cil_lock);
> +             return;
> +     }
> +
> +     cil->xc_push_seq = push_seq;
> +     spin_unlock(&cil->xc_cil_lock);
> +
> +     /* do the push now */
> +     xlog_cil_push(log);
> +}
> +
>  /*
>   * Commit a transaction with the given vector to the Committed Item List.
>   *
> @@ -667,7 +685,6 @@ xfs_log_commit_cil(
>  {
>       struct log              *log = mp->m_log;
>       int                     log_flags = 0;
> -     int                     push = 0;
>       struct xfs_log_vec      *log_vector;
>  
>       if (flags & XFS_TRANS_RELEASE_LOG_RES)
> @@ -719,21 +736,9 @@ xfs_log_commit_cil(
>        */
>       xfs_trans_free_items(tp, *commit_lsn, 0);
>  
> -     /* check for background commit before unlock */
> -     if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
> -             push = 1;
> +     xlog_cil_push_background(log);
>  
>       up_read(&log->l_cilp->xc_ctx_lock);
> -
> -     /*
> -      * We need to push CIL every so often so we don't cache more than we
> -      * can fit in the log. The limit really is that a checkpoint can't be
> -      * more than half the log (the current checkpoint is not allowed to
> -      * overwrite the previous checkpoint), but commit latency and memory
> -      * usage limit this to a smaller size in most cases.
> -      */
> -     if (push)
> -             xlog_cil_push(log, 0);
>       return 0;
>  }
>  
> @@ -746,9 +751,6 @@ xfs_log_commit_cil(
>   *
>   * We return the current commit lsn to allow the callers to determine if an
>   * iclog flush is necessary following this call.
> - *
> - * XXX: Initially, just push the CIL unconditionally and return whatever
> - * commit lsn is there. It'll be empty, so this is broken for now.
>   */
>  xfs_lsn_t
>  xlog_cil_force_lsn(
> @@ -766,8 +768,7 @@ xlog_cil_force_lsn(
>        * xlog_cil_push() handles racing pushes for the same sequence,
>        * so no need to deal with it here.
>        */
> -     if (sequence == cil->xc_current_sequence)
> -             xlog_cil_push(log, sequence);
> +     xlog_cil_push_foreground(log, sequence);
>  
>       /*
>        * See if we can find a previous sequence still committing.
> @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt(
>               return false;
>       return true;
>  }
> +
> +/*
> + * Perform initial CIL structure initialisation.
> + */
> +int
> +xlog_cil_init(
> +     struct log      *log)
> +{
> +     struct xfs_cil  *cil;
> +     struct xfs_cil_ctx *ctx;
> +
> +     cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
> +     if (!cil)
> +             return ENOMEM;
> +
> +     ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
> +     if (!ctx) {
> +             kmem_free(cil);
> +             return ENOMEM;
> +     }
> +
> +     INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
> +     INIT_LIST_HEAD(&cil->xc_cil);
> +     INIT_LIST_HEAD(&cil->xc_committing);
> +     spin_lock_init(&cil->xc_cil_lock);
> +     init_rwsem(&cil->xc_ctx_lock);
> +     init_waitqueue_head(&cil->xc_commit_wait);
> +
> +     INIT_LIST_HEAD(&ctx->committing);
> +     INIT_LIST_HEAD(&ctx->busy_extents);
> +     ctx->sequence = 1;
> +     ctx->cil = cil;
> +     cil->xc_ctx = ctx;
> +     cil->xc_current_sequence = ctx->sequence;
> +
> +     cil->xc_log = log;
> +     log->l_cilp = cil;
> +     return 0;
> +}
> +
> +void
> +xlog_cil_destroy(
> +     struct log      *log)
> +{
> +     if (log->l_cilp->xc_ctx) {
> +             if (log->l_cilp->xc_ctx->ticket)
> +                     xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
> +             kmem_free(log->l_cilp->xc_ctx);
> +     }
> +
> +     ASSERT(list_empty(&log->l_cilp->xc_cil));
> +     kmem_free(log->l_cilp);
> +}
> +
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 2152900..ea8c076 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -417,8 +417,12 @@ struct xfs_cil {
>       struct list_head        xc_committing;
>       wait_queue_head_t       xc_commit_wait;
>       xfs_lsn_t               xc_current_sequence;
> +     struct work_struct      xc_push_work;
> +     xfs_lsn_t               xc_push_seq;
>  };
>  
> +extern struct workqueue_struct *xfs_cil_wq;
> +
>  /*
>   * The amount of log space we allow the CIL to aggregate is difficult to size.
>   * Whatever we choose, we have to make sure we can get a reservation for the
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index aef50ab..c5059f5 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1634,8 +1634,13 @@ xfs_init_workqueues(void)
>       if (!xfs_alloc_wq)
>               goto out_destroy_syncd;
>  
> +     xfs_cil_wq = alloc_workqueue("xfs-cil", WQ_MEM_RECLAIM, 0);
> +     if (!xfs_cil_wq)
> +             goto out_destroy_alloc;
>       return 0;
>  
> +out_destroy_alloc:
> +     destroy_workqueue(xfs_alloc_wq);
>  out_destroy_syncd:
>       destroy_workqueue(xfs_syncd_wq);
>       return -ENOMEM;
> @@ -1644,6 +1649,7 @@ out_destroy_syncd:
>  STATIC void
>  xfs_destroy_workqueues(void)
>  {
> +     destroy_workqueue(xfs_cil_wq);
>       destroy_workqueue(xfs_alloc_wq);
>       destroy_workqueue(xfs_syncd_wq);
>  }
> -- 
> 1.7.9
> 
---end quoted text---
