[PATCH] xfs: Do background CIL flushes via a workqueue
Christoph Hellwig
hch at infradead.org
Tue Mar 27 09:31:27 CDT 2012
Vivek, does CFQ still need any hints for this sort of handoff?
On Tue, Mar 27, 2012 at 08:46:45PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner at redhat.com>
>
> Doing background CIL flushes adds significant latency to whatever
> async transaction triggers it. To avoid blocking async
> transactions on things like waiting for log buffer IO to complete,
> move the CIL push off into a workqueue. By moving the push work
> into a workqueue, we remove all the latency that the commit adds
> from the foreground transaction commit path. This also means that
> single-threaded workloads won't do the CIL push processing, leaving
> them more CPU to do more async transactions.
>
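For readers following along, the handoff described above is the stock
Linux workqueue pattern. A minimal sketch with hypothetical names
(struct foo, foo_wq, foo_do_push) -- not code from the patch itself:

#include <linux/workqueue.h>

struct foo {
        struct work_struct      push_work;
        /* ... state that the push operates on ... */
};

static struct workqueue_struct *foo_wq; /* created with alloc_workqueue() */

static void foo_do_push(struct foo *foo);       /* the expensive flush, defined elsewhere */

/* runs in kworker context, off the transaction commit fast path */
static void foo_push_worker(struct work_struct *work)
{
        struct foo *foo = container_of(work, struct foo, push_work);

        foo_do_push(foo);
}

/* fast path: schedule the push and return without waiting */
static void foo_push_background(struct foo *foo)
{
        queue_work(foo_wq, &foo->push_work);
}

INIT_WORK(&foo->push_work, foo_push_worker) ties the handler to the
work item at init time, and queue_work() is a no-op while the item is
still pending, so concurrent committers can't queue the same push twice.
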
> To do this, we need to keep track of the sequence number we have
> pushed work for. This avoids having many transaction commits
> attempting to schedule work for the same sequence, and ensures that
> we only ever have one push (background or forced) in progress at a
> time. It also means that we don't need to take the CIL lock in write
> mode to check for potential background push races, which reduces
> lock contention.
>
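The core of that dedup, condensed from xlog_cil_push_foreground() in
the patch below (same xc_cil_lock and xc_push_seq fields; only the
surrounding function is elided):

/*
 * xc_push_seq records the highest sequence a push has been scheduled
 * for; racing callers bail out instead of scheduling a second push.
 */
spin_lock(&cil->xc_cil_lock);
if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
        spin_unlock(&cil->xc_cil_lock);
        return;         /* nothing to push, or already in progress */
}
cil->xc_push_seq = push_seq;
spin_unlock(&cil->xc_cil_lock);
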
> To avoid potential issues with "smart" IO schedulers, don't use the
> workqueue for log-force-triggered flushes. Instead, do them directly
> so that the log IO is issued by the process doing the log force,
> and hence doesn't get stuck behind IO elevator queue idling that
> would incorrectly delay log IO issued from the workqueue.
>
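Concretely, the log force path first drains any queued background push
with flush_work() and then calls xlog_cil_push() synchronously, so the
log IO is submitted from (and accounted to) the forcing task rather
than a kworker; see xlog_cil_push_foreground() below. In outline:

/* paraphrasing xlog_cil_push_foreground() from the patch below */
flush_work(&cil->xc_push_work); /* let any background push finish first */
/* bail out if the CIL is empty or this sequence was already pushed */
xlog_cil_push(log);             /* push, and issue the log IO, in this context */
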
> Signed-off-by: Dave Chinner <dchinner at redhat.com>
> ---
> fs/xfs/xfs_log_cil.c  | 241 ++++++++++++++++++++++++++++++-------------------
> fs/xfs/xfs_log_priv.h |   4 +
> fs/xfs/xfs_super.c    |   6 ++
> 3 files changed, 158 insertions(+), 93 deletions(-)
>
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index d4fadbe..6a5a7ba 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -31,57 +31,7 @@
> #include "xfs_alloc.h"
> #include "xfs_discard.h"
>
> -/*
> - * Perform initial CIL structure initialisation.
> - */
> -int
> -xlog_cil_init(
> - struct log *log)
> -{
> - struct xfs_cil *cil;
> - struct xfs_cil_ctx *ctx;
> -
> - cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
> - if (!cil)
> - return ENOMEM;
> -
> - ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
> - if (!ctx) {
> - kmem_free(cil);
> - return ENOMEM;
> - }
> -
> - INIT_LIST_HEAD(&cil->xc_cil);
> - INIT_LIST_HEAD(&cil->xc_committing);
> - spin_lock_init(&cil->xc_cil_lock);
> - init_rwsem(&cil->xc_ctx_lock);
> - init_waitqueue_head(&cil->xc_commit_wait);
> -
> - INIT_LIST_HEAD(&ctx->committing);
> - INIT_LIST_HEAD(&ctx->busy_extents);
> - ctx->sequence = 1;
> - ctx->cil = cil;
> - cil->xc_ctx = ctx;
> - cil->xc_current_sequence = ctx->sequence;
> -
> - cil->xc_log = log;
> - log->l_cilp = cil;
> - return 0;
> -}
> -
> -void
> -xlog_cil_destroy(
> - struct log *log)
> -{
> - if (log->l_cilp->xc_ctx) {
> - if (log->l_cilp->xc_ctx->ticket)
> - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
> - kmem_free(log->l_cilp->xc_ctx);
> - }
> -
> - ASSERT(list_empty(&log->l_cilp->xc_cil));
> - kmem_free(log->l_cilp);
> -}
> +struct workqueue_struct *xfs_cil_wq;
>
> /*
> * Allocate a new ticket. Failing to get a new ticket makes it really hard to
> @@ -426,8 +376,7 @@ xlog_cil_committed(
> */
> STATIC int
> xlog_cil_push(
> - struct log *log,
> - xfs_lsn_t push_seq)
> + struct log *log)
> {
> struct xfs_cil *cil = log->l_cilp;
> struct xfs_log_vec *lv;
> @@ -443,39 +392,35 @@ xlog_cil_push(
> struct xfs_log_iovec lhdr;
> struct xfs_log_vec lvhdr = { NULL };
> xfs_lsn_t commit_lsn;
> + xfs_lsn_t push_seq;
>
> if (!cil)
> return 0;
>
> - ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
> -
> new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
> new_ctx->ticket = xlog_cil_ticket_alloc(log);
>
> - /*
> - * Lock out transaction commit, but don't block for background pushes
> - * unless we are well over the CIL space limit. See the definition of
> - * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
> - * used here.
> - */
> - if (!down_write_trylock(&cil->xc_ctx_lock)) {
> - if (!push_seq &&
> - cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
> - goto out_free_ticket;
> - down_write(&cil->xc_ctx_lock);
> - }
> + down_write(&cil->xc_ctx_lock);
> ctx = cil->xc_ctx;
>
> - /* check if we've anything to push */
> - if (list_empty(&cil->xc_cil))
> - goto out_skip;
> + spin_lock(&cil->xc_cil_lock);
> + push_seq = cil->xc_push_seq;
> + ASSERT(push_seq > 0 && push_seq <= ctx->sequence);
>
> - /* check for spurious background flush */
> - if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> + /*
> + * Check if we've anything to push. If there is nothing, then we don't
> + * move on to a new sequence number and so we have to be able to push
> + * this sequence again later.
> + */
> + if (list_empty(&cil->xc_cil)) {
> + cil->xc_push_seq = 0;
> + spin_unlock(&cil->xc_cil_lock);
> goto out_skip;
> + }
> + spin_unlock(&cil->xc_cil_lock);
>
> /* check for a previously pushed sequence */
> - if (push_seq && push_seq < cil->xc_ctx->sequence)
> + if (push_seq < cil->xc_ctx->sequence)
> goto out_skip;
>
> /*
> @@ -629,7 +574,6 @@ restart:
>
> out_skip:
> up_write(&cil->xc_ctx_lock);
> -out_free_ticket:
> xfs_log_ticket_put(new_ctx->ticket);
> kmem_free(new_ctx);
> return 0;
> @@ -641,6 +585,80 @@ out_abort:
> return XFS_ERROR(EIO);
> }
>
> +static void
> +xlog_cil_push_work(
> + struct work_struct *work)
> +{
> + struct xfs_cil *cil = container_of(work, struct xfs_cil,
> + xc_push_work);
> + xlog_cil_push(cil->xc_log);
> +}
> +
> +/*
> + * We need to push the CIL every so often so we don't cache more than we can fit in
> + * the log. The limit really is that a checkpoint can't be more than half the
> + * log (the current checkpoint is not allowed to overwrite the previous
> + * checkpoint), but commit latency and memory usage limit this to a smaller
> + * size.
> + */
> +static void
> +xlog_cil_push_background(
> + struct log *log)
> +{
> + struct xfs_cil *cil = log->l_cilp;
> +
> + /*
> + * The CIL won't be empty because we are called while holding the
> + * context lock, so whatever we added to the CIL will still be there.
> + */
> + ASSERT(!list_empty(&cil->xc_cil));
> +
> + /*
> + * don't do a background push if we haven't used up all the
> + * space available yet.
> + */
> + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> + return;
> +
> + spin_lock(&cil->xc_cil_lock);
> + cil->xc_push_seq = cil->xc_current_sequence;
> + queue_work(xfs_cil_wq, &cil->xc_push_work);
> + spin_unlock(&cil->xc_cil_lock);
> +}
> +
> +static void
> +xlog_cil_push_foreground(
> + struct log *log,
> + xfs_lsn_t push_seq)
> +{
> + struct xfs_cil *cil = log->l_cilp;
> +
> + if (!cil)
> + return;
> +
> + ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
> +
> + /* start on any pending background push to minimise wait time on it */
> + flush_work(&cil->xc_push_work);
> +
> + /*
> + * If the CIL is empty or we've already pushed the sequence then
> + * there's no work we need to do.
> + */
> + spin_lock(&cil->xc_cil_lock);
> + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
> + spin_unlock(&cil->xc_cil_lock);
> + return;
> + }
> +
> + cil->xc_push_seq = push_seq;
> + spin_unlock(&cil->xc_cil_lock);
> +
> + /* do the push now */
> + xlog_cil_push(log);
> +}
> +
> /*
> * Commit a transaction with the given vector to the Committed Item List.
> *
> @@ -667,7 +685,6 @@ xfs_log_commit_cil(
> {
> struct log *log = mp->m_log;
> int log_flags = 0;
> - int push = 0;
> struct xfs_log_vec *log_vector;
>
> if (flags & XFS_TRANS_RELEASE_LOG_RES)
> @@ -719,21 +736,9 @@ xfs_log_commit_cil(
> */
> xfs_trans_free_items(tp, *commit_lsn, 0);
>
> - /* check for background commit before unlock */
> - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
> - push = 1;
> + xlog_cil_push_background(log);
>
> up_read(&log->l_cilp->xc_ctx_lock);
> -
> - /*
> - * We need to push CIL every so often so we don't cache more than we
> - * can fit in the log. The limit really is that a checkpoint can't be
> - * more than half the log (the current checkpoint is not allowed to
> - * overwrite the previous checkpoint), but commit latency and memory
> - * usage limit this to a smaller size in most cases.
> - */
> - if (push)
> - xlog_cil_push(log, 0);
> return 0;
> }
>
> @@ -746,9 +751,6 @@ xfs_log_commit_cil(
> *
> * We return the current commit lsn to allow the callers to determine if a
> * iclog flush is necessary following this call.
> - *
> - * XXX: Initially, just push the CIL unconditionally and return whatever
> - * commit lsn is there. It'll be empty, so this is broken for now.
> */
> xfs_lsn_t
> xlog_cil_force_lsn(
> @@ -766,8 +768,7 @@ xlog_cil_force_lsn(
> * xlog_cil_push() handles racing pushes for the same sequence,
> * so no need to deal with it here.
> */
> - if (sequence == cil->xc_current_sequence)
> - xlog_cil_push(log, sequence);
> + xlog_cil_push_foreground(log, sequence);
>
> /*
> * See if we can find a previous sequence still committing.
> @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt(
> return false;
> return true;
> }
> +
> +/*
> + * Perform initial CIL structure initialisation.
> + */
> +int
> +xlog_cil_init(
> + struct log *log)
> +{
> + struct xfs_cil *cil;
> + struct xfs_cil_ctx *ctx;
> +
> + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
> + if (!cil)
> + return ENOMEM;
> +
> + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
> + if (!ctx) {
> + kmem_free(cil);
> + return ENOMEM;
> + }
> +
> + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
> + INIT_LIST_HEAD(&cil->xc_cil);
> + INIT_LIST_HEAD(&cil->xc_committing);
> + spin_lock_init(&cil->xc_cil_lock);
> + init_rwsem(&cil->xc_ctx_lock);
> + init_waitqueue_head(&cil->xc_commit_wait);
> +
> + INIT_LIST_HEAD(&ctx->committing);
> + INIT_LIST_HEAD(&ctx->busy_extents);
> + ctx->sequence = 1;
> + ctx->cil = cil;
> + cil->xc_ctx = ctx;
> + cil->xc_current_sequence = ctx->sequence;
> +
> + cil->xc_log = log;
> + log->l_cilp = cil;
> + return 0;
> +}
> +
> +void
> +xlog_cil_destroy(
> + struct log *log)
> +{
> + if (log->l_cilp->xc_ctx) {
> + if (log->l_cilp->xc_ctx->ticket)
> + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
> + kmem_free(log->l_cilp->xc_ctx);
> + }
> +
> + ASSERT(list_empty(&log->l_cilp->xc_cil));
> + kmem_free(log->l_cilp);
> +}
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 2152900..ea8c076 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -417,8 +417,12 @@ struct xfs_cil {
> struct list_head xc_committing;
> wait_queue_head_t xc_commit_wait;
> xfs_lsn_t xc_current_sequence;
> + struct work_struct xc_push_work;
> + xfs_lsn_t xc_push_seq;
> };
>
> +extern struct workqueue_struct *xfs_cil_wq;
> +
> /*
> * The amount of log space we allow the CIL to aggregate is difficult to size.
> * Whatever we choose, we have to make sure we can get a reservation for the
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index aef50ab..c5059f5 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1634,8 +1634,13 @@ xfs_init_workqueues(void)
> if (!xfs_alloc_wq)
> goto out_destroy_syncd;
>
> + xfs_cil_wq = alloc_workqueue("xfscil", WQ_MEM_RECLAIM, 0);
> + if (!xfs_cil_wq)
> + goto out_destroy_alloc;
> return 0;
>
> +out_destroy_alloc:
> + destroy_workqueue(xfs_alloc_wq);
> out_destroy_syncd:
> destroy_workqueue(xfs_syncd_wq);
> return -ENOMEM;
> @@ -1644,6 +1649,7 @@ out_destroy_syncd:
> STATIC void
> xfs_destroy_workqueues(void)
> {
> + destroy_workqueue(xfs_cil_wq);
> destroy_workqueue(xfs_alloc_wq);
> destroy_workqueue(xfs_syncd_wq);
> }
> --
> 1.7.9
>
---end quoted text---