[PATCH 10/14] xfs: convert grant head manipulations to lockless algorithm
Christoph Hellwig
hch at infradead.org
Wed Dec 1 07:15:13 CST 2010
On Mon, Nov 29, 2010 at 12:38:28PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner at redhat.com>
>
> The only thing that the grant lock remains to protect is the grant head
> manipulations when adding or removing space from the log. These calculations
> are already based on atomic variables, so we can already update them safely
> without locks. However, the grant head manipulations require atomic multi-step
> calculations to be executed, which the algorithms currently don't allow.
>
> To make these multi-step calculations atomic, convert the algorithms to
> compare-and-exchange loops on the atomic variables. That is, we sample the old
> value, perform the calculation and use atomic64_cmpxchg() to attempt to update
> the head with the new value. If the head has not changed since we sampled it,
> it will succeed and we are done. Otherwise, we rerun the calculation again from
> a new sample of the head.
>
> This allows us to remove the grant lock from around all the grant head space
> manipulations, and that effectively removes the grant lock from the log
> completely.
>
> Signed-off-by: Dave Chinner <dchinner at redhat.com>
> ---
> fs/xfs/xfs_log.c | 133 ++++++++++++++++++++++++++++++------------------------
> 1 files changed, 74 insertions(+), 59 deletions(-)
>
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index 8365496..50cf6af 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -111,6 +111,13 @@ STATIC int xlog_iclogs_empty(xlog_t *log);
> * is the byte offset into the log for the marker. Unlike the xfs_lsn_t, this
> * is held in bytes rather than basic blocks, even though it uses the
> * BLOCK_LSN() macro to extract it.
> + *
> + * We use a lock-free compare and exchange algorithm to atomically update
> + * the grant head. That is, we sample the current head, crack it, perform the
> + * calculation, recombine it into a new value, and then conditionally set the
> + * value back into the atomic variable only if it hasn't changed since we first
> + * sampled it. This provides atomic updates of the head, even though we do
> + * non-atomic, multi-step calculation to arrive at the new value.
> */
> static void
> __xlog_grant_sub_space(
> @@ -119,16 +126,23 @@ __xlog_grant_sub_space(
> int logsize)
> {
> xfs_lsn_t head_lsn = atomic64_read(head);
> - int cycle, space;
> + xfs_lsn_t old, new;
>
> - cycle = CYCLE_LSN(head_lsn);
> - space = BLOCK_LSN(head_lsn);
> - space -= bytes;
> - if (space < 0) {
> - cycle--;
> - space += logsize;
> - }
> - atomic64_set(head, xlog_assign_lsn(cycle, space));
> + do {
> + int cycle, space;
> +
> + cycle = CYCLE_LSN(head_lsn);
> + space = BLOCK_LSN(head_lsn);
> + space -= bytes;
> + if (space < 0) {
> + cycle--;
> + space += logsize;
> + }
> +
> + old = head_lsn;
> + new = xlog_assign_lsn(cycle, space);
> + head_lsn = atomic64_cmpxchg(head, old, new);
> + } while (head_lsn != old);
> }
>
> static void
> @@ -138,18 +152,25 @@ __xlog_grant_add_space(
> int logsize)
> {
> xfs_lsn_t head_lsn = atomic64_read(head);
> - int cycle, space, tmp;
> + xfs_lsn_t old, new;
>
> - cycle = CYCLE_LSN(head_lsn);
> - space = BLOCK_LSN(head_lsn);
> - tmp = logsize - space;
> - if (tmp > bytes)
> - space += bytes;
> - else {
> - cycle++;
> - space = bytes - tmp;
> - }
> - atomic64_set(head, xlog_assign_lsn(cycle, space));
> + do {
> + int cycle, space, tmp;
> +
> + cycle = CYCLE_LSN(head_lsn);
> + space = BLOCK_LSN(head_lsn);
> + tmp = logsize - space;
> + if (tmp > bytes)
> + space += bytes;
> + else {
> + cycle++;
> + space = bytes - tmp;
> + }
> +
> + old = head_lsn;
> + new = xlog_assign_lsn(cycle, space);
> + head_lsn = atomic64_cmpxchg(head, old, new);
> + } while (head_lsn != old);
> }
>
> static inline void
> @@ -360,11 +381,9 @@ xfs_log_reserve(
>
> trace_xfs_log_reserve(log, internal_ticket);
>
> - spin_lock(&log->l_grant_lock);
> xlog_grant_push_ail(log, atomic64_read(&log->l_tail_lsn),
> atomic64_read(&log->l_last_sync_lsn),
> internal_ticket->t_unit_res);
> - spin_unlock(&log->l_grant_lock);
> retval = xlog_regrant_write_log_space(log, internal_ticket);
> } else {
> /* may sleep if need to allocate more tickets */
> @@ -378,12 +397,10 @@ xfs_log_reserve(
>
> trace_xfs_log_reserve(log, internal_ticket);
>
> - spin_lock(&log->l_grant_lock);
> xlog_grant_push_ail(log, atomic64_read(&log->l_tail_lsn),
> atomic64_read(&log->l_last_sync_lsn),
> (internal_ticket->t_unit_res *
> internal_ticket->t_cnt));
> - spin_unlock(&log->l_grant_lock);
> retval = xlog_grant_log_space(log, internal_ticket);
> }
>
> @@ -1376,9 +1393,7 @@ xlog_sync(xlog_t *log,
> roundoff < BBTOB(1)));
>
> /* move grant heads by roundoff in sync */
> - spin_lock(&log->l_grant_lock);
> xlog_grant_add_space(log, roundoff);
> - spin_unlock(&log->l_grant_lock);
>
> /* put cycle number in every block */
> xlog_pack_data(log, iclog, roundoff);
> @@ -2618,13 +2633,11 @@ redo:
> }
>
> /* we've got enough space */
> - spin_lock(&log->l_grant_lock);
> xlog_grant_add_space(log, need_bytes);
>
> trace_xfs_log_grant_exit(log, tic);
> xlog_verify_grant_tail(log);
> xlog_verify_grant_head(log, 1);
> - spin_unlock(&log->l_grant_lock);
> return 0;
>
> error_return:
> @@ -2752,13 +2765,11 @@ redo:
> }
>
> /* we've got enough space */
> - spin_lock(&log->l_grant_lock);
> xlog_grant_add_space_write(log, need_bytes);
>
> trace_xfs_log_regrant_write_exit(log, tic);
> xlog_verify_grant_tail(log);
> xlog_verify_grant_head(log, 1);
> - spin_unlock(&log->l_grant_lock);
> return 0;
>
>
> @@ -2779,23 +2790,23 @@ redo:
> }
>
>
> -/* The first cnt-1 times through here we don't need to
> - * move the grant write head because the permanent
> - * reservation has reserved cnt times the unit amount.
> - * Release part of current permanent unit reservation and
> - * reset current reservation to be one units worth. Also
> - * move grant reservation head forward.
> +/*
> + * The first cnt-1 times through here we don't need to move the grant write
> + * head because the permanent reservation has reserved cnt times the unit
> + * amount. Release part of current permanent unit reservation and reset
> > + * current reservation to be one unit's worth. Also move grant reservation head
> + * forward.
> */
> STATIC void
> -xlog_regrant_reserve_log_space(xlog_t *log,
> - xlog_ticket_t *ticket)
> +xlog_regrant_reserve_log_space(
> + struct log *log,
> + struct xlog_ticket *ticket)
> {
> trace_xfs_log_regrant_reserve_enter(log, ticket);
>
> if (ticket->t_cnt > 0)
> ticket->t_cnt--;
>
> - spin_lock(&log->l_grant_lock);
> xlog_grant_sub_space(log, ticket->t_curr_res);
> ticket->t_curr_res = ticket->t_unit_res;
> xlog_tic_reset_res(ticket);
> @@ -2805,20 +2816,17 @@ xlog_regrant_reserve_log_space(xlog_t *log,
> xlog_verify_grant_head(log, 1);
>
> /* just return if we still have some of the pre-reserved space */
> - if (ticket->t_cnt > 0) {
> - spin_unlock(&log->l_grant_lock);
> + if (ticket->t_cnt > 0)
> return;
> - }
>
> xlog_grant_add_space_reserve(log, ticket->t_unit_res);
>
> trace_xfs_log_regrant_reserve_exit(log, ticket);
>
> xlog_verify_grant_head(log, 0);
> - spin_unlock(&log->l_grant_lock);
> ticket->t_curr_res = ticket->t_unit_res;
> xlog_tic_reset_res(ticket);
> -} /* xlog_regrant_reserve_log_space */
> +}
>
>
> /*
> @@ -2836,34 +2844,33 @@ xlog_regrant_reserve_log_space(xlog_t *log,
> * in the current reservation field.
> */
> STATIC void
> -xlog_ungrant_log_space(xlog_t *log,
> - xlog_ticket_t *ticket)
> +xlog_ungrant_log_space(
> + struct log *log,
> + struct xlog_ticket *ticket)
> {
> - if (ticket->t_cnt > 0)
> - ticket->t_cnt--;
> + int space;
>
> - spin_lock(&log->l_grant_lock);
> trace_xfs_log_ungrant_enter(log, ticket);
> + if (ticket->t_cnt > 0)
> + ticket->t_cnt--;
>
> - xlog_grant_sub_space(log, ticket->t_curr_res);
> -
> - trace_xfs_log_ungrant_sub(log, ticket);
> -
> - /* If this is a permanent reservation ticket, we may be able to free
> + /*
> + * If this is a permanent reservation ticket, we may be able to free
> * up more space based on the remaining count.
> */
> + space = ticket->t_curr_res;
> if (ticket->t_cnt > 0) {
> ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
> - xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt);
> + space += ticket->t_unit_res * ticket->t_cnt;
> }
> + trace_xfs_log_ungrant_sub(log, ticket);
> + xlog_grant_sub_space(log, space);
>
> trace_xfs_log_ungrant_exit(log, ticket);
> -
> xlog_verify_grant_head(log, 1);
> - spin_unlock(&log->l_grant_lock);
> - xfs_log_move_tail(log->l_mp, 1);
> -} /* xlog_ungrant_log_space */
>
> + xfs_log_move_tail(log->l_mp, 1);
> +}
>
> /*
> * Flush iclog to disk if this is the last reference to the given iclog and
> @@ -3478,11 +3485,18 @@ xlog_verify_dest_ptr(
> xlog_panic("xlog_verify_dest_ptr: invalid ptr");
> }
>
> +/*
> + * XXX: because we cannot atomically sample both the reserve and write heads,
> + * we cannot verify them reliably as they may be sampled in the middle of
> + * racing modifications. Hence just taking snapshots of the heads can give an
> > + * incorrect view of the state of the log. Hence just disable this check for now.
> + */
> STATIC void
> xlog_verify_grant_head(
I can't see any way to keep this check with the atomic reserve/write
heads, so we might as well remove it entirely.
Otherwise the patch looks good,
Reviewed-by: Christoph Hellwig <hch at lst.de>
More information about the xfs
mailing list