xfs
[Top] [All Lists]

Re: [PATCH] xfs: fix race while discarding buffers [V4]

To: Carlos Maiolino <cmaiolino@xxxxxxxxxx>
Subject: Re: [PATCH] xfs: fix race while discarding buffers [V4]
From: Ben Myers <bpm@xxxxxxx>
Date: Mon, 20 Aug 2012 15:51:34 -0500
Cc: xfs@xxxxxxxxxxx
In-reply-to: <1344621711-8049-1-git-send-email-cmaiolino@xxxxxxxxxx>
References: <1344621711-8049-1-git-send-email-cmaiolino@xxxxxxxxxx>
User-agent: Mutt/1.5.20 (2009-06-14)
Hi Carlos,

On Fri, Aug 10, 2012 at 03:01:51PM -0300, Carlos Maiolino wrote:
> While xfs_buftarg_shrink() is freeing buffers from the dispose list (filled 
> with
> buffers from lru list), there is a possibility to have xfs_buf_stale() racing
> with it, and removing buffers from dispose list before xfs_buftarg_shrink() 
> does
> it.
> 
> This happens because xfs_buftarg_shrink() handle the dispose list without
> locking and the test condition in xfs_buf_stale() checks for the buffer being 
> in
> *any* list:
> 
> if (!list_empty(&bp->b_lru)
                             )

That's cruel and unusual punishment.
 
> If the buffer happens to be on dispose list, this causes the buffer counter of
> lru list (btp->bt_lru_nr) to be decremented twice (once in 
> xfs_buftarg_shrink()
> and another in xfs_buf_stale()) causing a wrong account usage of the lru list.
> 
> This may cause xfs_buftarg_shrink() to return a wrong value to the memory
> shrinker shrink_slab(), and such account error may also cause an underflowed
> value to be returned; since the counter is lower than the current number of
> items in the lru list, a decrement may happen when the counter is 0, causing
> an underflow on the counter.
> 
> The fix uses a new flag field (and a new buffer flag) to serialize buffer
> handling during the shrink process. The new flag field has been designed to 
> use
> btp->bt_lru_lock/unlock instead of xfs_buf_lock/unlock mechanism.
> 
> dchinner, sandeen, aquini and aris also deserve credits for this.
>
> Signed-off-by: Carlos Maiolino <cmaiolino@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_buf.c |  5 ++++-
>  fs/xfs/xfs_buf.h | 41 ++++++++++++++++++++++++-----------------
>  2 files changed, 28 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index d7a9dd7..933b793 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -96,6 +96,7 @@ xfs_buf_lru_add(
>               atomic_inc(&bp->b_hold);
>               list_add_tail(&bp->b_lru, &btp->bt_lru);
>               btp->bt_lru_nr++;
> +             bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
>       }
>       spin_unlock(&btp->bt_lru_lock);
>  }
> @@ -154,7 +155,8 @@ xfs_buf_stale(
>               struct xfs_buftarg *btp = bp->b_target;
>  
>               spin_lock(&btp->bt_lru_lock);
> -             if (!list_empty(&bp->b_lru)) {
> +             if (!list_empty(&bp->b_lru) &&
> +                 !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
>                       list_del_init(&bp->b_lru);
>                       btp->bt_lru_nr--;
>                       atomic_dec(&bp->b_hold);
> @@ -1501,6 +1503,7 @@ xfs_buftarg_shrink(
>                */
>               list_move(&bp->b_lru, &dispose);
>               btp->bt_lru_nr--;
> +             bp->b_lru_flags |= _XBF_LRU_DISPOSE;
>       }
>       spin_unlock(&btp->bt_lru_lock);
>  
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index d03b73b..7c0b6a0 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -38,27 +38,28 @@ typedef enum {
>       XBRW_ZERO = 3,                  /* Zero target memory */
>  } xfs_buf_rw_t;
>  
> -#define XBF_READ     (1 << 0) /* buffer intended for reading from device */
> -#define XBF_WRITE    (1 << 1) /* buffer intended for writing to device */
> -#define XBF_READ_AHEAD       (1 << 2) /* asynchronous read-ahead */
> -#define XBF_ASYNC    (1 << 4) /* initiator will not wait for completion */
> -#define XBF_DONE     (1 << 5) /* all pages in the buffer uptodate */
> -#define XBF_STALE    (1 << 6) /* buffer has been staled, do not find it */
> +#define XBF_READ      (1 << 0) /* buffer intended for reading from device */
> +#define XBF_WRITE     (1 << 1) /* buffer intended for writing to device */
> +#define XBF_READ_AHEAD        (1 << 2) /* asynchronous read-ahead */
> +#define XBF_ASYNC     (1 << 4) /* initiator will not wait for completion */
> +#define XBF_DONE      (1 << 5) /* all pages in the buffer uptodate */
> +#define XBF_STALE     (1 << 6) /* buffer has been staled, do not find it */
>  
>  /* I/O hints for the BIO layer */
> -#define XBF_SYNCIO   (1 << 10)/* treat this buffer as synchronous I/O */
> -#define XBF_FUA              (1 << 11)/* force cache write through mode */
> -#define XBF_FLUSH    (1 << 12)/* flush the disk cache before a write */
> +#define XBF_SYNCIO    (1 << 10)/* treat this buffer as synchronous I/O */
> +#define XBF_FUA               (1 << 11)/* force cache write through mode */
> +#define XBF_FLUSH     (1 << 12)/* flush the disk cache before a write */
>  
>  /* flags used only as arguments to access routines */
> -#define XBF_TRYLOCK  (1 << 16)/* lock requested, but do not wait */
> -#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
> +#define XBF_TRYLOCK   (1 << 16)/* lock requested, but do not wait */
> +#define XBF_UNMAPPED  (1 << 17)/* do not map the buffer */
>  
>  /* flags used only internally */
> -#define _XBF_PAGES   (1 << 20)/* backed by refcounted pages */
> -#define _XBF_KMEM    (1 << 21)/* backed by heap memory */
> -#define _XBF_DELWRI_Q        (1 << 22)/* buffer on a delwri queue */
> -#define _XBF_COMPOUND        (1 << 23)/* compound buffer */
> +#define _XBF_PAGES    (1 << 20)/* backed by refcounted pages */
> +#define _XBF_KMEM     (1 << 21)/* backed by heap memory */
> +#define _XBF_DELWRI_Q         (1 << 22)/* buffer on a delwri queue */
> +#define _XBF_COMPOUND         (1 << 23)/* compound buffer */
> +#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */

It's nice to have them lined up like that.

>  
>  typedef unsigned int xfs_buf_flags_t;
>  
> @@ -72,12 +73,13 @@ typedef unsigned int xfs_buf_flags_t;
>       { XBF_SYNCIO,           "SYNCIO" }, \
>       { XBF_FUA,              "FUA" }, \
>       { XBF_FLUSH,            "FLUSH" }, \
> -     { XBF_TRYLOCK,          "TRYLOCK" },    /* should never be set */\
> +     { XBF_TRYLOCK,          "TRYLOCK" },    /* should never be set */\

...and you got rid of an extra space here.

>       { XBF_UNMAPPED,         "UNMAPPED" },   /* ditto */\
>       { _XBF_PAGES,           "PAGES" }, \
>       { _XBF_KMEM,            "KMEM" }, \
>       { _XBF_DELWRI_Q,        "DELWRI_Q" }, \
> -     { _XBF_COMPOUND,        "COMPOUND" }
> +     { _XBF_COMPOUND,        "COMPOUND" }, \
> +     { _XBF_LRU_DISPOSE,     "LRU_DISPOSE" }
>  
>  typedef struct xfs_buftarg {
>       dev_t                   bt_dev;
> @@ -124,7 +126,12 @@ typedef struct xfs_buf {
>       xfs_buf_flags_t         b_flags;        /* status flags */
>       struct semaphore        b_sema;         /* semaphore for lockables */
>  
> +     /*
> +      * concurrent access to b_lru and b_lru_flags are protected by
> +      * bt_lru_lock and not by b_sema
> +      */
>       struct list_head        b_lru;          /* lru list */
> +     xfs_buf_flags_t         b_lru_flags;    /* internal lru status flags */
>       wait_queue_head_t       b_waiters;      /* unpin waiters */
>       struct list_head        b_list;
>       struct xfs_perag        *b_pag;         /* contains rbtree root */

This looks pretty good to me.  Looks like you've been careful about the
locking.

What was the symptom that led to the discovery of this problem?

Reviewed-by: Ben Myers <bpm@xxxxxxx>

Regards,
Ben

<Prev in Thread] Current Thread [Next in Thread>