
Re: [PATCH] dio: track and serialise unaligned direct IO

To: Dave Chinner <david@xxxxxxxxxxxxx>
Subject: Re: [PATCH] dio: track and serialise unaligned direct IO
From: Badari Pulavarty <pbadari@xxxxxxxxx>
Date: Fri, 30 Jul 2010 10:43:09 -0700
Cc: linux-fsdevel@xxxxxxxxxxxxxxx, xfs@xxxxxxxxxxx, sandeen@xxxxxxxxxxx
In-reply-to: <1280443516-14448-1-git-send-email-david@xxxxxxxxxxxxx>
References: <1280443516-14448-1-git-send-email-david@xxxxxxxxxxxxx>
On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> If we get two unaligned direct IOs to the same filesystem block
> that is marked as a new allocation (i.e. buffer_new), then both IOs will
> zero the portion of the block they are not writing data to. As a
> result, when the IOs complete there will be a portion of the block
> that contains zeros from the last IO to complete rather than the
> data that should be there.
> 
> This is easily manifested by qemu using aio+dio with an unaligned
> guest filesystem - every IO is unaligned and filesystem corruption is
> encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> is also a simple reproducer.
> 
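
For illustration, a reproducer along these lines - a sketch only, not
the actual xfstest 240 - submits two sub-block unaligned AIO+DIO writes
into the same newly allocated 4k filesystem block (it assumes libaio
and 512 byte sectors, and may need to be looped to hit the race
window):

/* build: gcc -D_GNU_SOURCE repro.c -laio */
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb[2], *cbp[2] = { &cb[0], &cb[1] };
	struct io_event ev[2];
	void *buf[2];
	int fd, i;

	/* O_TRUNC so the first block is a new allocation (buffer_new) */
	fd = open("testfile", O_CREAT | O_TRUNC | O_RDWR | O_DIRECT, 0644);
	if (fd < 0 || io_setup(2, &ctx))
		return 1;
	for (i = 0; i < 2; i++) {
		/* O_DIRECT wants sector aligned memory */
		if (posix_memalign(&buf[i], 512, 512))
			return 1;
		memset(buf[i], 0xa5 + i, 512);
		/* 512 bytes at offsets 512 and 1024: both land in the
		 * same 4k block and both trigger sub-block zeroing */
		io_prep_pwrite(&cb[i], fd, buf[i], 512, 512 * (i + 1));
	}
	/* submit both at once so the zeroing writes can race */
	if (io_submit(ctx, 2, cbp) != 2)
		return 1;
	io_getevents(ctx, 2, 2, ev, NULL);
	/* without the fix, the zeroing from whichever IO completes
	 * last can overwrite the other IO's data in this block */
	io_destroy(ctx);
	close(fd);
	return 0;
}
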
> To avoid this problem, track unaligned IO that triggers sub-block zeroing
> and check new incoming unaligned IO that requires sub-block zeroing against
> that list. If we get an overlap where the start and end of unaligned IOs
> hit the same filesystem block, then we need to block the incoming IOs until
> the IO that is zeroing the block completes. The blocked IO can then continue
> without needing to do any zeroing and hence won't overwrite valid data with
> zeros.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>

I can confirm that it fixes corruption of my VM images when using
AIO+DIO (cache=none,aio=native). I haven't reviewed the patch closely,
but:

1) Can we do this only for the AIO+DIO combination? For regular DIO,
all the IOs should be serialized by i_mutex anyway..

2) Having a single global list (for all devices) might cause scaling
issues (see the sketch after question 3 for the sort of thing I mean).

3) Are you dropping i_mutex when you are waiting for the zero-out to
finish?
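
For (2), purely as a sketch of the sort of structure that might scale
better - none of these names are in the patch - the tracking could be
hashed by inode so each bucket gets its own lock and list:

#include <linux/fs.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#define DIO_ZERO_HASH_BITS	6
#define DIO_ZERO_HASH_SIZE	(1 << DIO_ZERO_HASH_BITS)

/* one lock and list per bucket instead of one global pair */
struct dio_zero_bucket {
	spinlock_t	lock;
	struct list_head list;
};

static struct dio_zero_bucket dio_zero_hash[DIO_ZERO_HASH_SIZE];

/* map an inode to its bucket; hash_ptr() comes from linux/hash.h */
static struct dio_zero_bucket *dio_zero_bucket(struct inode *inode)
{
	return &dio_zero_hash[hash_ptr(inode, DIO_ZERO_HASH_BITS)];
}

Each bucket's lock and list would be initialised once at module init
(e.g. alongside the existing dio_init()), and dio_start/wait/
end_zero_block() would then take the bucket lock instead of the global
dio_zero_block_lock.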


Thanks,
Badari
> ---
>  fs/direct-io.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 146 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index a10cb91..611524e 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -71,6 +71,9 @@ struct dio {
>       unsigned start_zero_done;       /* flag: sub-blocksize zeroing has
>                                          been performed at the start of a
>                                          write */
> +#define LAST_SECTOR ((sector_t)-1LL)
> +     sector_t zero_block_front;      /* fs block we are zeroing at front */
> +     sector_t zero_block_rear;       /* fs block we are zeroing at rear */
>       int pages_in_io;                /* approximate total IO pages */
>       size_t  size;                   /* total request size (doesn't change)*/
>       sector_t block_in_file;         /* Current offset into the underlying
> @@ -135,6 +138,101 @@ struct dio {
>       struct page *pages[DIO_PAGES];  /* page buffer */
>  };
>  
> +
> +/*
> + * record fs blocks we are doing zeroing on in a zero block list.
> + * unaligned IO is not very performant and so is relatively uncommon,
> + * so a global list should be sufficient to track them.
> + */
> +struct dio_zero_block {
> +     struct list_head dio_list;      /* list of io in progress */
> +     sector_t        zero_block;     /* block being zeroed */
> +     struct dio      *dio;           /* owner dio */
> +     wait_queue_head_t wq;           /* New IO block here */
> +     atomic_t        ref;            /* reference count */
> +};
> +
> +DEFINE_SPINLOCK(dio_zero_block_lock);
> +LIST_HEAD(dio_zero_block_list);
> +
> +/*
> + * Add a filesystem block to the list of blocks we are tracking.
> + */
> +static void
> +dio_start_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +     struct dio_zero_block *zb;
> +
> +     zb = kmalloc(sizeof(*zb), GFP_NOIO);
> +     if (!zb)
> +             return;
> +     INIT_LIST_HEAD(&zb->dio_list);
> +     init_waitqueue_head(&zb->wq);
> +     zb->zero_block = zero_block;
> +     zb->dio = dio;
> +     atomic_set(&zb->ref, 1);
> +
> +     spin_lock(&dio_zero_block_lock);
> +     list_add(&zb->dio_list, &dio_zero_block_list);
> +     spin_unlock(&dio_zero_block_lock);
> +}
> +
> +static void
> +dio_drop_zero_block(struct dio_zero_block *zb)
> +{
> +     if (atomic_dec_and_test(&zb->ref))
> +             kfree(zb);
> +}
> +
> +/*
> + * Check whether a filesystem block is currently being zeroed, and if it is
> + * wait for it to complete before returning. If we waited for a block being
> + * zeroed, return 1 to indicate that the block is already initialised,
> + * otherwise return 0 to indicate that it needs zeroing.
> + */
> +static int
> +dio_wait_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +     struct dio_zero_block *zb;
> +
> +     spin_lock(&dio_zero_block_lock);
> +     list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +             if (zb->dio->inode != dio->inode)
> +                     continue;
> +             if (zb->zero_block != zero_block)
> +                     continue;
> +             atomic_inc(&zb->ref);
> +             spin_unlock(&dio_zero_block_lock);
> +             wait_event(zb->wq, (list_empty(&zb->dio_list)));
> +             dio_drop_zero_block(zb);
> +             return 1;
> +     }
> +     spin_unlock(&dio_zero_block_lock);
> +     return 0;
> +}
> +
> +/*
> + * Complete a block zeroing and wake up anyone waiting for it.
> + */
> +static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +     struct dio_zero_block *zb;
> +
> +     spin_lock(&dio_zero_block_lock);
> +     list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +             if (zb->dio->inode != dio->inode)
> +                     continue;
> +             if (zb->zero_block != zero_block)
> +                     continue;
> +             list_del_init(&zb->dio_list);
> +             spin_unlock(&dio_zero_block_lock);
> +             wake_up(&zb->wq);
> +             dio_drop_zero_block(zb);
> +             return;
> +     }
> +     spin_unlock(&dio_zero_block_lock);
> +}
> +
>  /*
>   * How many pages are in the queue?
>   */
> @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
>               aio_complete(dio->iocb, ret, 0);
>       }
>  
> +     if (dio->zero_block_front != LAST_SECTOR)
> +             dio_end_zero_block(dio, dio->zero_block_front);
> +     if (dio->zero_block_rear != LAST_SECTOR)
> +             dio_end_zero_block(dio, dio->zero_block_rear);
> +
>       if (dio->flags & DIO_LOCKING)
>               /* lockdep: non-owner release */
>               up_read_non_owner(&dio->inode->i_alloc_sem);
> @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio)
>   * block with zeros. This happens only if user-buffer, fileoffset or
>   * io length is not filesystem block-size multiple.
>   *
> + * We need to track the blocks we are zeroing. If we have concurrent IOs that hit
> + * the same start or end block, we do not want all the IOs to zero the portion
> + * they are not writing data to as that will overwrite data from the other IOs.
> + * Hence we need to block until the first unaligned IO completes before we can
> + * continue (without executing any zeroing).
> + *
>   * `end' is zero if we're doing the start of the IO, 1 at the end of the
>   * IO.
>   */
> @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end)
>  {
>       unsigned dio_blocks_per_fs_block;
>       unsigned this_chunk_blocks;     /* In dio_blocks */
> -     unsigned this_chunk_bytes;
>       struct page *page;
> +     sector_t fsblock;
>  
>       dio->start_zero_done = 1;
>       if (!dio->blkfactor || !buffer_new(&dio->map_bh))
> @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end)
>       if (!this_chunk_blocks)
>               return;
>  
> +     if (end)
> +             this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +
>       /*
>        * We need to zero out part of an fs block.  It is either at the
> -      * beginning or the end of the fs block.
> +      * beginning or the end of the fs block, but first we need to check if
> +      * there is already a zeroing being run on this block.
> +      *
> +      * If we are doing a sub-block IO (i.e. zeroing both front and rear of
> +      * the same block) we don't need to wait or set a guard for the rear of
> +      * the block as we already have one set.
>        */
> -     if (end) 
> -             this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +     fsblock = dio->block_in_file >> dio->blkfactor;
> +     if (!end || dio->zero_block_front != fsblock) {
>  
> -     this_chunk_bytes = this_chunk_blocks << dio->blkbits;
> +             /* wait for any zeroing already in progress */
> +             if (dio_wait_zero_block(dio, fsblock)) {
> +                     /* skip the range we would have zeroed. */
> +                     dio->next_block_for_io += this_chunk_blocks;
> +                     return;
> +             }
> +
> +             /*
> +              * we are going to zero stuff now, so set a guard to catch
> +              * others that might want to zero the same block.
> +              */
> +             dio_start_zero_block(dio, fsblock);
> +             if (end)
> +                     dio->zero_block_rear = fsblock;
> +             else
> +                     dio->zero_block_front = fsblock;
> +     }
>  
>       page = ZERO_PAGE(0);
> -     if (submit_page_section(dio, page, 0, this_chunk_bytes, 
> +     if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits,
>                               dio->next_block_for_io))
>               return;
>  
> @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
>        */
>       memset(dio, 0, offsetof(struct dio, pages));
>  
> +     /*
> +      * zero_blocks need to be initialised to the largest value to avoid
> +      * matching the zero block accidentally.
> +      */
> +     dio->zero_block_front = LAST_SECTOR;
> +     dio->zero_block_rear = LAST_SECTOR;
> +
>       dio->flags = flags;
>       if (dio->flags & DIO_LOCKING) {
>               /* watch out for a 0 len io from a tricksy fs */
> -- 
> 1.7.1
> 
