xfs
[Top] [All Lists]

Re: [PATCH 1/6] dax: don't abuse get_block mapping for endio callbacks

To: Dave Chinner <david@xxxxxxxxxxxxx>
Subject: Re: [PATCH 1/6] dax: don't abuse get_block mapping for endio callbacks
From: Jan Kara <jack@xxxxxxx>
Date: Wed, 4 Mar 2015 16:54:08 +0100
Cc: xfs@xxxxxxxxxxx, linux-fsdevel@xxxxxxxxxxxxxxx, jack@xxxxxxx, willy@xxxxxxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <1425425427-16283-2-git-send-email-david@xxxxxxxxxxxxx>
References: <1425425427-16283-1-git-send-email-david@xxxxxxxxxxxxx> <1425425427-16283-2-git-send-email-david@xxxxxxxxxxxxx>
User-agent: Mutt/1.5.21 (2010-09-15)
On Wed 04-03-15 10:30:22, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> dax_fault() currently relies on the get_block callback to attach an
> io completion callback to the mapping buffer head so that it can
> run unwritten extent conversion after zeroing allocated blocks.
> 
> Instead of this hack, pass the conversion callback directly into
> dax_fault() similar to the get_block callback. When the filesystem
> allocates unwritten extents, it will set the buffer_unwritten()
> flag, and hence the dax_fault code can call the completion function
> in the contexts where it is necessary without overloading the
> mapping buffer head.
> 
> Note: The changes to ext4 to use this interface are suspect at best.
> In fact, the way ext4 did this end_io assignment in the first place
> looks suspect because it only set a completion callback when there
> wasn't already some other write() call taking place on the same
> inode. The ext4 end_io code looks rather intricate and fragile with
> all its reference counting and passing to different contexts for
> modification via inode private pointers that aren't protected by
> locks...
  Yeah, ext4 is currently broken in that regard, so as long as we don't make
things worse, I'm OK.

> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/dax.c           | 15 ++++++++-------
>  fs/ext2/file.c     |  4 ++--
>  fs/ext4/file.c     | 16 ++++++++++++++--
>  fs/ext4/inode.c    | 21 +++++++--------------
>  include/linux/fs.h |  6 ++++--
>  5 files changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index ed1619e..d7b4dba 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -269,7 +269,8 @@ static int copy_user_bh(struct page *to, struct 
> buffer_head *bh,
>  }
>  
>  static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
> -                     struct vm_area_struct *vma, struct vm_fault *vmf)
> +                     struct vm_area_struct *vma, struct vm_fault *vmf,
> +                     dax_iodone_t complete_unwritten)
>  {
>       struct address_space *mapping = inode->i_mapping;
>       sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
> @@ -310,14 +311,14 @@ static int dax_insert_mapping(struct inode *inode, 
> struct buffer_head *bh,
>   out:
>       i_mmap_unlock_read(mapping);
>  
> -     if (bh->b_end_io)
> -             bh->b_end_io(bh, 1);
> +     if (buffer_unwritten(bh))
> +             complete_unwritten(bh, 1);
>  
>       return error;
>  }
  So frankly, I don't see much point in passing the completion callback into
dax_insert_mapping() only to call the function at the end of it. We could
just as well call the completion function from do_dax_fault(), where it would
seem more natural to me. But I don't feel too strongly about this.

Instead of the above, I was also thinking about some way to pass information
out of do_dax_fault() into the filesystem so that it could just call the
completion handler itself, but the completion callback is a more standard
interface, I guess.

                                                                Honza

>  static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -                     get_block_t get_block)
> +                     get_block_t get_block, dax_iodone_t complete_unwritten)
>  {
>       struct file *file = vma->vm_file;
>       struct address_space *mapping = file->f_mapping;
> @@ -418,7 +419,7 @@ static int do_dax_fault(struct vm_area_struct *vma, 
> struct vm_fault *vmf,
>               page_cache_release(page);
>       }
>  
> -     error = dax_insert_mapping(inode, &bh, vma, vmf);
> +     error = dax_insert_mapping(inode, &bh, vma, vmf, complete_unwritten);
>  
>   out:
>       if (error == -ENOMEM)
> @@ -446,7 +447,7 @@ static int do_dax_fault(struct vm_area_struct *vma, 
> struct vm_fault *vmf,
>   * fault handler for DAX files.
>   */
>  int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> -                     get_block_t get_block)
> +           get_block_t get_block, dax_iodone_t complete_unwritten)
>  {
>       int result;
>       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
> @@ -455,7 +456,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault 
> *vmf,
>               sb_start_pagefault(sb);
>               file_update_time(vma->vm_file);
>       }
> -     result = do_dax_fault(vma, vmf, get_block);
> +     result = do_dax_fault(vma, vmf, get_block, complete_unwritten);
>       if (vmf->flags & FAULT_FLAG_WRITE)
>               sb_end_pagefault(sb);
>  
> diff --git a/fs/ext2/file.c b/fs/ext2/file.c
> index e317017..8da747a 100644
> --- a/fs/ext2/file.c
> +++ b/fs/ext2/file.c
> @@ -28,12 +28,12 @@
>  #ifdef CONFIG_FS_DAX
>  static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -     return dax_fault(vma, vmf, ext2_get_block);
> +     return dax_fault(vma, vmf, ext2_get_block, NULL);
>  }
>  
>  static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -     return dax_mkwrite(vma, vmf, ext2_get_block);
> +     return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
>  }
>  
>  static const struct vm_operations_struct ext2_dax_vm_ops = {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 33a09da..f7dabb1 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -192,15 +192,27 @@ errout:
>  }
>  
>  #ifdef CONFIG_FS_DAX
> +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> +{
> +     struct inode *inode = bh->b_assoc_map->host;
> +     /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> +     loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> +     int err;
> +     if (!uptodate)
> +             return;
> +     WARN_ON(!buffer_unwritten(bh));
> +     err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> +}
> +
>  static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -     return dax_fault(vma, vmf, ext4_get_block);
> +     return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
>                                       /* Is this the right get_block? */
>  }
>  
>  static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> -     return dax_mkwrite(vma, vmf, ext4_get_block);
> +     return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
>  }
>  
>  static const struct vm_operations_struct ext4_dax_vm_ops = {
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5cb9a21..43433de 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -657,18 +657,6 @@ has_zeroout:
>       return retval;
>  }
>  
> -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
> -{
> -     struct inode *inode = bh->b_assoc_map->host;
> -     /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
> -     loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
> -     int err;
> -     if (!uptodate)
> -             return;
> -     WARN_ON(!buffer_unwritten(bh));
> -     err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
> -}
> -
>  /* Maximum number of blocks we map for direct IO at once. */
>  #define DIO_MAX_BLOCKS 4096
>  
> @@ -706,10 +694,15 @@ static int _ext4_get_block(struct inode *inode, 
> sector_t iblock,
>  
>               map_bh(bh, inode->i_sb, map.m_pblk);
>               bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
> -             if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
> +             if (IS_DAX(inode) && buffer_unwritten(bh)) {
> +                     /*
> +                      * dgc: I suspect unwritten conversion on ext4+DAX is
> +                      * fundamentally broken here when there are concurrent
> +                      * read/write in progress on this inode.
> +                      */
> +                     WARN_ON_ONCE(io_end);
>                       bh->b_assoc_map = inode->i_mapping;
>                       bh->b_private = (void *)(unsigned long)iblock;
> -                     bh->b_end_io = ext4_end_io_unwritten;
>               }
>               if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
>                       set_buffer_defer_completion(bh);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 937e280..82100ae 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t 
> iblock,
>                       struct buffer_head *bh_result, int create);
>  typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>                       ssize_t bytes, void *private);
> +typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
>  
>  #define MAY_EXEC             0x00000001
>  #define MAY_WRITE            0x00000002
> @@ -2603,8 +2604,9 @@ ssize_t dax_do_io(int rw, struct kiocb *, struct inode 
> *, struct iov_iter *,
>  int dax_clear_blocks(struct inode *, sector_t block, long size);
>  int dax_zero_page_range(struct inode *, loff_t from, unsigned len, 
> get_block_t);
>  int dax_truncate_page(struct inode *, loff_t from, get_block_t);
> -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
> -#define dax_mkwrite(vma, vmf, gb)    dax_fault(vma, vmf, gb)
> +int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
> +             dax_iodone_t);
> +#define dax_mkwrite(vma, vmf, gb, iod)       dax_fault(vma, vmf, gb, iod)
>  
>  #ifdef CONFIG_BLOCK
>  typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
> -- 
> 2.0.0
> 
-- 
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR

<Prev in Thread] Current Thread [Next in Thread>