xfs
[Top] [All Lists]

Re: [PATCH 2/5] xfs: direct IO needs to use append ioends

To: Dave Chinner <david@xxxxxxxxxxxxx>
Subject: Re: [PATCH 2/5] xfs: direct IO needs to use append ioends
From: Brian Foster <bfoster@xxxxxxxxxx>
Date: Fri, 10 Apr 2015 16:21:47 -0400
Cc: xfs@xxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <1428673080-23052-3-git-send-email-david@xxxxxxxxxxxxx>
References: <1428673080-23052-1-git-send-email-david@xxxxxxxxxxxxx> <1428673080-23052-3-git-send-email-david@xxxxxxxxxxxxx>
User-agent: Mutt/1.5.23 (2014-03-12)
On Fri, Apr 10, 2015 at 11:37:57PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Now we have an ioend being passed unconditionally to the direct IO
> write completion context, we can pass a preallocated transaction
> handle for on-disk inode size updates that are run in completion.
> 
> At this point we really need to be passing the correct block range
> that the IO spans through the ioend, so calculate the last block in
> the mapping before we map the allocated range and use that instead
> of the size desired by the direct IO.
> 
> This enables us to keep track of multiple get-blocks calls in the
> same direct IO - the ioend will keep coming back to us, and we can
> keep extending it's range as new allocations and mappings are done.
> 
> There are some new trace points added for debugging, and a small
> hack to actually make the tracepoints work (enums in tracepoints
> that use __print_symbolic don't work correctly) that should be fixed
> in the 4.1 merge window. THis hack can be removed when the
> tracepoint fix is upstream.
> 
> There are lots of comments explaining the intricacies of passing the
> ioend and append transaction in the code; they are better placed in
> the code because we're going to need them to understand why this
> code does what it does in a few years time....
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---

I still need to look at this one (and grok the dio code more)... but an
initial question: is this multiple get_blocks() call aggregation a
requirement for the append ioend mechanism or an optimization? If the
latter, I think a separate patch is more appropriate...

Brian

>  fs/xfs/xfs_aops.c  | 262 
> +++++++++++++++++++++++++++++++++++++----------------
>  fs/xfs/xfs_trace.h |  10 +-
>  2 files changed, 194 insertions(+), 78 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index d95a42b..52c7e46 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -178,6 +178,25 @@ xfs_setfilesize_ioend(
>       return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
>  }
>  
> +STATIC void
> +xfs_setfilesize_ioend_cancel(
> +     struct xfs_ioend        *ioend)
> +{
> +     struct xfs_trans        *tp = ioend->io_append_trans;
> +
> +     /*
> +      * The transaction may have been allocated in the I/O submission thread,
> +      * thus we need to mark ourselves as being in a transaction manually.
> +      * Similarly for freeze protection.
> +      */
> +     current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> +     
> rwsem_acquire_read(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
> +                        0, 1, _THIS_IP_);
> +
> +     xfs_trans_cancel(tp, 0);
> +     ioend->io_append_trans = NULL;
> +}
> +
>  /*
>   * Schedule IO completion handling on the final put of an ioend.
>   *
> @@ -1233,18 +1252,18 @@ xfs_vm_releasepage(
>       return try_to_free_buffers(page);
>  }
>  
> -static void
> +static int
>  xfs_get_blocks_map_buffer(
>       struct inode            *inode,
>       struct buffer_head      *bh_result,
>       int                     create,
>       int                     direct,
>       struct xfs_bmbt_irec    *imap,
> -     xfs_off_t               offset,
> -     ssize_t                 size)
> +     xfs_off_t               offset)
>  {
>       struct xfs_ioend        *ioend;
>       int                     type;
> +     loff_t                  size;
>  
>       if (!create) {
>               /*
> @@ -1253,7 +1272,7 @@ xfs_get_blocks_map_buffer(
>                */
>               if (!ISUNWRITTEN(imap))
>                       xfs_map_buffer(inode, bh_result, imap, offset);
> -             return;
> +             return 0;
>       }
>  
>       xfs_map_buffer(inode, bh_result, imap, offset);
> @@ -1262,26 +1281,93 @@ xfs_get_blocks_map_buffer(
>               set_buffer_unwritten(bh_result);
>  
>       if (!direct)
> -             return;
> +             return 0;
>  
>       /*
> -      * Direct IO writes require an ioend to be allocated and
> -      * passed via the returned mapping. This allows the end
> -      * io function to determine the correct course of
> -      * action.
> +      * Direct IO writes require an ioend to be allocated and passed via the
> +      * returned mapping. This allows the end io function to determine the
> +      * correct course of action.
> +      *
> +      * Unwritten extents will need transactions at completion, so is known
> +      * to need deferring to a workqueue. However, for writes into written
> +      * extents, we *may* need a transaction if this IO extends the on-disk
> +      * EOF. Because we can race with other IOs the file may already be
> +      * extended by the time we get to the transaction. IO completion already
> +      * handles that case so all we will have done is incurred the overhead
> +      * of workqueue deferral for completion. This is acceptable overhead for
> +      * the rare case that this occurs.
>        */
> -
>       if (ISUNWRITTEN(imap)) {
>               type = XFS_IO_UNWRITTEN;
>               set_buffer_defer_completion(bh_result);
>       } else
>               type = XFS_IO_OVERWRITE;
> -     ioend = xfs_alloc_ioend(inode, type);
> -     ioend->io_offset = offset;
> -     ioend->io_size = size;
> -     bh_result->b_private = ioend;
>  
> -     return;
> +     /*
> +      * The offset that is passed in is the first block the DIO will fall
> +      * into. The size supplied by the DIO layer is what it thinks it needs
> +      * but the mapping may not span this entire range. Hence we use the
> +      * truncated mapping size that's already been stashed in the bh_result
> +      * to calculate the range covered by the ioend.
> +      */
> +     size = bh_result->b_size;
> +     trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
> +
> +     /*
> +      * If we get multiple mappings to in a single IO, we might be mapping
> +      * dfferent types. But because the direct IO can only have a single
> +      * private pointer, we need to ensure that:
> +      *
> +      * a) the ioend spans the entire region of the IO; and
> +      * b) if it contains unwritten extents, it is *permanently* marked as
> +      *    such and we cancel any append transaction attached to the ioend.
> +      *
> +      * We could do this by chaining ioends like buffered IO does, but
> +      * we only actually get one IO completion callback from the direct IO,
> +      * and that spans the entire IO regardless of how many mappings and IOs
> +      * are needed to complete the DIO. There is only going to be one
> +      * reference to the ioend and it's life cycle is constrained by the
> +      * DIO completion code. hence we don't need reference counting here.
> +      */
> +     if (bh_result->b_private) {
> +             ioend = bh_result->b_private;
> +             ASSERT(ioend->io_size > 0);
> +             ASSERT(offset >= ioend->io_offset);
> +
> +             if (offset + size > ioend->io_offset + ioend->io_size)
> +                     ioend->io_size = offset - ioend->io_offset + size;
> +
> +             if (type == XFS_IO_UNWRITTEN) {
> +                     if (ioend->io_append_trans)
> +                             xfs_setfilesize_ioend_cancel(ioend);
> +                     ioend->io_type = XFS_IO_UNWRITTEN;
> +             }
> +             trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
> +                                           ioend->io_size, ioend->io_type,
> +                                           imap);
> +     } else {
> +             ioend = xfs_alloc_ioend(inode, type);
> +             ioend->io_offset = offset;
> +             ioend->io_size = size;
> +             bh_result->b_private = ioend;
> +             trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
> +                                        imap);
> +     }
> +
> +     /* check if we need an append transaction allocated. */
> +     if (ioend->io_type == XFS_IO_OVERWRITE &&
> +         xfs_ioend_is_append(ioend) && !ioend->io_append_trans) {
> +             int     error;
> +
> +             error = xfs_setfilesize_trans_alloc(ioend);
> +             ASSERT(!error);
> +             if (error) {
> +                     xfs_destroy_ioend(ioend);
> +                     return error;
> +             }
> +             set_buffer_defer_completion(bh_result);
> +     }
> +     return 0;
>  }
>  
>  STATIC int
> @@ -1374,50 +1460,19 @@ __xfs_get_blocks(
>                       xfs_iunlock(ip, lockmode);
>               }
>  
> -             trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
> +             trace_xfs_get_blocks_alloc(ip, offset, size,
> +                             ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
> +                                                : XFS_IO_DELALLOC, &imap);
>       } else if (nimaps) {
> -             trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
> +             trace_xfs_get_blocks_found(ip, offset, size,
> +                             ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
> +                                                : XFS_IO_OVERWRITE, &imap);
>               xfs_iunlock(ip, lockmode);
>       } else {
>               trace_xfs_get_blocks_notfound(ip, offset, size);
>               goto out_unlock;
>       }
>  
> -     if (imap.br_startblock != HOLESTARTBLOCK &&
> -         imap.br_startblock != DELAYSTARTBLOCK)
> -             xfs_get_blocks_map_buffer(inode, bh_result, create, direct,
> -                                       &imap, offset, size);
> -
> -     /*
> -      * If this is a realtime file, data may be on a different device.
> -      * to that pointed to from the buffer_head b_bdev currently.
> -      */
> -     bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
> -
> -     /*
> -      * If we previously allocated a block out beyond eof and we are now
> -      * coming back to use it then we will need to flag it as new even if it
> -      * has a disk address.
> -      *
> -      * With sub-block writes into unwritten extents we also need to mark
> -      * the buffer as new so that the unwritten parts of the buffer gets
> -      * correctly zeroed.
> -      */
> -     if (create &&
> -         ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
> -          (offset >= i_size_read(inode)) ||
> -          (new || ISUNWRITTEN(&imap))))
> -             set_buffer_new(bh_result);
> -
> -     if (imap.br_startblock == DELAYSTARTBLOCK) {
> -             BUG_ON(direct);
> -             if (create) {
> -                     set_buffer_uptodate(bh_result);
> -                     set_buffer_mapped(bh_result);
> -                     set_buffer_delay(bh_result);
> -             }
> -     }
> -
>       /*
>        * If this is O_DIRECT or the mpage code calling tell them how large
>        * the mapping is, so that we can avoid repeated get_blocks calls.
> @@ -1451,6 +1506,46 @@ __xfs_get_blocks(
>               bh_result->b_size = mapping_size;
>       }
>  
> +     if (imap.br_startblock != HOLESTARTBLOCK &&
> +         imap.br_startblock != DELAYSTARTBLOCK) {
> +             error = xfs_get_blocks_map_buffer(inode, bh_result, create,
> +                                               direct, &imap, offset);
> +             if (error)
> +                     return error;
> +     }
> +     if (create && direct)
> +             ASSERT(bh_result->b_private);
> +
> +     /*
> +      * If this is a realtime file, data may be on a different device.
> +      * to that pointed to from the buffer_head b_bdev currently.
> +      */
> +     bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
> +
> +     /*
> +      * If we previously allocated a block out beyond eof and we are now
> +      * coming back to use it then we will need to flag it as new even if it
> +      * has a disk address.
> +      *
> +      * With sub-block writes into unwritten extents we also need to mark
> +      * the buffer as new so that the unwritten parts of the buffer gets
> +      * correctly zeroed.
> +      */
> +     if (create &&
> +         ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
> +          (offset >= i_size_read(inode)) ||
> +          (new || ISUNWRITTEN(&imap))))
> +             set_buffer_new(bh_result);
> +
> +     if (imap.br_startblock == DELAYSTARTBLOCK) {
> +             BUG_ON(direct);
> +             if (create) {
> +                     set_buffer_uptodate(bh_result);
> +                     set_buffer_mapped(bh_result);
> +                     set_buffer_delay(bh_result);
> +             }
> +     }
> +
>       return 0;
>  
>  out_unlock:
> @@ -1501,38 +1596,51 @@ xfs_end_io_direct_write(
>               goto out_destroy_ioend;
>  
>       /*
> -      * While the generic direct I/O code updates the inode size, it does
> -      * so only after the end_io handler is called, which means our
> -      * end_io handler thinks the on-disk size is outside the in-core
> -      * size.  To prevent this just update it a little bit earlier here.
> +      * dio completion end_io functions are only called on writes if more
> +      * than 0 bytes was written.
>        */
> -     if (offset + size > i_size_read(inode))
> -             i_size_write(inode, offset + size);
> +     ASSERT(size > 0);
>  
>       /*
> -      * For direct I/O we do not know if we need to allocate blocks or not,
> -      * so we can't preallocate an append transaction, as that results in
> -      * nested reservations and log space deadlocks. Hence allocate the
> -      * transaction here. While this is sub-optimal and can block IO
> -      * completion for some time, we're stuck with doing it this way until
> -      * we can pass the ioend to the direct IO allocation callbacks and
> -      * avoid nesting that way.
> +      * The ioend only maps whole blocks, while the IO may be sector aligned.
> +      * Hence the ioend offset/size may not match the IO offset/size exactly,
> +      * but should span it completely. Write the IO sizes into the ioend so
> +      * that completion processing does the right thing.
>        */
> -     if (ioend->io_type == XFS_IO_UNWRITTEN && size > 0) {
> -             xfs_iomap_write_unwritten(ip, offset, size);
> -     } else if (offset + size > ip->i_d.di_size) {
> -             struct xfs_trans        *tp;
> -             int                     error;
> +     ASSERT(size <= ioend->io_size);
> +     ASSERT(offset >= ioend->io_offset);
> +     ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
> +     ioend->io_size = size;
> +     ioend->io_offset = offset;
>  
> -             tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
> -             error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
> -             if (error) {
> -                     xfs_trans_cancel(tp, 0);
> -                     goto out_destroy_ioend;
> -             }
> +     /*
> +      * The ioend tells us whether we are doing unwritten extent conversion
> +      * or an append transaction that updates the on-disk file size. These
> +      * cases are the only cases where we should *potentially* be needing
> +      * to update the VFS inode size. When the ioend indicates this, we
> +      * are *guaranteed* to be running in non-interrupt context.
> +      *
> +      * We need to update the in-core inode size here so that we don't end up
> +      * with the on-disk inode size being outside the in-core inode size.
> +      * While we can do this in the process context after the IO has
> +      * completed, this does not work for AIO and hence we always update
> +      * the in-core inode size here if necessary.
> +      */
> +     if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_append_trans) {
> +             if (offset + size > i_size_read(inode))
> +                     i_size_write(inode, offset + size);
> +     } else
> +             ASSERT(offset + size <= i_size_read(inode));
>  
> -             xfs_setfilesize(ip, tp, offset, size);
> +     /* Ugh. No way to propagate errors, so ignore them. */
> +     if (ioend->io_type == XFS_IO_UNWRITTEN) {
> +             xfs_iomap_write_unwritten(ip, offset, size);
> +     } else if (ioend->io_append_trans) {
> +             xfs_setfilesize_ioend(ioend);
> +     } else {
> +             ASSERT(!xfs_ioend_is_append(ioend));
>       }
> +
>  out_destroy_ioend:
>       xfs_destroy_ioend(ioend);
>  }
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index b2a45cc..a584c27 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1200,13 +1200,18 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
>               __entry->blockcount = irec ? irec->br_blockcount : 0;
>       ),
>       TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
> -               "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
> +               "type %d startoff 0x%llx startblock %lld blockcount 0x%llx",
>                 MAJOR(__entry->dev), MINOR(__entry->dev),
>                 __entry->ino,
>                 __entry->size,
>                 __entry->offset,
>                 __entry->count,
> +               /*
> +                * XXX: __print_symbolic broken for enums, fix coming in 4.1
> +                * cycle from Mr Rostedt. Need to know type now, so...
>                 __print_symbolic(__entry->type, XFS_IO_TYPES),
> +                */
> +               __entry->type,
>                 __entry->startoff,
>                 (__int64_t)__entry->startblock,
>                 __entry->blockcount)
> @@ -1221,6 +1226,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
>  DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
>  DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
>  DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
> +DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
> +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
> +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
>  
>  DECLARE_EVENT_CLASS(xfs_simple_io_class,
>       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
> -- 
> 2.0.0
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs

<Prev in Thread] Current Thread [Next in Thread>