On Fri, Apr 10, 2015 at 11:37:57PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
>
> Now we have an ioend being passed unconditionally to the direct IO
> write completion context, we can pass a preallocated transaction
> handle for on-disk inode size updates that are run in completion.
>
> At this point we really need to be passing the correct block range
> that the IO spans through the ioend, so calculate the last block in
> the mapping before we map the allocated range and use that instead
> of the size desired by the direct IO.
>
> This enables us to keep track of multiple get-blocks calls in the
> same direct IO - the ioend will keep coming back to us, and we can
> keep extending its range as new allocations and mappings are done.
>
> There are some new trace points added for debugging, and a small
> hack to actually make the tracepoints work (enums in tracepoints
> that use __print_symbolic don't work correctly), which should be fixed
> in the 4.1 merge window. This hack can be removed when the
> tracepoint fix is upstream.
>
> There are lots of comments explaining the intricacies of passing the
> ioend and append transaction in the code; they are better placed in
> the code because we're going to need them to understand why this
> code does what it does in a few years' time....
>
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
I still need to look at this one (and grok the dio code more)... but an
initial question: is this multiple get_blocks() call aggregation a
requirement for the append ioend mechanism or an optimization? If the
latter, I think a separate patch is more appropriate...
Brian
>  fs/xfs/xfs_aops.c  | 262 +++++++++++++++++++++++++++++++++++++----------------
>  fs/xfs/xfs_trace.h |  10 +-
>  2 files changed, 194 insertions(+), 78 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index d95a42b..52c7e46 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -178,6 +178,25 @@ xfs_setfilesize_ioend(
> return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
> }
>
> +STATIC void
> +xfs_setfilesize_ioend_cancel(
> + struct xfs_ioend *ioend)
> +{
> + struct xfs_trans *tp = ioend->io_append_trans;
> +
> + /*
> + * The transaction may have been allocated in the I/O submission thread,
> + * thus we need to mark ourselves as being in a transaction manually.
> + * Similarly for freeze protection.
> + */
> + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> +
> +	rwsem_acquire_read(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
> +			0, 1, _THIS_IP_);
> +
> + xfs_trans_cancel(tp, 0);
> + ioend->io_append_trans = NULL;
> +}
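
Took me a minute to parse the PF_FSTRANS/lockdep dance here, so for
anyone else reading along: this looks like the inverse of the handoff
that xfs_setfilesize_trans_alloc() does on the submission side, which
is roughly the following (paraphrasing mainline as of 4.0, details may
be slightly off):

	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
	...
	ioend->io_append_trans = tp;

	/* tell lockdep we are passing freeze protection to completion */
	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
		      1, _THIS_IP_);
	/* hand the transaction context off to the completion thread */
	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);

i.e. the cancel path has to re-take both annotations before it can
legally xfs_trans_cancel().
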
> +
> /*
> * Schedule IO completion handling on the final put of an ioend.
> *
> @@ -1233,18 +1252,18 @@ xfs_vm_releasepage(
> return try_to_free_buffers(page);
> }
>
> -static void
> +static int
> xfs_get_blocks_map_buffer(
> struct inode *inode,
> struct buffer_head *bh_result,
> int create,
> int direct,
> struct xfs_bmbt_irec *imap,
> - xfs_off_t offset,
> - ssize_t size)
> + xfs_off_t offset)
> {
> struct xfs_ioend *ioend;
> int type;
> + loff_t size;
>
> if (!create) {
> /*
> @@ -1253,7 +1272,7 @@ xfs_get_blocks_map_buffer(
> */
> if (!ISUNWRITTEN(imap))
> xfs_map_buffer(inode, bh_result, imap, offset);
> - return;
> + return 0;
> }
>
> xfs_map_buffer(inode, bh_result, imap, offset);
> @@ -1262,26 +1281,93 @@ xfs_get_blocks_map_buffer(
> set_buffer_unwritten(bh_result);
>
> if (!direct)
> - return;
> + return 0;
>
> /*
> - * Direct IO writes require an ioend to be allocated and
> - * passed via the returned mapping. This allows the end
> - * io function to determine the correct course of
> - * action.
> + * Direct IO writes require an ioend to be allocated and passed via the
> + * returned mapping. This allows the end io function to determine the
> + * correct course of action.
> + *
> + * Unwritten extents will need transactions at completion, so they are
> + * known to need deferring to a workqueue. However, for writes into written
> + * extents, we *may* need a transaction if this IO extends the on-disk
> + * EOF. Because we can race with other IOs the file may already be
> + * extended by the time we get to the transaction. IO completion already
> + * handles that case so all we will have done is incurred the overhead
> + * of workqueue deferral for completion. This is acceptable overhead for
> + * the rare case that this occurs.
> */
> -
> if (ISUNWRITTEN(imap)) {
> type = XFS_IO_UNWRITTEN;
> set_buffer_defer_completion(bh_result);
> } else
> type = XFS_IO_OVERWRITE;
> - ioend = xfs_alloc_ioend(inode, type);
> - ioend->io_offset = offset;
> - ioend->io_size = size;
> - bh_result->b_private = ioend;
>
> - return;
> + /*
> + * The offset that is passed in is the first block the DIO will fall
> + * into. The size supplied by the DIO layer is what it thinks it needs
> + * but the mapping may not span this entire range. Hence we use the
> + * truncated mapping size that's already been stashed in the bh_result
> + * to calculate the range covered by the ioend.
> + */
> + size = bh_result->b_size;
> + trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
> +
> + /*
> + * If we get multiple mappings in a single IO, we might be mapping
> + * different types. But because the direct IO can only have a single
> + * private pointer, we need to ensure that:
> + *
> + * a) the ioend spans the entire region of the IO; and
> + * b) if it contains unwritten extents, it is *permanently* marked as
> + * such and we cancel any append transaction attached to the ioend.
> + *
> + * We could do this by chaining ioends like buffered IO does, but
> + * we only actually get one IO completion callback from the direct IO,
> + * and that spans the entire IO regardless of how many mappings and IOs
> + * are needed to complete the DIO. There is only going to be one
> + * reference to the ioend and its life cycle is constrained by the
> + * DIO completion code. Hence we don't need reference counting here.
> + */
> + if (bh_result->b_private) {
> + ioend = bh_result->b_private;
> + ASSERT(ioend->io_size > 0);
> + ASSERT(offset >= ioend->io_offset);
> +
> + if (offset + size > ioend->io_offset + ioend->io_size)
> + ioend->io_size = offset - ioend->io_offset + size;
> +
> + if (type == XFS_IO_UNWRITTEN) {
> + if (ioend->io_append_trans)
> + xfs_setfilesize_ioend_cancel(ioend);
> + ioend->io_type = XFS_IO_UNWRITTEN;
> + }
> + trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
> + ioend->io_size, ioend->io_type,
> + imap);
> + } else {
> + ioend = xfs_alloc_ioend(inode, type);
> + ioend->io_offset = offset;
> + ioend->io_size = size;
> + bh_result->b_private = ioend;
> + trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
> + imap);
> + }
> +
> + /* check if we need an append transaction allocated. */
> + if (ioend->io_type == XFS_IO_OVERWRITE &&
> + xfs_ioend_is_append(ioend) && !ioend->io_append_trans) {
> + int error;
> +
> + error = xfs_setfilesize_trans_alloc(ioend);
> + ASSERT(!error);
> + if (error) {
> + xfs_destroy_ioend(ioend);
> + return error;
> + }
> + set_buffer_defer_completion(bh_result);
> + }
> + return 0;
> }
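
Sanity checking the extension arithmetic above with a made-up example:
an 8k DIO at offset 0 that gets mapped as two 4k extents would see two
get_blocks calls do (hypothetical values):

	/* call 1: offset == 0, b_size truncated to 4096 -> new ioend */
	ioend->io_offset = 0;
	ioend->io_size = 4096;

	/*
	 * call 2: offset == 4096, size == 4096:
	 *   4096 + 4096 > 0 + 4096, so
	 *   io_size = 4096 - 0 + 4096 == 8192
	 */

so the ioend ends up spanning [io_offset, io_offset + io_size) across
all the mappings, which is exactly what the completion side asserts.
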
>
> STATIC int
> @@ -1374,50 +1460,19 @@ __xfs_get_blocks(
> xfs_iunlock(ip, lockmode);
> }
>
> - trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
> + trace_xfs_get_blocks_alloc(ip, offset, size,
> + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
> + : XFS_IO_DELALLOC, &imap);
> } else if (nimaps) {
> - trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
> + trace_xfs_get_blocks_found(ip, offset, size,
> + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
> + : XFS_IO_OVERWRITE, &imap);
> xfs_iunlock(ip, lockmode);
> } else {
> trace_xfs_get_blocks_notfound(ip, offset, size);
> goto out_unlock;
> }
>
> - if (imap.br_startblock != HOLESTARTBLOCK &&
> - imap.br_startblock != DELAYSTARTBLOCK)
> - xfs_get_blocks_map_buffer(inode, bh_result, create, direct,
> - &imap, offset, size);
> -
> - /*
> - * If this is a realtime file, data may be on a different device.
> - * to that pointed to from the buffer_head b_bdev currently.
> - */
> - bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
> -
> - /*
> - * If we previously allocated a block out beyond eof and we are now
> - * coming back to use it then we will need to flag it as new even if it
> - * has a disk address.
> - *
> - * With sub-block writes into unwritten extents we also need to mark
> - * the buffer as new so that the unwritten parts of the buffer gets
> - * correctly zeroed.
> - */
> - if (create &&
> - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
> - (offset >= i_size_read(inode)) ||
> - (new || ISUNWRITTEN(&imap))))
> - set_buffer_new(bh_result);
> -
> - if (imap.br_startblock == DELAYSTARTBLOCK) {
> - BUG_ON(direct);
> - if (create) {
> - set_buffer_uptodate(bh_result);
> - set_buffer_mapped(bh_result);
> - set_buffer_delay(bh_result);
> - }
> - }
> -
> /*
> * If this is O_DIRECT or the mpage code calling tell them how large
> * the mapping is, so that we can avoid repeated get_blocks calls.
> @@ -1451,6 +1506,46 @@ __xfs_get_blocks(
> bh_result->b_size = mapping_size;
> }
>
> + if (imap.br_startblock != HOLESTARTBLOCK &&
> + imap.br_startblock != DELAYSTARTBLOCK) {
> + error = xfs_get_blocks_map_buffer(inode, bh_result, create,
> + direct, &imap, offset);
> + if (error)
> + return error;
> + }
> + if (create && direct)
> + ASSERT(bh_result->b_private);
> +
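
If I'm reading the reordering right, moving this call below the mapping
size truncation isn't just code motion: the helper now derives the
ioend size from the buffer head rather than from the caller-supplied
size, so the order matters:

	bh_result->b_size = mapping_size;	  /* truncate to this mapping */
	error = xfs_get_blocks_map_buffer(...);   /* reads b_size back out */

With the old ordering the ioend would have been sized by what the DIO
asked for, not by what this particular mapping actually covers.
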
> + /*
> + * If this is a realtime file, data may be on a different device
> + * to that pointed to from the buffer_head b_bdev currently.
> + */
> + bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
> +
> + /*
> + * If we previously allocated a block out beyond eof and we are now
> + * coming back to use it then we will need to flag it as new even if it
> + * has a disk address.
> + *
> + * With sub-block writes into unwritten extents we also need to mark
> + * the buffer as new so that the unwritten parts of the buffer get
> + * correctly zeroed.
> + */
> + if (create &&
> + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
> + (offset >= i_size_read(inode)) ||
> + (new || ISUNWRITTEN(&imap))))
> + set_buffer_new(bh_result);
> +
> + if (imap.br_startblock == DELAYSTARTBLOCK) {
> + BUG_ON(direct);
> + if (create) {
> + set_buffer_uptodate(bh_result);
> + set_buffer_mapped(bh_result);
> + set_buffer_delay(bh_result);
> + }
> + }
> +
> return 0;
>
> out_unlock:
> @@ -1501,38 +1596,51 @@ xfs_end_io_direct_write(
> goto out_destroy_ioend;
>
> /*
> - * While the generic direct I/O code updates the inode size, it does
> - * so only after the end_io handler is called, which means our
> - * end_io handler thinks the on-disk size is outside the in-core
> - * size. To prevent this just update it a little bit earlier here.
> + * dio completion end_io functions are only called on writes if more
> + * than 0 bytes were written.
> */
> - if (offset + size > i_size_read(inode))
> - i_size_write(inode, offset + size);
> + ASSERT(size > 0);
>
> /*
> - * For direct I/O we do not know if we need to allocate blocks or not,
> - * so we can't preallocate an append transaction, as that results in
> - * nested reservations and log space deadlocks. Hence allocate the
> - * transaction here. While this is sub-optimal and can block IO
> - * completion for some time, we're stuck with doing it this way until
> - * we can pass the ioend to the direct IO allocation callbacks and
> - * avoid nesting that way.
> + * The ioend only maps whole blocks, while the IO may be sector aligned.
> + * Hence the ioend offset/size may not match the IO offset/size exactly,
> + * but should span it completely. Write the IO sizes into the ioend so
> + * that completion processing does the right thing.
> */
> - if (ioend->io_type == XFS_IO_UNWRITTEN && size > 0) {
> - xfs_iomap_write_unwritten(ip, offset, size);
> - } else if (offset + size > ip->i_d.di_size) {
> - struct xfs_trans *tp;
> - int error;
> + ASSERT(size <= ioend->io_size);
> + ASSERT(offset >= ioend->io_offset);
> + ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
> + ioend->io_size = size;
> + ioend->io_offset = offset;
>
> - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
> - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
> - if (error) {
> - xfs_trans_cancel(tp, 0);
> - goto out_destroy_ioend;
> - }
> + /*
> + * The ioend tells us whether we are doing unwritten extent conversion
> + * or an append transaction that updates the on-disk file size. These
> + * cases are the only cases where we should *potentially* be needing
> + * to update the VFS inode size. When the ioend indicates this, we
> + * are *guaranteed* to be running in non-interrupt context.
> + *
> + * We need to update the in-core inode size here so that we don't end up
> + * with the on-disk inode size being outside the in-core inode size.
> + * While we can do this in the process context after the IO has
> + * completed, this does not work for AIO and hence we always update
> + * the in-core inode size here if necessary.
> + */
> + if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_append_trans) {
> + if (offset + size > i_size_read(inode))
> + i_size_write(inode, offset + size);
> + } else
> + ASSERT(offset + size <= i_size_read(inode));
>
> - xfs_setfilesize(ip, tp, offset, size);
> + /* Ugh. No way to propagate errors, so ignore them. */
> + if (ioend->io_type == XFS_IO_UNWRITTEN) {
> + xfs_iomap_write_unwritten(ip, offset, size);
> + } else if (ioend->io_append_trans) {
> + xfs_setfilesize_ioend(ioend);
> + } else {
> + ASSERT(!xfs_ioend_is_append(ioend));
> }
> +
> out_destroy_ioend:
> xfs_destroy_ioend(ioend);
> }
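
FWIW, with this change completion collapses to three mutually exclusive
cases:

	io_type == XFS_IO_UNWRITTEN  ->  xfs_iomap_write_unwritten()
	io_append_trans != NULL      ->  xfs_setfilesize_ioend()
	neither                      ->  overwrite within EOF, nothing to do

which is much easier to reason about than allocating a transaction at
completion time.
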
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index b2a45cc..a584c27 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1200,13 +1200,18 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
> __entry->blockcount = irec ? irec->br_blockcount : 0;
> ),
> TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
> - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
> + "type %d startoff 0x%llx startblock %lld blockcount 0x%llx",
> MAJOR(__entry->dev), MINOR(__entry->dev),
> __entry->ino,
> __entry->size,
> __entry->offset,
> __entry->count,
> + /*
> + * XXX: __print_symbolic broken for enums, fix coming in 4.1
> + * cycle from Mr Rostedt. Need to know type now, so...
> __print_symbolic(__entry->type, XFS_IO_TYPES),
> + */
> + __entry->type,
> __entry->startoff,
> (__int64_t)__entry->startblock,
> __entry->blockcount)
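
On the __print_symbolic workaround: once the enum fix lands, this
presumably goes back to

	__print_symbolic(__entry->type, XFS_IO_TYPES),

so the new gbmap_direct* events would print symbolic type names
("unwritten", "overwrite", etc., assuming the usual XFS_IO_TYPES table)
instead of raw integers. Might be worth a reminder in the 4.1 window so
the hack doesn't outlive the fix.
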
> @@ -1221,6 +1226,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
> DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
> DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
> DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
> +DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
> +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
> +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
>
> DECLARE_EVENT_CLASS(xfs_simple_io_class,
> TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
> --
> 2.0.0
>