xfs
[Top] [All Lists]

Re: [PATCH] xfs: don't allocate an ioend for direct I/O completions

To: Christoph Hellwig <hch@xxxxxx>
Subject: Re: [PATCH] xfs: don't allocate an ioend for direct I/O completions
From: Brian Foster <bfoster@xxxxxxxxxx>
Date: Fri, 30 Jan 2015 09:42:23 -0500
Cc: xfs@xxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <1422485661-520-1-git-send-email-hch@xxxxxx>
References: <1422485661-520-1-git-send-email-hch@xxxxxx>
User-agent: Mutt/1.5.23 (2014-03-12)
On Wed, Jan 28, 2015 at 11:54:21PM +0100, Christoph Hellwig wrote:
> Back in the days when the direct I/O ->end_io callback could be called
> from interrupt context for AIO we needed a structure to hand off to the
> workqueue, and reused the ioend structure for this purpose.  These days
> ->end_io is always called from user or workqueue context, which allows us
> to avoid this memory allocation and simplify the code significantly.
> 
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---

Looks mostly Ok to me. In fact, with xfs_finish_ioend_sync() calling
xfs_end_io() directly, I don't see how we currently get into the wq at
all. Anyways, a few notes...

>  fs/xfs/xfs_aops.c | 133 
> ++++++++++++++++++++++++------------------------------
>  fs/xfs/xfs_aops.h |   3 --
>  2 files changed, 59 insertions(+), 77 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 18e2f3b..31d3d7d 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
>   */
>  STATIC int
>  xfs_setfilesize(
> -     struct xfs_ioend        *ioend)
> +     struct xfs_inode        *ip,
> +     struct xfs_trans        *tp,
> +     xfs_off_t               offset,
> +     size_t                  size)
>  {
> -     struct xfs_inode        *ip = XFS_I(ioend->io_inode);
> -     struct xfs_trans        *tp = ioend->io_append_trans;
>       xfs_fsize_t             isize;
>  
> -     /*
> -      * The transaction may have been allocated in the I/O submission thread,
> -      * thus we need to mark ourselves as beeing in a transaction manually.
> -      * Similarly for freeze protection.
> -      */
> -     current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> -     rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
> -                        0, 1, _THIS_IP_);
> -
>       xfs_ilock(ip, XFS_ILOCK_EXCL);
> -     isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
> +     isize = xfs_new_eof(ip, offset + size);
>       if (!isize) {
>               xfs_iunlock(ip, XFS_ILOCK_EXCL);
>               xfs_trans_cancel(tp, 0);
>               return 0;
>       }
>  
> -     trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
> +     trace_xfs_setfilesize(ip, offset, size);
>  
>       ip->i_d.di_size = isize;
>       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> @@ -167,6 +159,25 @@ xfs_setfilesize(
>       return xfs_trans_commit(tp, 0);
>  }
>  
> +STATIC int
> +xfs_setfilesize_ioend(
> +     struct xfs_ioend        *ioend)
> +{
> +     struct xfs_inode        *ip = XFS_I(ioend->io_inode);
> +     struct xfs_trans        *tp = ioend->io_append_trans;
> +
> +     /*
> +      * The transaction may have been allocated in the I/O submission thread,
> +      * thus we need to mark ourselves as beeing in a transaction manually.

Copied from the the original, but since we're here: "being."

> +      * Similarly for freeze protection.
> +      */
> +     current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
> +     rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
> +                        0, 1, _THIS_IP_);
> +
> +     return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
> +}
> +
>  /*
>   * Schedule IO completion handling on the final put of an ioend.
>   *
> @@ -182,8 +193,7 @@ xfs_finish_ioend(
>  
>               if (ioend->io_type == XFS_IO_UNWRITTEN)
>                       queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
> -             else if (ioend->io_append_trans ||
> -                      (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
> +             else if (ioend->io_append_trans)
>                       queue_work(mp->m_data_workqueue, &ioend->io_work);
>               else
>                       xfs_destroy_ioend(ioend);
> @@ -215,22 +225,8 @@ xfs_end_io(
>       if (ioend->io_type == XFS_IO_UNWRITTEN) {
>               error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
>                                                 ioend->io_size);
> -     } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
> -             /*
> -              * For direct I/O we do not know if we need to allocate blocks
> -              * or not so we can't preallocate an append transaction as that
> -              * results in nested reservations and log space deadlocks. Hence
> -              * allocate the transaction here. While this is sub-optimal and
> -              * can block IO completion for some time, we're stuck with doing
> -              * it this way until we can pass the ioend to the direct IO
> -              * allocation callbacks and avoid nesting that way.
> -              */
> -             error = xfs_setfilesize_trans_alloc(ioend);
> -             if (error)
> -                     goto done;
> -             error = xfs_setfilesize(ioend);
>       } else if (ioend->io_append_trans) {
> -             error = xfs_setfilesize(ioend);
> +             error = xfs_setfilesize_ioend(ioend);
>       } else {
>               ASSERT(!xfs_ioend_is_append(ioend));
>       }
> @@ -273,7 +269,6 @@ xfs_alloc_ioend(
>        * all the I/O from calling the completion routine too early.
>        */
>       atomic_set(&ioend->io_remaining, 1);
> -     ioend->io_isdirect = 0;
>       ioend->io_error = 0;
>       ioend->io_list = NULL;
>       ioend->io_type = type;
> @@ -1459,11 +1454,7 @@ xfs_get_blocks_direct(
>   *
>   * If the private argument is non-NULL __xfs_get_blocks signals us that we
>   * need to issue a transaction to convert the range from unwritten to written
> - * extents.  In case this is regular synchronous I/O we just call xfs_end_io
> - * to do this and we are done.  But in case this was a successful AIO
> - * request this handler is called from interrupt context, from which we
> - * can't start transactions.  In that case offload the I/O completion to
> - * the workqueues we also use for buffered I/O completion.
> + * extents.
>   */
>  STATIC void
>  xfs_end_io_direct_write(
> @@ -1472,7 +1463,12 @@ xfs_end_io_direct_write(
>       ssize_t                 size,
>       void                    *private)
>  {
> -     struct xfs_ioend        *ioend = iocb->private;
> +     struct inode            *inode = file_inode(iocb->ki_filp);
> +     struct xfs_inode        *ip = XFS_I(inode);
> +     struct xfs_mount        *mp = ip->i_mount;
> +
> +     if (XFS_FORCED_SHUTDOWN(mp))
> +             return;
>  
>       /*
>        * While the generic direct I/O code updates the inode size, it does
> @@ -1480,22 +1476,33 @@ xfs_end_io_direct_write(
>        * end_io handler thinks the on-disk size is outside the in-core
>        * size.  To prevent this just update it a little bit earlier here.
>        */
> -     if (offset + size > i_size_read(ioend->io_inode))
> -             i_size_write(ioend->io_inode, offset + size);
> +     if (offset + size > i_size_read(inode))
> +             i_size_write(inode, offset + size);
>  
>       /*
> -      * blockdev_direct_IO can return an error even after the I/O
> -      * completion handler was called.  Thus we need to protect
> -      * against double-freeing.
> +      * For direct I/O we do not know if we need to allocate blocks or not,
> +      * so we can't preallocate an append transaction, as that results in
> +      * nested reservations and log space deadlocks. Hence allocate the
> +      * transaction here. While this is sub-optimal and can block IO
> +      * completion for some time, we're stuck with doing it this way until
> +      * we can pass the ioend to the direct IO allocation callbacks and
> +      * avoid nesting that way.
>        */
> -     iocb->private = NULL;
> -
> -     ioend->io_offset = offset;
> -     ioend->io_size = size;
> -     if (private && size > 0)
> -             ioend->io_type = XFS_IO_UNWRITTEN;
> +     if (private && size > 0) {
> +             xfs_iomap_write_unwritten(ip, offset, size);
> +     } else if (offset + size > ip->i_d.di_size) {
> +             struct xfs_trans        *tp;
> +             int                     error;
> +
> +             tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
> +             error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
> +             if (error) {
> +                     xfs_trans_cancel(tp, 0);
> +                     return;
> +             }
>  
> -     xfs_finish_ioend_sync(ioend);

I get an unused function warning for xfs_finish_ioend_sync() now, so I
guess we should kill that.

> +             xfs_setfilesize(ip, tp, offset, size);
> +     }
>  }
>  
>  STATIC ssize_t
> @@ -1507,39 +1514,17 @@ xfs_vm_direct_IO(
>  {
>       struct inode            *inode = iocb->ki_filp->f_mapping->host;
>       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
> -     struct xfs_ioend        *ioend = NULL;
> -     ssize_t                 ret;
>  
>       if (rw & WRITE) {

A nit, but I guess you could kill the braces here now too.

Brian

> -             size_t size = iov_iter_count(iter);
> -
> -             /*
> -              * We cannot preallocate a size update transaction here as we
> -              * don't know whether allocation is necessary or not. Hence we
> -              * can only tell IO completion that one is necessary if we are
> -              * not doing unwritten extent conversion.
> -              */
> -             iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
> -             if (offset + size > XFS_I(inode)->i_d.di_size)
> -                     ioend->io_isdirect = 1;
> -
> -             ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
> +             return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
>                                           offset, xfs_get_blocks_direct,
>                                           xfs_end_io_direct_write, NULL,
>                                           DIO_ASYNC_EXTEND);
> -             if (ret != -EIOCBQUEUED && iocb->private)
> -                     goto out_destroy_ioend;
>       } else {
> -             ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
> +             return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
>                                           offset, xfs_get_blocks_direct,
>                                           NULL, NULL, 0);
>       }
> -
> -     return ret;
> -
> -out_destroy_ioend:
> -     xfs_destroy_ioend(ioend);
> -     return ret;
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index f94dd45..ac644e0 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
>   * Types of I/O for bmap clustering and I/O completion tracking.
>   */
>  enum {
> -     XFS_IO_DIRECT = 0,      /* special case for direct I/O ioends */
>       XFS_IO_DELALLOC,        /* covers delalloc region */
>       XFS_IO_UNWRITTEN,       /* covers allocated but uninitialized data */
>       XFS_IO_OVERWRITE,       /* covers already allocated extent */
>  };
>  
>  #define XFS_IO_TYPES \
> -     { 0,                    "" }, \
>       { XFS_IO_DELALLOC,              "delalloc" }, \
>       { XFS_IO_UNWRITTEN,             "unwritten" }, \
>       { XFS_IO_OVERWRITE,             "overwrite" }
> @@ -45,7 +43,6 @@ typedef struct xfs_ioend {
>       unsigned int            io_type;        /* delalloc / unwritten */
>       int                     io_error;       /* I/O error code */
>       atomic_t                io_remaining;   /* hold count */
> -     unsigned int            io_isdirect : 1;/* direct I/O */
>       struct inode            *io_inode;      /* file being written to */
>       struct buffer_head      *io_buffer_head;/* buffer linked list head */
>       struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
> -- 
> 1.9.1
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs

<Prev in Thread] Current Thread [Next in Thread>