xfs
[Top] [All Lists]

Re: [PATCH 7/8] xfs: split direct I/O and DAX path

To: Christoph Hellwig <hch@xxxxxx>
Subject: Re: [PATCH 7/8] xfs: split direct I/O and DAX path
From: "Darrick J. Wong" <darrick.wong@xxxxxxxxxx>
Date: Wed, 28 Sep 2016 19:53:52 -0700
Cc: xfs@xxxxxxxxxxx, linux-fsdevel@xxxxxxxxxxxxxxx, linux-nvdimm@xxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <1466609236-23801-8-git-send-email-hch@xxxxxx>
References: <1466609236-23801-1-git-send-email-hch@xxxxxx> <1466609236-23801-8-git-send-email-hch@xxxxxx>
User-agent: Mutt/1.5.24 (2015-08-30)
On Wed, Jun 22, 2016 at 05:27:15PM +0200, Christoph Hellwig wrote:
> So far the DAX code overloaded the direct I/O code path.  There is very little
> in common between the two, and untangling them allows to clean up both 
> variants.
> 
> As a Ñide effect we also get separate trace points for both I/O types.
> 
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  fs/xfs/xfs_file.c  | 139 
> ++++++++++++++++++++++++++++++++++++++++++-----------
>  fs/xfs/xfs_trace.h |   2 +
>  2 files changed, 112 insertions(+), 29 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 24b267d..0e74325 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -262,13 +262,11 @@ xfs_file_dio_aio_read(
>       else
>               target = ip->i_mount->m_ddev_targp;
>  
> -     if (!IS_DAX(inode)) {
> -             /* DIO must be aligned to device logical sector size */
> -             if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
> -                     if (iocb->ki_pos == isize)
> -                             return 0;
> -                     return -EINVAL;
> -             }
> +     /* DIO must be aligned to device logical sector size */
> +     if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
> +             if (iocb->ki_pos == isize)
> +                     return 0;
> +             return -EINVAL;
>       }
>  
>       /*
> @@ -317,13 +315,37 @@ xfs_file_dio_aio_read(
>       }
>  
>       data = *to;
> -     if (IS_DAX(inode)) {
> -             ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
> -                             NULL, 0);
> -     } else {
> -             ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> -                             xfs_get_blocks_direct, NULL, NULL, 0);
> +     ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> +                     xfs_get_blocks_direct, NULL, NULL, 0);
> +     if (ret > 0) {
> +             iocb->ki_pos += ret;
> +             iov_iter_advance(to, ret);
>       }
> +     xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
> +
> +     file_accessed(iocb->ki_filp);

So I noticed that generic/323 starts crashing in file_accessed -> touch_atime
because iocb->ki_filp->f_path.dentry == NULL.  For a while I thought it was
some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8
kernel and that blew up here too.  I'm not sure why this line got inserted
here, since it wasn't there prior to this patch, AFAICT.

<shrug>

--D

> +     return ret;
> +}
> +
> +STATIC ssize_t
> +xfs_file_dax_read(
> +     struct kiocb            *iocb,
> +     struct iov_iter         *to)
> +{
> +     struct address_space    *mapping = iocb->ki_filp->f_mapping;
> +     struct inode            *inode = mapping->host;
> +     struct xfs_inode        *ip = XFS_I(inode);
> +     struct iov_iter         data = *to;
> +     size_t                  count = iov_iter_count(to);
> +     ssize_t                 ret = 0;
> +
> +     trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
> +
> +     if (!count)
> +             return 0; /* skip atime */
> +
> +     xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
> +     ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
>       if (ret > 0) {
>               iocb->ki_pos += ret;
>               iov_iter_advance(to, ret);
> @@ -356,7 +378,8 @@ xfs_file_read_iter(
>       struct kiocb            *iocb,
>       struct iov_iter         *to)
>  {
> -     struct xfs_mount        *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
> +     struct inode            *inode = file_inode(iocb->ki_filp);
> +     struct xfs_mount        *mp = XFS_I(inode)->i_mount;
>       ssize_t                 ret = 0;
>  
>       XFS_STATS_INC(mp, xs_read_calls);
> @@ -364,7 +387,9 @@ xfs_file_read_iter(
>       if (XFS_FORCED_SHUTDOWN(mp))
>               return -EIO;
>  
> -     if (iocb->ki_flags & IOCB_DIRECT)
> +     if (IS_DAX(inode))
> +             ret = xfs_file_dax_read(iocb, to);
> +     else if (iocb->ki_flags & IOCB_DIRECT)
>               ret = xfs_file_dio_aio_read(iocb, to);
>       else
>               ret = xfs_file_buffered_aio_read(iocb, to);
> @@ -586,8 +611,7 @@ xfs_file_dio_aio_write(
>                                       mp->m_rtdev_targp : mp->m_ddev_targp;
>  
>       /* DIO must be aligned to device logical sector size */
> -     if (!IS_DAX(inode) &&
> -         ((iocb->ki_pos | count) & target->bt_logical_sectormask))
> +     if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
>               return -EINVAL;
>  
>       /* "unaligned" here means not aligned to a filesystem block */
> @@ -656,14 +680,9 @@ xfs_file_dio_aio_write(
>       trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
>  
>       data = *from;
> -     if (IS_DAX(inode)) {
> -             ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
> -                             xfs_end_io_direct_write, 0);
> -     } else {
> -             ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> -                             xfs_get_blocks_direct, xfs_end_io_direct_write,
> -                             NULL, DIO_ASYNC_EXTEND);
> -     }
> +     ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
> +                     xfs_get_blocks_direct, xfs_end_io_direct_write,
> +                     NULL, DIO_ASYNC_EXTEND);
>  
>       /* see generic_file_direct_write() for why this is necessary */
>       if (mapping->nrpages) {
> @@ -680,10 +699,70 @@ out:
>       xfs_rw_iunlock(ip, iolock);
>  
>       /*
> -      * No fallback to buffered IO on errors for XFS. DAX can result in
> -      * partial writes, but direct IO will either complete fully or fail.
> +      * No fallback to buffered IO on errors for XFS, direct IO will either
> +      * complete fully or fail.
> +      */
> +     ASSERT(ret < 0 || ret == count);
> +     return ret;
> +}
> +
> +STATIC ssize_t
> +xfs_file_dax_write(
> +     struct kiocb            *iocb,
> +     struct iov_iter         *from)
> +{
> +     struct address_space    *mapping = iocb->ki_filp->f_mapping;
> +     struct inode            *inode = mapping->host;
> +     struct xfs_inode        *ip = XFS_I(inode);
> +     struct xfs_mount        *mp = ip->i_mount;
> +     ssize_t                 ret = 0;
> +     int                     unaligned_io = 0;
> +     int                     iolock;
> +     struct iov_iter         data;
> +
> +     /* "unaligned" here means not aligned to a filesystem block */
> +     if ((iocb->ki_pos & mp->m_blockmask) ||
> +         ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
> +             unaligned_io = 1;
> +             iolock = XFS_IOLOCK_EXCL;
> +     } else if (mapping->nrpages) {
> +             iolock = XFS_IOLOCK_EXCL;
> +     } else {
> +             iolock = XFS_IOLOCK_SHARED;
> +     }
> +     xfs_rw_ilock(ip, iolock);
> +
> +     ret = xfs_file_aio_write_checks(iocb, from, &iolock);
> +     if (ret)
> +             goto out;
> +
> +     /*
> +      * Yes, even DAX files can have page cache attached to them:  A zeroed
> +      * page is inserted into the pagecache when we have to serve a write
> +      * fault on a hole.  It should never be dirtied and can simply be
> +      * dropped from the pagecache once we get real data for the page.
>        */
> -     ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
> +     if (mapping->nrpages) {
> +             ret = invalidate_inode_pages2(mapping);
> +             WARN_ON_ONCE(ret);
> +     }
> +
> +     if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
> +             xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
> +             iolock = XFS_IOLOCK_SHARED;
> +     }
> +
> +     trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
> +
> +     data = *from;
> +     ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
> +                     xfs_end_io_direct_write, 0);
> +     if (ret > 0) {
> +             iocb->ki_pos += ret;
> +             iov_iter_advance(from, ret);
> +     }
> +out:
> +     xfs_rw_iunlock(ip, iolock);
>       return ret;
>  }
>  
> @@ -765,7 +844,9 @@ xfs_file_write_iter(
>       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
>               return -EIO;
>  
> -     if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
> +     if (IS_DAX(inode))
> +             ret = xfs_file_dax_write(iocb, from);
> +     else if (iocb->ki_flags & IOCB_DIRECT)
>               ret = xfs_file_dio_aio_write(iocb, from);
>       else
>               ret = xfs_file_buffered_aio_write(iocb, from);
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 2504f94..1451690 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name,     \
>       TP_ARGS(ip, count, offset))
>  DEFINE_RW_EVENT(xfs_file_buffered_read);
>  DEFINE_RW_EVENT(xfs_file_direct_read);
> +DEFINE_RW_EVENT(xfs_file_dax_read);
>  DEFINE_RW_EVENT(xfs_file_buffered_write);
>  DEFINE_RW_EVENT(xfs_file_direct_write);
> +DEFINE_RW_EVENT(xfs_file_dax_write);
>  DEFINE_RW_EVENT(xfs_file_splice_read);
>  
>  DECLARE_EVENT_CLASS(xfs_page_class,
> -- 
> 2.1.4
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs

<Prev in Thread] Current Thread [Next in Thread>