xfs
[Top] [All Lists]

[PATCH 7/8] xfs: split direct I/O and DAX path

To: xfs@xxxxxxxxxxx
Subject: [PATCH 7/8] xfs: split direct I/O and DAX path
From: Christoph Hellwig <hch@xxxxxx>
Date: Wed, 22 Jun 2016 17:27:15 +0200
Cc: linux-nvdimm@xxxxxxxxxxx, linux-fsdevel@xxxxxxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <1466609236-23801-1-git-send-email-hch@xxxxxx>
References: <1466609236-23801-1-git-send-email-hch@xxxxxx>
So far the DAX code overloaded the direct I/O code path.  There is very little
in common between the two, and untangling them allows to clean up both variants.

As a Ñide effect we also get separate trace points for both I/O types.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
---
 fs/xfs/xfs_file.c  | 139 ++++++++++++++++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_trace.h |   2 +
 2 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 24b267d..0e74325 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -262,13 +262,11 @@ xfs_file_dio_aio_read(
        else
                target = ip->i_mount->m_ddev_targp;
 
-       if (!IS_DAX(inode)) {
-               /* DIO must be aligned to device logical sector size */
-               if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
-                       if (iocb->ki_pos == isize)
-                               return 0;
-                       return -EINVAL;
-               }
+       /* DIO must be aligned to device logical sector size */
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+               if (iocb->ki_pos == isize)
+                       return 0;
+               return -EINVAL;
        }
 
        /*
@@ -317,13 +315,37 @@ xfs_file_dio_aio_read(
        }
 
        data = *to;
-       if (IS_DAX(inode)) {
-               ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-                               NULL, 0);
-       } else {
-               ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-                               xfs_get_blocks_direct, NULL, NULL, 0);
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, NULL, NULL, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(to, ret);
        }
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       file_accessed(iocb->ki_filp);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_read(
+       struct kiocb            *iocb,
+       struct iov_iter         *to)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct iov_iter         data = *to;
+       size_t                  count = iov_iter_count(to);
+       ssize_t                 ret = 0;
+
+       trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+       if (!count)
+               return 0; /* skip atime */
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
        if (ret > 0) {
                iocb->ki_pos += ret;
                iov_iter_advance(to, ret);
@@ -356,7 +378,8 @@ xfs_file_read_iter(
        struct kiocb            *iocb,
        struct iov_iter         *to)
 {
-       struct xfs_mount        *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount;
+       struct inode            *inode = file_inode(iocb->ki_filp);
+       struct xfs_mount        *mp = XFS_I(inode)->i_mount;
        ssize_t                 ret = 0;
 
        XFS_STATS_INC(mp, xs_read_calls);
@@ -364,7 +387,9 @@ xfs_file_read_iter(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       if (iocb->ki_flags & IOCB_DIRECT)
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_read(iocb, to);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_read(iocb, to);
        else
                ret = xfs_file_buffered_aio_read(iocb, to);
@@ -586,8 +611,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if (!IS_DAX(inode) &&
-           ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+       if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -656,14 +680,9 @@ xfs_file_dio_aio_write(
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
        data = *from;
-       if (IS_DAX(inode)) {
-               ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
-                               xfs_end_io_direct_write, 0);
-       } else {
-               ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
-                               xfs_get_blocks_direct, xfs_end_io_direct_write,
-                               NULL, DIO_ASYNC_EXTEND);
-       }
+       ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+                       xfs_get_blocks_direct, xfs_end_io_direct_write,
+                       NULL, DIO_ASYNC_EXTEND);
 
        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
@@ -680,10 +699,70 @@ out:
        xfs_rw_iunlock(ip, iolock);
 
        /*
-        * No fallback to buffered IO on errors for XFS. DAX can result in
-        * partial writes, but direct IO will either complete fully or fail.
+        * No fallback to buffered IO on errors for XFS, direct IO will either
+        * complete fully or fail.
+        */
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_dax_write(
+       struct kiocb            *iocb,
+       struct iov_iter         *from)
+{
+       struct address_space    *mapping = iocb->ki_filp->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       int                     unaligned_io = 0;
+       int                     iolock;
+       struct iov_iter         data;
+
+       /* "unaligned" here means not aligned to a filesystem block */
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+               unaligned_io = 1;
+               iolock = XFS_IOLOCK_EXCL;
+       } else if (mapping->nrpages) {
+               iolock = XFS_IOLOCK_EXCL;
+       } else {
+               iolock = XFS_IOLOCK_SHARED;
+       }
+       xfs_rw_ilock(ip, iolock);
+
+       ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+       if (ret)
+               goto out;
+
+       /*
+        * Yes, even DAX files can have page cache attached to them:  A zeroed
+        * page is inserted into the pagecache when we have to serve a write
+        * fault on a hole.  It should never be dirtied and can simply be
+        * dropped from the pagecache once we get real data for the page.
         */
-       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+       if (mapping->nrpages) {
+               ret = invalidate_inode_pages2(mapping);
+               WARN_ON_ONCE(ret);
+       }
+
+       if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+       data = *from;
+       ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+                       xfs_end_io_direct_write, 0);
+       if (ret > 0) {
+               iocb->ki_pos += ret;
+               iov_iter_advance(from, ret);
+       }
+out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
 
@@ -765,7 +844,9 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+       if (IS_DAX(inode))
+               ret = xfs_file_dax_write(iocb, from);
+       else if (iocb->ki_flags & IOCB_DIRECT)
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2504f94..1451690 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name,       \
        TP_ARGS(ip, count, offset))
 DEFINE_RW_EVENT(xfs_file_buffered_read);
 DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
-- 
2.1.4

<Prev in Thread] Current Thread [Next in Thread>