xfs
[Top] [All Lists]

[PATCH 20/24] xfs: emulate the btrfs dedupe extent same ioctl

To: david@xxxxxxxxxxxxx, darrick.wong@xxxxxxxxxx
Subject: [PATCH 20/24] xfs: emulate the btrfs dedupe extent same ioctl
From: "Darrick J. Wong" <darrick.wong@xxxxxxxxxx>
Date: Wed, 29 Jul 2015 15:35:06 -0700
Cc: xfs@xxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <20150729223258.17414.91354.stgit@xxxxxxxxxxxxxxxx>
References: <20150729223258.17414.91354.stgit@xxxxxxxxxxxxxxxx>
User-agent: StGit/0.17.1-dirty
Emulate the BTRFS_IOC_FILE_EXTENT_SAME ioctl.  This operation is similar
to clone_range, but the kernel must confirm that the contents of the
two extents are identical before performing the reflink.

Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
---
 fs/xfs/libxfs/xfs_fs.h |   26 ++++++++++
 fs/xfs/xfs_ioctl.c     |  123 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_ioctl32.c   |    1 
 fs/xfs/xfs_reflink.c   |  120 ++++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_reflink.h   |    6 ++
 5 files changed, 270 insertions(+), 6 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 22a0451..2951abb 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -569,8 +569,34 @@ struct xfs_ioctl_clone_range_args {
        __u64 dest_offset;
 };
 
+#define XFS_SAME_DATA_DIFFERS  1       /* status value: contents differ */
+/* Per-destination record for the extent-same ioctl */
+struct xfs_ioctl_file_extent_same_info {
+       __s64 fd;               /* in - destination file */
+       __u64 logical_offset;   /* in - start of extent in destination */
+       __u64 bytes_deduped;    /* out - total # of bytes we were able
+                                * to dedupe from this file */
+       /* status of this dedupe operation:
+        * 0 if dedup succeeds
+        * < 0 for error (a negative errno)
+        * == XFS_SAME_DATA_DIFFERS if data differs
+        */
+       __s32 status;           /* out - see above description */
+       __u32 reserved;         /* not examined by the handler in this patch */
+};
+/* Argument header for XFS_IOC_FILE_EXTENT_SAME; info[] records follow it */
+struct xfs_ioctl_file_extent_same_args {
+       __u64 logical_offset;   /* in - start of extent in source */
+       __u64 length;           /* in - length of extent; capped at 16MiB */
+       __u16 dest_count;       /* in - total elements in info array */
+       __u16 reserved1;        /* not examined by the handler */
+       __u32 reserved2;        /* not examined by the handler */
+       struct xfs_ioctl_file_extent_same_info info[0]; /* dest_count records */
+};
+
 #define XFS_IOC_CLONE           _IOW (0x94, 9, int)
 #define XFS_IOC_CLONE_RANGE     _IOW (0x94, 13, struct xfs_ioctl_clone_range_args)
+#define XFS_IOC_FILE_EXTENT_SAME _IOWR(0x94, 54, struct xfs_ioctl_file_extent_same_args)
 
 #ifndef HAVE_BBMACROS
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d93adfa..ce882aa 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1541,7 +1541,8 @@ xfs_ioctl_reflink(
        loff_t          pos_in,
        struct file     *file_out,
        loff_t          pos_out,
-       size_t          len)
+       size_t          len,
+       bool            is_dedupe)
 {
        struct inode    *inode_in;
        struct inode    *inode_out;
@@ -1550,6 +1551,7 @@ xfs_ioctl_reflink(
        loff_t          isize;
        int             same_inode;
        loff_t          blen;
+       unsigned int    flags;
 
        if (len == 0)
                return 0;
@@ -1629,8 +1631,12 @@ xfs_ioctl_reflink(
        if (ret)
                goto out_unlock;
 
+       flags = 0;
+       if (is_dedupe)
+               flags |= XFS_REFLINK_DEDUPE;
+
        ret = xfs_reflink(XFS_I(inode_in), pos_in, XFS_I(inode_out),
-                       pos_out, len);
+                       pos_out, len, flags);
        if (ret < 0)
                goto out_unlock;
 
@@ -1652,6 +1658,129 @@ out_unlock:
        return ret;
 }
 
+#define XFS_MAX_DEDUPE_LEN     (16 * 1024 * 1024)
+
+/*
+ * Handle XFS_IOC_FILE_EXTENT_SAME.
+ *
+ * @file is the source file; @argp points to the userspace argument header
+ * followed by dest_count per-destination records.  Each destination is
+ * passed to xfs_ioctl_reflink() in dedupe mode and the outcome is stored in
+ * that record's status and bytes_deduped fields.  A failure on one
+ * destination is reported only through its status; it neither stops the
+ * loop nor fails the ioctl.
+ *
+ * Returns 0 once the results have been copied back to userspace, or a
+ * negative errno if the arguments cannot be copied in or out, or if the
+ * source is not opened for reading or is not a regular file.
+ */
+static long
+xfs_ioctl_file_extent_same(
+       struct file                                     *file,
+       struct xfs_ioctl_file_extent_same_args __user   *argp)
+{
+       struct xfs_ioctl_file_extent_same_args  *same;
+       struct xfs_ioctl_file_extent_same_info  *info;
+       struct inode                            *src;
+       u64                                     off;
+       u64                                     len;
+       int                                     i;
+       int                                     ret;
+       unsigned long                           size;
+       bool                                    is_admin;
+       u16                                     count;
+
+       is_admin = capable(CAP_SYS_ADMIN);
+       src = file_inode(file);
+       if (!(file->f_mode & FMODE_READ))
+               return -EINVAL;
+
+       /* Nothing has been allocated yet, so fail with a plain return. */
+       if (get_user(count, &argp->dest_count))
+               return -EFAULT;
+
+       size = offsetof(struct xfs_ioctl_file_extent_same_args __user,
+                       info[count]);
+
+       same = memdup_user(argp, size);
+       if (IS_ERR(same))
+               return PTR_ERR(same);
+
+       off = same->logical_offset;
+       len = same->length;
+
+       /*
+        * Limit the total length we will dedupe for each operation.
+        * This is intended to bound the total time spent in this
+        * ioctl to something sane.
+        */
+       if (len > XFS_MAX_DEDUPE_LEN)
+               len = XFS_MAX_DEDUPE_LEN;
+
+       ret = -EISDIR;
+       if (S_ISDIR(src->i_mode))
+               goto out;
+
+       ret = -EACCES;
+       if (!S_ISREG(src->i_mode))
+               goto out;
+
+       /* pre-format output fields to sane values */
+       for (i = 0; i < count; i++) {
+               same->info[i].bytes_deduped = 0ULL;
+               same->info[i].status = 0;
+       }
+
+       for (i = 0, info = same->info; i < count; i++, info++) {
+               struct inode *dst;
+               struct fd dst_file = fdget(info->fd);
+               if (!dst_file.file) {
+                       info->status = -EBADF;
+                       continue;
+               }
+               dst = file_inode(dst_file.file);
+
+               trace_xfs_ioctl_file_extent_same(file_inode(file), off, len,
+                               dst, info->logical_offset);
+
+               info->bytes_deduped = 0;
+               if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
+                       info->status = -EINVAL;
+               } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+                       info->status = -EXDEV;
+               } else if (S_ISDIR(dst->i_mode)) {
+                       info->status = -EISDIR;
+               } else if (!S_ISREG(dst->i_mode)) {
+                       info->status = -EACCES;
+               } else {
+                       info->status = xfs_ioctl_reflink(file, off,
+                                                        dst_file.file,
+                                                        info->logical_offset,
+                                                        len, true);
+                       if (info->status == -EBADE) {
+                               info->status = XFS_SAME_DATA_DIFFERS;
+                       } else if (info->status >= 0) {
+                               /*
+                                * The other xfs_ioctl_reflink() callers treat
+                                * a positive return as success; do the same.
+                                */
+                               info->status = 0;
+                               info->bytes_deduped = len;
+                       }
+               }
+               fdput(dst_file);
+       }
+
+       ret = copy_to_user(argp, same, size);
+       if (ret)
+               ret = -EFAULT;
+
+out:
+       /* same came from memdup_user(), which allocates with kmalloc */
+       kfree(same);
+       return ret;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -1959,7 +2070,7 @@ xfs_file_ioctl(
 
                trace_xfs_ioctl_clone(file_inode(src.file), file_inode(filp));
 
-               error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL);
+               error = xfs_ioctl_reflink(src.file, 0, filp, 0, ~0ULL, false);
                fdput(src);
                if (error > 0)
                        error = 0;
@@ -1984,7 +2095,8 @@ xfs_file_ioctl(
                                file_inode(filp), args.dest_offset);
 
                error = xfs_ioctl_reflink(src.file, args.src_offset, filp,
-                                         args.dest_offset, args.src_length);
+                                         args.dest_offset, args.src_length,
+                                         false);
                fdput(src);
                if (error > 0)
                        error = 0;
@@ -1992,6 +2104,9 @@ xfs_file_ioctl(
                return error;
        }
 
+       case XFS_IOC_FILE_EXTENT_SAME:
+               return xfs_ioctl_file_extent_same(filp, arg);
+
        default:
                return -ENOTTY;
        }
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 76d8729..575c292 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -560,6 +560,7 @@ xfs_file_compat_ioctl(
        case XFS_IOC_ERROR_CLEARALL:
        case XFS_IOC_CLONE:
        case XFS_IOC_CLONE_RANGE:
+       case XFS_IOC_FILE_EXTENT_SAME:
                return xfs_file_ioctl(filp, cmd, p);
 #ifndef BROKEN_X86_ALIGNMENT
        /* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 7605519..f2086f6b 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1370,6 +1370,122 @@ advloop:
 }
 #undef IMAPNEXT
 
+/*
+ * Read a page's worth of file data into the page cache.
+ *
+ * Returns the page with a reference held (the caller drops it with
+ * page_cache_release()), the ERR_PTR() from read_mapping_page() if the read
+ * failed, or NULL if the page came back but is not uptodate.
+ */
+STATIC struct page *
+xfs_get_page(
+       struct inode    *inode,         /* inode */
+       xfs_off_t       offset)         /* where in the inode to read */
+{
+       struct address_space    *mapping;
+       struct page             *page;
+       pgoff_t                 n;
+
+       n = offset >> PAGE_CACHE_SHIFT;
+       mapping = inode->i_mapping;
+       page = read_mapping_page(mapping, n, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               page_cache_release(page);
+               return NULL;
+       }
+       return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ *
+ * The two byte ranges are walked in lockstep; each step compares at most up
+ * to the next page boundary of either file, and the walk stops at the first
+ * mismatch.
+ *
+ * Returns 0 and sets *is_same if every page could be read.  Otherwise
+ * returns a negative errno (the page read error, or -EINVAL if a page was
+ * not uptodate) and leaves *is_same untouched.
+ */
+STATIC int
+xfs_compare_extents(
+       struct inode    *src,           /* first inode */
+       xfs_off_t       srcoff,         /* offset of first inode */
+       struct inode    *dest,          /* second inode */
+       xfs_off_t       destoff,        /* offset of second inode */
+       xfs_off_t       len,            /* length of data to compare */
+       bool            *is_same)       /* out: true if the contents match */
+{
+       xfs_off_t       src_poff;
+       xfs_off_t       dest_poff;
+       void            *src_addr;
+       void            *dest_addr;
+       struct page     *src_page;
+       struct page     *dest_page;
+       xfs_off_t       cmp_len;
+       bool            same;
+       int             error;
+
+       error = -EINVAL;
+       same = true;
+       while (len) {
+               src_poff = srcoff & (PAGE_CACHE_SIZE - 1);
+               dest_poff = destoff & (PAGE_CACHE_SIZE - 1);
+               cmp_len = min(PAGE_CACHE_SIZE - src_poff,
+                             PAGE_CACHE_SIZE - dest_poff);
+               cmp_len = min(cmp_len, len);
+               ASSERT(cmp_len > 0);
+
+               trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
+                               XFS_I(dest), destoff);
+
+               src_page = xfs_get_page(src, srcoff);
+               if (IS_ERR_OR_NULL(src_page)) {
+                       /* may be an ERR_PTR, not only NULL */
+                       if (IS_ERR(src_page))
+                               error = PTR_ERR(src_page);
+                       goto out_error;
+               }
+               dest_page = xfs_get_page(dest, destoff);
+               if (IS_ERR_OR_NULL(dest_page)) {
+                       if (IS_ERR(dest_page))
+                               error = PTR_ERR(dest_page);
+                       page_cache_release(src_page);
+                       goto out_error;
+               }
+               src_addr = kmap_atomic(src_page);
+               dest_addr = kmap_atomic(dest_page);
+
+               flush_dcache_page(src_page);
+               flush_dcache_page(dest_page);
+
+               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+                       same = false;
+
+               /* atomic kmaps nest like a stack; unmap in reverse order */
+               kunmap_atomic(dest_addr);
+               kunmap_atomic(src_addr);
+               page_cache_release(src_page);
+               page_cache_release(dest_page);
+
+               if (!same)
+                       break;
+
+               srcoff += cmp_len;
+               destoff += cmp_len;
+               len -= cmp_len;
+       }
+
+       *is_same = same;
+       return 0;
+
+out_error:
+       trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
+       return error;
+}
+
 /**
  * xfs_reflink() - link a range of blocks from one inode to another
  *
@@ -1378,6 +1475,7 @@ advloop:
  * @dest: Inode to clone to
  * @destoff: Offset within @inode to start clone
  * @len: Original length, passed by user, of range to clone
+ * @flags: Flags to modify reflink's behavior
  */
 int
 xfs_reflink(
@@ -1385,12 +1483,14 @@ xfs_reflink(
        xfs_off_t               srcoff,
        struct xfs_inode        *dest,
        xfs_off_t               destoff,
-       xfs_off_t               len)
+       xfs_off_t               len,
+       unsigned int            flags)
 {
        struct xfs_mount        *mp = src->i_mount;
        xfs_fileoff_t           sfsbno, dfsbno;
        xfs_filblks_t           fsblen;
        int                     error;
+       bool                    is_same;
 
        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return -EOPNOTSUPP;
@@ -1402,6 +1502,9 @@ xfs_reflink(
        if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
                return -EINVAL;
 
+       if (flags & ~XFS_REFLINK_ALL)
+               return -EINVAL;
+
        trace_xfs_reflink_range(src, srcoff, len, dest, destoff);
 
        /* Lock both files against IO */
@@ -1413,6 +1516,21 @@ xfs_reflink(
                xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
        }
 
+       /*
+        * Check that the extents are the same.
+        */
+       if (flags & XFS_REFLINK_DEDUPE) {
+               is_same = false;
+               error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
+                               destoff, len, &is_same);
+               if (error)
+                       goto out_error;
+               if (!is_same) {
+                       error = -EBADE;
+                       goto out_error;
+               }
+       }
+
        error = set_inode_reflink_flag(src, dest);
        if (error)
                goto out_error;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index b633824..c60a9bd 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -44,7 +44,11 @@ extern int xfs_reflink_finish_fork_buf(struct xfs_inode *ip, struct xfs_buf *bp,
                xfs_fileoff_t fileoff, struct xfs_trans *tp, int write_error,
                xfs_fsblock_t old_fsbno);
 
+#define XFS_REFLINK_DEDUPE     1       /* only reflink if contents match */
+#define XFS_REFLINK_ALL                (XFS_REFLINK_DEDUPE)
+
 extern int xfs_reflink(struct xfs_inode *src, xfs_off_t srcoff,
-               struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len);
+               struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
+               unsigned int flags);
 
 #endif /* __XFS_REFLINK_H */

<Prev in Thread] Current Thread [Next in Thread>