On Sat, Dec 19, 2015 at 01:03:07AM -0800, Darrick J. Wong wrote:
> For O_DIRECT writes to shared blocks, we have to CoW them just like
> we would with buffered writes. For writes that are not block-aligned,
> just bounce them to the page cache.
>
> For block-aligned writes, however, we can do better than that. Use
> the same mechanisms that we employ for buffered CoW to set up a
> delalloc reservation, allocate all the blocks at once, issue the
> writes against the new blocks and use the same ioend functions to
> remap the blocks after the write. This should be fairly performant.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> ---
> fs/xfs/xfs_aops.c | 63 +++++++++++++++++++++++++---
> fs/xfs/xfs_file.c | 12 ++++-
> fs/xfs/xfs_reflink.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/xfs/xfs_reflink.h | 5 ++
> 4 files changed, 186 insertions(+), 8 deletions(-)
>
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 8101d6a..4b77d07 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1339,7 +1339,8 @@ xfs_map_direct(
> struct buffer_head *bh_result,
> struct xfs_bmbt_irec *imap,
> xfs_off_t offset,
> - bool dax_fault)
> + bool dax_fault,
> + bool is_cow)
> {
> struct xfs_ioend *ioend;
> xfs_off_t size = bh_result->b_size;
> @@ -1368,20 +1369,23 @@ xfs_map_direct(
>
> if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
> ioend->io_type = XFS_IO_UNWRITTEN;
> + if (is_cow)
> + ioend->io_flags |= XFS_IOEND_COW;
>
> trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
> ioend->io_size, ioend->io_type,
> imap);
> - } else if (type == XFS_IO_UNWRITTEN ||
> + } else if (type == XFS_IO_UNWRITTEN || is_cow ||
> offset + size > i_size_read(inode) ||
> offset + size < 0) {
> ioend = xfs_alloc_ioend(inode, type);
> ioend->io_offset = offset;
> ioend->io_size = size;
> + if (is_cow)
> + ioend->io_flags |= XFS_IOEND_COW;
NAK. Further testing shows that blocks are remapped incorrectly when a DIO CoW
write fails, because xfs_end_io_direct_write doesn't know whether the write
succeeded. xfs_vm_do_dio does know, however, so we should move the
remapping/cancelling code into that function and avoid using the ioend
entirely for directio CoW.
--D
>
> bh_result->b_private = ioend;
> set_buffer_defer_completion(bh_result);
> -
> trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
> imap);
> } else {
> @@ -1449,6 +1453,8 @@ __xfs_get_blocks(
> xfs_off_t offset;
> ssize_t size;
> int new = 0;
> + bool is_cow = false;
> + bool need_alloc = false;
>
> if (XFS_FORCED_SHUTDOWN(mp))
> return -EIO;
> @@ -1480,8 +1486,15 @@ __xfs_get_blocks(
> end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
> offset_fsb = XFS_B_TO_FSBT(mp, offset);
>
> - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
> - &imap, &nimaps, XFS_BMAPI_ENTIRE);
> + if (create && direct)
> + is_cow = xfs_reflink_is_cow_pending(ip, offset);
> + if (is_cow)
> + error = xfs_reflink_find_cow_mapping(ip, offset, &imap,
> + &need_alloc);
> + else
> + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
> + &imap, &nimaps, XFS_BMAPI_ENTIRE);
> + ASSERT(!need_alloc);
> if (error)
> goto out_unlock;
>
> @@ -1553,13 +1566,33 @@ __xfs_get_blocks(
> if (imap.br_startblock != HOLESTARTBLOCK &&
> imap.br_startblock != DELAYSTARTBLOCK &&
> (create || !ISUNWRITTEN(&imap))) {
> + if (create && direct && !is_cow) {
> + bool shared;
> +
> + error = xfs_reflink_irec_is_shared(ip, &imap, &shared);
> + if (error)
> + return error;
> + /*
> + * Are we doing a DIO write to a shared block? In
> + * the ideal world we at least would fork full blocks,
> + * but for now just fall back to buffered mode. Yuck.
> + * Use -EREMCHG ("remote address changed") to signal
> + * this, since in general XFS doesn't do this sort of
> + * fallback.
> + */
> + if (shared) {
> + trace_xfs_reflink_bounce_dio_write(ip, &imap);
> + return -EREMCHG;
> + }
> + }
> +
> xfs_map_buffer(inode, bh_result, &imap, offset);
> if (ISUNWRITTEN(&imap))
> set_buffer_unwritten(bh_result);
> /* direct IO needs special help */
> if (create && direct)
> xfs_map_direct(inode, bh_result, &imap, offset,
> - dax_fault);
> + dax_fault, is_cow);
> }
>
> /*
> @@ -1738,6 +1771,24 @@ xfs_vm_do_dio(
> int flags)
> {
> struct block_device *bdev;
> + loff_t end;
> + loff_t block_mask;
> + int error;
> +
> + /* If this is a block-aligned directio CoW, remap immediately. */
> + end = offset + iov_iter_count(iter);
> + block_mask = (1 << inode->i_blkbits) - 1;
> + if (xfs_is_reflink_inode(XFS_I(inode)) && iov_iter_rw(iter) == WRITE &&
> + !(offset & block_mask) && !(end & block_mask)) {
> + error = xfs_reflink_reserve_cow_range(XFS_I(inode), offset,
> + iov_iter_count(iter));
> + if (error)
> + return error;
> + error = xfs_reflink_allocate_cow_range(XFS_I(inode), offset,
> + iov_iter_count(iter));
> + if (error)
> + return error;
> + }
>
> if (IS_DAX(inode))
> return dax_do_io(iocb, inode, iter, offset,
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 0fbcb38..31b002e 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -892,10 +892,18 @@ xfs_file_write_iter(
> if (XFS_FORCED_SHUTDOWN(ip->i_mount))
> return -EIO;
>
> - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
> + /*
> + * Allow DIO to fall back to buffered *only* in the case that we're
> + * doing a reflink CoW.
> + */
> + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) {
> ret = xfs_file_dio_aio_write(iocb, from);
> - else
> + if (ret == -EREMCHG)
> + goto buffered;
> + } else {
> +buffered:
> ret = xfs_file_buffered_aio_write(iocb, from);
> + }
>
> if (ret > 0) {
> ssize_t err;
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 9c1c262..8594bc4 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -134,6 +134,56 @@ xfs_trim_extent(
> }
> }
>
> +/**
> + * xfs_reflink_irec_is_shared() -- Are any of the blocks in this mapping
> + * shared?
> + *
> + * @ip: XFS inode object
> + * @irec: the fileoff:fsblock mapping that we might fork
> + * @shared: set to true if the mapping is shared.
> + */
> +int
> +xfs_reflink_irec_is_shared(
> + struct xfs_inode *ip,
> + struct xfs_bmbt_irec *irec,
> + bool *shared)
> +{
> + xfs_agnumber_t agno;
> + xfs_agblock_t agbno;
> + xfs_extlen_t aglen;
> + xfs_agblock_t fbno;
> + xfs_extlen_t flen;
> + int error = 0;
> +
> + /* Holes, unwritten, and delalloc extents cannot be shared */
> + if (!xfs_is_reflink_inode(ip) ||
> + ISUNWRITTEN(irec) ||
> + irec->br_startblock == HOLESTARTBLOCK ||
> + irec->br_startblock == DELAYSTARTBLOCK) {
> + *shared = false;
> + return 0;
> + }
> +
> + trace_xfs_reflink_irec_is_shared(ip, irec);
> +
> + agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
> + agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
> + aglen = irec->br_blockcount;
> +
> + /* Are there any shared blocks here? */
> + error = xfs_refcount_find_shared(ip->i_mount, agno, agbno,
> + aglen, &fbno, &flen, false);
> + if (error)
> + return error;
> + if (flen == 0) {
> + *shared = false;
> + return 0;
> + }
> +
> + *shared = true;
> + return 0;
> +}
> +
> /* Find the shared ranges under an irec, and set up delalloc extents. */
> STATIC int
> xfs_reflink_reserve_cow_extent(
> @@ -251,6 +301,70 @@ xfs_reflink_reserve_cow_range(
> }
>
> /**
> + * xfs_reflink_allocate_cow_range() -- Allocate blocks to satisfy a copy on
> + * write operation.
> + * @ip: XFS inode.
> + * @pos: file offset to start CoWing.
> + * @len: number of bytes to CoW.
> + */
> +int
> +xfs_reflink_allocate_cow_range(
> + struct xfs_inode *ip,
> + xfs_off_t pos,
> + xfs_off_t len)
> +{
> + struct xfs_ifork *ifp;
> + struct xfs_bmbt_rec_host *gotp;
> + struct xfs_bmbt_irec imap;
> + int error = 0;
> + xfs_fileoff_t start_lblk;
> + xfs_fileoff_t end_lblk;
> + xfs_extnum_t idx;
> +
> + if (!xfs_is_reflink_inode(ip))
> + return 0;
> +
> + trace_xfs_reflink_allocate_cow_range(ip, len, pos, 0);
> +
> + start_lblk = XFS_B_TO_FSBT(ip->i_mount, pos);
> + end_lblk = XFS_B_TO_FSB(ip->i_mount, pos + len);
> + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> +
> + gotp = xfs_iext_bno_to_ext(ifp, start_lblk, &idx);
> + while (gotp) {
> + xfs_bmbt_get_all(gotp, &imap);
> +
> + if (imap.br_startoff >= end_lblk)
> + break;
> + if (!isnullstartblock(imap.br_startblock))
> + goto advloop;
> + xfs_trim_extent(&imap, start_lblk, end_lblk - start_lblk);
> + trace_xfs_reflink_allocate_cow_extent(ip, &imap);
> +
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + error = xfs_iomap_write_allocate(ip, XFS_COW_FORK,
> + XFS_FSB_TO_B(ip->i_mount, imap.br_startoff +
> + imap.br_blockcount - 1), &imap);
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> + if (error)
> + break;
> +advloop:
> + /* Roll on... */
> + idx++;
> + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
> + break;
> + gotp = xfs_iext_get_ext(ifp, idx);
> + }
> +
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> + if (error)
> + trace_xfs_reflink_allocate_cow_range_error(ip, error, _RET_IP_);
> + return error;
> +}
> +
> +/**
> * xfs_reflink_is_cow_pending() -- Determine if CoW is pending for a given
> * file and offset.
> *
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 8ec1ebb..d356c00 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -18,8 +18,13 @@
> #ifndef __XFS_REFLINK_H
> #define __XFS_REFLINK_H 1
>
> +extern int xfs_reflink_irec_is_shared(struct xfs_inode *ip,
> + struct xfs_bmbt_irec *imap, bool *shared);
> +
> extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, xfs_off_t pos,
> xfs_off_t len);
> +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t pos,
> + xfs_off_t len);
> extern bool xfs_reflink_is_cow_pending(struct xfs_inode *ip, xfs_off_t offset);
> extern int xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
> struct xfs_bmbt_irec *imap, bool *need_alloc);
>
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs
|