[PATCH 1/6] xfs: fix inode size update overflow in xfs_map_direct()
Brian Foster
bfoster at redhat.com
Thu Oct 29 09:27:32 CDT 2015
On Mon, Oct 19, 2015 at 02:27:13PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner at redhat.com>
>
> Both direct IO and DAX pass an offset and count into get_blocks that
> will overflow a s64 variable when an IO goes into the last supported
> block in a file (i.e. at offset 2^63 - 1FSB bytes). This can be seen
> from the tracing:
>
> xfs_get_blocks_alloc: [...] offset 0x7ffffffffffff000 count 4096
> xfs_gbmap_direct: [...] offset 0x7ffffffffffff000 count 4096
> xfs_gbmap_direct_none:[...] offset 0x7ffffffffffff000 count 4096
>
> 0x7ffffffffffff000 + 4096 = 0x8000000000000000, and hence that
> overflows the s64 offset and we fail to detect the need for a
> filesize update and an ioend is not allocated.
>
> This is *mostly* avoided for direct IO because such extending IOs
> occur with full block allocation, and so the "IS_UNWRITTEN()" check
> still evaluates as true and we get an ioend that way. However, doing
> single sector extending IOs to this last block will expose the fact
> that file size updates will not occur after the first allocating
> direct IO as the overflow will then be exposed.
>
> There is one further complexity: the DAX page fault path also
> exposes the same issue in block allocation. However, page faults
> cannot extend the file size, so in this case we want to allocate the
> block but do not want to allocate an ioend to enable file size
> update at IO completion. Hence we now need to distinguish between
> the direct IO patch allocation and dax fault path allocation to
> avoid leaking ioend structures.
>
> Signed-off-by: Dave Chinner <dchinner at redhat.com>
> ---
Reviewed-by: Brian Foster <bfoster at redhat.com>
> fs/xfs/xfs_aops.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------
> fs/xfs/xfs_aops.h | 2 ++
> fs/xfs/xfs_file.c | 6 +++---
> 3 files changed, 49 insertions(+), 9 deletions(-)
>
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index e4fff58..366e41eb 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1259,13 +1259,28 @@ xfs_vm_releasepage(
> * the DIO. There is only going to be one reference to the ioend and its life
> * cycle is constrained by the DIO completion code. hence we don't need
> * reference counting here.
> + *
> + * Note that for DIO, an IO to the highest supported file block offset (i.e.
> + * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
> + * bit variable. Hence if we see this overflow, we have to assume that the IO is
> + * extending the file size. We won't know for sure until IO completion is run
> + * and the actual max write offset is communicated to the IO completion
> + * routine.
> + *
> + * For DAX page faults, we are preparing to never see unwritten extents here,
> + * nor should we ever extend the inode size. Hence we will soon have nothing to
> + * do here for this case, ensuring we don't have to provide an IO completion
> + * callback to free an ioend that we don't actually need for a fault into the
> + * page at offset (2^63 - 1FSB) bytes.
> */
> +
> static void
> xfs_map_direct(
> struct inode *inode,
> struct buffer_head *bh_result,
> struct xfs_bmbt_irec *imap,
> - xfs_off_t offset)
> + xfs_off_t offset,
> + bool dax_fault)
> {
> struct xfs_ioend *ioend;
> xfs_off_t size = bh_result->b_size;
> @@ -1278,6 +1293,16 @@ xfs_map_direct(
>
> trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
>
> + /* XXX: preparation for removing unwritten extents in DAX */
> +#if 0
> + if (dax_fault) {
> + ASSERT(type == XFS_IO_OVERWRITE);
> + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
> + imap);
> + return;
> + }
> +#endif
> +
> if (bh_result->b_private) {
> ioend = bh_result->b_private;
> ASSERT(ioend->io_size > 0);
> @@ -1292,7 +1317,8 @@ xfs_map_direct(
> ioend->io_size, ioend->io_type,
> imap);
> } else if (type == XFS_IO_UNWRITTEN ||
> - offset + size > i_size_read(inode)) {
> + offset + size > i_size_read(inode) ||
> + offset + size < 0) {
> ioend = xfs_alloc_ioend(inode, type);
> ioend->io_offset = offset;
> ioend->io_size = size;
> @@ -1354,7 +1380,8 @@ __xfs_get_blocks(
> sector_t iblock,
> struct buffer_head *bh_result,
> int create,
> - bool direct)
> + bool direct,
> + bool dax_fault)
> {
> struct xfs_inode *ip = XFS_I(inode);
> struct xfs_mount *mp = ip->i_mount;
> @@ -1467,7 +1494,8 @@ __xfs_get_blocks(
> set_buffer_unwritten(bh_result);
> /* direct IO needs special help */
> if (create && direct)
> - xfs_map_direct(inode, bh_result, &imap, offset);
> + xfs_map_direct(inode, bh_result, &imap, offset,
> + dax_fault);
> }
>
> /*
> @@ -1514,7 +1542,7 @@ xfs_get_blocks(
> struct buffer_head *bh_result,
> int create)
> {
> - return __xfs_get_blocks(inode, iblock, bh_result, create, false);
> + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
> }
>
> int
> @@ -1524,7 +1552,17 @@ xfs_get_blocks_direct(
> struct buffer_head *bh_result,
> int create)
> {
> - return __xfs_get_blocks(inode, iblock, bh_result, create, true);
> + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
> +}
> +
> +int
> +xfs_get_blocks_dax_fault(
> + struct inode *inode,
> + sector_t iblock,
> + struct buffer_head *bh_result,
> + int create)
> +{
> + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
> }
>
> static void
> diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
> index 86afd1a..d39ba25 100644
> --- a/fs/xfs/xfs_aops.h
> +++ b/fs/xfs/xfs_aops.h
> @@ -58,6 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
> struct buffer_head *map_bh, int create);
> int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
> struct buffer_head *map_bh, int create);
> +int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
> + struct buffer_head *map_bh, int create);
> void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
>
> extern void xfs_count_page_state(struct page *, int *, int *);
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 2f7b6bd..7f873bc 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1508,7 +1508,7 @@ xfs_filemap_page_mkwrite(
> xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
>
> if (IS_DAX(inode)) {
> - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
> + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault,
> xfs_end_io_dax_write);
> } else {
> ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
> @@ -1543,7 +1543,7 @@ xfs_filemap_fault(
> * changes to xfs_get_blocks_direct() to map unwritten extent
> * ioend for conversion on read-only mappings.
> */
> - ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
> + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
> } else
> ret = filemap_fault(vma, vmf);
> xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> @@ -1570,7 +1570,7 @@ xfs_filemap_pmd_fault(
> sb_start_pagefault(inode->i_sb);
> file_update_time(vma->vm_file);
> xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
> + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
> xfs_end_io_dax_write);
> xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
> sb_end_pagefault(inode->i_sb);
> --
> 2.5.0
>
> _______________________________________________
> xfs mailing list
> xfs at oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
More information about the xfs
mailing list