xfs
[Top] [All Lists]

Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5

To: Jeff Liu <jeff.liu@xxxxxxxxxx>
Subject: Re: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
From: Ben Myers <bpm@xxxxxxx>
Date: Wed, 11 Jan 2012 09:43:18 -0600
Cc: xfs@xxxxxxxxxxx, Christoph Hellwig <hch@xxxxxxxxxxxxx>, Chris Mason <chris.mason@xxxxxxxxxx>
In-reply-to: <4F06F71A.2010301@xxxxxxxxxx>
References: <4F06F71A.2010301@xxxxxxxxxx>
User-agent: Mutt/1.5.18 (2008-05-17)
Hey Jeff,

Here are a few additional minor comments from yesterday.

I'm looking forward to seeing your next version, and I'm still working
through this one.

I would like to suggest that you split this into two patches.  The first
patch should be the 'simple' implementation that you began with
that only looks at extents, and assumes that unwritten extents contain
data.  The second patch can remove the assumption that unwritten extents
contain data, and go over the pages of the extent to determine whether it
is clean.  I feel we have a better chance of coming to consensus that the
first patch is correct in the near term, and then we can move on to the
more complicated matter of whether the unwritten extent can be treated
as a hole, safe in the knowledge that the initial implementation was
awesome.

Regards,
Ben

On Fri, Jan 06, 2012 at 09:28:58PM +0800, Jeff Liu wrote:
> This is a revised patch according to Christoph's comments at V4.
> 
> Changes to V5:
> --------------
> * Revise xfs_has_unwritten_buffer() to lookup pages match tag.
> * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call 
> xfs_has_unwritten_buffer() to search
>   DIRTY pages firstly, if no dirty data found, call it again to search 
> WRITEBACK pages.
> * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten 
> extents, but its start offset past the start block
>   of the map,  treat it as a hole, returns the offset if 
> possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
> 
> Tests:
> ------
> seek sanity tester:
> http://patchwork.xfs.org/patch/3108/
> seek copy tester:
> http://patchwork.xfs.org/patch/3109/
> 
> 
> Thanks,
> -Jeff
> 
> Signed-off-by: Jie Liu <jeff.liu@xxxxxxxxxx>
> 
> ---
>  fs/xfs/xfs_file.c |  466 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 465 insertions(+), 1 deletions(-)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 753ed9b..24ae40a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -38,6 +38,7 @@
>  
>  #include <linux/dcache.h>
>  #include <linux/falloc.h>
> +#include <linux/pagevec.h>
>  
>  static const struct vm_operations_struct xfs_file_vm_ops;
>  
> @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
>       return block_page_mkwrite(vma, vmf, xfs_get_blocks);
>  }
>  
> +/*
> + * Probe the data buffer offset in page cache for unwritten extents.
> + * Fetch all the pages match @tag, and iterate each page to find out
> + * if a buffer head state has BH_Unwritten or BH_Uptodate set.
> + */
> +STATIC bool
> +xfs_has_unwritten_buffer(
> +     struct inode            *inode,
> +     struct xfs_bmbt_irec    *map,
> +     int                     tag,
> +     loff_t                  *offset)
> +{
> +     struct xfs_inode        *ip = XFS_I(inode);
> +     struct xfs_mount        *mp = ip->i_mount;
> +     struct pagevec          pvec;
> +     pgoff_t                 index;
> +     pgoff_t                 end;
> +     bool                    found = false;
> +
> +     pagevec_init(&pvec, 0);
> +
> +     index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
> +     end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
> +                        >> PAGE_CACHE_SHIFT;
> +
> +     do {
> +             unsigned int    i;
> +             unsigned        nr_pages;
> +             int             want = min_t(pgoff_t, end - index,
> +                                          (pgoff_t)PAGEVEC_SIZE - 1) + 1;
> +             nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
> +                                           &index, tag, want);
> +             if (nr_pages == 0)
> +                     break;
> +
> +             for (i = 0; i < nr_pages; i++) {
> +                     struct page             *page = pvec.pages[i];
> +                     struct buffer_head      *bh;
> +                     struct buffer_head      *head;
> +                     xfs_fileoff_t           last;
> +
> +                     if (!page_has_buffers(page))
> +                             continue;
> +
> +                     /*
> +                      * There is no need to check the following pages
> +                      * if the current page offset is out of range.
> +                      */
> +                     if (page->index > end)
> +                             goto out;
> +
> +                     last = XFS_B_TO_FSBT(mp,
> +                                          page->index << PAGE_CACHE_SHIFT);
> +
> +                     bh = head = page_buffers(page);
> +                     do {
> +                             /*
> +                              * An extent in XFS_EXT_UNWRITTEN have disk
> +                              * blocks already mapped to it, but no data
> +                              * has been committed to them yet. If it has
> +                              * dirty data in the page cache it can be
> +                              * identified by having BH_Unwritten set in
> +                              * each buffer. Also, the buffer head state
> +                              * might be in BH_Uptodate too if the buffer
> +                              * writeback procedure was fired, we need to
> +                              * examine it as well.
> +                              */
> +                             if (buffer_unwritten(bh) ||
> +                                 buffer_uptodate(bh)) {
> +                                     found = true;
> +                                     *offset = XFS_FSB_TO_B(mp, last);
> +                                     goto out;
> +                             }
> +                             last++;
> +                     } while ((bh = bh->b_this_page) != head);
> +             }
> +
> +             /*
> +              * If the number of probed pages less than our desired,
> +              * there should no more pages mapped, search done.
> +              */
> +             if (nr_pages < want)
> +                     break;
> +
> +             index = pvec.pages[i - 1]->index + 1;
> +             pagevec_release(&pvec);
> +     } while (index < end);
> +
> +out:
> +     pagevec_release(&pvec);
> +     if (!found)
> +             *offset = 0;
> +
> +     return found;
> +}
> +
> +STATIC loff_t
> +xfs_seek_data(
> +     struct file             *file,
> +     loff_t                  start)
> +{
> +     struct inode            *inode = file->f_mapping->host;
> +     struct xfs_inode        *ip = XFS_I(inode);
> +     struct xfs_mount        *mp = ip->i_mount;
> +     xfs_fsize_t             isize = i_size_read(inode);
> +     loff_t                  offset = 0;
> +     struct xfs_ifork        *ifp;
> +     xfs_fileoff_t           fsbno;
> +     xfs_filblks_t           len;
> +     int                     lock;
> +     int                     error;
> +
> +     lock = xfs_ilock_map_shared(ip);
> +
> +     if (start >= isize) {
> +             error = ENXIO;
> +             goto out_lock;
> +     }

In Christoph's v3 review he asked you to move this check to after the
lock is taken, which you've done.  Note that you've read from ip->i_size
using i_size_read before taking the lock, so isize could be stale.  Call
i_size_read only after taking the ilock shared.

> +
> +     fsbno = XFS_B_TO_FSBT(mp, start);
> +     ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +     len = XFS_B_TO_FSB(mp, isize);

Put calculation of start_fsb and end_fsb next to each other.

> +
> +     for (;;) {
> +             struct xfs_bmbt_irec    map[2];
> +             int                     nmap = 2;
> +             loff_t                  seekoff;
> +
> +             error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
> +                                    XFS_BMAPI_ENTIRE);
> +             if (error)
> +                     goto out_lock;
> +
> +             /* No extents at given offset, must be beyond EOF */
> +             if (!nmap) {
> +                     error = ENXIO;
> +                     goto out_lock;
> +             }
> +
> +             seekoff = XFS_FSB_TO_B(mp, fsbno);
> +             /*
> +              * Landed in a hole, skip to check the next extent.
> +              * If the next extent landed in an in-memory data extent,
> +              * or it is a normal extent, its fine to return.
> +              * If the next extent landed in a hole extent, calculate
> +              * the start file system block number for the next scan.
> +              * If the next extent landed in an unwritten extent, we
> +              * need to lookup the page cache to examine the data
> +              * buffer offset, if nothing found, treat it as a hole
> +              * extent too.
> +              */
> +             if (map[0].br_startblock == HOLESTARTBLOCK) {
> +                     /*
> +                      * Return ENXIO if no data extent behind
> +                      * the given offset. In this case, the seek
> +                      * offset should be landed in a hole.
> +                      */
> +                     if (nmap == 1) {
> +                             error = ENXIO;
> +                             break;
> +                     }
> +
> +                     if (map[1].br_state == XFS_EXT_NORM ||
> +                         map[1].br_startblock == DELAYSTARTBLOCK) {
> +                             offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[1].br_startoff));
> +
> +                             break;
> +                     } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> +                             if (xfs_has_unwritten_buffer(inode, &map[1],
> +                                                  PAGECACHE_TAG_DIRTY,
> +                                                  &offset) ||
> +                                 xfs_has_unwritten_buffer(inode, &map[1],
> +                                                  PAGECACHE_TAG_WRITEBACK,
> +                                                  &offset)) {
> +                                     offset = max_t(loff_t, seekoff, offset);
> +                                     break;
> +                             }
> +                     }
> +
> +                     fsbno = map[1].br_startoff + map[1].br_blockcount;
> +             }
> +
> +             /*
> +              * Landed in an unwritten extent, try to find out the data
> +              * buffer offset from page cache firstly. If nothing was
> +              * found, treat it as a hole, and skip to check the next
> +              * extent, something just like above.
> +              */
> +             if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> +                     if (xfs_has_unwritten_buffer(inode, &map[0],
> +                                                  PAGECACHE_TAG_DIRTY,
> +                                                  &offset) ||
> +                         xfs_has_unwritten_buffer(inode, &map[0],
> +                                                  PAGECACHE_TAG_WRITEBACK,
> +                                                  &offset)) {
> +                             offset = max_t(loff_t, seekoff, offset);
> +                             break;
> +                     }
> +
> +                     /* No data extent at the given offset */
> +                     if (nmap == 1) {
> +                             error = ENXIO;
> +                             break;
> +                     }
> +
> +                     if (map[1].br_state == XFS_EXT_NORM ||
> +                         map[1].br_startblock == DELAYSTARTBLOCK) {
> +                             offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[1].br_startoff));
> +                             break;
> +                     } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> +                             if (xfs_has_unwritten_buffer(inode, &map[1],
> +                                                  PAGECACHE_TAG_DIRTY,
> +                                                  &offset) ||
> +                                 xfs_has_unwritten_buffer(inode, &map[1],
> +                                                  PAGECACHE_TAG_WRITEBACK,
> +                                                  &offset)) {
> +                                     offset = max_t(loff_t, seekoff, offset);
> +                                     break;
> +                             }
> +                     }
> +
> +                     fsbno = map[1].br_startoff + map[1].br_blockcount;
> +             }
> +
> +             /* Landed in a delay allocated extent or a real data extent */
> +             if (map[0].br_startblock == DELAYSTARTBLOCK ||
> +                 map[0].br_state == XFS_EXT_NORM) {
> +                     offset = max_t(loff_t, seekoff,
> +                                    XFS_FSB_TO_B(mp, map[0].br_startoff));
> +                     break;
> +             }
> +
> +             /* Return ENXIO if beyond eof */
> +             if (XFS_FSB_TO_B(mp, fsbno) > isize) {
> +                     error = ENXIO;
> +                     goto out_lock;
> +             }
> +     }
> +
> +     if (offset < start)
> +             offset = start;
> +
> +     if (offset != file->f_pos)
> +             file->f_pos = offset;
> +
> +out_lock:
> +     xfs_iunlock_map_shared(ip, lock);
> +     if (error)
> +             return -error;
> +
> +     return offset;
> +}
> +
> +STATIC loff_t
> +xfs_seek_hole(
> +     struct file             *file,
> +     loff_t                  start)
> +{
> +     struct inode            *inode = file->f_mapping->host;
> +     struct xfs_inode        *ip = XFS_I(inode);
> +     struct xfs_mount        *mp = ip->i_mount;
> +     xfs_fsize_t             isize = i_size_read(inode);

Call i_size_read under ilock.

> +     loff_t                  offset = 0;
> +     struct xfs_ifork        *ifp;
> +     xfs_fileoff_t           fsbno;
> +     xfs_filblks_t           len;
> +     int                     lock;

lock should be a uint

> +     int                     error;
> +
> +     lock = xfs_ilock_map_shared(ip);
> +
> +     if (start >= isize) {
> +             error = ENXIO;
> +             goto out_lock;
> +     }
> +
> +     fsbno = XFS_B_TO_FSBT(mp, start);
> +     ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +     len = XFS_B_TO_FSB(mp, isize);

The calculations of start_fsb and end_fsb look nicer next to each other.

> +
> +     for (;;) {
> +             struct xfs_bmbt_irec    map[2];
> +             int                     nmap = 2;
> +             loff_t                  seekoff;
> +
> +             error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
> +                                    XFS_BMAPI_ENTIRE);
> +             if (error)
> +                     goto out_lock;
> +
> +             /* No extents at given offset, must be beyond EOF */
> +             if (!nmap) {
> +                     error = ENXIO;
> +                     goto out_lock;
> +             }
> +
> +             seekoff = XFS_FSB_TO_B(mp, fsbno);
> +             /*
> +              * Landed in an unwritten extent, try to lookup the page
> +              * cache to find out if there is dirty data or not. If
> +              * nothing was found, treate it as a hole. If there has
> +              * dirty data and its offset starts past both the start
> +              * block of the map and the current seek offset, it should
> +              * be treated as hole too. Otherwise, go through the next
> +              * extent to fetch holes.
> +              */
> +             if (map[0].br_state == XFS_EXT_UNWRITTEN) {
> +                     if (xfs_has_unwritten_buffer(inode, &map[0],
> +                                                  PAGECACHE_TAG_DIRTY,
> +                                                  &offset) ||
> +                         xfs_has_unwritten_buffer(inode, &map[0],
> +                                                  PAGECACHE_TAG_WRITEBACK,
> +                                                  &offset)) {
> +                             if (offset > max_t(loff_t, seekoff,
> +                                                XFS_FSB_TO_B(mp,
> +                                                map[0].br_startoff))) {
> +                                     offset = max_t(loff_t, seekoff,
> +                                                    XFS_FSB_TO_B(mp,
> +                                                    map[0].br_startoff));
> +                                     break;
> +                             }
> +                     } else {
> +                             offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[0].br_startoff));
> +                             break;
> +                     }
> +
> +                     /*
> +                      * No more extent at the given offst, return the total
> +                      * file size.
> +                      */
> +                     if (nmap == 1) {
> +                             offset = isize;
> +                             break;
> +                     }
> +
> +                     if (map[1].br_startblock == HOLESTARTBLOCK) {
> +                             offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[1].br_startoff));
> +                             break;
> +                     } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> +                             if (xfs_has_unwritten_buffer(inode, &map[1],
> +                                                     PAGECACHE_TAG_DIRTY,
> +                                                     &offset) ||
> +                                 xfs_has_unwritten_buffer(inode, &map[1],
> +                                                     PAGECACHE_TAG_WRITEBACK,
> +                                                     &offset)) {
> +                                     if (offset > max_t(loff_t, seekoff,
> +                                                     XFS_FSB_TO_B(mp,
> +                                                     map[1].br_startoff))) {
> +                                             offset = max_t(loff_t, seekoff,
> +                                                     XFS_FSB_TO_B(mp,
> +                                                     map[1].br_startoff));
> +                                             break;
> +                                     }
> +                             } else {
> +                                     offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[1].br_startoff));
> +                                     break;
> +                             }
> +                     }
> +
> +                     fsbno = map[1].br_startoff + map[1].br_blockcount;
> +             }
> +
> +             /*
> +              * Landed in a delay allocated extent or a real data extent,
> +              * if the next extent is landed in a hole or in an unwritten
> +              * extent but without data committed in the page cache, return
> +              * its offset. If the next extent has dirty data in page cache,
> +              * but its offset starts past both the start block of the map
> +              * and the seek offset, it still be a hole.
> +              */
> +             if (map[0].br_startblock == DELAYSTARTBLOCK ||
> +                 map[0].br_state == XFS_EXT_NORM) {
> +                     /*
> +                      * No more extent at the give offset, return the
> +                      * total file size.
> +                      */
> +                     if (nmap == 1) {
> +                             offset = isize;
> +                             break;
> +                     }
> +
> +                     if (map[1].br_startblock == HOLESTARTBLOCK) {
> +                             offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[1].br_startoff));
> +                             break;
> +                     } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
> +                             if (xfs_has_unwritten_buffer(inode, &map[1],
> +                                                     PAGECACHE_TAG_DIRTY,
> +                                                     &offset) ||
> +                                 xfs_has_unwritten_buffer(inode, &map[1],
> +                                                     PAGECACHE_TAG_WRITEBACK,
> +                                                     &offset)) {
> +                                     if (offset > max_t(loff_t, seekoff,
> +                                                     XFS_FSB_TO_B(mp,
> +                                                     map[1].br_startoff))) {
> +                                             offset = max_t(loff_t, seekoff,
> +                                                     XFS_FSB_TO_B(mp,
> +                                                     map[1].br_startoff));
> +                                             break;
> +                                     }
> +                             } else {
> +                                     offset = max_t(loff_t, seekoff,
> +                                     XFS_FSB_TO_B(mp, map[1].br_startoff));
> +                                     break;
> +                             }
> +                     }
> +
> +                     fsbno = map[1].br_startoff + map[1].br_blockcount;
> +             }
> +
> +             /* Landed in a hole, its fine to return */
> +             if (map[0].br_startblock == HOLESTARTBLOCK) {
> +                     offset = max_t(loff_t, seekoff,
> +                                    XFS_FSB_TO_B(mp, map[0].br_startoff));
> +                     break;
> +             }
> +
> +             /* Return ENXIO if beyond eof */
> +             if (XFS_FSB_TO_B(mp, fsbno) > isize) {
> +                     error = ENXIO;
> +                     goto out_lock;
> +             }
> +     }
> +
> +     if (offset < start)
> +             offset = start;
> +
> +     if (offset != file->f_pos)
> +             file->f_pos = offset;
> +
> +out_lock:

name this out_unlock

> +     xfs_iunlock_map_shared(ip, lock);
> +     if (error)
> +             return -error;
> +
> +     return offset;
> +}
> +
> +STATIC loff_t
> +xfs_file_llseek(
> +     struct file     *file,
> +     loff_t          offset,
> +     int             origin)
> +{
> +     switch (origin) {
> +     case SEEK_END:
> +     case SEEK_CUR:
> +     case SEEK_SET:
> +             return generic_file_llseek(file, offset, origin);
> +     case SEEK_DATA:
> +             return xfs_seek_data(file, offset);
> +     case SEEK_HOLE:
> +             return xfs_seek_hole(file, offset);
> +     default:
> +             return -EOPNOTSUPP;

I suggest -EINVAL here, as per http://linux.die.net/man/2/lseek

> +     }
> +}
> +
>  const struct file_operations xfs_file_operations = {
> -     .llseek         = generic_file_llseek,
> +     .llseek         = xfs_file_llseek,
>       .read           = do_sync_read,
>       .write          = do_sync_write,
>       .aio_read       = xfs_file_aio_read,
> -- 
> 1.7.4.1
> 
> 
> 
> 
> 
> 
> _______________________________________________
> xfs mailing list
> xfs@xxxxxxxxxxx
> http://oss.sgi.com/mailman/listinfo/xfs

<Prev in Thread] Current Thread [Next in Thread>