xfs
[Top] [All Lists]

Introduce SEEK_DATA/SEEK_HOLE to XFS V5

To: xfs@xxxxxxxxxxx
Subject: Introduce SEEK_DATA/SEEK_HOLE to XFS V5
From: Jeff Liu <jeff.liu@xxxxxxxxxx>
Date: Fri, 06 Jan 2012 21:28:58 +0800
Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>, Chris Mason <chris.mason@xxxxxxxxxx>
Organization: Oracle
Reply-to: jeff.liu@xxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.18) Gecko/20110617 Thunderbird/3.1.11
Hello,

This is a revised patch according to Christoph's comments at V4.

Changes to V5:
--------------
* Revise xfs_has_unwritten_buffer() to lookup pages match tag.
* For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call 
xfs_has_unwritten_buffer() to search
  DIRTY pages firstly, if no dirty data found, call it again to search 
WRITEBACK pages.
* In xfs_seek_hole(), if dirty data was found in page cache for an unwritten 
extents, but its start offset past the start block
  of the map,  treat it as a hole, returns the offset if 
possible(data_buffer_offset > max(seek_offset, start_block_of_map)).

Tests:
------
seek sanity tester:
http://patchwork.xfs.org/patch/3108/
seek copy tester:
http://patchwork.xfs.org/patch/3109/


Thanks,
-Jeff

Signed-off-by: Jie Liu <jeff.liu@xxxxxxxxxx>

---
 fs/xfs/xfs_file.c |  466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 465 insertions(+), 1 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b..24ae40a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
+#include <linux/pagevec.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
        return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
+/*
+ * Probe the data buffer offset in page cache for unwritten extents.
+ * Fetch all the pages match @tag, and iterate each page to find out
+ * if a buffer head state has BH_Unwritten or BH_Uptodate set.
+ */
+STATIC bool
+xfs_has_unwritten_buffer(
+       struct inode            *inode,
+       struct xfs_bmbt_irec    *map,
+       int                     tag,
+       loff_t                  *offset)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct pagevec          pvec;
+       pgoff_t                 index;
+       pgoff_t                 end;
+       bool                    found = false;
+
+       pagevec_init(&pvec, 0);
+
+       index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
+       end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
+                          >> PAGE_CACHE_SHIFT;
+
+       do {
+               unsigned int    i;
+               unsigned        nr_pages;
+               int             want = min_t(pgoff_t, end - index,
+                                            (pgoff_t)PAGEVEC_SIZE - 1) + 1;
+               nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
+                                             &index, tag, want);
+               if (nr_pages == 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page             *page = pvec.pages[i];
+                       struct buffer_head      *bh;
+                       struct buffer_head      *head;
+                       xfs_fileoff_t           last;
+
+                       if (!page_has_buffers(page))
+                               continue;
+
+                       /*
+                        * There is no need to check the following pages
+                        * if the current page offset is out of range.
+                        */
+                       if (page->index > end)
+                               goto out;
+
+                       last = XFS_B_TO_FSBT(mp,
+                                            page->index << PAGE_CACHE_SHIFT);
+
+                       bh = head = page_buffers(page);
+                       do {
+                               /*
+                                * An extent in XFS_EXT_UNWRITTEN have disk
+                                * blocks already mapped to it, but no data
+                                * has been committed to them yet. If it has
+                                * dirty data in the page cache it can be
+                                * identified by having BH_Unwritten set in
+                                * each buffer. Also, the buffer head state
+                                * might be in BH_Uptodate too if the buffer
+                                * writeback procedure was fired, we need to
+                                * examine it as well.
+                                */
+                               if (buffer_unwritten(bh) ||
+                                   buffer_uptodate(bh)) {
+                                       found = true;
+                                       *offset = XFS_FSB_TO_B(mp, last);
+                                       goto out;
+                               }
+                               last++;
+                       } while ((bh = bh->b_this_page) != head);
+               }
+
+               /*
+                * If the number of probed pages less than our desired,
+                * there should no more pages mapped, search done.
+                */
+               if (nr_pages < want)
+                       break;
+
+               index = pvec.pages[i - 1]->index + 1;
+               pagevec_release(&pvec);
+       } while (index < end);
+
+out:
+       pagevec_release(&pvec);
+       if (!found)
+               *offset = 0;
+
+       return found;
+}
+
+STATIC loff_t
+xfs_seek_data(
+       struct file             *file,
+       loff_t                  start)
+{
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fsize_t             isize = i_size_read(inode);
+       loff_t                  offset = 0;
+       struct xfs_ifork        *ifp;
+       xfs_fileoff_t           fsbno;
+       xfs_filblks_t           len;
+       int                     lock;
+       int                     error;
+
+       lock = xfs_ilock_map_shared(ip);
+
+       if (start >= isize) {
+               error = ENXIO;
+               goto out_lock;
+       }
+
+       fsbno = XFS_B_TO_FSBT(mp, start);
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       len = XFS_B_TO_FSB(mp, isize);
+
+       for (;;) {
+               struct xfs_bmbt_irec    map[2];
+               int                     nmap = 2;
+               loff_t                  seekoff;
+
+               error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+                                      XFS_BMAPI_ENTIRE);
+               if (error)
+                       goto out_lock;
+
+               /* No extents at given offset, must be beyond EOF */
+               if (!nmap) {
+                       error = ENXIO;
+                       goto out_lock;
+               }
+
+               seekoff = XFS_FSB_TO_B(mp, fsbno);
+               /*
+                * Landed in a hole, skip to check the next extent.
+                * If the next extent landed in an in-memory data extent,
+                * or it is a normal extent, its fine to return.
+                * If the next extent landed in a hole extent, calculate
+                * the start file system block number for the next scan.
+                * If the next extent landed in an unwritten extent, we
+                * need to lookup the page cache to examine the data
+                * buffer offset, if nothing found, treat it as a hole
+                * extent too.
+                */
+               if (map[0].br_startblock == HOLESTARTBLOCK) {
+                       /*
+                        * Return ENXIO if no data extent behind
+                        * the given offset. In this case, the seek
+                        * offset should be landed in a hole.
+                        */
+                       if (nmap == 1) {
+                               error = ENXIO;
+                               break;
+                       }
+
+                       if (map[1].br_state == XFS_EXT_NORM ||
+                           map[1].br_startblock == DELAYSTARTBLOCK) {
+                               offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[1].br_startoff));
+
+                               break;
+                       } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+                               if (xfs_has_unwritten_buffer(inode, &map[1],
+                                                    PAGECACHE_TAG_DIRTY,
+                                                    &offset) ||
+                                   xfs_has_unwritten_buffer(inode, &map[1],
+                                                    PAGECACHE_TAG_WRITEBACK,
+                                                    &offset)) {
+                                       offset = max_t(loff_t, seekoff, offset);
+                                       break;
+                               }
+                       }
+
+                       fsbno = map[1].br_startoff + map[1].br_blockcount;
+               }
+
+               /*
+                * Landed in an unwritten extent, try to find out the data
+                * buffer offset from page cache firstly. If nothing was
+                * found, treat it as a hole, and skip to check the next
+                * extent, something just like above.
+                */
+               if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+                       if (xfs_has_unwritten_buffer(inode, &map[0],
+                                                    PAGECACHE_TAG_DIRTY,
+                                                    &offset) ||
+                           xfs_has_unwritten_buffer(inode, &map[0],
+                                                    PAGECACHE_TAG_WRITEBACK,
+                                                    &offset)) {
+                               offset = max_t(loff_t, seekoff, offset);
+                               break;
+                       }
+
+                       /* No data extent at the given offset */
+                       if (nmap == 1) {
+                               error = ENXIO;
+                               break;
+                       }
+
+                       if (map[1].br_state == XFS_EXT_NORM ||
+                           map[1].br_startblock == DELAYSTARTBLOCK) {
+                               offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[1].br_startoff));
+                               break;
+                       } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+                               if (xfs_has_unwritten_buffer(inode, &map[1],
+                                                    PAGECACHE_TAG_DIRTY,
+                                                    &offset) ||
+                                   xfs_has_unwritten_buffer(inode, &map[1],
+                                                    PAGECACHE_TAG_WRITEBACK,
+                                                    &offset)) {
+                                       offset = max_t(loff_t, seekoff, offset);
+                                       break;
+                               }
+                       }
+
+                       fsbno = map[1].br_startoff + map[1].br_blockcount;
+               }
+
+               /* Landed in a delay allocated extent or a real data extent */
+               if (map[0].br_startblock == DELAYSTARTBLOCK ||
+                   map[0].br_state == XFS_EXT_NORM) {
+                       offset = max_t(loff_t, seekoff,
+                                      XFS_FSB_TO_B(mp, map[0].br_startoff));
+                       break;
+               }
+
+               /* Return ENXIO if beyond eof */
+               if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+                       error = ENXIO;
+                       goto out_lock;
+               }
+       }
+
+       if (offset < start)
+               offset = start;
+
+       if (offset != file->f_pos)
+               file->f_pos = offset;
+
+out_lock:
+       xfs_iunlock_map_shared(ip, lock);
+       if (error)
+               return -error;
+
+       return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+       struct file             *file,
+       loff_t                  start)
+{
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fsize_t             isize = i_size_read(inode);
+       loff_t                  offset = 0;
+       struct xfs_ifork        *ifp;
+       xfs_fileoff_t           fsbno;
+       xfs_filblks_t           len;
+       int                     lock;
+       int                     error;
+
+       lock = xfs_ilock_map_shared(ip);
+
+       if (start >= isize) {
+               error = ENXIO;
+               goto out_lock;
+       }
+
+       fsbno = XFS_B_TO_FSBT(mp, start);
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       len = XFS_B_TO_FSB(mp, isize);
+
+       for (;;) {
+               struct xfs_bmbt_irec    map[2];
+               int                     nmap = 2;
+               loff_t                  seekoff;
+
+               error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+                                      XFS_BMAPI_ENTIRE);
+               if (error)
+                       goto out_lock;
+
+               /* No extents at given offset, must be beyond EOF */
+               if (!nmap) {
+                       error = ENXIO;
+                       goto out_lock;
+               }
+
+               seekoff = XFS_FSB_TO_B(mp, fsbno);
+               /*
+                * Landed in an unwritten extent, try to lookup the page
+                * cache to find out if there is dirty data or not. If
+                * nothing was found, treate it as a hole. If there has
+                * dirty data and its offset starts past both the start
+                * block of the map and the current seek offset, it should
+                * be treated as hole too. Otherwise, go through the next
+                * extent to fetch holes.
+                */
+               if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+                       if (xfs_has_unwritten_buffer(inode, &map[0],
+                                                    PAGECACHE_TAG_DIRTY,
+                                                    &offset) ||
+                           xfs_has_unwritten_buffer(inode, &map[0],
+                                                    PAGECACHE_TAG_WRITEBACK,
+                                                    &offset)) {
+                               if (offset > max_t(loff_t, seekoff,
+                                                  XFS_FSB_TO_B(mp,
+                                                  map[0].br_startoff))) {
+                                       offset = max_t(loff_t, seekoff,
+                                                      XFS_FSB_TO_B(mp,
+                                                      map[0].br_startoff));
+                                       break;
+                               }
+                       } else {
+                               offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[0].br_startoff));
+                               break;
+                       }
+
+                       /*
+                        * No more extent at the given offst, return the total
+                        * file size.
+                        */
+                       if (nmap == 1) {
+                               offset = isize;
+                               break;
+                       }
+
+                       if (map[1].br_startblock == HOLESTARTBLOCK) {
+                               offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[1].br_startoff));
+                               break;
+                       } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+                               if (xfs_has_unwritten_buffer(inode, &map[1],
+                                                       PAGECACHE_TAG_DIRTY,
+                                                       &offset) ||
+                                   xfs_has_unwritten_buffer(inode, &map[1],
+                                                       PAGECACHE_TAG_WRITEBACK,
+                                                       &offset)) {
+                                       if (offset > max_t(loff_t, seekoff,
+                                                       XFS_FSB_TO_B(mp,
+                                                       map[1].br_startoff))) {
+                                               offset = max_t(loff_t, seekoff,
+                                                       XFS_FSB_TO_B(mp,
+                                                       map[1].br_startoff));
+                                               break;
+                                       }
+                               } else {
+                                       offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[1].br_startoff));
+                                       break;
+                               }
+                       }
+
+                       fsbno = map[1].br_startoff + map[1].br_blockcount;
+               }
+
+               /*
+                * Landed in a delay allocated extent or a real data extent,
+                * if the next extent is landed in a hole or in an unwritten
+                * extent but without data committed in the page cache, return
+                * its offset. If the next extent has dirty data in page cache,
+                * but its offset starts past both the start block of the map
+                * and the seek offset, it still be a hole.
+                */
+               if (map[0].br_startblock == DELAYSTARTBLOCK ||
+                   map[0].br_state == XFS_EXT_NORM) {
+                       /*
+                        * No more extent at the give offset, return the
+                        * total file size.
+                        */
+                       if (nmap == 1) {
+                               offset = isize;
+                               break;
+                       }
+
+                       if (map[1].br_startblock == HOLESTARTBLOCK) {
+                               offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[1].br_startoff));
+                               break;
+                       } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+                               if (xfs_has_unwritten_buffer(inode, &map[1],
+                                                       PAGECACHE_TAG_DIRTY,
+                                                       &offset) ||
+                                   xfs_has_unwritten_buffer(inode, &map[1],
+                                                       PAGECACHE_TAG_WRITEBACK,
+                                                       &offset)) {
+                                       if (offset > max_t(loff_t, seekoff,
+                                                       XFS_FSB_TO_B(mp,
+                                                       map[1].br_startoff))) {
+                                               offset = max_t(loff_t, seekoff,
+                                                       XFS_FSB_TO_B(mp,
+                                                       map[1].br_startoff));
+                                               break;
+                                       }
+                               } else {
+                                       offset = max_t(loff_t, seekoff,
+                                       XFS_FSB_TO_B(mp, map[1].br_startoff));
+                                       break;
+                               }
+                       }
+
+                       fsbno = map[1].br_startoff + map[1].br_blockcount;
+               }
+
+               /* Landed in a hole, its fine to return */
+               if (map[0].br_startblock == HOLESTARTBLOCK) {
+                       offset = max_t(loff_t, seekoff,
+                                      XFS_FSB_TO_B(mp, map[0].br_startoff));
+                       break;
+               }
+
+               /* Return ENXIO if beyond eof */
+               if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+                       error = ENXIO;
+                       goto out_lock;
+               }
+       }
+
+       if (offset < start)
+               offset = start;
+
+       if (offset != file->f_pos)
+               file->f_pos = offset;
+
+out_lock:
+       xfs_iunlock_map_shared(ip, lock);
+       if (error)
+               return -error;
+
+       return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+       struct file     *file,
+       loff_t          offset,
+       int             origin)
+{
+       switch (origin) {
+       case SEEK_END:
+       case SEEK_CUR:
+       case SEEK_SET:
+               return generic_file_llseek(file, offset, origin);
+       case SEEK_DATA:
+               return xfs_seek_data(file, offset);
+       case SEEK_HOLE:
+               return xfs_seek_hole(file, offset);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
 const struct file_operations xfs_file_operations = {
-       .llseek         = generic_file_llseek,
+       .llseek         = xfs_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = xfs_file_aio_read,
-- 
1.7.4.1






<Prev in Thread] Current Thread [Next in Thread>