xfs
[Top] [All Lists]

[PATCH] Introduce SEEK_DATA/SEEK_HOLE to XFS V3

To: xfs@xxxxxxxxxxx
Subject: [PATCH] Introduce SEEK_DATA/SEEK_HOLE to XFS V3
From: Jeff Liu <jeff.liu@xxxxxxxxxx>
Date: Tue, 13 Dec 2011 23:02:53 +0800
Cc: Christoph Hellwig <hch@xxxxxxxxxxxxx>, Dave Chinner <david@xxxxxxxxxxxxx>, Chris Mason <chris.mason@xxxxxxxxxx>
Organization: Oracle
Reply-to: jeff.liu@xxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.18) Gecko/20110617 Thunderbird/3.1.11
Hello,

Here is V3 of the patch.

Major changes:
==============
1. Isolate SEEK_DATA/SEEK_HOLE into individual functions and call them directly
from xfs_file_llseek().
2. Probe the page cache for unwritten extents.

Tests:
======
1. The general tests I have mentioned before.
2. Coverage for looking up DIRTY pages, set up through fallocate(2).

The issue is that I have not yet managed to work out a test case that easily
covers looking up WRITEBACK pages;
I'll continue to try some other methods for this case.
In the meantime, I'd like to post the revised patch for your review, to
make sure that there is no
obvious mistake in my current implementation, especially in the page cache
lookup approach.

Also, I took care to TAB-align the function parameters and variables
this time. For variables
defined inside conditional code blocks (i.e. "{}"), I also adhered to this rule, but
I am not sure whether that is proper, since I
found some code snippets in xfs_file.c that are not written in this way.

Thank you!
-Jeff

Signed-off-by: Jie Liu <jeff.liu@xxxxxxxxxx>

---
 fs/xfs/xfs_file.c |  308 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 307 insertions(+), 1 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b..1e9d6be 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
+#include <linux/pagevec.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -1141,8 +1142,313 @@ xfs_vm_page_mkwrite(
        return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
+/*
+ * Probe the page cache for data that sits in an unwritten extent.
+ *
+ * An unwritten extent already has disk blocks allocated, but data written
+ * into it may still live only in the page cache.  Walk the pages covering
+ * @map that are tagged DIRTY and then those tagged WRITEBACK, and look at
+ * every buffer head attached to them for the unwritten flag.
+ *
+ * Both passes start at the first page of the extent, so data that is under
+ * writeback is not missed just because some dirty pages were seen first.
+ *
+ * Returns the byte offset of the first unwritten buffer and sets *@found to
+ * 1; otherwise returns 0 and leaves *@found as the caller initialised it.
+ */
+STATIC loff_t
+xfs_probe_unwritten_buffer(
+       struct inode            *inode,
+       struct xfs_bmbt_irec    *map,
+       int                     *found)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct pagevec          pvec;
+       pgoff_t                 index;
+       pgoff_t                 end;
+       loff_t                  offset = 0;
+       int                     tag = PAGECACHE_TAG_DIRTY;
+
+       pagevec_init(&pvec, 0);
+
+probe_tagged_pages:
+       index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
+       end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
+                          >> PAGE_CACHE_SHIFT;
+
+       do {
+               unsigned        nr_pages;
+               unsigned int    i;
+               int             want = min_t(pgoff_t, end - index,
+                                            PAGEVEC_SIZE - 1) + 1;
+
+               /*
+                * The lookup advances index past the last page it returns,
+                * so there is no need to recompute it from the pagevec.
+                */
+               nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
+                                             &index, tag, want);
+               if (nr_pages == 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page             *page = pvec.pages[i];
+                       struct buffer_head      *bh;
+                       struct buffer_head      *head;
+                       xfs_fileoff_t           last;
+
+                       /* No buffers here, but later pages may have some. */
+                       if (!page_has_buffers(page))
+                               continue;
+
+                       /* Widen first: pgoff_t can be only 32 bits wide. */
+                       last = XFS_B_TO_FSBT(mp,
+                                    (loff_t)page->index << PAGE_CACHE_SHIFT);
+                       /*
+                        * NOTE(review): the buffers are walked without the
+                        * page lock - confirm they cannot be freed under us.
+                        */
+                       bh = head = page_buffers(page);
+                       do {
+                               /*
+                                * Buffers on DIRTY pages are expected to be
+                                * (dirty && mapped && unwritten && uptodate)
+                                * and buffers on WRITEBACK pages (mapped &&
+                                * unwritten && uptodate), so testing the
+                                * unwritten flag alone covers both passes.
+                                */
+                               if (buffer_unwritten(bh)) {
+                                       offset = XFS_FSB_TO_B(mp, last);
+                                       *found = 1;
+                                       goto out;
+                               }
+                               last++;
+                       } while ((bh = bh->b_this_page) != head);
+               }
+
+               pagevec_release(&pvec);
+       } while (index < end);
+
+       /* DIRTY pass is done; rescan the same range for WRITEBACK pages. */
+       if (tag == PAGECACHE_TAG_DIRTY) {
+               tag = PAGECACHE_TAG_WRITEBACK;
+               goto probe_tagged_pages;
+       }
+
+out:
+       pagevec_release(&pvec);
+       return offset;
+}
+
+/*
+ * SEEK_DATA: find the first offset at or after @start that holds data.
+ *
+ * The data fork is walked two mappings at a time, starting at the block
+ * that contains @start.  Holes are skipped.  Delayed-allocation and written
+ * extents are data.  An unwritten extent counts as data only if
+ * xfs_probe_unwritten_buffer() finds an unwritten buffer for it in the page
+ * cache; otherwise it is treated as a hole.
+ *
+ * On success file->f_pos is updated and the new offset is returned.
+ * Returns -ENXIO if @start is at or beyond EOF or no data is found before
+ * EOF, or the negated error from xfs_bmapi_read().  @type is unused.
+ */
+STATIC loff_t
+xfs_seek_data(
+       struct file             *file,
+       loff_t                  start,
+       u32                     type)
+{
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fsize_t             isize = i_size_read(inode);
+       loff_t                  offset = 0;
+       xfs_fileoff_t           fsbno;
+       xfs_filblks_t           len;
+       int                     lock;
+       int                     error;
+
+       if (start >= isize)
+               return -ENXIO;
+
+       lock = xfs_ilock_map_shared(ip);
+
+       fsbno = XFS_B_TO_FSBT(mp, start);
+       len = XFS_B_TO_FSB(mp, isize);
+
+       for (;;) {
+               struct xfs_bmbt_irec    map[2];
+               int                     nmap = 2;
+               int                     found = 0;
+               loff_t                  seekoff;
+               int                     i;
+
+               error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+                                      XFS_BMAPI_ENTIRE);
+               if (error)
+                       goto out_lock;
+
+               /* No extents at given offset, must be beyond EOF */
+               if (!nmap) {
+                       error = ENXIO;
+                       goto out_lock;
+               }
+
+               seekoff = XFS_FSB_TO_B(mp, fsbno);
+
+               /*
+                * Only the first nmap entries of map[] were filled in, so
+                * never look at map[1] unless two mappings came back.
+                */
+               for (i = 0; i < nmap && !found; i++) {
+                       if (map[i].br_startblock == HOLESTARTBLOCK)
+                               continue;
+
+                       if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+                               /* Data only if the page cache says so. */
+                               offset = xfs_probe_unwritten_buffer(inode,
+                                                                   &map[i],
+                                                                   &found);
+                               continue;
+                       }
+
+                       /* Delayed allocation or written extent: data. */
+                       offset = XFS_FSB_TO_B(mp, map[i].br_startoff);
+                       found = 1;
+               }
+
+               if (found) {
+                       /*
+                        * A mapping can start before the block we asked
+                        * about, so never report an offset below it.
+                        */
+                       offset = max_t(loff_t, seekoff, offset);
+                       break;
+               }
+
+               /*
+                * Everything returned was a hole or an unwritten extent
+                * with nothing in the page cache.  Resume the search right
+                * after the last mapping, unless that is at or past EOF.
+                */
+               fsbno = map[nmap - 1].br_startoff +
+                       map[nmap - 1].br_blockcount;
+               if (XFS_FSB_TO_B(mp, fsbno) >= isize) {
+                       error = ENXIO;
+                       goto out_lock;
+               }
+       }
+
+       /* The block holding @start may begin before @start itself. */
+       if (offset < start)
+               offset = start;
+
+       if (offset != file->f_pos)
+               file->f_pos = offset;
+
+out_lock:
+       xfs_iunlock_map_shared(ip, lock);
+
+       /* XFS carries positive errnos internally; negate for the VFS. */
+       if (error)
+               return -error;
+
+       return offset;
+}
+
+/*
+ * SEEK_HOLE: find the first hole at or after @start.
+ *
+ * xfs_bmap_first_unused() is asked for a one-block unused range in the data
+ * fork, seeded with the block that contains @start.  The block it hands
+ * back is converted to a byte offset: if that lies at or before @start the
+ * answer is @start itself, otherwise it is capped at the inode size.
+ *
+ * On success file->f_pos is updated and the new offset is returned.
+ * Returns -ENXIO if @start is at or beyond EOF, or the negated error from
+ * xfs_bmap_first_unused().  @type is unused.
+ */
+STATIC loff_t
+xfs_seek_hole(
+       struct file             *file,
+       loff_t                  start,
+       u32                     type)
+{
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fsize_t             isize = i_size_read(inode);
+       xfs_fileoff_t           first_free;
+       loff_t                  hole_pos;
+       loff_t                  result = 0;
+       int                     lockmode;
+       int                     error;
+
+       if (start >= isize)
+               return -ENXIO;
+
+       lockmode = xfs_ilock_map_shared(ip);
+
+       first_free = XFS_B_TO_FSBT(mp, start);
+       error = xfs_bmap_first_unused(NULL, ip, 1, &first_free,
+                                     XFS_DATA_FORK);
+       if (!error) {
+               hole_pos = XFS_FSB_TO_B(mp, first_free);
+
+               /* @start may sit inside the block that was reported. */
+               result = (hole_pos > start) ?
+                               min_t(loff_t, hole_pos, isize) : start;
+
+               if (file->f_pos != result)
+                       file->f_pos = result;
+       }
+
+       xfs_iunlock_map_shared(ip, lockmode);
+
+       /* error is a positive errno here; hand it back negated. */
+       return error ? -error : result;
+}
+
+/*
+ * llseek entry point for regular XFS files.
+ *
+ * SEEK_SET, SEEK_CUR and SEEK_END are passed to generic_file_llseek();
+ * SEEK_DATA and SEEK_HOLE are handled by xfs_seek_data() and
+ * xfs_seek_hole().  Any other origin is rejected with -EINVAL, the error
+ * lseek(2) documents for an invalid whence.
+ */
+STATIC loff_t
+xfs_file_llseek(
+       struct file     *file,
+       loff_t          offset,
+       int             origin)
+{
+       switch (origin) {
+       case SEEK_END:
+       case SEEK_CUR:
+       case SEEK_SET:
+               return generic_file_llseek(file, offset, origin);
+       case SEEK_DATA:
+               return xfs_seek_data(file, offset, origin);
+       case SEEK_HOLE:
+               return xfs_seek_hole(file, offset, origin);
+       default:
+               return -EINVAL;
+       }
+}
+
 const struct file_operations xfs_file_operations = {
-       .llseek         = generic_file_llseek,
+       .llseek         = xfs_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,
        .aio_read       = xfs_file_aio_read,
-- 
1.7.4.1

<Prev in Thread] Current Thread [Next in Thread>