Introduce SEEK_DATA/SEEK_HOLE to XFS V5
Jeff Liu
jeff.liu at oracle.com
Fri Jan 6 07:28:58 CST 2012
Hello,
This is a revised patch addressing Christoph's comments on V4.
Changes to V5:
--------------
* Revise xfs_has_unwritten_buffer() to look up pages matching a given tag.
* For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call xfs_has_unwritten_buffer() to search
DIRTY pages first; if no dirty data is found, call it again to search WRITEBACK pages.
* In xfs_seek_hole(), if dirty data is found in the page cache for an unwritten extent but its start offset is past the start block
of the map, treat it as a hole and return the hole offset where possible (data_buffer_offset > max(seek_offset, start_block_of_map)). A userspace sketch of the intended semantics follows below.
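For reference, here is a minimal userspace sketch of the semantics this
patch implements (the mount point and file name are hypothetical, and the
expected offsets assume 4k pages): SEEK_DATA should land on dirty data
buffered over an unwritten extent, while a preallocated-but-unwritten
range with no cached data should be reported as a hole.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/xfs/testfile", O_RDWR | O_CREAT, 0644);
        if (fd < 0)
                return 1;

        /* Preallocate 1MB as an unwritten extent, then dirty one page. */
        if (fallocate(fd, 0, 0, 1 << 20) < 0 ||
            pwrite(fd, "x", 1, 64 << 10) != 1)
                return 1;

        /* Expect 64k: the dirty page cached over the unwritten extent. */
        off_t data = lseek(fd, 0, SEEK_DATA);

        /* Expect 0: no cached data at the start of the unwritten range. */
        off_t hole = lseek(fd, 0, SEEK_HOLE);

        printf("first data at %lld, first hole at %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}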
Tests:
------
seek sanity tester:
http://patchwork.xfs.org/patch/3108/
seek copy tester:
http://patchwork.xfs.org/patch/3109/
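As a quick manual setup for the unwritten-extent case (a hypothetical
example; the path and sizes are arbitrary), xfs_io can create a
preallocated file with a single dirty page, which is the layout the
testers above exercise:

$ xfs_io -f -c "falloc 0 1m" -c "pwrite 64k 4k" /mnt/xfs/testfile

With this patch, SEEK_DATA from offset 0 on that file is expected to
return 64k (the dirty page over the unwritten extent), and SEEK_HOLE
from offset 0 is expected to return 0.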
Thanks,
-Jeff
Signed-off-by: Jie Liu <jeff.liu at oracle.com>
---
fs/xfs/xfs_file.c | 466 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 465 insertions(+), 1 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 753ed9b..24ae40a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
#include <linux/dcache.h>
#include <linux/falloc.h>
+#include <linux/pagevec.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
return block_page_mkwrite(vma, vmf, xfs_get_blocks);
}
+/*
+ * Probe the data buffer offset in the page cache for unwritten extents.
+ * Fetch all the pages matching @tag, and iterate over each page to find
+ * out whether any buffer head has BH_Unwritten or BH_Uptodate set.
+ */
+STATIC bool
+xfs_has_unwritten_buffer(
+ struct inode *inode,
+ struct xfs_bmbt_irec *map,
+ int tag,
+ loff_t *offset)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct pagevec pvec;
+ pgoff_t index;
+ pgoff_t end;
+ bool found = false;
+
+ pagevec_init(&pvec, 0);
+
+ index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
+ end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
+ >> PAGE_CACHE_SHIFT;
+
+ do {
+ unsigned int i;
+ unsigned nr_pages;
+ int want = min_t(pgoff_t, end - index,
+ (pgoff_t)PAGEVEC_SIZE - 1) + 1;
+ nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
+ &index, tag, want);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh;
+ struct buffer_head *head;
+ xfs_fileoff_t last;
+
+ if (!page_has_buffers(page))
+ continue;
+
+ /*
+ * There is no need to check the following pages
+ * if the current page offset is out of range.
+ */
+ if (page->index > end)
+ goto out;
+
+ last = XFS_B_TO_FSBT(mp,
+ page->index << PAGE_CACHE_SHIFT);
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * An extent in XFS_EXT_UNWRITTEN has disk
+ * blocks already mapped to it, but no data
+ * has been committed to them yet. If it has
+ * dirty data in the page cache it can be
+ * identified by BH_Unwritten being set in
+ * each buffer. The buffer state might also
+ * be BH_Uptodate if writeback has already
+ * been started on it, so we need to examine
+ * that as well.
+ */
+ if (buffer_unwritten(bh) ||
+ buffer_uptodate(bh)) {
+ found = true;
+ *offset = XFS_FSB_TO_B(mp, last);
+ goto out;
+ }
+ last++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ /*
+ * If the number of probed pages is less than desired,
+ * there should be no more pages mapped; the search is done.
+ */
+ if (nr_pages < want)
+ break;
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index < end);
+
+out:
+ pagevec_release(&pvec);
+ if (!found)
+ *offset = 0;
+
+ return found;
+}
+
+STATIC loff_t
+xfs_seek_data(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t isize = i_size_read(inode);
+ loff_t offset = 0;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t len;
+ int lock;
+ int error;
+
+ lock = xfs_ilock_map_shared(ip);
+
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ loff_t seekoff;
+
+ error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_lock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (!nmap) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ seekoff = XFS_FSB_TO_B(mp, fsbno);
+ /*
+ * Landed in a hole, so go on to check the next extent.
+ * If the next extent is an in-memory (delayed allocation)
+ * data extent or a normal extent, it's fine to return.
+ * If the next extent is a hole, calculate the starting
+ * file system block number for the next scan.
+ * If the next extent is unwritten, we need to look up
+ * the page cache to examine the data buffer offset;
+ * if nothing is found, treat it as a hole extent too.
+ */
+ if (map[0].br_startblock == HOLESTARTBLOCK) {
+ /*
+ * Return ENXIO if there is no data extent
+ * beyond the given offset; in that case the
+ * seek offset lands in a hole.
+ */
+ if (nmap == 1) {
+ error = ENXIO;
+ break;
+ }
+
+ if (map[1].br_state == XFS_EXT_NORM ||
+ map[1].br_startblock == DELAYSTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /*
+ * Landed in an unwritten extent, so first try to find
+ * the data buffer offset in the page cache. If nothing
+ * is found, treat it as a hole and go on to check the
+ * next extent, just as above.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+
+ /* No data extent at the given offset */
+ if (nmap == 1) {
+ error = ENXIO;
+ break;
+ }
+
+ if (map[1].br_state == XFS_EXT_NORM ||
+ map[1].br_startblock == DELAYSTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ offset = max_t(loff_t, seekoff, offset);
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /* Landed in a delayed allocation extent or a real data extent */
+ if (map[0].br_startblock == DELAYSTARTBLOCK ||
+ map[0].br_state == XFS_EXT_NORM) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
+
+ /* Return ENXIO if beyond eof */
+ if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+ }
+
+ if (offset < start)
+ offset = start;
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_lock:
+ xfs_iunlock_map_shared(ip, lock);
+ if (error)
+ return -error;
+
+ return offset;
+}
+
+STATIC loff_t
+xfs_seek_hole(
+ struct file *file,
+ loff_t start)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t isize = i_size_read(inode);
+ loff_t offset = 0;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t fsbno;
+ xfs_filblks_t len;
+ int lock;
+ int error;
+
+ lock = xfs_ilock_map_shared(ip);
+
+ if (start >= isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ fsbno = XFS_B_TO_FSBT(mp, start);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ len = XFS_B_TO_FSB(mp, isize);
+
+ for (;;) {
+ struct xfs_bmbt_irec map[2];
+ int nmap = 2;
+ loff_t seekoff;
+
+ error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
+ XFS_BMAPI_ENTIRE);
+ if (error)
+ goto out_lock;
+
+ /* No extents at given offset, must be beyond EOF */
+ if (!nmap) {
+ error = ENXIO;
+ goto out_lock;
+ }
+
+ seekoff = XFS_FSB_TO_B(mp, fsbno);
+ /*
+ * Landed in an unwritten extent, so look up the page
+ * cache to find out whether there is dirty data. If
+ * nothing is found, treat it as a hole. If there is
+ * dirty data but its offset starts past both the start
+ * block of the map and the current seek offset, it
+ * should be treated as a hole too. Otherwise, go on to
+ * the next extent to look for holes.
+ */
+ if (map[0].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[0],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[0].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[0].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
+
+ /*
+ * No more extents at the given offset, return the
+ * total file size.
+ */
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ if (map[1].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /*
+ * Landed in a delayed allocation extent or a real data extent.
+ * If the next extent is a hole, or an unwritten extent with no
+ * data committed in the page cache, return its offset. If the
+ * next extent has dirty data in the page cache but its offset
+ * starts past both the start block of the map and the seek
+ * offset, it is still treated as a hole.
+ */
+ if (map[0].br_startblock == DELAYSTARTBLOCK ||
+ map[0].br_state == XFS_EXT_NORM) {
+ /*
+ * No more extents at the given offset, return the
+ * total file size.
+ */
+ if (nmap == 1) {
+ offset = isize;
+ break;
+ }
+
+ if (map[1].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
+ if (xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_DIRTY,
+ &offset) ||
+ xfs_has_unwritten_buffer(inode, &map[1],
+ PAGECACHE_TAG_WRITEBACK,
+ &offset)) {
+ if (offset > max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff))) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp,
+ map[1].br_startoff));
+ break;
+ }
+ } else {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[1].br_startoff));
+ break;
+ }
+ }
+
+ fsbno = map[1].br_startoff + map[1].br_blockcount;
+ }
+
+ /* Landed in a hole, it's fine to return */
+ if (map[0].br_startblock == HOLESTARTBLOCK) {
+ offset = max_t(loff_t, seekoff,
+ XFS_FSB_TO_B(mp, map[0].br_startoff));
+ break;
+ }
+
+ /* Return ENXIO if beyond eof */
+ if (XFS_FSB_TO_B(mp, fsbno) > isize) {
+ error = ENXIO;
+ goto out_lock;
+ }
+ }
+
+ if (offset < start)
+ offset = start;
+
+ if (offset != file->f_pos)
+ file->f_pos = offset;
+
+out_lock:
+ xfs_iunlock_map_shared(ip, lock);
+ if (error)
+ return -error;
+
+ return offset;
+}
+
+STATIC loff_t
+xfs_file_llseek(
+ struct file *file,
+ loff_t offset,
+ int origin)
+{
+ switch (origin) {
+ case SEEK_END:
+ case SEEK_CUR:
+ case SEEK_SET:
+ return generic_file_llseek(file, offset, origin);
+ case SEEK_DATA:
+ return xfs_seek_data(file, offset);
+ case SEEK_HOLE:
+ return xfs_seek_hole(file, offset);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
const struct file_operations xfs_file_operations = {
- .llseek = generic_file_llseek,
+ .llseek = xfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = xfs_file_aio_read,
--
1.7.4.1