Hi Ben,
On 01/11/2012 01:18 AM, Ben Myers wrote:
> Hey Jeff,
>
> On Fri, Jan 06, 2012 at 09:28:58PM +0800, Jeff Liu wrote:
>> This is a revised patch according to Christoph's comments at V4.
>
> I got caught up on versions 1-4, and am looking at this now.
> Thanks for your excellent contribution!
>
> I have some inital comments here, and then I'll get into it a little
> more deeply.
>
>> Changes to V5:
>> --------------
>> * Revise xfs_has_unwritten_buffer() to lookup pages match tag.
>> * For unwritten extents, in both xfs_seek_data() and xfs_seek_hole(), call
>> xfs_has_unwritten_buffer() to search
>> DIRTY pages firstly, if no dirty data found, call it again to search
>> WRITEBACK pages.
>> * In xfs_seek_hole(), if dirty data was found in page cache for an unwritten
>> extents, but its start offset past the start block
>> of the map, treat it as a hole, returns the offset if
>> possible(data_buffer_offset > max(seek_offset, start_block_of_map)).
>>
>> Tests:
>> ------
>> seek sanity tester:
>> http://patchwork.xfs.org/patch/3108/
>> seek copy tester:
>> http://patchwork.xfs.org/patch/3109/
>>
>>
>> Thanks,
>> -Jeff
>>
>> Signed-off-by: Jie Liu <jeff.liu@xxxxxxxxxx>
>>
>> ---
>> fs/xfs/xfs_file.c | 466
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 files changed, 465 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 753ed9b..24ae40a 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -38,6 +38,7 @@
>>
>> #include <linux/dcache.h>
>> #include <linux/falloc.h>
>> +#include <linux/pagevec.h>
>>
>> static const struct vm_operations_struct xfs_file_vm_ops;
>>
>> @@ -1141,8 +1142,471 @@ xfs_vm_page_mkwrite(
>> return block_page_mkwrite(vma, vmf, xfs_get_blocks);
>> }
>>
>> +/*
>> + * Probe the data buffer offset in page cache for unwritten extents.
>> + * Fetch all the pages match @tag, and iterate each page to find out
>> + * if a buffer head state has BH_Unwritten or BH_Uptodate set.
>> + */
>> +STATIC bool
>> +xfs_has_unwritten_buffer(
>> + struct inode *inode,
>> + struct xfs_bmbt_irec *map,
>> + int tag,
>> + loff_t *offset)
>> +{
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + struct pagevec pvec;
>> + pgoff_t index;
>> + pgoff_t end;
>> + bool found = false;
>> +
>> + pagevec_init(&pvec, 0);
>> +
>> + index = XFS_FSB_TO_B(mp, map->br_startoff) >> PAGE_CACHE_SHIFT;
>> + end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
>> + >> PAGE_CACHE_SHIFT;
>> +
>> + do {
>> + unsigned int i;
>> + unsigned nr_pages;
>> + int want = min_t(pgoff_t, end - index,
>> + (pgoff_t)PAGEVEC_SIZE - 1) + 1;
>> + nr_pages = pagevec_lookup_tag(&pvec, inode->i_mapping,
>> + &index, tag, want);
>> + if (nr_pages == 0)
>> + break;
>> +
>> + for (i = 0; i < nr_pages; i++) {
>> + struct page *page = pvec.pages[i];
>> + struct buffer_head *bh;
>> + struct buffer_head *head;
>> + xfs_fileoff_t last;
>
> You should take the page lock here, before looking at PagePrivate and
> the buffers. This is out of order with respect to the ilock. (The
> page_lock is held when xfs_vm_writepage calls xfs_map_blocks which will
> go after the ilock, and the same is the case when __xfs_get_blocks is
> called when writing into the page.) So... here you need to use trylock
> to avoid deadlocks, and assume that there is data on the page if you
> don't get the lock. Take a look at xfs_cluster_write and
> xfs_convert_page for an example.
Thanks for your timely response and pointing this out!
I am suffering with a data loss when improving seek copy tester to produce a
file with
around thousands of extents these days, looks the root cause is most likely
related to miss the page lock stuff.
>
> I think that this also means that you can check for PageDirty and
> PageWriteback under lock together... and so the upside is that you don't
> have to call this once for each tag anymore. Just use pagevec_lookup.
Cool. originally, I have prepared to fix the below stupid code blocks at V5:
if (offset > max_t(loff_t, seekoff, XFS_FSB_TO_B(mp, map[0].br_startoff))) {
offset = max_t(loff_t, seekoff, XFS_FSB_TO_B(mp, map[0].br_startoff));
break;
}
to:
max_off = max_t(loff_t, seekoff, XFS_FSB_TO_B(mp, map[0].br_startoff));
if (offset > max_off) {
offset = max_off;
break;
}
With your comments, this issue will gone :-P.
>
>> +
>> + if (!page_has_buffers(page))
>> + continue;
>> +
>> + /*
>> + * There is no need to check the following pages
>> + * if the current page offset is out of range.
>> + */
>> + if (page->index > end)
>> + goto out;
>> +
>> + last = XFS_B_TO_FSBT(mp,
>> + page->index << PAGE_CACHE_SHIFT);
>> +
>> + bh = head = page_buffers(page);
>> + do {
>> + /*
>> + * An extent in XFS_EXT_UNWRITTEN have disk
>> + * blocks already mapped to it, but no data
>> + * has been committed to them yet. If it has
>> + * dirty data in the page cache it can be
>> + * identified by having BH_Unwritten set in
>> + * each buffer. Also, the buffer head state
>> + * might be in BH_Uptodate too if the buffer
>> + * writeback procedure was fired, we need to
>> + * examine it as well.
>> + */
>> + if (buffer_unwritten(bh) ||
>> + buffer_uptodate(bh)) {
>> + found = true;
>> + *offset = XFS_FSB_TO_B(mp, last);
>> + goto out;
>> + }
>> + last++;
>> + } while ((bh = bh->b_this_page) != head);
>> + }
>> +
>> + /*
>> + * If the number of probed pages less than our desired,
>> + * there should no more pages mapped, search done.
>> + */
>> + if (nr_pages < want)
>> + break;
>> +
>> + index = pvec.pages[i - 1]->index + 1;
>> + pagevec_release(&pvec);
>> + } while (index < end);
>> +
>> +out:
>> + pagevec_release(&pvec);
>> + if (!found)
>> + *offset = 0;
>> +
>> + return found;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_seek_data(
>> + struct file *file,
>> + loff_t start)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_fsize_t isize = i_size_read(inode);
>> + loff_t offset = 0;
>> + struct xfs_ifork *ifp;
>> + xfs_fileoff_t fsbno;
>> + xfs_filblks_t len;
>> + int lock;
>> + int error;
>> +
>> + lock = xfs_ilock_map_shared(ip);
>> +
>> + if (start >= isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + fsbno = XFS_B_TO_FSBT(mp, start);
>> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>
> Nit: calculation of start_fsb and end_fsb belong together. Just move
> ifp up a line. ;)
Hmm, looks we can safely remove ifp line since XFS_BMAPI_ENTIRE is used for
xfs_bmapi_read().
>
> That's all I have for right now. I'll be looking at it in greater
> detail today.
So I'll collect all your comments and submit it again.
Thanks,
-Jeff
>
> Thanks,
> Ben
>
>> + len = XFS_B_TO_FSB(mp, isize);
>> +
>> + for (;;) {
>> + struct xfs_bmbt_irec map[2];
>> + int nmap = 2;
>> + loff_t seekoff;
>> +
>> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
>> + XFS_BMAPI_ENTIRE);
>> + if (error)
>> + goto out_lock;
>> +
>> + /* No extents at given offset, must be beyond EOF */
>> + if (!nmap) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + seekoff = XFS_FSB_TO_B(mp, fsbno);
>> + /*
>> + * Landed in a hole, skip to check the next extent.
>> + * If the next extent landed in an in-memory data extent,
>> + * or it is a normal extent, its fine to return.
>> + * If the next extent landed in a hole extent, calculate
>> + * the start file system block number for the next scan.
>> + * If the next extent landed in an unwritten extent, we
>> + * need to lookup the page cache to examine the data
>> + * buffer offset, if nothing found, treat it as a hole
>> + * extent too.
>> + */
>> + if (map[0].br_startblock == HOLESTARTBLOCK) {
>> + /*
>> + * Return ENXIO if no data extent behind
>> + * the given offset. In this case, the seek
>> + * offset should be landed in a hole.
>> + */
>> + if (nmap == 1) {
>> + error = ENXIO;
>> + break;
>> + }
>> +
>> + if (map[1].br_state == XFS_EXT_NORM ||
>> + map[1].br_startblock == DELAYSTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> +
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /*
>> + * Landed in an unwritten extent, try to find out the data
>> + * buffer offset from page cache firstly. If nothing was
>> + * found, treat it as a hole, and skip to check the next
>> + * extent, something just like above.
>> + */
>> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> +
>> + /* No data extent at the given offset */
>> + if (nmap == 1) {
>> + error = ENXIO;
>> + break;
>> + }
>> +
>> + if (map[1].br_state == XFS_EXT_NORM ||
>> + map[1].br_startblock == DELAYSTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + offset = max_t(loff_t, seekoff, offset);
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /* Landed in a delay allocated extent or a real data extent */
>> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
>> + map[0].br_state == XFS_EXT_NORM) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /* Return ENXIO if beyond eof */
>> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> + }
>> +
>> + if (offset < start)
>> + offset = start;
>> +
>> + if (offset != file->f_pos)
>> + file->f_pos = offset;
>> +
>> +out_lock:
>> + xfs_iunlock_map_shared(ip, lock);
>> + if (error)
>> + return -error;
>> +
>> + return offset;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_seek_hole(
>> + struct file *file,
>> + loff_t start)
>> +{
>> + struct inode *inode = file->f_mapping->host;
>> + struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_fsize_t isize = i_size_read(inode);
>> + loff_t offset = 0;
>> + struct xfs_ifork *ifp;
>> + xfs_fileoff_t fsbno;
>> + xfs_filblks_t len;
>> + int lock;
>> + int error;
>> +
>> + lock = xfs_ilock_map_shared(ip);
>> +
>> + if (start >= isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + fsbno = XFS_B_TO_FSBT(mp, start);
>> + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
>> + len = XFS_B_TO_FSB(mp, isize);
>> +
>> + for (;;) {
>> + struct xfs_bmbt_irec map[2];
>> + int nmap = 2;
>> + loff_t seekoff;
>> +
>> + error = xfs_bmapi_read(ip, fsbno, len - fsbno, map, &nmap,
>> + XFS_BMAPI_ENTIRE);
>> + if (error)
>> + goto out_lock;
>> +
>> + /* No extents at given offset, must be beyond EOF */
>> + if (!nmap) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> +
>> + seekoff = XFS_FSB_TO_B(mp, fsbno);
>> + /*
>> + * Landed in an unwritten extent, try to lookup the page
>> + * cache to find out if there is dirty data or not. If
>> + * nothing was found, treate it as a hole. If there has
>> + * dirty data and its offset starts past both the start
>> + * block of the map and the current seek offset, it should
>> + * be treated as hole too. Otherwise, go through the next
>> + * extent to fetch holes.
>> + */
>> + if (map[0].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[0],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[0].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[0].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /*
>> + * No more extent at the given offst, return the total
>> + * file size.
>> + */
>> + if (nmap == 1) {
>> + offset = isize;
>> + break;
>> + }
>> +
>> + if (map[1].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /*
>> + * Landed in a delay allocated extent or a real data extent,
>> + * if the next extent is landed in a hole or in an unwritten
>> + * extent but without data committed in the page cache, return
>> + * its offset. If the next extent has dirty data in page cache,
>> + * but its offset starts past both the start block of the map
>> + * and the seek offset, it still be a hole.
>> + */
>> + if (map[0].br_startblock == DELAYSTARTBLOCK ||
>> + map[0].br_state == XFS_EXT_NORM) {
>> + /*
>> + * No more extent at the give offset, return the
>> + * total file size.
>> + */
>> + if (nmap == 1) {
>> + offset = isize;
>> + break;
>> + }
>> +
>> + if (map[1].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + } else if (map[1].br_state == XFS_EXT_UNWRITTEN) {
>> + if (xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_DIRTY,
>> + &offset) ||
>> + xfs_has_unwritten_buffer(inode, &map[1],
>> + PAGECACHE_TAG_WRITEBACK,
>> + &offset)) {
>> + if (offset > max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff))) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp,
>> + map[1].br_startoff));
>> + break;
>> + }
>> + } else {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[1].br_startoff));
>> + break;
>> + }
>> + }
>> +
>> + fsbno = map[1].br_startoff + map[1].br_blockcount;
>> + }
>> +
>> + /* Landed in a hole, its fine to return */
>> + if (map[0].br_startblock == HOLESTARTBLOCK) {
>> + offset = max_t(loff_t, seekoff,
>> + XFS_FSB_TO_B(mp, map[0].br_startoff));
>> + break;
>> + }
>> +
>> + /* Return ENXIO if beyond eof */
>> + if (XFS_FSB_TO_B(mp, fsbno) > isize) {
>> + error = ENXIO;
>> + goto out_lock;
>> + }
>> + }
>> +
>> + if (offset < start)
>> + offset = start;
>> +
>> + if (offset != file->f_pos)
>> + file->f_pos = offset;
>> +
>> +out_lock:
>> + xfs_iunlock_map_shared(ip, lock);
>> + if (error)
>> + return -error;
>> +
>> + return offset;
>> +}
>> +
>> +STATIC loff_t
>> +xfs_file_llseek(
>> + struct file *file,
>> + loff_t offset,
>> + int origin)
>> +{
>> + switch (origin) {
>> + case SEEK_END:
>> + case SEEK_CUR:
>> + case SEEK_SET:
>> + return generic_file_llseek(file, offset, origin);
>> + case SEEK_DATA:
>> + return xfs_seek_data(file, offset);
>> + case SEEK_HOLE:
>> + return xfs_seek_hole(file, offset);
>> + default:
>> + return -EOPNOTSUPP;
>> + }
>> +}
>> +
>> const struct file_operations xfs_file_operations = {
>> - .llseek = generic_file_llseek,
>> + .llseek = xfs_file_llseek,
>> .read = do_sync_read,
>> .write = do_sync_write,
>> .aio_read = xfs_file_aio_read,
|