xfs
[Top] [All Lists]

Re: + ext4-add-dax-functionality.patch added to -mm tree

To: Jan Kara <jack@xxxxxxx>
Subject: Re: + ext4-add-dax-functionality.patch added to -mm tree
From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
Date: Fri, 20 Feb 2015 17:15:51 -0500
Cc: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>, "Wilcox, Matthew R" <matthew.r.wilcox@xxxxxxxxx>, "ross.zwisler@xxxxxxxxxxxxxxx" <ross.zwisler@xxxxxxxxxxxxxxx>, "akpm@xxxxxxxxxxxxxxxxxxxx" <akpm@xxxxxxxxxxxxxxxxxxxx>, "Dilger, Andreas" <andreas.dilger@xxxxxxxxx>, "axboe@xxxxxxxxx" <axboe@xxxxxxxxx>, "boaz@xxxxxxxxxxxxx" <boaz@xxxxxxxxxxxxx>, "david@xxxxxxxxxxxxx" <david@xxxxxxxxxxxxx>, "hch@xxxxxx" <hch@xxxxxx>, "kirill.shutemov@xxxxxxxxxxxxxxx" <kirill.shutemov@xxxxxxxxxxxxxxx>, "mathieu.desnoyers@xxxxxxxxxxxx" <mathieu.desnoyers@xxxxxxxxxxxx>, "rdunlap@xxxxxxxxxxxxx" <rdunlap@xxxxxxxxxxxxx>, "tytso@xxxxxxx" <tytso@xxxxxxx>, "mm-commits@xxxxxxxxxxxxxxx" <mm-commits@xxxxxxxxxxxxxxx>, "linux-ext4@xxxxxxxxxxxxxxx" <linux-ext4@xxxxxxxxxxxxxxx>, xfs@xxxxxxxxxxx
Delivered-to: xfs@xxxxxxxxxxx
In-reply-to: <20150218104009.GB4614@xxxxxxxxxxxxx>
References: <54b45495.+RptMlNQorYE9TTf%akpm@xxxxxxxxxxxxxxxxxxxx> <20150115124106.GF12739@xxxxxxxxxxxxx> <100D68C7BA14664A8938383216E40DE040853440@xxxxxxxxxxxxxxxxxxxxxxxxxxxx> <20150119141858.GF5662@xxxxxxxxxxxxx> <20150217085200.GA23192@xxxxxxxxxxxxx> <20150217133745.GG3364@xxxxxx> <20150218104009.GB4614@xxxxxxxxxxxxx>
User-agent: Mutt/1.5.23 (2014-03-12)
> So to handle this it can start transaction in ext4_dax_fault() /
> ext4_dax_mkwrite() if write is requested and call ext4_jbd2_file_inode()
> after dax_fault() / dax_mkwrite() returns. Complete function will look
> something like follows:

How about this?  I tried to encompass both the unwritten extent conversion
and starting the journal at the right point in the locking hierarchy.

If we're going to expose do_dax_fault(), I think it needs to be called
__dax_fault().

I decided to return VM_FAULT_RETRY and a new flag VM_FAULT_UNWRITTEN from
__dax_fault(), rather than convert it to return an errno.

P.S. I love patches which touch *both* fs.h *and* mm.h.  In case there
were any files that weren't already being rebuilt.

diff --git a/fs/dax.c b/fs/dax.c
index 556238f..81dbdaa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -316,7 +316,7 @@ static int dax_insert_mapping(struct inode *inode, struct 
buffer_head *bh,
        return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
                        get_block_t get_block)
 {
        struct file *file = vma->vm_file;
@@ -329,7 +329,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct 
vm_fault *vmf,
        sector_t block;
        pgoff_t size;
        int error;
-       int major = 0;
+       int ret = 0;
 
        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (vmf->pgoff >= size)
@@ -367,13 +367,15 @@ static int do_dax_fault(struct vm_area_struct *vma, 
struct vm_fault *vmf,
                error = -EIO;           /* fs corruption? */
        if (error)
                goto unlock_page;
+       if (buffer_unwritten(&bh))
+               ret |= VM_FAULT_UNWRITTEN;
 
        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        error = get_block(inode, block, &bh, 1);
                        count_vm_event(PGMAJFAULT);
                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-                       major = VM_FAULT_MAJOR;
+                       ret = VM_FAULT_MAJOR;
                        if (!error && (bh.b_size < PAGE_SIZE))
                                error = -EIO;
                        if (error)
@@ -407,7 +409,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct 
vm_fault *vmf,
        }
 
        /* Check we didn't race with a read fault installing a new page */
-       if (!page && major)
+       if (!page && (ret & VM_FAULT_MAJOR))
                page = find_lock_page(mapping, vmf->pgoff);
 
        if (page) {
@@ -421,12 +423,14 @@ static int do_dax_fault(struct vm_area_struct *vma, 
struct vm_fault *vmf,
        error = dax_insert_mapping(inode, &bh, vma, vmf);
 
  out:
+       if (error == -ENOSPC)
+               return VM_FAULT_RETRY | ret;
        if (error == -ENOMEM)
-               return VM_FAULT_OOM | major;
+               return VM_FAULT_OOM | ret;
        /* -EBUSY is fine, somebody else faulted on the same PTE */
        if ((error < 0) && (error != -EBUSY))
-               return VM_FAULT_SIGBUS | major;
-       return VM_FAULT_NOPAGE | major;
+               return VM_FAULT_SIGBUS | ret;
+       return VM_FAULT_NOPAGE | ret;
 
  unlock_page:
        if (page) {
@@ -435,6 +439,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct 
vm_fault *vmf,
        }
        goto out;
 }
+EXPORT_SYMBOL_GPL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -455,7 +460,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault 
*vmf,
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
        }
-       result = do_dax_fault(vma, vmf, get_block);
+       result = __dax_fault(vma, vmf, get_block);
        if (vmf->flags & FAULT_FLAG_WRITE)
                sb_end_pagefault(sb);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4340e38..84b4f1c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -194,7 +194,58 @@ errout:
 #ifdef CONFIG_FS_DAX
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext4_get_block_write);
+       handle_t *handle;
+       int create = (vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page;
+       struct inode *inode = file_inode(vma->vm_file);
+       int ret, err = 0;
+       int retries = 0;
+
+       if (create) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+ retry_alloc:
+               handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+                                       ext4_writepage_trans_blocks(inode));
+               if (IS_ERR(handle)) {
+                       err = PTR_ERR(handle);
+                       goto err;
+               }
+       }
+
+       ret = __dax_fault(vma, vmf, ext4_get_block);
+
+       if (create) {
+               if (ret & VM_FAULT_UNWRITTEN) {
+                       loff_t offset = (loff_t)vmf->pgoff << PAGE_SHIFT;
+                       err = ext4_convert_unwritten_extents(NULL, inode,
+                                                       offset, PAGE_SIZE);
+                       ret &= ~VM_FAULT_UNWRITTEN;
+               }
+               if (!err &&
+                   ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
+                       err = ext4_jbd2_file_inode(handle, inode);
+
+               if (err == -ENOSPC) {
+                       ret |= VM_FAULT_RETRY;
+                       err = 0;
+               }
+
+               ext4_journal_stop(handle);
+               if (err < 0)
+                       goto err;
+               if ((ret & VM_FAULT_RETRY) &&
+                   ext4_should_retry_alloc(inode->i_sb, &retries))
+                       goto retry_alloc;
+               ret &= ~VM_FAULT_RETRY;
+       }
+
+ out:
+       if (create)
+               sb_end_pagefault(inode->i_sb);
+       return ret;
+ err:
+       ret = block_page_mkwrite_return(err);
+       goto out;
 }
 
 static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 85404f1..8f1ea7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,18 +657,6 @@ has_zeroout:
        return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-       struct inode *inode = bh->b_assoc_map->host;
-       /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-       loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-       int err;
-       if (!uptodate)
-               return;
-       WARN_ON(!buffer_unwritten(bh));
-       err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -706,11 +694,6 @@ static int _ext4_get_block(struct inode *inode, sector_t 
iblock,
 
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-               if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
-                       bh->b_assoc_map = inode->i_mapping;
-                       bh->b_private = (void *)(unsigned long)iblock;
-                       bh->b_end_io = ext4_end_io_unwritten;
-               }
                if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
                        set_buffer_defer_completion(bh);
                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 239c89c..2af5050 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2597,6 +2597,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long 
size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, 
get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
                                        unsigned int flags, get_block_t);
 #define dax_mkwrite(vma, vmf, gb)      dax_fault(vma, vmf, gb)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ceb50ec..ffc9947 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1100,7 +1100,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_HWPOISON 0x0010       /* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index 
encoded in upper bits */
 #define VM_FAULT_SIGSEGV 0x0040
-
+#define VM_FAULT_UNWRITTEN 0x0080      /* Unwritten extent needs conversion */
 #define VM_FAULT_NOPAGE        0x0100  /* ->fault installed the pte, not 
return page */
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */

<Prev in Thread] Current Thread [Next in Thread>