
RE: kioclusters

To: 'Rajagopal Ananthanarayanan' <ananth@xxxxxxx>
Subject: RE: kioclusters
From: "Davida, Joe" <Joe_Davida@xxxxxxxxxx>
Date: Tue, 5 Dec 2000 09:54:32 -0700
Cc: "'linux-xfs@xxxxxxxxxxx'" <linux-xfs@xxxxxxxxxxx>
Sender: owner-linux-xfs@xxxxxxxxxxx
        Thank you, Rajagopal.
        I will try the patch. By "latest", do you mean as of what date?
        My current tree is as of last Friday, 12/1.
        Cheers,

        Joe

>-----Original Message-----
>From: Rajagopal Ananthanarayanan [mailto:ananth@xxxxxxx]
>Sent: Monday, December 04, 2000 4:53 PM
>To: Davida, Joe
>Cc: 'linux-xfs@xxxxxxxxxxx'
>Subject: Re: kioclusters
>
>
>"Davida, Joe" wrote:
>> 
>> You say you have seen performance problems with non-sequential
>> localized writes using the kiocluster option on mounts.
>> Is the performance problem bad enough that it is worse
>> than ext2fs?
>> 
>> Joe
>
>
>The performance difference really depends on what is being run.
>We have noticed that for dbench with a small number of clients,
>xfs+kiocluster does worse than ext2 ... but for dbench with a
>larger number of clients, xfs+kiocluster does better than ext2.
>
>I've been working on a patch which, in my tests with dbench,
>has improved xfs+kiocluster significantly. If you are working
>with a test system, could you please try the following patch
>(in conjunction with the kiocluster option)? The patch is against
>the latest development xfs tree ...
>
>
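>At a high level, the patch defers dirty-memory throttling: instead of
>calling balance_dirty() for every buffer, the write paths accumulate a
>"dirty" indicator and balance once at the end of xfs_write(). A minimal
>sketch of that pattern (simplified from the patch below; the helper name
>here is illustrative, the real one is set_buffer_dirty_uptodate):
>
>    static int mark_dirty_deferred(struct buffer_head *bh)
>    {
>            int need_balance_dirty = 0;
>
>            set_bit(BH_Uptodate, &bh->b_state);
>            if (!buffer_dirty(bh))
>                    need_balance_dirty = 1;  /* newly dirtied */
>            __mark_buffer_dirty(bh);
>            return need_balance_dirty;       /* caller throttles once */
>    }
>
>    /* at the end of the write path:            */
>    /*     if (dirty)                           */
>    /*             balance_dirty(ip->i_dev);    */
>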
>----------------- patch begins -------------------------------
>diff -Naur ../../xfs-tot/linux/fs/pagebuf/page_buf.c ./fs/pagebuf/page_buf.c
>--- ../../xfs-tot/linux/fs/pagebuf/page_buf.c   Tue Nov 28 16:34:17 2000
>+++ ./fs/pagebuf/page_buf.c     Thu Nov 30 10:29:39 2000
>@@ -186,7 +186,7 @@
> unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ, HZ/2, 1, 0, 0 };
> unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, HZ*30, 1024, 4096, 1 };
> 
>-pagebuf_param_t pb_params = {{ HZ, 15 * HZ, HZ, 512, 1024, 0 }};
>+pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 2*HZ, 512, 1024, 0 }};
> 
> /*
>  * Pagebuf statistics variables
>@@ -595,7 +595,11 @@
>                        kp->maplist[pi] = cp;
>                } else {
>                        cp = kp->maplist[pi];
>-                       set_bit(PG_referenced, &cp->flags);
>+                       /*
>+                        * Don't set referenced bit;
>+                        * we may consider age_page_up.
>+                       //set_bit(PG_referenced, &cp->flags);
>+                        */
>                        while (TryLockPage(cp)) {
>                                wait_on_page(cp);
>                        }
>@@ -1345,7 +1349,7 @@
>        struct buffer_head *bh;
>        off_t blk_offset;
>        size_t blk_length;
>-       int err=0;
>+       int err=0, need_balance_dirty = 0;
>        int force_io = (rw != READ) || (pb->pb_flags & PBF_FORCEIO);
>       int concat_ok = ((MAJOR(dev) != LVM_BLK_MAJOR) || (MAJOR(dev) != MD_MAJOR));
> 
>@@ -1425,7 +1429,9 @@
> 
>                        if (rw == WRITE ) {
>                                set_bit(BH_Uptodate, &bh->b_state);
>-                               set_bit(BH_Dirty, &bh->b_state);
>+                               if (!buffer_dirty(bh))
>+                                       need_balance_dirty = 1;
>+                               __mark_buffer_dirty(bh);
>                        }
>                        psync->bh[cnt++] = bh;
>                        atomic_inc(&psync->remain);
>@@ -1444,7 +1450,8 @@
>                if (locking)
>                        UnlockPage(page);
>        }
>-
>+       if (need_balance_dirty)
>+               balance_dirty(dev);
>        return err;
> error:
>        /* If we ever do get here then clean up what we already did */
>diff -Naur ../../xfs-tot/linux/fs/pagebuf/page_buf_io.c ./fs/pagebuf/page_buf_io.c
>--- ../../xfs-tot/linux/fs/pagebuf/page_buf_io.c        Tue Nov 28 16:34:17 2000
>+++ ./fs/pagebuf/page_buf_io.c  Fri Dec  1 10:31:28 2000
>@@ -93,11 +93,11 @@
> /*
>  * Forward declarations.
>  */
>-STATIC void __pb_block_commit_write_async(struct inode *,
>+STATIC int __pb_block_commit_write_async(struct inode *,
>                struct page *, page_buf_bmap_t *);
> STATIC int __pb_block_prepare_write_async(struct inode *, struct page *,
>                unsigned, unsigned, int, page_buf_bmap_t *, int);
>-STATIC int pagebuf_delalloc_convert(struct page *, u_long, struct page **);
>+STATIC int pagebuf_delalloc_convert(struct page *, u_long, struct page **, int, int);
> STATIC void hook_buffers_to_page(struct inode *, struct page *,
>                page_buf_bmap_t *, ulong);
> 
>@@ -114,6 +114,20 @@
> int MAX_CLUSTER = 512;
> int CLUSTER_PAGE_LIST_SIZE = ((2*512)+1);
> 
>+/*
>+ * stats
>+ */
>+struct pb_io_stat_s {
>+       int pcd_normal_scan;
>+       int pcd_normal_converted;
>+
>+       int pcd_skip_locked;
>+       int pcd_skip_referenced;
>+
>+       int pcd_ilock_failed;
>+} pb_io_stat = {0, 0, 0, 0, 0};
>+
>+EXPORT_SYMBOL(pb_io_stat);
> 
> /*
>  * The minimum size where we will start using pagebuf structures instead
>@@ -210,10 +224,12 @@
> 
>        if (IS_KIOCLUSTER(ip)){
>                /*
>-                * If kmalloc, no big deal; the lower layers won't cluster.
>+                * If kmalloc fails, no big deal; the lower layers won't
>+                * cluster. Also, this allocation has to be non-sleeping
>+                * since this can be in kswapd's path ...
>                 */
>                cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
>-                                          GFP_KERNEL);
>+                                          GFP_ATOMIC);
>        } else {
>                cpages = (struct page **)1; /* a boolean */
>        }
>@@ -235,7 +251,7 @@
> 
>                        lock_page(page);
>                        if (test_and_clear_bit(PG_delalloc, &page->flags)) {
>-                               pagebuf_delalloc_convert(page, 0, cpages);
>+                               pagebuf_delalloc_convert(page, 0, cpages, 0, 0);
>                        } else {
>                                UnlockPage(page);
>                        }
>@@ -511,7 +527,8 @@
> int pagebuf_iozero(            /* zero contents of buffer      */
>     page_buf_t * pb,           /* buffer to zero               */
>     off_t boff,                        /* offset in buffer             */
>-    size_t bsize)              /* size of data to zero           */
>+    size_t bsize,              /* size of data to zero           */
>+    int  *dirty)
> {
>        loff_t cboff;
>        size_t cpoff;
>@@ -541,6 +558,7 @@
>                if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
>                        if (test_and_set_bit(PG_delalloc, &pm->flags) == 0) {
>                                atomic_inc(&pb_delalloc_pages);
>+                               (*dirty)++;
>                        }
>                }
>        }
>@@ -548,16 +566,6 @@
>        pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
>        pb->pb_flags &= ~(_PBF_SOME_INVALID_PAGES | PBF_PARTIAL | PBF_NONE);
> 
>-       if (!pcd_active && (pb->pb_bn == PAGE_BUF_DADDR_NULL)) {
>-               unsigned int    np = atomic_read(&pb_delalloc_pages);
>-
>-               if (np > 2 * pb_params.p_un.max_dirty_pages)
>-                       wake_up_interruptible_sync(&pcd_waitq);
>-               else if (np > pb_params.p_un.max_dirty_pages)
>-                       wake_up_interruptible(&pcd_waitq);
>-       }
>-
>-
>        return (0);
> }
> 
>@@ -1174,62 +1182,6 @@
>                        page, page->index, bh->b_blocknr));
> }
> 
>-
>-void
>-set_buffer_dirty_uptodate(struct buffer_head *bh)
>-{
>-       int need_balance_dirty = 0;
>-
>-       if (bh->b_blocknr <= 0) {
>-               printk("Warning: buffer 0x%p with weird blockno (%ld)\n",
>-                       bh, bh->b_blocknr);
>-       }
>-       set_bit(BH_Uptodate, &bh->b_state);
>-       if (!buffer_dirty(bh)) {
>-               bh->b_end_io = end_pb_buffer_io_async;
>-               need_balance_dirty = 1;
>-       }
>-       __mark_buffer_dirty(bh);
>-
>-       if (need_balance_dirty)
>-               balance_dirty(bh->b_dev);
>-}
>-
>-int pbwcm_debug = 0;
>-
>-int
>-__pb_write_or_convert_bmap(
>-       struct inode *inode,
>-       struct page *page)
>-{
>-       loff_t  offset = page->index << PAGE_CACHE_SHIFT;
>-       int error, nmaps;
>-       page_buf_bmap_t map;
>-
>-       error = inode->i_op->pagebuf_bmap(inode, offset, PAGE_CACHE_SIZE,
>-                       &map, 1, &nmaps, PBF_WRITE);
>-       if (error == 0 && (map.pbm_flags & PBMF_DELAY)) {
>-               error = inode->i_op->pagebuf_bmap(inode, offset,
>-                               map.pbm_bsize, &map, 1,
>-                               &nmaps, PBF_WRITE|PBF_FILE_ALLOCATE);
>-               if (error) {
>-                       printk("pbwcm: bmap error %d ro 0x%Lx size 0x%x\n",
>-                                  error, offset, map.pbm_bsize);
>-               } else {
>-                       dprintk(pbwcm_debug,
>-                        ("converted bn:%Ld off:%Ld size:%d flags:%d\n",
>-                            map.pbm_bn, map.pbm_offset,
>-                            map.pbm_bsize, map.pbm_flags));
>-               }
>-       }
>-       if (!error) {
>-               hook_buffers_to_page(inode, page, &map, PAGE_CACHE_SHIFT);
>-               set_buffer_dirty_uptodate(page->buffers);
>-       }
>-       return error;
>-}
>-
>-
> STATIC int
> __pb_block_prepare_write_async(struct inode *inode, struct page *page,
>                unsigned from, unsigned to, int at_eof,
>@@ -1390,15 +1342,34 @@
> }
> 
> int pbcw_debug = 0;
>+
>+int
>+set_buffer_dirty_uptodate(struct buffer_head *bh)
>+{
>+       int need_balance_dirty = 0;
>+
>+       if (bh->b_blocknr <= 0) {
>+               printk("Warning: buffer 0x%p with weird blockno (%ld)\n",
>+                       bh, bh->b_blocknr);
>+       }
>+       set_bit(BH_Uptodate, &bh->b_state);
>+       if (!buffer_dirty(bh)) {
>+               bh->b_end_io = end_pb_buffer_io_async;
>+               need_balance_dirty = 1;
>+       }
>+       __mark_buffer_dirty(bh);
>+       return (need_balance_dirty);
>+}
>+
> int pbcw_debug2 = 0;
> 
>-STATIC void
>+STATIC int
> __pb_block_commit_write_async(struct inode     *inode,
>                                struct page     *page,
>                                page_buf_bmap_t *mp)
> {
>        struct buffer_head      *bh;
>-       unsigned int            np;
>+       int                     dirty = 0;
> 
>        /*
>         * Prepare write took care of reading/zero-out
>@@ -1412,32 +1383,20 @@
>                if (test_bit(PG_delalloc, &page->flags)) {
>                        dprintk(pbcw_debug2, ("mapped buffer 0x%p page 0x%p is delalloc\n", bh, page));
>                }
>-               set_buffer_dirty_uptodate(page->buffers);
>+               dirty = set_buffer_dirty_uptodate(page->buffers);
>                dprintk(pbcw_debug, ("pbcw: refiled valid buffer 0x%p\n",
>                        page->buffers));
>        } else if (test_and_set_bit(PG_delalloc, &page->flags) == 0) {
>                dprintk(pbcw_debug, ("Marking page 0x%p delalloc\n", page));
>-               np = atomic_read(&pb_delalloc_pages);
>-               if (np > PB_MAX_DIRTY_FACTOR * pb_params.p_un.max_dirty_pages) {
>-                       clear_bit(PG_delalloc, &page->flags);
>-                       if (__pb_write_or_convert_bmap(inode, page)) {
>-                               BUG();
>-                       }
>-               } else {
>-                       atomic_inc(&pb_delalloc_pages);
>-                       if (!pcd_active) {
>-                       if (np > 2 * pb_params.p_un.max_dirty_pages)
>-                                       wake_up_interruptible_sync(&pcd_waitq);
>-                       else if (np > pb_params.p_un.max_dirty_pages)
>-                                       wake_up_interruptible(&pcd_waitq);
>-                       }
>-                       balance_dirty(inode->i_rdev);
>-               }
>+
>+               atomic_inc(&pb_delalloc_pages);
>+               dirty = 1;
>        }
> 
>        /* Advance though extent no matter what */
>        if (mp)
>                mp->pbm_delta += PAGE_CACHE_SIZE;
>+       return dirty;
> }
> 
> int
>@@ -1448,7 +1407,8 @@
>        char            *user_addr,
>        size_t          len,
>        loff_t          *lp,
>-       page_buf_bmap_t *mp)            /* bmap for page                */
>+       page_buf_bmap_t *mp,            /* bmap for page                */
>+       int             *dirty)
> {
>        struct page *page;
>        unsigned long done;
>@@ -1507,7 +1467,7 @@
>                        goto unlock;
>                }
> 
>-               __pb_block_commit_write_async(inode, page, mp);
>+               *dirty += __pb_block_commit_write_async(inode, page, mp);
> 
>                foff += bytes_in_page;
>                len -= bytes_in_page;
>@@ -1533,7 +1493,8 @@
>        char *buf,              /* buffer address               */
>        size_t len,             /* size of buffer               */
>        loff_t * lp,            /* file offset to use and update */
>-       int pb_flags)           /* flags to pass to bmap calls  */
>+       int pb_flags,           /* flags to pass to bmap calls  */
>+       int *dirty)
> {
>        struct inode *inode = filp->f_dentry->d_inode;
>        page_buf_bmap_t map;
>@@ -1628,7 +1589,7 @@
>                 */
>                status = __pagebuf_do_delwri(inode,
>                                rounded_offset, size, buf,
>-                               len, &foff, &map);
>+                               len, &foff, &map, dirty);
>                if (status <= 0)
>                        break;
>                written += status;
>@@ -1646,7 +1607,8 @@
>     struct file * filp,                /* file to write                */
>     char *buf,                 /* buffer address               */
>     size_t len,                        /* size of buffer               */
>-    loff_t * lp)               /* file offset to use and update */
>+    loff_t * lp,               /* file offset to use and update */
>+    int        *dirty)
> {
>        struct inode *inode = filp->f_dentry->d_inode;
>        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
>@@ -1711,7 +1673,7 @@
> 
>                if (!page) {
>                        status = _pagebuf_file_write(filp,
>-                                       buf, len, &foff, pb_flags);
>+                                       buf, len, &foff, pb_flags, dirty);
>                        if (status > 0)
>                                written += status;
> 
>@@ -1748,7 +1710,7 @@
>                        goto unlock;
>                }
> 
>-               __pb_block_commit_write_async(inode, page, &map);
>+               *dirty += __pb_block_commit_write_async(inode, page, &map);
> 
>                len -= bytes;
>                buf += bytes;
>@@ -1773,8 +1735,6 @@
> }
> 
> int pcd_debug = 0;
>-int pcd_skip_locked = 0;
>-int pcd_ilock_failed = 0;
> static int page_cleaner_daemon_started = 0;
> static int daemon_terminate = 0;
> 
>@@ -1783,12 +1743,12 @@
>  * Returns page locked and with an extra reference count.
>  */
> STATIC struct page *
>-probe_page(struct inode *inode, unsigned long index)
>+probe_page(struct inode *inode, unsigned long index, int check)
> {
>        struct page *page;
> 
>        page = __find_lock_page_nowait(inode->i_mapping, index,
>-                               page_hash(inode->i_mapping, index));
>+                               page_hash(inode->i_mapping, index), check);
>        if (!page)
>                return NULL;
>        if (!test_and_clear_bit(PG_delalloc, &(page)->flags)) {
>@@ -1820,26 +1780,33 @@
> kio_cluster_write(struct inode *inode,
>              struct page  *startpage,
>              page_buf_bmap_t *mp,
>-             struct page **cpages)
>+             struct page **cpages,
>+             int np,
>+             int check)
> {
>        unsigned long   tindex, tlast;
>        struct page     **pcp, **pcstart;
>        loff_t          cstart_offset;
>        page_buf_t      *pb;
>        size_t          csize;
>-       int             count = pb_params.p_un.max_cluster;
>+       int             m, count = pb_params.p_un.max_cluster;
> 
>-       pcp = &cpages[MAX_CLUSTER]; /* start from the middle */
>        dprintk(cluster_debug,
>                ("cluster_write: inode 0x%p page 0x%p index 0x%lx\n",
>                        inode, startpage, startpage->index));
>+
>+       if (np && count > np) /* obey limit if supplied */
>+               count = np;
>+       m = count >> 1;         /* start from middle */ 
>+       pcp = &cpages[m];
>        *pcp-- = startpage;
>+       count--;
>        if (startpage->index != 0) {
>                tlast = mp->pbm_offset >> PAGE_CACHE_SHIFT;
>                for (tindex = startpage->index-1; tindex >= tlast &&
>                      pcp >= &cpages[0] && count; tindex--, pcp--, count--)
>                {
>-                       if (!(*pcp = probe_page(inode, tindex)))
>+                       if (!(*pcp = probe_page(inode, tindex, check)))
>                                break;
>                        dprintk(cluster_debug,
>                            ("cluster_write(L): inode 0x%p page 0x%p idx 0x%lx\n",
>@@ -1849,11 +1816,11 @@
>        pcstart = pcp+1;
>        tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
>                                                        PAGE_CACHE_SHIFT;
>-       for (tindex = startpage->index + 1, pcp = &cpages[MAX_CLUSTER+1];
>-               tindex < tlast && pcp < &cpages[CLUSTER_PAGE_LIST_SIZE] && count;
>+       for (tindex = startpage->index + 1, pcp = &cpages[m+1];
>+               tindex < tlast && pcp < &cpages[2*m] && count;
>                tindex++, pcp++, count--)
>        {
>-               if (!(*pcp = probe_page(inode, tindex)))
>+               if (!(*pcp = probe_page(inode, tindex, check)))
>                        break;
>                dprintk(cluster_debug,
>                        ("cluster_write(R): inode 0x%p page 0x%p index 0x%lx\n",
>@@ -1920,7 +1887,8 @@
> STATIC void
> cluster_write(struct inode *inode,
>              unsigned long index,
>-             page_buf_bmap_t *mp)
>+             page_buf_bmap_t *mp,
>+             int check)
> {
>        unsigned long tindex;
>        unsigned long tlast;
>@@ -1930,7 +1898,7 @@
>        if (index != 0) {
>                tlast = mp->pbm_offset >> PAGE_CACHE_SHIFT;
>                for (tindex = index-1; tindex >= tlast; tindex--) {
>-                       if (!(page = probe_page(inode, tindex)))
>+                       if (!(page = probe_page(inode, tindex, check)))
>                                break;
>                        convert_page(inode, page, mp);
>                }
>@@ -1938,13 +1906,12 @@
>        tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
>                                                        PAGE_CACHE_SHIFT;
>        for (tindex = index + 1; tindex < tlast; tindex++) {
>-               if (!(page = probe_page(inode, tindex)))
>+               if (!(page = probe_page(inode, tindex, check)))
>                        break;
>                convert_page(inode, page, mp);
>        }
> }
> 
>-
> int
> pagebuf_convert_page(struct page *page, int toss, int wait)
> {
>@@ -1972,7 +1939,9 @@
> pagebuf_delalloc_convert(
>        struct page *mm,        /* delalloc page to convert - locked */
>        u_long  flags,          /* flags to pass to bmap call */
>-       struct page **cpages)   /* can we cluster conversion? */ 
>+       struct page **cpages,   /* can we cluster conversion? */ 
>+       int np,                 /* n pages in cpages          */
>+       int check)              /* check flush times          */
> {
>        page_buf_bmap_t maps[PBF_MAX_MAPS];
>        struct inode *inode;
>@@ -1996,7 +1965,7 @@
> 
>        if (error) {
>                if (error == -EAGAIN) {
>-                       pcd_ilock_failed++;
>+                       pb_io_stat.pcd_ilock_failed++;
>                        set_bit(PG_delalloc, &mm->flags);
>                 } else {
>                        printk("PCD: pagebuf_bmap error %d pb_flags 0x%lx\n",
>@@ -2020,13 +1989,13 @@
>        if (cpages) {
>                if (IS_KIOCLUSTER(inode)) {
>                        get_page(mm);
>-                       count = kio_cluster_write(inode, mm, &maps[0], cpages);
>+                       count = kio_cluster_write(inode, mm, &maps[0], cpages, np, check);
>                } else {
>                        hook_buffers_to_page(inode, mm, &maps[0],
>                                                        PAGE_CACHE_SHIFT);
>                        set_buffer_dirty_uptodate(mm->buffers);
>                        UnlockPage(mm);
>-                       cluster_write(inode, mm->index, &maps[0]);
>+                       cluster_write(inode, mm->index, &maps[0], check);
>                        count = 1;
>                }
> 
>@@ -2042,6 +2011,8 @@
> }
> 
> int pcd_debug2 = 0;
>+int sum_min = 0;
>+EXPORT_SYMBOL(sum_min);
> 
> STATIC int
> page_cleaner_daemon(void *data)
>@@ -2049,9 +2020,8 @@
>        mem_map_t *mm = &mem_map[0], *mmlast = &mem_map[max_mapnr];
>        u_long flags;
>        struct buffer_head *bh;
>-       int     pb_min_save = PB_MIN_DIRTY_PAGES;
>        struct page **cpages;
>-       int     looped, sum;
>+       int     looped, tsum, sum;
> 
>        /*  Set up the thread  */
>        exit_files(current);
>@@ -2074,7 +2044,6 @@
>        cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
>                                GFP_KERNEL);
> 
>-       mm = &mem_map[0] - 1;
>        while (1) {
>                /*
>                 * If we actually get into a low-memory situation,
>@@ -2082,10 +2051,11 @@
>                 * up on a more timely basis.
>                 */
> 
>-               pcd_skip_locked = 0;
>-               pcd_ilock_failed = 0;
>+               pb_io_stat.pcd_skip_locked = pb_io_stat.pcd_skip_referenced = 0;
>+               pb_io_stat.pcd_ilock_failed = 0;
>                sum = looped = 0;
>-               while (atomic_read(&pb_delalloc_pages) > PB_MIN_DIRTY_PAGES) {
>+               mm = &mem_map[0] - 1;
>+               while (1) {
>                        if (current->need_resched)
>                                schedule();
> 
>@@ -2101,8 +2071,12 @@
>                        }
>                        if (!test_bit(PG_delalloc, &(mm)->flags))
>                                continue;
>+                       if (mm->age >= PAGE_AGE_START && !looped) {
>+                               pb_io_stat.pcd_skip_referenced++;
>+                               continue;
>+                       }
>                        if (TryLockPage(mm)) {
>-                               pcd_skip_locked++;
>+                               pb_io_stat.pcd_skip_locked++;
>                                continue;
>                        }
>                        if (!test_and_clear_bit(PG_delalloc, &(mm)->flags)) {
>@@ -2129,16 +2103,20 @@
> /* since bmap can block, this should be in a different daemon       */
> /*---------------- DELALLOC CONVERT --------------------------------*/
> 
>-                       sum += pagebuf_delalloc_convert(mm,
>-                               PBF_BMAP_TRY_ILOCK, cpages);
>+                       tsum = pagebuf_delalloc_convert(mm,
>+                               PBF_BMAP_TRY_ILOCK, cpages, 0, 0);
>+
>+                       pb_io_stat.pcd_normal_converted += tsum;
>+                       sum += tsum;
> 
>                        /* Do not let too many pages get locked up
>                         * waiting for the queue to open in here
>                         */
>-                       if (sum > 256) {
>+                       if (tsum > 256) {
>                                run_task_queue(&tq_disk);
>-                               sum = 0;
>                        }
>+                       if (sum > sum_min)
>+                               break;
> 
>                }
>                run_task_queue(&tq_disk);
>@@ -2149,18 +2127,9 @@
>                        wake_up_interruptible(&pcd_waitq);
>                        break;
>                }
>-
>-               /*
>-                * if woken up periodically (nothing else to do)
>-                * convert all the pages, else convert only
>-                * to keep watermarks happy.
>-                */
>-               if (interruptible_sleep_on_timeout(&pcd_waitq,
>-                               pb_params.p_un.cluster_interval) == 0)
>-               {
>-                       PB_MIN_DIRTY_PAGES = 0;
>-               } else
>-                       PB_MIN_DIRTY_PAGES = pb_min_save;
>+               interruptible_sleep_on_timeout(&pcd_waitq,
>+                               pb_params.p_un.cluster_interval);
>+               pb_io_stat.pcd_normal_scan++;
>                pcd_active = 1;
>        }
>        kfree(cpages);
>diff -Naur ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.c ./fs/xfs/linux/xfs_lrw.c
>--- ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.c  Mon Dec  4 13:28:38 2000
>+++ ./fs/xfs/linux/xfs_lrw.c    Fri Dec  1 10:30:10 2000
>@@ -77,7 +77,8 @@
>        char            *buf,
>        size_t          size,
>        loff_t          *offsetp,
>-       int             read)   /* set if read, otherwise this is write */
>+       int             read,   /* set if read, otherwise this is write */
>+       int             *dirty)
> {
>        ssize_t         ret;
>        struct xfs_inode *xip;
>@@ -98,7 +99,7 @@
>                if (!(filp->f_flags & O_INVISIBLE))
>                        xfs_ichgtime(xip, XFS_ICHGTIME_ACC);
>        } else {
>-               ret = pagebuf_generic_file_write(filp, buf, size, offsetp);
>+               ret = pagebuf_generic_file_write(filp, buf, size, offsetp, dirty);
>        }
> out:
>        return(ret);
>@@ -118,6 +119,7 @@
>        vnode_t         *vp;
>        xfs_inode_t     *ip;
> #endif
>+       int dirty = 0;
> 
>        n = XFS_MAX_FILE_OFFSET - *offsetp;
>        if (n <= 0)
>@@ -145,7 +147,8 @@
>        }
> #endif /* CONFIG_XFS_DMAPI */
> 
>-       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 1);
>+       /* dirty doesn't matter */
>+       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 1, &dirty);
>        return(ret);
> }
> 
>@@ -168,7 +171,8 @@
>        xfs_iocore_t    *io,
>        xfs_off_t       offset,
>        xfs_fsize_t     isize,
>-       struct pm       *pmp)
>+       struct pm       *pmp,
>+       int             *dirty)
> {
>        xfs_fileoff_t   last_fsb;
>        xfs_fileoff_t   next_fsb;
>@@ -342,7 +346,7 @@
>                        printk("xfs_zero_last_block: unwritten?\n");
>                }
>        } else {
>-               error = pagebuf_iozero(pb, zero_offset, zero_len);
>+               error = pagebuf_iozero(pb, zero_offset, zero_len, dirty);
>                pagebuf_rele(pb);
>                goto out_lock;
>        }
>@@ -358,7 +362,7 @@
>              ("zlb: pb_iozero pb 0x%p zf 0x%x zl 0x%x\n",
>                pb, zero_offset, zero_len));
> 
>-       if (error = pagebuf_iozero(pb, zero_offset, zero_len)) {
>+       if (error = pagebuf_iozero(pb, zero_offset, zero_len, dirty)) {
>                pagebuf_rele(pb);
>                goto out_lock;
>        }
>@@ -409,7 +413,8 @@
>        xfs_iocore_t    *io,
>        xfs_off_t       offset,
>        xfs_fsize_t     isize,
>-       struct pm       *pmp)
>+       struct pm       *pmp,
>+       int             *dirty)
> {
>        struct inode    *ip = vp->v_inode;
>        xfs_fileoff_t   start_zero_fsb;
>@@ -440,7 +445,7 @@
>         * First handle zeroing the block on which isize resides.
>         * We only zero a part of that block so it is handled specially.
>         */
>-       error = xfs_zero_last_block(ip, io, offset, isize, pmp);
>+       error = xfs_zero_last_block(ip, io, offset, isize, pmp, dirty);
>        if (error) {
>                ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
>                ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
>@@ -555,7 +560,7 @@
>                }
> 
>                if (imap.br_startblock == DELAYSTARTBLOCK) {
>-                       error = pagebuf_iozero(pb, 0, lsize);
>+                       error = pagebuf_iozero(pb, 0, lsize, dirty);
>                        pagebuf_rele(pb);
>                } else {
>                        pb->pb_bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
>@@ -568,7 +573,7 @@
>                                        ("xfs_zero_eof: real time device? use diff inode\n"));
>                        }
> 
>-                       if (error = pagebuf_iozero(pb, 0, lsize)) {
>+                       if (error = pagebuf_iozero(pb, 0, lsize, dirty)) {
>                                pagebuf_rele(pb);
>                                goto out_lock;
>                        }
>@@ -629,6 +634,7 @@
>        int             eventsent = 0;
>        loff_t          savedsize = *offsetp;
> #endif
>+       int             dirty = 0;
> 
>        vp = BHV_TO_VNODE(bdp);
>        xip = XFS_BHVTOI(bdp);
>@@ -704,7 +710,7 @@
>        if (*offsetp > isize && isize) {
>                io->io_writeio_blocks = mp->m_writeio_blocks;
>                ret = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offsetp,
>-                       isize, NULL);
>+                       isize, NULL, &dirty);
>                if (ret) {
>                        xfs_iunlock(xip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL);
>                        return(ret); /* JIMJIM should this be negative? */
>@@ -713,7 +719,7 @@
>        xfs_iunlock(xip, XFS_ILOCK_EXCL);
> 
> retry:
>-       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 0);
>+       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 0, &dirty);
> 
> #ifdef CONFIG_XFS_DMAPI
>        if ((ret == -ENOSPC) &&
>@@ -754,6 +760,8 @@
>                }
>        }
>        xfs_iunlock(xip, XFS_IOLOCK_EXCL);
>+       if (dirty)
>+               balance_dirty(ip->i_dev);
>        return(ret);
> }
> 
>diff -Naur ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.h ./fs/xfs/linux/xfs_lrw.h
>--- ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.h  Tue Nov 28 16:34:23 2000
>+++ ./fs/xfs/linux/xfs_lrw.h    Wed Oct 25 12:37:18 2000
>@@ -48,7 +48,7 @@
> extern int xfs_bdstrat_cb (struct xfs_buf *);
> 
> extern int xfs_zero_eof (vnode_t *, struct xfs_iocore *, xfs_off_t,
>-                               xfs_fsize_t, struct pm *);
>+                               xfs_fsize_t, struct pm *, int *dirty);
> extern ssize_t xfs_read (bhv_desc_t *, struct file *, char *,
>                                size_t, loff_t *);
> extern ssize_t xfs_write (bhv_desc_t *, struct file *, char *,
>diff -Naur ../../xfs-tot/linux/fs/xfs/xfs_inode.c ./fs/xfs/xfs_inode.c
>--- ../../xfs-tot/linux/fs/xfs/xfs_inode.c      Tue Nov 28 16:34:30 2000
>+++ ./fs/xfs/xfs_inode.c        Thu Nov 30 10:29:40 2000
>@@ -1707,7 +1707,7 @@
>        cred_t          *credp)
> {
>        xfs_fsize_t     isize;
>-       int             error;
>+       int             error, dirty;
> 
>        ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
>        ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
>@@ -1720,7 +1720,8 @@
>         * xfs_write_file() beyond the end of the file
>         * and any blocks between the old and new file sizes.
>         */
>-       error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, NULL);
>+       error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
>+                                                       NULL, &dirty);
>        return error;
> }
> 
>diff -Naur ../../xfs-tot/linux/fs/xfs/xfs_rw.c ./fs/xfs/xfs_rw.c
>--- ../../xfs-tot/linux/fs/xfs/xfs_rw.c Tue Nov 28 16:34:31 2000
>+++ ./fs/xfs/xfs_rw.c   Wed Oct 25 12:11:52 2000
>@@ -690,7 +690,7 @@
>        void            *dio)               
> {
>        xfs_dio_t       *diop = (xfs_dio_t *)dio;
>-       int             relock;
>+       int             relock, dirty;
>        __uint64_t      flush_end;
>        xfs_mount_t     *mp;
> 
>@@ -717,7 +717,8 @@
>                XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
>                isize = XFS_SIZE(mp, io);
>                if (offset > isize) {
>-                       xfs_zero_eof(vp, io, offset, isize, diop->xd_pmp);
>+                       xfs_zero_eof(vp, io, offset, isize,
>+                                       diop->xd_pmp, &dirty);
>                }
>                XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
>        }
>diff -Naur ../../xfs-tot/linux/include/linux/page_buf.h ./include/linux/page_buf.h
>--- ../../xfs-tot/linux/include/linux/page_buf.h        Tue Nov 28 16:34:57 2000
>+++ ./include/linux/page_buf.h  Fri Dec  1 16:38:38 2000
>@@ -570,7 +570,8 @@
> extern int pagebuf_iozero(             /* zero contents of buffer      */
>                page_buf_t *,           /* buffer to zero               */
>                off_t,                  /* offset in buffer             */
>-               size_t);                /* size of data to zero         */
>+               size_t,                 /* size of data to zero         */
>+               int *);                 /* generated new dirty data?    */
> 
> extern int pagebuf_mapin(              /* make buffer addressable      */
>                page_buf_t *);          /* buffer to make addressable   */
>@@ -635,7 +636,8 @@
>                struct file *,          /* file to write                */
>                char *,                 /* buffer address               */
>                size_t,                 /* size of buffer               */
>-               loff_t *);              /* file offset to use and update */
>+               loff_t *,               /* file offset to use and update */
>+               int *);                 /* dirty indicator              */
> 
>        /*
>         * pagebuf_generic_file_write writes data from the specified file
>diff -Naur ../../xfs-tot/linux/include/linux/pagemap.h ./include/linux/pagemap.h
>--- ../../xfs-tot/linux/include/linux/pagemap.h Tue Nov 28 16:34:57 2000
>+++ ./include/linux/pagemap.h   Fri Dec  1 16:38:39 2000
>@@ -70,7 +70,7 @@
> extern struct page * __find_lock_page (struct address_space * mapping,
>                                unsigned long index, struct page **hash);
> extern struct page * __find_lock_page_nowait (struct address_space * mapping,
>-                               unsigned long index, struct page **hash);
>+                               unsigned long index, struct page **hash, int);
> extern void lock_page(struct page *page);
> #define find_lock_page(mapping, index) \
>                __find_lock_page(mapping, index, page_hash(mapping, index))
>diff -Naur ../../xfs-tot/linux/include/linux/swap.h ./include/linux/swap.h
>--- ../../xfs-tot/linux/include/linux/swap.h    Tue Nov 28 16:34:59 2000
>+++ ./include/linux/swap.h      Fri Dec  1 16:36:29 2000
>@@ -208,6 +208,9 @@
> #define ZERO_PAGE_BUG \
>        if (page_count(page) == 0) BUG();
> 
>+#define DELALLOC_DEBUG_PAGE \
>+       if (test_bit(PG_delalloc, &(page)->flags)) BUG();
>+
> #define add_page_to_active_list(page) { \
>        DEBUG_ADD_PAGE \
>        ZERO_PAGE_BUG \
>@@ -228,6 +231,7 @@
> #define add_page_to_inactive_clean_list(page) { \
>        DEBUG_ADD_PAGE \
>        ZERO_PAGE_BUG \
>+       DELALLOC_DEBUG_PAGE \
>        SetPageInactiveClean(page); \
>        list_add(&(page)->lru, &page->zone->inactive_clean_list); \
>        page->zone->inactive_clean_pages++; \
>diff -Naur ../../xfs-tot/linux/mm/filemap.c ./mm/filemap.c
>--- ../../xfs-tot/linux/mm/filemap.c    Tue Nov 28 16:35:03 2000
>+++ ./mm/filemap.c      Thu Nov 30 10:29:41 2000
>@@ -252,6 +252,24 @@
>        spin_unlock(&pagecache_lock);
> }
> 
>+static inline struct page * __find_page_nolock_noref(struct address_space *mapping, unsigned long offset, struct page *page)
>+{
>+       goto inside;
>+
>+       for (;;) {
>+               page = page->next_hash;
>+inside:
>+               if (!page)
>+                       goto not_found;
>+               if (page->mapping != mapping)
>+                       continue;
>+               if (page->index == offset)
>+                       break;
>+       }
>+not_found:
>+       return page;
>+}
>+
> static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
> {
>        goto inside;
>@@ -580,17 +598,19 @@
> }
> 
> struct page * __find_lock_page_nowait(struct address_space *mapping,
>-                               unsigned long offset, struct page **hash)
>+                       unsigned long offset, struct page **hash, int check)
> {
>        struct page *page;
> 
>        spin_lock(&pagecache_lock);
>-       page = __find_page_nolock(mapping, offset, *hash);
>+       page = __find_page_nolock_noref(mapping, offset, *hash);
>        if (page)
>                page_cache_get(page);
>        spin_unlock(&pagecache_lock);
> 
>-       if (page && TryLockPage(page)) {
>+       if (page &&
>+               ((check && page->age >= PAGE_AGE_START) || TryLockPage(page)))
>+       {
>                /* don't wait for page */
>                put_page(page);
>                return NULL;
>diff -Naur ../../xfs-tot/linux/mm/swap.c ./mm/swap.c
>--- ../../xfs-tot/linux/mm/swap.c       Tue Nov 28 16:35:03 2000
>+++ ./mm/swap.c Wed Nov  1 14:03:55 2000
>@@ -173,7 +173,8 @@
>         * inactive_clean list it doesn't need to be perfect...
>         */
>        int maxcount = (page->buffers ? 3 : 2);
>-       page->age = 0;
>+       if (page->age)
>+               return;
>        ClearPageReferenced(page);
> 
>        /*
>@@ -181,8 +182,7 @@
>         * (some pages aren't on any list at all)
>         */
>        if (PageActive(page) && page_count(page) <= maxcount &&
>-                       !page_ramdisk(page) && 
>-                       !test_bit(PG_delalloc, &page->flags))
>+                       !page_ramdisk(page)) 
>        {
> 
>                /*
>@@ -194,7 +194,9 @@
>                 * need to be cleared away) and/or the function calling
>                 * us has an extra reference count on the page.
>                 */
>-               if (page->buffers || page_count(page) == 2) {
>+               if (page->buffers || page_count(page) == 2 
>+                       || test_bit(PG_delalloc, &page->flags))
>+               {
>                        del_page_from_active_list(page);
>                        add_page_to_inactive_dirty_list(page);
>                /*
>-------------------------------- patch ends ------------------------------
>
>-- 
>--------------------------------------------------------------------------
>Rajagopal Ananthanarayanan ("ananth")
>Member Technical Staff, SGI.
>--------------------------------------------------------------------------
>
