To: "Davida, Joe" <Joe_Davida@xxxxxxxxxx>
Subject: Re: kioclusters
From: Rajagopal Ananthanarayanan <ananth@xxxxxxx>
Date: Mon, 04 Dec 2000 16:53:24 -0800
Cc: "'linux-xfs@xxxxxxxxxxx'" <linux-xfs@xxxxxxxxxxx>
References: <09D1E9BD9C30D311919200A0C9DD5C2C0253704F@mcaexc01.msj.maxtor.com>
Sender: owner-linux-xfs@xxxxxxxxxxx
"Davida, Joe" wrote:
> 
> You say you have seen performance problems with non-sequential
> localized writes using the kiocluster option on mounts.
> Is the performance problem bad enough so that it is worse
> than ext2fs?
> 
> Joe


The performance difference really depends on what is being run.
We have noticed that for dbench with a small number of clients,
xfs+kiocluster does worse than ext2 ... but for dbench with a
larger number of clients, xfs+kiocluster does better than ext2.
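
For reference, dbench takes the number of clients as its argument, so a
comparison run looks roughly like this (the client counts and the mount
point are only illustrative, not the exact ones from my tests):

    cd /mnt/test        # filesystem under test
    dbench 4            # few clients: xfs+kiocluster has trailed ext2
    dbench 64           # more clients: xfs+kiocluster has come out ahead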

I've been working on a patch which, in my tests with dbench,
has improved xfs+kiocluster significantly. If you are working
with a test system, can you please try the following patch
(in conjunction with the kiocluster option)? The patch is against
the latest development xfs tree ...
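
Roughly, the test sequence would be something like the following -- I'm
assuming the option is passed as "-o kiocluster" and a stock patch(1)
invocation; the patch file name, -p level, device, and mount point are all
placeholders, so adjust them for your setup:

    cd linux                            # top of the XFS development tree
    patch -p1 < kiocluster-perf.patch   # the patch below, saved to a file
    # rebuild and boot the patched kernel, then:
    mount -t xfs -o kiocluster /dev/sda5 /mnt/test
    dbench 64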


----------------- patch begins -------------------------------
diff -Naur ../../xfs-tot/linux/fs/pagebuf/page_buf.c ./fs/pagebuf/page_buf.c
--- ../../xfs-tot/linux/fs/pagebuf/page_buf.c   Tue Nov 28 16:34:17 2000
+++ ./fs/pagebuf/page_buf.c     Thu Nov 30 10:29:39 2000
@@ -186,7 +186,7 @@
 unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ, HZ/2, 1, 0, 0 };
 unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, HZ*30, 1024, 4096, 1 };
 
-pagebuf_param_t pb_params = {{ HZ, 15 * HZ, HZ, 512, 1024, 0 }};
+pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 2*HZ, 512, 1024, 0 }};
 
 /*
  * Pagebuf statistics variables
@@ -595,7 +595,11 @@
                        kp->maplist[pi] = cp;
                } else {
                        cp = kp->maplist[pi];
-                       set_bit(PG_referenced, &cp->flags);
+                       /*
+                        * Don't set referenced bit;
+                        * we may consider age_page_up.
+                       //set_bit(PG_referenced, &cp->flags);
+                        */
                        while (TryLockPage(cp)) {
                                wait_on_page(cp);
                        }
@@ -1345,7 +1349,7 @@
        struct buffer_head *bh;
        off_t blk_offset;
        size_t blk_length;
-       int err=0;
+       int err=0, need_balance_dirty = 0;
        int force_io = (rw != READ) || (pb->pb_flags & PBF_FORCEIO);
        int concat_ok = ((MAJOR(dev) != LVM_BLK_MAJOR) || (MAJOR(dev) != MD_MAJOR));
 
@@ -1425,7 +1429,9 @@
 
                        if (rw == WRITE ) {
                                set_bit(BH_Uptodate, &bh->b_state);
-                               set_bit(BH_Dirty, &bh->b_state);
+                               if (!buffer_dirty(bh))
+                                       need_balance_dirty = 1;
+                               __mark_buffer_dirty(bh);
                        }
                        psync->bh[cnt++] = bh;
                        atomic_inc(&psync->remain);
@@ -1444,7 +1450,8 @@
                if (locking)
                        UnlockPage(page);
        }
-
+       if (need_balance_dirty)
+               balance_dirty(dev);
        return err;
 error:
        /* If we ever do get here then clean up what we already did */
diff -Naur ../../xfs-tot/linux/fs/pagebuf/page_buf_io.c ./fs/pagebuf/page_buf_io.c
--- ../../xfs-tot/linux/fs/pagebuf/page_buf_io.c        Tue Nov 28 16:34:17 2000
+++ ./fs/pagebuf/page_buf_io.c  Fri Dec  1 10:31:28 2000
@@ -93,11 +93,11 @@
 /*
  * Forward declarations.
  */
-STATIC void __pb_block_commit_write_async(struct inode *,
+STATIC int __pb_block_commit_write_async(struct inode *,
                struct page *, page_buf_bmap_t *);
 STATIC int __pb_block_prepare_write_async(struct inode *, struct page *,
                unsigned, unsigned, int, page_buf_bmap_t *, int);
-STATIC int pagebuf_delalloc_convert(struct page *, u_long, struct page **);
+STATIC int pagebuf_delalloc_convert(struct page *, u_long, struct page **, int, int);
 STATIC void hook_buffers_to_page(struct inode *, struct page *,
                page_buf_bmap_t *, ulong);
 
@@ -114,6 +114,20 @@
 int MAX_CLUSTER = 512;
 int CLUSTER_PAGE_LIST_SIZE = ((2*512)+1);
 
+/*
+ * stats
+ */
+struct pb_io_stat_s {
+       int pcd_normal_scan;
+       int pcd_normal_converted;
+
+       int pcd_skip_locked;
+       int pcd_skip_referenced;
+
+       int pcd_ilock_failed;
+} pb_io_stat = {0, 0, 0, 0, 0};
+
+EXPORT_SYMBOL(pb_io_stat);
 
 /*
  * The minimum size where we will start using pagebuf structures instead
@@ -210,10 +224,12 @@
 
        if (IS_KIOCLUSTER(ip)){
                /*
-                * If kmalloc, no big deal; the lower layers won't cluster.
+                * If kmalloc fails, no big deal; the lower layers won't
+                * cluster. Also, this allocation has to be non-sleeping
+                * since this can be in kswapd's path ...
                 */
                cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
-                                          GFP_KERNEL);
+                                          GFP_ATOMIC);
        } else {
                cpages = (struct page **)1; /* a boolean */
        }
@@ -235,7 +251,7 @@
 
                        lock_page(page);
                        if (test_and_clear_bit(PG_delalloc, &page->flags)) {
-                               pagebuf_delalloc_convert(page, 0, cpages);
+                               pagebuf_delalloc_convert(page, 0, cpages, 0, 0);
                        } else {
                                UnlockPage(page);
                        }
@@ -511,7 +527,8 @@
 int pagebuf_iozero(            /* zero contents of buffer      */
     page_buf_t * pb,           /* buffer to zero               */
     off_t boff,                        /* offset in buffer             */
-    size_t bsize)              /* size of data to zero           */
+    size_t bsize,              /* size of data to zero           */
+    int  *dirty)
 {
        loff_t cboff;
        size_t cpoff;
@@ -541,6 +558,7 @@
                if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
                        if (test_and_set_bit(PG_delalloc, &pm->flags) == 0) {
                                atomic_inc(&pb_delalloc_pages);
+                               (*dirty)++;
                        }
                }
        }
@@ -548,16 +566,6 @@
        pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
        pb->pb_flags &= ~(_PBF_SOME_INVALID_PAGES | PBF_PARTIAL | PBF_NONE);
 
-       if (!pcd_active && (pb->pb_bn == PAGE_BUF_DADDR_NULL)) {
-               unsigned int    np = atomic_read(&pb_delalloc_pages);
-
-               if (np > 2 * pb_params.p_un.max_dirty_pages)
-                       wake_up_interruptible_sync(&pcd_waitq);
-               else if (np > pb_params.p_un.max_dirty_pages)
-                       wake_up_interruptible(&pcd_waitq);
-       }
-
-
        return (0);
 }
 
@@ -1174,62 +1182,6 @@
                        page, page->index, bh->b_blocknr));
 }
 
-
-void
-set_buffer_dirty_uptodate(struct buffer_head *bh)
-{
-       int need_balance_dirty = 0;
-
-       if (bh->b_blocknr <= 0) {
-               printk("Warning: buffer 0x%p with weird blockno (%ld)\n",
-                       bh, bh->b_blocknr);
-       }
-       set_bit(BH_Uptodate, &bh->b_state);
-       if (!buffer_dirty(bh)) {
-               bh->b_end_io = end_pb_buffer_io_async;
-               need_balance_dirty = 1;
-       }
-       __mark_buffer_dirty(bh);
-
-       if (need_balance_dirty)
-               balance_dirty(bh->b_dev);
-}
-
-int pbwcm_debug = 0;
-
-int
-__pb_write_or_convert_bmap(
-       struct inode *inode,
-       struct page *page)
-{
-       loff_t  offset = page->index << PAGE_CACHE_SHIFT;
-       int error, nmaps;
-       page_buf_bmap_t map;
-
-       error = inode->i_op->pagebuf_bmap(inode, offset, PAGE_CACHE_SIZE,
-                       &map, 1, &nmaps, PBF_WRITE);
-       if (error == 0 && (map.pbm_flags & PBMF_DELAY)) {
-               error = inode->i_op->pagebuf_bmap(inode, offset,
-                               map.pbm_bsize, &map, 1,
-                               &nmaps, PBF_WRITE|PBF_FILE_ALLOCATE);
-               if (error) {
-                       printk("pbwcm: bmap error %d ro 0x%Lx size 0x%x\n",
-                                  error, offset, map.pbm_bsize);
-               } else {
-                       dprintk(pbwcm_debug,
-                        ("converted bn:%Ld off:%Ld size:%d flags:%d\n",
-                            map.pbm_bn, map.pbm_offset,
-                            map.pbm_bsize, map.pbm_flags));
-               }
-       }
-       if (!error) {
-               hook_buffers_to_page(inode, page, &map, PAGE_CACHE_SHIFT);
-               set_buffer_dirty_uptodate(page->buffers);
-       }
-       return error;
-}
-
-
 STATIC int
 __pb_block_prepare_write_async(struct inode *inode, struct page *page,
                unsigned from, unsigned to, int at_eof,
@@ -1390,15 +1342,34 @@
 }
 
 int pbcw_debug = 0;
+
+int
+set_buffer_dirty_uptodate(struct buffer_head *bh)
+{
+       int need_balance_dirty = 0;
+
+       if (bh->b_blocknr <= 0) {
+               printk("Warning: buffer 0x%p with weird blockno (%ld)\n",
+                       bh, bh->b_blocknr);
+       }
+       set_bit(BH_Uptodate, &bh->b_state);
+       if (!buffer_dirty(bh)) {
+               bh->b_end_io = end_pb_buffer_io_async;
+               need_balance_dirty = 1;
+       }
+       __mark_buffer_dirty(bh);
+       return (need_balance_dirty);
+}
+
 int pbcw_debug2 = 0;
 
-STATIC void
+STATIC int
 __pb_block_commit_write_async(struct inode     *inode,
                                struct page     *page,
                                page_buf_bmap_t *mp)
 {
        struct buffer_head      *bh;
-       unsigned int            np;
+       int                     dirty = 0;
 
        /*
         * Prepare write took care of reading/zero-out
@@ -1412,32 +1383,20 @@
                if (test_bit(PG_delalloc, &page->flags)) {
                        dprintk(pbcw_debug2, ("mapped buffer 0x%p page 0x%p is delalloc\n", bh, page));
                }
-               set_buffer_dirty_uptodate(page->buffers);
+               dirty = set_buffer_dirty_uptodate(page->buffers);
                dprintk(pbcw_debug, ("pbcw: refiled valid buffer 0x%p\n",
                        page->buffers));
        } else if (test_and_set_bit(PG_delalloc, &page->flags) == 0) {
                dprintk(pbcw_debug, ("Marking page 0x%p delalloc\n", page));
-               np = atomic_read(&pb_delalloc_pages);
-               if (np > PB_MAX_DIRTY_FACTOR * pb_params.p_un.max_dirty_pages) {
-                       clear_bit(PG_delalloc, &page->flags);
-                       if (__pb_write_or_convert_bmap(inode, page)) {
-                               BUG();
-                       }
-               } else {
-                       atomic_inc(&pb_delalloc_pages);
-                       if (!pcd_active) {
-                               if (np > 2 * pb_params.p_un.max_dirty_pages)
-                                       wake_up_interruptible_sync(&pcd_waitq);
-                               else if (np > pb_params.p_un.max_dirty_pages)
-                                       wake_up_interruptible(&pcd_waitq);
-                       }
-                       balance_dirty(inode->i_rdev);
-               }
+
+               atomic_inc(&pb_delalloc_pages);
+               dirty = 1;
        }
 
        /* Advance though extent no matter what */
        if (mp)
                mp->pbm_delta += PAGE_CACHE_SIZE;
+       return dirty;
 }
 
 int
@@ -1448,7 +1407,8 @@
        char            *user_addr,
        size_t          len,
        loff_t          *lp,
-       page_buf_bmap_t *mp)            /* bmap for page                     */
+       page_buf_bmap_t *mp,            /* bmap for page                     */
+       int             *dirty)
 {
        struct page *page;
        unsigned long done;
@@ -1507,7 +1467,7 @@
                        goto unlock;
                }
 
-               __pb_block_commit_write_async(inode, page, mp);
+               *dirty += __pb_block_commit_write_async(inode, page, mp);
 
                foff += bytes_in_page;
                len -= bytes_in_page;
@@ -1533,7 +1493,8 @@
        char *buf,              /* buffer address               */
        size_t len,             /* size of buffer               */
        loff_t * lp,            /* file offset to use and update */
-       int pb_flags)           /* flags to pass to bmap calls  */
+       int pb_flags,           /* flags to pass to bmap calls  */
+       int *dirty)
 {
        struct inode *inode = filp->f_dentry->d_inode;
        page_buf_bmap_t map;
@@ -1628,7 +1589,7 @@
                 */
                status = __pagebuf_do_delwri(inode,
                                rounded_offset, size, buf,
-                               len, &foff, &map);
+                               len, &foff, &map, dirty);
                if (status <= 0)
                        break;
                written += status;
@@ -1646,7 +1607,8 @@
     struct file * filp,                /* file to write                */
     char *buf,                 /* buffer address               */
     size_t len,                        /* size of buffer               */
-    loff_t * lp)               /* file offset to use and update */
+    loff_t * lp,               /* file offset to use and update */
+    int        *dirty)
 {
        struct inode *inode = filp->f_dentry->d_inode;
        unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
@@ -1711,7 +1673,7 @@
 
                if (!page) {
                        status = _pagebuf_file_write(filp,
-                                       buf, len, &foff, pb_flags);
+                                       buf, len, &foff, pb_flags, dirty);
                        if (status > 0)
                                written += status;
 
@@ -1748,7 +1710,7 @@
                        goto unlock;
                }
 
-               __pb_block_commit_write_async(inode, page, &map);
+               *dirty += __pb_block_commit_write_async(inode, page, &map);
 
                len -= bytes;
                buf += bytes;
@@ -1773,8 +1735,6 @@
 }
 
 int pcd_debug = 0;
-int pcd_skip_locked = 0;
-int pcd_ilock_failed = 0;
 static int page_cleaner_daemon_started = 0;
 static int daemon_terminate = 0;
 
@@ -1783,12 +1743,12 @@
  * Returns page locked and with an extra reference count.
  */
 STATIC struct page *
-probe_page(struct inode *inode, unsigned long index)
+probe_page(struct inode *inode, unsigned long index, int check)
 {
        struct page *page;
 
        page = __find_lock_page_nowait(inode->i_mapping, index,
-                               page_hash(inode->i_mapping, index));
+                               page_hash(inode->i_mapping, index), check);
        if (!page)
                return NULL;
        if (!test_and_clear_bit(PG_delalloc, &(page)->flags)) {
@@ -1820,26 +1780,33 @@
 kio_cluster_write(struct inode *inode,
              struct page  *startpage,
              page_buf_bmap_t *mp,
-             struct page **cpages)
+             struct page **cpages,
+             int np,
+             int check)
 {
        unsigned long   tindex, tlast;
        struct page     **pcp, **pcstart;
        loff_t          cstart_offset;
        page_buf_t      *pb;
        size_t          csize;
-       int             count = pb_params.p_un.max_cluster;
+       int             m, count = pb_params.p_un.max_cluster;
 
-       pcp = &cpages[MAX_CLUSTER]; /* start from the middle */
        dprintk(cluster_debug,
                ("cluster_write: inode 0x%p page 0x%p index 0x%lx\n",
                        inode, startpage, startpage->index));
+
+       if (np && count > np) /* obey limit if supplied */
+               count = np;
+       m = count >> 1;         /* start from middle */ 
+       pcp = &cpages[m];
        *pcp-- = startpage;
+       count--;
        if (startpage->index != 0) {
                tlast = mp->pbm_offset >> PAGE_CACHE_SHIFT;
                for (tindex = startpage->index-1; tindex >= tlast &&
                      pcp >= &cpages[0] && count; tindex--, pcp--, count--)
                {
-                       if (!(*pcp = probe_page(inode, tindex)))
+                       if (!(*pcp = probe_page(inode, tindex, check)))
                                break;
                        dprintk(cluster_debug,
                            ("cluster_write(L): inode 0x%p page 0x%p idx 
0x%lx\n",
@@ -1849,11 +1816,11 @@
        pcstart = pcp+1;
        tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
                                                        PAGE_CACHE_SHIFT;
-       for (tindex = startpage->index + 1, pcp = &cpages[MAX_CLUSTER+1];
-               tindex < tlast && pcp < &cpages[CLUSTER_PAGE_LIST_SIZE] && count;
+       for (tindex = startpage->index + 1, pcp = &cpages[m+1];
+               tindex < tlast && pcp < &cpages[2*m] && count;
                tindex++, pcp++, count--)
        {
-               if (!(*pcp = probe_page(inode, tindex)))
+               if (!(*pcp = probe_page(inode, tindex, check)))
                        break;
                dprintk(cluster_debug,
                        ("cluster_write(R): inode 0x%p page 0x%p index 0x%lx\n",
@@ -1920,7 +1887,8 @@
 STATIC void
 cluster_write(struct inode *inode,
              unsigned long index,
-             page_buf_bmap_t *mp)
+             page_buf_bmap_t *mp,
+             int check)
 {
        unsigned long tindex;
        unsigned long tlast;
@@ -1930,7 +1898,7 @@
        if (index != 0) {
                tlast = mp->pbm_offset >> PAGE_CACHE_SHIFT;
                for (tindex = index-1; tindex >= tlast; tindex--) {
-                       if (!(page = probe_page(inode, tindex)))
+                       if (!(page = probe_page(inode, tindex, check)))
                                break;
                        convert_page(inode, page, mp);
                }
@@ -1938,13 +1906,12 @@
        tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
                                                        PAGE_CACHE_SHIFT;
        for (tindex = index + 1; tindex < tlast; tindex++) {
-               if (!(page = probe_page(inode, tindex)))
+               if (!(page = probe_page(inode, tindex, check)))
                        break;
                convert_page(inode, page, mp);
        }
 }
 
-
 int
 pagebuf_convert_page(struct page *page, int toss, int wait)
 {
@@ -1972,7 +1939,9 @@
 pagebuf_delalloc_convert(
        struct page *mm,        /* delalloc page to convert - locked */
        u_long  flags,          /* flags to pass to bmap call */
-       struct page **cpages)   /* can we cluster conversion? */ 
+       struct page **cpages,   /* can we cluster conversion? */ 
+       int np,                 /* n pages in cpages          */
+       int check)              /* check flush times          */
 {
        page_buf_bmap_t maps[PBF_MAX_MAPS];
        struct inode *inode;
@@ -1996,7 +1965,7 @@
 
        if (error) {
                if (error == -EAGAIN) {
-                       pcd_ilock_failed++;
+                       pb_io_stat.pcd_ilock_failed++;
                        set_bit(PG_delalloc, &mm->flags);
                 } else {
                        printk("PCD: pagebuf_bmap error %d pb_flags 0x%lx\n",
@@ -2020,13 +1989,13 @@
        if (cpages) {
                if (IS_KIOCLUSTER(inode)) {
                        get_page(mm);
-                       count = kio_cluster_write(inode, mm, &maps[0], cpages);
+                       count = kio_cluster_write(inode, mm, &maps[0], cpages, np, check);
                } else {
                        hook_buffers_to_page(inode, mm, &maps[0],
                                                        PAGE_CACHE_SHIFT);
                        set_buffer_dirty_uptodate(mm->buffers);
                        UnlockPage(mm);
-                       cluster_write(inode, mm->index, &maps[0]);
+                       cluster_write(inode, mm->index, &maps[0], check);
                        count = 1;
                }
 
@@ -2042,6 +2011,8 @@
 }
 
 int pcd_debug2 = 0;
+int sum_min = 0;
+EXPORT_SYMBOL(sum_min);
 
 STATIC int
 page_cleaner_daemon(void *data)
@@ -2049,9 +2020,8 @@
        mem_map_t *mm = &mem_map[0], *mmlast = &mem_map[max_mapnr];
        u_long flags;
        struct buffer_head *bh;
-       int     pb_min_save = PB_MIN_DIRTY_PAGES;
        struct page **cpages;
-       int     looped, sum;
+       int     looped, tsum, sum;
 
        /*  Set up the thread  */
        exit_files(current);
@@ -2074,7 +2044,6 @@
        cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
                                GFP_KERNEL);
 
-       mm = &mem_map[0] - 1;
        while (1) {
                /*
                 * If we actually get into a low-memory situation,
@@ -2082,10 +2051,11 @@
                 * up on a more timely basis.
                 */
 
-               pcd_skip_locked = 0;
-               pcd_ilock_failed = 0;
+               pb_io_stat.pcd_skip_locked = pb_io_stat.pcd_skip_referenced = 0;
+               pb_io_stat.pcd_ilock_failed = 0;
                sum = looped = 0;
-               while (atomic_read(&pb_delalloc_pages) > PB_MIN_DIRTY_PAGES) {
+               mm = &mem_map[0] - 1;
+               while (1) {
                        if (current->need_resched)
                                schedule();
 
@@ -2101,8 +2071,12 @@
                        }
                        if (!test_bit(PG_delalloc, &(mm)->flags))
                                continue;
+                       if (mm->age >= PAGE_AGE_START && !looped) {
+                               pb_io_stat.pcd_skip_referenced++;
+                               continue;
+                       }
                        if (TryLockPage(mm)) {
-                               pcd_skip_locked++;
+                               pb_io_stat.pcd_skip_locked++;
                                continue;
                        }
                        if (!test_and_clear_bit(PG_delalloc, &(mm)->flags)) {
@@ -2129,16 +2103,20 @@
 /* since bmap can block, this should be in a different daemon       */
 /*---------------- DELALLOC CONVERT --------------------------------*/
 
-                       sum += pagebuf_delalloc_convert(mm,
-                               PBF_BMAP_TRY_ILOCK, cpages);
+                       tsum = pagebuf_delalloc_convert(mm,
+                               PBF_BMAP_TRY_ILOCK, cpages, 0, 0);
+
+                       pb_io_stat.pcd_normal_converted += tsum;
+                       sum += tsum;
 
                        /* Do not let too many pages get locked up
                         * waiting for the queue to open in here
                         */
-                       if (sum > 256) {
+                       if (tsum > 256) {
                                run_task_queue(&tq_disk);
-                               sum = 0;
                        }
+                       if (sum > sum_min)
+                               break;
 
                }
                run_task_queue(&tq_disk);
@@ -2149,18 +2127,9 @@
                        wake_up_interruptible(&pcd_waitq);
                        break;
                }
-
-               /*
-                * if woken up periodically (nothing else to do)
-                * convert all the pages, else convert only
-                * to keep watermarks happy.
-                */
-               if (interruptible_sleep_on_timeout(&pcd_waitq,
-                               pb_params.p_un.cluster_interval) == 0)
-               {
-                       PB_MIN_DIRTY_PAGES = 0;
-               } else
-                       PB_MIN_DIRTY_PAGES = pb_min_save;
+               interruptible_sleep_on_timeout(&pcd_waitq,
+                               pb_params.p_un.cluster_interval);
+               pb_io_stat.pcd_normal_scan++;
                pcd_active = 1;
        }
        kfree(cpages);
diff -Naur ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.c ./fs/xfs/linux/xfs_lrw.c
--- ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.c  Mon Dec  4 13:28:38 2000
+++ ./fs/xfs/linux/xfs_lrw.c    Fri Dec  1 10:30:10 2000
@@ -77,7 +77,8 @@
        char            *buf,
        size_t          size,
        loff_t          *offsetp,
-       int             read)   /* set if read, otherwise this is write */
+       int             read,   /* set if read, otherwise this is write */
+       int             *dirty)
 {
        ssize_t         ret;
        struct xfs_inode *xip;
@@ -98,7 +99,7 @@
                if (!(filp->f_flags & O_INVISIBLE))
                        xfs_ichgtime(xip, XFS_ICHGTIME_ACC);
        } else {
-               ret = pagebuf_generic_file_write(filp, buf, size, offsetp);
+               ret = pagebuf_generic_file_write(filp, buf, size, offsetp, dirty);
        }
 out:
        return(ret);
@@ -118,6 +119,7 @@
        vnode_t         *vp;
        xfs_inode_t     *ip;
 #endif
+       int dirty = 0;
 
        n = XFS_MAX_FILE_OFFSET - *offsetp;
        if (n <= 0)
@@ -145,7 +147,8 @@
        }
 #endif /* CONFIG_XFS_DMAPI */
 
-       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 1);
+       /* dirty doesn't matter */
+       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 1, &dirty);
        return(ret);
 }
 
@@ -168,7 +171,8 @@
        xfs_iocore_t    *io,
        xfs_off_t       offset,
        xfs_fsize_t     isize,
-       struct pm       *pmp)
+       struct pm       *pmp,
+       int             *dirty)
 {
        xfs_fileoff_t   last_fsb;
        xfs_fileoff_t   next_fsb;
@@ -342,7 +346,7 @@
                        printk("xfs_zero_last_block: unwritten?\n");
                }
        } else {
-               error = pagebuf_iozero(pb, zero_offset, zero_len);
+               error = pagebuf_iozero(pb, zero_offset, zero_len, dirty);
                pagebuf_rele(pb);
                goto out_lock;
        }
@@ -358,7 +362,7 @@
              ("zlb: pb_iozero pb 0x%p zf 0x%x zl 0x%x\n",
                pb, zero_offset, zero_len));
 
-       if (error = pagebuf_iozero(pb, zero_offset, zero_len)) {
+       if (error = pagebuf_iozero(pb, zero_offset, zero_len, dirty)) {
                pagebuf_rele(pb);
                goto out_lock;
        }
@@ -409,7 +413,8 @@
        xfs_iocore_t    *io,
        xfs_off_t       offset,
        xfs_fsize_t     isize,
-       struct pm       *pmp)
+       struct pm       *pmp,
+       int             *dirty)
 {
        struct inode    *ip = vp->v_inode;
        xfs_fileoff_t   start_zero_fsb;
@@ -440,7 +445,7 @@
         * First handle zeroing the block on which isize resides.
         * We only zero a part of that block so it is handled specially.
         */
-       error = xfs_zero_last_block(ip, io, offset, isize, pmp);
+       error = xfs_zero_last_block(ip, io, offset, isize, pmp, dirty);
        if (error) {
                ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
                ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
@@ -555,7 +560,7 @@
                }
 
                if (imap.br_startblock == DELAYSTARTBLOCK) {
-                       error = pagebuf_iozero(pb, 0, lsize);
+                       error = pagebuf_iozero(pb, 0, lsize, dirty);
                        pagebuf_rele(pb);
                } else {
                        pb->pb_bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
@@ -568,7 +573,7 @@
                                        ("xfs_zero_eof: real time device? use 
diff inode\n"));
                        }
 
-                       if (error = pagebuf_iozero(pb, 0, lsize)) {
+                       if (error = pagebuf_iozero(pb, 0, lsize, dirty)) {
                                pagebuf_rele(pb);
                                goto out_lock;
                        }
@@ -629,6 +634,7 @@
        int             eventsent = 0;
        loff_t          savedsize = *offsetp;
 #endif
+       int             dirty = 0;
 
        vp = BHV_TO_VNODE(bdp);
        xip = XFS_BHVTOI(bdp);
@@ -704,7 +710,7 @@
        if (*offsetp > isize && isize) {
                io->io_writeio_blocks = mp->m_writeio_blocks;
                ret = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offsetp,
-                       isize, NULL);
+                       isize, NULL, &dirty);
                if (ret) {
                        xfs_iunlock(xip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL);
                        return(ret); /* JIMJIM should this be negative? */
@@ -713,7 +719,7 @@
        xfs_iunlock(xip, XFS_ILOCK_EXCL);
 
 retry:
-       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 0);
+       ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 0, &dirty);
 
 #ifdef CONFIG_XFS_DMAPI
        if ((ret == -ENOSPC) &&
@@ -754,6 +760,8 @@
                }
        }
        xfs_iunlock(xip, XFS_IOLOCK_EXCL);
+       if (dirty)
+               balance_dirty(ip->i_dev);
        return(ret);
 }
 
diff -Naur ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.h ./fs/xfs/linux/xfs_lrw.h
--- ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.h  Tue Nov 28 16:34:23 2000
+++ ./fs/xfs/linux/xfs_lrw.h    Wed Oct 25 12:37:18 2000
@@ -48,7 +48,7 @@
 extern int xfs_bdstrat_cb (struct xfs_buf *);
 
 extern int xfs_zero_eof (vnode_t *, struct xfs_iocore *, xfs_off_t,
-                               xfs_fsize_t, struct pm *);
+                               xfs_fsize_t, struct pm *, int *dirty);
 extern ssize_t xfs_read (bhv_desc_t *, struct file *, char *,
                                size_t, loff_t *);
 extern ssize_t xfs_write (bhv_desc_t *, struct file *, char *,
diff -Naur ../../xfs-tot/linux/fs/xfs/xfs_inode.c ./fs/xfs/xfs_inode.c
--- ../../xfs-tot/linux/fs/xfs/xfs_inode.c      Tue Nov 28 16:34:30 2000
+++ ./fs/xfs/xfs_inode.c        Thu Nov 30 10:29:40 2000
@@ -1707,7 +1707,7 @@
        cred_t          *credp)
 {
        xfs_fsize_t     isize;
-       int             error;
+       int             error, dirty;
 
        ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
        ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
@@ -1720,7 +1720,8 @@
         * xfs_write_file() beyond the end of the file
         * and any blocks between the old and new file sizes.
         */
-       error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, NULL);
+       error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
+                                                       NULL, &dirty);
        return error;
 }
 
diff -Naur ../../xfs-tot/linux/fs/xfs/xfs_rw.c ./fs/xfs/xfs_rw.c
--- ../../xfs-tot/linux/fs/xfs/xfs_rw.c Tue Nov 28 16:34:31 2000
+++ ./fs/xfs/xfs_rw.c   Wed Oct 25 12:11:52 2000
@@ -690,7 +690,7 @@
        void            *dio)               
 {
        xfs_dio_t       *diop = (xfs_dio_t *)dio;
-       int             relock;
+       int             relock, dirty;
        __uint64_t      flush_end;
        xfs_mount_t     *mp;
 
@@ -717,7 +717,8 @@
                XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
                isize = XFS_SIZE(mp, io);
                if (offset > isize) {
-                       xfs_zero_eof(vp, io, offset, isize, diop->xd_pmp);
+                       xfs_zero_eof(vp, io, offset, isize,
+                                       diop->xd_pmp, &dirty);
                }
                XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
        }
diff -Naur ../../xfs-tot/linux/include/linux/page_buf.h ./include/linux/page_buf.h
--- ../../xfs-tot/linux/include/linux/page_buf.h        Tue Nov 28 16:34:57 2000
+++ ./include/linux/page_buf.h  Fri Dec  1 16:38:38 2000
@@ -570,7 +570,8 @@
 extern int pagebuf_iozero(             /* zero contents of buffer      */
                page_buf_t *,           /* buffer to zero               */
                off_t,                  /* offset in buffer             */
-               size_t);                /* size of data to zero         */
+               size_t,                 /* size of data to zero         */
+               int *);                 /* generated new dirty data?    */
 
 extern int pagebuf_mapin(              /* make buffer addressable      */
                page_buf_t *);          /* buffer to make addressable   */
@@ -635,7 +636,8 @@
                struct file *,          /* file to write                */
                char *,                 /* buffer address               */
                size_t,                 /* size of buffer               */
-               loff_t *);              /* file offset to use and update */
+               loff_t *,               /* file offset to use and update */
+               int *);                 /* dirty indicator              */
 
        /*
         * pagebuf_generic_file_write writes data from the specified file
diff -Naur ../../xfs-tot/linux/include/linux/pagemap.h ./include/linux/pagemap.h
--- ../../xfs-tot/linux/include/linux/pagemap.h Tue Nov 28 16:34:57 2000
+++ ./include/linux/pagemap.h   Fri Dec  1 16:38:39 2000
@@ -70,7 +70,7 @@
 extern struct page * __find_lock_page (struct address_space * mapping,
                                unsigned long index, struct page **hash);
 extern struct page * __find_lock_page_nowait (struct address_space * mapping,
-                               unsigned long index, struct page **hash);
+                               unsigned long index, struct page **hash, int);
 extern void lock_page(struct page *page);
 #define find_lock_page(mapping, index) \
                __find_lock_page(mapping, index, page_hash(mapping, index))
diff -Naur ../../xfs-tot/linux/include/linux/swap.h ./include/linux/swap.h
--- ../../xfs-tot/linux/include/linux/swap.h    Tue Nov 28 16:34:59 2000
+++ ./include/linux/swap.h      Fri Dec  1 16:36:29 2000
@@ -208,6 +208,9 @@
 #define ZERO_PAGE_BUG \
        if (page_count(page) == 0) BUG();
 
+#define DELALLOC_DEBUG_PAGE \
+       if (test_bit(PG_delalloc, &(page)->flags)) BUG();
+
 #define add_page_to_active_list(page) { \
        DEBUG_ADD_PAGE \
        ZERO_PAGE_BUG \
@@ -228,6 +231,7 @@
 #define add_page_to_inactive_clean_list(page) { \
        DEBUG_ADD_PAGE \
        ZERO_PAGE_BUG \
+       DELALLOC_DEBUG_PAGE \
        SetPageInactiveClean(page); \
        list_add(&(page)->lru, &page->zone->inactive_clean_list); \
        page->zone->inactive_clean_pages++; \
diff -Naur ../../xfs-tot/linux/mm/filemap.c ./mm/filemap.c
--- ../../xfs-tot/linux/mm/filemap.c    Tue Nov 28 16:35:03 2000
+++ ./mm/filemap.c      Thu Nov 30 10:29:41 2000
@@ -252,6 +252,24 @@
        spin_unlock(&pagecache_lock);
 }
 
+static inline struct page * __find_page_nolock_noref(struct address_space *mapping, unsigned long offset, struct page *page)
+{
+       goto inside;
+
+       for (;;) {
+               page = page->next_hash;
+inside:
+               if (!page)
+                       goto not_found;
+               if (page->mapping != mapping)
+                       continue;
+               if (page->index == offset)
+                       break;
+       }
+not_found:
+       return page;
+}
+
 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 {
        goto inside;
@@ -580,17 +598,19 @@
 }
 
 struct page * __find_lock_page_nowait(struct address_space *mapping,
-                               unsigned long offset, struct page **hash)
+                       unsigned long offset, struct page **hash, int check)
 {
        struct page *page;
 
        spin_lock(&pagecache_lock);
-       page = __find_page_nolock(mapping, offset, *hash);
+       page = __find_page_nolock_noref(mapping, offset, *hash);
        if (page)
                page_cache_get(page);
        spin_unlock(&pagecache_lock);
 
-       if (page && TryLockPage(page)) {
+       if (page &&
+               ((check && page->age >= PAGE_AGE_START) || TryLockPage(page)))
+       {
                /* don't wait for page */
                put_page(page);
                return NULL;
diff -Naur ../../xfs-tot/linux/mm/swap.c ./mm/swap.c
--- ../../xfs-tot/linux/mm/swap.c       Tue Nov 28 16:35:03 2000
+++ ./mm/swap.c Wed Nov  1 14:03:55 2000
@@ -173,7 +173,8 @@
         * inactive_clean list it doesn't need to be perfect...
         */
        int maxcount = (page->buffers ? 3 : 2);
-       page->age = 0;
+       if (page->age)
+               return;
        ClearPageReferenced(page);
 
        /*
@@ -181,8 +182,7 @@
         * (some pages aren't on any list at all)
         */
        if (PageActive(page) && page_count(page) <= maxcount &&
-                       !page_ramdisk(page) && 
-                       !test_bit(PG_delalloc, &page->flags))
+                       !page_ramdisk(page)) 
        {
 
                /*
@@ -194,7 +194,9 @@
                 * need to be cleared away) and/or the function calling
                 * us has an extra reference count on the page.
                 */
-               if (page->buffers || page_count(page) == 2) {
+               if (page->buffers || page_count(page) == 2 
+                       || test_bit(PG_delalloc, &page->flags))
+               {
                        del_page_from_active_list(page);
                        add_page_to_inactive_dirty_list(page);
                /*
-------------------------------- patch ends ------------------------------

-- 
--------------------------------------------------------------------------
Rajagopal Ananthanarayanan ("ananth")
Member Technical Staff, SGI.
--------------------------------------------------------------------------
