Thank you RajaGopal,
Will try the patch. By latest you mean as of what date?
My current tree is as of last Friday 12/1.
Cheers,
Joe
>-----Original Message-----
>From: Rajagopal Ananthanarayanan [mailto:ananth@xxxxxxx]
>Sent: Monday, December 04, 2000 4:53 PM
>To: Davida, Joe
>Cc: 'linux-xfs@xxxxxxxxxxx'
>Subject: Re: kioclusters
>
>
>"Davida, Joe" wrote:
>>
>> You say you have seen performance problems with non-sequential
>> localized writes using the kiocluster option on mounts.
>> Is the performance problem bad enough so that it is worse
>> than ext2fs?
>>
>> Joe
>
>
>Performance difference really depends on what is being run.
>We have noticed that for dbench with small number of clients,
>xfs+kiocluster does worse than ext2 ... but for dbench with
>larger number of clients xfs+kiocluster does better than ext2.
>
>I've been working on a patch which in my tests with dbench
>has improved xfs+kiocluster significantly. If you are working
>with a test system, can you please try the following patch
>(in conjunction with the kiocluster option)? The patch is against
>the latest development xfs tree ...
>
>
>----------------- patch begins -------------------------------
>diff -Naur ../../xfs-tot/linux/fs/pagebuf/page_buf.c
>./fs/pagebuf/page_buf.c
>--- ../../xfs-tot/linux/fs/pagebuf/page_buf.c Tue Nov 28
>16:34:17 2000
>+++ ./fs/pagebuf/page_buf.c Thu Nov 30 10:29:39 2000
>@@ -186,7 +186,7 @@
> unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ, HZ/2, 1, 0, 0 };
> unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, HZ*30,
>1024, 4096, 1 };
>
>-pagebuf_param_t pb_params = {{ HZ, 15 * HZ, HZ, 512, 1024, 0 }};
>+pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 2*HZ, 512, 1024, 0 }};
>
> /*
> * Pagebuf statistics variables
>@@ -595,7 +595,11 @@
> kp->maplist[pi] = cp;
> } else {
> cp = kp->maplist[pi];
>- set_bit(PG_referenced, &cp->flags);
>+ /*
>+ * Don't set referenced bit;
>+ * we may consider age_page_up.
>+ //set_bit(PG_referenced, &cp->flags);
>+ */
> while (TryLockPage(cp)) {
> wait_on_page(cp);
> }
>@@ -1345,7 +1349,7 @@
> struct buffer_head *bh;
> off_t blk_offset;
> size_t blk_length;
>- int err=0;
>+ int err=0, need_balance_dirty = 0;
> int force_io = (rw != READ) || (pb->pb_flags & PBF_FORCEIO);
> int concat_ok = ((MAJOR(dev) != LVM_BLK_MAJOR) ||
>(MAJOR(dev) != MD_MAJOR));
>
>@@ -1425,7 +1429,9 @@
>
> if (rw == WRITE ) {
> set_bit(BH_Uptodate, &bh->b_state);
>- set_bit(BH_Dirty, &bh->b_state);
>+ if (!buffer_dirty(bh))
>+ need_balance_dirty = 1;
>+ __mark_buffer_dirty(bh);
> }
> psync->bh[cnt++] = bh;
> atomic_inc(&psync->remain);
>@@ -1444,7 +1450,8 @@
> if (locking)
> UnlockPage(page);
> }
>-
>+ if (need_balance_dirty)
>+ balance_dirty(dev);
> return err;
> error:
> /* If we ever do get here then clean up what we already did */
>diff -Naur ../../xfs-tot/linux/fs/pagebuf/page_buf_io.c
>./fs/pagebuf/page_buf_io.c
>--- ../../xfs-tot/linux/fs/pagebuf/page_buf_io.c Tue
>Nov 28 16:34:17 2000
>+++ ./fs/pagebuf/page_buf_io.c Fri Dec 1 10:31:28 2000
>@@ -93,11 +93,11 @@
> /*
> * Forward declarations.
> */
>-STATIC void __pb_block_commit_write_async(struct inode *,
>+STATIC int __pb_block_commit_write_async(struct inode *,
> struct page *, page_buf_bmap_t *);
> STATIC int __pb_block_prepare_write_async(struct inode *,
>struct page *,
> unsigned, unsigned, int, page_buf_bmap_t *, int);
>-STATIC int pagebuf_delalloc_convert(struct page *, u_long,
>struct page **);
>+STATIC int pagebuf_delalloc_convert(struct page *, u_long,
>struct page **, int, int);
> STATIC void hook_buffers_to_page(struct inode *, struct page *,
> page_buf_bmap_t *, ulong);
>
>@@ -114,6 +114,20 @@
> int MAX_CLUSTER = 512;
> int CLUSTER_PAGE_LIST_SIZE = ((2*512)+1);
>
>+/*
>+ * stats
>+ */
>+struct pb_io_stat_s {
>+ int pcd_normal_scan;
>+ int pcd_normal_converted;
>+
>+ int pcd_skip_locked;
>+ int pcd_skip_referenced;
>+
>+ int pcd_ilock_failed;
>+} pb_io_stat = {0, 0, 0, 0, 0};
>+
>+EXPORT_SYMBOL(pb_io_stat);
>
> /*
> * The minimum size where we will start using pagebuf
>structures instead
>@@ -210,10 +224,12 @@
>
> if (IS_KIOCLUSTER(ip)){
> /*
>- * If kmalloc, no big deal; the lower layers
>won't cluster.
>+ * If kmalloc fails, no big deal; the lower
>layers won't
>+ * cluster. Also, this allocation has to be
>non-sleeping
>+ * since this can be in kswapd's path ...
> */
> cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE *
>sizeof(struct page *),
>- GFP_KERNEL);
>+ GFP_ATOMIC);
> } else {
> cpages = (struct page **)1; /* a boolean */
> }
>@@ -235,7 +251,7 @@
>
> lock_page(page);
> if (test_and_clear_bit(PG_delalloc,
>&page->flags)) {
>- pagebuf_delalloc_convert(page,
>0, cpages);
>+ pagebuf_delalloc_convert(page,
>0, cpages, 0, 0);
> } else {
> UnlockPage(page);
> }
>@@ -511,7 +527,8 @@
> int pagebuf_iozero( /* zero contents of buffer */
> page_buf_t * pb, /* buffer to zero */
> off_t boff, /* offset in buffer
> */
>- size_t bsize) /* size of data to zero */
>+ size_t bsize, /* size of data to zero */
>+ int *dirty)
> {
> loff_t cboff;
> size_t cpoff;
>@@ -541,6 +558,7 @@
> if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
> if (test_and_set_bit(PG_delalloc,
>&pm->flags) == 0) {
> atomic_inc(&pb_delalloc_pages);
>+ (*dirty)++;
> }
> }
> }
>@@ -548,16 +566,6 @@
> pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
> pb->pb_flags &= ~(_PBF_SOME_INVALID_PAGES |
>PBF_PARTIAL | PBF_NONE);
>
>- if (!pcd_active && (pb->pb_bn == PAGE_BUF_DADDR_NULL)) {
>- unsigned int np = atomic_read(&pb_delalloc_pages);
>-
>- if (np > 2 * pb_params.p_un.max_dirty_pages)
>- wake_up_interruptible_sync(&pcd_waitq);
>- else if (np > pb_params.p_un.max_dirty_pages)
>- wake_up_interruptible(&pcd_waitq);
>- }
>-
>-
> return (0);
> }
>
>@@ -1174,62 +1182,6 @@
> page, page->index, bh->b_blocknr));
> }
>
>-
>-void
>-set_buffer_dirty_uptodate(struct buffer_head *bh)
>-{
>- int need_balance_dirty = 0;
>-
>- if (bh->b_blocknr <= 0) {
>- printk("Warning: buffer 0x%p with weird
>blockno (%ld)\n",
>- bh, bh->b_blocknr);
>- }
>- set_bit(BH_Uptodate, &bh->b_state);
>- if (!buffer_dirty(bh)) {
>- bh->b_end_io = end_pb_buffer_io_async;
>- need_balance_dirty = 1;
>- }
>- __mark_buffer_dirty(bh);
>-
>- if (need_balance_dirty)
>- balance_dirty(bh->b_dev);
>-}
>-
>-int pbwcm_debug = 0;
>-
>-int
>-__pb_write_or_convert_bmap(
>- struct inode *inode,
>- struct page *page)
>-{
>- loff_t offset = page->index << PAGE_CACHE_SHIFT;
>- int error, nmaps;
>- page_buf_bmap_t map;
>-
>- error = inode->i_op->pagebuf_bmap(inode, offset,
>PAGE_CACHE_SIZE,
>- &map, 1, &nmaps, PBF_WRITE);
>- if (error == 0 && (map.pbm_flags & PBMF_DELAY)) {
>- error = inode->i_op->pagebuf_bmap(inode, offset,
>- map.pbm_bsize, &map, 1,
>- &nmaps, PBF_WRITE|PBF_FILE_ALLOCATE);
>- if (error) {
>- printk("pbwcm: bmap error %d ro 0x%Lx
>size 0x%x\n",
>- error, offset, map.pbm_bsize);
>- } else {
>- dprintk(pbwcm_debug,
>- ("converted bn:%Ld off:%Ld size:%d
>flags:%d\n",
>- map.pbm_bn, map.pbm_offset,
>- map.pbm_bsize, map.pbm_flags));
>- }
>- }
>- if (!error) {
>- hook_buffers_to_page(inode, page, &map,
>PAGE_CACHE_SHIFT);
>- set_buffer_dirty_uptodate(page->buffers);
>- }
>- return error;
>-}
>-
>-
> STATIC int
> __pb_block_prepare_write_async(struct inode *inode, struct page *page,
> unsigned from, unsigned to, int at_eof,
>@@ -1390,15 +1342,34 @@
> }
>
> int pbcw_debug = 0;
>+
>+int
>+set_buffer_dirty_uptodate(struct buffer_head *bh)
>+{
>+ int need_balance_dirty = 0;
>+
>+ if (bh->b_blocknr <= 0) {
>+ printk("Warning: buffer 0x%p with weird
>blockno (%ld)\n",
>+ bh, bh->b_blocknr);
>+ }
>+ set_bit(BH_Uptodate, &bh->b_state);
>+ if (!buffer_dirty(bh)) {
>+ bh->b_end_io = end_pb_buffer_io_async;
>+ need_balance_dirty = 1;
>+ }
>+ __mark_buffer_dirty(bh);
>+ return (need_balance_dirty);
>+}
>+
> int pbcw_debug2 = 0;
>
>-STATIC void
>+STATIC int
> __pb_block_commit_write_async(struct inode *inode,
> struct page *page,
> page_buf_bmap_t *mp)
> {
> struct buffer_head *bh;
>- unsigned int np;
>+ int dirty = 0;
>
> /*
> * Prepare write took care of reading/zero-out
>@@ -1412,32 +1383,20 @@
> if (test_bit(PG_delalloc, &page->flags)) {
> dprintk(pbcw_debug2, ("mapped buffer
>0x%p page 0x%p is delalloc\n", bh,
>page));
> }
>- set_buffer_dirty_uptodate(page->buffers);
>+ dirty = set_buffer_dirty_uptodate(page->buffers);
> dprintk(pbcw_debug, ("pbcw: refiled valid
>buffer 0x%p\n",
> page->buffers));
> } else if (test_and_set_bit(PG_delalloc, &page->flags) == 0) {
> dprintk(pbcw_debug, ("Marking page 0x%p
>delalloc\n", page));
>- np = atomic_read(&pb_delalloc_pages);
>- if (np > PB_MAX_DIRTY_FACTOR *
>pb_params.p_un.max_dirty_pages) {
>- clear_bit(PG_delalloc, &page->flags);
>- if (__pb_write_or_convert_bmap(inode, page)) {
>- BUG();
>- }
>- } else {
>- atomic_inc(&pb_delalloc_pages);
>- if (!pcd_active) {
>- if (np > 2 *
>pb_params.p_un.max_dirty_pages)
>-
>wake_up_interruptible_sync(&pcd_waitq);
>- else if (np >
>pb_params.p_un.max_dirty_pages)
>-
>wake_up_interruptible(&pcd_waitq);
>- }
>- balance_dirty(inode->i_rdev);
>- }
>+
>+ atomic_inc(&pb_delalloc_pages);
>+ dirty = 1;
> }
>
> /* Advance though extent no matter what */
> if (mp)
> mp->pbm_delta += PAGE_CACHE_SIZE;
>+ return dirty;
> }
>
> int
>@@ -1448,7 +1407,8 @@
> char *user_addr,
> size_t len,
> loff_t *lp,
>- page_buf_bmap_t *mp) /* bmap for page
> */
>+ page_buf_bmap_t *mp, /* bmap for page
> */
>+ int *dirty)
> {
> struct page *page;
> unsigned long done;
>@@ -1507,7 +1467,7 @@
> goto unlock;
> }
>
>- __pb_block_commit_write_async(inode, page, mp);
>+ *dirty += __pb_block_commit_write_async(inode,
>page, mp);
>
> foff += bytes_in_page;
> len -= bytes_in_page;
>@@ -1533,7 +1493,8 @@
> char *buf, /* buffer address */
> size_t len, /* size of buffer */
> loff_t * lp, /* file offset to use and update */
>- int pb_flags) /* flags to pass to bmap calls */
>+ int pb_flags, /* flags to pass to bmap calls */
>+ int *dirty)
> {
> struct inode *inode = filp->f_dentry->d_inode;
> page_buf_bmap_t map;
>@@ -1628,7 +1589,7 @@
> */
> status = __pagebuf_do_delwri(inode,
> rounded_offset, size, buf,
>- len, &foff, &map);
>+ len, &foff, &map, dirty);
> if (status <= 0)
> break;
> written += status;
>@@ -1646,7 +1607,8 @@
> struct file * filp, /* file to write
> */
> char *buf, /* buffer address */
> size_t len, /* size of buffer
> */
>- loff_t * lp) /* file offset to use and update */
>+ loff_t * lp, /* file offset to use and update */
>+ int *dirty)
> {
> struct inode *inode = filp->f_dentry->d_inode;
> unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
>@@ -1711,7 +1673,7 @@
>
> if (!page) {
> status = _pagebuf_file_write(filp,
>- buf, len, &foff, pb_flags);
>+ buf, len, &foff,
>pb_flags, dirty);
> if (status > 0)
> written += status;
>
>@@ -1748,7 +1710,7 @@
> goto unlock;
> }
>
>- __pb_block_commit_write_async(inode, page, &map);
>+ *dirty += __pb_block_commit_write_async(inode,
>page, &map);
>
> len -= bytes;
> buf += bytes;
>@@ -1773,8 +1735,6 @@
> }
>
> int pcd_debug = 0;
>-int pcd_skip_locked = 0;
>-int pcd_ilock_failed = 0;
> static int page_cleaner_daemon_started = 0;
> static int daemon_terminate = 0;
>
>@@ -1783,12 +1743,12 @@
> * Returns page locked and with an extra reference count.
> */
> STATIC struct page *
>-probe_page(struct inode *inode, unsigned long index)
>+probe_page(struct inode *inode, unsigned long index, int check)
> {
> struct page *page;
>
> page = __find_lock_page_nowait(inode->i_mapping, index,
>- page_hash(inode->i_mapping, index));
>+ page_hash(inode->i_mapping,
>index), check);
> if (!page)
> return NULL;
> if (!test_and_clear_bit(PG_delalloc, &(page)->flags)) {
>@@ -1820,26 +1780,33 @@
> kio_cluster_write(struct inode *inode,
> struct page *startpage,
> page_buf_bmap_t *mp,
>- struct page **cpages)
>+ struct page **cpages,
>+ int np,
>+ int check)
> {
> unsigned long tindex, tlast;
> struct page **pcp, **pcstart;
> loff_t cstart_offset;
> page_buf_t *pb;
> size_t csize;
>- int count = pb_params.p_un.max_cluster;
>+ int m, count = pb_params.p_un.max_cluster;
>
>- pcp = &cpages[MAX_CLUSTER]; /* start from the middle */
> dprintk(cluster_debug,
> ("cluster_write: inode 0x%p page 0x%p index 0x%lx\n",
> inode, startpage, startpage->index));
>+
>+ if (np && count > np) /* obey limit if supplied */
>+ count = np;
>+ m = count >> 1; /* start from middle */
>+ pcp = &cpages[m];
> *pcp-- = startpage;
>+ count--;
> if (startpage->index != 0) {
> tlast = mp->pbm_offset >> PAGE_CACHE_SHIFT;
> for (tindex = startpage->index-1; tindex >= tlast &&
> pcp >= &cpages[0] && count; tindex--,
>pcp--, count--)
> {
>- if (!(*pcp = probe_page(inode, tindex)))
>+ if (!(*pcp = probe_page(inode, tindex, check)))
> break;
> dprintk(cluster_debug,
> ("cluster_write(L): inode 0x%p
>page 0x%p idx 0x%lx\n",
>@@ -1849,11 +1816,11 @@
> pcstart = pcp+1;
> tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
>
>PAGE_CACHE_SHIFT;
>- for (tindex = startpage->index + 1, pcp =
>&cpages[MAX_CLUSTER+1];
>- tindex < tlast && pcp <
>&cpages[CLUSTER_PAGE_LIST_SIZE] && count;
>+ for (tindex = startpage->index + 1, pcp = &cpages[m+1];
>+ tindex < tlast && pcp < &cpages[2*m] && count;
> tindex++, pcp++, count--)
> {
>- if (!(*pcp = probe_page(inode, tindex)))
>+ if (!(*pcp = probe_page(inode, tindex, check)))
> break;
> dprintk(cluster_debug,
> ("cluster_write(R): inode 0x%p page
>0x%p index 0x%lx\n",
>@@ -1920,7 +1887,8 @@
> STATIC void
> cluster_write(struct inode *inode,
> unsigned long index,
>- page_buf_bmap_t *mp)
>+ page_buf_bmap_t *mp,
>+ int check)
> {
> unsigned long tindex;
> unsigned long tlast;
>@@ -1930,7 +1898,7 @@
> if (index != 0) {
> tlast = mp->pbm_offset >> PAGE_CACHE_SHIFT;
> for (tindex = index-1; tindex >= tlast; tindex--) {
>- if (!(page = probe_page(inode, tindex)))
>+ if (!(page = probe_page(inode, tindex, check)))
> break;
> convert_page(inode, page, mp);
> }
>@@ -1938,13 +1906,12 @@
> tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
>
>PAGE_CACHE_SHIFT;
> for (tindex = index + 1; tindex < tlast; tindex++) {
>- if (!(page = probe_page(inode, tindex)))
>+ if (!(page = probe_page(inode, tindex, check)))
> break;
> convert_page(inode, page, mp);
> }
> }
>
>-
> int
> pagebuf_convert_page(struct page *page, int toss, int wait)
> {
>@@ -1972,7 +1939,9 @@
> pagebuf_delalloc_convert(
> struct page *mm, /* delalloc page to convert - locked */
> u_long flags, /* flags to pass to bmap call */
>- struct page **cpages) /* can we cluster conversion? */
>+ struct page **cpages, /* can we cluster conversion? */
>+ int np, /* n pages in cpages */
>+ int check) /* check flush times */
> {
> page_buf_bmap_t maps[PBF_MAX_MAPS];
> struct inode *inode;
>@@ -1996,7 +1965,7 @@
>
> if (error) {
> if (error == -EAGAIN) {
>- pcd_ilock_failed++;
>+ pb_io_stat.pcd_ilock_failed++;
> set_bit(PG_delalloc, &mm->flags);
> } else {
> printk("PCD: pagebuf_bmap error %d
>pb_flags 0x%lx\n",
>@@ -2020,13 +1989,13 @@
> if (cpages) {
> if (IS_KIOCLUSTER(inode)) {
> get_page(mm);
>- count = kio_cluster_write(inode, mm,
>&maps[0], cpages);
>+ count = kio_cluster_write(inode, mm,
>&maps[0], cpages, np, check);
> } else {
> hook_buffers_to_page(inode, mm, &maps[0],
>
>PAGE_CACHE_SHIFT);
> set_buffer_dirty_uptodate(mm->buffers);
> UnlockPage(mm);
>- cluster_write(inode, mm->index, &maps[0]);
>+ cluster_write(inode, mm->index,
>&maps[0], check);
> count = 1;
> }
>
>@@ -2042,6 +2011,8 @@
> }
>
> int pcd_debug2 = 0;
>+int sum_min = 0;
>+EXPORT_SYMBOL(sum_min);
>
> STATIC int
> page_cleaner_daemon(void *data)
>@@ -2049,9 +2020,8 @@
> mem_map_t *mm = &mem_map[0], *mmlast = &mem_map[max_mapnr];
> u_long flags;
> struct buffer_head *bh;
>- int pb_min_save = PB_MIN_DIRTY_PAGES;
> struct page **cpages;
>- int looped, sum;
>+ int looped, tsum, sum;
>
> /* Set up the thread */
> exit_files(current);
>@@ -2074,7 +2044,6 @@
> cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE *
>sizeof(struct page *),
> GFP_KERNEL);
>
>- mm = &mem_map[0] - 1;
> while (1) {
> /*
> * If we actually get into a low-memory situation,
>@@ -2082,10 +2051,11 @@
> * up on a more timely basis.
> */
>
>- pcd_skip_locked = 0;
>- pcd_ilock_failed = 0;
>+ pb_io_stat.pcd_skip_locked =
>pb_io_stat.pcd_skip_referenced = 0;
>+ pb_io_stat.pcd_ilock_failed = 0;
> sum = looped = 0;
>- while (atomic_read(&pb_delalloc_pages) >
>PB_MIN_DIRTY_PAGES) {
>+ mm = &mem_map[0] - 1;
>+ while (1) {
> if (current->need_resched)
> schedule();
>
>@@ -2101,8 +2071,12 @@
> }
> if (!test_bit(PG_delalloc, &(mm)->flags))
> continue;
>+ if (mm->age >= PAGE_AGE_START && !looped) {
>+ pb_io_stat.pcd_skip_referenced++;
>+ continue;
>+ }
> if (TryLockPage(mm)) {
>- pcd_skip_locked++;
>+ pb_io_stat.pcd_skip_locked++;
> continue;
> }
> if (!test_and_clear_bit(PG_delalloc,
>&(mm)->flags)) {
>@@ -2129,16 +2103,20 @@
> /* since bmap can block, this should be in a different daemon */
> /*---------------- DELALLOC CONVERT --------------------------------*/
>
>- sum += pagebuf_delalloc_convert(mm,
>- PBF_BMAP_TRY_ILOCK, cpages);
>+ tsum = pagebuf_delalloc_convert(mm,
>+ PBF_BMAP_TRY_ILOCK, cpages, 0, 0);
>+
>+ pb_io_stat.pcd_normal_converted += tsum;
>+ sum += tsum;
>
> /* Do not let too many pages get locked up
> * waiting for the queue to open in here
> */
>- if (sum > 256) {
>+ if (tsum > 256) {
> run_task_queue(&tq_disk);
>- sum = 0;
> }
>+ if (sum > sum_min)
>+ break;
>
> }
> run_task_queue(&tq_disk);
>@@ -2149,18 +2127,9 @@
> wake_up_interruptible(&pcd_waitq);
> break;
> }
>-
>- /*
>- * if woken up periodically (nothing else to do)
>- * convert all the pages, else convert only
>- * to keep watermarks happy.
>- */
>- if (interruptible_sleep_on_timeout(&pcd_waitq,
>- pb_params.p_un.cluster_interval) == 0)
>- {
>- PB_MIN_DIRTY_PAGES = 0;
>- } else
>- PB_MIN_DIRTY_PAGES = pb_min_save;
>+ interruptible_sleep_on_timeout(&pcd_waitq,
>+ pb_params.p_un.cluster_interval);
>+ pb_io_stat.pcd_normal_scan++;
> pcd_active = 1;
> }
> kfree(cpages);
>diff -Naur ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.c
>./fs/xfs/linux/xfs_lrw.c
>--- ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.c Mon Dec 4
>13:28:38 2000
>+++ ./fs/xfs/linux/xfs_lrw.c Fri Dec 1 10:30:10 2000
>@@ -77,7 +77,8 @@
> char *buf,
> size_t size,
> loff_t *offsetp,
>- int read) /* set if read, otherwise this
>is write */
>+ int read, /* set if read, otherwise this
>is write */
>+ int *dirty)
> {
> ssize_t ret;
> struct xfs_inode *xip;
>@@ -98,7 +99,7 @@
> if (!(filp->f_flags & O_INVISIBLE))
> xfs_ichgtime(xip, XFS_ICHGTIME_ACC);
> } else {
>- ret = pagebuf_generic_file_write(filp, buf,
>size, offsetp);
>+ ret = pagebuf_generic_file_write(filp, buf,
>size, offsetp, dirty);
> }
> out:
> return(ret);
>@@ -118,6 +119,7 @@
> vnode_t *vp;
> xfs_inode_t *ip;
> #endif
>+ int dirty = 0;
>
> n = XFS_MAX_FILE_OFFSET - *offsetp;
> if (n <= 0)
>@@ -145,7 +147,8 @@
> }
> #endif /* CONFIG_XFS_DMAPI */
>
>- ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 1);
>+ /* dirty doesn't matter */
>+ ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 1, &dirty);
> return(ret);
> }
>
>@@ -168,7 +171,8 @@
> xfs_iocore_t *io,
> xfs_off_t offset,
> xfs_fsize_t isize,
>- struct pm *pmp)
>+ struct pm *pmp,
>+ int *dirty)
> {
> xfs_fileoff_t last_fsb;
> xfs_fileoff_t next_fsb;
>@@ -342,7 +346,7 @@
> printk("xfs_zero_last_block: unwritten?\n");
> }
> } else {
>- error = pagebuf_iozero(pb, zero_offset, zero_len);
>+ error = pagebuf_iozero(pb, zero_offset,
>zero_len, dirty);
> pagebuf_rele(pb);
> goto out_lock;
> }
>@@ -358,7 +362,7 @@
> ("zlb: pb_iozero pb 0x%p zf 0x%x zl 0x%x\n",
> pb, zero_offset, zero_len));
>
>- if (error = pagebuf_iozero(pb, zero_offset, zero_len)) {
>+ if (error = pagebuf_iozero(pb, zero_offset, zero_len, dirty)) {
> pagebuf_rele(pb);
> goto out_lock;
> }
>@@ -409,7 +413,8 @@
> xfs_iocore_t *io,
> xfs_off_t offset,
> xfs_fsize_t isize,
>- struct pm *pmp)
>+ struct pm *pmp,
>+ int *dirty)
> {
> struct inode *ip = vp->v_inode;
> xfs_fileoff_t start_zero_fsb;
>@@ -440,7 +445,7 @@
> * First handle zeroing the block on which isize resides.
> * We only zero a part of that block so it is handled
>specially.
> */
>- error = xfs_zero_last_block(ip, io, offset, isize, pmp);
>+ error = xfs_zero_last_block(ip, io, offset, isize, pmp, dirty);
> if (error) {
> ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
> ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
>@@ -555,7 +560,7 @@
> }
>
> if (imap.br_startblock == DELAYSTARTBLOCK) {
>- error = pagebuf_iozero(pb, 0, lsize);
>+ error = pagebuf_iozero(pb, 0, lsize, dirty);
> pagebuf_rele(pb);
> } else {
> pb->pb_bn = XFS_FSB_TO_DB_IO(io,
>imap.br_startblock);
>@@ -568,7 +573,7 @@
> ("xfs_zero_eof: real
>time device? use diff inode\n"));
> }
>
>- if (error = pagebuf_iozero(pb, 0, lsize)) {
>+ if (error = pagebuf_iozero(pb, 0,
>lsize, dirty)) {
> pagebuf_rele(pb);
> goto out_lock;
> }
>@@ -629,6 +634,7 @@
> int eventsent = 0;
> loff_t savedsize = *offsetp;
> #endif
>+ int dirty = 0;
>
> vp = BHV_TO_VNODE(bdp);
> xip = XFS_BHVTOI(bdp);
>@@ -704,7 +710,7 @@
> if (*offsetp > isize && isize) {
> io->io_writeio_blocks = mp->m_writeio_blocks;
> ret = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offsetp,
>- isize, NULL);
>+ isize, NULL, &dirty);
> if (ret) {
> xfs_iunlock(xip,
>XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL);
> return(ret); /* JIMJIM should this be
>negative? */
>@@ -713,7 +719,7 @@
> xfs_iunlock(xip, XFS_ILOCK_EXCL);
>
> retry:
>- ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 0);
>+ ret = xfs_rdwr(bdp, filp, buf, size, offsetp, 0, &dirty);
>
> #ifdef CONFIG_XFS_DMAPI
> if ((ret == -ENOSPC) &&
>@@ -754,6 +760,8 @@
> }
> }
> xfs_iunlock(xip, XFS_IOLOCK_EXCL);
>+ if (dirty)
>+ balance_dirty(ip->i_dev);
> return(ret);
> }
>
>diff -Naur ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.h
>./fs/xfs/linux/xfs_lrw.h
>--- ../../xfs-tot/linux/fs/xfs/linux/xfs_lrw.h Tue Nov 28
>16:34:23 2000
>+++ ./fs/xfs/linux/xfs_lrw.h Wed Oct 25 12:37:18 2000
>@@ -48,7 +48,7 @@
> extern int xfs_bdstrat_cb (struct xfs_buf *);
>
> extern int xfs_zero_eof (vnode_t *, struct xfs_iocore *, xfs_off_t,
>- xfs_fsize_t, struct pm *);
>+ xfs_fsize_t, struct pm *, int *dirty);
> extern ssize_t xfs_read (bhv_desc_t *, struct file *, char *,
> size_t, loff_t *);
> extern ssize_t xfs_write (bhv_desc_t *, struct file *, char *,
>diff -Naur ../../xfs-tot/linux/fs/xfs/xfs_inode.c ./fs/xfs/xfs_inode.c
>--- ../../xfs-tot/linux/fs/xfs/xfs_inode.c Tue Nov 28
>16:34:30 2000
>+++ ./fs/xfs/xfs_inode.c Thu Nov 30 10:29:40 2000
>@@ -1707,7 +1707,7 @@
> cred_t *credp)
> {
> xfs_fsize_t isize;
>- int error;
>+ int error, dirty;
>
> ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
> ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
>@@ -1720,7 +1720,8 @@
> * xfs_write_file() beyond the end of the file
> * and any blocks between the old and new file sizes.
> */
>- error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore,
>new_size, isize, NULL);
>+ error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore,
>new_size, isize,
>+ NULL, &dirty);
> return error;
> }
>
>diff -Naur ../../xfs-tot/linux/fs/xfs/xfs_rw.c ./fs/xfs/xfs_rw.c
>--- ../../xfs-tot/linux/fs/xfs/xfs_rw.c Tue Nov 28 16:34:31 2000
>+++ ./fs/xfs/xfs_rw.c Wed Oct 25 12:11:52 2000
>@@ -690,7 +690,7 @@
> void *dio)
> {
> xfs_dio_t *diop = (xfs_dio_t *)dio;
>- int relock;
>+ int relock, dirty;
> __uint64_t flush_end;
> xfs_mount_t *mp;
>
>@@ -717,7 +717,8 @@
> XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
> isize = XFS_SIZE(mp, io);
> if (offset > isize) {
>- xfs_zero_eof(vp, io, offset, isize,
>diop->xd_pmp);
>+ xfs_zero_eof(vp, io, offset, isize,
>+ diop->xd_pmp, &dirty);
> }
> XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
> }
>diff -Naur ../../xfs-tot/linux/include/linux/page_buf.h
>./include/linux/page_buf.h
>--- ../../xfs-tot/linux/include/linux/page_buf.h Tue
>Nov 28 16:34:57 2000
>+++ ./include/linux/page_buf.h Fri Dec 1 16:38:38 2000
>@@ -570,7 +570,8 @@
> extern int pagebuf_iozero( /* zero contents of
>buffer */
> page_buf_t *, /* buffer to zero
> */
> off_t, /* offset in buffer
> */
>- size_t); /* size of data to
>zero */
>+ size_t, /* size of data to
>zero */
>+ int *); /* generated new dirty
>data? */
>
> extern int pagebuf_mapin( /* make buffer
>addressable */
> page_buf_t *); /* buffer to make
>addressable */
>@@ -635,7 +636,8 @@
> struct file *, /* file to write
> */
> char *, /* buffer address
> */
> size_t, /* size of buffer
> */
>- loff_t *); /* file offset to use
>and update */
>+ loff_t *, /* file offset to use
>and update */
>+ int *); /* dirty indicator
> */
>
> /*
> * pagebuf_generic_file_write writes data from the
>specified file
>diff -Naur ../../xfs-tot/linux/include/linux/pagemap.h
>./include/linux/pagemap.h
>--- ../../xfs-tot/linux/include/linux/pagemap.h Tue Nov 28
>16:34:57 2000
>+++ ./include/linux/pagemap.h Fri Dec 1 16:38:39 2000
>@@ -70,7 +70,7 @@
> extern struct page * __find_lock_page (struct address_space * mapping,
> unsigned long index, struct
>page **hash);
> extern struct page * __find_lock_page_nowait (struct
>address_space * mapping,
>- unsigned long index, struct
>page **hash);
>+ unsigned long index, struct
>page **hash, int);
> extern void lock_page(struct page *page);
> #define find_lock_page(mapping, index) \
> __find_lock_page(mapping, index,
>page_hash(mapping, index))
>diff -Naur ../../xfs-tot/linux/include/linux/swap.h
>./include/linux/swap.h
>--- ../../xfs-tot/linux/include/linux/swap.h Tue Nov 28
>16:34:59 2000
>+++ ./include/linux/swap.h Fri Dec 1 16:36:29 2000
>@@ -208,6 +208,9 @@
> #define ZERO_PAGE_BUG \
> if (page_count(page) == 0) BUG();
>
>+#define DELALLOC_DEBUG_PAGE \
>+ if (test_bit(PG_delalloc, &(page)->flags)) BUG();
>+
> #define add_page_to_active_list(page) { \
> DEBUG_ADD_PAGE \
> ZERO_PAGE_BUG \
>@@ -228,6 +231,7 @@
> #define add_page_to_inactive_clean_list(page) { \
> DEBUG_ADD_PAGE \
> ZERO_PAGE_BUG \
>+ DELALLOC_DEBUG_PAGE \
> SetPageInactiveClean(page); \
> list_add(&(page)->lru, &page->zone->inactive_clean_list); \
> page->zone->inactive_clean_pages++; \
>diff -Naur ../../xfs-tot/linux/mm/filemap.c ./mm/filemap.c
>--- ../../xfs-tot/linux/mm/filemap.c Tue Nov 28 16:35:03 2000
>+++ ./mm/filemap.c Thu Nov 30 10:29:41 2000
>@@ -252,6 +252,24 @@
> spin_unlock(&pagecache_lock);
> }
>
>+static inline struct page * __find_page_nolock_noref(struct
>address_space *mapping, unsigned
>long offset, struct page *page)
>+{
>+ goto inside;
>+
>+ for (;;) {
>+ page = page->next_hash;
>+inside:
>+ if (!page)
>+ goto not_found;
>+ if (page->mapping != mapping)
>+ continue;
>+ if (page->index == offset)
>+ break;
>+ }
>+not_found:
>+ return page;
>+}
>+
> static inline struct page * __find_page_nolock(struct
>address_space *mapping, unsigned long
>offset, struct page *page)
> {
> goto inside;
>@@ -580,17 +598,19 @@
> }
>
> struct page * __find_lock_page_nowait(struct address_space *mapping,
>- unsigned long offset, struct
>page **hash)
>+ unsigned long offset, struct page
>**hash, int check)
> {
> struct page *page;
>
> spin_lock(&pagecache_lock);
>- page = __find_page_nolock(mapping, offset, *hash);
>+ page = __find_page_nolock_noref(mapping, offset, *hash);
> if (page)
> page_cache_get(page);
> spin_unlock(&pagecache_lock);
>
>- if (page && TryLockPage(page)) {
>+ if (page &&
>+ ((check && page->age >= PAGE_AGE_START) ||
>TryLockPage(page)))
>+ {
> /* don't wait for page */
> put_page(page);
> return NULL;
>diff -Naur ../../xfs-tot/linux/mm/swap.c ./mm/swap.c
>--- ../../xfs-tot/linux/mm/swap.c Tue Nov 28 16:35:03 2000
>+++ ./mm/swap.c Wed Nov 1 14:03:55 2000
>@@ -173,7 +173,8 @@
> * inactive_clean list it doesn't need to be perfect...
> */
> int maxcount = (page->buffers ? 3 : 2);
>- page->age = 0;
>+ if (page->age)
>+ return;
> ClearPageReferenced(page);
>
> /*
>@@ -181,8 +182,7 @@
> * (some pages aren't on any list at all)
> */
> if (PageActive(page) && page_count(page) <= maxcount &&
>- !page_ramdisk(page) &&
>- !test_bit(PG_delalloc, &page->flags))
>+ !page_ramdisk(page))
> {
>
> /*
>@@ -194,7 +194,9 @@
> * need to be cleared away) and/or the function calling
> * us has an extra reference count on the page.
> */
>- if (page->buffers || page_count(page) == 2) {
>+ if (page->buffers || page_count(page) == 2
>+ || test_bit(PG_delalloc, &page->flags))
>+ {
> del_page_from_active_list(page);
> add_page_to_inactive_dirty_list(page);
> /*
>-------------------------------- patch ends
>------------------------------
>
>--
>---------------------------------------------------------------
>-----------
>Rajagopal Ananthanarayanan ("ananth")
>Member Technical Staff, SGI.
>---------------------------------------------------------------
>-----------
>
|