Marcelo Tosatti wrote:
>
> On Fri, 23 Feb 2001, Steve Lord wrote:
>
> <snip>
>
> > Other changes coming down the pipeline - probably next week will help too,
> > we have better dbench and bonnie numbers on some internal code right now.
>
> It looks like we're allocating the page to hold the page pointers for the
> cluster unconditionally in __pagebuf_write_full_page (my last message
> talks a bit more about that kind of stuff). This page, as far as I can
> see, is not used at all without kiobuf io. (Thomas is not using kiobuf IO
> IIRC).
Yeah, the allocation of cpages is unnecessary for the non-kiocluster case.
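Concretely, the patch leaves cpages NULL unless kiobuf clustering is
compiled in, so the lower layers simply skip clustering; in sketch form
(full hunks in page_buf_io.c below):

	struct page **cpages = NULL;
#if defined(KIOCLUSTER)
	/*
	 * If kmalloc fails, no big deal; the lower layers won't
	 * cluster.
	 */
	cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
			 GFP_PAGE_IO);
#endif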
The following patch (delay-buffer-6.patch) contains several key
changes & cleanups. The fundamental change is to employ the core
Linux daemons and codepaths for handling delayed allocation, rather
than a separate page cleaner daemon.
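The heart of it is a small dispatch added to fs/buffer.c: dirty
buffers that are still delayed-allocate are pushed through the
filesystem's ->writepage() (which allocates real disk space first),
while ordinary mapped buffers keep going through ll_rw_block().
In sketch form (full hunks below):

	static inline void
	write_buffer(struct buffer_head *bh)
	{
		if (!buffer_delay(bh))
			/* already mapped: the normal block I/O path */
			ll_rw_block(WRITE, 1, &bh);
		else
			/* delalloc: lock the page and convert it via
			 * page->mapping->a_ops->writepage() */
			_write_buffer(bh);
	}

This way sync_buffers() and the flush daemons drive delalloc
conversion directly, and the page cleaner daemon goes away.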
Comments, feedback (stability & performance) appreciated!
--
--------------------------------------------------------------------------
Rajagopal Ananthanarayanan ("ananth")
Member Technical Staff, SGI.
--------------------------------------------------------------------------

diff -Naur ../../xfs-orig/linux/drivers/block/ll_rw_blk.c drivers/block/ll_rw_blk.c
--- ../../xfs-orig/linux/drivers/block/ll_rw_blk.c Thu Feb 22 14:36:01 2001
+++ drivers/block/ll_rw_blk.c Sat Feb 24 12:10:07 2001
@@ -1250,6 +1250,7 @@
if (!nr)
return;
+
major = MAJOR(bhs[0]->b_dev);
/* Determine correct block size for this device. */
@@ -1270,6 +1271,8 @@
correct_size, bh->b_size);
goto sorry;
}
+ if (test_bit(BH_Delay, &bh->b_state) || !buffer_mapped(bh))
+ BUG();
}
if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
diff -Naur ../../xfs-orig/linux/drivers/scsi/scsi_merge.c drivers/scsi/scsi_merge.c
--- ../../xfs-orig/linux/drivers/scsi/scsi_merge.c Thu Feb 22 14:36:21 2001
+++ drivers/scsi/scsi_merge.c Thu Feb 22 14:12:35 2001
@@ -92,7 +92,7 @@
printk("counted segments is %x\n", segments);
printk("Flags %d %d\n", use_clustering, dma_host);
if (req->bh != NULL) {
- for (bh = req->bh; bh->b_reqnext != NULL; bh = bh->b_reqnext) {
+ for (bh = req->bh; bh != NULL; bh = bh->b_reqnext) {
printk("Segment 0x%p, blocks %d, addr 0x%lx\n",
bh,
bh->b_size >> 9,
diff -Naur ../../xfs-orig/linux/fs/buffer.c fs/buffer.c
--- ../../xfs-orig/linux/fs/buffer.c Thu Feb 22 14:36:27 2001
+++ fs/buffer.c Sat Feb 24 12:04:12 2001
@@ -161,6 +161,38 @@
atomic_dec(&bh->b_count);
}
+
+#define buffer_delay_busy(bh) \
+ (test_bit(BH_Delay, &bh->b_state) && bh->b_page && PageLocked(bh->b_page))
+
+void
+_write_buffer(struct buffer_head *bh)
+{
+ struct page *page = bh->b_page;
+
+ if (!page || TryLockPage(page))
+ return;
+ if (!buffer_delay(bh) || !buffer_dirty(bh)) {
+ if (buffer_delay(bh))
+ BUG();
+ UnlockPage(page);
+ return;
+ }
+ page->mapping->a_ops->writepage(page);
+ if (DelallocPage(page))
+ BUG();
+}
+
+static inline void
+write_buffer(struct buffer_head *bh)
+{
+ if (!buffer_delay(bh))
+ ll_rw_block(WRITE, 1, &bh);
+ else
+ _write_buffer(bh);
+}
+
+
/* Call sync_buffers with wait!=0 to ensure that the call does not
* return until all buffer writes have completed. Sync() may return
* before the writes have finished; fsync() may not.
@@ -232,7 +264,7 @@
atomic_inc(&bh->b_count);
spin_unlock(&lru_list_lock);
- ll_rw_block(WRITE, 1, &bh);
+ write_buffer(bh);
atomic_dec(&bh->b_count);
retry = 1;
goto repeat;
@@ -507,6 +539,8 @@
struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)];
struct buffer_head **bhp = &head->list;
+ if (test_bit(BH_Delay, &bh->b_state))
+ BUG();
bh->b_state = 0;
spin_lock(&head->lock);
@@ -879,7 +913,7 @@
if (buffer_dirty(bh)) {
atomic_inc(&bh->b_count);
spin_unlock(&lru_list_lock);
- ll_rw_block(WRITE, 1, &bh);
+ write_buffer(bh);
brelse(bh);
spin_lock(&lru_list_lock);
}
@@ -1395,8 +1429,10 @@
head = page->buffers;
bh = head;
- if (DelallocPage(page))
- BUG();
+ if (buffer_delay(bh)) {
+ page->mapping->a_ops->writepage_nounlock(page);
+ return 0; /* just started I/O ... likely didn't complete */
+ }
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
@@ -2381,7 +2417,7 @@
if (wait > 1)
__wait_on_buffer(p);
} else if (buffer_dirty(p))
- ll_rw_block(WRITE, 1, &p);
+ write_buffer(p);
} while (tmp != bh);
}
@@ -2408,6 +2444,11 @@
int index = BUFSIZE_INDEX(bh->b_size);
int loop = 0;
+ if (buffer_delay(bh)) {
+ if (wait)
+ page->mapping->a_ops->writepage_nounlock(page);
+ return 0; /* just started I/O ... likely didn't complete */
+ }
cleaned_buffers_try_again:
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
@@ -2609,7 +2650,7 @@
__refile_buffer(bh);
continue;
}
- if (buffer_locked(bh))
+ if (buffer_locked(bh) || buffer_delay_busy(bh))
continue;
if (check_flushtime) {
@@ -2627,7 +2668,7 @@
/* OK, now we are committed to write it out. */
atomic_inc(&bh->b_count);
spin_unlock(&lru_list_lock);
- ll_rw_block(WRITE, 1, &bh);
+ write_buffer(bh);
atomic_dec(&bh->b_count);
if (current->need_resched)
diff -Naur ../../xfs-orig/linux/fs/pagebuf/page_buf.c fs/pagebuf/page_buf.c
--- ../../xfs-orig/linux/fs/pagebuf/page_buf.c Fri Feb 23 11:23:14 2001
+++ fs/pagebuf/page_buf.c Fri Feb 23 18:54:46 2001
@@ -152,8 +152,6 @@
* External pagebuf I/O functions
*/
-extern int _page_cleaner_daemon_start(void);
-extern void _page_cleaner_daemon_stop(void);
extern void _pb_zero_out_delay(struct inode *,
struct page *, page_buf_bmap_t *);
@@ -177,10 +175,10 @@
* /proc/sys/vm/pagebuf
*/
-unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ, HZ/2, 1, 0, 0 };
-unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, HZ*30, 1024, 4096, 1 };
+unsigned long pagebuf_min[P_PARAM] = { HZ/2, 1*HZ, 1, 0 };
+unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, 1024, 1 };
-pagebuf_param_t pb_params = {{ HZ, 15 * HZ, HZ, 512, 1024, 0 }};
+pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 512, 0 }};
/*
* Pagebuf statistics variables
@@ -455,14 +453,13 @@
struct page **pages)
{
loff_t next_buffer_offset;
- loff_t next_desired_offset;
unsigned long page_count;
int rval;
struct kiobuf *kp;
unsigned long pi;
unsigned long index;
off_t start_off, end_off;
- int all_mapped, good_pages, sectors, count;
+ int all_mapped, good_pages, sectors;
struct page *cp, **hash, *cached_page;
int gfp_mask;
@@ -2082,6 +2079,7 @@
spin_unlock_irqrestore(&current->sigmask_lock, flags);
strcpy(current->comm, "pagebuf_daemon");
+ current->flags |= PF_MEMALLOC;
do {
if (pb_daemon->active == 1) {
@@ -2367,8 +2365,7 @@
return -1; /* error */
}
}
-
- return _page_cleaner_daemon_start();
+ return 0;
}
int
@@ -2404,18 +2401,12 @@
{PB_FLUSH_AGE, "flush_age", &pb_params.data[1],
sizeof(int), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
&sysctl_intvec, NULL, &pagebuf_min[1], &pagebuf_max[1]},
- {PB_CLEAN_INT, "clean_int", &pb_params.data[2],
- sizeof(int), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
- &sysctl_intvec, NULL, &pagebuf_min[2], &pagebuf_max[2]},
- {PB_CLUSTER_LIMIT, "cluster_limit", &pb_params.data[3],
+ {PB_CLUSTER_LIMIT, "cluster_limit", &pb_params.data[2],
sizeof(int), 0644, NULL, &proc_doulongvec_minmax, &sysctl_intvec, NULL,
- &pagebuf_min[3], &pagebuf_max[3]},
- {PB_DELALLOC_LIMIT, "delalloc_count", &pb_params.data[4],
- sizeof(int), 0644, NULL, &proc_doulongvec_minmax, &sysctl_intvec, NULL,
- &pagebuf_min[4], &pagebuf_max[4]},
- {PB_DEBUG, "debug", &pb_params.data[5],
+ &pagebuf_min[2], &pagebuf_max[2]},
+ {PB_DEBUG, "debug", &pb_params.data[3],
sizeof(int), 0644, NULL, &proc_doulongvec_minmax, &sysctl_intvec, NULL,
- &pagebuf_min[5], &pagebuf_max[5]},
+ &pagebuf_min[3], &pagebuf_max[3]},
{0}
};
@@ -2545,7 +2536,6 @@
{
if (pagebuf_cache != NULL)
kmem_cache_destroy(pagebuf_cache);
- _page_cleaner_daemon_stop();
pagebuf_daemon_stop();
pagebuf_locking_terminate();
avl_terminate();
diff -Naur ../../xfs-orig/linux/fs/pagebuf/page_buf_io.c fs/pagebuf/page_buf_io.c
--- ../../xfs-orig/linux/fs/pagebuf/page_buf_io.c Fri Feb 23 10:29:09 2001
+++ fs/pagebuf/page_buf_io.c Sat Feb 24 12:03:50 2001
@@ -103,8 +103,6 @@
/*
* Globals
*/
-static int pcd_active;
-int PB_MAX_DIRTY_FACTOR = 4;
static DECLARE_WAIT_QUEUE_HEAD(pcd_waitq);
static atomic_t pb_delalloc_pages = ATOMIC_INIT(0);
@@ -114,7 +112,6 @@
extern spinlock_t pagecache_lock;
-int page_cleaner_count, page_cleaner_pages;
int do_write_full_page, do_write_pages;
int flush_convert, flush_convert_pages;
@@ -122,8 +119,6 @@
* The minimum size where we will start using pagebuf structures instead
* of just working with pages.
*/
-
-#define PAGEBUF_MIN_IOSIZE (4*PAGE_CACHE_SIZE)
#define PBF_IO_CHUNKSIZE 65536
#define PBF_MAX_MAPS 1
@@ -165,13 +160,28 @@
__pb_block_commit_write_async(inode, page, mp, 0);
}
-static inline void
-_unmark_delalloc(struct page *page)
+static void
+_unmark_delalloc(struct page *page, int toss)
{
+ struct buffer_head *bh = page->buffers;
+
if (!PageLocked(page))
PAGE_BUG(page);
- if (test_and_clear_bit(PG_delalloc, &page->flags))
- atomic_dec(&pb_delalloc_pages);
+ if (!DelallocPage(page))
+ PAGE_BUG(page);
+ if (!bh)
+ BUG();
+ clear_bit(BH_Delay, &bh->b_state);
+ atomic_dec(&pb_delalloc_pages);
+ if (!toss && !buffer_mapped(bh))
+ printk("warning: unmarking unmapped buffer page 0x%p\n", page);
+ if (toss && !buffer_mapped(bh)) {
+ if (!buffer_dirty(bh))
+ BUG();
+ mark_buffer_clean(bh);
+ if (bh->b_list != BUF_CLEAN)
+ printk("buffer bh 0x%p not clean\n", bh);
+ }
}
/*
@@ -208,7 +218,6 @@
static void _pagebuf_flush(
- struct inode *ip, /* used for KIOCLUSTER check */
struct list_head *head, /* list of pages */
loff_t ioff, /* first location in range */
struct page **cpages) /* clustering buffer */
@@ -238,9 +247,8 @@
flush_convert_pages +=
pagebuf_delalloc_convert(page,
PBF_FILE_ALLOCATE, cpages);
- } else {
- UnlockPage(page);
- }
+ }
+ UnlockPage(page);
page_cache_release(page);
spin_lock(&pagecache_lock);
goto repeat;
@@ -257,6 +265,7 @@
{
struct page **cpages = NULL;
+#if defined(KIOCLUSTER)
/*
* If kmalloc fails, no big deal; the lower layers won't
* cluster. Also, this allocation has to be non-sleeping
@@ -264,11 +273,12 @@
*/
cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
GFP_PAGE_IO);
+#endif
spin_lock(&pagecache_lock);
- _pagebuf_flush(ip, &ip->i_mapping->clean_pages, ioff, cpages);
- _pagebuf_flush(ip, &ip->i_mapping->dirty_pages, ioff, cpages);
- _pagebuf_flush(ip, &ip->i_mapping->locked_pages, ioff, cpages);
+ _pagebuf_flush(&ip->i_mapping->clean_pages, ioff, cpages);
+ _pagebuf_flush(&ip->i_mapping->dirty_pages, ioff, cpages);
+ _pagebuf_flush(&ip->i_mapping->locked_pages, ioff, cpages);
spin_unlock(&pagecache_lock);
generic_buffer_fdatasync(ip, (unsigned long) ioff, ~0UL);
@@ -507,27 +517,19 @@
return (-ENOMEM);
}
assert(((csize + cpoff) <= PAGE_CACHE_SIZE));
+ lock_page(page);
memset((void *) (kmap(page) + cpoff), 0, csize);
kunmap(page);
SetPageUptodate(page);
if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
- if (test_and_set_bit(PG_delalloc, &page->flags) == 0) {
- atomic_inc(&pb_delalloc_pages);
- }
+ __pb_block_commit_write_async(pb->pb_target, page, NULL, 0);
}
+ UnlockPage(page);
}
pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
pb->pb_flags &= ~(_PBF_SOME_INVALID_PAGES | PBF_PARTIAL | PBF_NONE);
- if (!pcd_active && (pb->pb_bn == PAGE_BUF_DADDR_NULL)) {
- unsigned int np = atomic_read(&pb_delalloc_pages);
-
- if (np > pb_params.p_un.max_dirty_pages)
- wake_up_interruptible(&pcd_waitq);
- }
-
-
return (0);
}
@@ -995,15 +997,13 @@
PAGE_CACHE_SIZE,
&map, 1, &nmaps, PBF_READ);
- hook_buffers_to_page(inode, page, &map, PAGE_CACHE_SHIFT);
- bh = page->buffers;
if (map.pbm_bn > 0) {
+ hook_buffers_to_page(inode, page, &map, PAGE_CACHE_SHIFT);
bh = head = page->buffers;
} else if (map.pbm_flags & (PBMF_HOLE|PBMF_DELAY)) {
memset(kmap(page), 0, PAGE_CACHE_SIZE);
flush_dcache_page(page);
kunmap(page);
- set_bit(BH_Uptodate, &bh->b_state);
goto page_done;
} else {
printk("pagebuf_read_full_page: page 0x%p map 0x%p\n",
@@ -1056,25 +1056,16 @@
struct inode *inode,
struct page *page)
{
- struct page **cpages;
+ struct page **cpages = NULL;
int pb_flags;
int count;
unsigned long save_flags = current->flags;
- spin_lock(&inode_lock);
- if (inode->i_state & I_MAPPING) {
- spin_unlock(&inode_lock);
- SetPageDirty(page);
- UnlockPage(page);
- return 0;
- }
- inode->i_state |= I_MAPPING;
- spin_unlock(&inode_lock);
-
current->flags |= PF_MEMALLOC;
+#if defined(KIOCLUSTER)
cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
GFP_PAGE_IO);
-
+#endif
do_write_full_page++;
if (DelallocPage(page))
@@ -1085,13 +1076,11 @@
count = pagebuf_delalloc_convert(page, pb_flags, cpages);
do_write_pages += count;
+ if (DelallocPage(page))
+ BUG();
if (cpages)
kfree(cpages);
- spin_lock(&inode_lock);
- inode->i_state &= ~I_MAPPING;
- spin_unlock(&inode_lock);
-
current->flags = save_flags;
return 0;
}
@@ -1100,7 +1089,7 @@
* pagebuf_write_full_page
*/
-int pagebuf_write_full_page(struct page *page)
+STATIC int pagebuf_write_full_page(struct page *page)
{
struct inode *inode = (struct inode*)page->mapping->host;
unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -1112,11 +1101,12 @@
return __pagebuf_write_full_page(inode, page);
/* things got complicated... */
- offset = inode->i_size & PAGE_CACHE_MASK_LL;
+ offset = inode->i_size & (~PAGE_CACHE_MASK_LL);
/* OK, are we completely out? */
if ((page->index >= end_index+1) || !offset) {
- UnlockPage(page);
- return -EIO;
+ printk("Bad write on page 0x%p\n", page);
+ err = -EIO;
+ goto out;
}
if (DelallocPage(page))
@@ -1132,11 +1122,39 @@
__pb_block_commit_write_async(inode, page, NULL, 0);
}
+ if (DelallocPage(page))
+ BUG();
kunmap(page);
- UnlockPage(page);
+out:
return err;
}
+int pagebuf_write_full_page_unlock(struct page *page)
+{
+ int ret = pagebuf_write_full_page(page);
+ UnlockPage(page);
+ return ret;
+}
+
+int pagebuf_write_full_page_nounlock(struct page *page)
+{
+ return pagebuf_write_full_page(page);
+}
+
+STATIC void
+hook_buffers_to_page_delay(struct inode *inode, struct page *page)
+{
+ struct buffer_head *bh;
+
+ if (page->buffers)
+ BUG();
+ create_empty_buffers(page, inode->i_dev, PAGE_CACHE_SIZE);
+ bh = page->buffers;
+ bh->b_state = (1 << BH_Delay);
+ atomic_inc(&pb_delalloc_pages);
+ __mark_buffer_dirty(bh);
+ balance_dirty(bh->b_dev);
+}
STATIC void
hook_buffers_to_page(struct inode *inode,
@@ -1145,6 +1163,11 @@
struct buffer_head *bh;
page_buf_daddr_t bn;
+ if (mp->pbm_bn < 0) {
+ printk("hook_buffers_to_page: bad bn page 0x%p mp 0x%p\n",
+ page, mp);
+ BUG();
+ }
if (!page->buffers)
create_empty_buffers(page, inode->i_dev, PAGE_CACHE_SIZE);
@@ -1153,21 +1176,13 @@
bh->b_end_io = end_pb_buffer_io_async;
bh->b_private = (void *) 0;
- if (mp->pbm_flags & (PBMF_HOLE|PBMF_DELAY)) {
- bh->b_blocknr = 0;
- bh->b_state = (1 << BH_Req) | (1 << BH_End_io);
- return;
- }
- if (mp->pbm_bn < 0) {
- printk("hook_buffers_to_page: bad bn page 0x%p mp 0x%p\n",
- page, mp);
- BUG();
- }
bn = mp->pbm_bn >>
(bshift - inode->i_sb->s_blocksize_bits);
bn += (mp->pbm_delta >> bshift);
bh->b_blocknr = bn;
- bh->b_state = (1 << BH_Mapped) | (1 << BH_Req) | (1 << BH_End_io);
+ if (buffer_locked(bh) || buffer_req(bh))
+ BUG();
+ bh->b_state |= (1 << BH_Mapped) | (1 << BH_Req) | (1 << BH_End_io);
}
@@ -1183,6 +1198,7 @@
set_bit(BH_Uptodate, &bh->b_state);
if (!buffer_dirty(bh)) {
bh->b_end_io = end_pb_buffer_io_async;
+ bh->b_state |= (1 << BH_End_io);
need_balance_dirty = 1;
}
__mark_buffer_dirty(bh);
@@ -1198,7 +1214,7 @@
{
struct buffer_head *bh;
int err = 0;
- int nmaps;
+ int nmaps, dp = DelallocPage(page);
char *kaddr = kmap(page);
page_buf_bmap_t map;
@@ -1211,7 +1227,8 @@
* go get some space.
*/
bh = page->buffers;
- if ((!bh || !buffer_mapped(bh)) && !DelallocPage(page)) {
+ if ((!bh || !buffer_mapped(bh)) && (!dp || (flags & PBF_FILE_ALLOCATE)))
+ {
if (!mp) {
mp = &map;
err = inode->i_op->pagebuf_bmap(inode,
@@ -1226,6 +1243,8 @@
}
if (mp->pbm_bn > 0) {
hook_buffers_to_page(inode, page, mp, PAGE_CACHE_SHIFT);
+ if (dp)
+ _unmark_delalloc(page, 0);
bh = page->buffers;
}
}
@@ -1240,7 +1259,7 @@
/*
* Partial write. Is the page valid anyway?
*/
- if (Page_Uptodate(page) || DelallocPage(page)) {
+ if (Page_Uptodate(page) || dp) {
goto out;
}
/*
@@ -1341,7 +1360,6 @@
int partial)
{
struct buffer_head *bh;
- unsigned int np;
/*
* Prepare write took care of reading/zero-out
@@ -1351,15 +1369,8 @@
SetPageUptodate(page);
if ((bh = page->buffers) && buffer_mapped(bh)) {
set_buffer_dirty_uptodate(page->buffers, partial);
- } else if (test_and_set_bit(PG_delalloc, &page->flags) == 0) {
- atomic_inc(&pb_delalloc_pages);
- if (!pcd_active) {
- np = atomic_read(&pb_delalloc_pages);
- if (np > pb_params.p_un.max_dirty_pages)
- wake_up_interruptible(&pcd_waitq);
- }
- if (!partial)
- balance_dirty(inode->i_rdev);
+ } else if (!DelallocPage(page)) {
+ hook_buffers_to_page_delay(inode, page);
}
/* Advance though extent no matter what */
@@ -1693,9 +1704,6 @@
return written ? written : status;
}
-static int page_cleaner_daemon_started = 0;
-static int daemon_terminate = 0;
-
/*
* Probe for a given page (index) in the inode & test if it is delayed.
* Returns page locked and with an extra reference count.
@@ -1730,21 +1738,11 @@
page_cache_release(page);
return NULL;
}
- /* In the case where we probe a page - push it back down the LRU
- * so we do not hit it on the next pass.
- */
-
- spin_lock(&pagemap_lru_lock);
- if (PageInactiveDirty(page)) {
- list_del(&page->lru);
- list_add(&page->lru, &inactive_dirty_list);
- }
- spin_unlock(&pagemap_lru_lock);
- _unmark_delalloc(page);
return page;
}
+#if defined(KIOCLUSTER)
/*
* Convert & write out a cluster of pages in the same extent as defined
* by mp and surrounding "startpage". startpage is locked & has an extra
@@ -1815,16 +1813,36 @@
return count;
}
+#endif /* KIOCLUSTER */
+
/*
* Allocate & map buffers for page given the extent map.
*/
STATIC void
-convert_page(struct inode *inode, struct page *page, page_buf_bmap_t *mp)
+convert_page(struct inode *inode, struct page *page, page_buf_bmap_t *mp, int u)
{
- mp->pbm_delta = (page->index << PAGE_CACHE_SHIFT) - mp->pbm_offset;
- hook_buffers_to_page(inode, page, mp, PAGE_CACHE_SHIFT);
- set_buffer_dirty_uptodate(page->buffers, 0);
- UnlockPage(page);
+ struct buffer_head *bh = page->buffers;
+ int dp = DelallocPage(page);
+
+ if (!bh || dp) {
+ mp->pbm_delta = (page->index << PAGE_CACHE_SHIFT) - mp->pbm_offset;
+ hook_buffers_to_page(inode, page, mp, PAGE_CACHE_SHIFT);
+ if (dp)
+ _unmark_delalloc(page, 0);
+ }
+ bh = page->buffers;
+ /*
+ * 1 == don't balance dirty, we are doing I/O just below here.
+ * otherwise causes nasty recursions.
+ */
+ set_buffer_dirty_uptodate(bh, 1);
+ if (u)
+ UnlockPage(page);
+
+ atomic_inc(&bh->b_count);
+ ll_rw_block(WRITE, 1, &bh);
+ atomic_dec(&bh->b_count);
+
page_cache_release(page);
}
@@ -1849,16 +1867,16 @@
for (tindex = startpage->index-1; tindex >= tlast; tindex--) {
if (!(page = probe_page(inode, tindex)))
break;
- convert_page(inode, page, mp);
+ convert_page(inode, page, mp, 1);
}
}
- convert_page(inode, startpage, mp);
+ convert_page(inode, startpage, mp, 0);
tlast = PAGE_CACHE_ALIGN_LL(mp->pbm_offset + mp->pbm_bsize) >>
PAGE_CACHE_SHIFT;
for (tindex = startpage->index + 1; tindex < tlast; tindex++) {
if (!(page = probe_page(inode, tindex)))
break;
- convert_page(inode, page, mp);
+ convert_page(inode, page, mp, 1);
}
}
@@ -1872,7 +1890,7 @@
if (!PageLocked(page))
BUG();
- _unmark_delalloc(page);
+ _unmark_delalloc(page, 1);
}
@@ -1884,7 +1902,7 @@
{
page_buf_bmap_t maps[PBF_MAX_MAPS];
struct inode *inode;
- int maps_returned, error, count;
+ int maps_returned, error;
u_long pb_flags;
loff_t rounded_offset;
@@ -1894,8 +1912,8 @@
* anything.
*/
if (!inode->i_nlink && (inode->i_state & I_FREEING)) {
- _unmark_delalloc(page);
- UnlockPage(page);
+ BUG();
+ _unmark_delalloc(page, 1);
return 0;
}
@@ -1918,12 +1936,10 @@
if (error != -EIO)
printk("PCD: pagebuf_bmap error %d pb_flags 0x%lx\n",
error, pb_flags);
- UnlockPage(page);
return 0;
}
if (maps[0].pbm_delta % PAGE_CACHE_SIZE) {
printk("PCD: pbm_delta not page aligned mp 0x%p\n", &maps[0]);
- UnlockPage(page);
return 0;
}
@@ -1935,236 +1951,15 @@
}
page_cache_get(page);
- _unmark_delalloc(page);
/*
* page needs to be setup as though find_page(...) returned it,
* which is a locked page with an extra reference.
*/
- if (cpages) {
- count = kio_cluster_write(inode, page, &maps[0], cpages);
- } else {
- cluster_write(inode, page, &maps[0]);
- count = 1;
- }
- return count;
+ cluster_write(inode, page, &maps[0]);
+ return 1;
}
/*
- * Walk the active pages list looking for delalloc entries, we need to
- * age them out all the time, since they have to be converted before
- * being written to disk. If there is no other memory pressure then pages
- * on the active list do not get moved, and we do not put them somewhere
- * the cleaner can find them.
- */
-
-void age_delalloc_pages(void)
-{
- struct page *page;
- struct list_head * page_lru;
- int maxscan, page_active;
-
- maxscan = nr_active_pages;
- while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
- page = list_entry(page_lru, struct page, lru);
- if (!DelallocPage(page)) {
- list_del(page_lru);
- list_add(page_lru, &active_list);
- continue;
- }
-
- /* Do aging on delalloc pages. */
- if (PageTestandClearReferenced(page)) {
- age_page_up_nolock(page);
- page_active = 1;
- } else {
- age_page_down_ageonly(page);
- if (page->age == 0 && page_count(page) <=
- (page->buffers ? 2 : 1)) {
- deactivate_page_nolock(page);
- page_active = 0;
- } else {
- page_active = 1;
- }
- }
- if (page_active || PageActive(page)) {
- list_del(page_lru);
- list_add(page_lru, &active_list);
- }
- }
-}
-
-STATIC int
-page_cleaner_daemon(void *data)
-{
- struct page *page;
- u_long flags;
- struct buffer_head *bh;
- struct page **cpages;
- int maxscan, sum;
- struct list_head * page_lru;
-
- /* Set up the thread */
- exit_files(current);
- daemonize();
-
- spin_lock_irqsave(&current->sigmask_lock, flags);
- flush_signals(current);
- sigfillset(&current->blocked);
- recalc_sigpending(current);
- spin_unlock_irqrestore(&current->sigmask_lock, flags);
-
- sprintf(current->comm, "page_daemon");
-
- /*
- * If we need more memory to do bmap,
- * indicate this thread might really need it.
- */
- current->flags |= PF_MEMALLOC;
-
- cpages = kmalloc(CLUSTER_PAGE_LIST_SIZE * sizeof(struct page *),
- GFP_KERNEL);
- while (1) {
- /*
- * If we actually get into a low-memory situation,
- * the processes needing more memory will wake us
- * up on a more timely basis.
- */
-
- sum = 0;
- spin_lock(&pagemap_lru_lock);
-
- if (atomic_read(&pb_delalloc_pages) > 0)
- age_delalloc_pages();
-
-
- maxscan = nr_inactive_dirty_pages;
- while ((page_lru = inactive_dirty_list.prev) !=
- &inactive_dirty_list && maxscan-- > 0) {
-
- if (current->need_resched) {
- break;
- }
-
- page = list_entry(page_lru, struct page, lru);
- /*
- * We know this page is going to go somewhere, do not
- * bother scanning it again.
- */
- list_del(page_lru);
- list_add(page_lru, &inactive_dirty_list);
-
- if (!DelallocPage(page))
- continue;
-
- if (TryLockPage(page))
- continue;
-
- bh = page->buffers;
- if (bh && buffer_mapped(bh)) {
- /*
- * delalloc page has buffers refile it.
- */
-
- spin_unlock(&pagemap_lru_lock);
- _unmark_delalloc(page);
- set_buffer_dirty_uptodate(bh, 0);
- UnlockPage(page);
- spin_lock(&pagemap_lru_lock);
- continue;
- }
-
-/*---------------- DELALLOC CONVERT --------------------------------*/
-/* since bmap can block, this should be in a different daemon */
-/*---------------- DELALLOC CONVERT --------------------------------*/
-
- spin_unlock(&pagemap_lru_lock);
- page_cleaner_count++;
- {
- int cnt;
- cnt = pagebuf_delalloc_convert(page, PBF_FILE_ALLOCATE,
- cpages);
-
- sum += cnt;
- page_cleaner_pages += cnt;
- }
-
- /* Do not let too many pages get locked up
- * waiting for the queue to open in here
- */
- if (sum > 256) {
- run_task_queue(&tq_disk);
- sum = 0;
- }
- spin_lock(&pagemap_lru_lock);
- }
- spin_unlock(&pagemap_lru_lock);
- run_task_queue(&tq_disk);
- pcd_active = 0;
-
- if (daemon_terminate) {
- page_cleaner_daemon_started = 0;
- wake_up_interruptible(&pcd_waitq);
- break;
- }
-
- /*
- * if woken up periodically (nothing else to do)
- * convert all the pages, else convert only
- * to keep watermarks happy.
- */
- interruptible_sleep_on_timeout(&pcd_waitq,
- pb_params.p_un.cluster_interval);
- pcd_active = 1;
- }
- kfree(cpages);
- return 0;
-}
-
-int
-_page_cleaner_daemon_start(void)
-{
- extern int pagebuf_max[];
-
- if (!page_cleaner_daemon_started) {
- page_cleaner_daemon_started = 1;
-
- /*
- * watermarks: at 1/16 of total mem start waking
- * the daemon to convert ... at 1/8th kick the
- * daemon synchronously ... at 1/4th stop generating
- * any more delay pages. Low water before daemon
- * normally stops is 1/4th of when the daemon is
- * activated.
- */
- pb_params.p_un.max_dirty_pages = max_mapnr >> 4;
-
- MAX_CLUSTER = pb_params.p_un.max_dirty_pages >> 1;
- if (MAX_CLUSTER > 1024) /* arbitray max. */
- MAX_CLUSTER = 1024;
- CLUSTER_PAGE_LIST_SIZE = ((2*MAX_CLUSTER)+1);
- pagebuf_max[4] = MAX_CLUSTER;
-
- if (0 > kernel_thread(page_cleaner_daemon, (void *)0,
- CLONE_FS|CLONE_FILES|CLONE_SIGHAND))
- {
- printk("Can't start page cleaner daemon\n");
- return -1; /* error */
- }
- }
- return 0; /* success */
-}
-
-void
-_page_cleaner_daemon_stop(void)
-{
- daemon_terminate = 1;
- wake_up_interruptible_sync(&pcd_waitq);
- while (page_cleaner_daemon_started)
- interruptible_sleep_on(&pcd_waitq);
-}
-
-
-/*
* Module management
*/
@@ -2177,7 +1972,8 @@
EXPORT_SYMBOL(pagebuf_generic_file_read);
EXPORT_SYMBOL(pagebuf_generic_file_write);
EXPORT_SYMBOL(pagebuf_read_full_page);
-EXPORT_SYMBOL(pagebuf_write_full_page);
+EXPORT_SYMBOL(pagebuf_write_full_page_nounlock);
+EXPORT_SYMBOL(pagebuf_write_full_page_unlock);
EXPORT_SYMBOL(pagebuf_toss_page);
EXPORT_SYMBOL(pagebuf_prepare_write);
EXPORT_SYMBOL(pagebuf_commit_write);
diff -Naur ../../xfs-orig/linux/fs/xfs/linux/xfs_iops.c fs/xfs/linux/xfs_iops.c
--- ../../xfs-orig/linux/fs/xfs/linux/xfs_iops.c Mon Feb 12 14:20:44 2001
+++ fs/xfs/linux/xfs_iops.c Tue Feb 20 21:47:55 2001
@@ -756,7 +756,8 @@
struct address_space_operations linvfs_aops = {
readpage: pagebuf_read_full_page,
- writepage: pagebuf_write_full_page,
+ writepage: pagebuf_write_full_page_unlock,
+ writepage_nounlock: pagebuf_write_full_page_nounlock,
sync_page: block_sync_page,
bmap: linvfs_bmap,
toss_page: pagebuf_toss_page,
diff -Naur ../../xfs-orig/linux/fs/xfs/xfs_log.c fs/xfs/xfs_log.c
--- ../../xfs-orig/linux/fs/xfs/xfs_log.c Thu Feb 22 12:55:50 2001
+++ fs/xfs/xfs_log.c Thu Feb 22 12:00:31 2001
@@ -1345,6 +1345,7 @@
uint count; /* byte count of bwrite */
int split = 0; /* split write into two regions */
int error;
+ unsigned long save_flags = current->flags;
XFS_STATS_INC(xs_log_writes);
ASSERT(iclog->ic_refcnt == 0);
@@ -1354,6 +1355,8 @@
xlog_panic("xlog_sync: illegal flag");
#endif
+ current->flags |= PF_MEMALLOC;
+
xlog_pack_data(log, iclog); /* put cycle number in every block */
INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
/* real byte length */
@@ -1412,6 +1415,7 @@
if (error = XFS_bwrite(bp)) {
xfs_ioerror_alert("xlog_sync", log->l_mp, XFS_BUF_TARGET(bp),
XFS_BUF_ADDR(bp));
+ current->flags = save_flags;
return (error);
}
if (split) {
@@ -1448,9 +1452,11 @@
if (error = XFS_bwrite(bp)) {
xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
XFS_BUF_TARGET(bp), XFS_BUF_ADDR(bp));
+ current->flags = save_flags;
return (error);
}
}
+ current->flags = save_flags;
return (0);
} /* xlog_sync */
diff -Naur ../../xfs-orig/linux/include/linux/fs.h include/linux/fs.h
--- ../../xfs-orig/linux/include/linux/fs.h Fri Feb 23 10:33:21 2001
+++ include/linux/fs.h Fri Feb 23 18:38:49 2001
@@ -220,6 +220,8 @@
#define BH_New 5 /* 1 if the buffer is new and not yet written out */
#define BH_Protected 6 /* 1 if the buffer is protected */
#define BH_End_io 7 /* 1 End io function defined don't remap it */
+#define BH_Delay 8 /* disk mapping is delayed */
+
/*
* Try to keep the most commonly used fields in single cache lines (16
@@ -275,6 +277,7 @@
#define buffer_mapped(bh) __buffer_state(bh,Mapped)
#define buffer_new(bh) __buffer_state(bh,New)
#define buffer_protected(bh) __buffer_state(bh,Protected)
+#define buffer_delay(bh) __buffer_state(bh,Delay)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
@@ -379,6 +382,7 @@
int (*bmap)(struct address_space *, long);
int (*toss_page)(struct page *);
+ int (*writepage_nounlock)(struct page *);
};
@@ -481,8 +485,6 @@
void *generic_ip;
} u;
};
-
-extern spinlock_t inode_lock;
struct fown_struct {
int pid; /* pid or -pgrp where SIGIO should be sent */
diff -Naur ../../xfs-orig/linux/include/linux/mm.h include/linux/mm.h
--- ../../xfs-orig/linux/include/linux/mm.h Fri Feb 23 10:33:21 2001
+++ include/linux/mm.h Fri Feb 23 18:38:49 2001
@@ -167,7 +167,6 @@
#define PG_skip 10
#define PG_inactive_clean 11
#define PG_highmem 12
-#define PG_delalloc 13
/* bits 21-29 unused */
#define PG_arch_1 30
#define PG_reserved 31
@@ -182,7 +181,7 @@
#define PageLocked(page) test_bit(PG_locked, &(page)->flags)
#define LockPage(page) set_bit(PG_locked, &(page)->flags)
#define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags)
-#define DelallocPage(page) test_bit(PG_delalloc, &(page)->flags)
+#define DelallocPage(page) (page->buffers && test_bit(BH_Delay, &(page)->buffers->b_state))
extern void __set_page_dirty(struct page *);
diff -Naur ../../xfs-orig/linux/include/linux/page_buf.h include/linux/page_buf.h
--- ../../xfs-orig/linux/include/linux/page_buf.h Fri Feb 23 10:34:56 2001
+++ include/linux/page_buf.h Fri Feb 23 18:40:25 2001
@@ -342,7 +342,7 @@
* Tunable pagebuf parameters
*/
-#define P_PARAM 6
+#define P_PARAM 4
typedef union pagebuf_param {
struct {
@@ -350,11 +350,7 @@
* delwri flush daemon. */
ulong age_buffer; /* time for buffer to age before
* we flush it. */
- ulong cluster_interval; /* interval between runs of the
- * page cleaner daemon. */
ulong max_cluster; /* maximum pages to cluster */
- ulong max_dirty_pages; /* maximum pages allowed to be
- * dirty. */
ulong debug; /* debug tracing on or off */
} p_un;
ulong data[P_PARAM];
@@ -364,10 +360,8 @@
{
PB_FLUSH_INT = 1,
PB_FLUSH_AGE = 2,
- PB_CLEAN_INT = 3,
- PB_CLUSTER_LIMIT = 4,
- PB_DELALLOC_LIMIT = 5,
- PB_DEBUG = 6
+ PB_CLUSTER_LIMIT = 3,
+ PB_DEBUG = 4
};
extern pagebuf_param_t pb_params;
@@ -626,8 +620,11 @@
struct file *, /* file to read */
struct page *); /* page to read */
-extern int pagebuf_write_full_page( /* write a page via pagebuf */
+extern int pagebuf_write_full_page_unlock(/* write a page via pagebuf */
struct page *); /* page to write */
+
+extern int pagebuf_write_full_page_nounlock(/* write a page via pagebuf */
+ struct page *); /* page to write */
extern void pagebuf_toss_page( /* convertion of a delalloc page */
struct page *); /* page to convert */
diff -Naur ../../xfs-orig/linux/kdb/modules/kdbm_pg.c kdb/modules/kdbm_pg.c
--- ../../xfs-orig/linux/kdb/modules/kdbm_pg.c Thu Feb 22 14:36:37 2001
+++ kdb/modules/kdbm_pg.c Thu Feb 22 14:15:56 2001
@@ -28,7 +28,7 @@
static char *bh_state_vals[] = {
"Uptodate", "Dirty", "Lock", "Req", "Mapped", "New",
- "Protected", NULL };
+ "Protected", "End_io", "Delay", NULL };
static char *map_flags(unsigned long flags, char *mapping[])
{
@@ -88,9 +88,9 @@
kdb_printf(" next 0x%p bno %ld rsec %ld size %d dev 0x%x rdev 0x%x\n",
bh.b_next, bh.b_blocknr, bh.b_rsector,
bh.b_size, bh.b_dev, bh.b_rdev);
- kdb_printf(" count %d state 0x%lx [%s] ftime 0x%lx\n",
+ kdb_printf(" count %d state 0x%lx [%s] ftime 0x%lx b_list %d b_reqnext
0x%p b_data 0x%p\n",
bh.b_count.counter, bh.b_state, map_flags(bh.b_state,
bh_state_vals),
- bh.b_flushtime);
+ bh.b_flushtime, bh.b_list, bh.b_reqnext, bh.b_data);
kdb_printf(" b_page 0x%p b_this_page 0x%p b_private 0x%p\n",
bh.b_page, bh.b_this_page, bh.b_private);
diff -Naur ../../xfs-orig/linux/kernel/ksyms.c kernel/ksyms.c
--- ../../xfs-orig/linux/kernel/ksyms.c Fri Feb 23 10:29:09 2001
+++ kernel/ksyms.c Fri Feb 23 18:29:03 2001
@@ -277,7 +277,6 @@
EXPORT_SYMBOL(lock_may_write);
EXPORT_SYMBOL(dcache_readdir);
-
/* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
EXPORT_SYMBOL(default_llseek);
EXPORT_SYMBOL(dentry_open);
@@ -285,8 +284,6 @@
EXPORT_SYMBOL(filemap_sync);
EXPORT_SYMBOL(lock_page);
-EXPORT_SYMBOL(inode_lock);
-
/* for page_buf cache */
EXPORT_SYMBOL(add_to_page_cache_unique);
EXPORT_SYMBOL(bh_cachep);
@@ -516,13 +513,6 @@
EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_inode_buffers);
EXPORT_SYMBOL(clear_inode);
-EXPORT_SYMBOL(inactive_dirty_list);
-EXPORT_SYMBOL(nr_active_pages);
-EXPORT_SYMBOL(active_list);
-EXPORT_SYMBOL(age_page_down_ageonly);
-EXPORT_SYMBOL(deactivate_page_nolock);
-EXPORT_SYMBOL(age_page_up_nolock);
-EXPORT_SYMBOL(nr_inactive_dirty_pages);
EXPORT_SYMBOL(nr_async_pages);
EXPORT_SYMBOL(___strtok);
EXPORT_SYMBOL(init_special_inode);
@@ -581,6 +571,3 @@
EXPORT_SYMBOL(tasklist_lock);
EXPORT_SYMBOL(pidhash);
-
-EXPORT_SYMBOL(pagemap_lru_lock);
-
diff -Naur ../../xfs-orig/linux/mm/page_alloc.c mm/page_alloc.c
--- ../../xfs-orig/linux/mm/page_alloc.c Mon Feb 12 14:20:46 2001
+++ mm/page_alloc.c Thu Feb 22 13:17:31 2001
@@ -88,11 +88,6 @@
if (PageInactiveClean(page))
BUG();
- if (DelallocPage(page)) {
- printk("Trying to free dirty page 0x%p\n", page);
- BUG();
- }
-
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
page->age = PAGE_AGE_START;
diff -Naur ../../xfs-orig/linux/mm/swap.c mm/swap.c
diff -Naur ../../xfs-orig/linux/mm/vmscan.c mm/vmscan.c
--- ../../xfs-orig/linux/mm/vmscan.c Thu Feb 22 14:36:37 2001
+++ mm/vmscan.c Fri Feb 23 23:21:29 2001
@@ -51,11 +51,6 @@
if (TryLockPage(page))
return;
- if (DelallocPage(page)) {
- UnlockPage(page);
- return;
- }
-
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
* is needed on CPUs which update the accessed and dirty
@@ -363,12 +358,6 @@
add_page_to_inactive_dirty_list(page);
continue;
}
- if (DelallocPage(page)) {
- del_page_from_inactive_clean_list(page);
- add_page_to_inactive_dirty_list(page);
- UnlockPage(page);
- continue;
- }
/* OK, remove the page from the caches. */
if (PageSwapCache(page)) {
@@ -479,10 +468,10 @@
}
/*
- * Dirty swap-cache page or delayed allocate page?
- * Write it out if last copy..
+ * Dirty swap-cache page? Write it out if
+ * last copy..
*/
- if (PageDirty(page) || DelallocPage(page)) {
+ if (PageDirty(page)) {
int (*writepage)(struct page *) =
page->mapping->a_ops->writepage;
if (!writepage)
@@ -537,6 +526,9 @@
wait = 1; /* Async IO */
else
wait = 0; /* No IO */
+
+ if (!can_queue_buffers)
+ wait = 0;
/* Try to free the page buffers. */
clearedbuf = try_to_free_buffers(page, wait);
|