Knut J Bjuland wrote:
> Are there any plans to make XFS compatible with Riel's rmap VM patch?
> This would make it easier to integrate it into Redhat 8.X when it ships; I
> believe it will be based on Linux 2.4.17 or later with the rmap patch.
Sending along a log of the patching:
# This is a BitKeeper generated patch for the following project:
# Project Name: Long-term Linux VM development
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet linux-2.4.19-pre2 -> 1.207
# include/asm-sparc64/pgtable.h 1.14.1.1 -> 1.16
# fs/inode.c 1.32 -> 1.33
# drivers/block/ll_rw_blk.c 1.27.1.2 -> 1.30
# include/linux/swapctl.h 1.2 -> 1.7
# include/linux/mmzone.h 1.6.1.1 -> 1.15
# fs/proc/proc_misc.c 1.11 -> 1.14
# include/linux/sched.h 1.20.1.4 -> 1.26
# drivers/char/agp/agpgart_be.c 1.20.1.3 -> 1.25
# include/linux/swap.h 1.31 -> 1.41
# include/asm-s390/pgtable.h 1.4.1.1 -> 1.6
# mm/slab.c 1.12 -> 1.13
# mm/vmscan.c 1.53.1.4 -> 1.97
# fs/dquot.c 1.16 -> 1.17
# drivers/char/drm/i810_dma.c 1.5.1.1 -> 1.10
# mm/mmap.c 1.18.1.2 -> 1.21
# include/asm-i386/pgtable.h 1.4.1.1 -> 1.7
# fs/dcache.c 1.15.1.1 -> 1.17
# include/asm-s390x/pgtable.h 1.4.1.1 -> 1.6
# include/asm-sparc/pgtable.h 1.4.1.1 -> 1.6
# include/asm-sh/pgtable.h 1.6.1.1 -> 1.8
# include/asm-arm/pgtable.h 1.5.1.1 -> 1.7
# mm/memory.c 1.41.1.6 -> 1.49
# mm/mremap.c 1.5 -> 1.6
# fs/buffer.c 1.44.1.11 -> 1.55
# include/linux/mm.h 1.29.1.6 -> 1.46
# mm/filemap.c 1.46.1.8 -> 1.54
# mm/bootmem.c 1.6 -> 1.7
# mm/page_alloc.c 1.39.1.2 -> 1.62
# kernel/sysctl.c 1.13.1.3 -> 1.17
# include/asm-i386/pgalloc.h 1.8 -> 1.11
# arch/arm/mm/mm-armv.c 1.4 -> 1.5
# include/asm-mips/pgtable.h 1.3.1.1 -> 1.5
# include/linux/slab.h 1.8 -> 1.9
# mm/swap.c 1.16 -> 1.24
# mm/swap_state.c 1.17 -> 1.20
# include/linux/fs.h 1.49.1.5 -> 1.53
# include/asm-alpha/pgtable.h 1.7.1.1 -> 1.9
# include/linux/pagemap.h 1.15.1.2 -> 1.19
# mm/oom_kill.c 1.9 -> 1.12
# kernel/ksyms.c 1.40.1.7 -> 1.46
# Makefile 1.135.1.16 -> 1.144
# kernel/fork.c 1.18.1.2 -> 1.22
# kernel/sys.c 1.8.1.1 -> 1.10
# mm/Makefile 1.3.1.2 -> 1.7
# include/asm-mips64/pgtable.h 1.3.1.1 -> 1.5
# arch/i386/kernel/setup.c 1.32.1.3 -> 1.35
# mm/swapfile.c 1.20.1.2 -> 1.23
# include/linux/elevator.h 1.4 -> 1.5
# include/linux/sysctl.h 1.10.1.2 -> 1.13
# drivers/block/elevator.c 1.5 -> 1.7
# fs/exec.c 1.17.1.2 -> 1.19
# include/asm-ia64/pgtable.h 1.6.1.1 -> 1.8
# include/asm-cris/pgtable.h 1.4.1.2 -> 1.7
# include/asm-parisc/pgtable.h 1.2.1.1 -> 1.4
# arch/i386/config.in 1.21.1.3 -> 1.24
# include/asm-ppc/pgtable.h 1.7.1.1 -> 1.9
# (new) -> 1.15 include/linux/mm_inline.h
# (new) -> 1.1 include/asm-arm/rmap.h
# (new) -> 1.1 include/asm-parisc/rmap.h
# (new) -> 1.1 include/asm-s390/rmap.h
# (new) -> 1.1 include/asm-mips/rmap.h
# (new) -> 1.1 include/asm-ia64/rmap.h
# (new) -> 1.1 include/asm-s390x/rmap.h
# (new) -> 1.14 mm/rmap.c
# (new) -> 1.2 include/asm-cris/rmap.h
# (new) -> 1.1 include/asm-sparc/rmap.h
# (new) -> 1.1 include/asm-arm/proc-armv/rmap.h
# (new) -> 1.9 mm/TODO
# (new) -> 1.1 include/asm-mips64/rmap.h
# (new) -> 1.34 Changelog.rmap
# (new) -> 1.1 include/asm-alpha/rmap.h
# (new) -> 1.2 include/asm-generic/rmap.h
# (new) -> 1.2 include/asm-i386/rmap.h
# (new) -> 1.1 include/asm-sparc64/rmap.h
# (new) -> 1.2 include/asm-ppc/rmap.h
# (new) -> 1.1 include/asm-m68k/rmap.h
# (new) -> 1.1 include/asm-sh/rmap.h
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/02/28 marcelo@xxxxxxxxxxxxxxxx 1.130.1.29
# linux-2.4.19-pre2:
# - -ac merge (Alan Cox)
# - Huge MIPS/MIPS64 merge (Ralf Baechle)
# - IA64 update (David Mosberger)
# - PPC update (Tom Rini)
# - Shrink struct page (Rik van Riel)
# - QNX4 update (now its able to mount QNX 6.1 fses) (Anders Larsen)
# - Make max_map_count sysctl configurable (Christoph Hellwig)
# - matroxfb update (Petr Vandrovec)
# - ymfpci update (Pete Zaitcev)
# - LVM update (Heinz J . Mauelshagen)
# - btaudio driver update (Gerd Knorr)
# - bttv update (Gerd Knorr)
# - Out of line code cleanup (Keith Owens)
# - Add watchdog API documentation (Christer Weinigel)
# - Rivafb update (Ani Joshi)
# - Enable PCI buses above quad0 on NUMA-Q (Martin J. Bligh)
# - Fix PIIX IDE slave PCI timings (Dave Bogdanoff)
# - Make PLIP work again (Tim Waugh)
# - Remove unecessary printk from lp.c (Tim Waugh)
# - Make parport_daisy_select work for ECP/EPP modes (Max Vorobiev)
# - Support O_NONBLOCK on lp/ppdev correctly (Tim Waugh)
# - Add PCI card hooks to parport (Tim Waugh)
# - Compaq cciss driver fixes (Stephen Cameron)
# - VFS cleanups and fixes (Alexander Viro)
# - USB update (including USB 2.0 support) (Greg KH)
# - More jiffies compare cleanups (Tim Schmielau)
# - PCI hotplug update (Greg KH)
# - bluesmoke fixes (Dave Jones)
# - Fix off-by-one in ide-scsi (John Fremlin)
# - Fix warnings in make xconfig (René Scharfe)
# - Make x86 MCE a configure option (Paul Gortmaker)
# - Small ramdisk fixes (Christoph Hellwig)
# - Add missing atime update to pipe code (Christoph Hellwig)
# - Serialize microcode access (Tigran Aivazian)
# - AMD Elan handling on serial.c (Robert Schwebel)
# --------------------------------------------
# 02/02/28 riel@xxxxxxxxxxxxxxxxxxxx 1.205
# merged
# --------------------------------------------
# 02/02/28 riel@xxxxxxxxxxxxxxxxxxxx 1.206
# remove obsolete code
# --------------------------------------------
# 02/02/28 riel@xxxxxxxxxxxxxxxxxxxx 1.207
# some more merging cleanups
# --------------------------------------------
#
diff -Nru a/Changelog.rmap b/Changelog.rmap
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/Changelog.rmap Fri Mar 1 18:19:44 2002
@@ -0,0 +1,142 @@
+The seventh maintenance release of the 12th version of the reverse
+mapping based VM is now available.
+This is an attempt at making a more robust and flexible VM
+subsystem, while cleaning up a lot of code at the same time.
+The patch is available from:
+
+ http://surriel.com/patches/2.4/2.4.19p1-rmap-12g
+and http://linuxvm.bkbits.net/
+
+
+My big TODO items for a next release are:
+ - page launder
+
+ - drop pte quicklist in anticipation of pte-highmem (me)
+ - replace andrea's highmem emulation by ingo's one (me)
+rmap 12g:
+ - port to armv architecture (David Woodhouse)
+ - NUMA fix to zone_table initialisation (Samuel Ortiz)
+ - remove init_page_count (David Miller)
+rmap 12f:
+ - for_each_pgdat macro (William Lee Irwin)
+ - put back EXPORT(__find_get_page) for modular rd (me)
+ - make bdflush and kswapd actually start queued disk IO (me)
+rmap 12e
+ - RSS limit fix, the limit can be 0 for some reason (me)
+ - clean up for_each_zone define to not need pgdata_t (William Lee Irwin)
+ - fix i810_dma bug introduced with page->wait removal (William Lee Irwin)
+rmap 12d:
+ - fix compiler warning in rmap.c (Roger Larsson)
+ - read latency improvement (read-latency2) (Andrew Morton)
+rmap 12c:
+ - fix small balancing bug in page_launder_zone (Nick Piggin)
+ - wakeup_kswapd / wakeup_memwaiters code fix (Arjan van de Ven)
+ - improve RSS limit enforcement (me)
+rmap 12b:
+ - highmem emulation (for debugging purposes) (Andrea Arcangeli)
+ - ulimit RSS enforcement when memory gets tight (me)
+ - sparc64 page->virtual quickfix (Greg Procunier)
+rmap 12a:
+ - fix the compile warning in buffer.c (me)
+ - fix divide-by-zero on highmem initialisation DOH! (me)
+ - remove the pgd quicklist (suspicious ...) (DaveM, me)
+rmap 12:
+ - keep some extra free memory on large machines (Arjan van de Ven, me)
+ - higher-order allocation bugfix (Adrian Drzewiecki)
+ - nr_free_buffer_pages() returns inactive + free mem (me)
+ - pages from unused objects directly to inactive_clean (me)
+ - use fast pte quicklists on non-pae machines (Andrea Arcangeli)
+ - remove sleep_on from wakeup_kswapd (Arjan van de Ven)
+ - page waitqueue cleanup (Christoph Hellwig)
+rmap 11c:
+ - oom_kill race locking fix (Andres Salomon)
+ - elevator improvement (Andrew Morton)
+ - dirty buffer writeout speedup (hopefully ;)) (me)
+ - small documentation updates (me)
+ - page_launder() never does synchronous IO, kswapd
+ and the processes calling it sleep on higher level (me)
+ - deadlock fix in touch_page() (me)
+rmap 11b:
+ - added low latency reschedule points in vmscan.c (me)
+ - make i810_dma.c include mm_inline.h too (William Lee Irwin)
+ - wake up kswapd sleeper tasks on OOM kill so the
+ killed task can continue on its way out (me)
+ - tune page allocation sleep point a little (me)
+rmap 11a:
+ - don't let refill_inactive() progress count for OOM (me)
+ - after an OOM kill, wait 5 seconds for the next kill (me)
+ - agpgart_be fix for hashed waitqueues (William Lee Irwin)
+rmap 11:
+ - fix stupid logic inversion bug in wakeup_kswapd() (Andrew Morton)
+ - fix it again in the morning (me)
+ - add #ifdef BROKEN_PPC_PTE_ALLOC_ONE to rmap.h, it
+ seems PPC calls pte_alloc() before mem_map[] init (me)
+ - disable the debugging code in rmap.c ... the code
+ is working and people are running benchmarks (me)
+ - let the slab cache shrink functions return a value
+ to help prevent early OOM killing (Ed Tomlinson)
+ - also, don't call the OOM code if we have enough
+ free pages (me)
+ - move the call to lru_cache_del into __free_pages_ok (Ben LaHaise)
+ - replace the per-page waitqueue with a hashed
+ waitqueue, reduces size of struct page from 64
+ bytes to 52 bytes (48 bytes on non-highmem machines) (William Lee Irwin)
+rmap 10:
+ - fix the livelock for real (yeah right), turned out
+ to be a stupid bug in page_launder_zone() (me)
+ - to make sure the VM subsystem doesn't monopolise
+ the CPU, let kswapd and some apps sleep a bit under
+ heavy stress situations (me)
+ - let __GFP_HIGH allocations dig a little bit deeper
+ into the free page pool, the SCSI layer seems fragile (me)
+rmap 9:
+ - improve comments all over the place (Michael Cohen)
+ - don't panic if page_remove_rmap() cannot find the
+ rmap in question, it's possible that the memory was
+ PG_reserved and belonging to a driver, but the driver
+ exited and cleared the PG_reserved bit (me)
+ - fix the VM livelock by replacing > by >= in a few
+ critical places in the pageout code (me)
+ - treat the reclaiming of an inactive_clean page like
+ allocating a new page, calling try_to_free_pages()
+ and/or fixup_freespace() if required (me)
+ - when low on memory, don't make things worse by
+ doing swapin_readahead (me)
+rmap 8:
+ - add ANY_ZONE to the balancing functions to improve
+ kswapd's balancing a bit (me)
+ - regularize some of the maximum loop bounds in
+ vmscan.c for cosmetic purposes (William Lee Irwin)
+ - move page_address() to architecture-independent
+ code, now the removal of page->virtual is portable (William Lee Irwin)
+ - speed up free_area_init_core() by doing a single
+ pass over the pages and not using atomic ops (William Lee Irwin)
+ - documented the buddy allocator in page_alloc.c (William Lee Irwin)
+rmap 7:
+ - clean up and document vmscan.c (me)
+ - reduce size of page struct, part one (William Lee Irwin)
+ - add rmap.h for other archs (untested, not for ARM) (me)
+rmap 6:
+ - make the active and inactive_dirty list per zone,
+ this is finally possible because we can free pages
+ based on their physical address (William Lee Irwin)
+ - cleaned up William's code a bit (me)
+ - turn some defines into inlines and move those to
+ mm_inline.h (the includes are a mess ...) (me)
+ - improve the VM balancing a bit (me)
+ - add back inactive_target to /proc/meminfo (me)
+rmap 5:
+ - fixed recursive buglet, introduced by directly
+ editing the patch for making rmap 4 ;))) (me)
+rmap 4:
+ - look at the referenced bits in page tables (me)
+rmap 3:
+ - forgot one FASTCALL definition (me)
+rmap 2:
+ - teach try_to_unmap_one() about mremap() (me)
+ - don't assign swap space to pages with buffers (me)
+ - make the rmap.c functions FASTCALL / inline (me)
+rmap 1:
+ - fix the swap leak in rmap 0 (Dave McCracken)
+rmap 0:
+ - port of reverse mapping VM to 2.4.16 (me)
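For anyone who has not followed the rmap series: the sketch below is an illustrative, stand-alone user-space model only -- it is NOT code from the patch, and every identifier in it (toy_page, toy_pte_chain, toy_page_add_rmap, toy_page_remove_rmap) is invented. It just shows the bookkeeping idea the VM gains: each page keeps a chain of the ptes that map it, so pageout can unmap a page without scanning every mm. In the patch itself this is what page_add_rmap()/page_remove_rmap() and try_to_unmap(), declared in include/linux/swap.h below, maintain and walk.

/*
 * Illustrative sketch only -- not part of the patch; all names invented.
 * Models the reverse-mapping chain hanging off each page.
 */
#include <stdlib.h>

struct toy_pte_chain {
	struct toy_pte_chain *next;
	unsigned long *ptep;		/* the page table entry mapping the page */
};

struct toy_page {
	struct toy_pte_chain *pte_chain;	/* reverse mapping chain */
};

static void toy_page_add_rmap(struct toy_page *page, unsigned long *ptep)
{
	struct toy_pte_chain *pc = malloc(sizeof(*pc));

	if (!pc)
		return;			/* toy model: just give up on OOM */
	pc->ptep = ptep;
	pc->next = page->pte_chain;
	page->pte_chain = pc;
}

static void toy_page_remove_rmap(struct toy_page *page, unsigned long *ptep)
{
	struct toy_pte_chain **prev = &page->pte_chain;

	while (*prev) {
		struct toy_pte_chain *pc = *prev;

		if (pc->ptep == ptep) {
			*prev = pc->next;
			free(pc);
			return;
		}
		prev = &pc->next;
	}
}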
diff -Nru a/Makefile b/Makefile
--- a/Makefile Fri Mar 1 18:19:44 2002
+++ b/Makefile Fri Mar 1 18:19:44 2002
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 19
-EXTRAVERSION = -pre2
+EXTRAVERSION = -pre2-rmap12g
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
diff -Nru a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
--- a/arch/arm/mm/mm-armv.c Fri Mar 1 18:19:44 2002
+++ b/arch/arm/mm/mm-armv.c Fri Mar 1 18:19:44 2002
@@ -19,6 +19,7 @@
#include <asm/page.h>
#include <asm/io.h>
#include <asm/setup.h>
+#include <asm/rmap.h>
#include <asm/mach/map.h>
@@ -457,6 +458,7 @@
* cache implementation.
*/
kmem_cache_t *pte_cache;
+kmem_cache_t *pte_rmap_cache;
/*
* The constructor gets called for each object within the cache when the
@@ -467,6 +469,22 @@
{
unsigned long block = (unsigned long)pte;
+ if (!(block & 2048)) {
+ /* First object of two in a page - allocate the
+ pte_rmap_info to go with them */
+
+ struct page * page = virt_to_page(pte);
+
+ if (flags & SLAB_CTOR_ATOMIC)
+ BUG();
+
+ page->mapping = kmem_cache_alloc(pte_rmap_cache, GFP_KERNEL);
+ if (!page->mapping) {
+ printk(KERN_CRIT "pte_rmap_cache alloc failed. Oops. Slab constructors need to be allowed to fail\n");
+ /* return -ENOMEM; */
+ BUG();
+ }
+ }
if (block & 2047)
BUG();
@@ -475,11 +493,32 @@
PTRS_PER_PTE * sizeof(pte_t), 0);
}
+static void pte_cache_dtor(void *pte, kmem_cache_t *cache, unsigned long flags)
+{
+ unsigned long block = (unsigned long)pte;
+
+ if (!(block & 2048)) {
+ /* First object of two in a page - free the
+ pte_rmap_info that was associated with them */
+
+ struct page * page = virt_to_page(pte);
+
+ kmem_cache_free(pte_rmap_cache, page->mapping);
+ page->mapping = NULL;
+ }
+}
+
void __init pgtable_cache_init(void)
{
+ pte_rmap_cache = kmem_cache_create("pte-rmap-cache",
+ 2 * sizeof(struct arm_rmap_info), 0, 0,
+ NULL, NULL);
+ if (!pte_rmap_cache)
+ BUG();
+
pte_cache = kmem_cache_create("pte-cache",
2 * PTRS_PER_PTE * sizeof(pte_t), 0, 0,
- pte_cache_ctor, NULL);
+ pte_cache_ctor, pte_cache_dtor);
if (!pte_cache)
BUG();
}
diff -Nru a/drivers/block/elevator.c b/drivers/block/elevator.c
--- a/drivers/block/elevator.c Fri Mar 1 18:19:44 2002
+++ b/drivers/block/elevator.c Fri Mar 1 18:19:44 2002
@@ -80,30 +80,38 @@
struct buffer_head *bh, int rw,
int max_sectors)
{
- struct list_head *entry = &q->queue_head;
- unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
-
+ struct list_head *entry;
+ unsigned int count = bh->b_size >> 9;
+ unsigned int ret = ELEVATOR_NO_MERGE;
+ int merge_only = 0;
+ const int max_bomb_segments = q->elevator.max_bomb_segments;
+
+ entry = &q->queue_head;
while ((entry = entry->prev) != head) {
struct request *__rq = blkdev_entry_to_request(entry);
- /*
- * simply "aging" of requests in queue
- */
- if (__rq->elevator_sequence-- <= 0)
- break;
-
+ if (__rq->elevator_sequence-- <= 0) {
+ /*
+ * OK, we've exceeded someone's latency limit.
+ * But we still continue to look for merges,
+ * because they're so much better than seeks.
+ */
+ merge_only = 1;
+ }
if (__rq->waiting)
continue;
if (__rq->rq_dev != bh->b_rdev)
continue;
- if (!*req && bh_rq_in_between(bh, __rq, &q->queue_head))
+ if (!*req && !merge_only &&
+ bh_rq_in_between(bh, __rq, &q->queue_head)) {
*req = __rq;
+ }
if (__rq->cmd != rw)
continue;
if (__rq->nr_sectors + count > max_sectors)
continue;
if (__rq->elevator_sequence < count)
- break;
+ merge_only = 1;
if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
ret = ELEVATOR_BACK_MERGE;
*req = __rq;
@@ -116,6 +124,56 @@
}
}
+ /*
+ * If we failed to merge a read anywhere in the request
+ * queue, we really don't want to place it at the end
+ * of the list, behind lots of writes. So place it near
+ * the front.
+ *
+ * We don't want to place it in front of _all_ writes: that
+ * would create lots of seeking, and isn't tunable.
+ * We try to avoid promoting this read in front of existing
+ * reads.
+ *
+ * max_bomb_sectors becomes the maximum number of write
+ * requests which we allow to remain in place in front of
+ * a newly introduced read. We weight things a little bit,
+ * so large writes are more expensive than small ones, but it's
+ * requests which count, not sectors.
+ */
+ if (max_bomb_segments && rw == READ && ret == ELEVATOR_NO_MERGE) {
+ int cur_latency = 0;
+ struct request * const cur_request = *req;
+
+ entry = head->next;
+ while (entry != &q->queue_head) {
+ struct request *__rq;
+
+ if (entry == &q->queue_head)
+ BUG();
+ if (entry == q->queue_head.next &&
+ q->head_active && !q->plugged)
+ BUG();
+ __rq = blkdev_entry_to_request(entry);
+
+ if (__rq == cur_request) {
+ /*
+ * This is where the old algorithm placed it.
+ * There's no point pushing it further back,
+ * so leave it here, in sorted order.
+ */
+ break;
+ }
+ if (__rq->cmd == WRITE) {
+ cur_latency += 1 + __rq->nr_sectors / 64;
+ if (cur_latency >= max_bomb_segments) {
+ *req = __rq;
+ break;
+ }
+ }
+ entry = entry->next;
+ }
+ }
return ret;
}
@@ -188,7 +246,7 @@
output.queue_ID = elevator->queue_ID;
output.read_latency = elevator->read_latency;
output.write_latency = elevator->write_latency;
- output.max_bomb_segments = 0;
+ output.max_bomb_segments = elevator->max_bomb_segments;
if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t)))
return -EFAULT;
@@ -207,9 +265,12 @@
return -EINVAL;
if (input.write_latency < 0)
return -EINVAL;
+ if (input.max_bomb_segments < 0)
+ return -EINVAL;
elevator->read_latency = input.read_latency;
elevator->write_latency = input.write_latency;
+ elevator->max_bomb_segments = input.max_bomb_segments;
return 0;
}
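To make the read-latency change above concrete, here is a small stand-alone sketch (not from the patch; the queue contents are invented) of the weighting elevator_linus_merge() now applies: each write in front of an unmerged read costs 1 + nr_sectors/64, and once the running total reaches max_bomb_segments (6 by default in ELEVATOR_LINUS) the scan stops and the read is queued near that point instead of at the tail.

/*
 * Illustrative sketch, not part of the patch.  Write sizes are made up;
 * the weighting mirrors the read-latency code added above.
 */
#include <stdio.h>

int main(void)
{
	const int max_bomb_segments = 6;		/* ELEVATOR_LINUS default */
	int write_sectors[] = { 8, 128, 8, 255, 8, 8 };	/* 512-byte sectors */
	int cur_latency = 0;
	unsigned int i;

	for (i = 0; i < sizeof(write_sectors) / sizeof(write_sectors[0]); i++) {
		cur_latency += 1 + write_sectors[i] / 64;
		printf("after write %u (%3d sectors): cur_latency = %d\n",
		       i, write_sectors[i], cur_latency);
		if (cur_latency >= max_bomb_segments) {
			printf("stop: the read goes in here, ahead of the remaining writes\n");
			break;
		}
	}
	return 0;
}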
diff -Nru a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
--- a/drivers/block/ll_rw_blk.c Fri Mar 1 18:19:44 2002
+++ b/drivers/block/ll_rw_blk.c Fri Mar 1 18:19:44 2002
@@ -1176,9 +1176,11 @@
* Free request slots per queue.
* (Half for reads, half for writes)
*/
- queue_nr_requests = 64;
- if (total_ram > MB(32))
- queue_nr_requests = 128;
+ queue_nr_requests = (total_ram >> 9) & ~15; /* One per half-megabyte */
+ if (queue_nr_requests < 32)
+ queue_nr_requests = 32;
+ if (queue_nr_requests > 1024)
+ queue_nr_requests = 1024;
/*
* Batch frees according to queue length
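For reference, the sketch below (not part of the patch; machine sizes invented) replays the queue_nr_requests arithmetic from the hunk above. total_ram is computed in kilobytes in blk_dev_init(), so this works out to roughly one request slot per half-megabyte of RAM, rounded down to a multiple of 16 and clamped to the range 32..1024.

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

static unsigned long queue_nr_requests(unsigned long total_ram_kb)
{
	unsigned long n = (total_ram_kb >> 9) & ~15UL;	/* one per 512 KB */

	if (n < 32)
		n = 32;
	if (n > 1024)
		n = 1024;
	return n;
}

int main(void)
{
	unsigned long sizes_kb[] = { 8 * 1024, 64 * 1024, 1024 * 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes_kb) / sizeof(sizes_kb[0]); i++)
		printf("%6lu MB RAM -> %lu request slots\n",
		       sizes_kb[i] >> 10, queue_nr_requests(sizes_kb[i]));
	return 0;	/* 8 MB -> 32, 64 MB -> 128, 1 GB -> 1024 */
}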
diff -Nru a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c Fri Mar 1 18:19:44 2002
+++ b/fs/buffer.c Fri Mar 1 18:19:44 2002
@@ -47,6 +47,7 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/completion.h>
+#include <linux/mm_inline.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -729,11 +730,9 @@
static void free_more_memory(void)
{
- zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
-
balance_dirty();
wakeup_bdflush();
- try_to_free_pages(zone, GFP_NOFS, 0);
+ try_to_free_pages(GFP_NOFS);
run_task_queue(&tq_disk);
current->policy |= SCHED_YIELD;
__set_current_state(TASK_RUNNING);
@@ -1046,7 +1045,6 @@
unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
- dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT;
tot = nr_free_buffer_pages();
dirty *= 100;
@@ -1078,18 +1076,17 @@
return;
/* If we're getting into imbalance, start write-out */
- spin_lock(&lru_list_lock);
- write_some_buffers(NODEV);
+ wakeup_bdflush();
/*
* And if we're _really_ out of balance, wait for
- * some of the dirty/locked buffers ourselves and
- * start bdflush.
+ * some of the dirty/locked buffers ourselves.
* This will throttle heavy writers.
*/
if (state > 0) {
+ spin_lock(&lru_list_lock);
+ write_some_buffers(NODEV);
wait_for_some_buffers(NODEV);
- wakeup_bdflush();
}
}
@@ -2592,10 +2589,9 @@
return 1;
}
-static int sync_page_buffers(struct buffer_head *head)
+static void sync_page_buffers(struct buffer_head *head)
{
struct buffer_head * bh = head;
- int tryagain = 0;
do {
if (!buffer_dirty(bh) && !buffer_locked(bh))
@@ -2605,15 +2601,11 @@
if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
continue;
- /* Second time through we start actively writing out.. */
- if (test_and_set_bit(BH_Lock, &bh->b_state)) {
- if (!test_bit(BH_launder, &bh->b_state))
- continue;
- wait_on_buffer(bh);
- tryagain = 1;
+ /* If we cannot lock the buffer just skip it. */
+ if (test_and_set_bit(BH_Lock, &bh->b_state))
continue;
- }
+ /* Second time through we start actively writing out.. */
if (!atomic_set_buffer_clean(bh)) {
unlock_buffer(bh);
continue;
@@ -2624,10 +2616,9 @@
set_bit(BH_launder, &bh->b_state);
bh->b_end_io = end_buffer_io_sync;
submit_bh(WRITE, bh);
- tryagain = 0;
} while ((bh = bh->b_this_page) != head);
- return tryagain;
+ return;
}
/*
@@ -2651,7 +2642,6 @@
{
struct buffer_head * tmp, * bh = page->buffers;
-cleaned_buffers_try_again:
spin_lock(&lru_list_lock);
write_lock(&hash_table_lock);
tmp = bh;
@@ -2694,15 +2684,9 @@
write_unlock(&hash_table_lock);
spin_unlock(&lru_list_lock);
gfp_mask = pf_gfp_mask(gfp_mask);
- if (gfp_mask & __GFP_IO) {
- if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
- if (sync_page_buffers(bh)) {
- /* no IO or waiting next time */
- gfp_mask = 0;
- goto cleaned_buffers_try_again;
- }
- }
- }
+ if ((gfp_mask & __GFP_IO) &&
+ ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)))
+ sync_page_buffers(bh);
if (balance_dirty_state() >= 0)
wakeup_bdflush();
return 0;
@@ -2951,7 +2935,7 @@
spin_lock(&lru_list_lock);
if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
- wait_for_some_buffers(NODEV);
+ run_task_queue(&tq_disk);
interruptible_sleep_on(&bdflush_wait);
}
}
@@ -2982,7 +2966,6 @@
complete((struct completion *)startup);
for (;;) {
- wait_for_some_buffers(NODEV);
/* update interval */
interval = bdf_prm.b_un.interval;
@@ -3011,6 +2994,7 @@
printk(KERN_DEBUG "kupdate() activated...\n");
#endif
sync_old_buffers();
+ run_task_queue(&tq_disk);
}
}
diff -Nru a/fs/dcache.c b/fs/dcache.c
--- a/fs/dcache.c Fri Mar 1 18:19:44 2002
+++ b/fs/dcache.c Fri Mar 1 18:19:44 2002
@@ -568,8 +568,7 @@
count = dentry_stat.nr_unused / priority;
prune_dcache(count);
- kmem_cache_shrink(dentry_cache);
- return 0;
+ return kmem_cache_shrink_nr(dentry_cache);
}
#define NAME_ALLOC_LEN(len) ((len+16) & ~15)
diff -Nru a/fs/dquot.c b/fs/dquot.c
--- a/fs/dquot.c Fri Mar 1 18:19:44 2002
+++ b/fs/dquot.c Fri Mar 1 18:19:44 2002
@@ -413,8 +413,7 @@
lock_kernel();
prune_dqcache(nr_free_dquots / (priority + 1));
unlock_kernel();
- kmem_cache_shrink(dquot_cachep);
- return 0;
+ return kmem_cache_shrink_nr(dquot_cachep);
}
/* NOTE: If you change this function please check whether dqput_blocks() works right... */
diff -Nru a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c Fri Mar 1 18:19:44 2002
+++ b/fs/exec.c Fri Mar 1 18:19:44 2002
@@ -35,6 +35,7 @@
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/personality.h>
+#include <linux/swap.h>
#define __NO_VERSION__
#include <linux/module.h>
@@ -279,6 +280,7 @@
flush_dcache_page(page);
flush_page_to_ram(page);
set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+ page_add_rmap(page, pte);
tsk->mm->rss++;
spin_unlock(&tsk->mm->page_table_lock);
diff -Nru a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c Fri Mar 1 18:19:44 2002
+++ b/fs/inode.c Fri Mar 1 18:19:44 2002
@@ -725,8 +725,7 @@
count = inodes_stat.nr_unused / priority;
prune_icache(count);
- kmem_cache_shrink(inode_cachep);
- return 0;
+ return kmem_cache_shrink_nr(inode_cachep);
}
/*
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c Fri Mar 1 18:19:44 2002
+++ b/fs/proc/proc_misc.c Fri Mar 1 18:19:44 2002
@@ -36,6 +36,7 @@
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/seq_file.h>
+#include <linux/mm_inline.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -164,7 +165,9 @@
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
"Active: %8u kB\n"
- "Inactive: %8u kB\n"
+ "Inact_dirty: %8u kB\n"
+ "Inact_clean: %8u kB\n"
+ "Inact_target: %8lu kB\n"
"HighTotal: %8lu kB\n"
"HighFree: %8lu kB\n"
"LowTotal: %8lu kB\n"
@@ -178,7 +181,9 @@
K(pg_size - swapper_space.nrpages),
K(swapper_space.nrpages),
K(nr_active_pages),
- K(nr_inactive_pages),
+ K(nr_inactive_dirty_pages),
+ K(nr_inactive_clean_pages),
+ K(inactive_target()),
K(i.totalhigh),
K(i.freehigh),
K(i.totalram-i.totalhigh),
diff -Nru a/include/asm-alpha/rmap.h b/include/asm-alpha/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-alpha/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _ALPHA_RMAP_H
+#define _ALPHA_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-arm/proc-armv/rmap.h b/include/asm-arm/proc-armv/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-arm/proc-armv/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,72 @@
+#ifndef _ARMV_RMAP_H
+#define _ARMV_RMAP_H
+/*
+ * linux/include/asm-arm/proc-armv/rmap.h
+ *
+ * Architecture dependant parts of the reverse mapping code,
+ *
+ * We use the struct page of the page table page to find a pointer
+ * to an array of two 'struct arm_rmap_info's, one for each of the
+ * two page tables in each page.
+ *
+ * - rmi->mm points to the process' mm_struct
+ * - rmi->index has the high bits of the address
+ * - the lower bits of the address are calculated from the
+ * offset of the page table entry within the page table page
+ */
+#include <linux/mm.h>
+
+struct arm_rmap_info {
+ struct mm_struct *mm;
+ unsigned long index;
+};
+
+static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address)
+{
+ struct page * page = virt_to_page(ptep);
+ struct arm_rmap_info *rmi = (void *)page->mapping;
+
+ if (((unsigned long)ptep)&2048)
+ rmi++;
+
+ rmi->mm = mm;
+ rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_remove_rmap(pte_t * ptep)
+{
+ struct page * page = virt_to_page(ptep);
+ struct arm_rmap_info *rmi = (void *)page->mapping;
+
+ if (((unsigned long)ptep)&2048)
+ rmi++;
+
+ rmi->mm = NULL;
+ rmi->index = 0;
+}
+
+static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
+{
+ struct page * page = virt_to_page(ptep);
+ struct arm_rmap_info *rmi = (void *)page->mapping;
+
+ if (((unsigned long)ptep)&2048)
+ rmi++;
+
+ return rmi->mm;
+}
+
+static inline unsigned long ptep_to_address(pte_t * ptep)
+{
+ struct page * page = virt_to_page(ptep);
+ struct arm_rmap_info *rmi = (void *)page->mapping;
+ unsigned long low_bits;
+
+ if (((unsigned long)ptep)&2048)
+ rmi++;
+
+ low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+ return rmi->index + low_bits;
+}
+
+#endif /* _ARMV_RMAP_H */
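A tiny sketch (not from the patch; the addresses are invented) of the slot selection done above: ARM packs two 2 KB page tables into each 4 KB page, so bit 11 of the pte pointer (the "& 2048" tests) picks which of the two arm_rmap_info entries behind page->mapping belongs to a given pte.

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

int main(void)
{
	/* two hypothetical pte addresses inside the same page table page */
	unsigned long ptep_lo = 0xc0012000UL;	/* first page table  */
	unsigned long ptep_hi = 0xc0012800UL;	/* second page table */

	printf("pte %#lx -> arm_rmap_info slot %d\n", ptep_lo, (ptep_lo & 2048) ? 1 : 0);
	printf("pte %#lx -> arm_rmap_info slot %d\n", ptep_hi, (ptep_hi & 2048) ? 1 : 0);
	return 0;
}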
diff -Nru a/include/asm-arm/rmap.h b/include/asm-arm/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-arm/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,6 @@
+#ifndef _ARM_RMAP_H
+#define _ARM_RMAP_H
+
+#include <asm/proc/rmap.h>
+
+#endif /* _ARM_RMAP_H */
diff -Nru a/include/asm-cris/rmap.h b/include/asm-cris/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-cris/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _CRIS_RMAP_H
+#define _CRIS_RMAP_H
+
+/* nothing to see, move along :) */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-generic/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,57 @@
+#ifndef _GENERIC_RMAP_H
+#define _GENERIC_RMAP_H
+/*
+ * linux/include/asm-generic/rmap.h
+ *
+ * Architecture dependant parts of the reverse mapping code,
+ * this version should work for most architectures with a
+ * 'normal' page table layout.
+ *
+ * We use the struct page of the page table page to find out
+ * the process and full address of a page table entry:
+ * - page->mapping points to the process' mm_struct
+ * - page->index has the high bits of the address
+ * - the lower bits of the address are calculated from the
+ * offset of the page table entry within the page table page
+ */
+#include <linux/mm.h>
+
+static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, unsigned long address)
+{
+ struct page * page = virt_to_page(ptep);
+#ifdef BROKEN_PPC_PTE_ALLOC_ONE
+ /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
+ extern int mem_init_done;
+
+ if (!mem_init_done)
+ return;
+#endif
+ page->mapping = (void *)mm;
+ page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_remove_rmap(pte_t * ptep)
+{
+ struct page * page = virt_to_page(ptep);
+
+ page->mapping = NULL;
+ page->index = 0;
+}
+
+static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
+{
+ struct page * page = virt_to_page(ptep);
+
+ return (struct mm_struct *) page->mapping;
+}
+
+static inline unsigned long ptep_to_address(pte_t * ptep)
+{
+ struct page * page = virt_to_page(ptep);
+ unsigned long low_bits;
+
+ low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+ return page->index + low_bits;
+}
+
+#endif /* _GENERIC_RMAP_H */
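The comment at the top of asm-generic/rmap.h describes how a pte's virtual address is reconstructed from the struct page of its page table page. The stand-alone sketch below (not part of the patch) works through one example with i386-style constants (4 KB pages, 4-byte ptes, 1024 ptes per page table); the particular addresses are invented.

/*
 * Illustrative sketch, not part of the patch.  Mirrors ptep_to_address():
 * page->index holds the (4 MB aligned) start of the range this page table
 * covers, and the pte's byte offset within the page supplies the low bits.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PTRS_PER_PTE	1024UL

int main(void)
{
	unsigned long page_index = 0xbfc00000UL;	/* as set by pgtable_add_rmap() */
	unsigned long ptep = 0xc030137cUL;		/* made-up pte address, offset 0x37c */

	unsigned long low_bits = (ptep & ~PAGE_MASK) * PTRS_PER_PTE;
	unsigned long address = page_index + low_bits;

	/* offset 0x37c / 4 bytes per pte = pte #223; 223 * 4096 = 0xdf000 */
	printf("pte at %#lx maps virtual address %#lx\n", ptep, address);
	return 0;	/* prints 0xbfcdf000 */
}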
diff -Nru a/include/asm-i386/rmap.h b/include/asm-i386/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-i386/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _I386_RMAP_H
+#define _I386_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-ia64/rmap.h b/include/asm-ia64/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-ia64/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _IA64_RMAP_H
+#define _IA64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-m68k/rmap.h b/include/asm-m68k/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-m68k/rmap.h Fri Mar 1 18:19:45 2002
@@ -0,0 +1,7 @@
+#ifndef _M86K_RMAP_H
+#define _M86K_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-mips/rmap.h b/include/asm-mips/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-mips/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _MIPS_RMAP_H
+#define _MIPS_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-mips64/rmap.h b/include/asm-mips64/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-mips64/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _MIPS64_RMAP_H
+#define _MIPS64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-parisc/rmap.h b/include/asm-parisc/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-parisc/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _PARISC_RMAP_H
+#define _PARISC_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-ppc/rmap.h b/include/asm-ppc/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-ppc/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,9 @@
+#ifndef _PPC_RMAP_H
+#define _PPC_RMAP_H
+
+/* PPC calls pte_alloc() before mem_map[] is setup ... */
+#define BROKEN_PPC_PTE_ALLOC_ONE
+
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-s390/rmap.h b/include/asm-s390/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-s390/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _S390_RMAP_H
+#define _S390_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-s390x/rmap.h b/include/asm-s390x/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-s390x/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _S390X_RMAP_H
+#define _S390X_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-sh/rmap.h b/include/asm-sh/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-sh/rmap.h Fri Mar 1 18:19:45 2002
@@ -0,0 +1,7 @@
+#ifndef _SH_RMAP_H
+#define _SH_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-sparc/rmap.h b/include/asm-sparc/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-sparc/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _SPARC_RMAP_H
+#define _SPARC_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-sparc64/rmap.h b/include/asm-sparc64/rmap.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-sparc64/rmap.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _SPARC64_RMAP_H
+#define _SPARC64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/linux/elevator.h b/include/linux/elevator.h
--- a/include/linux/elevator.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/elevator.h Fri Mar 1 18:19:44 2002
@@ -1,12 +1,9 @@
#ifndef _LINUX_ELEVATOR_H
#define _LINUX_ELEVATOR_H
-typedef void (elevator_fn) (struct request *, elevator_t *,
- struct list_head *,
- struct list_head *, int);
-
-typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *,
- struct buffer_head *, int, int);
+typedef int (elevator_merge_fn)(request_queue_t *, struct request **,
+ struct list_head *, struct buffer_head *bh,
+ int rw, int max_sectors);
typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int);
@@ -16,6 +13,7 @@
{
int read_latency;
int write_latency;
+ int max_bomb_segments;
elevator_merge_fn *elevator_merge_fn;
elevator_merge_cleanup_fn *elevator_merge_cleanup_fn;
@@ -24,13 +22,13 @@
unsigned int queue_ID;
};
-int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
-void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int);
-void elevator_noop_merge_req(struct request *, struct request *);
-
-int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
-void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
-void elevator_linus_merge_req(struct request *, struct request *);
+elevator_merge_fn elevator_noop_merge;
+elevator_merge_cleanup_fn elevator_noop_merge_cleanup;
+elevator_merge_req_fn elevator_noop_merge_req;
+
+elevator_merge_fn elevator_linus_merge;
+elevator_merge_cleanup_fn elevator_linus_merge_cleanup;
+elevator_merge_req_fn elevator_linus_merge_req;
typedef struct blkelv_ioctl_arg_s {
int queue_ID;
@@ -54,22 +52,6 @@
#define ELEVATOR_FRONT_MERGE 1
#define ELEVATOR_BACK_MERGE 2
-/*
- * This is used in the elevator algorithm. We don't prioritise reads
- * over writes any more --- although reads are more time-critical than
- * writes, by treating them equally we increase filesystem throughput.
- * This turns out to give better overall performance. -- sct
- */
-#define IN_ORDER(s1,s2) \
- ((((s1)->rq_dev == (s2)->rq_dev && \
- (s1)->sector < (s2)->sector)) || \
- (s1)->rq_dev < (s2)->rq_dev)
-
-#define BHRQ_IN_ORDER(bh, rq) \
- ((((bh)->b_rdev == (rq)->rq_dev && \
- (bh)->b_rsector < (rq)->sector)) || \
- (bh)->b_rdev < (rq)->rq_dev)
-
static inline int elevator_request_latency(elevator_t * elevator, int rw)
{
int latency;
@@ -85,7 +67,7 @@
((elevator_t) { \
0, /* read_latency */ \
0, /* write_latency */ \
- \
+ 0, /* max_bomb_segments */ \
elevator_noop_merge, /* elevator_merge_fn */ \
elevator_noop_merge_cleanup, /* elevator_merge_cleanup_fn */ \
elevator_noop_merge_req, /* elevator_merge_req_fn */ \
@@ -95,7 +77,7 @@
((elevator_t) { \
8192, /* read passovers */ \
16384, /* write passovers */ \
- \
+ 6, /* max_bomb_segments */ \
elevator_linus_merge, /* elevator_merge_fn */ \
elevator_linus_merge_cleanup, /* elevator_merge_cleanup_fn */ \
elevator_linus_merge_req, /* elevator_merge_req_fn */ \
diff -Nru a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/fs.h Fri Mar 1 18:19:44 2002
@@ -284,7 +284,7 @@
extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset);
-#define touch_buffer(bh) mark_page_accessed(bh->b_page)
+#define touch_buffer(bh) touch_page(bh->b_page)
#include <linux/pipe_fs_i.h>
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/mm.h Fri Mar 1 18:19:44 2002
@@ -17,9 +17,6 @@
extern unsigned long num_physpages;
extern void * high_memory;
extern int page_cluster;
-/* The inactive_clean lists are per zone. */
-extern struct list_head active_list;
-extern struct list_head inactive_list;
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -133,6 +130,9 @@
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
};
+/* forward declaration; pte_chain is meant to be internal to rmap.c */
+struct pte_chain;
+
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
@@ -159,6 +159,8 @@
updated asynchronously */
struct list_head lru; /* Pageout list, eg. active_list;
protected by pagemap_lru_lock !! */
+ unsigned char age; /* Page aging counter. */
+ struct pte_chain * pte_chain; /* Reverse pte mapping pointer. */
struct page **pprev_hash; /* Complement to *next_hash. */
struct buffer_head * buffers; /* Buffer maps us to a disk block. */
@@ -286,9 +288,9 @@
#define PG_referenced 2
#define PG_uptodate 3
#define PG_dirty 4
-#define PG_unused 5
-#define PG_lru 6
-#define PG_active 7
+#define PG_inactive_clean 5
+#define PG_active 6
+#define PG_inactive_dirty 7
#define PG_slab 8
#define PG_skip 10
#define PG_highmem 11
@@ -391,10 +393,19 @@
#define PageActive(page) test_bit(PG_active, &(page)->flags)
#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags)
+#define TestandSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
+#define TestandClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
+
+#define PageInactiveDirty(page) test_bit(PG_inactive_dirty, &(page)->flags)
+#define SetPageInactiveDirty(page) set_bit(PG_inactive_dirty, &(page)->flags)
+#define ClearPageInactiveDirty(page) clear_bit(PG_inactive_dirty, &(page)->flags)
+
+#define PageInactiveClean(page) test_bit(PG_inactive_clean, &(page)->flags)
+#define SetPageInactiveClean(page) set_bit(PG_inactive_clean, &(page)->flags)
+#define ClearPageInactiveClean(page) clear_bit(PG_inactive_clean, &(page)->flags)
-#define PageLRU(page) test_bit(PG_lru, &(page)->flags)
-#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags)
-#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags)
+#define PageLRU(pp) \
+ (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp))
#ifdef CONFIG_HIGHMEM
#define PageHighMem(page) test_bit(PG_highmem, &(page)->flags)
@@ -459,6 +470,7 @@
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr),0)
+extern void FASTCALL(fixup_freespace(struct zone_struct *, int));
extern void show_free_areas(void);
extern void show_free_areas_node(pg_data_t *pgdat);
diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/linux/mm_inline.h Fri Mar 1 18:19:44 2002
@@ -0,0 +1,294 @@
+#ifndef _LINUX_VM_INLINE_H
+#define _LINUX_VM_INLINE_H
+
+#include <linux/mm.h>
+
+/*
+ * These inline functions tend to need bits and pieces of all the
+ * other VM include files, meaning they cannot be defined inside
+ * one of the other VM include files.
+ *
+ * The include file mess really needs to be cleaned up...
+ */
+
+static inline void add_page_to_active_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ DEBUG_LRU_PAGE(page);
+ SetPageActive(page);
+ list_add(&page->lru, &zone->active_list);
+ zone->active_pages++;
+ nr_active_pages++;
+}
+
+static inline void add_page_to_inactive_dirty_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ DEBUG_LRU_PAGE(page);
+ SetPageInactiveDirty(page);
+ list_add(&page->lru, &zone->inactive_dirty_list);
+ zone->inactive_dirty_pages++;
+ nr_inactive_dirty_pages++;
+}
+
+static inline void add_page_to_inactive_clean_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ DEBUG_LRU_PAGE(page);
+ SetPageInactiveClean(page);
+ list_add(&page->lru, &zone->inactive_clean_list);
+ zone->inactive_clean_pages++;
+ nr_inactive_clean_pages++;
+}
+
+static inline void del_page_from_active_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ list_del(&page->lru);
+ ClearPageActive(page);
+ nr_active_pages--;
+ zone->active_pages--;
+ DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_dirty_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ list_del(&page->lru);
+ ClearPageInactiveDirty(page);
+ nr_inactive_dirty_pages--;
+ zone->inactive_dirty_pages--;
+ DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_clean_list(struct page * page)
+{
+ struct zone_struct * zone = page_zone(page);
+ list_del(&page->lru);
+ ClearPageInactiveClean(page);
+ zone->inactive_clean_pages--;
+ nr_inactive_clean_pages--;
+ DEBUG_LRU_PAGE(page);
+}
+
+/*
+ * Inline functions to control some balancing in the VM.
+ *
+ * Note that we do both global and per-zone balancing, with
+ * most of the balancing done globally.
+ */
+#define PLENTY_FACTOR 2
+#define ALL_ZONES NULL
+#define ANY_ZONE (struct zone_struct *)(~0UL)
+#define INACTIVE_FACTOR 5
+
+#define VM_MIN 0
+#define VM_LOW 1
+#define VM_HIGH 2
+#define VM_PLENTY 3
+static inline int zone_free_limit(struct zone_struct * zone, int limit)
+{
+ int free, target, delta;
+
+ /* This is really nasty, but GCC should completely optimise it away. */
+ if (limit == VM_MIN)
+ target = zone->pages_min;
+ else if (limit == VM_LOW)
+ target = zone->pages_low;
+ else if (limit == VM_HIGH)
+ target = zone->pages_high;
+ else
+ target = zone->pages_high * PLENTY_FACTOR;
+
+ free = zone->free_pages + zone->inactive_clean_pages;
+ delta = target - free;
+
+ return delta;
+}
+
+static inline int free_limit(struct zone_struct * zone, int limit)
+{
+ int shortage = 0, local;
+
+ if (zone == ALL_ZONES) {
+ for_each_zone(zone)
+ shortage += zone_free_limit(zone, limit);
+ } else if (zone == ANY_ZONE) {
+ for_each_zone(zone) {
+ local = zone_free_limit(zone, limit);
+ shortage += max(local, 0);
+ }
+ } else {
+ shortage = zone_free_limit(zone, limit);
+ }
+
+ return shortage;
+}
+
+/**
+ * free_min - test for critically low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a serious shortage of free and
+ * clean pages, zero or negative if there is no serious shortage.
+ */
+static inline int free_min(struct zone_struct * zone)
+{
+ return free_limit(zone, VM_MIN);
+}
+
+/**
+ * free_low - test for low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a shortage of free and
+ * clean pages, zero or negative if there is no shortage.
+ */
+static inline int free_low(struct zone_struct * zone)
+{
+ return free_limit(zone, VM_LOW);
+}
+
+/**
+ * free_high - test if amount of free pages is less than ideal
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free and clean
+ * pages is below kswapd's target, zero or negative if we
+ * have more than enough free and clean pages.
+ */
+static inline int free_high(struct zone_struct * zone)
+{
+ return free_limit(zone, VM_HIGH);
+}
+
+/**
+ * free_plenty - test if enough pages are freed
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free + clean pages
+ * in a zone is not yet excessive and kswapd is still allowed to
+ * free pages here, a negative value if kswapd should leave the
+ * zone alone.
+ */
+static inline int free_plenty(struct zone_struct * zone)
+{
+ return free_limit(zone, VM_PLENTY);
+}
+
+/*
+ * The inactive page target is the free target + 20% of (active + inactive)
+ * pages.
+ */
+static inline int zone_inactive_limit(struct zone_struct * zone, int limit)
+{
+ int inactive, target, inactive_base;
+
+ inactive_base = zone->active_pages + zone->inactive_dirty_pages;
+ inactive_base /= INACTIVE_FACTOR;
+
+ /* GCC should optimise this away completely. */
+ if (limit == VM_MIN)
+ target = zone->pages_high + inactive_base / 2;
+ else if (limit == VM_LOW)
+ target = zone->pages_high + inactive_base;
+ else
+ target = zone->pages_high + inactive_base * 2;
+
+ inactive = zone->free_pages + zone->inactive_clean_pages;
+ inactive += zone->inactive_dirty_pages;
+
+ return target - inactive;
+}
+
+static inline int inactive_limit(struct zone_struct * zone, int limit)
+{
+ int shortage = 0, local;
+
+ if (zone == ALL_ZONES) {
+ for_each_zone(zone)
+ shortage += zone_inactive_limit(zone, limit);
+ } else if (zone == ANY_ZONE) {
+ for_each_zone(zone) {
+ local = zone_inactive_limit(zone, limit);
+ shortage += max(local, 0);
+ }
+ } else {
+ shortage = zone_inactive_limit(zone, limit);
+ }
+
+ return shortage;
+}
+
+/**
+ * inactive_min - test for serious shortage of (free + inactive clean) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no serious shortage of (free + inactive clean) pages
+ */
+static inline int inactive_min(struct zone_struct * zone)
+{
+ return inactive_limit(zone, VM_MIN);
+}
+
+/**
+ * inactive_low - test for shortage of (free + inactive clean) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no shortage of (free + inactive clean) pages
+ */
+static inline int inactive_low(struct zone_struct * zone)
+{
+ return inactive_limit(zone, VM_LOW);
+}
+
+/**
+ * inactive_high - less than ideal amount of (free + inactive) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have more than enough (free + inactive) pages
+ */
+static inline int inactive_high(struct zone_struct * zone)
+{
+ return inactive_limit(zone, VM_HIGH);
+}
+
+/*
+ * inactive_target - number of inactive pages we ought to have.
+ */
+static inline int inactive_target(void)
+{
+ int target;
+
+ target = nr_active_pages + nr_inactive_dirty_pages
+ + nr_inactive_clean_pages;
+
+ target /= INACTIVE_FACTOR;
+
+ return target;
+}
+
+/*
+ * Called whenever the VM references a page. We immediately reclaim
+ * the inactive clean pages because those are counted as freeable.
+ * We don't modify the inactive dirty ones because we're never sure
+ * if those are freeable anyway.
+ */
+static inline void touch_page(struct page * page)
+{
+ if (PageInactiveClean(page)) {
+ struct zone_struct * zone = page_zone(page);
+ int free = zone->free_pages + zone->inactive_clean_pages;
+ activate_page(page);
+ if (free < zone->pages_low)
+ wakeup_kswapd(GFP_NOIO);
+ if (zone->free_pages < zone->pages_min)
+ fixup_freespace(zone, 1);
+ } else
+ SetPageReferenced(page);
+}
+
+#endif
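As a worked example of the balancing helpers above (not part of the patch; every number below is invented), this sketch plugs a hypothetical zone into the zone_free_limit()/zone_inactive_limit() arithmetic: free_low() measures free + inactive_clean pages against pages_low, and the inactive target is pages_high plus one fifth (INACTIVE_FACTOR) of the active + inactive_dirty pages. A positive result means a shortage.

/* Illustrative sketch, not part of the patch; all zone numbers invented. */
#include <stdio.h>

int main(void)
{
	/* hypothetical single-zone state, in pages */
	long pages_low = 510, pages_high = 765;
	long free_pages = 300, inactive_clean = 150;
	long inactive_dirty = 2000, active = 8000;

	/* free_low(): free + clean pages measured against pages_low */
	long free = free_pages + inactive_clean;		/* 450 */
	printf("free_low shortage     = %ld\n", pages_low - free);	/* 60 */

	/* inactive_low(): pages_high + (active + dirty) / INACTIVE_FACTOR */
	long inactive_base = (active + inactive_dirty) / 5;	/* 2000 */
	long target = pages_high + inactive_base;		/* 2765 */
	long inactive = free + inactive_dirty;			/* 2450 */
	printf("inactive_low shortage = %ld\n", target - inactive);	/* 315 */

	return 0;
}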
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/mmzone.h Fri Mar 1 18:19:44 2002
@@ -40,12 +40,18 @@
*/
spinlock_t lock;
unsigned long free_pages;
- unsigned long pages_min, pages_low, pages_high;
+ unsigned long active_pages;
+ unsigned long inactive_dirty_pages;
+ unsigned long inactive_clean_pages;
+ unsigned long pages_min, pages_low, pages_high, pages_plenty;
int need_balance;
/*
* free areas of different sizes
*/
+ struct list_head active_list;
+ struct list_head inactive_dirty_list;
+ struct list_head inactive_clean_list;
free_area_t free_area[MAX_ORDER];
/*
@@ -143,9 +149,6 @@
extern int numnodes;
extern pg_data_t *pgdat_list;
-#define memclass(pgzone, classzone) (((pgzone)->zone_pgdat == (classzone)->zone_pgdat) \
- && ((pgzone) <= (classzone)))
-
/*
* The following two are not meant for general usage. They are here as
* prototypes for the discontig memory code.
@@ -157,6 +160,60 @@
struct page *pmap);
extern pg_data_t contig_page_data;
+
+/**
+ * for_each_pgdat - helper macro to iterate over all nodes
+ * @pgdat - pg_data_t * variable
+ *
+ * Meant to help with common loops of the form
+ * pgdat = pgdat_list;
+ * while(pgdat) {
+ * ...
+ * pgdat = pgdat->node_next;
+ * }
+ */
+#define for_each_pgdat(pgdat) \
+ for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ * Thanks to William Lee Irwin III for this piece of ingenuity.
+ */
+static inline zone_t *next_zone(zone_t *zone)
+{
+ pg_data_t *pgdat = zone->zone_pgdat;
+
+ if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
+ zone++;
+
+ else if (pgdat->node_next) {
+ pgdat = pgdat->node_next;
+ zone = pgdat->node_zones;
+ } else
+ zone = NULL;
+
+ return zone;
+}
+
+/**
+ * for_each_zone - helper macro to iterate over all memory zones
+ * @zone - zone_t * variable
+ *
+ * The user only needs to declare the zone variable, for_each_zone
+ * fills it in. This basically means for_each_zone() is an
+ * easier to read version of this piece of code:
+ *
+ * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+ * for(i = 0; i < MAX_NR_ZONES; ++i) {
+ * zone_t * z = pgdat->node_zones + i;
+ * ...
+ * }
+ * }
+ */
+#define for_each_zone(zone) \
+ for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
+
#ifndef CONFIG_DISCONTIGMEM
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/sched.h Fri Mar 1 18:19:44 2002
@@ -225,7 +225,7 @@
unsigned long rss, total_vm, locked_vm;
unsigned long def_flags;
unsigned long cpu_vm_mask;
- unsigned long swap_address;
+ unsigned long rlimit_rss;
unsigned dumpable:1;
@@ -244,6 +244,7 @@
mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \
page_table_lock: SPIN_LOCK_UNLOCKED, \
mmlist: LIST_HEAD_INIT(name.mmlist), \
+ rlimit_rss: RLIM_INFINITY, \
}
struct signal_struct {
@@ -325,8 +326,6 @@
struct task_struct *next_task, *prev_task;
struct mm_struct *active_mm;
- struct list_head local_pages;
- unsigned int allocation_order, nr_local_pages;
/* task state */
struct linux_binfmt *binfmt;
diff -Nru a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/slab.h Fri Mar 1 18:19:44 2002
@@ -55,6 +55,7 @@
void (*)(void *, kmem_cache_t *, unsigned long));
extern int kmem_cache_destroy(kmem_cache_t *);
extern int kmem_cache_shrink(kmem_cache_t *);
+extern int kmem_cache_shrink_nr(kmem_cache_t *);
extern void *kmem_cache_alloc(kmem_cache_t *, int);
extern void kmem_cache_free(kmem_cache_t *, void *);
diff -Nru a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/swap.h Fri Mar 1 18:19:44 2002
@@ -86,8 +86,8 @@
extern unsigned int nr_free_pages(void);
extern unsigned int nr_free_buffer_pages(void);
extern int nr_active_pages;
-extern int nr_inactive_pages;
-extern atomic_t nr_async_pages;
+extern int nr_inactive_dirty_pages;
+extern int nr_inactive_clean_pages;
extern atomic_t page_cache_size;
extern atomic_t buffermem_pages;
extern spinlock_t pagecache_lock;
@@ -100,18 +100,39 @@
struct zone_t;
+/* linux/mm/rmap.c */
+extern int FASTCALL(page_referenced(struct page *));
+extern void FASTCALL(page_add_rmap(struct page *, pte_t *));
+extern void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+extern int FASTCALL(try_to_unmap(struct page *));
+extern int FASTCALL(page_over_rsslimit(struct page *));
+
+/* return values of try_to_unmap */
+#define SWAP_SUCCESS 0
+#define SWAP_AGAIN 1
+#define SWAP_FAIL 2
+#define SWAP_ERROR 3
+
/* linux/mm/swap.c */
+extern int total_swap_pages;
extern void FASTCALL(lru_cache_add(struct page *));
extern void FASTCALL(__lru_cache_del(struct page *));
extern void FASTCALL(lru_cache_del(struct page *));
extern void FASTCALL(activate_page(struct page *));
+extern void FASTCALL(activate_page_nolock(struct page *));
+extern void FASTCALL(deactivate_page(struct page *));
+extern void FASTCALL(deactivate_page_nolock(struct page *));
+extern void FASTCALL(drop_page(struct page *));
extern void swap_setup(void);
/* linux/mm/vmscan.c */
+extern struct page * FASTCALL(reclaim_page(zone_t *));
extern wait_queue_head_t kswapd_wait;
-extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
+extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask));
+extern void wakeup_kswapd(unsigned int);
+extern void rss_free_pages(unsigned int);
/* linux/mm/page_io.c */
extern void rw_swap_page(int, struct page *);
@@ -125,6 +146,7 @@
extern void show_swap_cache_info(void);
#endif
extern int add_to_swap_cache(struct page *, swp_entry_t);
+extern int add_to_swap(struct page *);
extern void __delete_from_swap_cache(struct page *page);
extern void delete_from_swap_cache(struct page *page);
extern void free_page_and_swap_cache(struct page *page);
@@ -158,7 +180,14 @@
extern spinlock_t pagemap_lru_lock;
-extern void FASTCALL(mark_page_accessed(struct page *));
+/*
+ * Page aging defines. These seem to work great in FreeBSD,
+ * no need to reinvent the wheel.
+ */
+#define PAGE_AGE_START 5
+#define PAGE_AGE_ADV 3
+#define PAGE_AGE_DECL 1
+#define PAGE_AGE_MAX 64
/*
* List add/del helper macros. These must be called
@@ -166,38 +195,12 @@
*/
#define DEBUG_LRU_PAGE(page) \
do { \
- if (!PageLRU(page)) \
- BUG(); \
if (PageActive(page)) \
BUG(); \
-} while (0)
-
-#define add_page_to_active_list(page) \
-do { \
- DEBUG_LRU_PAGE(page); \
- SetPageActive(page); \
- list_add(&(page)->lru, &active_list); \
- nr_active_pages++; \
-} while (0)
-
-#define add_page_to_inactive_list(page) \
-do { \
- DEBUG_LRU_PAGE(page); \
- list_add(&(page)->lru, &inactive_list); \
- nr_inactive_pages++; \
-} while (0)
-
-#define del_page_from_active_list(page) \
-do { \
- list_del(&(page)->lru); \
- ClearPageActive(page); \
- nr_active_pages--; \
-} while (0)
-
-#define del_page_from_inactive_list(page) \
-do { \
- list_del(&(page)->lru); \
- nr_inactive_pages--; \
+ if (PageInactiveDirty(page)) \
+ BUG(); \
+ if (PageInactiveClean(page)) \
+ BUG(); \
} while (0)
extern spinlock_t swaplock;
diff -Nru a/include/linux/swapctl.h b/include/linux/swapctl.h
--- a/include/linux/swapctl.h Fri Mar 1 18:19:44 2002
+++ b/include/linux/swapctl.h Fri Mar 1 18:19:44 2002
@@ -10,4 +10,13 @@
typedef pager_daemon_v1 pager_daemon_t;
extern pager_daemon_t pager_daemon;
+typedef struct freepages_v1
+{
+ unsigned int min;
+ unsigned int low;
+ unsigned int high;
+} freepages_v1;
+typedef freepages_v1 freepages_t;
+extern freepages_t freepages;
+
#endif /* _LINUX_SWAPCTL_H */
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c Fri Mar 1 18:19:44 2002
+++ b/kernel/fork.c Fri Mar 1 18:19:44 2002
@@ -139,7 +139,6 @@
mm->map_count = 0;
mm->rss = 0;
mm->cpu_vm_mask = 0;
- mm->swap_address = 0;
pprev = &mm->mmap;
/*
@@ -263,9 +262,6 @@
void mmput(struct mm_struct *mm)
{
if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
- extern struct mm_struct *swap_mm;
- if (swap_mm == mm)
- swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
list_del(&mm->mmlist);
mmlist_nr--;
spin_unlock(&mmlist_lock);
@@ -658,8 +654,6 @@
#endif
p->lock_depth = -1; /* -1 = no lock */
p->start_time = jiffies;
-
- INIT_LIST_HEAD(&p->local_pages);
retval = -ENOMEM;
/* copy all the process information */
diff -Nru a/kernel/sys.c b/kernel/sys.c
--- a/kernel/sys.c Fri Mar 1 18:19:44 2002
+++ b/kernel/sys.c Fri Mar 1 18:19:44 2002
@@ -1128,6 +1128,12 @@
if (resource == RLIMIT_NOFILE) {
if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
return -EPERM;
+ } else if (resource == RLIMIT_RSS && current->mm) {
+ /* rlimit is specified in bytes, convert to pages */
+ unsigned long pages = RLIM_INFINITY;
+ if (new_rlim.rlim_cur != RLIM_INFINITY)
+ pages = new_rlim.rlim_cur >> PAGE_SHIFT;
+ current->mm->rlimit_rss = pages;
}
*old_rlim = new_rlim;
return 0;
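The sys_setrlimit() hunk above converts the RSS rlimit from bytes to pages before caching it in mm->rlimit_rss. The one-liner below (not part of the patch; figures invented) just shows that arithmetic for a 64 MB limit on a 4 KB-page machine.

/* Illustrative sketch, not part of the patch. */
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KB pages */

int main(void)
{
	unsigned long rlim_cur = 64UL * 1024 * 1024;	/* bytes, from userspace */
	unsigned long pages = rlim_cur >> PAGE_SHIFT;

	printf("RLIMIT_RSS of %lu bytes -> rlimit_rss = %lu pages\n",
	       rlim_cur, pages);	/* 67108864 bytes -> 16384 pages */
	return 0;
}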
diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c Fri Mar 1 18:19:44 2002
+++ b/kernel/sysctl.c Fri Mar 1 18:19:44 2002
@@ -260,6 +260,8 @@
};
static ctl_table vm_table[] = {
+ {VM_FREEPG, "freepages",
+ &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
{VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
&proc_dointvec_minmax, &sysctl_intvec, NULL,
&bdflush_min, &bdflush_max},
diff -Nru a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile Fri Mar 1 18:19:44 2002
+++ b/mm/Makefile Fri Mar 1 18:19:44 2002
@@ -14,7 +14,7 @@
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
- shmem.o
+ shmem.o rmap.o
obj-$(CONFIG_HIGHMEM) += highmem.o
diff -Nru a/mm/TODO b/mm/TODO
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/mm/TODO Fri Mar 1 18:19:44 2002
@@ -0,0 +1,35 @@
+ VM TODO list
+
+Forever valid TODO entries:
+ - keep up with the official kernel
+ - port over bugfixes
+ - minimise the diff by keeping code in sync where possible
+
+Easy short-term features:
+ - reclaim swap space from refill_inactive()
+ - simplify SMP locking
+ - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with
+ one single function using a for_each_pte() macro
+ for_each_pte(ptep, mm, start_address, end_address)
+ - fix page_launder() to not eat horrible amounts of CPU or flush
+ all pages to disk at once
+ - better VM balancing, clean vs. dirty ratio
+ - fix loopback device deadlock
+ <akpm> riel: nr_fract=70%, nr_fract_sync=80%
+ <akpm> riel: setup a loopback fs ext2-on-ext2
+ <akpm> riel: boot with mem=64m
+ <akpm> riel: then write a 500 meg file.
+ <akpm> riel: current kernel livelocks.
+ - stabilise pte_highmem and integrate it with rmap
+
+Long-term features:
+ - extensive VM statistics
+ - IO clustering for page_launder() and sync_old_buffers()
+ - readahead on per-VMA level (+ drop behind?)
+ - more graceful degradation when the load gets high
+ - reducing readahead
+ - unfair pageout so not all apps fall over
+ - memory objects, using pagecache and tmpfs for storage so
+ the memory object itself doesn't introduce any new overhead
+ - using the memory objects, removing page table copying from fork()
+ - load control able to deal with really extreme loads, swapping
diff -Nru a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c Fri Mar 1 18:19:44 2002
+++ b/mm/bootmem.c Fri Mar 1 18:19:44 2002
@@ -326,12 +326,11 @@
pg_data_t *pgdat = pgdat_list;
void *ptr;
- while (pgdat) {
+ for_each_pgdat(pgdat)
if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
align, goal)))
return(ptr);
- pgdat = pgdat->node_next;
- }
+
/*
* Whoops, we cannot satisfy the allocation request.
*/
diff -Nru a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c Fri Mar 1 18:19:44 2002
+++ b/mm/filemap.c Fri Mar 1 18:19:44 2002
@@ -22,6 +22,7 @@
#include <linux/swapctl.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/iobuf.h>
#include <linux/compiler.h>
@@ -234,7 +235,7 @@
static void truncate_complete_page(struct page *page)
{
/* Leave it on the LRU if it gets converted into anonymous buffers */
- if (!page->buffers || do_flushpage(page, 0))
+ if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0)))
lru_cache_del(page);
/*
@@ -454,6 +455,11 @@
return page;
}
+static struct page * __find_page(struct address_space * mapping, unsigned long index)
+{
+ return __find_page_nolock(mapping, index, *page_hash(mapping,index));
+}
+
static int do_buffer_fdatasync(struct list_head *head, unsigned long start,
unsigned long end, int (*fn)(struct page *))
{
struct list_head *curr;
@@ -1016,7 +1022,53 @@
/*
- * Same as grab_cache_page, but do not wait if the page is unavailable.
+ * We combine this with read-ahead to deactivate pages when we
+ * think there's sequential IO going on. Note that this is
+ * harmless since we don't actually evict the pages from memory
+ * but just move them to the inactive list.
+ *
+ * TODO:
+ * - make the readahead code smarter
+ * - move readahead to the VMA level so we can do the same
+ * trick with mmap()
+ *
+ * Rik van Riel, 2000
+ */
+static void drop_behind(struct file * file, unsigned long index)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ unsigned long start;
+
+ /* Nothing to drop-behind if we're on the first page. */
+ if (!index)
+ return;
+
+ if (index > file->f_rawin)
+ start = index - file->f_rawin;
+ else
+ start = 0;
+
+ /*
+ * Go backwards from index-1 and drop all pages in the
+ * readahead window. Since the readahead window may have
+ * been increased since the last time we were called, we
+ * stop when the page isn't there.
+ */
+ spin_lock(&pagemap_lru_lock);
+ while (--index >= start) {
+ spin_lock(&pagecache_lock);
+ page = __find_page(mapping, index);
+ spin_unlock(&pagecache_lock);
+ if (!page || !PageActive(page))
+ break;
+ drop_page(page);
+ }
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/* Same as grab_cache_page, but do not wait if the page is unavailable.
* This is intended for speculative data generators, where the data can
* be regenerated if the page couldn't be grabbed. This routine should
* be safe to call while holding the lock for another page.
@@ -1286,6 +1338,12 @@
if (filp->f_ramax > max_readahead)
filp->f_ramax = max_readahead;
+ /*
+ * Move the pages that have already been passed
+ * to the inactive list.
+ */
+ drop_behind(filp, index);
+
#ifdef PROFILE_READAHEAD
profile_readahead((reada_ok == 2), filp);
#endif
@@ -1294,25 +1352,6 @@
return;
}
-/*
- * Mark a page as having seen activity.
- *
- * If it was already so marked, move it
- * to the active queue and drop the referenced
- * bit. Otherwise, just mark it for future
- * action..
- */
-void mark_page_accessed(struct page *page)
-{
- if (!PageActive(page) && PageReferenced(page)) {
- activate_page(page);
- ClearPageReferenced(page);
- return;
- }
-
- /* Mark the page referenced, AFTER checking for previous usage.. */
- SetPageReferenced(page);
-}
/*
* This is a generic file read routine, and uses the
@@ -1421,7 +1460,7 @@
* beginning or we just did an lseek.
*/
if (!offset || !filp->f_reada)
- mark_page_accessed(page);
+ touch_page(page);
/*
* Ok, we have the page, and it's up-to-date, so
@@ -1822,7 +1861,7 @@
nr = max;
/* And limit it to a sane percentage of the inactive list.. */
- max = nr_inactive_pages / 2;
+ max = nr_inactive_clean_pages / 2;
if (nr > max)
nr = max;
@@ -1967,7 +2006,7 @@
* Found the page and have a reference on it, need to check sharing
* and possibly copy it over to another page..
*/
- mark_page_accessed(page);
+ touch_page(page);
flush_page_to_ram(page);
return page;
@@ -2840,7 +2879,7 @@
page = __read_cache_page(mapping, index, filler, data);
if (IS_ERR(page))
goto out;
- mark_page_accessed(page);
+ touch_page(page);
if (Page_Uptodate(page))
goto out;
@@ -3037,6 +3076,7 @@
unsigned long index, offset;
long page_fault;
char *kaddr;
+ int deactivate = 1;
/*
* Try to find the page in the cache. If it isn't there,
@@ -3045,8 +3085,10 @@
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
index = pos >> PAGE_CACHE_SHIFT;
bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
+ if (bytes > count) {
bytes = count;
+ deactivate = 0;
+ }
/*
* Bring in the user page that we will copy from _first_.
@@ -3090,8 +3132,11 @@
unlock:
kunmap(page);
/* Mark it unlocked again and drop the page.. */
- SetPageReferenced(page);
UnlockPage(page);
+ if (deactivate)
+ deactivate_page(page);
+ else
+ touch_page(page);
page_cache_release(page);
if (status < 0)
diff -Nru a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c Fri Mar 1 18:19:44 2002
+++ b/mm/memory.c Fri Mar 1 18:19:44 2002
@@ -45,8 +45,10 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/module.h>
+#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
+#include <asm/rmap.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -102,6 +104,7 @@
}
pte = pte_offset(dir, 0);
pmd_clear(dir);
+ pgtable_remove_rmap(pte);
pte_free(pte);
}
@@ -236,9 +239,11 @@
if (pte_none(pte))
goto cont_copy_pte_range_noset;
+ /* pte contains position in swap, so copy. */
if (!pte_present(pte)) {
swap_duplicate(pte_to_swp_entry(pte));
- goto cont_copy_pte_range;
+ set_pte(dst_pte, pte);
+ goto cont_copy_pte_range_noset;
}
ptepage = pte_page(pte);
if ((!VALID_PAGE(ptepage)) ||
@@ -246,7 +251,7 @@
goto cont_copy_pte_range;
/* If it's a COW mapping, write protect it both
in the parent and the child */
- if (cow && pte_write(pte)) {
+ if (cow) {
ptep_set_wrprotect(src_pte);
pte = *src_pte;
}
@@ -259,6 +264,7 @@
dst->rss++;
cont_copy_pte_range: set_pte(dst_pte, pte);
+ page_add_rmap(ptepage, dst_pte);
cont_copy_pte_range_noset: address += PAGE_SIZE;
if (address >= end)
goto out_unlock;
@@ -314,8 +320,10 @@
continue;
if (pte_present(pte)) {
struct page *page = pte_page(pte);
- if (VALID_PAGE(page) && !PageReserved(page))
+ if (VALID_PAGE(page) && !PageReserved(page)) {
freed ++;
+ page_remove_rmap(page, ptep);
+ }
/* This will eventually call __free_pte on the pte. */
tlb_remove_page(tlb, ptep, address + offset);
} else {
@@ -980,7 +988,9 @@
if (pte_same(*page_table, pte)) {
if (PageReserved(old_page))
++mm->rss;
+ page_remove_rmap(old_page, page_table);
break_cow(vma, new_page, address, page_table);
+ page_add_rmap(new_page, page_table);
lru_cache_add(new_page);
/* Free the old page.. */
@@ -1093,6 +1103,10 @@
struct page *new_page;
unsigned long offset;
+ /* Low on free memory ? Don't make things worse. */
+ if (free_low(ALL_ZONES) < 0)
+ return;
+
/*
* Get the number of handles we should do readahead io to.
*/
@@ -1141,7 +1155,7 @@
ret = 2;
}
- mark_page_accessed(page);
+ touch_page(page);
lock_page(page);
@@ -1172,6 +1186,7 @@
flush_page_to_ram(page);
flush_icache_page(vma, page);
set_pte(page_table, pte);
+ page_add_rmap(page, page_table);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
@@ -1187,14 +1202,13 @@
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
pte_t entry;
+ struct page * page = ZERO_PAGE(addr);
/* Read-only mapping of ZERO_PAGE. */
entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
/* ..except if it's a write access */
if (write_access) {
- struct page *page;
-
/* Allocate our own private page. */
spin_unlock(&mm->page_table_lock);
@@ -1213,10 +1227,10 @@
flush_page_to_ram(page);
entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
vma->vm_page_prot)));
lru_cache_add(page);
- mark_page_accessed(page);
}
set_pte(page_table, entry);
+ page_add_rmap(page, page_table); /* ignores ZERO_PAGE */
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
@@ -1271,6 +1285,8 @@
new_page = page;
}
+ touch_page(new_page);
+
spin_lock(&mm->page_table_lock);
/*
* This silly early PAGE_DIRTY setting removes a race
@@ -1291,6 +1307,7 @@
if (write_access)
entry = pte_mkwrite(pte_mkdirty(entry));
set_pte(page_table, entry);
+ page_add_rmap(new_page, page_table);
} else {
/* One of our sibling threads was faster, back out. */
page_cache_release(new_page);
@@ -1367,6 +1384,14 @@
current->state = TASK_RUNNING;
pgd = pgd_offset(mm, address);
+ /*
+ * If we are over our RSS limit and the system needs memory,
+ * we will free memory for the non-hogs and slow down a bit.
+ */
+ if (mm->rlimit_rss && mm->rss > mm->rlimit_rss &&
+ free_high(ALL_ZONES) > 0)
+ rss_free_pages(GFP_HIGHUSER);
+
/*
* We need the page table lock to synchronize with kswapd
* and the SMP-safe atomic PTE updates.
@@ -1448,6 +1473,7 @@
goto out;
}
}
+ pgtable_add_rmap(new, mm, address);
pmd_populate(mm, pmd, new);
}
out:
diff -Nru a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c Fri Mar 1 18:19:44 2002
+++ b/mm/mremap.c Fri Mar 1 18:19:44 2002
@@ -61,8 +61,14 @@
{
int error = 0;
pte_t pte;
+ struct page * page = NULL;
+
+ if (pte_present(*src))
+ page = pte_page(*src);
if (!pte_none(*src)) {
+ if (page)
+ page_remove_rmap(page, src);
pte = ptep_get_and_clear(src);
if (!dst) {
/* No dest? We must put it back. */
@@ -70,6 +76,8 @@
error++;
}
set_pte(dst, pte);
+ if (page)
+ page_add_rmap(page, dst);
}
return error;
}
diff -Nru a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c Fri Mar 1 18:19:44 2002
+++ b/mm/oom_kill.c Fri Mar 1 18:19:44 2002
@@ -110,8 +110,7 @@
/*
* Simple selection loop. We chose the process with the highest
- * number of 'points'. We need the locks to make sure that the
- * list of task structs doesn't change while we look the other way.
+ * number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
@@ -121,7 +120,6 @@
struct task_struct *p = NULL;
struct task_struct *chosen = NULL;
- read_lock(&tasklist_lock);
for_each_task(p) {
if (p->pid) {
int points = badness(p);
@@ -131,7 +129,6 @@
}
}
}
- read_unlock(&tasklist_lock);
return chosen;
}
@@ -170,19 +167,25 @@
*/
static void oom_kill(void)
{
- struct task_struct *p = select_bad_process(), *q;
+ struct task_struct *p, *q;
+ extern wait_queue_head_t kswapd_done;
+
+ read_lock(&tasklist_lock);
+ p = select_bad_process();
/* Found nothing?!?! Either we hang forever, or we panic. */
if (p == NULL)
panic("Out of memory and no killable processes...\n");
/* kill all processes that share the ->mm (i.e. all threads) */
- read_lock(&tasklist_lock);
for_each_task(q) {
if(q->mm == p->mm) oom_kill_task(q);
}
read_unlock(&tasklist_lock);
+ /* Chances are by this time our victim is sleeping on kswapd. */
+ wake_up(&kswapd_done);
+
/*
* Make kswapd go out of the way, so "p" has a good chance of
* killing itself before someone else gets the chance to ask
@@ -198,7 +201,7 @@
*/
void out_of_memory(void)
{
- static unsigned long first, last, count;
+ static unsigned long first, last, count, lastkill;
unsigned long now, since;
/*
@@ -235,8 +238,18 @@
return;
/*
+ * If we just killed a process, wait a while
+ * to give that task a chance to exit. This
+ * avoids killing multiple processes needlessly.
+ */
+ since = now - lastkill;
+ if (since < HZ*5)
+ return;
+
+ /*
* Ok, really out of memory. Kill something.
*/
+ lastkill = now;
oom_kill();
reset:
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c Fri Mar 1 18:19:44 2002
+++ b/mm/page_alloc.c Fri Mar 1 18:19:44 2002
@@ -22,12 +22,12 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/module.h>
+#include <linux/mm_inline.h>
int nr_swap_pages;
int nr_active_pages;
-int nr_inactive_pages;
-struct list_head inactive_list;
-struct list_head active_list;
+int nr_inactive_dirty_pages;
+int nr_inactive_clean_pages;
pg_data_t *pgdat_list;
/* Used to look up the address of the struct zone encoded in page->zone */
@@ -38,6 +38,8 @@
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, };
+static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, };
/*
* Free_page() adds the page to the free lists. This is optimized for
@@ -113,16 +115,17 @@
BUG();
if (PageLocked(page))
BUG();
- if (PageLRU(page))
- BUG();
if (PageActive(page))
BUG();
+ if (PageInactiveDirty(page))
+ BUG();
+ if (PageInactiveClean(page))
+ BUG();
+ if (page->pte_chain)
+ BUG();
page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
-
- if (current->flags & PF_FREE_PAGES)
- goto local_freelist;
- back_local_freelist:
-
+ page->age = PAGE_AGE_START;
+
zone = page_zone(page);
mask = (~0UL) << order;
@@ -169,17 +172,6 @@
memlist_add_head(&(base + page_idx)->list, &area->free_list);
spin_unlock_irqrestore(&zone->lock, flags);
- return;
-
- local_freelist:
- if (current->nr_local_pages)
- goto back_local_freelist;
- if (in_interrupt())
- goto back_local_freelist;
-
- list_add(&page->list, &current->local_pages);
- page->index = order;
- current->nr_local_pages++;
}
#define MARK_USED(index, order, area) \
@@ -238,10 +230,7 @@
set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
- if (PageLRU(page))
- BUG();
- if (PageActive(page))
- BUG();
+ DEBUG_LRU_PAGE(page);
return page;
}
curr_order++;
@@ -260,78 +249,83 @@
}
#endif
-static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
-static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
+/*
+ * If we are able to directly reclaim pages, we move pages from the
+ * inactive_clean list onto the free list until the zone has enough
+ * free pages or until the inactive_clean pages are exhausted.
+ * If we cannot do this work ourselves, call kswapd.
+ */
+void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim));
+void fixup_freespace(zone_t * zone, int direct_reclaim)
+{
+ if (direct_reclaim) {
+ struct page * page;
+ do {
+ if ((page = reclaim_page(zone)))
+ __free_pages_ok(page, 0);
+ } while (page && zone->free_pages <= zone->pages_min);
+ } else
+ wakeup_kswapd(GFP_ATOMIC);
+}
+
+#define PAGES_KERNEL 0
+#define PAGES_MIN 1
+#define PAGES_LOW 2
+#define PAGES_HIGH 3
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+ unsigned long order, int limit, int direct_reclaim)
{
- struct page * page = NULL;
- int __freed = 0;
+ zone_t **zone = zonelist->zones;
+ unsigned long water_mark = 0;
- if (!(gfp_mask & __GFP_WAIT))
- goto out;
- if (in_interrupt())
- BUG();
-
- current->allocation_order = order;
- current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
-
- __freed = try_to_free_pages(classzone, gfp_mask, order);
-
- current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
-
- if (current->nr_local_pages) {
- struct list_head * entry, * local_pages;
- struct page * tmp;
- int nr_pages;
-
- local_pages = &current->local_pages;
-
- if (likely(__freed)) {
- /* pick from the last inserted so we're lifo */
- entry = local_pages->next;
- do {
- tmp = list_entry(entry, struct page, list);
- if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
- list_del(entry);
- current->nr_local_pages--;
- set_page_count(tmp, 1);
- page = tmp;
-
- if (page->buffers)
- BUG();
- if (page->mapping)
- BUG();
- if (!VALID_PAGE(page))
- BUG();
- if (PageSwapCache(page))
- BUG();
- if (PageLocked(page))
- BUG();
- if (PageLRU(page))
- BUG();
- if (PageActive(page))
- BUG();
- if (PageDirty(page))
- BUG();
+ for (;;) {
+ zone_t *z = *(zone++);
- break;
- }
- } while ((entry = entry->next) != local_pages);
+ if (!z)
+ break;
+ if (!z->size)
+ BUG();
+
+ /*
+ * We allocate if the number of (free + inactive_clean)
+ * pages is above the watermark.
+ */
+ switch (limit) {
+ case PAGES_KERNEL:
+ water_mark = z->pages_min / 2;
+ break;
+ case PAGES_MIN:
+ water_mark = z->pages_min;
+ break;
+ case PAGES_LOW:
+ water_mark = z->pages_low;
+ break;
+ default:
+ case PAGES_HIGH:
+ water_mark = z->pages_high;
}
- nr_pages = current->nr_local_pages;
- /* free in reverse order so that the global order will be lifo */
- while ((entry = local_pages->prev) != local_pages) {
- list_del(entry);
- tmp = list_entry(entry, struct page, list);
- __free_pages_ok(tmp, tmp->index);
- if (!nr_pages--)
- BUG();
+ if (z->free_pages + z->inactive_clean_pages >= water_mark) {
+ struct page *page = NULL;
+ /* If possible, reclaim a page directly. */
+ if (direct_reclaim)
+ page = reclaim_page(z);
+ /* If that fails, fall back to rmqueue. */
+ if (!page)
+ page = rmqueue(z, order);
+ if (page)
+ return page;
}
- current->nr_local_pages = 0;
}
- out:
- *freed = __freed;
- return page;
+
+ /* Found nothing. */
+ return NULL;
}
/*
@@ -339,100 +333,248 @@
*/
struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
- unsigned long min;
- zone_t **zone, * classzone;
+ zone_t **zone;
+ int min, direct_reclaim = 0;
struct page * page;
- int freed;
+ /*
+ * (If anyone calls gfp from interrupts nonatomically then it
+ * will sooner or later be tripped up by a schedule().)
+ *
+ * We fall back to lower-level zones if allocation
+ * in a higher zone fails.
+ */
+
+ /*
+ * Can we take pages directly from the inactive_clean
+ * list?
+ */
+ if (order == 0 && (gfp_mask & __GFP_WAIT))
+ direct_reclaim = 1;
+
+try_again:
+ /*
+ * First, see if we have any zones with lots of free memory.
+ *
+ * We allocate free memory first because it doesn't contain
+ * any data we would want to cache.
+ */
zone = zonelist->zones;
- classzone = *zone;
min = 1UL << order;
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;
+ if (!z->size)
+ BUG();
- min += z->pages_low;
+ min += z->pages_min;
if (z->free_pages > min) {
page = rmqueue(z, order);
if (page)
return page;
- }
+ } else if (z->free_pages < z->pages_min)
+ fixup_freespace(z, direct_reclaim);
+ }
+
+ /*
+ * Next, try to allocate a page from a zone with a HIGH
+ * amount of (free + inactive_clean) pages.
+ *
+ * If there is a lot of activity, inactive_target
+ * will be high and we'll have a good chance of
+ * finding a page using the HIGH limit.
+ */
+ page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * Then try to allocate a page from a zone with more
+ * than zone->pages_low of (free + inactive_clean) pages.
+ *
+ * When the working set is very large and VM activity
+ * is low, we're most likely to have our allocation
+ * succeed here.
+ */
+ page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * OK, none of the zones on our zonelist has lots
+ * of pages free.
+ *
+ * We wake up kswapd, in the hope that kswapd will
+ * resolve this situation before memory gets tight.
+ *
+ * We'll also help a bit trying to free pages, this
+ * way statistics will make sure really fast allocators
+ * are slowed down more than slow allocators and other
+ * programs in the system shouldn't be impacted as much
+ * by the hogs.
+ */
+ wakeup_kswapd(gfp_mask);
+
+ /*
+ * After waking up kswapd, we try to allocate a page
+ * from any zone which isn't critical yet.
+ *
+ * Kswapd should, in most situations, bring the situation
+ * back to normal in no time.
+ */
+ page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * Kernel allocations can eat a few emergency pages.
+ * We should be able to run without this, find out why
+ * the SCSI layer isn't happy ...
+ */
+ if (gfp_mask & __GFP_HIGH) {
+ page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, direct_reclaim);
+ if (page)
+ return page;
}
- classzone->need_balance = 1;
- mb();
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ /*
+ * Oh well, we didn't succeed.
+ */
+ if (!(current->flags & PF_MEMALLOC)) {
+ /*
+ * Are we dealing with a higher order allocation?
+ *
+ * If so, try to defragment some memory.
+ */
+ if (order > 0 && (gfp_mask & __GFP_WAIT))
+ goto defragment;
+
+ /*
+ * If we arrive here, we are really tight on memory.
+ * Since kswapd didn't succeed in freeing pages for us,
+ * we need to help it.
+ *
+ * Single page allocs loop until the allocation succeeds.
+ * Multi-page allocs can fail due to memory fragmentation;
+ * in that case we bail out to prevent infinite loops and
+ * hanging device drivers ...
+ *
+ * Another issue are GFP_NOFS allocations; because they
+ * do not have __GFP_FS set it's possible we cannot make
+ * any progress freeing pages, in that case it's better
+ * to give up than to deadlock the kernel looping here.
+ *
+ * NFS: we must yield the CPU (to rpciod) to avoid deadlock.
+ */
+ if (gfp_mask & __GFP_WAIT) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
+ if (!order || free_high(ALL_ZONES) >= 0) {
+ int progress = try_to_free_pages(gfp_mask);
+ if (progress || (gfp_mask & __GFP_FS))
+ goto try_again;
+ /*
+ * Fail if no progress was made and the
+ * allocation may not be able to block on IO.
+ */
+ return NULL;
+ }
+ }
+ }
+ /*
+ * Final phase: allocate anything we can!
+ *
+ * Higher order allocations, GFP_ATOMIC allocations and
+ * recursive allocations (PF_MEMALLOC) end up here.
+ *
+ * Only recursive allocations can use the very last pages
+ * in the system, otherwise it would be just too easy to
+ * deadlock the system...
+ */
zone = zonelist->zones;
min = 1UL << order;
for (;;) {
- unsigned long local_min;
zone_t *z = *(zone++);
+ struct page * page = NULL;
if (!z)
break;
- local_min = z->pages_min;
- if (!(gfp_mask & __GFP_WAIT))
- local_min >>= 2;
- min += local_min;
- if (z->free_pages > min) {
+ /*
+ * SUBTLE: direct_reclaim is only possible if the task
+ * becomes PF_MEMALLOC while looping above. This will
+ * happen when the OOM killer selects this task for
+ * death.
+ */
+ if (direct_reclaim) {
+ page = reclaim_page(z);
+ if (page)
+ return page;
+ }
+
+ /* XXX: is pages_min/4 a good amount to reserve for this? */
+ min += z->pages_min / 4;
+ if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && !in_interrupt())) {
page = rmqueue(z, order);
if (page)
return page;
}
}
+ goto out_failed;
- /* here we're in the low on memory slow path */
-rebalance:
- if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
+ /*
+ * Naive "defragmentation" for higher-order allocations. First we
+ * free the inactive_clean pages to see if we can allocate our
+ * allocation, then we call page_launder() to clean some dirty
+ * pages, and last we try once more.
+ *
+ * We might want to turn this into something which defragments
+ * memory based on physical page, simply by looking for unmapped
+ * pages next to pages on the free list...
+ */
+defragment:
+ {
+ int freed = 0;
+defragment_again:
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
if (!z)
break;
-
- page = rmqueue(z, order);
- if (page)
- return page;
+ if (!z->size)
+ continue;
+ while (z->inactive_clean_pages) {
+ struct page * page;
+ /* Move one page to the free list. */
+ page = reclaim_page(z);
+ if (!page)
+ break;
+ __free_page(page);
+ /* Try if the allocation succeeds. */
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
}
- return NULL;
- }
-
- /* Atomic allocations - we can't balance anything */
- if (!(gfp_mask & __GFP_WAIT))
- return NULL;
-
- page = balance_classzone(classzone, gfp_mask, order, &freed);
- if (page)
- return page;
-
- zone = zonelist->zones;
- min = 1UL << order;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- min += z->pages_min;
- if (z->free_pages > min) {
- page = rmqueue(z, order);
- if (page)
- return page;
+ /* XXX: do real defragmentation instead of calling launder ? */
+ if (!freed) {
+ freed = 1;
+ current->flags |= PF_MEMALLOC;
+ try_to_free_pages(gfp_mask);
+ current->flags &= ~PF_MEMALLOC;
+ goto defragment_again;
}
}
- /* Don't let big-order allocations loop */
- if (order > 3)
- return NULL;
-
- /* Yield for kswapd, and try again */
- current->policy |= SCHED_YIELD;
- __set_current_state(TASK_RUNNING);
- schedule();
- goto rebalance;
+
+out_failed:
+ /* No luck.. */
+// printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
+ return NULL;
}
/*
@@ -480,14 +622,11 @@
{
unsigned int sum;
zone_t *zone;
- pg_data_t *pgdat = pgdat_list;
sum = 0;
- while (pgdat) {
- for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++)
- sum += zone->free_pages;
- pgdat = pgdat->node_next;
- }
+ for_each_zone(zone)
+ sum += zone->free_pages;
+
return sum;
}
@@ -496,23 +635,21 @@
*/
unsigned int nr_free_buffer_pages (void)
{
- pg_data_t *pgdat = pgdat_list;
+ pg_data_t *pgdat;
unsigned int sum = 0;
- do {
+ for_each_pgdat(pgdat) {
zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
zone_t **zonep = zonelist->zones;
zone_t *zone;
for (zone = *zonep++; zone; zone = *zonep++) {
- unsigned long size = zone->size;
- unsigned long high = zone->pages_high;
- if (size > high)
- sum += size - high;
+ sum += zone->free_pages;
+ sum += zone->inactive_clean_pages;
+ sum += zone->inactive_dirty_pages;
}
- pgdat = pgdat->node_next;
- } while (pgdat);
+ }
return sum;
}
@@ -520,13 +657,12 @@
#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
- pg_data_t *pgdat = pgdat_list;
+ pg_data_t *pgdat;
unsigned int pages = 0;
- while (pgdat) {
+ for_each_pgdat(pgdat)
pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
- pgdat = pgdat->node_next;
- }
+
return pages;
}
#endif
@@ -563,10 +699,18 @@
tmpdat = tmpdat->node_next;
}
- printk("( Active: %d, inactive: %d, free: %d )\n",
- nr_active_pages,
- nr_inactive_pages,
- nr_free_pages());
+ printk("Free pages: %6dkB (%6dkB HighMem)\n",
+ nr_free_pages() << (PAGE_SHIFT-10),
+ nr_free_highpages() << (PAGE_SHIFT-10));
+
+ printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d
(%d %d %d) )\n",
+ nr_active_pages,
+ nr_inactive_dirty_pages,
+ nr_inactive_clean_pages,
+ nr_free_pages(),
+ freepages.min,
+ freepages.low,
+ freepages.high);
for (type = 0; type < MAX_NR_ZONES; type++) {
struct list_head *head, *curr;
@@ -726,9 +870,6 @@
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
- INIT_LIST_HEAD(&active_list);
- INIT_LIST_HEAD(&inactive_list);
-
/*
* Some architectures (with lots of mem and discontinous memory
* maps) have to search for a good mem_map area:
@@ -751,7 +892,7 @@
offset = lmem_map - mem_map;
for (j = 0; j < MAX_NR_ZONES; j++) {
zone_t *zone = pgdat->node_zones + j;
- unsigned long mask;
+ unsigned long mask, extrafree = 0;
unsigned long size, realsize;
zone_table[nid * MAX_NR_ZONES + j] = zone;
@@ -765,7 +906,13 @@
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->inactive_clean_pages = 0;
+ zone->inactive_dirty_pages = 0;
zone->need_balance = 0;
+ INIT_LIST_HEAD(&zone->active_list);
+ INIT_LIST_HEAD(&zone->inactive_dirty_list);
+ INIT_LIST_HEAD(&zone->inactive_clean_list);
+
if (!size)
continue;
@@ -785,15 +932,36 @@
pgdat->nr_zones = j+1;
+ /*
+ * On large memory machines we keep extra memory
+ * free for kernel allocations.
+ */
+ if (zone_extrafree_ratio[j])
+ extrafree = min_t(int, (realtotalpages / zone_extrafree_ratio[j]), zone_extrafree_max[j]);
+ if (extrafree < zone_balance_max[j])
+ extrafree = 0;
+
mask = (realsize / zone_balance_ratio[j]);
if (mask < zone_balance_min[j])
mask = zone_balance_min[j];
- else if (mask > zone_balance_max[j])
- mask = zone_balance_max[j];
- zone->pages_min = mask;
- zone->pages_low = mask*2;
- zone->pages_high = mask*3;
-
+ zone->pages_min = extrafree + min(mask, (unsigned long)zone_balance_max[j]);
+ zone->pages_low = extrafree + mask*2;
+ zone->pages_high = extrafree + mask*3;
+ zone->pages_plenty = extrafree + mask*6;
+ /*
+ * Add these free targets to the global free target;
+ * we have to be SURE that freepages.high is higher
+ * than SUM [zone->pages_min] for all zones, otherwise
+ * we may have bad bad problems.
+ *
+ * This means we cannot make the freepages array writable
+ * in /proc, but have to add a separate extra_free_target
+ * for people who require it to catch load spikes in eg.
+ * gigabit ethernet routing...
+ */
+ freepages.min += zone->pages_min;
+ freepages.low += zone->pages_low;
+ freepages.high += zone->pages_high;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
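
To make the watermark arithmetic above concrete, a rough example (numbers assumed;
real values depend on zone holes and mem_map placement): on a 512 MB i386 box with
realtotalpages = 131072 and a ZONE_NORMAL realsize of about 126976 pages,
extrafree = min(131072/512, 1024) = 256, which survives the zone_balance_max check,
and mask = 126976/128 = 992. That gives pages_min = 256 + min(992, 255) = 511,
pages_low = 256 + 1984 = 2240, pages_high = 256 + 2976 = 3232 and
pages_plenty = 256 + 5952 = 6208, and those per-zone values are what gets summed
into freepages.min/low/high.
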
diff -Nru a/mm/rmap.c b/mm/rmap.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/mm/rmap.c Fri Mar 1 18:19:44 2002
@@ -0,0 +1,384 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@xxxxxxxxxxxxxxxx>
+ * Released under the General Public License (GPL).
+ *
+ *
+ * Simple, low overhead pte-based reverse mapping scheme.
+ * This is kept modular because we may want to experiment
+ * with object-based reverse mapping schemes. Please try
+ * to keep this thing as modular as possible.
+ */
+
+/*
+ * Locking:
+ * - the page->pte_chain is protected by the pagemap_lru_lock,
+ * we probably want to change this to a per-page lock in the
+ * future
+ * - because swapout locking is opposite to the locking order
+ * in the page fault path, the swapout path uses trylocks
+ * on the mm->page_table_lock
+ */
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/rmap.h>
+#include <asm/smplock.h>
+
+/* #define DEBUG_RMAP */
+
+/*
+ * Shared pages have a chain of pte_chain structures, used to locate
+ * all the mappings to this page. We only need a pointer to the pte
+ * here, the page struct for the page table page contains the process
+ * it belongs to and the offset within that process.
+ *
+ * A singly linked list should be fine for most, if not all, workloads.
+ * On fork-after-exec the mapping we'll be removing will still be near
+ * the start of the list, on mixed application systems the short-lived
+ * processes will have their mappings near the start of the list and
+ * in systems with long-lived applications the relative overhead of
+ * exit() will be lower since the applications are long-lived.
+ */
+struct pte_chain {
+ struct pte_chain * next;
+ pte_t * ptep;
+};
+
+static struct pte_chain * pte_chain_freelist;
+static inline struct pte_chain * pte_chain_alloc(void);
+static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, struct page *);
+static void alloc_new_pte_chains(void);
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of processes which referenced the page.
+ * Caller needs to hold the pagemap_lru_lock.
+ */
+int FASTCALL(page_referenced(struct page *));
+int page_referenced(struct page * page)
+{
+ struct pte_chain * pc;
+ int referenced = 0;
+
+ if (PageTestandClearReferenced(page))
+ referenced++;
+
+ /* Check all the page tables mapping this page. */
+ for (pc = page->pte_chain; pc; pc = pc->next) {
+ if (ptep_test_and_clear_young(pc->ptep))
+ referenced++;
+ }
+
+ return referenced;
+}
+
+/**
+ * page_add_rmap - add reverse mapping entry to a page
+ * @page: the page to add the mapping to
+ * @ptep: the page table entry mapping this page
+ *
+ * Add a new pte reverse mapping to a page.
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void FASTCALL(page_add_rmap(struct page *, pte_t *));
+void page_add_rmap(struct page * page, pte_t * ptep)
+{
+ struct pte_chain * pte_chain;
+
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+ spin_lock(&pagemap_lru_lock);
+#ifdef DEBUG_RMAP
+ if (!page || !ptep)
+ BUG();
+ if (!pte_present(*ptep))
+ BUG();
+ if (!ptep_to_mm(ptep))
+ BUG();
+ {
+ struct pte_chain * pc;
+ for (pc = page->pte_chain; pc; pc = pc->next) {
+ if (pc->ptep == ptep)
+ BUG();
+ }
+ }
+#endif
+ pte_chain = pte_chain_alloc();
+
+ /* Hook up the pte_chain to the page. */
+ pte_chain->ptep = ptep;
+ pte_chain->next = page->pte_chain;
+ page->pte_chain = pte_chain;
+
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * page_remove_rmap - take down reverse mapping to a page
+ * @page: page to remove mapping from
+ * @ptep: page table entry to remove
+ *
+ * Removes the reverse mapping from the pte_chain of the page,
+ * after that the caller can clear the page table entry and free
+ * the page.
+ * Caller needs to hold the mm->page_table_lock.
+ */
+void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+void page_remove_rmap(struct page * page, pte_t * ptep)
+{
+ struct pte_chain * pc, * prev_pc = NULL;
+
+ if (!page || !ptep)
+ BUG();
+ if (!VALID_PAGE(page) || PageReserved(page))
+ return;
+
+ spin_lock(&pagemap_lru_lock);
+ for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
+ if (pc->ptep == ptep) {
+ pte_chain_free(pc, prev_pc, page);
+ goto out;
+ }
+ }
+#ifdef DEBUG_RMAP
+ /* Not found. This should NEVER happen! */
+ printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
+ printk(KERN_ERR "page_remove_rmap: only found: ");
+ for (pc = page->pte_chain; pc; pc = pc->next)
+ printk("%p ", pc->ptep);
+ printk("\n");
+ printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
+#endif
+
+out:
+ spin_unlock(&pagemap_lru_lock);
+ return;
+
+}
+
+/**
+ * try_to_unmap_one - worker function for try_to_unmap
+ * @page: page to unmap
+ * @ptep: page table entry to unmap from page
+ *
+ * Internal helper function for try_to_unmap, called for each page
+ * table entry mapping a page. Because locking order here is opposite
+ * to the locking order used by the page fault path, we use trylocks.
+ * Locking:
+ * pagemap_lru_lock page_launder()
+ * page lock page_launder(), trylock
+ * mm->page_table_lock try_to_unmap_one(), trylock
+ */
+int FASTCALL(try_to_unmap_one(struct page *, pte_t *));
+int try_to_unmap_one(struct page * page, pte_t * ptep)
+{
+ unsigned long address = ptep_to_address(ptep);
+ struct mm_struct * mm = ptep_to_mm(ptep);
+ struct vm_area_struct * vma;
+ pte_t pte;
+ int ret;
+
+ if (!mm)
+ BUG();
+
+ /*
+ * We need the page_table_lock to protect us from page faults,
+ * munmap, fork, etc...
+ */
+ if (!spin_trylock(&mm->page_table_lock))
+ return SWAP_AGAIN;
+
+ /* During mremap, it's possible pages are not in a VMA. */
+ vma = find_vma(mm, address);
+ if (!vma) {
+ ret = SWAP_FAIL;
+ goto out_unlock;
+ }
+
+ /* The page is mlock()d, we cannot swap it out. */
+ if (vma->vm_flags & VM_LOCKED) {
+ ret = SWAP_FAIL;
+ goto out_unlock;
+ }
+
+ /* Nuke the page table entry. */
+ pte = ptep_get_and_clear(ptep);
+ flush_tlb_page(vma, address);
+ flush_cache_page(vma, address);
+
+ /* Store the swap location in the pte. See handle_pte_fault() ... */
+ if (PageSwapCache(page)) {
+ swp_entry_t entry;
+ entry.val = page->index;
+ swap_duplicate(entry);
+ set_pte(ptep, swp_entry_to_pte(entry));
+ }
+
+ /* Move the dirty bit to the physical page now the pte is gone. */
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+
+ mm->rss--;
+ page_cache_release(page);
+ ret = SWAP_SUCCESS;
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+/**
+ * try_to_unmap - try to remove all page table mappings to a page
+ * @page: the page to get unmapped
+ *
+ * Tries to remove all the page table entries which are mapping this
+ * page, used in the pageout path. Caller must hold pagemap_lru_lock
+ * and the page lock. Return values are:
+ *
+ * SWAP_SUCCESS - we succeeded in removing all mappings
+ * SWAP_AGAIN - we missed a trylock, try again later
+ * SWAP_FAIL - the page is unswappable
+ * SWAP_ERROR - an error occurred
+ */
+int FASTCALL(try_to_unmap(struct page *));
+int try_to_unmap(struct page * page)
+{
+ struct pte_chain * pc, * next_pc, * prev_pc = NULL;
+ int ret = SWAP_SUCCESS;
+
+ /* This page should not be on the pageout lists. */
+ if (!VALID_PAGE(page) || PageReserved(page))
+ BUG();
+ if (!PageLocked(page))
+ BUG();
+ /* We need backing store to swap out a page. */
+ if (!page->mapping)
+ BUG();
+
+ for (pc = page->pte_chain; pc; pc = next_pc) {
+ next_pc = pc->next;
+ switch (try_to_unmap_one(page, pc->ptep)) {
+ case SWAP_SUCCESS:
+ /* Free the pte_chain struct. */
+ pte_chain_free(pc, prev_pc, page);
+ break;
+ case SWAP_AGAIN:
+ /* Skip this pte, remembering status. */
+ prev_pc = pc;
+ ret = SWAP_AGAIN;
+ continue;
+ case SWAP_FAIL:
+ return SWAP_FAIL;
+ case SWAP_ERROR:
+ return SWAP_ERROR;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * page_over_rsslimit - test if the page is over its RSS limit
+ * @page - page to test
+ *
+ * This function returns true if the process owning this page
+ * is over its RSS (resident set size) limit. For shared pages
+ * we make the optimisation of only checking the first process
+ * in the pte_chain list, this should catch hogs while not
+ * evicting pages shared by many processes.
+ * The caller needs to hold the pagemap_lru_lock.
+ */
+int FASTCALL(page_over_rsslimit(struct page *));
+int page_over_rsslimit(struct page * page)
+{
+ struct pte_chain * pte_chain = page->pte_chain;
+ struct mm_struct * mm;
+ pte_t * ptep;
+
+ /* No process is using the page. */
+ if (!pte_chain)
+ return 0;
+
+ ptep = pte_chain->ptep;
+ mm = ptep_to_mm(ptep);
+
+ return mm->rlimit_rss && (mm->rss > mm->rlimit_rss);
+}
+
+/**
+ * pte_chain_free - free pte_chain structure
+ * @pte_chain: pte_chain struct to free
+ * @prev_pte_chain: previous pte_chain on the list (may be NULL)
+ * @page: page this pte_chain hangs off (may be NULL)
+ *
+ * This function unlinks pte_chain from the singly linked list it
+ * may be on and adds the pte_chain to the free list. May also be
+ * called for new pte_chain structures which aren't on any list yet.
+ * Caller needs to hold the pagemap_lru_list.
+ */
+static inline void pte_chain_free(struct pte_chain * pte_chain, struct pte_chain * prev_pte_chain, struct page * page)
+{
+ if (prev_pte_chain)
+ prev_pte_chain->next = pte_chain->next;
+ else if (page)
+ page->pte_chain = pte_chain->next;
+
+ pte_chain->ptep = NULL;
+ pte_chain->next = pte_chain_freelist;
+ pte_chain_freelist = pte_chain;
+}
+
+/**
+ * pte_chain_alloc - allocate a pte_chain struct
+ *
+ * Returns a pointer to a fresh pte_chain structure. Allocates new
+ * pte_chain structures as required.
+ * Caller needs to hold the pagemap_lru_lock.
+ */
+static inline struct pte_chain * pte_chain_alloc(void)
+{
+ struct pte_chain * pte_chain;
+
+ /* Allocate new pte_chain structs as needed. */
+ if (!pte_chain_freelist)
+ alloc_new_pte_chains();
+
+ /* Grab the first pte_chain from the freelist. */
+ pte_chain = pte_chain_freelist;
+ pte_chain_freelist = pte_chain->next;
+ pte_chain->next = NULL;
+
+ return pte_chain;
+}
+
+/**
+ * alloc_new_pte_chains - convert a free page to pte_chain structures
+ *
+ * Grabs a free page and converts it to pte_chain structures. We really
+ * should pre-allocate these earlier in the pagefault path or come up
+ * with some other trick.
+ *
+ * Note that we cannot use the slab cache because the pte_chain structure
+ * is way smaller than the minimum size of a slab cache allocation.
+ */
+static void alloc_new_pte_chains(void)
+{
+ struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC);
+ int i = PAGE_SIZE / sizeof(struct pte_chain);
+
+ if (pte_chain) {
+ for (; i-- > 0; pte_chain++)
+ pte_chain_free(pte_chain, NULL, NULL);
+ } else {
+ /* Yeah yeah, I'll fix the pte_chain allocation ... */
+ panic("Fix pte_chain allocation, you lazy bastard!\n");
+ }
+}
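
A small userspace sketch (purely illustrative; the mirror struct and the 4 KB page
size are assumptions) of the sizing argument behind alloc_new_pte_chains() above:
each reverse mapping costs two pointers, so one zeroed page yields
PAGE_SIZE / sizeof(struct pte_chain) entries, 512 of them on i386.

#include <stdio.h>

/* Userspace mirror of the two-pointer struct pte_chain above. */
struct pte_chain_mirror {
	struct pte_chain_mirror *next;
	void *ptep;
};

int main(void)
{
	const size_t page_size = 4096;	/* assumed i386 PAGE_SIZE */

	printf("bytes per reverse mapping: %zu\n",
	       sizeof(struct pte_chain_mirror));
	printf("pte_chain entries carved from one page: %zu\n",
	       page_size / sizeof(struct pte_chain_mirror));
	return 0;
}
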
diff -Nru a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c Fri Mar 1 18:19:44 2002
+++ b/mm/slab.c Fri Mar 1 18:19:44 2002
@@ -911,34 +911,45 @@
#define drain_cpu_caches(cachep) do { } while (0)
#endif
+/**
+ * Called with the &cachep->spinlock held, returns number of slabs released
+ */
+static int __kmem_cache_shrink_locked(kmem_cache_t *cachep)
+{
+ slab_t *slabp;
+ int ret = 0;
+
+ /* If the cache is growing, stop shrinking. */
+ while (!cachep->growing) {
+ struct list_head *p;
+
+ p = cachep->slabs_free.prev;
+ if (p == &cachep->slabs_free)
+ break;
+
+ slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
+#if DEBUG
+ if (slabp->inuse)
+ BUG();
+#endif
+ list_del(&slabp->list);
+
+ spin_unlock_irq(&cachep->spinlock);
+ kmem_slab_destroy(cachep, slabp);
+ ret++;
+ spin_lock_irq(&cachep->spinlock);
+ }
+ return ret;
+}
+
static int __kmem_cache_shrink(kmem_cache_t *cachep)
{
- slab_t *slabp;
int ret;
drain_cpu_caches(cachep);
spin_lock_irq(&cachep->spinlock);
-
- /* If the cache is growing, stop shrinking. */
- while (!cachep->growing) {
- struct list_head *p;
-
- p = cachep->slabs_free.prev;
- if (p == &cachep->slabs_free)
- break;
-
- slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
-#if DEBUG
- if (slabp->inuse)
- BUG();
-#endif
- list_del(&slabp->list);
-
- spin_unlock_irq(&cachep->spinlock);
- kmem_slab_destroy(cachep, slabp);
- spin_lock_irq(&cachep->spinlock);
- }
+ __kmem_cache_shrink_locked(cachep);
ret = !list_empty(&cachep->slabs_full) ||
!list_empty(&cachep->slabs_partial);
spin_unlock_irq(&cachep->spinlock);
return ret;
@@ -957,6 +968,24 @@
BUG();
return __kmem_cache_shrink(cachep);
+}
+
+/**
+ * kmem_cache_shrink_nr - Shrink a cache returning pages released
+ */
+int kmem_cache_shrink_nr(kmem_cache_t *cachep)
+{
+ int ret;
+
+ if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep))
+ BUG();
+
+ drain_cpu_caches(cachep);
+
+ spin_lock_irq(&cachep->spinlock);
+ ret = __kmem_cache_shrink_locked(cachep);
+ spin_unlock_irq(&cachep->spinlock);
+ return ret<<(cachep->gfporder);
}
/**
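
Note the unit change: kmem_cache_shrink_nr() reports pages, not slabs. For a cache
created with gfporder 1 (two pages per slab), releasing three completely free slabs
returns 3 << 1 = 6 pages, which is presumably what the pageout-side caller adds to
its count of freed pages.
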
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c Fri Mar 1 18:19:44 2002
+++ b/mm/swap.c Fri Mar 1 18:19:44 2002
@@ -15,15 +15,29 @@
#include <linux/mm.h>
#include <linux/kernel_stat.h>
-#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/pagemap.h>
#include <linux/init.h>
+#include <linux/mm_inline.h>
#include <asm/dma.h>
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/pgtable.h>
+/*
+ * We identify three levels of free memory. We never let free mem
+ * fall below the freepages.min except for atomic allocations. We
+ * start background swapping if we fall below freepages.high free
+ * pages, and we begin intensive swapping below freepages.low.
+ *
+ * Actual initialization is done in mm/page_alloc.c
+ */
+freepages_t freepages = {
+ 0, /* freepages.min */
+ 0, /* freepages.low */
+ 0 /* freepages.high */
+};
+
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
@@ -33,17 +47,102 @@
8, /* do swap I/O in clusters of this size */
};
+/**
+ * (de)activate_page - move pages from/to active and inactive lists
+ * @page: the page we want to move
+ * @nolock - are we already holding the pagemap_lru_lock?
+ *
+ * Deactivate_page will move an active page to the right
+ * inactive list, while activate_page will move a page back
+ * from one of the inactive lists to the active list. If
+ * called on a page which is not on any of the lists, the
+ * page is left alone.
+ */
+void FASTCALL(deactivate_page_nolock(struct page *));
+void deactivate_page_nolock(struct page * page)
+{
+ /*
+ * Don't touch it if it's not on the active list.
+ * (some pages aren't on any list at all)
+ */
+ ClearPageReferenced(page);
+ page->age = 0;
+ if (PageActive(page)) {
+ del_page_from_active_list(page);
+ add_page_to_inactive_dirty_list(page);
+ }
+}
+
+void FASTCALL(deactivate_page(struct page *));
+void deactivate_page(struct page * page)
+{
+ spin_lock(&pagemap_lru_lock);
+ deactivate_page_nolock(page);
+ spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * drop_page - like deactivate_page, but try inactive_clean list
+ * @page: the page to drop
+ *
+ * Try to move a page to the inactive_clean list, this succeeds if the
+ * page is clean and not in use by anybody. If the page cannot be placed
+ * on the inactive_clean list it is placed on the inactive_dirty list
+ * instead.
+ *
+ * Note: this function gets called with the pagemap_lru_lock held.
+ */
+void FASTCALL(drop_page(struct page *));
+void drop_page(struct page * page)
+{
+ if (!TryLockPage(page)) {
+ if (page->mapping && page->buffers) {
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
+ try_to_release_page(page, GFP_NOIO);
+ spin_lock(&pagemap_lru_lock);
+ page_cache_release(page);
+ }
+ UnlockPage(page);
+ }
+
+ /* Make sure the page really is reclaimable. */
+ if (!page->mapping || PageDirty(page) || page->pte_chain ||
+ page->buffers || page_count(page) > 1)
+ deactivate_page_nolock(page);
+
+ else if (page_count(page) == 1) {
+ ClearPageReferenced(page);
+ page->age = 0;
+ if (PageActive(page)) {
+ del_page_from_active_list(page);
+ add_page_to_inactive_clean_list(page);
+ } else if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
+ }
+ }
+}
+
/*
* Move an inactive page to the active list.
*/
-static inline void activate_page_nolock(struct page * page)
+void FASTCALL(activate_page_nolock(struct page *));
+void activate_page_nolock(struct page * page)
{
- if (PageLRU(page) && !PageActive(page)) {
- del_page_from_inactive_list(page);
+ if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
add_page_to_active_list(page);
}
+
+ /* Make sure the page gets a fair chance at staying active. */
+ page->age = max((int)page->age, PAGE_AGE_START);
}
+void FASTCALL(activate_page(struct page *));
void activate_page(struct page * page)
{
spin_lock(&pagemap_lru_lock);
@@ -55,11 +154,12 @@
* lru_cache_add: add a page to the page lists
* @page: the page to add
*/
+void FASTCALL(lru_cache_add(struct page *));
void lru_cache_add(struct page * page)
{
- if (!TestSetPageLRU(page)) {
+ if (!PageLRU(page)) {
spin_lock(&pagemap_lru_lock);
- add_page_to_inactive_list(page);
+ add_page_to_active_list(page);
spin_unlock(&pagemap_lru_lock);
}
}
@@ -71,14 +171,15 @@
* This function is for when the caller already holds
* the pagemap_lru_lock.
*/
+void FASTCALL(__lru_cache_del(struct page *));
void __lru_cache_del(struct page * page)
{
- if (TestClearPageLRU(page)) {
- if (PageActive(page)) {
- del_page_from_active_list(page);
- } else {
- del_page_from_inactive_list(page);
- }
+ if (PageActive(page)) {
+ del_page_from_active_list(page);
+ } else if (PageInactiveDirty(page)) {
+ del_page_from_inactive_dirty_list(page);
+ } else if (PageInactiveClean(page)) {
+ del_page_from_inactive_clean_list(page);
}
}
@@ -86,6 +187,7 @@
* lru_cache_del: remove a page from the page lists
* @page: the page to remove
*/
+void FASTCALL(lru_cache_del(struct page *));
void lru_cache_del(struct page * page)
{
spin_lock(&pagemap_lru_lock);
diff -Nru a/mm/swap_state.c b/mm/swap_state.c
--- a/mm/swap_state.c Fri Mar 1 18:19:44 2002
+++ b/mm/swap_state.c Fri Mar 1 18:19:44 2002
@@ -89,6 +89,40 @@
return 0;
}
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ *
+ * Allocate swap space for the page and add the page to the
+ * swap cache. Caller needs to hold the page lock.
+ */
+int add_to_swap(struct page * page)
+{
+ swp_entry_t entry;
+
+ if (!PageLocked(page))
+ BUG();
+
+ for (;;) {
+ entry = get_swap_page();
+ if (!entry.val)
+ return 0;
+ /*
+ * Add it to the swap cache and mark it dirty
+ * (adding to the page cache will clear the dirty
+ * and uptodate bits, so we need to do it again)
+ */
+ if (add_to_swap_cache(page, entry) == 0) {
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ swap_free(entry);
+ return 1;
+ }
+ /* Raced with "speculative" read_swap_cache_async */
+ swap_free(entry);
+ }
+}
+
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
diff -Nru a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c Fri Mar 1 18:19:44 2002
+++ b/mm/swapfile.c Fri Mar 1 18:19:44 2002
@@ -374,6 +374,7 @@
return;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ page_add_rmap(page, dir);
swap_free(entry);
++vma->vm_mm->rss;
}
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c Fri Mar 1 18:19:44 2002
+++ b/mm/vmscan.c Fri Mar 1 18:19:44 2002
@@ -1,6 +1,9 @@
/*
* linux/mm/vmscan.c
*
+ * The pageout daemon, decides which pages to evict (swap out) and
+ * does the actual work of freeing them.
+ *
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
@@ -21,9 +24,12 @@
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/compiler.h>
+#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
+static void refill_freelist(void);
+static void wakeup_memwaiters(void);
/*
* The "priority" of VM scanning is how much of the queues we
* will scan in one go. A value of 6 for DEF_PRIORITY implies
@@ -32,371 +38,275 @@
*/
#define DEF_PRIORITY (6)
-/*
- * The swap-out function returns 1 if it successfully
- * scanned all the pages it was asked to (`count').
- * It returns zero if it couldn't do anything,
- *
- * rss may decrease because pages are shared, but this
- * doesn't count as having freed a page.
- */
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
+static inline void age_page_up(struct page *page)
{
- pte_t pte;
- swp_entry_t entry;
+ page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX);
+}
- /* Don't look at this pte if it's been accessed recently. */
- if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
- mark_page_accessed(page);
- return 0;
- }
+static inline void age_page_down(struct page *page)
+{
+ page->age -= min(PAGE_AGE_DECL, (int)page->age);
+}
- /* Don't bother unmapping pages that are active */
- if (PageActive(page))
- return 0;
+static inline int page_mapping_inuse(struct page * page)
+{
+ struct address_space * mapping = page->mapping;
- /* Don't bother replenishing zones not under pressure.. */
- if (!memclass(page_zone(page), classzone))
- return 0;
+ /* Page is in somebody's page tables. */
+ if (page->pte_chain)
+ return 1;
- if (TryLockPage(page))
+ /* XXX: does this happen ? */
+ if (!mapping)
return 0;
- /* From this point on, the odds are that we're going to
- * nuke this pte, so read and clear the pte. This hook
- * is needed on CPUs which update the accessed and dirty
- * bits in hardware.
- */
- flush_cache_page(vma, address);
- pte = ptep_get_and_clear(page_table);
- flush_tlb_page(vma, address);
-
- if (pte_dirty(pte))
- set_page_dirty(page);
+ /* File is mmaped by somebody. */
+ if (mapping->i_mmap || mapping->i_mmap_shared)
+ return 1;
- /*
- * Is the page already in the swap cache? If so, then
- * we can just drop our reference to it without doing
- * any IO - it's already up-to-date on disk.
- */
- if (PageSwapCache(page)) {
- entry.val = page->index;
- swap_duplicate(entry);
-set_swap_pte:
- set_pte(page_table, swp_entry_to_pte(entry));
-drop_pte:
- mm->rss--;
- UnlockPage(page);
- {
- int freeable = page_count(page) - !!page->buffers <= 2;
- page_cache_release(page);
- return freeable;
- }
- }
+ return 0;
+}
- /*
- * Is it a clean page? Then it must be recoverable
- * by just paging it in again, and we can just drop
- * it.. or if it's dirty but has backing store,
- * just mark the page dirty and drop it.
- *
- * However, this won't actually free any real
- * memory, as the page will just be in the page cache
- * somewhere, and as such we should just continue
- * our scan.
- *
- * Basically, this just makes it possible for us to do
- * some real work in the future in "refill_inactive()".
- */
- if (page->mapping)
- goto drop_pte;
- if (!PageDirty(page))
- goto drop_pte;
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
+ */
+struct page * reclaim_page(zone_t * zone)
+{
+ struct page * page = NULL;
+ struct list_head * page_lru;
+ swp_entry_t entry = {0};
+ int maxscan;
/*
- * Anonymous buffercache pages can be left behind by
- * concurrent truncate and pagefault.
+ * We need to hold the pagecache_lock around all tests to make sure
+ * reclaim_page() cannot race with find_get_page() and friends.
*/
- if (page->buffers)
- goto preserve;
+ spin_lock(&pagemap_lru_lock);
+ spin_lock(&pagecache_lock);
+ maxscan = zone->inactive_clean_pages;
+ while (maxscan-- && !list_empty(&zone->inactive_clean_list)) {
+ page_lru = zone->inactive_clean_list.prev;
+ page = list_entry(page_lru, struct page, lru);
- /*
- * This is a dirty, swappable page. First of all,
- * get a suitable swap entry for it, and make sure
- * we have the swap cache set up to associate the
- * page with that swap entry.
- */
- for (;;) {
- entry = get_swap_page();
- if (!entry.val)
- break;
- /* Add it to the swap cache and mark it dirty
- * (adding to the page cache will clear the dirty
- * and uptodate bits, so we need to do it again)
- */
- if (add_to_swap_cache(page, entry) == 0) {
- SetPageUptodate(page);
- set_page_dirty(page);
- goto set_swap_pte;
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (unlikely(!PageInactiveClean(page))) {
+ printk("VM: reclaim_page, wrong page on list.\n");
+ list_del(page_lru);
+ page_zone(page)->inactive_clean_pages--;
+ continue;
}
- /* Raced with "speculative" read_swap_cache_async */
- swap_free(entry);
- }
-
- /* No swap space left */
-preserve:
- set_pte(page_table, pte);
- UnlockPage(page);
- return 0;
-}
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
-{
- pte_t * pte;
- unsigned long pmd_end;
-
- if (pmd_none(*dir))
- return count;
- if (pmd_bad(*dir)) {
- pmd_ERROR(*dir);
- pmd_clear(dir);
- return count;
- }
-
- pte = pte_offset(dir, address);
-
- pmd_end = (address + PMD_SIZE) & PMD_MASK;
- if (end > pmd_end)
- end = pmd_end;
+ /* Page is being freed */
+ if (unlikely(page_count(page) == 0)) {
+ list_del(page_lru);
+ list_add(page_lru, &zone->inactive_clean_list);
+ continue;
+ }
- do {
- if (pte_present(*pte)) {
- struct page *page = pte_page(*pte);
+ /* Page cannot be reclaimed ? Move to inactive_dirty list. */
+ if (unlikely(page->pte_chain || page->buffers ||
+ PageReferenced(page) || PageDirty(page) ||
+ page_count(page) > 1 || TryLockPage(page))) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_inactive_dirty_list(page);
+ continue;
+ }
- if (VALID_PAGE(page) && !PageReserved(page)) {
- count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
- if (!count) {
- address += PAGE_SIZE;
- break;
- }
- }
+ /* OK, remove the page from the caches. */
+ if (PageSwapCache(page)) {
+ entry.val = page->index;
+ __delete_from_swap_cache(page);
+ goto found_page;
}
- address += PAGE_SIZE;
- pte++;
- } while (address && (address < end));
- mm->swap_address = address;
- return count;
-}
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
-{
- pmd_t * pmd;
- unsigned long pgd_end;
+ if (page->mapping) {
+ __remove_inode_page(page);
+ goto found_page;
+ }
- if (pgd_none(*dir))
- return count;
- if (pgd_bad(*dir)) {
- pgd_ERROR(*dir);
- pgd_clear(dir);
- return count;
+ /* We should never ever get here. */
+ printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+ list_del(page_lru);
+ zone->inactive_clean_pages--;
+ UnlockPage(page);
}
+ spin_unlock(&pagecache_lock);
+ spin_unlock(&pagemap_lru_lock);
+ return NULL;
- pmd = pmd_offset(dir, address);
-
- pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
- if (pgd_end && (end > pgd_end))
- end = pgd_end;
-
- do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
- if (!count)
- break;
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address && (address < end));
- return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
-{
- pgd_t *pgdir;
- unsigned long end;
-
- /* Don't swap out areas which are reserved */
- if (vma->vm_flags & VM_RESERVED)
- return count;
-
- pgdir = pgd_offset(mm, address);
-
- end = vma->vm_end;
- if (address >= end)
- BUG();
- do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
- if (!count)
- break;
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- pgdir++;
- } while (address && (address < end));
- return count;
+found_page:
+ del_page_from_inactive_clean_list(page);
+ spin_unlock(&pagecache_lock);
+ spin_unlock(&pagemap_lru_lock);
+ if (entry.val)
+ swap_free(entry);
+ UnlockPage(page);
+ page->age = PAGE_AGE_START;
+ if (page_count(page) != 1)
+ printk("VM: reclaim_page, found page with count %d!\n",
+ page_count(page));
+ return page;
}
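
For readers following the new reclaim_page() above: it always works from the tail of zone->inactive_clean_list, re-adds pages it cannot take at the head (list_add() inserts at the head) or migrates them to the inactive_dirty list, and returns the first page that passes every test. The following stand-alone userspace model shows just that tail-scan-and-requeue list idiom; every name in it (fake_page, reclaim_one, busy) is made up for illustration and is not part of the patch.

/* Illustration only, not kernel code and not part of this patch.
 * Mimics the shape of the reclaim_page() loop: scan from the tail of a
 * circular doubly linked list, requeue entries we cannot take at the head,
 * and return the first entry that passes the tests.
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_add(struct list_head *e, struct list_head *head)
{
	e->next = head->next;
	e->prev = head;
	head->next->prev = e;
	head->next = e;
}

struct fake_page {
	struct list_head lru;
	int busy;	/* stands in for all of the "cannot reclaim" tests */
	int id;
};

static struct fake_page *reclaim_one(struct list_head *clean, int maxscan)
{
	while (maxscan-- && clean->prev != clean) {
		struct list_head *tail = clean->prev;
		struct fake_page *page = (struct fake_page *)
			((char *)tail - offsetof(struct fake_page, lru));

		if (page->busy) {
			list_del(tail);
			list_add(tail, clean);	/* back to the head, retry later */
			continue;
		}
		list_del(tail);			/* the "found_page" case */
		return page;
	}
	return NULL;
}

int main(void)
{
	struct list_head clean;
	struct fake_page pages[3] = {
		{ { NULL, NULL }, 1, 0 },
		{ { NULL, NULL }, 0, 1 },
		{ { NULL, NULL }, 1, 2 }
	};
	struct fake_page *got;
	int i;

	list_init(&clean);
	for (i = 0; i < 3; i++)
		list_add(&pages[i].lru, &clean);

	got = reclaim_one(&clean, 6);
	printf("reclaimed id %d\n", got ? got->id : -1);	/* prints "reclaimed id 1" */
	return 0;
}
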
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
-
-/*
- * Returns remaining count of pages to be swapped out by followup call.
+/**
+ * page_dirty - do we need to write the data out to disk
+ * @page: page to test
+ *
+ * Returns true if the page contains data which needs to
+ * be written to disk. Doesn't test the page tables (yet?).
*/
-static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
+static inline int page_dirty(struct page *page)
{
- unsigned long address;
- struct vm_area_struct* vma;
+ struct buffer_head *tmp, *bh;
- /*
- * Find the proper vm-area after freezing the vma chain
- * and ptes.
- */
- spin_lock(&mm->page_table_lock);
- address = mm->swap_address;
- if (address == TASK_SIZE || swap_mm != mm) {
- /* We raced: don't count this mm but try again */
- ++*mmcounter;
- goto out_unlock;
- }
- vma = find_vma(mm, address);
- if (vma) {
- if (address < vma->vm_start)
- address = vma->vm_start;
-
- for (;;) {
- count = swap_out_vma(mm, vma, address, count, classzone);
- vma = vma->vm_next;
- if (!vma)
- break;
- if (!count)
- goto out_unlock;
- address = vma->vm_start;
- }
- }
- /* Indicate that we reached the end of address space */
- mm->swap_address = TASK_SIZE;
+ if (PageDirty(page))
+ return 1;
-out_unlock:
- spin_unlock(&mm->page_table_lock);
- return count;
-}
+ if (page->mapping && !page->buffers)
+ return 0;
-static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
-static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
-{
- int counter, nr_pages = SWAP_CLUSTER_MAX;
- struct mm_struct *mm;
+ tmp = bh = page->buffers;
- counter = mmlist_nr;
do {
- if (unlikely(current->need_resched)) {
- __set_current_state(TASK_RUNNING);
- schedule();
- }
-
- spin_lock(&mmlist_lock);
- mm = swap_mm;
- while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
- mm->swap_address = 0;
- mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
- if (mm == swap_mm)
- goto empty;
- swap_mm = mm;
- }
-
- /* Make sure the mm doesn't disappear when we drop the lock.. */
- atomic_inc(&mm->mm_users);
- spin_unlock(&mmlist_lock);
-
- nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
-
- mmput(mm);
-
- if (!nr_pages)
+ if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
return 1;
- } while (--counter >= 0);
+ tmp = tmp->b_this_page;
+ } while (tmp != bh);
return 0;
-
-empty:
- spin_unlock(&mmlist_lock);
- return 0;
}
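
A note on the page_dirty() helper above: page->buffers and b_this_page link all the buffer_heads of one page into a circular, singly linked ring, which is why the do/while terminates exactly when the walk gets back to the head. A stand-alone model of that walk, with made-up names (bh_sketch is not a kernel type):

/* Illustration only; models the circular b_this_page walk in page_dirty(). */
#include <stdio.h>

struct bh_sketch {
	int dirty_or_locked;
	struct bh_sketch *next;		/* plays the role of b_this_page */
};

static int ring_needs_writeout(struct bh_sketch *head)
{
	struct bh_sketch *tmp = head;

	do {
		if (tmp->dirty_or_locked)
			return 1;
		tmp = tmp->next;
	} while (tmp != head);

	return 0;
}

int main(void)
{
	struct bh_sketch a = { 0, NULL }, b = { 1, NULL };

	a.next = &b;
	b.next = &a;				/* close the ring */
	printf("%d\n", ring_needs_writeout(&a));	/* prints 1 */
	return 0;
}
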
-static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
-static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
+/**
+ * page_launder_zone - clean dirty inactive pages, move to inactive_clean list
+ * @zone: zone to free pages in
+ * @gfp_mask: what operations we are allowed to do
+ * @priority: scan priority; lower values scan a larger share of the list
+ *
+ * This function is called when we are low on free / inactive_clean
+ * pages; its purpose is to refill the free/clean list as efficiently
+ * as possible.
+ *
+ * This means we do writes asynchronously as long as possible and will
+ * only sleep on IO when we don't have another option. Since writeouts
+ * cause disk seeks and make read IO slower, we skip writes altogether
+ * when the number of dirty pages is small.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define CAN_DO_FS ((gfp_mask & __GFP_FS) && should_write)
+int page_launder_zone(zone_t * zone, int gfp_mask, int priority)
{
+ int maxscan, cleaned_pages, target;
struct list_head * entry;
- int max_scan = nr_inactive_pages / priority;
- int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);
+ target = free_plenty(zone);
+ cleaned_pages = 0;
+
+ /* The main launder loop. */
spin_lock(&pagemap_lru_lock);
- while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
+ maxscan = zone->inactive_dirty_pages >> priority;
+ while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) {
struct page * page;
-
- if (unlikely(current->need_resched)) {
+
+ /* Low latency reschedule point */
+ if (current->need_resched) {
spin_unlock(&pagemap_lru_lock);
- __set_current_state(TASK_RUNNING);
schedule();
spin_lock(&pagemap_lru_lock);
continue;
}
+ entry = zone->inactive_dirty_list.prev;
page = list_entry(entry, struct page, lru);
- if (unlikely(!PageLRU(page)))
- BUG();
- if (unlikely(PageActive(page)))
- BUG();
+ if (cleaned_pages > target)
+ break;
list_del(entry);
- list_add(entry, &inactive_list);
+ list_add(entry, &zone->inactive_dirty_list);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveDirty(page)) {
+ printk("VM: page_launder, wrong page on list.\n");
+ list_del(entry);
+ nr_inactive_dirty_pages--;
+ page_zone(page)->inactive_dirty_pages--;
+ continue;
+ }
/*
- * Zero page counts can happen because we unlink the pages
- * _after_ decrementing the usage count..
+ * The page is in active use or really unfreeable. Move to
+ * the active list and adjust the page age if needed.
*/
- if (unlikely(!page_count(page)))
+ if (page_referenced(page) && page_mapping_inuse(page) &&
+ !page_over_rsslimit(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ page->age = max((int)page->age, PAGE_AGE_START);
continue;
+ }
- if (!memclass(page_zone(page), classzone))
+ /*
+ * Page is being freed, don't worry about it.
+ */
+ if (unlikely(page_count(page) == 0))
continue;
- /* Racy check to avoid trylocking when not worthwhile */
- if (!page->buffers && (page_count(page) != 1 || !page->mapping))
- goto page_mapped;
-
/*
* The page is locked. IO in progress?
* Move it to the back of the list.
*/
- if (unlikely(TryLockPage(page))) {
- if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
- page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
- wait_on_page(page);
+ if (unlikely(TryLockPage(page)))
+ continue;
+
+ /*
+ * Anonymous process memory without backing store. Try to
+ * allocate it some swap space here.
+ *
+ * XXX: implement swap clustering ?
+ */
+ if (page->pte_chain && !page->mapping && !page->buffers) {
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
+ if (!add_to_swap(page)) {
+ activate_page(page);
+ UnlockPage(page);
page_cache_release(page);
spin_lock(&pagemap_lru_lock);
+ continue;
+ }
+ page_cache_release(page);
+ spin_lock(&pagemap_lru_lock);
+ }
+
+ /*
+ * The page is mapped into the page tables of one or more
+ * processes. Try to unmap it here.
+ */
+ if (page->pte_chain) {
+ switch (try_to_unmap(page)) {
+ case SWAP_ERROR:
+ case SWAP_FAIL:
+ goto page_active;
+ case SWAP_AGAIN:
+ UnlockPage(page);
+ continue;
+ case SWAP_SUCCESS:
+ ; /* try to free the page below */
}
- continue;
}
- if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
+ if (PageDirty(page) && page->mapping) {
/*
* It is not critical here to write it only if
* the page is unmapped because any direct writer
* like O_DIRECT would set the PG_dirty bitflag
- * on the phisical page after having successfully
+ * on the physical page after having successfully
* pinned it and after the I/O to the page is finished,
* so the direct writes to the page cannot get lost.
*/
@@ -425,7 +335,7 @@
if (page->buffers) {
spin_unlock(&pagemap_lru_lock);
- /* avoid to free a locked page */
+ /* To avoid freeing our page before we're done. */
page_cache_get(page);
if (try_to_release_page(page, gfp_mask)) {
@@ -443,14 +353,14 @@
/* effectively free the page here */
page_cache_release(page);
- if (--nr_pages)
- continue;
- break;
+ cleaned_pages++;
+ continue;
} else {
/*
- * The page is still in pagecache so undo the stuff
- * before the try_to_release_page since we've not
- * finished and we can now try the next step.
+ * We freed the buffers but may have
+ * slept; undo the stuff we did before
+ * try_to_release_page and fall through
+ * to the next step.
*/
page_cache_release(page);
@@ -466,227 +376,268 @@
}
}
- spin_lock(&pagecache_lock);
/*
- * this is the non-racy check for busy page.
+ * If the page is really freeable now, move it to the
+ * inactive_clean list.
+ *
+ * We re-test everything since the page could have been
+ * used by somebody else while we waited on IO above.
+ * This test is not safe from races, but only the one
+ * in reclaim_page() needs to be.
*/
- if (!page->mapping || !is_page_cache_freeable(page)) {
- spin_unlock(&pagecache_lock);
+ if (page->mapping && !PageDirty(page) && !page->pte_chain &&
+ page_count(page) == 1) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
UnlockPage(page);
-page_mapped:
- if (--max_mapped >= 0)
- continue;
-
+ cleaned_pages++;
+ } else {
/*
- * Alert! We've found too many mapped pages on the
- * inactive list, so we start swapping out now!
+ * OK, we don't know what to do with the page.
+ * It's no use keeping it here, so we move it to
+ * the active list.
*/
- spin_unlock(&pagemap_lru_lock);
- swap_out(priority, gfp_mask, classzone);
- return nr_pages;
+page_active:
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ UnlockPage(page);
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ /* Return the number of pages moved to the inactive_clean list. */
+ return cleaned_pages;
+}
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function iterates over all zones and calls page_launder_zone();
+ * per-zone balancing still needs to be added...
+ */
+int page_launder(int gfp_mask)
+{
+ int maxtry = 1 << DEF_PRIORITY;
+ struct zone_struct * zone;
+ int freed = 0;
+
+ /* Global balancing while we have a global shortage. */
+ while (maxtry-- && free_high(ALL_ZONES) >= 0) {
+ for_each_zone(zone)
+ if (free_plenty(zone) >= 0)
+ freed += page_launder_zone(zone, gfp_mask, 6);
+ }
+
+ /* Clean up the remaining zones with a serious shortage, if any. */
+ for_each_zone(zone)
+ if (free_min(zone) >= 0)
+ freed += page_launder_zone(zone, gfp_mask, 0);
+
+ return freed;
+}
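
The free_min()/free_plenty()/free_high() and inactive_*() helpers used by page_launder() and the other new functions in this file are not part of this hunk; they come from the new include/linux/mm_inline.h elsewhere in the patch. Judging only from how their results are compared against zero here, they appear to return a shortage: positive when the zone (or, for ALL_ZONES, the whole machine) is still below the corresponding watermark. As a hypothetical sketch of that shape (the arithmetic below is an assumption, not the patch's definition):

/* Hypothetical sketch only; the real helpers live in include/linux/mm_inline.h
 * in this patch.  Convention assumed from the callers above: a return value
 * greater than 0 means "this many pages short of the watermark", while a
 * value of 0 or less means "no shortage".
 */
struct zone_sketch {
	unsigned long free_pages;
	unsigned long inactive_clean_pages;
	unsigned long pages_min, pages_low, pages_high;
};

static long free_min_sketch(const struct zone_sketch *z)
{
	return (long)z->pages_min - (long)z->free_pages;
}

static long free_plenty_sketch(const struct zone_sketch *z)
{
	/* assumed: "plenty" also counts directly reclaimable clean pages */
	return (long)(2 * z->pages_high)
		- (long)(z->free_pages + z->inactive_clean_pages);
}

Read that way, the loop condition "maxtry-- && free_high(ALL_ZONES) >= 0" in page_launder() simply means: keep laundering until the machine as a whole reaches the generous watermark, or we run out of tries.
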
+
+/**
+ * refill_inactive_zone - scan the active list and find pages to deactivate
+ * @zone: zone whose active list we scan
+ * @priority: how much are we allowed to scan
+ *
+ * This function will scan a portion of the active list of a zone to find
+ * unused pages, those pages will then be moved to the inactive list.
+ */
+int refill_inactive_zone(struct zone_struct * zone, int priority)
+{
+ int maxscan = zone->active_pages >> priority;
+ int target = inactive_high(zone);
+ struct list_head * page_lru;
+ int nr_deactivated = 0;
+ struct page * page;
+
+ /* Take the lock while messing with the list... */
+ spin_lock(&pagemap_lru_lock);
+ while (maxscan-- && !list_empty(&zone->active_list)) {
+ page_lru = zone->active_list.prev;
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (unlikely(!PageActive(page))) {
+ printk("VM: refill_inactive, wrong page on list.\n");
+ list_del(page_lru);
+ nr_active_pages--;
+ continue;
}
/*
- * It is critical to check PageDirty _after_ we made sure
- * the page is freeable* so not in use by anybody.
+ * If the object the page is in is not in use we don't
+ * bother with page aging. If the page is touched again
+ * while on the inactive_clean list it'll be reactivated.
*/
- if (PageDirty(page)) {
- spin_unlock(&pagecache_lock);
- UnlockPage(page);
+ if (!page_mapping_inuse(page)) {
+ drop_page(page);
continue;
}
- /* point of no return */
- if (likely(!PageSwapCache(page))) {
- __remove_inode_page(page);
- spin_unlock(&pagecache_lock);
+ /*
+ * Do aging on the pages.
+ */
+ if (page_referenced(page)) {
+ age_page_up(page);
} else {
- swp_entry_t swap;
- swap.val = page->index;
- __delete_from_swap_cache(page);
- spin_unlock(&pagecache_lock);
- swap_free(swap);
+ age_page_down(page);
}
- __lru_cache_del(page);
- UnlockPage(page);
-
- /* effectively free the page here */
- page_cache_release(page);
+ /*
+ * If the page age is 'hot' and the process using the
+ * page doesn't exceed its RSS limit we keep the page.
+ * Otherwise we move it to the inactive_dirty list.
+ */
+ if (page->age && !page_over_rsslimit(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &zone->active_list);
+ } else {
+ deactivate_page_nolock(page);
+ if (++nr_deactivated > target)
+ break;
+ }
- if (--nr_pages)
- continue;
- break;
+ /* Low latency reschedule point */
+ if (current->need_resched) {
+ spin_unlock(&pagemap_lru_lock);
+ schedule();
+ spin_lock(&pagemap_lru_lock);
+ }
}
spin_unlock(&pagemap_lru_lock);
- return nr_pages;
+ return nr_deactivated;
}
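
age_page_up(), age_page_down() and PAGE_AGE_START used by refill_inactive_zone() are also defined elsewhere in the rmap patch, not in this hunk. The loop only relies on three properties: a referenced page gets a higher age, an unreferenced page decays, and a page whose age reaches 0 becomes a deactivation candidate. Below is a minimal sketch of helpers with those properties; the constants and the exponential curve are assumptions, not taken from the patch.

/* Hypothetical sketch of the page aging helpers used above; the constants
 * and the curve (linear vs. exponential) are assumptions, not the patch's.
 */
#define SKETCH_PAGE_AGE_START	2
#define SKETCH_PAGE_AGE_MAX	64

struct aged_page { unsigned int age; };

static void sketch_age_up(struct aged_page *p)
{
	unsigned int age = p->age ? p->age * 2 : SKETCH_PAGE_AGE_START;
	p->age = (age > SKETCH_PAGE_AGE_MAX) ? SKETCH_PAGE_AGE_MAX : age;
}

static void sketch_age_down(struct aged_page *p)
{
	p->age /= 2;	/* once it hits 0, refill_inactive_zone() deactivates */
}
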
-/*
- * This moves pages from the active list to
- * the inactive list.
+/**
+ * refill_inactive - checks all zones and refills the inactive list as needed
*
- * We move them the other way when we see the
- * reference bit on the page.
+ * This function tries to balance page eviction from all zones by aging
+ * the pages from each zone in the same ratio until the global inactive
+ * shortage is resolved. After that it does one last "clean-up" scan to
+ * fix up local inactive shortages.
*/
-static void refill_inactive(int nr_pages)
+int refill_inactive(void)
{
- struct list_head * entry;
-
- spin_lock(&pagemap_lru_lock);
- entry = active_list.prev;
- while (nr_pages && entry != &active_list) {
- struct page * page;
+ int maxtry = 1 << DEF_PRIORITY;
+ zone_t * zone;
+ int ret = 0;
- page = list_entry(entry, struct page, lru);
- entry = entry->prev;
- if (PageTestandClearReferenced(page)) {
- list_del(&page->lru);
- list_add(&page->lru, &active_list);
- continue;
+ /* Global balancing while we have a global shortage. */
+ while (maxtry-- && inactive_low(ALL_ZONES) >= 0) {
+ for_each_zone(zone) {
+ if (inactive_high(zone) >= 0)
+ ret += refill_inactive_zone(zone, DEF_PRIORITY);
}
+ }
- nr_pages--;
-
- del_page_from_active_list(page);
- add_page_to_inactive_list(page);
- SetPageReferenced(page);
+ /* Local balancing for zones which really need it. */
+ for_each_zone(zone) {
+ if (inactive_min(zone) >= 0)
+ ret += refill_inactive_zone(zone, 0);
}
- spin_unlock(&pagemap_lru_lock);
+
+ return ret;
}
-static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
+/**
+ * background_aging - slow background aging of zones
+ * @priority: priority at which to scan
+ *
+ * When the VM load is low or nonexistent, this function is
+ * called once a second to "sort" the pages in the VM. This
+ * way we know which pages to evict once a load spike happens.
+ * The effects of this function build up very slowly; the CPU usage
+ * should be minimal to nonexistent under most loads.
+ */
+static inline void background_aging(int priority)
{
- int chunk_size = nr_pages;
- unsigned long ratio;
+ struct zone_struct * zone;
- nr_pages -= kmem_cache_reap(gfp_mask);
- if (nr_pages <= 0)
- return 0;
-
- nr_pages = chunk_size;
- /* try to keep the active list 2/3 of the size of the cache */
- ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
- refill_inactive(ratio);
+ for_each_zone(zone)
+ if (inactive_high(zone) > 0)
+ refill_inactive_zone(zone, priority);
+}
- nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
- if (nr_pages <= 0)
- return 0;
+/*
+ * Worker function for kswapd and try_to_free_pages; we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages(unsigned int gfp_mask)
+{
+ int ret = 0;
- shrink_dcache_memory(priority, gfp_mask);
- shrink_icache_memory(priority, gfp_mask);
+ /*
+ * Eat memory from filesystem page cache, buffer cache,
+ * dentry, inode and filesystem quota caches.
+ */
+ ret += page_launder(gfp_mask);
+ ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+ ret += shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
- shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+ ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
- return nr_pages;
-}
+ /*
+ * Move pages from the active list to the inactive list.
+ */
+ refill_inactive();
-int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int order)
-{
- int priority = DEF_PRIORITY;
- int nr_pages = SWAP_CLUSTER_MAX;
+ /*
+ * Reclaim unused slab cache memory.
+ */
+ ret += kmem_cache_reap(gfp_mask);
- gfp_mask = pf_gfp_mask(gfp_mask);
- do {
- nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
- if (nr_pages <= 0)
- return 1;
- } while (--priority);
+ refill_freelist();
+
+ /* Start IO when needed. */
+ if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
+ run_task_queue(&tq_disk);
/*
* Hmm.. Cache shrink failed - time to kill something?
* Mhwahahhaha! This is the part I really like. Giggle.
*/
- out_of_memory();
- return 0;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-
-static int check_classzone_need_balance(zone_t * classzone)
-{
- zone_t * first_classzone;
+ if (!ret && free_low(ANY_ZONE) > 0)
+ out_of_memory();
- first_classzone = classzone->zone_pgdat->node_zones;
- while (classzone >= first_classzone) {
- if (classzone->free_pages > classzone->pages_high)
- return 0;
- classzone--;
- }
- return 1;
+ return ret;
}
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+/**
+ * refill_freelist - move inactive_clean pages to free list if needed
+ *
+ * Move some pages from the inactive_clean lists to the free
+ * lists so atomic allocations have pages to work from. This
+ * function really only does real work when there is no userspace
+ * allocation load on __alloc_pages().
+ *
+ * We refill the freelist in a bump from pages_min to pages_min * 2
+ * in order to give the buddy allocator something to play with.
+ */
+static void refill_freelist(void)
{
- int need_more_balance = 0, i;
+ struct page * page;
zone_t * zone;
- for (i = pgdat->nr_zones-1; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (unlikely(current->need_resched))
- schedule();
- if (!zone->need_balance)
- continue;
- if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
- zone->need_balance = 0;
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ);
+ for_each_zone(zone) {
+ if (!zone->size || zone->free_pages >= zone->pages_min)
continue;
- }
- if (check_classzone_need_balance(zone))
- need_more_balance = 1;
- else
- zone->need_balance = 0;
- }
-
- return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
- int need_more_balance;
- pg_data_t * pgdat;
-
- do {
- need_more_balance = 0;
- pgdat = pgdat_list;
- do
- need_more_balance |= kswapd_balance_pgdat(pgdat);
- while ((pgdat = pgdat->node_next));
- } while (need_more_balance);
-}
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
- zone_t * zone;
- int i;
-
- for (i = pgdat->nr_zones-1; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (!zone->need_balance)
- continue;
- return 0;
+ while (zone->free_pages < zone->pages_min * 2) {
+ page = reclaim_page(zone);
+ if (!page)
+ break;
+ __free_page(page);
+ }
}
-
- return 1;
-}
-
-static int kswapd_can_sleep(void)
-{
- pg_data_t * pgdat;
-
- pgdat = pgdat_list;
- do {
- if (kswapd_can_sleep_pgdat(pgdat))
- continue;
- return 0;
- } while ((pgdat = pgdat->node_next));
-
- return 1;
}
/*
@@ -705,7 +656,6 @@
int kswapd(void *unused)
{
struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
daemonize();
strcpy(tsk->comm, "kswapd");
@@ -729,24 +679,156 @@
* Kswapd main loop.
*/
for (;;) {
- __set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kswapd_wait, &wait);
+ static long recalc = 0;
- mb();
- if (kswapd_can_sleep())
- schedule();
+ /*
+ * We try to rebalance the VM either when we have a
+ * global shortage of free pages or when one particular
+ * zone is very short on free pages.
+ */
+ if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0)
+ do_try_to_free_pages(GFP_KSWAPD);
+
+ refill_freelist();
+
+ /* Once a second ... */
+ if (time_after(jiffies, recalc + HZ)) {
+ recalc = jiffies;
+
+ /* Do background page aging. */
+ background_aging(DEF_PRIORITY);
+ }
+
+ wakeup_memwaiters();
+ }
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+#define VM_SHOULD_SLEEP (free_low(ALL_ZONES) > (freepages.min / 2))
+
+/**
+ * wakeup_kswapd - wake up the pageout daemon
+ * @gfp_mask: page freeing flags
+ *
+ * This function wakes up kswapd and can, under heavy VM pressure,
+ * put the calling task to sleep temporarily.
+ */
+void wakeup_kswapd(unsigned int gfp_mask)
+{
+ DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_RUNNING);
+ /* If we're in the memory freeing business ourselves, don't sleep
+ * but just wake kswapd and go back to business.
+ */
+ if (current->flags & PF_MEMALLOC) {
+ wake_up_interruptible(&kswapd_wait);
+ return;
+ }
+
+ /* We need all of kswapd's GFP flags, otherwise we can't sleep on it.
+ * We still wake kswapd of course.
+ */
+ if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) {
+ wake_up_interruptible(&kswapd_wait);
+ return;
+ }
+
+ add_wait_queue(&kswapd_done, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* Wake kswapd .... */
+ wake_up_interruptible(&kswapd_wait);
+
+ /* ... and check if we need to wait on it */
+ if (VM_SHOULD_SLEEP)
+ schedule();
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kswapd_done, &wait);
+}
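
The "(gfp_mask & GFP_KSWAPD) != GFP_KSWAPD" test in wakeup_kswapd() is a subset check: a caller may only block on kswapd_done if its own allocation mask permits at least everything kswapd itself may do, presumably so that a GFP_NOFS/GFP_NOIO style caller holding filesystem or I/O locks is never parked behind pageout work that needs those locks. A tiny self-contained illustration of that subset test, using made-up flag values rather than the real __GFP_* bits:

/* Illustration only; the flag values are invented, not the kernel's. */
#include <stdio.h>

#define F_WAIT	0x1
#define F_IO	0x2
#define F_FS	0x4
#define F_KSWAPD (F_WAIT | F_IO | F_FS)	/* stands in for GFP_KSWAPD */

static int may_wait_for_kswapd(unsigned int gfp_mask)
{
	/* same shape as the test in wakeup_kswapd() above */
	return (gfp_mask & F_KSWAPD) == F_KSWAPD;
}

int main(void)
{
	printf("GFP_KERNEL-like caller: %d\n", may_wait_for_kswapd(F_KSWAPD));		/* 1 */
	printf("GFP_NOFS-like caller:   %d\n", may_wait_for_kswapd(F_WAIT | F_IO));	/* 0 */
	return 0;
}
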
+
+static void wakeup_memwaiters(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* Enough free RAM, we can easily keep up with memory demand. */
+ add_wait_queue(&kswapd_wait, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (free_high(ALL_ZONES) <= 0) {
+ wake_up(&kswapd_done);
+ schedule_timeout(HZ);
remove_wait_queue(&kswapd_wait, &wait);
+ return;
+ }
+ remove_wait_queue(&kswapd_wait, &wait);
- /*
- * If we actually get into a low-memory situation,
- * the processes needing more memory will wake us
- * up on a more timely basis.
- */
- kswapd_balance();
- run_task_queue(&tq_disk);
+ /*
+ * kswapd is going to sleep for a long time. Wake up the waiters to
+ * prevent them from getting stuck while waiting for us.
+ */
+ wake_up(&kswapd_done);
+
+ /* OK, the VM is very loaded. Sleep instead of using all CPU. */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ / 4);
+ return;
+}
+
+/**
+ * try_to_free_pages - run the pageout code ourselves
+ * @gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * When the load on the system gets higher, it can happen
+ * that kswapd no longer manages to keep enough memory
+ * free. In those cases user programs allocating memory
+ * will call try_to_free_pages() and help the pageout code.
+ * This has the effect of freeing memory and slowing down
+ * the largest memory hogs a bit.
+ */
+int try_to_free_pages(unsigned int gfp_mask)
+{
+ int ret = 1;
+
+ gfp_mask = pf_gfp_mask(gfp_mask);
+ if (gfp_mask & __GFP_WAIT) {
+ current->flags |= PF_MEMALLOC;
+ ret = do_try_to_free_pages(gfp_mask);
+ current->flags &= ~PF_MEMALLOC;
}
+
+ return ret;
+}
+
+/**
+ * rss_free_pages - run part of the pageout code and slow down a bit
+ * @gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * This function is called when a task is over its RSS limit and
+ * has a page fault. Its goal is to free some memory so non-hogs
+ * can run faster, and to slow the faulting task down when needed
+ * so it won't eat the memory non-hogs can use.
+ */
+void rss_free_pages(unsigned int gfp_mask)
+{
+ long pause = 0;
+
+ if (current->flags & PF_MEMALLOC)
+ return;
+
+ current->flags |= PF_MEMALLOC;
+
+ do {
+ page_launder(gfp_mask);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(pause);
+ set_current_state(TASK_RUNNING);
+ pause++;
+ } while (free_high(ALL_ZONES) >= 0);
+
+ current->flags &= ~PF_MEMALLOC;
+ return;
}
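
A note on the throttle in rss_free_pages(): pause starts at 0 and grows by one jiffy per pass, so a task that needs k passes through the loop has slept roughly 0 + 1 + ... + (k-1) = k*(k-1)/2 jiffies in total. With HZ=100, 20 passes already amount to about 1.9 seconds of enforced sleep, which is what slows an over-RSS memory hog down progressively instead of merely freeing memory on its behalf.
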
static int __init kswapd_init(void)
|