xfs
[Top] [All Lists]

Re: rmap vm and xfs

To: linux-xfs@xxxxxxxxxxx
Subject: Re: rmap vm and xfs
From: Knut J Bjuland <knutjbj@xxxxxxxxx>
Date: Sun, 10 Mar 2002 09:20:32 +0100
References: <3C8B15B9.4ECAC60B@xxxxxxxxx>
Sender: owner-linux-xfs@xxxxxxxxxxx
Knut J Bjuland wrote:

> Are there any plans to make xfs compatible with Riel rmap vm.patch.
> This'll make it easier to integrate it into Redhat 8.X when it ships, I
> believe it'll be based on a linux 2.4.17 or later with rmap patch.

send along a log off patching
# This is a BitKeeper generated patch for the following project:
# Project Name: Long-term Linux VM development
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#                  ChangeSet    linux-2.4.19-pre2 -> 1.207  
#       include/asm-sparc64/pgtable.h   1.14.1.1 -> 1.16   
#                 fs/inode.c    1.32    -> 1.33   
#       drivers/block/ll_rw_blk.c       1.27.1.2 -> 1.30   
#       include/linux/swapctl.h 1.2     -> 1.7    
#       include/linux/mmzone.h  1.6.1.1 -> 1.15   
#        fs/proc/proc_misc.c    1.11    -> 1.14   
#       include/linux/sched.h   1.20.1.4 -> 1.26   
#       drivers/char/agp/agpgart_be.c   1.20.1.3 -> 1.25   
#       include/linux/swap.h    1.31    -> 1.41   
#       include/asm-s390/pgtable.h      1.4.1.1 -> 1.6    
#                  mm/slab.c    1.12    -> 1.13   
#                mm/vmscan.c    1.53.1.4 -> 1.97   
#                 fs/dquot.c    1.16    -> 1.17   
#       drivers/char/drm/i810_dma.c     1.5.1.1 -> 1.10   
#                  mm/mmap.c    1.18.1.2 -> 1.21   
#       include/asm-i386/pgtable.h      1.4.1.1 -> 1.7    
#                fs/dcache.c    1.15.1.1 -> 1.17   
#       include/asm-s390x/pgtable.h     1.4.1.1 -> 1.6    
#       include/asm-sparc/pgtable.h     1.4.1.1 -> 1.6    
#       include/asm-sh/pgtable.h        1.6.1.1 -> 1.8    
#       include/asm-arm/pgtable.h       1.5.1.1 -> 1.7    
#                mm/memory.c    1.41.1.6 -> 1.49   
#                mm/mremap.c    1.5     -> 1.6    
#                fs/buffer.c    1.44.1.11 -> 1.55   
#         include/linux/mm.h    1.29.1.6 -> 1.46   
#               mm/filemap.c    1.46.1.8 -> 1.54   
#               mm/bootmem.c    1.6     -> 1.7    
#            mm/page_alloc.c    1.39.1.2 -> 1.62   
#            kernel/sysctl.c    1.13.1.3 -> 1.17   
#       include/asm-i386/pgalloc.h      1.8     -> 1.11   
#       arch/arm/mm/mm-armv.c   1.4     -> 1.5    
#       include/asm-mips/pgtable.h      1.3.1.1 -> 1.5    
#       include/linux/slab.h    1.8     -> 1.9    
#                  mm/swap.c    1.16    -> 1.24   
#            mm/swap_state.c    1.17    -> 1.20   
#         include/linux/fs.h    1.49.1.5 -> 1.53   
#       include/asm-alpha/pgtable.h     1.7.1.1 -> 1.9    
#       include/linux/pagemap.h 1.15.1.2 -> 1.19   
#              mm/oom_kill.c    1.9     -> 1.12   
#             kernel/ksyms.c    1.40.1.7 -> 1.46   
#                   Makefile    1.135.1.16 -> 1.144  
#              kernel/fork.c    1.18.1.2 -> 1.22   
#               kernel/sys.c    1.8.1.1 -> 1.10   
#                mm/Makefile    1.3.1.2 -> 1.7    
#       include/asm-mips64/pgtable.h    1.3.1.1 -> 1.5    
#       arch/i386/kernel/setup.c        1.32.1.3 -> 1.35   
#              mm/swapfile.c    1.20.1.2 -> 1.23   
#       include/linux/elevator.h        1.4     -> 1.5    
#       include/linux/sysctl.h  1.10.1.2 -> 1.13   
#       drivers/block/elevator.c        1.5     -> 1.7    
#                  fs/exec.c    1.17.1.2 -> 1.19   
#       include/asm-ia64/pgtable.h      1.6.1.1 -> 1.8    
#       include/asm-cris/pgtable.h      1.4.1.2 -> 1.7    
#       include/asm-parisc/pgtable.h    1.2.1.1 -> 1.4    
#        arch/i386/config.in    1.21.1.3 -> 1.24   
#       include/asm-ppc/pgtable.h       1.7.1.1 -> 1.9    
#                      (new)            -> 1.15    include/linux/mm_inline.h
#                      (new)            -> 1.1     include/asm-arm/rmap.h
#                      (new)            -> 1.1     include/asm-parisc/rmap.h
#                      (new)            -> 1.1     include/asm-s390/rmap.h
#                      (new)            -> 1.1     include/asm-mips/rmap.h
#                      (new)            -> 1.1     include/asm-ia64/rmap.h
#                      (new)            -> 1.1     include/asm-s390x/rmap.h
#                      (new)            -> 1.14    mm/rmap.c      
#                      (new)            -> 1.2     include/asm-cris/rmap.h
#                      (new)            -> 1.1     include/asm-sparc/rmap.h
#                      (new)            -> 1.1     
include/asm-arm/proc-armv/rmap.h
#                      (new)            -> 1.9     mm/TODO        
#                      (new)            -> 1.1     include/asm-mips64/rmap.h
#                      (new)            -> 1.34    Changelog.rmap 
#                      (new)            -> 1.1     include/asm-alpha/rmap.h
#                      (new)            -> 1.2     include/asm-generic/rmap.h
#                      (new)            -> 1.2     include/asm-i386/rmap.h
#                      (new)            -> 1.1     include/asm-sparc64/rmap.h
#                      (new)            -> 1.2     include/asm-ppc/rmap.h
#                      (new)            -> 1.1     include/asm-m68k/rmap.h
#                      (new)            -> 1.1     include/asm-sh/rmap.h
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/02/28      marcelo@xxxxxxxxxxxxxxxx        1.130.1.29
# linux-2.4.19-pre2:
# - -ac merge                                           (Alan Cox)
# - Huge MIPS/MIPS64 merge                              (Ralf Baechle)
# - IA64 update                                         (David Mosberger)
# - PPC update                                          (Tom Rini)
# - Shrink struct page                                  (Rik van Riel)
# - QNX4 update (now its able to mount QNX 6.1 fses)    (Anders Larsen)
# - Make max_map_count sysctl configurable              (Christoph Hellwig)
# - matroxfb update                                     (Petr Vandrovec)
# - ymfpci update                                               (Pete Zaitcev)
# - LVM update                                          (Heinz J . Mauelshagen)
# - btaudio driver update                                       (Gerd Knorr)
# - bttv update                                         (Gerd Knorr)
# - Out of line code cleanup                            (Keith Owens)
# - Add watchdog API documentation                      (Christer Weinigel)
# - Rivafb update                                               (Ani Joshi)
# - Enable PCI buses above quad0 on NUMA-Q              (Martin J. Bligh)
# - Fix PIIX IDE slave PCI timings                      (Dave Bogdanoff)
# - Make PLIP work again                                        (Tim Waugh)
# - Remove unecessary printk from lp.c                  (Tim Waugh)
# - Make parport_daisy_select work for ECP/EPP modes    (Max Vorobiev)
# - Support O_NONBLOCK on lp/ppdev correctly            (Tim Waugh)
# - Add PCI card hooks to parport                               (Tim Waugh)
# - Compaq cciss driver fixes                           (Stephen Cameron)
# - VFS cleanups and fixes                              (Alexander Viro)
# - USB update (including USB 2.0 support)              (Greg KH)
# - More jiffies compare cleanups                               (Tim Schmielau)
# - PCI hotplug update                                  (Greg KH)
# - bluesmoke fixes                                     (Dave Jones)
# - Fix off-by-one in ide-scsi                          (John Fremlin)
# - Fix warnings in make xconfig                                (René Scharfe)
# - Make x86 MCE a configure option                     (Paul Gortmaker)
# - Small ramdisk fixes                                 (Christoph Hellwig)
# - Add missing atime update to pipe code                       (Christoph 
Hellwig)
# - Serialize microcode access                          (Tigran Aivazian)
# - AMD Elan handling on serial.c                               (Robert 
Schwebel)
# --------------------------------------------
# 02/02/28      riel@xxxxxxxxxxxxxxxxxxxx       1.205
# merged
# --------------------------------------------
# 02/02/28      riel@xxxxxxxxxxxxxxxxxxxx       1.206
# remove obsolete code
# --------------------------------------------
# 02/02/28      riel@xxxxxxxxxxxxxxxxxxxx       1.207
# some more merging cleanups
# --------------------------------------------
#
diff -Nru a/Changelog.rmap b/Changelog.rmap
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/Changelog.rmap    Fri Mar  1 18:19:44 2002
@@ -0,0 +1,142 @@
+The seventh maintenance release of the 12th version of the reverse
+mapping based VM is now available.
+This is an attempt at making a more robust and flexible VM
+subsystem, while cleaning up a lot of code at the same time.
+The patch is available from:
+
+           http://surriel.com/patches/2.4/2.4.19p1-rmap-12g
+and        http://linuxvm.bkbits.net/
+
+
+My big TODO items for a next release are:
+  - page launder
+
+  - drop pte quicklist in anticipation of pte-highmem     (me)
+  - replace andrea's highmem emulation by ingo's one      (me)
+rmap 12g:
+  - port to armv architecture                             (David Woodhouse)
+  - NUMA fix to zone_table initialisation                 (Samuel Ortiz)
+  - remove init_page_count                                (David Miller)
+rmap 12f:
+  - for_each_pgdat macro                                  (William Lee Irwin)
+  - put back EXPORT(__find_get_page) for modular rd       (me)
+  - make bdflush and kswapd actually start queued disk IO (me)
+rmap 12e
+  - RSS limit fix, the limit can be 0 for some reason     (me)
+  - clean up for_each_zone define to not need pgdata_t    (William Lee Irwin)
+  - fix i810_dma bug introduced with page->wait removal   (William Lee Irwin)
+rmap 12d:
+  - fix compiler warning in rmap.c                        (Roger Larsson)
+  - read latency improvement   (read-latency2)            (Andrew Morton)
+rmap 12c:
+  - fix small balancing bug in page_launder_zone          (Nick Piggin)
+  - wakeup_kswapd / wakeup_memwaiters code fix            (Arjan van de Ven)
+  - improve RSS limit enforcement                         (me)
+rmap 12b:
+  - highmem emulation (for debugging purposes)            (Andrea Arcangeli)
+  - ulimit RSS enforcement when memory gets tight         (me)
+  - sparc64 page->virtual quickfix                        (Greg Procunier)
+rmap 12a:
+  - fix the compile warning in buffer.c                   (me)
+  - fix divide-by-zero on highmem initialisation  DOH!    (me)
+  - remove the pgd quicklist (suspicious ...)             (DaveM, me)
+rmap 12:
+  - keep some extra free memory on large machines         (Arjan van de Ven, 
me)
+  - higher-order allocation bugfix                        (Adrian Drzewiecki)
+  - nr_free_buffer_pages() returns inactive + free mem    (me)
+  - pages from unused objects directly to inactive_clean  (me)
+  - use fast pte quicklists on non-pae machines           (Andrea Arcangeli)
+  - remove sleep_on from wakeup_kswapd                    (Arjan van de Ven)
+  - page waitqueue cleanup                                (Christoph Hellwig)
+rmap 11c:
+  - oom_kill race locking fix                             (Andres Salomon)
+  - elevator improvement                                  (Andrew Morton)
+  - dirty buffer writeout speedup (hopefully ;))          (me)
+  - small documentation updates                           (me)
+  - page_launder() never does synchronous IO, kswapd
+    and the processes calling it sleep on higher level    (me)
+  - deadlock fix in touch_page()                          (me)
+rmap 11b:
+  - added low latency reschedule points in vmscan.c       (me)
+  - make i810_dma.c include mm_inline.h too               (William Lee Irwin)
+  - wake up kswapd sleeper tasks on OOM kill so the
+    killed task can continue on its way out               (me)
+  - tune page allocation sleep point a little             (me)
+rmap 11a:
+  - don't let refill_inactive() progress count for OOM    (me)
+  - after an OOM kill, wait 5 seconds for the next kill   (me)
+  - agpgart_be fix for hashed waitqueues                  (William Lee Irwin)
+rmap 11:
+  - fix stupid logic inversion bug in wakeup_kswapd()     (Andrew Morton)
+  - fix it again in the morning                           (me)
+  - add #ifdef BROKEN_PPC_PTE_ALLOC_ONE to rmap.h, it
+    seems PPC calls pte_alloc() before mem_map[] init     (me)
+  - disable the debugging code in rmap.c ... the code
+    is working and people are running benchmarks          (me)
+  - let the slab cache shrink functions return a value
+    to help prevent early OOM killing                     (Ed Tomlinson)
+  - also, don't call the OOM code if we have enough
+    free pages                                            (me)
+  - move the call to lru_cache_del into __free_pages_ok   (Ben LaHaise)
+  - replace the per-page waitqueue with a hashed
+    waitqueue, reduces size of struct page from 64
+    bytes to 52 bytes (48 bytes on non-highmem machines)  (William Lee Irwin)
+rmap 10:
+  - fix the livelock for real (yeah right), turned out
+    to be a stupid bug in page_launder_zone()             (me)
+  - to make sure the VM subsystem doesn't monopolise
+    the CPU, let kswapd and some apps sleep a bit under
+    heavy stress situations                               (me)
+  - let __GFP_HIGH allocations dig a little bit deeper
+    into the free page pool, the SCSI layer seems fragile (me)
+rmap 9:
+  - improve comments all over the place                   (Michael Cohen)
+  - don't panic if page_remove_rmap() cannot find the
+    rmap in question, it's possible that the memory was
+    PG_reserved and belonging to a driver, but the driver
+    exited and cleared the PG_reserved bit                (me)
+  - fix the VM livelock by replacing > by >= in a few
+    critical places in the pageout code                   (me)
+  - treat the reclaiming of an inactive_clean page like
+    allocating a new page, calling try_to_free_pages()
+    and/or fixup_freespace() if required                  (me)
+  - when low on memory, don't make things worse by
+    doing swapin_readahead                                (me)
+rmap 8:
+  - add ANY_ZONE to the balancing functions to improve
+    kswapd's balancing a bit                              (me)
+  - regularize some of the maximum loop bounds in
+    vmscan.c for cosmetic purposes                        (William Lee Irwin)
+  - move page_address() to architecture-independent
+    code, now the removal of page->virtual is portable    (William Lee Irwin)
+  - speed up free_area_init_core() by doing a single
+    pass over the pages and not using atomic ops          (William Lee Irwin)
+  - documented the buddy allocator in page_alloc.c        (William Lee Irwin)
+rmap 7:
+  - clean up and document vmscan.c                        (me)
+  - reduce size of page struct, part one                  (William Lee Irwin)
+  - add rmap.h for other archs (untested, not for ARM)    (me)
+rmap 6:
+  - make the active and inactive_dirty list per zone,
+    this is finally possible because we can free pages
+    based on their physical address                       (William Lee Irwin)
+  - cleaned up William's code a bit                       (me)
+  - turn some defines into inlines and move those to
+    mm_inline.h (the includes are a mess ...)             (me)
+  - improve the VM balancing a bit                        (me)
+  - add back inactive_target to /proc/meminfo             (me)
+rmap 5:
+  - fixed recursive buglet, introduced by directly
+    editing the patch for making rmap 4 ;)))              (me)
+rmap 4:
+  - look at the referenced bits in page tables            (me)
+rmap 3:
+  - forgot one FASTCALL definition                        (me)
+rmap 2:
+  - teach try_to_unmap_one() about mremap()               (me)
+  - don't assign swap space to pages with buffers         (me)
+  - make the rmap.c functions FASTCALL / inline           (me)
+rmap 1:
+  - fix the swap leak in rmap 0                           (Dave McCracken)
+rmap 0:
+  - port of reverse mapping VM to 2.4.16                  (me)
diff -Nru a/Makefile b/Makefile
--- a/Makefile  Fri Mar  1 18:19:44 2002
+++ b/Makefile  Fri Mar  1 18:19:44 2002
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 19
-EXTRAVERSION = -pre2
+EXTRAVERSION = -pre2-rmap12g
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -Nru a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
--- a/arch/arm/mm/mm-armv.c     Fri Mar  1 18:19:44 2002
+++ b/arch/arm/mm/mm-armv.c     Fri Mar  1 18:19:44 2002
@@ -19,6 +19,7 @@
 #include <asm/page.h>
 #include <asm/io.h>
 #include <asm/setup.h>
+#include <asm/rmap.h>
 
 #include <asm/mach/map.h>
 
@@ -457,6 +458,7 @@
  * cache implementation.
  */
 kmem_cache_t *pte_cache;
+kmem_cache_t *pte_rmap_cache;
 
 /*
  * The constructor gets called for each object within the cache when the
@@ -467,6 +469,22 @@
 {
        unsigned long block = (unsigned long)pte;
 
+       if (!(block & 2048)) {
+               /* First object of two in a page - allocate the 
+                  pte_rmap_info to go with them */
+
+               struct page * page = virt_to_page(pte);
+
+               if (flags & SLAB_CTOR_ATOMIC)
+                       BUG();
+
+               page->mapping = kmem_cache_alloc(pte_rmap_cache, GFP_KERNEL);
+               if (!page->mapping) {
+                       printk(KERN_CRIT "pte_rmap_cache alloc failed. Oops. 
Slab constructors need to be allowed to fail\n");
+                       /* return -ENOMEM; */
+                       BUG();
+               }
+       }
        if (block & 2047)
                BUG();
 
@@ -475,11 +493,32 @@
                        PTRS_PER_PTE * sizeof(pte_t), 0);
 }
 
+static void pte_cache_dtor(void *pte, kmem_cache_t *cache, unsigned long flags)
+{
+       unsigned long block = (unsigned long)pte;
+
+       if (!(block & 2048)) {
+               /* First object of two in a page - free the 
+                  pte_rmap_info that was associated with them */
+
+               struct page * page = virt_to_page(pte);
+
+               kmem_cache_free(pte_rmap_cache, page->mapping);
+               page->mapping = NULL;
+       }
+}
+
 void __init pgtable_cache_init(void)
 {
+       pte_rmap_cache = kmem_cache_create("pte-rmap-cache",
+                               2 * sizeof(struct arm_rmap_info), 0, 0,
+                               NULL, NULL);
+       if (!pte_rmap_cache)
+               BUG();
+
        pte_cache = kmem_cache_create("pte-cache",
                                2 * PTRS_PER_PTE * sizeof(pte_t), 0, 0,
-                               pte_cache_ctor, NULL);
+                               pte_cache_ctor, pte_cache_dtor);
        if (!pte_cache)
                BUG();
 }
diff -Nru a/drivers/block/elevator.c b/drivers/block/elevator.c
--- a/drivers/block/elevator.c  Fri Mar  1 18:19:44 2002
+++ b/drivers/block/elevator.c  Fri Mar  1 18:19:44 2002
@@ -80,30 +80,38 @@
                         struct buffer_head *bh, int rw,
                         int max_sectors)
 {
-       struct list_head *entry = &q->queue_head;
-       unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
-
+       struct list_head *entry;
+       unsigned int count = bh->b_size >> 9;
+       unsigned int ret = ELEVATOR_NO_MERGE;
+       int merge_only = 0;
+       const int max_bomb_segments = q->elevator.max_bomb_segments;
+ 
+       entry = &q->queue_head;
        while ((entry = entry->prev) != head) {
                struct request *__rq = blkdev_entry_to_request(entry);
 
-               /*
-                * simply "aging" of requests in queue
-                */
-               if (__rq->elevator_sequence-- <= 0)
-                       break;
-
+               if (__rq->elevator_sequence-- <= 0) {
+                       /*
+                        * OK, we've exceeded someone's latency limit.
+                        * But we still continue to look for merges,
+                        * because they're so much better than seeks.
+                        */
+                       merge_only = 1;
+               }
                if (__rq->waiting)
                        continue;
                if (__rq->rq_dev != bh->b_rdev)
                        continue;
-               if (!*req && bh_rq_in_between(bh, __rq, &q->queue_head))
+               if (!*req && !merge_only &&
+                       bh_rq_in_between(bh, __rq, &q->queue_head)) {
                        *req = __rq;
+               }
                if (__rq->cmd != rw)
                        continue;
                if (__rq->nr_sectors + count > max_sectors)
                        continue;
                if (__rq->elevator_sequence < count)
-                       break;
+                       merge_only = 1;
                if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
                        ret = ELEVATOR_BACK_MERGE;
                        *req = __rq;
@@ -116,6 +124,56 @@
                }
        }
 
+       /*
+        * If we failed to merge a read anywhere in the request
+        * queue, we really don't want to place it at the end
+        * of the list, behind lots of writes.  So place it near
+        * the front.
+        *
+        * We don't want to place it in front of _all_ writes: that
+        * would create lots of seeking, and isn't tunable.
+        * We try to avoid promoting this read in front of existing
+        * reads.
+        *
+        * max_bomb_sectors becomes the maximum number of write
+        * requests which we allow to remain in place in front of
+        * a newly introduced read.  We weight things a little bit,
+        * so large writes are more expensive than small ones, but it's
+        * requests which count, not sectors.
+        */
+       if (max_bomb_segments && rw == READ && ret == ELEVATOR_NO_MERGE) {
+               int cur_latency = 0;
+               struct request * const cur_request = *req;
+
+               entry = head->next;
+               while (entry != &q->queue_head) {
+                       struct request *__rq;
+
+                       if (entry == &q->queue_head)
+                               BUG();
+                       if (entry == q->queue_head.next &&
+                                       q->head_active && !q->plugged)
+                               BUG();
+                       __rq = blkdev_entry_to_request(entry);
+
+                       if (__rq == cur_request) {
+                               /*
+                                * This is where the old algorithm placed it.
+                                * There's no point pushing it further back,
+                                * so leave it here, in sorted order.
+                                */
+                               break;
+                       }
+                       if (__rq->cmd == WRITE) {
+                               cur_latency += 1 + __rq->nr_sectors / 64;
+                               if (cur_latency >= max_bomb_segments) {
+                                       *req = __rq;
+                                       break;
+                               }
+                       }
+                       entry = entry->next;
+               }
+       }
        return ret;
 }
 
@@ -188,7 +246,7 @@
        output.queue_ID                 = elevator->queue_ID;
        output.read_latency             = elevator->read_latency;
        output.write_latency            = elevator->write_latency;
-       output.max_bomb_segments        = 0;
+       output.max_bomb_segments        = elevator->max_bomb_segments;
 
        if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t)))
                return -EFAULT;
@@ -207,9 +265,12 @@
                return -EINVAL;
        if (input.write_latency < 0)
                return -EINVAL;
+       if (input.max_bomb_segments < 0)
+               return -EINVAL;
 
        elevator->read_latency          = input.read_latency;
        elevator->write_latency         = input.write_latency;
+       elevator->max_bomb_segments     = input.max_bomb_segments;
        return 0;
 }
 
diff -Nru a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
--- a/drivers/block/ll_rw_blk.c Fri Mar  1 18:19:44 2002
+++ b/drivers/block/ll_rw_blk.c Fri Mar  1 18:19:44 2002
@@ -1176,9 +1176,11 @@
         * Free request slots per queue.
         * (Half for reads, half for writes)
         */
-       queue_nr_requests = 64;
-       if (total_ram > MB(32))
-               queue_nr_requests = 128;
+       queue_nr_requests = (total_ram >> 9) & ~15;     /* One per 
half-megabyte */
+       if (queue_nr_requests < 32)
+               queue_nr_requests = 32;
+       if (queue_nr_requests > 1024)
+               queue_nr_requests = 1024;
 
        /*
         * Batch frees according to queue length
diff -Nru a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c       Fri Mar  1 18:19:44 2002
+++ b/fs/buffer.c       Fri Mar  1 18:19:44 2002
@@ -47,6 +47,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/completion.h>
+#include <linux/mm_inline.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -729,11 +730,9 @@
 
 static void free_more_memory(void)
 {
-       zone_t * zone = contig_page_data.node_zonelists[GFP_NOFS & 
GFP_ZONEMASK].zones[0];
-       
        balance_dirty();
        wakeup_bdflush();
-       try_to_free_pages(zone, GFP_NOFS, 0);
+       try_to_free_pages(GFP_NOFS);
        run_task_queue(&tq_disk);
        current->policy |= SCHED_YIELD;
        __set_current_state(TASK_RUNNING);
@@ -1046,7 +1045,6 @@
        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
 
        dirty = size_buffers_type[BUF_DIRTY] >> PAGE_SHIFT;
-       dirty += size_buffers_type[BUF_LOCKED] >> PAGE_SHIFT;
        tot = nr_free_buffer_pages();
 
        dirty *= 100;
@@ -1078,18 +1076,17 @@
                return;
 
        /* If we're getting into imbalance, start write-out */
-       spin_lock(&lru_list_lock);
-       write_some_buffers(NODEV);
+       wakeup_bdflush();
 
        /*
         * And if we're _really_ out of balance, wait for
-        * some of the dirty/locked buffers ourselves and
-        * start bdflush.
+        * some of the dirty/locked buffers ourselves.
         * This will throttle heavy writers.
         */
        if (state > 0) {
+               spin_lock(&lru_list_lock);
+               write_some_buffers(NODEV);
                wait_for_some_buffers(NODEV);
-               wakeup_bdflush();
        }
 }
 
@@ -2592,10 +2589,9 @@
        return 1;
 }
 
-static int sync_page_buffers(struct buffer_head *head)
+static void sync_page_buffers(struct buffer_head *head)
 {
        struct buffer_head * bh = head;
-       int tryagain = 0;
 
        do {
                if (!buffer_dirty(bh) && !buffer_locked(bh))
@@ -2605,15 +2601,11 @@
                if (!test_and_set_bit(BH_Wait_IO, &bh->b_state))
                        continue;
 
-               /* Second time through we start actively writing out.. */
-               if (test_and_set_bit(BH_Lock, &bh->b_state)) {
-                       if (!test_bit(BH_launder, &bh->b_state))
-                               continue;
-                       wait_on_buffer(bh);
-                       tryagain = 1;
+               /* If we cannot lock the buffer just skip it. */
+               if (test_and_set_bit(BH_Lock, &bh->b_state))
                        continue;
-               }
 
+               /* Second time through we start actively writing out.. */
                if (!atomic_set_buffer_clean(bh)) {
                        unlock_buffer(bh);
                        continue;
@@ -2624,10 +2616,9 @@
                set_bit(BH_launder, &bh->b_state);
                bh->b_end_io = end_buffer_io_sync;
                submit_bh(WRITE, bh);
-               tryagain = 0;
        } while ((bh = bh->b_this_page) != head);
 
-       return tryagain;
+       return;
 }
 
 /*
@@ -2651,7 +2642,6 @@
 {
        struct buffer_head * tmp, * bh = page->buffers;
 
-cleaned_buffers_try_again:
        spin_lock(&lru_list_lock);
        write_lock(&hash_table_lock);
        tmp = bh;
@@ -2694,15 +2684,9 @@
        write_unlock(&hash_table_lock);
        spin_unlock(&lru_list_lock);
        gfp_mask = pf_gfp_mask(gfp_mask);
-       if (gfp_mask & __GFP_IO) {
-               if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
-                       if (sync_page_buffers(bh)) {
-                               /* no IO or waiting next time */
-                               gfp_mask = 0;
-                               goto cleaned_buffers_try_again;
-                       }
-               }
-       }
+       if ((gfp_mask & __GFP_IO) &&
+                       ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)))
+               sync_page_buffers(bh);
        if (balance_dirty_state() >= 0)
                wakeup_bdflush();
        return 0;
@@ -2951,7 +2935,7 @@
 
                spin_lock(&lru_list_lock);
                if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
-                       wait_for_some_buffers(NODEV);
+                       run_task_queue(&tq_disk);
                        interruptible_sleep_on(&bdflush_wait);
                }
        }
@@ -2982,7 +2966,6 @@
        complete((struct completion *)startup);
 
        for (;;) {
-               wait_for_some_buffers(NODEV);
 
                /* update interval */
                interval = bdf_prm.b_un.interval;
@@ -3011,6 +2994,7 @@
                printk(KERN_DEBUG "kupdate() activated...\n");
 #endif
                sync_old_buffers();
+               run_task_queue(&tq_disk);
        }
 }
 
diff -Nru a/fs/dcache.c b/fs/dcache.c
--- a/fs/dcache.c       Fri Mar  1 18:19:44 2002
+++ b/fs/dcache.c       Fri Mar  1 18:19:44 2002
@@ -568,8 +568,7 @@
        count = dentry_stat.nr_unused / priority;
 
        prune_dcache(count);
-       kmem_cache_shrink(dentry_cache);
-       return 0;
+       return kmem_cache_shrink_nr(dentry_cache);
 }
 
 #define NAME_ALLOC_LEN(len)    ((len+16) & ~15)
diff -Nru a/fs/dquot.c b/fs/dquot.c
--- a/fs/dquot.c        Fri Mar  1 18:19:44 2002
+++ b/fs/dquot.c        Fri Mar  1 18:19:44 2002
@@ -413,8 +413,7 @@
        lock_kernel();
        prune_dqcache(nr_free_dquots / (priority + 1));
        unlock_kernel();
-       kmem_cache_shrink(dquot_cachep);
-       return 0;
+       return kmem_cache_shrink_nr(dquot_cachep);
 }
 
 /* NOTE: If you change this function please check whether dqput_blocks() works 
right... */
diff -Nru a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c Fri Mar  1 18:19:44 2002
+++ b/fs/exec.c Fri Mar  1 18:19:44 2002
@@ -35,6 +35,7 @@
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/personality.h>
+#include <linux/swap.h>
 #define __NO_VERSION__
 #include <linux/module.h>
 
@@ -279,6 +280,7 @@
        flush_dcache_page(page);
        flush_page_to_ram(page);
        set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, PAGE_COPY))));
+       page_add_rmap(page, pte);
        tsk->mm->rss++;
        spin_unlock(&tsk->mm->page_table_lock);
 
diff -Nru a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c        Fri Mar  1 18:19:44 2002
+++ b/fs/inode.c        Fri Mar  1 18:19:44 2002
@@ -725,8 +725,7 @@
        count = inodes_stat.nr_unused / priority;
 
        prune_icache(count);
-       kmem_cache_shrink(inode_cachep);
-       return 0;
+       return kmem_cache_shrink_nr(inode_cachep);
 }
 
 /*
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c       Fri Mar  1 18:19:44 2002
+++ b/fs/proc/proc_misc.c       Fri Mar  1 18:19:44 2002
@@ -36,6 +36,7 @@
 #include <linux/init.h>
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
+#include <linux/mm_inline.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -164,7 +165,9 @@
                "Cached:       %8lu kB\n"
                "SwapCached:   %8lu kB\n"
                "Active:       %8u kB\n"
-               "Inactive:     %8u kB\n"
+               "Inact_dirty:  %8u kB\n"
+               "Inact_clean:  %8u kB\n"
+               "Inact_target: %8lu kB\n"
                "HighTotal:    %8lu kB\n"
                "HighFree:     %8lu kB\n"
                "LowTotal:     %8lu kB\n"
@@ -178,7 +181,9 @@
                K(pg_size - swapper_space.nrpages),
                K(swapper_space.nrpages),
                K(nr_active_pages),
-               K(nr_inactive_pages),
+               K(nr_inactive_dirty_pages),
+               K(nr_inactive_clean_pages),
+               K(inactive_target()),
                K(i.totalhigh),
                K(i.freehigh),
                K(i.totalram-i.totalhigh),
diff -Nru a/include/asm-alpha/rmap.h b/include/asm-alpha/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-alpha/rmap.h  Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _ALPHA_RMAP_H
+#define _ALPHA_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-arm/proc-armv/rmap.h b/include/asm-arm/proc-armv/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-arm/proc-armv/rmap.h  Fri Mar  1 18:19:44 2002
@@ -0,0 +1,72 @@
+#ifndef _ARMV_RMAP_H
+#define _ARMV_RMAP_H
+/*
+ * linux/include/asm-arm/proc-armv/rmap.h
+ *
+ * Architecture dependant parts of the reverse mapping code,
+ *
+ * We use the struct page of the page table page to find a pointer
+ * to an array of two 'struct arm_rmap_info's, one for each of the
+ * two page tables in each page.
+ * 
+ * - rmi->mm points to the process' mm_struct
+ * - rmi->index has the high bits of the address
+ * - the lower bits of the address are calculated from the
+ *   offset of the page table entry within the page table page
+ */
+#include <linux/mm.h>
+
+struct arm_rmap_info {
+       struct mm_struct *mm;
+       unsigned long index;
+};
+
+static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, 
unsigned long address)
+{
+       struct page * page = virt_to_page(ptep);
+       struct arm_rmap_info *rmi = (void *)page->mapping;
+
+       if (((unsigned long)ptep)&2048)
+               rmi++;
+
+       rmi->mm = mm;
+       rmi->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_remove_rmap(pte_t * ptep)
+{
+       struct page * page = virt_to_page(ptep);
+       struct arm_rmap_info *rmi = (void *)page->mapping;
+
+       if (((unsigned long)ptep)&2048)
+               rmi++;
+
+       rmi->mm = NULL;
+       rmi->index = 0;
+}
+
+static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
+{
+       struct page * page = virt_to_page(ptep);
+       struct arm_rmap_info *rmi = (void *)page->mapping;
+
+       if (((unsigned long)ptep)&2048)
+               rmi++;
+
+       return rmi->mm;
+}
+
+static inline unsigned long ptep_to_address(pte_t * ptep)
+{
+       struct page * page = virt_to_page(ptep);
+       struct arm_rmap_info *rmi = (void *)page->mapping;
+       unsigned long low_bits;
+
+       if (((unsigned long)ptep)&2048)
+               rmi++;
+
+       low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+       return rmi->index + low_bits;
+}
+
+#endif /* _ARMV_RMAP_H */
diff -Nru a/include/asm-arm/rmap.h b/include/asm-arm/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-arm/rmap.h    Fri Mar  1 18:19:44 2002
@@ -0,0 +1,6 @@
+#ifndef _ARM_RMAP_H
+#define _ARM_RMAP_H
+
+#include <asm/proc/rmap.h>
+
+#endif /* _ARM_RMAP_H */
diff -Nru a/include/asm-cris/rmap.h b/include/asm-cris/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-cris/rmap.h   Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _CRIS_RMAP_H
+#define _CRIS_RMAP_H
+
+/* nothing to see, move along :) */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-generic/rmap.h b/include/asm-generic/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-generic/rmap.h        Fri Mar  1 18:19:44 2002
@@ -0,0 +1,57 @@
+#ifndef _GENERIC_RMAP_H
+#define _GENERIC_RMAP_H
+/*
+ * linux/include/asm-generic/rmap.h
+ *
+ * Architecture dependant parts of the reverse mapping code,
+ * this version should work for most architectures with a
+ * 'normal' page table layout.
+ *
+ * We use the struct page of the page table page to find out
+ * the process and full address of a page table entry:
+ * - page->mapping points to the process' mm_struct
+ * - page->index has the high bits of the address
+ * - the lower bits of the address are calculated from the
+ *   offset of the page table entry within the page table page
+ */
+#include <linux/mm.h>
+
+static inline void pgtable_add_rmap(pte_t * ptep, struct mm_struct * mm, 
unsigned long address)
+{
+       struct page * page = virt_to_page(ptep);
+#ifdef BROKEN_PPC_PTE_ALLOC_ONE
+       /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */
+       extern int mem_init_done;
+
+       if (!mem_init_done)
+               return;
+#endif
+       page->mapping = (void *)mm;
+       page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1);
+}
+
+static inline void pgtable_remove_rmap(pte_t * ptep)
+{
+       struct page * page = virt_to_page(ptep);
+
+       page->mapping = NULL;
+       page->index = 0;
+}
+
+static inline struct mm_struct * ptep_to_mm(pte_t * ptep)
+{
+       struct page * page = virt_to_page(ptep);
+
+       return (struct mm_struct *) page->mapping;
+}
+
+static inline unsigned long ptep_to_address(pte_t * ptep)
+{
+       struct page * page = virt_to_page(ptep);
+       unsigned long low_bits;
+
+       low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE;
+       return page->index + low_bits;
+}
+
+#endif /* _GENERIC_RMAP_H */
diff -Nru a/include/asm-i386/rmap.h b/include/asm-i386/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-i386/rmap.h   Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _I386_RMAP_H
+#define _I386_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-ia64/rmap.h b/include/asm-ia64/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-ia64/rmap.h   Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _IA64_RMAP_H
+#define _IA64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-m68k/rmap.h b/include/asm-m68k/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-m68k/rmap.h   Fri Mar  1 18:19:45 2002
@@ -0,0 +1,7 @@
+#ifndef _M86K_RMAP_H
+#define _M86K_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-mips/rmap.h b/include/asm-mips/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-mips/rmap.h   Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _MIPS_RMAP_H
+#define _MIPS_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-mips64/rmap.h b/include/asm-mips64/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-mips64/rmap.h Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _MIPS64_RMAP_H
+#define _MIPS64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-parisc/rmap.h b/include/asm-parisc/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-parisc/rmap.h Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _PARISC_RMAP_H
+#define _PARISC_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-ppc/rmap.h b/include/asm-ppc/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-ppc/rmap.h    Fri Mar  1 18:19:44 2002
@@ -0,0 +1,9 @@
+#ifndef _PPC_RMAP_H
+#define _PPC_RMAP_H
+
+/* PPC calls pte_alloc() before mem_map[] is setup ... */
+#define BROKEN_PPC_PTE_ALLOC_ONE
+
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-s390/rmap.h b/include/asm-s390/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-s390/rmap.h   Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _S390_RMAP_H
+#define _S390_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-s390x/rmap.h b/include/asm-s390x/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-s390x/rmap.h  Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _S390X_RMAP_H
+#define _S390X_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-sh/rmap.h b/include/asm-sh/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-sh/rmap.h     Fri Mar  1 18:19:45 2002
@@ -0,0 +1,7 @@
+#ifndef _SH_RMAP_H
+#define _SH_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-sparc/rmap.h b/include/asm-sparc/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-sparc/rmap.h  Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _SPARC_RMAP_H
+#define _SPARC_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/asm-sparc64/rmap.h b/include/asm-sparc64/rmap.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/asm-sparc64/rmap.h        Fri Mar  1 18:19:44 2002
@@ -0,0 +1,7 @@
+#ifndef _SPARC64_RMAP_H
+#define _SPARC64_RMAP_H
+
+/* nothing to see, move along */
+#include <asm-generic/rmap.h>
+
+#endif
diff -Nru a/include/linux/elevator.h b/include/linux/elevator.h
--- a/include/linux/elevator.h  Fri Mar  1 18:19:44 2002
+++ b/include/linux/elevator.h  Fri Mar  1 18:19:44 2002
@@ -1,12 +1,9 @@
 #ifndef _LINUX_ELEVATOR_H
 #define _LINUX_ELEVATOR_H
 
-typedef void (elevator_fn) (struct request *, elevator_t *,
-                           struct list_head *,
-                           struct list_head *, int);
-
-typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct 
list_head *,
-                                struct buffer_head *, int, int);
+typedef int (elevator_merge_fn)(request_queue_t *, struct request **,
+                               struct list_head *, struct buffer_head *bh,
+                               int rw, int max_sectors);
 
 typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, 
int);
 
@@ -16,6 +13,7 @@
 {
        int read_latency;
        int write_latency;
+       int max_bomb_segments;
 
        elevator_merge_fn *elevator_merge_fn;
        elevator_merge_cleanup_fn *elevator_merge_cleanup_fn;
@@ -24,13 +22,13 @@
        unsigned int queue_ID;
 };
 
-int elevator_noop_merge(request_queue_t *, struct request **, struct list_head 
*, struct buffer_head *, int, int);
-void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int);
-void elevator_noop_merge_req(struct request *, struct request *);
-
-int elevator_linus_merge(request_queue_t *, struct request **, struct 
list_head *, struct buffer_head *, int, int);
-void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
-void elevator_linus_merge_req(struct request *, struct request *);
+elevator_merge_fn              elevator_noop_merge;
+elevator_merge_cleanup_fn      elevator_noop_merge_cleanup;
+elevator_merge_req_fn          elevator_noop_merge_req;
+
+elevator_merge_fn              elevator_linus_merge;
+elevator_merge_cleanup_fn      elevator_linus_merge_cleanup;
+elevator_merge_req_fn          elevator_linus_merge_req;
 
 typedef struct blkelv_ioctl_arg_s {
        int queue_ID;
@@ -54,22 +52,6 @@
 #define ELEVATOR_FRONT_MERGE   1
 #define ELEVATOR_BACK_MERGE    2
 
-/*
- * This is used in the elevator algorithm.  We don't prioritise reads
- * over writes any more --- although reads are more time-critical than
- * writes, by treating them equally we increase filesystem throughput.
- * This turns out to give better overall performance.  -- sct
- */
-#define IN_ORDER(s1,s2)                                \
-       ((((s1)->rq_dev == (s2)->rq_dev &&      \
-          (s1)->sector < (s2)->sector)) ||     \
-        (s1)->rq_dev < (s2)->rq_dev)
-
-#define BHRQ_IN_ORDER(bh, rq)                  \
-       ((((bh)->b_rdev == (rq)->rq_dev &&      \
-          (bh)->b_rsector < (rq)->sector)) ||  \
-        (bh)->b_rdev < (rq)->rq_dev)
-
 static inline int elevator_request_latency(elevator_t * elevator, int rw)
 {
        int latency;
@@ -85,7 +67,7 @@
 ((elevator_t) {                                                                
\
        0,                              /* read_latency */              \
        0,                              /* write_latency */             \
-                                                                       \
+       0,                              /* max_bomb_segments */         \
        elevator_noop_merge,            /* elevator_merge_fn */         \
        elevator_noop_merge_cleanup,    /* elevator_merge_cleanup_fn */ \
        elevator_noop_merge_req,        /* elevator_merge_req_fn */     \
@@ -95,7 +77,7 @@
 ((elevator_t) {                                                                
\
        8192,                           /* read passovers */            \
        16384,                          /* write passovers */           \
-                                                                       \
+       6,                              /* max_bomb_segments */         \
        elevator_linus_merge,           /* elevator_merge_fn */         \
        elevator_linus_merge_cleanup,   /* elevator_merge_cleanup_fn */ \
        elevator_linus_merge_req,       /* elevator_merge_req_fn */     \
diff -Nru a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h        Fri Mar  1 18:19:44 2002
+++ b/include/linux/fs.h        Fri Mar  1 18:19:44 2002
@@ -284,7 +284,7 @@
 
 extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned 
long offset);
 
-#define touch_buffer(bh)       mark_page_accessed(bh->b_page)
+#define touch_buffer(bh)       touch_page(bh->b_page)
 
 
 #include <linux/pipe_fs_i.h>
diff -Nru a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h        Fri Mar  1 18:19:44 2002
+++ b/include/linux/mm.h        Fri Mar  1 18:19:44 2002
@@ -17,9 +17,6 @@
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-/* The inactive_clean lists are per zone. */
-extern struct list_head active_list;
-extern struct list_head inactive_list;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -133,6 +130,9 @@
        struct page * (*nopage)(struct vm_area_struct * area, unsigned long 
address, int unused);
 };
 
+/* forward declaration; pte_chain is meant to be internal to rmap.c */
+struct pte_chain;
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -159,6 +159,8 @@
                                           updated asynchronously */
        struct list_head lru;           /* Pageout list, eg. active_list;
                                           protected by pagemap_lru_lock !! */
+       unsigned char age;              /* Page aging counter. */
+       struct pte_chain * pte_chain;   /* Reverse pte mapping pointer. */
        struct page **pprev_hash;       /* Complement to *next_hash. */
        struct buffer_head * buffers;   /* Buffer maps us to a disk block. */
 
@@ -286,9 +288,9 @@
 #define PG_referenced           2
 #define PG_uptodate             3
 #define PG_dirty                4
-#define PG_unused               5
-#define PG_lru                  6
-#define PG_active               7
+#define PG_inactive_clean       5
+#define PG_active               6
+#define PG_inactive_dirty       7
 #define PG_slab                         8
 #define PG_skip                        10
 #define PG_highmem             11
@@ -391,10 +393,19 @@
 #define PageActive(page)       test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)    set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)  clear_bit(PG_active, &(page)->flags)
+#define TestandSetPageActive(page)     test_and_set_bit(PG_active, 
&(page)->flags)
+#define TestandClearPageActive(page)   test_and_clear_bit(PG_active, 
&(page)->flags)
+
+#define PageInactiveDirty(page)        test_bit(PG_inactive_dirty, 
&(page)->flags)
+#define SetPageInactiveDirty(page)     set_bit(PG_inactive_dirty, 
&(page)->flags)
+#define ClearPageInactiveDirty(page)   clear_bit(PG_inactive_dirty, 
&(page)->flags)
+
+#define PageInactiveClean(page)        test_bit(PG_inactive_clean, 
&(page)->flags)
+#define SetPageInactiveClean(page)     set_bit(PG_inactive_clean, 
&(page)->flags)
+#define ClearPageInactiveClean(page)   clear_bit(PG_inactive_clean, 
&(page)->flags)
 
-#define PageLRU(page)          test_bit(PG_lru, &(page)->flags)
-#define TestSetPageLRU(page)   test_and_set_bit(PG_lru, &(page)->flags)
-#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags)
+#define PageLRU(pp) \
+       (PageActive(pp) | PageInactiveDirty(pp) | PageInactiveClean(pp))
 
 #ifdef CONFIG_HIGHMEM
 #define PageHighMem(page)              test_bit(PG_highmem, &(page)->flags)
@@ -459,6 +470,7 @@
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
 
+extern void FASTCALL(fixup_freespace(struct zone_struct *, int));
 extern void show_free_areas(void);
 extern void show_free_areas_node(pg_data_t *pgdat);
 
diff -Nru a/include/linux/mm_inline.h b/include/linux/mm_inline.h
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/include/linux/mm_inline.h Fri Mar  1 18:19:44 2002
@@ -0,0 +1,294 @@
+#ifndef _LINUX_VM_INLINE_H
+#define _LINUX_VM_INLINE_H
+
+#include <linux/mm.h>
+
+/*
+ * These inline functions tend to need bits and pieces of all the
+ * other VM include files, meaning they cannot be defined inside
+ * one of the other VM include files.
+ *
+ * The include file mess really needs to be cleaned up...
+ */
+
+static inline void add_page_to_active_list(struct page * page)
+{
+       struct zone_struct * zone = page_zone(page);
+       DEBUG_LRU_PAGE(page);
+       SetPageActive(page);
+       list_add(&page->lru, &zone->active_list);
+       zone->active_pages++;
+       nr_active_pages++;
+}
+
+static inline void add_page_to_inactive_dirty_list(struct page * page)
+{
+       struct zone_struct * zone = page_zone(page);
+       DEBUG_LRU_PAGE(page);
+       SetPageInactiveDirty(page);
+       list_add(&page->lru, &zone->inactive_dirty_list);
+       zone->inactive_dirty_pages++;
+       nr_inactive_dirty_pages++;
+}
+
+static inline void add_page_to_inactive_clean_list(struct page * page)
+{
+       struct zone_struct * zone = page_zone(page);
+       DEBUG_LRU_PAGE(page);
+       SetPageInactiveClean(page);
+       list_add(&page->lru, &zone->inactive_clean_list);
+       zone->inactive_clean_pages++;
+       nr_inactive_clean_pages++;
+}
+
+static inline void del_page_from_active_list(struct page * page)
+{
+       struct zone_struct * zone = page_zone(page);
+       list_del(&page->lru);
+       ClearPageActive(page);
+       nr_active_pages--;
+       zone->active_pages--;
+       DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_dirty_list(struct page * page)
+{
+       struct zone_struct * zone = page_zone(page);
+       list_del(&page->lru);
+       ClearPageInactiveDirty(page);
+       nr_inactive_dirty_pages--;
+       zone->inactive_dirty_pages--;
+       DEBUG_LRU_PAGE(page);
+}
+
+static inline void del_page_from_inactive_clean_list(struct page * page)
+{
+       struct zone_struct * zone = page_zone(page);
+       list_del(&page->lru);
+       ClearPageInactiveClean(page);
+       zone->inactive_clean_pages--;
+       nr_inactive_clean_pages--;
+       DEBUG_LRU_PAGE(page);
+}
+
+/*
+ * Inline functions to control some balancing in the VM.
+ *
+ * Note that we do both global and per-zone balancing, with
+ * most of the balancing done globally.
+ */
+#define        PLENTY_FACTOR   2
+#define        ALL_ZONES       NULL
+#define        ANY_ZONE        (struct zone_struct *)(~0UL)
+#define INACTIVE_FACTOR        5
+
+#define        VM_MIN  0
+#define        VM_LOW  1
+#define        VM_HIGH 2
+#define VM_PLENTY 3
+static inline int zone_free_limit(struct zone_struct * zone, int limit)
+{
+       int free, target, delta;
+
+       /* This is really nasty, but GCC should completely optimise it away. */
+       if (limit == VM_MIN)
+               target = zone->pages_min;
+       else if (limit == VM_LOW)
+               target = zone->pages_low;
+       else if (limit == VM_HIGH)
+               target = zone->pages_high;
+       else
+               target = zone->pages_high * PLENTY_FACTOR;
+
+       free = zone->free_pages + zone->inactive_clean_pages;
+       delta = target - free;
+
+       return delta;
+}
+
+static inline int free_limit(struct zone_struct * zone, int limit)
+{
+       int shortage = 0, local;
+
+       if (zone == ALL_ZONES) {
+               for_each_zone(zone)
+                       shortage += zone_free_limit(zone, limit);
+       } else if (zone == ANY_ZONE) {
+               for_each_zone(zone) {
+                       local = zone_free_limit(zone, limit);
+                       shortage += max(local, 0);
+               }
+       } else {
+               shortage = zone_free_limit(zone, limit);
+       }
+
+       return shortage;
+}
+
+/**
+ * free_min - test for critically low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a serious shortage of free and
+ * clean pages, zero or negative if there is no serious shortage.
+ */
+static inline int free_min(struct zone_struct * zone)
+{
+       return free_limit(zone, VM_MIN);
+}
+
+/**
+ * free_low - test for low amount of free pages
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if we have a shortage of free and
+ * clean pages, zero or negative if there is no shortage.
+ */
+static inline int free_low(struct zone_struct * zone)
+{
+       return free_limit(zone, VM_LOW);
+}
+
+/**
+ * free_high - test if amount of free pages is less than ideal
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free and clean
+ * pages is below kswapd's target, zero or negative if we
+ * have more than enough free and clean pages.
+ */
+static inline int free_high(struct zone_struct * zone)
+{
+       return free_limit(zone, VM_HIGH);
+}
+
+/**
+ * free_plenty - test if enough pages are freed
+ * @zone: zone to test, ALL_ZONES to test memory globally
+ *
+ * Returns a positive value if the number of free + clean pages
+ * in a zone is not yet excessive and kswapd is still allowed to
+ * free pages here, a negative value if kswapd should leave the
+ * zone alone.
+ */
+static inline int free_plenty(struct zone_struct * zone)
+{
+       return free_limit(zone, VM_PLENTY);
+}
+
+/*
+ * The inactive page target is the free target + 20% of (active + inactive)
+ * pages. 
+ */
+static inline int zone_inactive_limit(struct zone_struct * zone, int limit)
+{
+       int inactive, target, inactive_base;
+
+       inactive_base = zone->active_pages + zone->inactive_dirty_pages;
+       inactive_base /= INACTIVE_FACTOR;
+
+       /* GCC should optimise this away completely. */
+       if (limit == VM_MIN)
+               target = zone->pages_high + inactive_base / 2;
+       else if (limit == VM_LOW)
+               target = zone->pages_high + inactive_base;
+       else
+               target = zone->pages_high + inactive_base * 2;
+
+       inactive = zone->free_pages + zone->inactive_clean_pages;
+       inactive += zone->inactive_dirty_pages;
+
+       return target - inactive;
+}
+
+static inline int inactive_limit(struct zone_struct * zone, int limit)
+{
+       int shortage = 0, local;
+
+       if (zone == ALL_ZONES) {
+               for_each_zone(zone)
+                       shortage += zone_inactive_limit(zone, limit);
+       } else if (zone == ANY_ZONE) {
+               for_each_zone(zone) {
+                       local = zone_inactive_limit(zone, limit);
+                       shortage += max(local, 0);
+               }
+       } else {
+               shortage = zone_inactive_limit(zone, limit);
+       }
+
+       return shortage;
+}
+
+/**
+ * inactive_min - test for serious shortage of (free + inactive clean) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no serious shortage of (free + inactive clean) pages
+ */
+static inline int inactive_min(struct zone_struct * zone)
+{
+       return inactive_limit(zone, VM_MIN);
+}
+
+/**
+ * inactive_low - test for shortage of (free + inactive clean) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have no shortage of (free + inactive clean) pages
+ */
+static inline int inactive_low(struct zone_struct * zone)
+{
+       return inactive_limit(zone, VM_LOW);
+}
+
+/**
+ * inactive_high - less than ideal amount of (free + inactive) pages
+ * @zone: zone to test, ALL_ZONES for global testing
+ *
+ * Returns the shortage as a positive number, a negative number
+ * if we have more than enough (free + inactive) pages
+ */
+static inline int inactive_high(struct zone_struct * zone)
+{
+       return inactive_limit(zone, VM_HIGH);
+}
+
+/*
+ * inactive_target - number of inactive pages we ought to have.
+ */
+static inline int inactive_target(void)
+{
+       int target;
+
+       target = nr_active_pages + nr_inactive_dirty_pages
+                       + nr_inactive_clean_pages;
+
+       target /= INACTIVE_FACTOR;
+
+       return target;
+}
+
+/*
+ * Called whenever the VM references a page. We immediately reclaim
+ * the inactive clean pages because those are counted as freeable.
+ * We don't modify the inactive dirty ones because we're never sure
+ * if those are freeable anyway.
+ */
+static inline void touch_page(struct page * page)
+{
+       if (PageInactiveClean(page)) {
+               struct zone_struct * zone = page_zone(page);
+               int free = zone->free_pages + zone->inactive_clean_pages;
+               activate_page(page);
+               if (free < zone->pages_low)
+                       wakeup_kswapd(GFP_NOIO);
+               if (zone->free_pages < zone->pages_min)
+                       fixup_freespace(zone, 1);
+       } else
+               SetPageReferenced(page);
+}
+
+#endif
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h    Fri Mar  1 18:19:44 2002
+++ b/include/linux/mmzone.h    Fri Mar  1 18:19:44 2002
@@ -40,12 +40,18 @@
         */
        spinlock_t              lock;
        unsigned long           free_pages;
-       unsigned long           pages_min, pages_low, pages_high;
+       unsigned long           active_pages;
+       unsigned long           inactive_dirty_pages;
+       unsigned long           inactive_clean_pages;
+       unsigned long           pages_min, pages_low, pages_high, pages_plenty;
        int                     need_balance;
 
        /*
         * free areas of different sizes
         */
+       struct list_head        active_list;
+       struct list_head        inactive_dirty_list;
+       struct list_head        inactive_clean_list;
        free_area_t             free_area[MAX_ORDER];
 
        /*
@@ -143,9 +149,6 @@
 extern int numnodes;
 extern pg_data_t *pgdat_list;
 
-#define memclass(pgzone, classzone)    (((pgzone)->zone_pgdat == 
(classzone)->zone_pgdat) \
-                       && ((pgzone) <= (classzone)))
-
 /*
  * The following two are not meant for general usage. They are here as
  * prototypes for the discontig memory code.
@@ -157,6 +160,60 @@
   struct page *pmap);
 
 extern pg_data_t contig_page_data;
+
+/**
+ * for_each_pgdat - helper macro to iterate over all nodes
+ * @pgdat - pg_data_t * variable
+ *
+ * Meant to help with common loops of the form
+ * pgdat = pgdat_list;
+ * while(pgdat) {
+ *     ...
+ *     pgdat = pgdat->node_next;
+ * }
+ */
+#define for_each_pgdat(pgdat) \
+               for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ * Thanks to William Lee Irwin III for this piece of ingenuity.
+ */
+static inline zone_t *next_zone(zone_t *zone)
+{
+       pg_data_t *pgdat = zone->zone_pgdat;
+
+       if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
+               zone++;
+
+       else if (pgdat->node_next) {
+               pgdat = pgdat->node_next;
+               zone = pgdat->node_zones;
+       } else
+               zone = NULL;
+
+       return zone;
+}
+
+/**
+ * for_each_zone - helper macro to iterate over all memory zones
+ * @zone - zone_t * variable
+ *
+ * The user only needs to declare the zone variable, for_each_zone
+ * fills it in. This basically means for_each_zone() is an
+ * easier to read version of this piece of code:
+ *
+ * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next)
+ *     for(i = 0; i < MAX_NR_ZONES; ++i) {
+ *             zone_t * z = pgdat->node_zones + i;
+ *             ...
+ *     }
+ * }
+ */
+#define for_each_zone(zone) \
+       for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
+
 
 #ifndef CONFIG_DISCONTIGMEM
 
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h     Fri Mar  1 18:19:44 2002
+++ b/include/linux/sched.h     Fri Mar  1 18:19:44 2002
@@ -225,7 +225,7 @@
        unsigned long rss, total_vm, locked_vm;
        unsigned long def_flags;
        unsigned long cpu_vm_mask;
-       unsigned long swap_address;
+       unsigned long rlimit_rss;
 
        unsigned dumpable:1;
 
@@ -244,6 +244,7 @@
        mmap_sem:       __RWSEM_INITIALIZER(name.mmap_sem), \
        page_table_lock: SPIN_LOCK_UNLOCKED,            \
        mmlist:         LIST_HEAD_INIT(name.mmlist),    \
+       rlimit_rss:     RLIM_INFINITY,                  \
 }
 
 struct signal_struct {
@@ -325,8 +326,6 @@
 
        struct task_struct *next_task, *prev_task;
        struct mm_struct *active_mm;
-       struct list_head local_pages;
-       unsigned int allocation_order, nr_local_pages;
 
 /* task state */
        struct linux_binfmt *binfmt;
diff -Nru a/include/linux/slab.h b/include/linux/slab.h
--- a/include/linux/slab.h      Fri Mar  1 18:19:44 2002
+++ b/include/linux/slab.h      Fri Mar  1 18:19:44 2002
@@ -55,6 +55,7 @@
                                       void (*)(void *, kmem_cache_t *, 
unsigned long));
 extern int kmem_cache_destroy(kmem_cache_t *);
 extern int kmem_cache_shrink(kmem_cache_t *);
+extern int kmem_cache_shrink_nr(kmem_cache_t *);
 extern void *kmem_cache_alloc(kmem_cache_t *, int);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 
diff -Nru a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h      Fri Mar  1 18:19:44 2002
+++ b/include/linux/swap.h      Fri Mar  1 18:19:44 2002
@@ -86,8 +86,8 @@
 extern unsigned int nr_free_pages(void);
 extern unsigned int nr_free_buffer_pages(void);
 extern int nr_active_pages;
-extern int nr_inactive_pages;
-extern atomic_t nr_async_pages;
+extern int nr_inactive_dirty_pages;
+extern int nr_inactive_clean_pages;
 extern atomic_t page_cache_size;
 extern atomic_t buffermem_pages;
 extern spinlock_t pagecache_lock;
@@ -100,18 +100,39 @@
 
 struct zone_t;
 
+/* linux/mm/rmap.c */
+extern int FASTCALL(page_referenced(struct page *));
+extern void FASTCALL(page_add_rmap(struct page *, pte_t *));
+extern void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+extern int FASTCALL(try_to_unmap(struct page *));
+extern int FASTCALL(page_over_rsslimit(struct page *));
+
+/* return values of try_to_unmap */
+#define        SWAP_SUCCESS    0
+#define        SWAP_AGAIN      1
+#define        SWAP_FAIL       2
+#define        SWAP_ERROR      3
+
 /* linux/mm/swap.c */
+extern int total_swap_pages;
 extern void FASTCALL(lru_cache_add(struct page *));
 extern void FASTCALL(__lru_cache_del(struct page *));
 extern void FASTCALL(lru_cache_del(struct page *));
 
 extern void FASTCALL(activate_page(struct page *));
+extern void FASTCALL(activate_page_nolock(struct page *));
+extern void FASTCALL(deactivate_page(struct page *));
+extern void FASTCALL(deactivate_page_nolock(struct page *));
+extern void FASTCALL(drop_page(struct page *));
 
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
+extern struct page * FASTCALL(reclaim_page(zone_t *));
 extern wait_queue_head_t kswapd_wait;
-extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
+extern int FASTCALL(try_to_free_pages(unsigned int gfp_mask));
+extern void wakeup_kswapd(unsigned int);
+extern void rss_free_pages(unsigned int);
 
 /* linux/mm/page_io.c */
 extern void rw_swap_page(int, struct page *);
@@ -125,6 +146,7 @@
 extern void show_swap_cache_info(void);
 #endif
 extern int add_to_swap_cache(struct page *, swp_entry_t);
+extern int add_to_swap(struct page *);
 extern void __delete_from_swap_cache(struct page *page);
 extern void delete_from_swap_cache(struct page *page);
 extern void free_page_and_swap_cache(struct page *page);
@@ -158,7 +180,14 @@
 
 extern spinlock_t pagemap_lru_lock;
 
-extern void FASTCALL(mark_page_accessed(struct page *));
+/*
+ * Page aging defines. These seem to work great in FreeBSD,
+ * no need to reinvent the wheel.
+ */
+#define PAGE_AGE_START 5
+#define PAGE_AGE_ADV 3
+#define PAGE_AGE_DECL 1
+#define PAGE_AGE_MAX 64
 
 /*
  * List add/del helper macros. These must be called
@@ -166,38 +195,12 @@
  */
 #define DEBUG_LRU_PAGE(page)                   \
 do {                                           \
-       if (!PageLRU(page))                     \
-               BUG();                          \
        if (PageActive(page))                   \
                BUG();                          \
-} while (0)
-
-#define add_page_to_active_list(page)          \
-do {                                           \
-       DEBUG_LRU_PAGE(page);                   \
-       SetPageActive(page);                    \
-       list_add(&(page)->lru, &active_list);   \
-       nr_active_pages++;                      \
-} while (0)
-
-#define add_page_to_inactive_list(page)                \
-do {                                           \
-       DEBUG_LRU_PAGE(page);                   \
-       list_add(&(page)->lru, &inactive_list); \
-       nr_inactive_pages++;                    \
-} while (0)
-
-#define del_page_from_active_list(page)                \
-do {                                           \
-       list_del(&(page)->lru);                 \
-       ClearPageActive(page);                  \
-       nr_active_pages--;                      \
-} while (0)
-
-#define del_page_from_inactive_list(page)      \
-do {                                           \
-       list_del(&(page)->lru);                 \
-       nr_inactive_pages--;                    \
+       if (PageInactiveDirty(page))            \
+               BUG();                          \
+       if (PageInactiveClean(page))            \
+               BUG();                          \
 } while (0)
 
 extern spinlock_t swaplock;
diff -Nru a/include/linux/swapctl.h b/include/linux/swapctl.h
--- a/include/linux/swapctl.h   Fri Mar  1 18:19:44 2002
+++ b/include/linux/swapctl.h   Fri Mar  1 18:19:44 2002
@@ -10,4 +10,13 @@
 typedef pager_daemon_v1 pager_daemon_t;
 extern pager_daemon_t pager_daemon;
 
+typedef struct freepages_v1
+{
+       unsigned int    min;
+       unsigned int    low;
+       unsigned int    high;
+} freepages_v1;
+typedef freepages_v1 freepages_t;
+extern freepages_t freepages;
+
 #endif /* _LINUX_SWAPCTL_H */
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c     Fri Mar  1 18:19:44 2002
+++ b/kernel/fork.c     Fri Mar  1 18:19:44 2002
@@ -139,7 +139,6 @@
        mm->map_count = 0;
        mm->rss = 0;
        mm->cpu_vm_mask = 0;
-       mm->swap_address = 0;
        pprev = &mm->mmap;
 
        /*
@@ -263,9 +262,6 @@
 void mmput(struct mm_struct *mm)
 {
        if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
-               extern struct mm_struct *swap_mm;
-               if (swap_mm == mm)
-                       swap_mm = list_entry(mm->mmlist.next, struct mm_struct, 
mmlist);
                list_del(&mm->mmlist);
                mmlist_nr--;
                spin_unlock(&mmlist_lock);
@@ -658,8 +654,6 @@
 #endif
        p->lock_depth = -1;             /* -1 = no lock */
        p->start_time = jiffies;
-
-       INIT_LIST_HEAD(&p->local_pages);
 
        retval = -ENOMEM;
        /* copy all the process information */
diff -Nru a/kernel/sys.c b/kernel/sys.c
--- a/kernel/sys.c      Fri Mar  1 18:19:44 2002
+++ b/kernel/sys.c      Fri Mar  1 18:19:44 2002
@@ -1128,6 +1128,12 @@
        if (resource == RLIMIT_NOFILE) {
                if (new_rlim.rlim_cur > NR_OPEN || new_rlim.rlim_max > NR_OPEN)
                        return -EPERM;
+       } else if (resource == RLIMIT_RSS && current->mm) {
+               /* rlimit is specified in bytes, convert to pages */
+               unsigned long pages = RLIM_INFINITY;
+               if (new_rlim.rlim_cur != RLIM_INFINITY)
+                       pages = new_rlim.rlim_cur >> PAGE_SHIFT;
+               current->mm->rlimit_rss = pages;
        }
        *old_rlim = new_rlim;
        return 0;
diff -Nru a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c   Fri Mar  1 18:19:44 2002
+++ b/kernel/sysctl.c   Fri Mar  1 18:19:44 2002
@@ -260,6 +260,8 @@
 };
 
 static ctl_table vm_table[] = {
+       {VM_FREEPG, "freepages",
+        &freepages, sizeof(freepages_t), 0644, NULL, &proc_dointvec},
        {VM_BDFLUSH, "bdflush", &bdf_prm, 9*sizeof(int), 0644, NULL,
         &proc_dointvec_minmax, &sysctl_intvec, NULL,
         &bdflush_min, &bdflush_max},
diff -Nru a/mm/Makefile b/mm/Makefile
--- a/mm/Makefile       Fri Mar  1 18:19:44 2002
+++ b/mm/Makefile       Fri Mar  1 18:19:44 2002
@@ -14,7 +14,7 @@
 obj-y   := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
            vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
            page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
-           shmem.o
+           shmem.o rmap.o
 
 obj-$(CONFIG_HIGHMEM) += highmem.o
 
diff -Nru a/mm/TODO b/mm/TODO
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/mm/TODO   Fri Mar  1 18:19:44 2002
@@ -0,0 +1,35 @@
+               VM TODO list
+
+Forever valid TODO entries:
+  - keep up with the official kernel
+  - port over bugfixes
+  - minimise the diff by keeping code in sync where possible
+
+Easy short-term features:
+  - reclaim swap space from refill_inactive()
+  - simplify SMP locking 
+  - replace foo()/foo_pgd()/foo_pmd()/foo_pte() stuff with
+    one single function using a for_each_pte() macro
+       for_each_pte(ptep, mm, start_address, end_address)
+  - fix page_launder() to not eat horrible amounts of CPU or flush
+    all pages to disk at once
+  - better VM balancing, clean vs. dirty ratio
+  - fix loopback device deadlock
+    <akpm> riel: nr_fract=70%, nr_fract_sync=80%
+    <akpm> riel: setup a loopback fs ext2-on-ext2
+    <akpm> riel: boot with mem=64m
+    <akpm> riel: then write a 500 meg file.
+    <akpm> riel: current kernel livelocks.
+  - stabilise pte_highmem and integrate it with rmap
+
+Long-term features:
+  - extensive VM statistics
+  - IO clustering for page_launder() and sync_old_buffers()
+  - readahead on per-VMA level (+ drop behind?)
+  - more graceful degradation when the load gets high
+     - reducing readahead
+     - unfair pageout so not all apps fall over
+  - memory objects, using pagecache and tmpfs for storage so
+    the memory object itself doesn't introduce any new overhead
+  - using the memory objects, removing page table copying from fork()
+  - load control able to deal with really extreme loads, swapping
diff -Nru a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c      Fri Mar  1 18:19:44 2002
+++ b/mm/bootmem.c      Fri Mar  1 18:19:44 2002
@@ -326,12 +326,11 @@
        pg_data_t *pgdat = pgdat_list;
        void *ptr;
 
-       while (pgdat) {
+       for_each_pgdat(pgdat)
                if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
                                                align, goal)))
                        return(ptr);
-               pgdat = pgdat->node_next;
-       }
+
        /*
         * Whoops, we cannot satisfy the allocation request.
         */
diff -Nru a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c      Fri Mar  1 18:19:44 2002
+++ b/mm/filemap.c      Fri Mar  1 18:19:44 2002
@@ -22,6 +22,7 @@
 #include <linux/swapctl.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/mm_inline.h>
 #include <linux/iobuf.h>
 #include <linux/compiler.h>
 
@@ -234,7 +235,7 @@
 static void truncate_complete_page(struct page *page)
 {
        /* Leave it on the LRU if it gets converted into anonymous buffers */
-       if (!page->buffers || do_flushpage(page, 0))
+       if (!page->pte_chain && (!page->buffers || do_flushpage(page, 0)))
                lru_cache_del(page);
 
        /*
@@ -454,6 +455,11 @@
        return page;
 }
 
+static struct page * __find_page(struct address_space * mapping, unsigned long 
index)
+{
+       return __find_page_nolock(mapping, index, *page_hash(mapping,index));
+}
+
 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, 
unsigned long end, int (*fn)(struct page *))
 {
        struct list_head *curr;
@@ -1016,7 +1022,53 @@
 
 
 /*
- * Same as grab_cache_page, but do not wait if the page is unavailable.
+ * We combine this with read-ahead to deactivate pages when we
+ * think there's sequential IO going on. Note that this is
+ * harmless since we don't actually evict the pages from memory
+ * but just move them to the inactive list.
+ *
+ * TODO:
+ * - make the readahead code smarter
+ * - move readahead to the VMA level so we can do the same
+ *   trick with mmap()
+ *
+ * Rik van Riel, 2000
+ */
+static void drop_behind(struct file * file, unsigned long index)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct address_space *mapping = inode->i_mapping;
+       struct page *page;
+       unsigned long start;
+
+       /* Nothing to drop-behind if we're on the first page. */
+       if (!index)
+               return;
+
+       if (index > file->f_rawin)
+               start = index - file->f_rawin;
+       else
+               start = 0;
+
+       /*
+        * Go backwards from index-1 and drop all pages in the
+        * readahead window. Since the readahead window may have
+        * been increased since the last time we were called, we
+        * stop when the page isn't there.
+        */
+       spin_lock(&pagemap_lru_lock);
+       while (--index >= start) {
+               spin_lock(&pagecache_lock);
+               page = __find_page(mapping, index);
+               spin_unlock(&pagecache_lock);
+               if (!page || !PageActive(page))
+                       break;
+               drop_page(page);
+       }
+       spin_unlock(&pagemap_lru_lock);
+}
+
+/* Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
  * be safe to call while holding the lock for another page.
@@ -1286,6 +1338,12 @@
                if (filp->f_ramax > max_readahead)
                        filp->f_ramax = max_readahead;
 
+               /*
+                * Move the pages that have already been passed
+                * to the inactive list.
+                */
+               drop_behind(filp, index);
+
 #ifdef PROFILE_READAHEAD
                profile_readahead((reada_ok == 2), filp);
 #endif
@@ -1294,25 +1352,6 @@
        return;
 }
 
-/*
- * Mark a page as having seen activity.
- *
- * If it was already so marked, move it
- * to the active queue and drop the referenced
- * bit. Otherwise, just mark it for future
- * action..
- */
-void mark_page_accessed(struct page *page)
-{
-       if (!PageActive(page) && PageReferenced(page)) {
-               activate_page(page);
-               ClearPageReferenced(page);
-               return;
-       }
-
-       /* Mark the page referenced, AFTER checking for previous usage.. */
-       SetPageReferenced(page);
-}
 
 /*
  * This is a generic file read routine, and uses the
@@ -1421,7 +1460,7 @@
                 * beginning or we just did an lseek.
                 */
                if (!offset || !filp->f_reada)
-                       mark_page_accessed(page);
+                       touch_page(page);
 
                /*
                 * Ok, we have the page, and it's up-to-date, so
@@ -1822,7 +1861,7 @@
                nr = max;
 
        /* And limit it to a sane percentage of the inactive list.. */
-       max = nr_inactive_pages / 2;
+       max = nr_inactive_clean_pages / 2;
        if (nr > max)
                nr = max;
 
@@ -1967,7 +2006,7 @@
         * Found the page and have a reference on it, need to check sharing
         * and possibly copy it over to another page..
         */
-       mark_page_accessed(page);
+       touch_page(page);
        flush_page_to_ram(page);
        return page;
 
@@ -2840,7 +2879,7 @@
        page = __read_cache_page(mapping, index, filler, data);
        if (IS_ERR(page))
                goto out;
-       mark_page_accessed(page);
+       touch_page(page);
        if (Page_Uptodate(page))
                goto out;
 
@@ -3037,6 +3076,7 @@
                unsigned long index, offset;
                long page_fault;
                char *kaddr;
+               int deactivate = 1;
 
                /*
                 * Try to find the page in the cache. If it isn't there,
@@ -3045,8 +3085,10 @@
                offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
                index = pos >> PAGE_CACHE_SHIFT;
                bytes = PAGE_CACHE_SIZE - offset;
-               if (bytes > count)
+               if (bytes > count) {
                        bytes = count;
+                       deactivate = 0;
+               }
 
                /*
                 * Bring in the user page that we will copy from _first_.
@@ -3090,8 +3132,11 @@
 unlock:
                kunmap(page);
                /* Mark it unlocked again and drop the page.. */
-               SetPageReferenced(page);
                UnlockPage(page);
+               if (deactivate)
+                       deactivate_page(page);
+               else
+                       touch_page(page);
                page_cache_release(page);
 
                if (status < 0)
diff -Nru a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c       Fri Mar  1 18:19:44 2002
+++ b/mm/memory.c       Fri Mar  1 18:19:44 2002
@@ -45,8 +45,10 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/module.h>
+#include <linux/mm_inline.h>
 
 #include <asm/pgalloc.h>
+#include <asm/rmap.h>
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
 
@@ -102,6 +104,7 @@
        }
        pte = pte_offset(dir, 0);
        pmd_clear(dir);
+       pgtable_remove_rmap(pte);
        pte_free(pte);
 }
 
@@ -236,9 +239,11 @@
 
                                if (pte_none(pte))
                                        goto cont_copy_pte_range_noset;
+                               /* pte contains position in swap, so copy. */
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_to_swp_entry(pte));
-                                       goto cont_copy_pte_range;
+                                       set_pte(dst_pte, pte);
+                                       goto cont_copy_pte_range_noset;
                                }
                                ptepage = pte_page(pte);
                                if ((!VALID_PAGE(ptepage)) || 
@@ -246,7 +251,7 @@
                                        goto cont_copy_pte_range;
 
                                /* If it's a COW mapping, write protect it both 
in the parent and the child */
-                               if (cow && pte_write(pte)) {
+                               if (cow) {
                                        ptep_set_wrprotect(src_pte);
                                        pte = *src_pte;
                                }
@@ -259,6 +264,7 @@
                                dst->rss++;
 
 cont_copy_pte_range:           set_pte(dst_pte, pte);
+                               page_add_rmap(ptepage, dst_pte);
 cont_copy_pte_range_noset:     address += PAGE_SIZE;
                                if (address >= end)
                                        goto out_unlock;
@@ -314,8 +320,10 @@
                        continue;
                if (pte_present(pte)) {
                        struct page *page = pte_page(pte);
-                       if (VALID_PAGE(page) && !PageReserved(page))
+                       if (VALID_PAGE(page) && !PageReserved(page)) {
                                freed ++;
+                               page_remove_rmap(page, ptep);
+                       }
                        /* This will eventually call __free_pte on the pte. */
                        tlb_remove_page(tlb, ptep, address + offset);
                } else {
@@ -980,7 +988,9 @@
        if (pte_same(*page_table, pte)) {
                if (PageReserved(old_page))
                        ++mm->rss;
+               page_remove_rmap(old_page, page_table);
                break_cow(vma, new_page, address, page_table);
+               page_add_rmap(new_page, page_table);
                lru_cache_add(new_page);
 
                /* Free the old page.. */
@@ -1093,6 +1103,10 @@
        struct page *new_page;
        unsigned long offset;
 
+       /* Low on free memory ?  Don't make things worse. */
+       if (free_low(ALL_ZONES) < 0)
+               return;
+
        /*
         * Get the number of handles we should do readahead io to.
         */
@@ -1141,7 +1155,7 @@
                ret = 2;
        }
 
-       mark_page_accessed(page);
+       touch_page(page);
 
        lock_page(page);
 
@@ -1172,6 +1186,7 @@
        flush_page_to_ram(page);
        flush_icache_page(vma, page);
        set_pte(page_table, pte);
+       page_add_rmap(page, page_table);
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, address, pte);
@@ -1187,14 +1202,13 @@
 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * 
vma, pte_t *page_table, int write_access, unsigned long addr)
 {
        pte_t entry;
+       struct page * page = ZERO_PAGE(addr);
 
        /* Read-only mapping of ZERO_PAGE. */
        entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 
        /* ..except if it's a write access */
        if (write_access) {
-               struct page *page;
-
                /* Allocate our own private page. */
                spin_unlock(&mm->page_table_lock);
 
@@ -1213,10 +1227,10 @@
                flush_page_to_ram(page);
                entry = pte_mkwrite(pte_mkdirty(mk_pte(page, 
vma->vm_page_prot)));
                lru_cache_add(page);
-               mark_page_accessed(page);
        }
 
        set_pte(page_table, entry);
+       page_add_rmap(page, page_table); /* ignores ZERO_PAGE */
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, addr, entry);
@@ -1271,6 +1285,8 @@
                new_page = page;
        }
 
+       touch_page(new_page);
+
        spin_lock(&mm->page_table_lock);
        /*
         * This silly early PAGE_DIRTY setting removes a race
@@ -1291,6 +1307,7 @@
                if (write_access)
                        entry = pte_mkwrite(pte_mkdirty(entry));
                set_pte(page_table, entry);
+               page_add_rmap(new_page, page_table);
        } else {
                /* One of our sibling threads was faster, back out. */
                page_cache_release(new_page);
@@ -1367,6 +1384,14 @@
        current->state = TASK_RUNNING;
        pgd = pgd_offset(mm, address);
 
+       /* 
+        * If we are over our RSS limit and the system needs memory,
+        * we will free memory for the non-hogs and slow down a bit.
+        */
+       if (mm->rlimit_rss && mm->rss > mm->rlimit_rss &&
+                                       free_high(ALL_ZONES) > 0)
+               rss_free_pages(GFP_HIGHUSER);
+
        /*
         * We need the page table lock to synchronize with kswapd
         * and the SMP-safe atomic PTE updates.
@@ -1448,6 +1473,7 @@
                                goto out;
                        }
                }
+               pgtable_add_rmap(new, mm, address);
                pmd_populate(mm, pmd, new);
        }
 out:
diff -Nru a/mm/mremap.c b/mm/mremap.c
--- a/mm/mremap.c       Fri Mar  1 18:19:44 2002
+++ b/mm/mremap.c       Fri Mar  1 18:19:44 2002
@@ -61,8 +61,14 @@
 {
        int error = 0;
        pte_t pte;
+       struct page * page = NULL;
+
+       if (pte_present(*src))
+               page = pte_page(*src);
 
        if (!pte_none(*src)) {
+               if (page)
+                       page_remove_rmap(page, src);
                pte = ptep_get_and_clear(src);
                if (!dst) {
                        /* No dest?  We must put it back. */
@@ -70,6 +76,8 @@
                        error++;
                }
                set_pte(dst, pte);
+               if (page)
+                       page_add_rmap(page, dst);
        }
        return error;
 }
diff -Nru a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c     Fri Mar  1 18:19:44 2002
+++ b/mm/oom_kill.c     Fri Mar  1 18:19:44 2002
@@ -110,8 +110,7 @@
 
 /*
  * Simple selection loop. We chose the process with the highest
- * number of 'points'. We need the locks to make sure that the
- * list of task structs doesn't change while we look the other way.
+ * number of 'points'. We expect the caller will lock the tasklist.
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
@@ -121,7 +120,6 @@
        struct task_struct *p = NULL;
        struct task_struct *chosen = NULL;
 
-       read_lock(&tasklist_lock);
        for_each_task(p) {
                if (p->pid) {
                        int points = badness(p);
@@ -131,7 +129,6 @@
                        }
                }
        }
-       read_unlock(&tasklist_lock);
        return chosen;
 }
 
@@ -170,19 +167,25 @@
  */
 static void oom_kill(void)
 {
-       struct task_struct *p = select_bad_process(), *q;
+       struct task_struct *p, *q;
+       extern wait_queue_head_t kswapd_done;
+
+       read_lock(&tasklist_lock);
+       p = select_bad_process();
 
        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (p == NULL)
                panic("Out of memory and no killable processes...\n");
 
        /* kill all processes that share the ->mm (i.e. all threads) */
-       read_lock(&tasklist_lock);
        for_each_task(q) {
                if(q->mm == p->mm) oom_kill_task(q);
        }
        read_unlock(&tasklist_lock);
 
+       /* Chances are by this time our victim is sleeping on kswapd. */
+       wake_up(&kswapd_done);
+
        /*
         * Make kswapd go out of the way, so "p" has a good chance of
         * killing itself before someone else gets the chance to ask
@@ -198,7 +201,7 @@
  */
 void out_of_memory(void)
 {
-       static unsigned long first, last, count;
+       static unsigned long first, last, count, lastkill;
        unsigned long now, since;
 
        /*
@@ -235,8 +238,18 @@
                return;
 
        /*
+        * If we just killed a process, wait a while
+        * to give that task a chance to exit. This
+        * avoids killing multiple processes needlessly.
+        */
+       since = now - lastkill;
+       if (since < HZ*5)
+               return;
+
+       /*
         * Ok, really out of memory. Kill something.
         */
+       lastkill = now;
        oom_kill();
 
 reset:
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c   Fri Mar  1 18:19:44 2002
+++ b/mm/page_alloc.c   Fri Mar  1 18:19:44 2002
@@ -22,12 +22,12 @@
 #include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/module.h>
+#include <linux/mm_inline.h>
 
 int nr_swap_pages;
 int nr_active_pages;
-int nr_inactive_pages;
-struct list_head inactive_list;
-struct list_head active_list;
+int nr_inactive_dirty_pages;
+int nr_inactive_clean_pages;
 pg_data_t *pgdat_list;
 
 /* Used to look up the address of the struct zone encoded in page->zone */
@@ -38,6 +38,8 @@
 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int zone_extrafree_ratio[MAX_NR_ZONES] __initdata = { 128, 512, 0, };
+static int zone_extrafree_max[MAX_NR_ZONES] __initdata = { 1024 , 1024, 0, };
 
 /*
  * Free_page() adds the page to the free lists. This is optimized for
@@ -113,16 +115,17 @@
                BUG();
        if (PageLocked(page))
                BUG();
-       if (PageLRU(page))
-               BUG();
        if (PageActive(page))
                BUG();
+       if (PageInactiveDirty(page))
+               BUG();
+       if (PageInactiveClean(page))
+               BUG();
+       if (page->pte_chain)
+               BUG();
        page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
-
-       if (current->flags & PF_FREE_PAGES)
-               goto local_freelist;
- back_local_freelist:
-
+       page->age = PAGE_AGE_START;
+       
        zone = page_zone(page);
 
        mask = (~0UL) << order;
@@ -169,17 +172,6 @@
        memlist_add_head(&(base + page_idx)->list, &area->free_list);
 
        spin_unlock_irqrestore(&zone->lock, flags);
-       return;
-
- local_freelist:
-       if (current->nr_local_pages)
-               goto back_local_freelist;
-       if (in_interrupt())
-               goto back_local_freelist;               
-
-       list_add(&page->list, &current->local_pages);
-       page->index = order;
-       current->nr_local_pages++;
 }
 
 #define MARK_USED(index, order, area) \
@@ -238,10 +230,7 @@
                        set_page_count(page, 1);
                        if (BAD_RANGE(zone,page))
                                BUG();
-                       if (PageLRU(page))
-                               BUG();
-                       if (PageActive(page))
-                               BUG();
+                       DEBUG_LRU_PAGE(page);
                        return page;    
                }
                curr_order++;
@@ -260,78 +249,83 @@
 }
 #endif
 
-static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, 
unsigned int, int *));
-static struct page * balance_classzone(zone_t * classzone, unsigned int 
gfp_mask, unsigned int order, int * freed)
+/*
+ * If we are able to directly reclaim pages, we move pages from the
+ * inactive_clean list onto the free list until the zone has enough
+ * free pages or until the inactive_clean pages are exhausted.
+ * If we cannot do this work ourselves, call kswapd.
+ */
+void FASTCALL(fixup_freespace(zone_t * zone, int direct_reclaim));
+void fixup_freespace(zone_t * zone, int direct_reclaim)
+{
+       if (direct_reclaim) {
+               struct page * page;
+               do {
+                       if ((page = reclaim_page(zone)))
+                               __free_pages_ok(page, 0);
+               } while (page && zone->free_pages <= zone->pages_min);
+       } else
+               wakeup_kswapd(GFP_ATOMIC);
+}
+
+#define PAGES_KERNEL   0
+#define PAGES_MIN      1
+#define PAGES_LOW      2
+#define PAGES_HIGH     3
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+                       unsigned long order, int limit, int direct_reclaim)
 {
-       struct page * page = NULL;
-       int __freed = 0;
+       zone_t **zone = zonelist->zones;
+       unsigned long water_mark = 0;
 
-       if (!(gfp_mask & __GFP_WAIT))
-               goto out;
-       if (in_interrupt())
-               BUG();
-
-       current->allocation_order = order;
-       current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
-
-       __freed = try_to_free_pages(classzone, gfp_mask, order);
-
-       current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
-
-       if (current->nr_local_pages) {
-               struct list_head * entry, * local_pages;
-               struct page * tmp;
-               int nr_pages;
-
-               local_pages = &current->local_pages;
-
-               if (likely(__freed)) {
-                       /* pick from the last inserted so we're lifo */
-                       entry = local_pages->next;
-                       do {
-                               tmp = list_entry(entry, struct page, list);
-                               if (tmp->index == order && 
memclass(page_zone(tmp), classzone)) {
-                                       list_del(entry);
-                                       current->nr_local_pages--;
-                                       set_page_count(tmp, 1);
-                                       page = tmp;
-
-                                       if (page->buffers)
-                                               BUG();
-                                       if (page->mapping)
-                                               BUG();
-                                       if (!VALID_PAGE(page))
-                                               BUG();
-                                       if (PageSwapCache(page))
-                                               BUG();
-                                       if (PageLocked(page))
-                                               BUG();
-                                       if (PageLRU(page))
-                                               BUG();
-                                       if (PageActive(page))
-                                               BUG();
-                                       if (PageDirty(page))
-                                               BUG();
+       for (;;) {
+               zone_t *z = *(zone++);
 
-                                       break;
-                               }
-                       } while ((entry = entry->next) != local_pages);
+               if (!z)
+                       break;
+               if (!z->size)
+                       BUG();
+
+               /*
+                * We allocate if the number of (free + inactive_clean)
+                * pages is above the watermark.
+                */
+               switch (limit) {
+                       case PAGES_KERNEL:
+                               water_mark = z->pages_min / 2;
+                               break;
+                       case PAGES_MIN:
+                               water_mark = z->pages_min;
+                               break;
+                       case PAGES_LOW:
+                               water_mark = z->pages_low;
+                               break;
+                       default:
+                       case PAGES_HIGH:
+                               water_mark = z->pages_high;
                }
 
-               nr_pages = current->nr_local_pages;
-               /* free in reverse order so that the global order will be lifo 
*/
-               while ((entry = local_pages->prev) != local_pages) {
-                       list_del(entry);
-                       tmp = list_entry(entry, struct page, list);
-                       __free_pages_ok(tmp, tmp->index);
-                       if (!nr_pages--)
-                               BUG();
+               if (z->free_pages + z->inactive_clean_pages >= water_mark) {
+                       struct page *page = NULL;
+                       /* If possible, reclaim a page directly. */
+                       if (direct_reclaim)
+                               page = reclaim_page(z);
+                       /* If that fails, fall back to rmqueue. */
+                       if (!page)
+                               page = rmqueue(z, order);
+                       if (page)
+                               return page;
                }
-               current->nr_local_pages = 0;
        }
- out:
-       *freed = __freed;
-       return page;
+
+       /* Found nothing. */
+       return NULL;
 }
 
 /*
@@ -339,100 +333,248 @@
  */
 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, 
zonelist_t *zonelist)
 {
-       unsigned long min;
-       zone_t **zone, * classzone;
+       zone_t **zone;
+       int min, direct_reclaim = 0;
        struct page * page;
-       int freed;
 
+       /*
+        * (If anyone calls gfp from interrupts nonatomically then it
+        * will sooner or later tripped up by a schedule().)
+        *
+        * We fall back to lower-level zones if allocation
+        * in a higher zone fails.
+        */
+
+       /*
+        * Can we take pages directly from the inactive_clean
+        * list?
+        */
+       if (order == 0 && (gfp_mask & __GFP_WAIT))
+               direct_reclaim = 1;
+
+try_again:
+       /*
+        * First, see if we have any zones with lots of free memory.
+        *
+        * We allocate free memory first because it doesn't contain
+        * any data we would want to cache.
+        */
        zone = zonelist->zones;
-       classzone = *zone;
        min = 1UL << order;
        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;
+               if (!z->size)
+                       BUG();
 
-               min += z->pages_low;
+               min += z->pages_min;
                if (z->free_pages > min) {
                        page = rmqueue(z, order);
                        if (page)
                                return page;
-               }
+               } else if (z->free_pages < z->pages_min)
+                       fixup_freespace(z, direct_reclaim);
+       }
+
+       /*
+        * Next, try to allocate a page from a zone with a HIGH
+        * amount of (free + inactive_clean) pages.
+        *
+        * If there is a lot of activity, inactive_target
+        * will be high and we'll have a good chance of
+        * finding a page using the HIGH limit.
+        */
+       page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+       if (page)
+               return page;
+
+       /*
+        * Then try to allocate a page from a zone with more
+        * than zone->pages_low of (free + inactive_clean) pages.
+        *
+        * When the working set is very large and VM activity
+        * is low, we're most likely to have our allocation
+        * succeed here.
+        */
+       page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+       if (page)
+               return page;
+
+       /*
+        * OK, none of the zones on our zonelist has lots
+        * of pages free.
+        *
+        * We wake up kswapd, in the hope that kswapd will
+        * resolve this situation before memory gets tight.
+        *
+        * We'll also help a bit trying to free pages, this
+        * way statistics will make sure really fast allocators
+        * are slowed down more than slow allocators and other
+        * programs in the system shouldn't be impacted as much
+        * by the hogs.
+        */
+       wakeup_kswapd(gfp_mask);
+
+       /*
+        * After waking up kswapd, we try to allocate a page
+        * from any zone which isn't critical yet.
+        *
+        * Kswapd should, in most situations, bring the situation
+        * back to normal in no time.
+        */
+       page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+       if (page)
+               return page;
+
+       /*
+        * Kernel allocations can eat a few emergency pages.
+        * We should be able to run without this, find out why
+        * the SCSI layer isn't happy ...
+        */
+       if (gfp_mask & __GFP_HIGH) {
+               page = __alloc_pages_limit(zonelist, order, PAGES_KERNEL, 
direct_reclaim);
+               if (page)
+                       return page;
        }
 
-       classzone->need_balance = 1;
-       mb();
-       if (waitqueue_active(&kswapd_wait))
-               wake_up_interruptible(&kswapd_wait);
+       /*
+        * Oh well, we didn't succeed.
+        */
+       if (!(current->flags & PF_MEMALLOC)) {
+               /*
+                * Are we dealing with a higher order allocation?
+                *
+                * If so, try to defragment some memory.
+                */
+               if (order > 0 && (gfp_mask & __GFP_WAIT))
+                       goto defragment;
+
+               /*
+                * If we arrive here, we are really tight on memory.
+                * Since kswapd didn't succeed in freeing pages for us,
+                * we need to help it.
+                *
+                * Single page allocs loop until the allocation succeeds.
+                * Multi-page allocs can fail due to memory fragmentation;
+                * in that case we bail out to prevent infinite loops and
+                * hanging device drivers ...
+                *
+                * Another issue are GFP_NOFS allocations; because they
+                * do not have __GFP_FS set it's possible we cannot make
+                * any progress freeing pages, in that case it's better
+                * to give up than to deadlock the kernel looping here.
+                *
+                * NFS: we must yield the CPU (to rpciod) to avoid deadlock.
+                */
+               if (gfp_mask & __GFP_WAIT) {
+                       __set_current_state(TASK_RUNNING);
+                       current->policy |= SCHED_YIELD;
+                       schedule();
+                       if (!order || free_high(ALL_ZONES) >= 0) {
+                               int progress = try_to_free_pages(gfp_mask);
+                               if (progress || (gfp_mask & __GFP_FS))
+                                       goto try_again;
+                               /*
+                                * Fail if no progress was made and the
+                                * allocation may not be able to block on IO.
+                                */
+                               return NULL;
+                       }
+               }
+       }
 
+       /*
+        * Final phase: allocate anything we can!
+        *
+        * Higher order allocations, GFP_ATOMIC allocations and
+        * recursive allocations (PF_MEMALLOC) end up here.
+        *
+        * Only recursive allocations can use the very last pages
+        * in the system, otherwise it would be just too easy to
+        * deadlock the system...
+        */
        zone = zonelist->zones;
        min = 1UL << order;
        for (;;) {
-               unsigned long local_min;
                zone_t *z = *(zone++);
+               struct page * page = NULL;
                if (!z)
                        break;
 
-               local_min = z->pages_min;
-               if (!(gfp_mask & __GFP_WAIT))
-                       local_min >>= 2;
-               min += local_min;
-               if (z->free_pages > min) {
+               /*
+                * SUBTLE: direct_reclaim is only possible if the task
+                * becomes PF_MEMALLOC while looping above. This will
+                * happen when the OOM killer selects this task for
+                * death.
+                */
+               if (direct_reclaim) {
+                       page = reclaim_page(z);
+                       if (page)
+                               return page;
+               }
+
+               /* XXX: is pages_min/4 a good amount to reserve for this? */
+               min += z->pages_min / 4;
+               if (z->free_pages > min || ((current->flags & PF_MEMALLOC) && 
!in_interrupt())) {
                        page = rmqueue(z, order);
                        if (page)
                                return page;
                }
        }
+       goto out_failed;
 
-       /* here we're in the low on memory slow path */
 
-rebalance:
-       if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
+       /*
+        * Naive "defragmentation" for higher-order allocations. First we
+        * free the inactive_clean pages to see if we can allocate our
+        * allocation, then we call page_launder() to clean some dirty
+        * pages, and last we try once more.
+        *
+        * We might want to turn this into something which defragments
+        * memory based on physical page, simply by looking for unmapped
+        * pages next to pages on the free list...
+        */
+defragment:
+       {
+               int freed = 0;
+defragment_again:
                zone = zonelist->zones;
                for (;;) {
                        zone_t *z = *(zone++);
                        if (!z)
                                break;
-
-                       page = rmqueue(z, order);
-                       if (page)
-                               return page;
+                       if (!z->size)
+                               continue;
+                       while (z->inactive_clean_pages) {
+                               struct page * page;
+                               /* Move one page to the free list. */
+                               page = reclaim_page(z);
+                               if (!page)
+                                       break;
+                               __free_page(page);
+                               /* Try if the allocation succeeds. */
+                               page = rmqueue(z, order);
+                               if (page)
+                                       return page;
+                       }
                }
-               return NULL;
-       }
-
-       /* Atomic allocations - we can't balance anything */
-       if (!(gfp_mask & __GFP_WAIT))
-               return NULL;
-
-       page = balance_classzone(classzone, gfp_mask, order, &freed);
-       if (page)
-               return page;
-
-       zone = zonelist->zones;
-       min = 1UL << order;
-       for (;;) {
-               zone_t *z = *(zone++);
-               if (!z)
-                       break;
 
-               min += z->pages_min;
-               if (z->free_pages > min) {
-                       page = rmqueue(z, order);
-                       if (page)
-                               return page;
+               /* XXX: do real defragmentation instead of calling launder ? */
+               if (!freed) {
+                       freed = 1;
+                       current->flags |= PF_MEMALLOC;
+                       try_to_free_pages(gfp_mask);
+                       current->flags &= ~PF_MEMALLOC;
+                       goto defragment_again;
                }
        }
 
-       /* Don't let big-order allocations loop */
-       if (order > 3)
-               return NULL;
-
-       /* Yield for kswapd, and try again */
-       current->policy |= SCHED_YIELD;
-       __set_current_state(TASK_RUNNING);
-       schedule();
-       goto rebalance;
+
+out_failed:
+       /* No luck.. */
+//     printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
+       return NULL;
 }
 
 /*
@@ -480,14 +622,11 @@
 {
        unsigned int sum;
        zone_t *zone;
-       pg_data_t *pgdat = pgdat_list;
 
        sum = 0;
-       while (pgdat) {
-               for (zone = pgdat->node_zones; zone < pgdat->node_zones + 
MAX_NR_ZONES; zone++)
-                       sum += zone->free_pages;
-               pgdat = pgdat->node_next;
-       }
+       for_each_zone(zone)
+               sum += zone->free_pages;
+       
        return sum;
 }
 
@@ -496,23 +635,21 @@
  */
 unsigned int nr_free_buffer_pages (void)
 {
-       pg_data_t *pgdat = pgdat_list;
+       pg_data_t *pgdat;
        unsigned int sum = 0;
 
-       do {
+       for_each_pgdat(pgdat) {
                zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & 
GFP_ZONEMASK);
                zone_t **zonep = zonelist->zones;
                zone_t *zone;
 
                for (zone = *zonep++; zone; zone = *zonep++) {
-                       unsigned long size = zone->size;
-                       unsigned long high = zone->pages_high;
-                       if (size > high)
-                               sum += size - high;
+                       sum += zone->free_pages;
+                       sum += zone->inactive_clean_pages;
+                       sum += zone->inactive_dirty_pages;
                }
 
-               pgdat = pgdat->node_next;
-       } while (pgdat);
+       }
 
        return sum;
 }
@@ -520,13 +657,12 @@
 #if CONFIG_HIGHMEM
 unsigned int nr_free_highpages (void)
 {
-       pg_data_t *pgdat = pgdat_list;
+       pg_data_t *pgdat;
        unsigned int pages = 0;
 
-       while (pgdat) {
+       for_each_pgdat(pgdat)
                pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
-               pgdat = pgdat->node_next;
-       }
+
        return pages;
 }
 #endif
@@ -563,10 +699,18 @@
                tmpdat = tmpdat->node_next;
        }
 
-       printk("( Active: %d, inactive: %d, free: %d )\n",
-              nr_active_pages,
-              nr_inactive_pages,
-              nr_free_pages());
+       printk("Free pages:      %6dkB (%6dkB HighMem)\n",
+               nr_free_pages() << (PAGE_SHIFT-10),
+               nr_free_highpages() << (PAGE_SHIFT-10));
+
+       printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d 
(%d %d %d) )\n",
+               nr_active_pages,
+               nr_inactive_dirty_pages,
+               nr_inactive_clean_pages,
+               nr_free_pages(),
+               freepages.min,
+               freepages.low,
+               freepages.high);
 
        for (type = 0; type < MAX_NR_ZONES; type++) {
                struct list_head *head, *curr;
@@ -726,9 +870,6 @@
                        
        printk("On node %d totalpages: %lu\n", nid, realtotalpages);
 
-       INIT_LIST_HEAD(&active_list);
-       INIT_LIST_HEAD(&inactive_list);
-
        /*
         * Some architectures (with lots of mem and discontinous memory
         * maps) have to search for a good mem_map area:
@@ -751,7 +892,7 @@
        offset = lmem_map - mem_map;    
        for (j = 0; j < MAX_NR_ZONES; j++) {
                zone_t *zone = pgdat->node_zones + j;
-               unsigned long mask;
+               unsigned long mask, extrafree = 0;
                unsigned long size, realsize;
 
                zone_table[nid * MAX_NR_ZONES + j] = zone;
@@ -765,7 +906,13 @@
                zone->lock = SPIN_LOCK_UNLOCKED;
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
+               zone->inactive_clean_pages = 0;
+               zone->inactive_dirty_pages = 0;
                zone->need_balance = 0;
+               INIT_LIST_HEAD(&zone->active_list);
+               INIT_LIST_HEAD(&zone->inactive_dirty_list);
+               INIT_LIST_HEAD(&zone->inactive_clean_list);
+
                if (!size)
                        continue;
 
@@ -785,15 +932,36 @@
 
                pgdat->nr_zones = j+1;
 
+               /*
+                * On large memory machines we keep extra memory
+                * free for kernel allocations.
+                */
+               if (zone_extrafree_ratio[j])
+                       extrafree = min_t(int, (realtotalpages / 
zone_extrafree_ratio[j]), zone_extrafree_max[j]);
+               if (extrafree < zone_balance_max[j])
+                       extrafree = 0;
+
                mask = (realsize / zone_balance_ratio[j]);
                if (mask < zone_balance_min[j])
                        mask = zone_balance_min[j];
-               else if (mask > zone_balance_max[j])
-                       mask = zone_balance_max[j];
-               zone->pages_min = mask;
-               zone->pages_low = mask*2;
-               zone->pages_high = mask*3;
-
+               zone->pages_min = extrafree + min(mask, (unsigned 
long)zone_balance_max[j]);
+               zone->pages_low = extrafree + mask*2;
+               zone->pages_high = extrafree + mask*3;
+               zone->pages_plenty = extrafree + mask*6;
+               /*
+                * Add these free targets to the global free target;
+                * we have to be SURE that freepages.high is higher
+                * than SUM [zone->pages_min] for all zones, otherwise
+                * we may have bad bad problems.
+                *
+                * This means we cannot make the freepages array writable
+                * in /proc, but have to add a separate extra_free_target
+                * for people who require it to catch load spikes in eg.
+                * gigabit ethernet routing...
+                */
+               freepages.min += zone->pages_min;
+               freepages.low += zone->pages_low;
+               freepages.high += zone->pages_high;
                zone->zone_mem_map = mem_map + offset;
                zone->zone_start_mapnr = offset;
                zone->zone_start_paddr = zone_start_paddr;
diff -Nru a/mm/rmap.c b/mm/rmap.c
--- /dev/null   Wed Dec 31 16:00:00 1969
+++ b/mm/rmap.c Fri Mar  1 18:19:44 2002
@@ -0,0 +1,384 @@
+/*
+ * mm/rmap.c - physical to virtual reverse mappings
+ *
+ * Copyright 2001, Rik van Riel <riel@xxxxxxxxxxxxxxxx>
+ * Released under the General Public License (GPL).
+ *
+ *
+ * Simple, low overhead pte-based reverse mapping scheme.
+ * This is kept modular because we may want to experiment
+ * with object-based reverse mapping schemes. Please try
+ * to keep this thing as modular as possible.
+ */
+
+/*
+ * Locking:
+ * - the page->pte_chain is protected by the pagemap_lru_lock,
+ *   we probably want to change this to a per-page lock in the
+ *   future
+ * - because swapout locking is opposite to the locking order
+ *   in the page fault path, the swapout path uses trylocks
+ *   on the mm->page_table_lock
+ */
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/rmap.h>
+#include <asm/smplock.h>
+
+/* #define DEBUG_RMAP */
+
+/*
+ * Shared pages have a chain of pte_chain structures, used to locate
+ * all the mappings to this page. We only need a pointer to the pte
+ * here, the page struct for the page table page contains the process
+ * it belongs to and the offset within that process.
+ *
+ * A singly linked list should be fine for most, if not all, workloads.
+ * On fork-after-exec the mapping we'll be removing will still be near
+ * the start of the list, on mixed application systems the short-lived
+ * processes will have their mappings near the start of the list and
+ * in systems with long-lived applications the relative overhead of
+ * exit() will be lower since the applications are long-lived.
+ */
+struct pte_chain {
+       struct pte_chain * next;
+       pte_t * ptep;
+};
+
+static struct pte_chain * pte_chain_freelist;
+static inline struct pte_chain * pte_chain_alloc(void);
+static inline void pte_chain_free(struct pte_chain *, struct pte_chain *, 
struct page *);
+static void alloc_new_pte_chains(void);
+
+/**
+ * page_referenced - test if the page was referenced
+ * @page: the page to test
+ *
+ * Quick test_and_clear_referenced for all mappings to a page,
+ * returns the number of processes which referenced the page.
+ * Caller needs to hold the pagemap_lru_lock.
+ */
+int FASTCALL(page_referenced(struct page *));
+int page_referenced(struct page * page)
+{
+       struct pte_chain * pc;
+       int referenced = 0;
+
+       if (PageTestandClearReferenced(page))
+               referenced++;
+
+       /* Check all the page tables mapping this page. */
+       for (pc = page->pte_chain; pc; pc = pc->next) {
+               if (ptep_test_and_clear_young(pc->ptep))
+                       referenced++;
+       }
+
+       return referenced;
+}
+
+/**
+ * page_add_rmap - add reverse mapping entry to a page
+ * @page: the page to add the mapping to
+ * @ptep: the page table entry mapping this page
+ *
+ * Add a new pte reverse mapping to a page.
+ * The caller needs to hold the mm->page_table_lock.
+ */
+void FASTCALL(page_add_rmap(struct page *, pte_t *));
+void page_add_rmap(struct page * page, pte_t * ptep)
+{
+       struct pte_chain * pte_chain;
+
+       if (!VALID_PAGE(page) || PageReserved(page))
+               return;
+
+       spin_lock(&pagemap_lru_lock);
+#ifdef DEBUG_RMAP
+       if (!page || !ptep)
+               BUG();
+       if (!pte_present(*ptep))
+               BUG();
+       if (!ptep_to_mm(ptep));
+               BUG();
+       {
+               struct pte_chain * pc;
+               for (pc = page->pte_chain; pc; pc = pc->next) {
+                       if (pc->ptep == ptep)
+                               BUG();
+               }
+       }
+#endif
+       pte_chain = pte_chain_alloc();
+
+       /* Hook up the pte_chain to the page. */
+       pte_chain->ptep = ptep;
+       pte_chain->next = page->pte_chain;
+       page->pte_chain = pte_chain;
+
+       spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * page_remove_rmap - take down reverse mapping to a page
+ * @page: page to remove mapping from
+ * @ptep: page table entry to remove
+ *
+ * Removes the reverse mapping from the pte_chain of the page,
+ * after that the caller can clear the page table entry and free
+ * the page.
+ * Caller needs to hold the mm->page_table_lock.
+ */
+void FASTCALL(page_remove_rmap(struct page *, pte_t *));
+void page_remove_rmap(struct page * page, pte_t * ptep)
+{
+       struct pte_chain * pc, * prev_pc = NULL;
+
+       if (!page || !ptep)
+               BUG();
+       if (!VALID_PAGE(page) || PageReserved(page))
+               return;
+
+       spin_lock(&pagemap_lru_lock);
+       for (pc = page->pte_chain; pc; prev_pc = pc, pc = pc->next) {
+               if (pc->ptep == ptep) {
+                       pte_chain_free(pc, prev_pc, page);
+                       goto out;
+               }
+       }
+#ifdef DEBUG_RMAP
+       /* Not found. This should NEVER happen! */
+       printk(KERN_ERR "page_remove_rmap: pte_chain %p not present.\n", ptep);
+       printk(KERN_ERR "page_remove_rmap: only found: ");
+       for (pc = page->pte_chain; pc; pc = pc->next)
+               printk("%p ", pc->ptep);
+       printk("\n");
+       printk(KERN_ERR "page_remove_rmap: driver cleared PG_reserved ?\n");
+#endif
+
+out:
+       spin_unlock(&pagemap_lru_lock);
+       return;
+                       
+}
+
+/**
+ * try_to_unmap_one - worker function for try_to_unmap
+ * @page: page to unmap
+ * @ptep: page table entry to unmap from page
+ *
+ * Internal helper function for try_to_unmap, called for each page
+ * table entry mapping a page. Because locking order here is opposite
+ * to the locking order used by the page fault path, we use trylocks.
+ * Locking:
+ *     pagemap_lru_lock                page_launder()
+ *         page lock                   page_launder(), trylock
+ *             mm->page_table_lock     try_to_unmap_one(), trylock
+ */
+int FASTCALL(try_to_unmap_one(struct page *, pte_t *));
+int try_to_unmap_one(struct page * page, pte_t * ptep)
+{
+       unsigned long address = ptep_to_address(ptep);
+       struct mm_struct * mm = ptep_to_mm(ptep);
+       struct vm_area_struct * vma;
+       pte_t pte;
+       int ret;
+
+       if (!mm)
+               BUG();
+
+       /*
+        * We need the page_table_lock to protect us from page faults,
+        * munmap, fork, etc...
+        */
+       if (!spin_trylock(&mm->page_table_lock))
+               return SWAP_AGAIN;
+
+       /* During mremap, it's possible pages are not in a VMA. */
+       vma = find_vma(mm, address);
+       if (!vma) {
+               ret = SWAP_FAIL;
+               goto out_unlock;
+       }
+
+       /* The page is mlock()d, we cannot swap it out. */
+       if (vma->vm_flags & VM_LOCKED) {
+               ret = SWAP_FAIL;
+               goto out_unlock;
+       }
+
+       /* Nuke the page table entry. */
+       pte = ptep_get_and_clear(ptep);
+       flush_tlb_page(vma, address);
+       flush_cache_page(vma, address);
+
+       /* Store the swap location in the pte. See handle_pte_fault() ... */
+       if (PageSwapCache(page)) {
+               swp_entry_t entry;
+               entry.val = page->index;
+               swap_duplicate(entry);
+               set_pte(ptep, swp_entry_to_pte(entry));
+       }
+
+       /* Move the dirty bit to the physical page now the pte is gone. */
+       if (pte_dirty(pte))
+               set_page_dirty(page);
+
+       mm->rss--;
+       page_cache_release(page);
+       ret = SWAP_SUCCESS;
+
+out_unlock:
+       spin_unlock(&mm->page_table_lock);
+       return ret;
+}
+
+/**
+ * try_to_unmap - try to remove all page table mappings to a page
+ * @page: the page to get unmapped
+ *
+ * Tries to remove all the page table entries which are mapping this
+ * page, used in the pageout path.  Caller must hold pagemap_lru_lock
+ * and the page lock.  Return values are:
+ *
+ * SWAP_SUCCESS        - we succeeded in removing all mappings
+ * SWAP_AGAIN  - we missed a trylock, try again later
+ * SWAP_FAIL   - the page is unswappable
+ * SWAP_ERROR  - an error occurred
+ */
+int FASTCALL(try_to_unmap(struct page *));
+int try_to_unmap(struct page * page)
+{
+       struct pte_chain * pc, * next_pc, * prev_pc = NULL;
+       int ret = SWAP_SUCCESS;
+
+       /* This page should not be on the pageout lists. */
+       if (!VALID_PAGE(page) || PageReserved(page))
+               BUG();
+       if (!PageLocked(page))
+               BUG();
+       /* We need backing store to swap out a page. */
+       if (!page->mapping)
+               BUG();
+
+       for (pc = page->pte_chain; pc; pc = next_pc) {
+               next_pc = pc->next;
+               switch (try_to_unmap_one(page, pc->ptep)) {
+                       case SWAP_SUCCESS:
+                               /* Free the pte_chain struct. */
+                               pte_chain_free(pc, prev_pc, page);
+                               break;
+                       case SWAP_AGAIN:
+                               /* Skip this pte, remembering status. */
+                               prev_pc = pc;
+                               ret = SWAP_AGAIN;
+                               continue;
+                       case SWAP_FAIL:
+                               return SWAP_FAIL;
+                       case SWAP_ERROR:
+                               return SWAP_ERROR;
+               }
+       }
+
+       return ret;
+}
+
+/**
+ * page_over_rsslimit - test if the page is over its RSS limit
+ * @page - page to test
+ *
+ * This function returns true if the process owning this page
+ * is over its RSS (resident set size) limit.  For shared pages
+ * we make the optimisation of only checking the first process
+ * in the pte_chain list, this should catch hogs while not
+ * evicting pages shared by many processes.
+ * The caller needs to hold the pagemap_lru_lock.
+ */
+int FASTCALL(page_over_rsslimit(struct page *));
+int page_over_rsslimit(struct page * page)
+{
+       struct pte_chain * pte_chain = page->pte_chain;
+       struct mm_struct * mm;
+       pte_t * ptep;
+
+       /* No process is using the page. */
+       if (!pte_chain)
+               return 0;
+
+       ptep = pte_chain->ptep;
+       mm = ptep_to_mm(ptep);
+
+       return mm->rlimit_rss && (mm->rss > mm->rlimit_rss);
+}
+
+/**
+ * pte_chain_free - free pte_chain structure
+ * @pte_chain: pte_chain struct to free
+ * @prev_pte_chain: previous pte_chain on the list (may be NULL)
+ * @page: page this pte_chain hangs off (may be NULL)
+ *
+ * This function unlinks pte_chain from the singly linked list it
+ * may be on and adds the pte_chain to the free list. May also be
+ * called for new pte_chain structures which aren't on any list yet.
+ * Caller needs to hold the pagemap_lru_list.
+ */
+static inline void pte_chain_free(struct pte_chain * pte_chain, struct 
pte_chain * prev_pte_chain, struct page * page)
+{
+       if (prev_pte_chain)
+               prev_pte_chain->next = pte_chain->next;
+       else if (page)
+               page->pte_chain = pte_chain->next;
+
+       pte_chain->ptep = NULL;
+       pte_chain->next = pte_chain_freelist;
+       pte_chain_freelist = pte_chain;
+}
+
+/**
+ * pte_chain_alloc - allocate a pte_chain struct
+ *
+ * Returns a pointer to a fresh pte_chain structure. Allocates new
+ * pte_chain structures as required.
+ * Caller needs to hold the pagemap_lru_lock.
+ */
+static inline struct pte_chain * pte_chain_alloc(void)
+{
+       struct pte_chain * pte_chain;
+
+       /* Allocate new pte_chain structs as needed. */
+       if (!pte_chain_freelist)
+               alloc_new_pte_chains();
+
+       /* Grab the first pte_chain from the freelist. */
+       pte_chain = pte_chain_freelist;
+       pte_chain_freelist = pte_chain->next;
+       pte_chain->next = NULL;
+
+       return pte_chain;
+}
+
+/**
+ * alloc_new_pte_chains - convert a free page to pte_chain structures
+ *
+ * Grabs a free page and converts it to pte_chain structures. We really
+ * should pre-allocate these earlier in the pagefault path or come up
+ * with some other trick.
+ *
+ * Note that we cannot use the slab cache because the pte_chain structure
+ * is way smaller than the minimum size of a slab cache allocation.
+ */
+static void alloc_new_pte_chains(void)
+{
+       struct pte_chain * pte_chain = (void *) get_zeroed_page(GFP_ATOMIC);
+       int i = PAGE_SIZE / sizeof(struct pte_chain);
+
+       if (pte_chain) {
+               for (; i-- > 0; pte_chain++)
+                       pte_chain_free(pte_chain, NULL, NULL);
+       } else {
+               /* Yeah yeah, I'll fix the pte_chain allocation ... */
+               panic("Fix pte_chain allocation, you lazy bastard!\n");
+       }
+}
diff -Nru a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c Fri Mar  1 18:19:44 2002
+++ b/mm/slab.c Fri Mar  1 18:19:44 2002
@@ -911,34 +911,45 @@
 #define drain_cpu_caches(cachep)       do { } while (0)
 #endif
 
+/**
+ * Called with the &cachep->spinlock held, returns number of slabs released
+ */
+static int __kmem_cache_shrink_locked(kmem_cache_t *cachep)
+{
+        slab_t *slabp;
+        int ret = 0;
+
+        /* If the cache is growing, stop shrinking. */
+        while (!cachep->growing) {
+                struct list_head *p;
+
+                p = cachep->slabs_free.prev;
+                if (p == &cachep->slabs_free)
+                        break;
+
+                slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
+#if DEBUG
+                if (slabp->inuse)
+                        BUG();
+#endif
+                list_del(&slabp->list);
+
+                spin_unlock_irq(&cachep->spinlock);
+                kmem_slab_destroy(cachep, slabp);
+               ret++;
+                spin_lock_irq(&cachep->spinlock);
+        }
+        return ret;
+}
+
 static int __kmem_cache_shrink(kmem_cache_t *cachep)
 {
-       slab_t *slabp;
        int ret;
 
        drain_cpu_caches(cachep);
 
        spin_lock_irq(&cachep->spinlock);
-
-       /* If the cache is growing, stop shrinking. */
-       while (!cachep->growing) {
-               struct list_head *p;
-
-               p = cachep->slabs_free.prev;
-               if (p == &cachep->slabs_free)
-                       break;
-
-               slabp = list_entry(cachep->slabs_free.prev, slab_t, list);
-#if DEBUG
-               if (slabp->inuse)
-                       BUG();
-#endif
-               list_del(&slabp->list);
-
-               spin_unlock_irq(&cachep->spinlock);
-               kmem_slab_destroy(cachep, slabp);
-               spin_lock_irq(&cachep->spinlock);
-       }
+       __kmem_cache_shrink_locked(cachep);
        ret = !list_empty(&cachep->slabs_full) || 
!list_empty(&cachep->slabs_partial);
        spin_unlock_irq(&cachep->spinlock);
        return ret;
@@ -957,6 +968,24 @@
                BUG();
 
        return __kmem_cache_shrink(cachep);
+}
+
+/**
+ * kmem_cache_shrink_nr - Shrink a cache returning pages released
+ */
+int kmem_cache_shrink_nr(kmem_cache_t *cachep)
+{
+        int ret;
+
+        if (!cachep || in_interrupt() || !is_chained_kmem_cache(cachep))
+                BUG();
+
+       drain_cpu_caches(cachep);
+
+       spin_lock_irq(&cachep->spinlock);
+       ret = __kmem_cache_shrink_locked(cachep);
+       spin_unlock_irq(&cachep->spinlock);
+       return ret<<(cachep->gfporder);
 }
 
 /**
diff -Nru a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c Fri Mar  1 18:19:44 2002
+++ b/mm/swap.c Fri Mar  1 18:19:44 2002
@@ -15,15 +15,29 @@
 
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
-#include <linux/swap.h>
 #include <linux/swapctl.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/mm_inline.h>
 
 #include <asm/dma.h>
 #include <asm/uaccess.h> /* for copy_to/from_user */
 #include <asm/pgtable.h>
 
+/*
+ * We identify three levels of free memory.  We never let free mem
+ * fall below the freepages.min except for atomic allocations.  We
+ * start background swapping if we fall below freepages.high free
+ * pages, and we begin intensive swapping below freepages.low.
+ *
+ * Actual initialization is done in mm/page_alloc.c
+ */
+freepages_t freepages = {
+       0,      /* freepages.min */
+       0,      /* freepages.low */
+       0       /* freepages.high */
+};
+
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
@@ -33,17 +47,102 @@
        8,      /* do swap I/O in clusters of this size */
 };
 
+/**
+ * (de)activate_page - move pages from/to active and inactive lists
+ * @page: the page we want to move
+ * @nolock - are we already holding the pagemap_lru_lock?
+ *
+ * Deactivate_page will move an active page to the right
+ * inactive list, while activate_page will move a page back
+ * from one of the inactive lists to the active list. If
+ * called on a page which is not on any of the lists, the
+ * page is left alone.
+ */
+void FASTCALL(deactivate_page_nolock(struct page *));
+void deactivate_page_nolock(struct page * page)
+{
+       /*
+        * Don't touch it if it's not on the active list.
+        * (some pages aren't on any list at all)
+        */
+       ClearPageReferenced(page);
+       page->age = 0;
+       if (PageActive(page)) {
+               del_page_from_active_list(page);
+               add_page_to_inactive_dirty_list(page);
+       }
+}      
+
+void FASTCALL(deactivate_page(struct page *));
+void deactivate_page(struct page * page)
+{
+       spin_lock(&pagemap_lru_lock);
+       deactivate_page_nolock(page);
+       spin_unlock(&pagemap_lru_lock);
+}
+
+/**
+ * drop_page - like deactivate_page, but try inactive_clean list
+ * @page: the page to drop
+ *
+ * Try to move a page to the inactive_clean list, this succeeds if the
+ * page is clean and not in use by anybody. If the page cannot be placed
+ * on the inactive_clean list it is placed on the inactive_dirty list
+ * instead.
+ *
+ * Note: this function gets called with the pagemap_lru_lock held.
+ */
+void FASTCALL(drop_page(struct page *));
+void drop_page(struct page * page)
+{
+       if (!TryLockPage(page)) {
+               if (page->mapping && page->buffers) {
+                       page_cache_get(page);
+                       spin_unlock(&pagemap_lru_lock);
+                       try_to_release_page(page, GFP_NOIO);
+                       spin_lock(&pagemap_lru_lock);
+                       page_cache_release(page);
+               }
+               UnlockPage(page);
+       }
+
+       /* Make sure the page really is reclaimable. */
+       if (!page->mapping || PageDirty(page) || page->pte_chain ||
+                       page->buffers || page_count(page) > 1)
+               deactivate_page_nolock(page);
+
+       else if (page_count(page) == 1) {
+               ClearPageReferenced(page);
+               page->age = 0;
+               if (PageActive(page)) {
+                       del_page_from_active_list(page);
+                       add_page_to_inactive_clean_list(page);
+               } else if (PageInactiveDirty(page)) {
+                       del_page_from_inactive_dirty_list(page);
+                       add_page_to_inactive_clean_list(page);
+               }
+       }
+}
+
 /*
  * Move an inactive page to the active list.
  */
-static inline void activate_page_nolock(struct page * page)
+void FASTCALL(activate_page_nolock(struct page *));
+void activate_page_nolock(struct page * page)
 {
-       if (PageLRU(page) && !PageActive(page)) {
-               del_page_from_inactive_list(page);
+       if (PageInactiveDirty(page)) {
+               del_page_from_inactive_dirty_list(page);
+               add_page_to_active_list(page);
+       } else if (PageInactiveClean(page)) {
+               del_page_from_inactive_clean_list(page);
                add_page_to_active_list(page);
        }
+
+       /* Make sure the page gets a fair chance at staying active. */
+       page->age = max((int)page->age, PAGE_AGE_START);
 }
 
+void FASTCALL(activate_page(struct page *));
 void activate_page(struct page * page)
 {
        spin_lock(&pagemap_lru_lock);
@@ -55,11 +154,12 @@
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
+void FASTCALL(lru_cache_add(struct page *));
 void lru_cache_add(struct page * page)
 {
-       if (!TestSetPageLRU(page)) {
+       if (!PageLRU(page)) {
                spin_lock(&pagemap_lru_lock);
-               add_page_to_inactive_list(page);
+               add_page_to_active_list(page);
                spin_unlock(&pagemap_lru_lock);
        }
 }
@@ -71,14 +171,15 @@
  * This function is for when the caller already holds
  * the pagemap_lru_lock.
  */
+void FASTCALL(__lru_cache_del(struct page *));
 void __lru_cache_del(struct page * page)
 {
-       if (TestClearPageLRU(page)) {
-               if (PageActive(page)) {
-                       del_page_from_active_list(page);
-               } else {
-                       del_page_from_inactive_list(page);
-               }
+       if (PageActive(page)) {
+               del_page_from_active_list(page);
+       } else if (PageInactiveDirty(page)) {
+               del_page_from_inactive_dirty_list(page);
+       } else if (PageInactiveClean(page)) {
+               del_page_from_inactive_clean_list(page);
        }
 }
 
@@ -86,6 +187,7 @@
  * lru_cache_del: remove a page from the page lists
  * @page: the page to remove
  */
+void FASTCALL(lru_cache_del(struct page *));
 void lru_cache_del(struct page * page)
 {
        spin_lock(&pagemap_lru_lock);
diff -Nru a/mm/swap_state.c b/mm/swap_state.c
--- a/mm/swap_state.c   Fri Mar  1 18:19:44 2002
+++ b/mm/swap_state.c   Fri Mar  1 18:19:44 2002
@@ -89,6 +89,40 @@
        return 0;
 }
 
+/**
+ * add_to_swap - allocate swap space for a page
+ * @page: page we want to move to swap
+ *
+ * Allocate swap space for the page and add the page to the
+ * swap cache.  Caller needs to hold the page lock.
+ */
+int add_to_swap(struct page * page)
+{
+       swp_entry_t entry;
+
+       if (!PageLocked(page))
+               BUG();
+
+       for (;;) {
+               entry = get_swap_page();
+               if (!entry.val)
+                       return 0;
+               /*
+                * Add it to the swap cache and mark it dirty
+                * (adding to the page cache will clear the dirty
+                * and uptodate bits, so we need to do it again)
+                */
+               if (add_to_swap_cache(page, entry) == 0) {
+                       SetPageUptodate(page);
+                       set_page_dirty(page);
+                       swap_free(entry);
+                       return 1;
+               }
+               /* Raced with "speculative" read_swap_cache_async */
+               swap_free(entry);
+       }
+}
+
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
diff -Nru a/mm/swapfile.c b/mm/swapfile.c
--- a/mm/swapfile.c     Fri Mar  1 18:19:44 2002
+++ b/mm/swapfile.c     Fri Mar  1 18:19:44 2002
@@ -374,6 +374,7 @@
                return;
        get_page(page);
        set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
+       page_add_rmap(page, dir);
        swap_free(entry);
        ++vma->vm_mm->rss;
 }
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c       Fri Mar  1 18:19:44 2002
+++ b/mm/vmscan.c       Fri Mar  1 18:19:44 2002
@@ -1,6 +1,9 @@
 /*
  *  linux/mm/vmscan.c
  *
+ *  The pageout daemon, decides which pages to evict (swap out) and
+ *  does the actual work of freeing them.
+ *
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *
  *  Swap reorganised 29.12.95, Stephen Tweedie.
@@ -21,9 +24,12 @@
 #include <linux/highmem.h>
 #include <linux/file.h>
 #include <linux/compiler.h>
+#include <linux/mm_inline.h>
 
 #include <asm/pgalloc.h>
 
+static void refill_freelist(void);
+static void wakeup_memwaiters(void);
 /*
  * The "priority" of VM scanning is how much of the queues we
  * will scan in one go. A value of 6 for DEF_PRIORITY implies
@@ -32,371 +38,275 @@
  */
 #define DEF_PRIORITY (6)
 
-/*
- * The swap-out function returns 1 if it successfully
- * scanned all the pages it was asked to (`count').
- * It returns zero if it couldn't do anything,
- *
- * rss may decrease because pages are shared, but this
- * doesn't count as having freed a page.
- */
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int try_to_swap_out(struct mm_struct * mm, struct 
vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page 
*page, zone_t * classzone)
+static inline void age_page_up(struct page *page)
 {
-       pte_t pte;
-       swp_entry_t entry;
+       page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); 
+}
 
-       /* Don't look at this pte if it's been accessed recently. */
-       if ((vma->vm_flags & VM_LOCKED) || 
ptep_test_and_clear_young(page_table)) {
-               mark_page_accessed(page);
-               return 0;
-       }
+static inline void age_page_down(struct page *page)
+{
+       page->age -= min(PAGE_AGE_DECL, (int)page->age);
+}
 
-       /* Don't bother unmapping pages that are active */
-       if (PageActive(page))
-               return 0;
+static inline int page_mapping_inuse(struct page * page)
+{
+       struct address_space * mapping = page->mapping;
 
-       /* Don't bother replenishing zones not under pressure.. */
-       if (!memclass(page_zone(page), classzone))
-               return 0;
+       /* Page is in somebody's page tables. */
+       if (page->pte_chain)
+               return 1;
 
-       if (TryLockPage(page))
+       /* XXX: does this happen ? */
+       if (!mapping)
                return 0;
 
-       /* From this point on, the odds are that we're going to
-        * nuke this pte, so read and clear the pte.  This hook
-        * is needed on CPUs which update the accessed and dirty
-        * bits in hardware.
-        */
-       flush_cache_page(vma, address);
-       pte = ptep_get_and_clear(page_table);
-       flush_tlb_page(vma, address);
-
-       if (pte_dirty(pte))
-               set_page_dirty(page);
+       /* File is mmaped by somebody. */
+       if (mapping->i_mmap || mapping->i_mmap_shared)
+               return 1;
 
-       /*
-        * Is the page already in the swap cache? If so, then
-        * we can just drop our reference to it without doing
-        * any IO - it's already up-to-date on disk.
-        */
-       if (PageSwapCache(page)) {
-               entry.val = page->index;
-               swap_duplicate(entry);
-set_swap_pte:
-               set_pte(page_table, swp_entry_to_pte(entry));
-drop_pte:
-               mm->rss--;
-               UnlockPage(page);
-               {
-                       int freeable = page_count(page) - !!page->buffers <= 2;
-                       page_cache_release(page);
-                       return freeable;
-               }
-       }
+       return 0;
+}
 
-       /*
-        * Is it a clean page? Then it must be recoverable
-        * by just paging it in again, and we can just drop
-        * it..  or if it's dirty but has backing store,
-        * just mark the page dirty and drop it.
-        *
-        * However, this won't actually free any real
-        * memory, as the page will just be in the page cache
-        * somewhere, and as such we should just continue
-        * our scan.
-        *
-        * Basically, this just makes it possible for us to do
-        * some real work in the future in "refill_inactive()".
-        */
-       if (page->mapping)
-               goto drop_pte;
-       if (!PageDirty(page))
-               goto drop_pte;
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
+ */
+struct page * reclaim_page(zone_t * zone)
+{
+       struct page * page = NULL;
+       struct list_head * page_lru;
+       swp_entry_t entry = {0};
+       int maxscan;
 
        /*
-        * Anonymous buffercache pages can be left behind by
-        * concurrent truncate and pagefault.
+        * We need to hold the pagecache_lock around all tests to make sure
+        * reclaim_page() cannot race with find_get_page() and friends.
         */
-       if (page->buffers)
-               goto preserve;
+       spin_lock(&pagemap_lru_lock);
+       spin_lock(&pagecache_lock);
+       maxscan = zone->inactive_clean_pages;
+       while (maxscan-- && !list_empty(&zone->inactive_clean_list)) {
+               page_lru = zone->inactive_clean_list.prev;
+               page = list_entry(page_lru, struct page, lru);
 
-       /*
-        * This is a dirty, swappable page.  First of all,
-        * get a suitable swap entry for it, and make sure
-        * we have the swap cache set up to associate the
-        * page with that swap entry.
-        */
-       for (;;) {
-               entry = get_swap_page();
-               if (!entry.val)
-                       break;
-               /* Add it to the swap cache and mark it dirty
-                * (adding to the page cache will clear the dirty
-                * and uptodate bits, so we need to do it again)
-                */
-               if (add_to_swap_cache(page, entry) == 0) {
-                       SetPageUptodate(page);
-                       set_page_dirty(page);
-                       goto set_swap_pte;
+               /* Wrong page on list?! (list corruption, should not happen) */
+               if (unlikely(!PageInactiveClean(page))) {
+                       printk("VM: reclaim_page, wrong page on list.\n");
+                       list_del(page_lru);
+                       page_zone(page)->inactive_clean_pages--;
+                       continue;
                }
-               /* Raced with "speculative" read_swap_cache_async */
-               swap_free(entry);
-       }
-
-       /* No swap space left */
-preserve:
-       set_pte(page_table, pte);
-       UnlockPage(page);
-       return 0;
-}
 
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * 
vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * 
classzone)
-{
-       pte_t * pte;
-       unsigned long pmd_end;
-
-       if (pmd_none(*dir))
-               return count;
-       if (pmd_bad(*dir)) {
-               pmd_ERROR(*dir);
-               pmd_clear(dir);
-               return count;
-       }
-       
-       pte = pte_offset(dir, address);
-       
-       pmd_end = (address + PMD_SIZE) & PMD_MASK;
-       if (end > pmd_end)
-               end = pmd_end;
+               /* Page is being freed */
+               if (unlikely(page_count(page)) == 0) {
+                       list_del(page_lru);
+                       list_add(page_lru, &zone->inactive_clean_list);
+                       continue;
+               }
 
-       do {
-               if (pte_present(*pte)) {
-                       struct page *page = pte_page(*pte);
+               /* Page cannot be reclaimed ?  Move to inactive_dirty list. */
+               if (unlikely(page->pte_chain || page->buffers ||
+                               PageReferenced(page) || PageDirty(page) ||
+                               page_count(page) > 1 || TryLockPage(page))) {
+                       del_page_from_inactive_clean_list(page);
+                       add_page_to_inactive_dirty_list(page);
+                       continue;
+               }
 
-                       if (VALID_PAGE(page) && !PageReserved(page)) {
-                               count -= try_to_swap_out(mm, vma, address, pte, 
page, classzone);
-                               if (!count) {
-                                       address += PAGE_SIZE;
-                                       break;
-                               }
-                       }
+               /* OK, remove the page from the caches. */
+                if (PageSwapCache(page)) {
+                       entry.val = page->index;
+                       __delete_from_swap_cache(page);
+                       goto found_page;
                }
-               address += PAGE_SIZE;
-               pte++;
-       } while (address && (address < end));
-       mm->swap_address = address;
-       return count;
-}
 
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * 
vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * 
classzone)
-{
-       pmd_t * pmd;
-       unsigned long pgd_end;
+               if (page->mapping) {
+                       __remove_inode_page(page);
+                       goto found_page;
+               }
 
-       if (pgd_none(*dir))
-               return count;
-       if (pgd_bad(*dir)) {
-               pgd_ERROR(*dir);
-               pgd_clear(dir);
-               return count;
+               /* We should never ever get here. */
+               printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+               list_del(page_lru);
+               zone->inactive_clean_pages--;
+               UnlockPage(page);
        }
+       spin_unlock(&pagecache_lock);
+       spin_unlock(&pagemap_lru_lock);
+       return NULL;
 
-       pmd = pmd_offset(dir, address);
-
-       pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;  
-       if (pgd_end && (end > pgd_end))
-               end = pgd_end;
-       
-       do {
-               count = swap_out_pmd(mm, vma, pmd, address, end, count, 
classzone);
-               if (!count)
-                       break;
-               address = (address + PMD_SIZE) & PMD_MASK;
-               pmd++;
-       } while (address && (address < end));
-       return count;
-}
-
-/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * 
vma, unsigned long address, int count, zone_t * classzone)
-{
-       pgd_t *pgdir;
-       unsigned long end;
-
-       /* Don't swap out areas which are reserved */
-       if (vma->vm_flags & VM_RESERVED)
-               return count;
-
-       pgdir = pgd_offset(mm, address);
-
-       end = vma->vm_end;
-       if (address >= end)
-               BUG();
-       do {
-               count = swap_out_pgd(mm, vma, pgdir, address, end, count, 
classzone);
-               if (!count)
-                       break;
-               address = (address + PGDIR_SIZE) & PGDIR_MASK;
-               pgdir++;
-       } while (address && (address < end));
-       return count;
+found_page:
+       del_page_from_inactive_clean_list(page);
+       spin_unlock(&pagecache_lock);
+       spin_unlock(&pagemap_lru_lock);
+       if (entry.val)
+               swap_free(entry);
+       UnlockPage(page);
+       page->age = PAGE_AGE_START;
+       if (page_count(page) != 1)
+               printk("VM: reclaim_page, found page with count %d!\n",
+                               page_count(page));
+       return page;
 }
 
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
-
-/*
- * Returns remaining count of pages to be swapped out by followup call.
+/**
+ * page_dirty - do we need to write the data out to disk
+ * @page: page to test
+ *
+ * Returns true if the page contains data which needs to
+ * be written to disk.  Doesn't test the page tables (yet?).
  */
-static inline int swap_out_mm(struct mm_struct * mm, int count, int * 
mmcounter, zone_t * classzone)
+static inline int page_dirty(struct page *page)
 {
-       unsigned long address;
-       struct vm_area_struct* vma;
+       struct buffer_head *tmp, *bh;
 
-       /*
-        * Find the proper vm-area after freezing the vma chain 
-        * and ptes.
-        */
-       spin_lock(&mm->page_table_lock);
-       address = mm->swap_address;
-       if (address == TASK_SIZE || swap_mm != mm) {
-               /* We raced: don't count this mm but try again */
-               ++*mmcounter;
-               goto out_unlock;
-       }
-       vma = find_vma(mm, address);
-       if (vma) {
-               if (address < vma->vm_start)
-                       address = vma->vm_start;
-
-               for (;;) {
-                       count = swap_out_vma(mm, vma, address, count, 
classzone);
-                       vma = vma->vm_next;
-                       if (!vma)
-                               break;
-                       if (!count)
-                               goto out_unlock;
-                       address = vma->vm_start;
-               }
-       }
-       /* Indicate that we reached the end of address space */
-       mm->swap_address = TASK_SIZE;
+       if (PageDirty(page))
+               return 1;
 
-out_unlock:
-       spin_unlock(&mm->page_table_lock);
-       return count;
-}
+       if (page->mapping && !page->buffers)
+               return 0;
 
-static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, 
zone_t * classzone));
-static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * 
classzone)
-{
-       int counter, nr_pages = SWAP_CLUSTER_MAX;
-       struct mm_struct *mm;
+       tmp = bh = page->buffers;
 
-       counter = mmlist_nr;
        do {
-               if (unlikely(current->need_resched)) {
-                       __set_current_state(TASK_RUNNING);
-                       schedule();
-               }
-
-               spin_lock(&mmlist_lock);
-               mm = swap_mm;
-               while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
-                       mm->swap_address = 0;
-                       mm = list_entry(mm->mmlist.next, struct mm_struct, 
mmlist);
-                       if (mm == swap_mm)
-                               goto empty;
-                       swap_mm = mm;
-               }
-
-               /* Make sure the mm doesn't disappear when we drop the lock.. */
-               atomic_inc(&mm->mm_users);
-               spin_unlock(&mmlist_lock);
-
-               nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
-
-               mmput(mm);
-
-               if (!nr_pages)
+               if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
                        return 1;
-       } while (--counter >= 0);
+               tmp = tmp->b_this_page;
+       } while (tmp != bh);
 
        return 0;
-
-empty:
-       spin_unlock(&mmlist_lock);
-       return 0;
 }
 
-static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned 
int gfp_mask, int priority));
-static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int 
gfp_mask, int priority)
+/**
+ * page_launder_zone - clean dirty inactive pages, move to inactive_clean list
+ * @zone: zone to free pages in
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function is called when we are low on free / inactive_clean
+ * pages, its purpose is to refill the free/clean list as efficiently
+ * as possible.
+ *
+ * This means we do writes asynchronously as long as possible and will
+ * only sleep on IO when we don't have another option. Since writeouts
+ * cause disk seeks and make read IO slower, we skip writes alltogether
+ * when the amount of dirty pages is small.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define        CAN_DO_FS       ((gfp_mask & __GFP_FS) && should_write)
+int page_launder_zone(zone_t * zone, int gfp_mask, int priority)
 {
+       int maxscan, cleaned_pages, target;
        struct list_head * entry;
-       int max_scan = nr_inactive_pages / priority;
-       int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);
 
+       target = free_plenty(zone);
+       cleaned_pages = 0;
+       
+       /* The main launder loop. */
        spin_lock(&pagemap_lru_lock);
-       while (--max_scan >= 0 && (entry = inactive_list.prev) != 
&inactive_list) {
+       maxscan = zone->inactive_dirty_pages >> priority;
+       while (maxscan-- && !list_empty(&zone->inactive_dirty_list)) {
                struct page * page;
-
-               if (unlikely(current->need_resched)) {
+               
+               /* Low latency reschedule point */
+               if (current->need_resched) {
                        spin_unlock(&pagemap_lru_lock);
-                       __set_current_state(TASK_RUNNING);
                        schedule();
                        spin_lock(&pagemap_lru_lock);
                        continue;
                }
 
+               entry = zone->inactive_dirty_list.prev;
                page = list_entry(entry, struct page, lru);
 
-               if (unlikely(!PageLRU(page)))
-                       BUG();
-               if (unlikely(PageActive(page)))
-                       BUG();
+               if (cleaned_pages > target)
+                       break;
 
                list_del(entry);
-               list_add(entry, &inactive_list);
+               list_add(entry, &zone->inactive_dirty_list);
+
+               /* Wrong page on list?! (list corruption, should not happen) */
+               if (!PageInactiveDirty(page)) {
+                       printk("VM: page_launder, wrong page on list.\n");
+                       list_del(entry);
+                       nr_inactive_dirty_pages--;
+                       page_zone(page)->inactive_dirty_pages--;
+                       continue;
+               }
 
                /*
-                * Zero page counts can happen because we unlink the pages
-                * _after_ decrementing the usage count..
+                * The page is in active use or really unfreeable. Move to
+                * the active list and adjust the page age if needed.
                 */
-               if (unlikely(!page_count(page)))
+               if (page_referenced(page) && page_mapping_inuse(page) &&
+                               !page_over_rsslimit(page)) {
+                       del_page_from_inactive_dirty_list(page);
+                       add_page_to_active_list(page);
+                       page->age = max((int)page->age, PAGE_AGE_START);
                        continue;
+               }
 
-               if (!memclass(page_zone(page), classzone))
+               /*
+                * Page is being freed, don't worry about it.
+                */
+               if (unlikely(page_count(page)) == 0)
                        continue;
 
-               /* Racy check to avoid trylocking when not worthwhile */
-               if (!page->buffers && (page_count(page) != 1 || !page->mapping))
-                       goto page_mapped;
-
                /*
                 * The page is locked. IO in progress?
                 * Move it to the back of the list.
                 */
-               if (unlikely(TryLockPage(page))) {
-                       if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
-                               page_cache_get(page);
-                               spin_unlock(&pagemap_lru_lock);
-                               wait_on_page(page);
+               if (unlikely(TryLockPage(page)))
+                       continue;
+
+               /*
+                * Anonymous process memory without backing store. Try to
+                * allocate it some swap space here.
+                *
+                * XXX: implement swap clustering ?
+                */
+               if (page->pte_chain && !page->mapping && !page->buffers) {
+                       page_cache_get(page);
+                       spin_unlock(&pagemap_lru_lock);
+                       if (!add_to_swap(page)) {
+                               activate_page(page);
+                               UnlockPage(page);
                                page_cache_release(page);
                                spin_lock(&pagemap_lru_lock);
+                               continue;
+                       }
+                       page_cache_release(page);
+                       spin_lock(&pagemap_lru_lock);
+               }
+
+               /*
+                * The page is mapped into the page tables of one or more
+                * processes. Try to unmap it here.
+                */
+               if (page->pte_chain) {
+                       switch (try_to_unmap(page)) {
+                               case SWAP_ERROR:
+                               case SWAP_FAIL:
+                                       goto page_active;
+                               case SWAP_AGAIN:
+                                       UnlockPage(page);
+                                       continue;
+                               case SWAP_SUCCESS:
+                                       ; /* try to free the page below */
                        }
-                       continue;
                }
 
-               if (PageDirty(page) && is_page_cache_freeable(page) && 
page->mapping) {
+               if (PageDirty(page) && page->mapping) {
                        /*
                         * It is not critical here to write it only if
                         * the page is unmapped beause any direct writer
                         * like O_DIRECT would set the PG_dirty bitflag
-                        * on the phisical page after having successfully
+                        * on the physical page after having successfully
                         * pinned it and after the I/O to the page is finished,
                         * so the direct writes to the page cannot get lost.
                         */
@@ -425,7 +335,7 @@
                if (page->buffers) {
                        spin_unlock(&pagemap_lru_lock);
 
-                       /* avoid to free a locked page */
+                       /* To avoid freeing our page before we're done. */
                        page_cache_get(page);
 
                        if (try_to_release_page(page, gfp_mask)) {
@@ -443,14 +353,14 @@
                                        /* effectively free the page here */
                                        page_cache_release(page);
 
-                                       if (--nr_pages)
-                                               continue;
-                                       break;
+                                       cleaned_pages++;
+                                       continue;
                                } else {
                                        /*
-                                        * The page is still in pagecache so 
undo the stuff
-                                        * before the try_to_release_page since 
we've not
-                                        * finished and we can now try the next 
step.
+                                        * We freed the buffers but may have
+                                        * slept; undo the stuff we did before
+                                        * try_to_release_page and fall through
+                                        * to the next step.
                                         */
                                        page_cache_release(page);
 
@@ -466,227 +376,268 @@
                        }
                }
 
-               spin_lock(&pagecache_lock);
 
                /*
-                * this is the non-racy check for busy page.
+                * If the page is really freeable now, move it to the
+                * inactive_clean list.
+                *
+                * We re-test everything since the page could have been
+                * used by somebody else while we waited on IO above.
+                * This test is not safe from races, but only the one
+                * in reclaim_page() needs to be.
                 */
-               if (!page->mapping || !is_page_cache_freeable(page)) {
-                       spin_unlock(&pagecache_lock);
+               if (page->mapping && !PageDirty(page) && !page->pte_chain &&
+                               page_count(page) == 1) {
+                       del_page_from_inactive_dirty_list(page);
+                       add_page_to_inactive_clean_list(page);
                        UnlockPage(page);
-page_mapped:
-                       if (--max_mapped >= 0)
-                               continue;
-
+                       cleaned_pages++;
+               } else {
                        /*
-                        * Alert! We've found too many mapped pages on the
-                        * inactive list, so we start swapping out now!
+                        * OK, we don't know what to do with the page.
+                        * It's no use keeping it here, so we move it to
+                        * the active list.
                         */
-                       spin_unlock(&pagemap_lru_lock);
-                       swap_out(priority, gfp_mask, classzone);
-                       return nr_pages;
+page_active:
+                       del_page_from_inactive_dirty_list(page);
+                       add_page_to_active_list(page);
+                       UnlockPage(page);
+               }
+       }
+       spin_unlock(&pagemap_lru_lock);
+
+       /* Return the number of pages moved to the inactive_clean list. */
+       return cleaned_pages;
+}
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ *
+ * This function iterates over all zones and calls page_launder_zone(),
+ * balancing still needs to be added...
+ */
+int page_launder(int gfp_mask)
+{
+       int maxtry = 1 << DEF_PRIORITY;
+       struct zone_struct * zone;
+       int freed = 0;
+
+       /* Global balancing while we have a global shortage. */
+       while (maxtry-- && free_high(ALL_ZONES) >= 0) {
+               for_each_zone(zone)
+                       if (free_plenty(zone) >= 0)
+                               freed += page_launder_zone(zone, gfp_mask, 6);
+       }
+       
+       /* Clean up the remaining zones with a serious shortage, if any. */
+       for_each_zone(zone)
+               if (free_min(zone) >= 0)
+                       freed += page_launder_zone(zone, gfp_mask, 0);
+
+       return freed;
+}
+
+/**
+ * refill_inactive_zone - scan the active list and find pages to deactivate
+ * @priority: how much are we allowed to scan
+ *
+ * This function will scan a portion of the active list of a zone to find
+ * unused pages, those pages will then be moved to the inactive list.
+ */
+int refill_inactive_zone(struct zone_struct * zone, int priority)
+{
+       int maxscan = zone->active_pages >> priority;
+       int target = inactive_high(zone);
+       struct list_head * page_lru;
+       int nr_deactivated = 0;
+       struct page * page;
+
+       /* Take the lock while messing with the list... */
+       spin_lock(&pagemap_lru_lock);
+       while (maxscan-- && !list_empty(&zone->active_list)) {
+               page_lru = zone->active_list.prev;
+               page = list_entry(page_lru, struct page, lru);
+
+               /* Wrong page on list?! (list corruption, should not happen) */
+               if (unlikely(!PageActive(page))) {
+                       printk("VM: refill_inactive, wrong page on list.\n");
+                       list_del(page_lru);
+                       nr_active_pages--;
+                       continue;
                }
 
                /*
-                * It is critical to check PageDirty _after_ we made sure
-                * the page is freeable* so not in use by anybody.
+                * If the object the page is in is not in use we don't
+                * bother with page aging.  If the page is touched again
+                * while on the inactive_clean list it'll be reactivated.
                 */
-               if (PageDirty(page)) {
-                       spin_unlock(&pagecache_lock);
-                       UnlockPage(page);
+               if (!page_mapping_inuse(page)) {
+                       drop_page(page);
                        continue;
                }
 
-               /* point of no return */
-               if (likely(!PageSwapCache(page))) {
-                       __remove_inode_page(page);
-                       spin_unlock(&pagecache_lock);
+               /*
+                * Do aging on the pages.
+                */
+               if (page_referenced(page)) {
+                       age_page_up(page);
                } else {
-                       swp_entry_t swap;
-                       swap.val = page->index;
-                       __delete_from_swap_cache(page);
-                       spin_unlock(&pagecache_lock);
-                       swap_free(swap);
+                       age_page_down(page);
                }
 
-               __lru_cache_del(page);
-               UnlockPage(page);
-
-               /* effectively free the page here */
-               page_cache_release(page);
+               /* 
+                * If the page age is 'hot' and the process using the
+                * page doesn't exceed its RSS limit we keep the page.
+                * Otherwise we move it to the inactive_dirty list.
+                */
+               if (page->age && !page_over_rsslimit(page)) {
+                       list_del(page_lru);
+                       list_add(page_lru, &zone->active_list);
+               } else {
+                       deactivate_page_nolock(page);
+                       if (++nr_deactivated > target)
+                               break;
+               }
 
-               if (--nr_pages)
-                       continue;
-               break;
+               /* Low latency reschedule point */
+               if (current->need_resched) {
+                       spin_unlock(&pagemap_lru_lock);
+                       schedule();
+                       spin_lock(&pagemap_lru_lock);
+               }
        }
        spin_unlock(&pagemap_lru_lock);
 
-       return nr_pages;
+       return nr_deactivated;
 }
 
-/*
- * This moves pages from the active list to
- * the inactive list.
+/**
+ * refill_inactive - checks all zones and refills the inactive list as needed
  *
- * We move them the other way when we see the
- * reference bit on the page.
+ * This function tries to balance page eviction from all zones by aging
+ * the pages from each zone in the same ratio until the global inactive
+ * shortage is resolved. After that it does one last "clean-up" scan to
+ * fix up local inactive shortages.
  */
-static void refill_inactive(int nr_pages)
+int refill_inactive(void)
 {
-       struct list_head * entry;
-
-       spin_lock(&pagemap_lru_lock);
-       entry = active_list.prev;
-       while (nr_pages && entry != &active_list) {
-               struct page * page;
+       int maxtry = 1 << DEF_PRIORITY;
+       zone_t * zone;
+       int ret = 0;
 
-               page = list_entry(entry, struct page, lru);
-               entry = entry->prev;
-               if (PageTestandClearReferenced(page)) {
-                       list_del(&page->lru);
-                       list_add(&page->lru, &active_list);
-                       continue;
+       /* Global balancing while we have a global shortage. */
+       while (maxtry-- && inactive_low(ALL_ZONES) >= 0) {
+               for_each_zone(zone) {
+                       if (inactive_high(zone) >= 0)
+                               ret += refill_inactive_zone(zone, DEF_PRIORITY);
                }
+       }
 
-               nr_pages--;
-
-               del_page_from_active_list(page);
-               add_page_to_inactive_list(page);
-               SetPageReferenced(page);
+       /* Local balancing for zones which really need it. */
+       for_each_zone(zone) {
+               if (inactive_min(zone) >= 0)
+                       ret += refill_inactive_zone(zone, 0);
        }
-       spin_unlock(&pagemap_lru_lock);
+
+       return ret;
 }
 
-static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned 
int gfp_mask, int nr_pages));
-static int shrink_caches(zone_t * classzone, int priority, unsigned int 
gfp_mask, int nr_pages)
+/**
+ * background_aging - slow background aging of zones
+ * @priority: priority at which to scan
+ *
+ * When the VM load is low or nonexistant, this function is
+ * called once a second to "sort" the pages in the VM. This
+ * way we know which pages to evict once a load spike happens.
+ * The effects of this function are very slow, the CPU usage
+ * should be minimal to nonexistant under most loads.
+ */
+static inline void background_aging(int priority)
 {
-       int chunk_size = nr_pages;
-       unsigned long ratio;
+       struct zone_struct * zone;
 
-       nr_pages -= kmem_cache_reap(gfp_mask);
-       if (nr_pages <= 0)
-               return 0;
-
-       nr_pages = chunk_size;
-       /* try to keep the active list 2/3 of the size of the cache */
-       ratio = (unsigned long) nr_pages * nr_active_pages / 
((nr_inactive_pages + 1) * 2);
-       refill_inactive(ratio);
+       for_each_zone(zone)
+               if (inactive_high(zone) > 0)
+                       refill_inactive_zone(zone, priority);
+}
 
-       nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
-       if (nr_pages <= 0)
-               return 0;
+/*
+ * Worker function for kswapd and try_to_free_pages, we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages(unsigned int gfp_mask)
+{
+       int ret = 0;
 
-       shrink_dcache_memory(priority, gfp_mask);
-       shrink_icache_memory(priority, gfp_mask);
+       /*
+        * Eat memory from filesystem page cache, buffer cache,
+        * dentry, inode and filesystem quota caches.
+        */
+       ret += page_launder(gfp_mask);
+       ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+       ret += shrink_icache_memory(1, gfp_mask);
 #ifdef CONFIG_QUOTA
-       shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+       ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif
 
-       return nr_pages;
-}
+       /*
+        * Move pages from the active list to the inactive list.
+        */
+       refill_inactive();
 
-int try_to_free_pages(zone_t *classzone, unsigned int gfp_mask, unsigned int 
order)
-{
-       int priority = DEF_PRIORITY;
-       int nr_pages = SWAP_CLUSTER_MAX;
+       /*      
+        * Reclaim unused slab cache memory.
+        */
+       ret += kmem_cache_reap(gfp_mask);
 
-       gfp_mask = pf_gfp_mask(gfp_mask);
-       do {
-               nr_pages = shrink_caches(classzone, priority, gfp_mask, 
nr_pages);
-               if (nr_pages <= 0)
-                       return 1;
-       } while (--priority);
+       refill_freelist();
+
+       /* Start IO when needed. */
+       if (free_plenty(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
+               run_task_queue(&tq_disk);
 
        /*
         * Hmm.. Cache shrink failed - time to kill something?
         * Mhwahahhaha! This is the part I really like. Giggle.
         */
-       out_of_memory();
-       return 0;
-}
-
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-
-static int check_classzone_need_balance(zone_t * classzone)
-{
-       zone_t * first_classzone;
+       if (!ret && free_low(ANY_ZONE) > 0)
+               out_of_memory();
 
-       first_classzone = classzone->zone_pgdat->node_zones;
-       while (classzone >= first_classzone) {
-               if (classzone->free_pages > classzone->pages_high)
-                       return 0;
-               classzone--;
-       }
-       return 1;
+       return ret;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+/**
+ * refill_freelist - move inactive_clean pages to free list if needed
+ *
+ * Move some pages from the inactive_clean lists to the free
+ * lists so atomic allocations have pages to work from. This
+ * function really only does something when we don't have a 
+ * userspace load on __alloc_pages().
+ *
+ * We refill the freelist in a bump from pages_min to pages_min * 2
+ * in order to give the buddy allocator something to play with.
+ */
+static void refill_freelist(void)
 {
-       int need_more_balance = 0, i;
+       struct page * page;
        zone_t * zone;
 
-       for (i = pgdat->nr_zones-1; i >= 0; i--) {
-               zone = pgdat->node_zones + i;
-               if (unlikely(current->need_resched))
-                       schedule();
-               if (!zone->need_balance)
-                       continue;
-               if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
-                       zone->need_balance = 0;
-                       __set_current_state(TASK_INTERRUPTIBLE);
-                       schedule_timeout(HZ);
+       for_each_zone(zone) {
+               if (!zone->size || zone->free_pages >= zone->pages_min)
                        continue;
-               }
-               if (check_classzone_need_balance(zone))
-                       need_more_balance = 1;
-               else
-                       zone->need_balance = 0;
-       }
-
-       return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
-       int need_more_balance;
-       pg_data_t * pgdat;
-
-       do {
-               need_more_balance = 0;
-               pgdat = pgdat_list;
-               do
-                       need_more_balance |= kswapd_balance_pgdat(pgdat);
-               while ((pgdat = pgdat->node_next));
-       } while (need_more_balance);
-}
 
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-       zone_t * zone;
-       int i;
-
-       for (i = pgdat->nr_zones-1; i >= 0; i--) {
-               zone = pgdat->node_zones + i;
-               if (!zone->need_balance)
-                       continue;
-               return 0;
+               while (zone->free_pages < zone->pages_min * 2) {
+                       page = reclaim_page(zone);
+                       if (!page)
+                               break;
+                       __free_page(page);
+               }
        }
-
-       return 1;
-}
-
-static int kswapd_can_sleep(void)
-{
-       pg_data_t * pgdat;
-
-       pgdat = pgdat_list;
-       do {
-               if (kswapd_can_sleep_pgdat(pgdat))
-                       continue;
-               return 0;
-       } while ((pgdat = pgdat->node_next));
-
-       return 1;
 }
 
 /*
@@ -705,7 +656,6 @@
 int kswapd(void *unused)
 {
        struct task_struct *tsk = current;
-       DECLARE_WAITQUEUE(wait, tsk);
 
        daemonize();
        strcpy(tsk->comm, "kswapd");
@@ -729,24 +679,156 @@
         * Kswapd main loop.
         */
        for (;;) {
-               __set_current_state(TASK_INTERRUPTIBLE);
-               add_wait_queue(&kswapd_wait, &wait);
+               static long recalc = 0;
 
-               mb();
-               if (kswapd_can_sleep())
-                       schedule();
+               /*
+                * We try to rebalance the VM either when we have a
+                * global shortage of free pages or when one particular
+                * zone is very short on free pages.
+                */
+               if (free_high(ALL_ZONES) >= 0 || free_low(ANY_ZONE) > 0)
+                       do_try_to_free_pages(GFP_KSWAPD);
+
+               refill_freelist();
+
+               /* Once a second ... */
+               if (time_after(jiffies, recalc + HZ)) {
+                       recalc = jiffies;
+
+                       /* Do background page aging. */
+                       background_aging(DEF_PRIORITY);
+               }
+
+               wakeup_memwaiters();
+       }
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+#define VM_SHOULD_SLEEP (free_low(ALL_ZONES) > (freepages.min / 2))
+
+/**
+ * wakeup_kswapd - wake up the pageout daemon
+ * gfp_mask: page freeing flags
+ *
+ * This function wakes up kswapd and can, under heavy VM pressure,
+ * put the calling task to sleep temporarily.
+ */
+void wakeup_kswapd(unsigned int gfp_mask)
+{
+       DECLARE_WAITQUEUE(wait, current);
 
-               __set_current_state(TASK_RUNNING);
+       /* If we're in the memory freeing business ourself, don't sleep
+        * but just wake kswapd and go back to businesss.
+        */
+       if (current->flags & PF_MEMALLOC) {
+               wake_up_interruptible(&kswapd_wait);
+               return;
+       }
+
+       /* We need all of kswapd's GFP flags, otherwise we can't sleep on it.
+        * We still wake kswapd of course.
+        */
+       if ((gfp_mask & GFP_KSWAPD) != GFP_KSWAPD) {
+               wake_up_interruptible(&kswapd_wait);
+               return;
+       }
+       
+       add_wait_queue(&kswapd_done, &wait);
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        
+        /* Wake kswapd .... */
+        wake_up_interruptible(&kswapd_wait);
+        
+        /* ... and check if we need to wait on it */
+       if (VM_SHOULD_SLEEP)
+               schedule();
+       set_current_state(TASK_RUNNING);
+       remove_wait_queue(&kswapd_done, &wait);
+}
+
+static void wakeup_memwaiters(void)
+{
+       DECLARE_WAITQUEUE(wait, current);
+               
+       /* Enough free RAM, we can easily keep up with memory demand. */
+       add_wait_queue(&kswapd_wait, &wait);
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       if (free_high(ALL_ZONES) <= 0) {
+               wake_up(&kswapd_done);
+               schedule_timeout(HZ);
                remove_wait_queue(&kswapd_wait, &wait);
+               return;
+       }
+       remove_wait_queue(&kswapd_wait, &wait);
 
-               /*
-                * If we actually get into a low-memory situation,
-                * the processes needing more memory will wake us
-                * up on a more timely basis.
-                */
-               kswapd_balance();
-               run_task_queue(&tq_disk);
+       /* 
+        * kswapd is going to sleep for a long time. Wake up the waiters to
+        * prevent them to get stuck while waiting for us.
+        */
+       wake_up(&kswapd_done);
+
+       /* OK, the VM is very loaded. Sleep instead of using all CPU. */
+       set_current_state(TASK_UNINTERRUPTIBLE);
+       schedule_timeout(HZ / 4);
+       return;
+}
+
+/**
+ * try_to_free_pages - run the pageout code ourselves
+ * gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * When the load on the system gets higher, it can happen
+ * that kswapd no longer manages to keep enough memory
+ * free. In those cases user programs allocating memory
+ * will call try_to_free_pages() and help the pageout code.
+ * This has the effects of freeing memory and slowing down
+ * the largest memory hogs a bit.
+ */
+int try_to_free_pages(unsigned int gfp_mask)
+{
+       int ret = 1;
+
+       gfp_mask = pf_gfp_mask(gfp_mask);
+       if (gfp_mask & __GFP_WAIT) {
+               current->flags |= PF_MEMALLOC;
+               ret = do_try_to_free_pages(gfp_mask);
+               current->flags &= ~PF_MEMALLOC;
        }
+
+       return ret;
+}
+
+/**
+ * rss_free_pages - run part of the pageout code and slow down a bit
+ * @gfp_mask: mask of things the pageout code is allowed to do
+ *
+ * This function is called when a task is over its RSS limit and
+ * has a page fault.  It's goal is to free some memory so non-hogs
+ * can run faster and slow down itself when needed so it won't eat
+ * the memory non-hogs can use.
+ */
+void rss_free_pages(unsigned int gfp_mask)
+{
+       long pause = 0;
+
+       if (current->flags & PF_MEMALLOC)
+               return;
+
+       current->flags |= PF_MEMALLOC;
+
+       do {
+               page_launder(gfp_mask);
+
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               schedule_timeout(pause);
+               set_current_state(TASK_RUNNING);
+               pause++;
+       } while (free_high(ALL_ZONES) >= 0);
+
+       current->flags &= ~PF_MEMALLOC;
+       return;
 }
 
 static int __init kswapd_init(void)
<Prev in Thread] Current Thread [Next in Thread>