netdev
[Top] [All Lists]

[PATCH + RFC] neighbour/ARP cache scalability

To: Linux Netdev List <netdev@xxxxxxxxxxx>
Subject: [PATCH + RFC] neighbour/ARP cache scalability
From: Harald Welte <laforge@xxxxxxxxxxxx>
Date: Tue, 21 Sep 2004 00:51:40 +0200
Sender: netdev-bounce@xxxxxxxxxxx
User-agent: Mutt/1.5.6+20040818i
Hi!

I've recently had some problems with the neighbour cache on systems
which have large layer-2 networks (such as 10/12 ethernet interfaces,
most of them attached to networks with a /16 netmaks).  Yes, I know,
networks of that size is a bad design anyway.  But at the same time
there are people using such strange setups ...

The current garbage collection mechanism's ultimate goal is to prevent
ARP floods.  It therefore never expires INCOMPLETE entries from the
neighbour cache.  I've seen systems in a state which is what I believe
beign completely starved of neighbour cache entries, while gc is unable
to evict anything => increasing gc_thesh2/gc_thresh3 to large values.

As a result, on systems with large networks (think of even less than 16
bit netmasks!) the neighbour table can grow quite significantly,
especially if some jerk runs pktgen with random destinations, and we
start expiring 'real' neighbours in favour of incomplete ones.  

This large number of ncache entries is distributed over only 32 buckets,
which is quite unsatisfactory.  Especially if we consider that the
distribution within the hash has been reported to be quite sub-optimal[1]

Tim Gardner has already sent two patches for 2.4.x on this some time
ago [1], [2] adressing this issue - but none of them was integrated.

I've now ported [2] to 2.6.9-rc2-bk6, and dropped [1]. Furthermore, I've
added:
        - neigh_hash_bits boot parameter (if not specified, auto-sizing
          hash based on system ram)
        - jenkins hash for ARP hash function, like Dave indicated [3]
        - make number of buckets readable via /proc/

The patch is not yet intende for mainline inclusion, since I want to
have feedback on

1) should I try to use jhash for ipv6, too?

2) Dave has indicated that there should be an upper limit for hash
   buckets.  What is considered a reasonable upper bound, even for very
   large systems?

3) What do you think about removing gc_interval in favour of the dynamic
   interval calculation, combined with the 'one bucket per gc run'
   approach that Tim selected?

4) Shouldn't we set gc_thresh3 automatically based on the netmassk of
   the interfaces we have IFF_RUNNING?  

5) What is the proposed solution for IPv6, when you have /48 or /64 bit
   prefixes and systems become even more vulnerable to neighbour cache
   attacks?  

6) Since everybody agrees the current scheme of bucket-sizing based on
   memory only is a bad idea, do we want to unify this bucket sizing
   code, so people have one single knob (boot option) which they can use
   to give the system some metrics on how large it should scale all the
   network hash tables?

[1] http://oss.sgi.com/projects/netdev/archive/2004-03/msg00265.html
[2] http://oss.sgi.com/projects/netdev/archive/2004-03/msg00250.html
[3] http://oss.sgi.com/projects/netdev/archive/2004-03/msg00279.html


diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/net/atm/clip.c linux-2.6.9-rc2-bk6-ncache/net/atm/clip.c
--- linux-2.6.9-rc2-bk6/net/atm/clip.c  2004-09-21 00:23:16.000000000 +0200
+++ linux-2.6.9-rc2-bk6-ncache/net/atm/clip.c   2004-09-20 20:25:48.000000000 
+0200
@@ -130,7 +130,7 @@
 
        /*DPRINTK("idle_timer_check\n");*/
        write_lock(&clip_tbl.lock);
-       for (i = 0; i <= NEIGH_HASHMASK; i++) {
+       for (i = 0; i < clip_tbl.num_hash_buckets; i++) {
                struct neighbour **np;
 
                for (np = &clip_tbl.hash_buckets[i]; *np;) {
@@ -341,6 +341,7 @@
        return 0;
 }
 
+static struct neigh_table clip_tbl;
 static u32 clip_hash(const void *pkey, const struct net_device *dev)
 {
        u32 hash_val;
@@ -349,7 +350,7 @@
        hash_val ^= (hash_val>>16);
        hash_val ^= hash_val>>8;
        hash_val ^= hash_val>>3;
-       hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+       hash_val = (hash_val^dev->ifindex)&(clip_tbl.num_hash_buckets-1);
 
        return hash_val;
 }
@@ -894,7 +895,7 @@
 {
        void *v = NULL;
 
-       for (; state->bucket <= NEIGH_HASHMASK; state->bucket++) {
+       for (; state->bucket < clip_tbl.num_hash_buckets; state->bucket++) {
                for (; state->n; state->n = state->n->next) {
                        v = arp_vcc_walk(state, NEIGH2ENTRY(state->n), &l);
                        if (v)
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/net/core/neighbour.c 
linux-2.6.9-rc2-bk6-ncache/net/core/neighbour.c
--- linux-2.6.9-rc2-bk6/net/core/neighbour.c    2004-09-21 00:23:16.000000000 
+0200
+++ linux-2.6.9-rc2-bk6-ncache/net/core/neighbour.c     2004-09-20 
22:13:56.000000000 +0200
@@ -12,6 +12,10 @@
  *
  *     Fixes:
  *     Vitaly E. Lavrov        releasing NULL neighbor in neigh_add.
+ *
+ *     Changes:
+ *     2004-09-17: Make cache size dynamically sized + configurable 
+ *       (Tim Gardner <timg@xxxxxxx> & Harald Welte <laforge@xxxxxxxxxxxxx>)
  */
 
 #include <linux/config.h>
@@ -47,6 +51,9 @@
 #define NEIGH_PRINTK2 NEIGH_PRINTK
 #endif
 
+static unsigned int neigh_hash_bits;
+module_param(neigh_hash_bits, int, 0400);
+
 static void neigh_timer_handler(unsigned long arg);
 #ifdef CONFIG_ARPD
 static void neigh_app_notify(struct neighbour *n);
@@ -113,7 +120,7 @@
        int shrunk = 0;
        int i;
 
-       for (i = 0; i <= NEIGH_HASHMASK; i++) {
+       for (i = 0; i < tbl->num_hash_buckets; i++) {
                struct neighbour *n, **np;
 
                np = &tbl->hash_buckets[i];
@@ -177,7 +184,7 @@
 
        write_lock_bh(&tbl->lock);
 
-       for (i=0; i <= NEIGH_HASHMASK; i++) {
+       for (i=0; i < tbl->num_hash_buckets; i++) {
                struct neighbour *n, **np;
 
                np = &tbl->hash_buckets[i];
@@ -204,7 +211,7 @@
 
        write_lock_bh(&tbl->lock);
 
-       for (i = 0; i <= NEIGH_HASHMASK; i++) {
+       for (i = 0; i < tbl->num_hash_buckets; i++) {
                struct neighbour *n, **np = &tbl->hash_buckets[i];
 
                while ((n = *np) != NULL) {
@@ -545,9 +552,8 @@
 static void neigh_periodic_timer(unsigned long arg)
 {
        struct neigh_table *tbl = (struct neigh_table *)arg;
+       struct neighbour *n, **np;
        unsigned long now = jiffies;
-       int i;
-
 
        write_lock(&tbl->lock);
 
@@ -563,41 +569,44 @@
                                neigh_rand_reach_time(p->base_reachable_time);
        }
 
-       for (i = 0; i <= NEIGH_HASHMASK; i++) {
-               struct neighbour *n, **np;
+       tbl->curr_hash_bucket &= (tbl->num_hash_buckets-1);
+       np = &tbl->hash_buckets[tbl->curr_hash_bucket++];
 
-               np = &tbl->hash_buckets[i];
-               while ((n = *np) != NULL) {
-                       unsigned state;
+       while ((n = *np) != NULL) {
+               unsigned state;
 
-                       write_lock(&n->lock);
+               write_lock(&n->lock);
 
-                       state = n->nud_state;
-                       if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
-                               write_unlock(&n->lock);
-                               goto next_elt;
-                       }
+               state = n->nud_state;
+               if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+                       write_unlock(&n->lock);
+                       goto next_elt;
+               }
 
-                       if (time_before(n->used, n->confirmed))
-                               n->used = n->confirmed;
+               if (time_before(n->used, n->confirmed))
+                       n->used = n->confirmed;
 
-                       if (atomic_read(&n->refcnt) == 1 &&
-                           (state == NUD_FAILED ||
-                            time_after(now, n->used + 
n->parms->gc_staletime))) {
-                               *np = n->next;
-                               n->dead = 1;
-                               write_unlock(&n->lock);
-                               neigh_release(n);
-                               continue;
-                       }
+               if (atomic_read(&n->refcnt) == 1 &&
+                   (state == NUD_FAILED ||
+                    time_after(now, n->used + n->parms->gc_staletime))) {
+                       *np = n->next;
+                       n->dead = 1;
                        write_unlock(&n->lock);
+                       neigh_release(n);
+                       continue;
+               }
+               write_unlock(&n->lock);
 
 next_elt:
-                       np = &n->next;
-               }
+               np = &n->next;
        }
-
-       mod_timer(&tbl->gc_timer, now + tbl->gc_interval);
+       /* Cycle through all hash buckets every base_reachable_time/2 ticks.
+        * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
+        * base_reachable_time. */
+ 
+       mod_timer(&tbl->gc_timer, now + 
+                ((tbl->parms.base_reachable_time>>1)/(tbl->num_hash_buckets)));
+ 
        write_unlock(&tbl->lock);
 }
 
@@ -1205,6 +1214,37 @@
 void neigh_table_init(struct neigh_table *tbl)
 {
        unsigned long now = jiffies;
+       unsigned int goal = neigh_hash_bits;
+
+       /* Allocate a power of 2 hash buckets for each power of 2 MB of RAM. */
+       if (!goal) {
+               unsigned int ram_mb = (num_physpages*PAGE_SIZE) / (1024*1024);
+               goal = 31;
+               while ((1<<goal) > ram_mb)
+               {
+                       goal--;
+               }
+       }
+
+       tbl->hash_buckets = NULL;
+       while (goal && (!tbl->hash_buckets)) {
+               tbl->num_hash_buckets = (1<<goal);
+               tbl->hash_buckets = kmalloc(sizeof(struct neighbour *)
+                                           *tbl->num_hash_buckets, GFP_ATOMIC);
+               goal--;
+       }
+
+       if (tbl->hash_buckets == NULL)
+               panic("%s: Could not allocate memory for hash buckets "
+                     "of table %s.\n", tbl->id, __FUNCTION__);
+       memset(tbl->hash_buckets, 0,
+              sizeof(struct neighbour *)*tbl->num_hash_buckets);
+
+       if (neigh_hash_bits && 
+           (tbl->num_hash_buckets != ((1<<neigh_hash_bits)+1)))
+               printk(KERN_WARNING "%s: Couldn't allocate %u hash buckets, "
+                      "did %u instead.\n", __FUNCTION__,
+                       (1<<neigh_hash_bits)+1, tbl->num_hash_buckets);
 
        atomic_set(&tbl->parms.refcnt, 1);
        INIT_RCU_HEAD(&tbl->parms.rcu_head);
@@ -1224,8 +1264,7 @@
        init_timer(&tbl->gc_timer);
        tbl->gc_timer.data     = (unsigned long)tbl;
        tbl->gc_timer.function = neigh_periodic_timer;
-       tbl->gc_timer.expires  = now + tbl->gc_interval +
-                                tbl->parms.reachable_time;
+       tbl->gc_timer.expires  = now + 1;
        add_timer(&tbl->gc_timer);
 
        init_timer(&tbl->proxy_timer);
@@ -1439,7 +1478,7 @@
        int rc, h, s_h = cb->args[1];
        int idx, s_idx = idx = cb->args[2];
 
-       for (h = 0; h <= NEIGH_HASHMASK; h++) {
+       for (h = 0; h < tbl->num_hash_buckets; h++) {
                if (h < s_h)
                        continue;
                if (h > s_h)
@@ -1628,14 +1667,6 @@
                        .proc_handler   = &proc_dointvec_userhz_jiffies,
                },
                {
-                       .ctl_name       = NET_NEIGH_GC_INTERVAL,
-                       .procname       = "gc_interval",
-                       .maxlen         = sizeof(int),
-                       .mode           = 0644,
-                       .proc_handler   = &proc_dointvec_jiffies,
-                       .strategy       = &sysctl_jiffies,
-               },
-               {
                        .ctl_name       = NET_NEIGH_GC_THRESH1,
                        .procname       = "gc_thresh1",
                        .maxlen         = sizeof(int),
@@ -1656,6 +1687,13 @@
                        .mode           = 0644,
                        .proc_handler   = &proc_dointvec,
                },
+               {
+                       .ctl_name       = NET_NEIGH_NUM_HASH_BUCKETS,
+                       .procname       = "hash_buckets",
+                       .maxlen         = sizeof(int),
+                       .mode           = 0400,
+                       .proc_handler   = &proc_dointvec,
+               },
        },
        .neigh_dev = {
                {
@@ -1719,10 +1757,13 @@
                t->neigh_dev[0].ctl_name = dev->ifindex;
                memset(&t->neigh_vars[12], 0, sizeof(ctl_table));
        } else {
+               /* If you listen carefully, you can actually hear this
+                * code suck */
                t->neigh_vars[12].data = (int *)(p + 1);
                t->neigh_vars[13].data = (int *)(p + 1) + 1;
                t->neigh_vars[14].data = (int *)(p + 1) + 2;
                t->neigh_vars[15].data = (int *)(p + 1) + 3;
+               t->neigh_vars[16].data = (int *)(p + 1) + 4;
        }
 
        dev_name = net_sysctl_strdup(dev_name_source);
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/net/decnet/dn_neigh.c 
linux-2.6.9-rc2-bk6-ncache/net/decnet/dn_neigh.c
--- linux-2.6.9-rc2-bk6/net/decnet/dn_neigh.c   2004-09-13 07:32:55.000000000 
+0200
+++ linux-2.6.9-rc2-bk6-ncache/net/decnet/dn_neigh.c    2004-09-20 
20:25:48.000000000 +0200
@@ -128,7 +128,7 @@
        hash_val ^= (hash_val >> 10);
        hash_val ^= (hash_val >> 3);
 
-       return hash_val & NEIGH_HASHMASK;
+       return hash_val & (dn_neigh_table.num_hash_buckets-1);
 }
 
 static int dn_neigh_construct(struct neighbour *neigh)
@@ -526,7 +526,7 @@
 
        read_lock_bh(&tbl->lock);
 
-       for(i = 0; i < NEIGH_HASHMASK; i++) {
+       for(i = 0; i < tbl->num_hash_buckets; i++) {
                for(neigh = tbl->hash_buckets[i]; neigh != NULL; neigh = 
neigh->next) {
                        if (neigh->dev != dev)
                                continue;
@@ -567,7 +567,7 @@
        struct neighbour *n = NULL;
 
        for(state->bucket = 0;
-           state->bucket <= NEIGH_HASHMASK;
+           state->bucket < dn_neigh_table.num_hash_buckets;
            ++state->bucket) {
                n = dn_neigh_table.hash_buckets[state->bucket];
                if (n)
@@ -586,7 +586,7 @@
 try_again:
        if (n)
                goto out;
-       if (++state->bucket > NEIGH_HASHMASK)
+       if (++state->bucket >= dn_neigh_table.num_hash_buckets)
                goto out;
        n = dn_neigh_table.hash_buckets[state->bucket];
        goto try_again;
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/net/ipv4/arp.c linux-2.6.9-rc2-bk6-ncache/net/ipv4/arp.c
--- linux-2.6.9-rc2-bk6/net/ipv4/arp.c  2004-09-21 00:23:16.000000000 +0200
+++ linux-2.6.9-rc2-bk6-ncache/net/ipv4/arp.c   2004-09-20 20:31:21.000000000 
+0200
@@ -71,6 +71,7 @@
  *                                     arp_xmit so intermediate drivers like
  *                                     bonding can change the skb before
  *                                     sending (e.g. insert 8021q tag).
+ *             Harald Welte    :       convert to make use of jenkins hash
  */
 
 #include <linux/module.h>
@@ -97,6 +98,7 @@
 #include <linux/init.h>
 #include <linux/net.h>
 #include <linux/rcupdate.h>
+#include <linux/jhash.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -124,6 +126,9 @@
 
 #include <linux/netfilter_arp.h>
 
+static u32 arp_hash_rnd;
+static int arp_hash_rnd_initted;
+
 /*
  *     Interface to generic neighbour cache.
  */
@@ -194,7 +199,6 @@
                .proxy_qlen =           64,
                .locktime =             1 * HZ,
        },
-       .gc_interval =  30 * HZ,
        .gc_thresh1 =   128,
        .gc_thresh2 =   512,
        .gc_thresh3 =   1024,
@@ -223,15 +227,13 @@
 
 static u32 arp_hash(const void *pkey, const struct net_device *dev)
 {
-       u32 hash_val;
-
-       hash_val = *(u32*)pkey;
-       hash_val ^= (hash_val>>16);
-       hash_val ^= hash_val>>8;
-       hash_val ^= hash_val>>3;
-       hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+       if (unlikely(!arp_hash_rnd_initted)) {
+               get_random_bytes(&arp_hash_rnd, 4);
+               arp_hash_rnd_initted = 1;
+       }
 
-       return hash_val;
+       return (jhash_2words(*(u32 *)pkey, dev->ifindex,
+                            arp_hash_rnd) & arp_tbl.num_hash_buckets-1);
 }
 
 static int arp_constructor(struct neighbour *neigh)
@@ -1281,7 +1283,7 @@
        state->is_pneigh = 0;
 
        for (state->bucket = 0;
-            state->bucket <= NEIGH_HASHMASK;
+            state->bucket < arp_tbl.num_hash_buckets;
             ++state->bucket) {
                n = arp_tbl.hash_buckets[state->bucket];
                while (n && !(n->nud_state & ~NUD_NOARP))
@@ -1307,7 +1309,7 @@
 
        if (n)
                goto out;
-       if (++state->bucket > NEIGH_HASHMASK)
+       if (++state->bucket >= arp_tbl.num_hash_buckets)
                goto out;
        n = arp_tbl.hash_buckets[state->bucket];
        goto try_again;
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/net/ipv6/ndisc.c linux-2.6.9-rc2-bk6-ncache/net/ipv6/ndisc.c
--- linux-2.6.9-rc2-bk6/net/ipv6/ndisc.c        2004-09-21 00:23:16.000000000 
+0200
+++ linux-2.6.9-rc2-bk6-ncache/net/ipv6/ndisc.c 2004-09-20 20:25:48.000000000 
+0200
@@ -147,7 +147,6 @@
                .proxy_delay =          (8 * HZ) / 10,
                .proxy_qlen =           64,
        },
-       .gc_interval =    30 * HZ,
        .gc_thresh1 =    128,
        .gc_thresh2 =    512,
        .gc_thresh3 =   1024,
@@ -276,7 +275,7 @@
        hash_val ^= (hash_val>>16);
        hash_val ^= hash_val>>8;
        hash_val ^= hash_val>>3;
-       hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+       hash_val = (hash_val^dev->ifindex)&(nd_tbl.num_hash_buckets-1);
 
        return hash_val;
 }
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/include/net/neighbour.h 
linux-2.6.9-rc2-bk6-ncache/include/net/neighbour.h
--- linux-2.6.9-rc2-bk6/include/net/neighbour.h 2004-09-21 00:23:16.000000000 
+0200
+++ linux-2.6.9-rc2-bk6-ncache/include/net/neighbour.h  2004-09-20 
22:12:50.000000000 +0200
@@ -139,7 +139,6 @@
        u8                      key[0];
 };
 
-#define NEIGH_HASHMASK         0x1F
 #define PNEIGH_HASHMASK                0xF
 
 /*
@@ -160,11 +159,12 @@
        void                    (*proxy_redo)(struct sk_buff *skb);
        char                    *id;
        struct neigh_parms      parms;
-       /* HACK. gc_* shoul follow parms without a gap! */
-       int                     gc_interval;
+       /* UGLY HACK. gc_* and num_hash_buckets must follow parms without a
+        * gap! */
        int                     gc_thresh1;
        int                     gc_thresh2;
        int                     gc_thresh3;
+       int                     num_hash_buckets;
        unsigned long           last_flush;
        struct timer_list       gc_timer;
        struct timer_list       proxy_timer;
@@ -175,7 +175,8 @@
        struct neigh_parms      *parms_list;
        kmem_cache_t            *kmem_cachep;
        struct neigh_statistics stats;
-       struct neighbour        *hash_buckets[NEIGH_HASHMASK+1];
+       struct neighbour        **hash_buckets;
+       int                     curr_hash_bucket; /* For GC */
        struct pneigh_entry     *phash_buckets[PNEIGH_HASHMASK+1];
 };
 
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff 
linux-2.6.9-rc2-bk6/include/linux/sysctl.h 
linux-2.6.9-rc2-bk6-ncache/include/linux/sysctl.h
--- linux-2.6.9-rc2-bk6/include/linux/sysctl.h  2004-09-13 07:32:48.000000000 
+0200
+++ linux-2.6.9-rc2-bk6-ncache/include/linux/sysctl.h   2004-09-20 
22:14:47.000000000 +0200
@@ -494,7 +494,8 @@
        NET_NEIGH_GC_INTERVAL=13,
        NET_NEIGH_GC_THRESH1=14,
        NET_NEIGH_GC_THRESH2=15,
-       NET_NEIGH_GC_THRESH3=16
+       NET_NEIGH_GC_THRESH3=16,
+       NET_NEIGH_NUM_HASH_BUCKETS=17
 };
 
 /* /proc/sys/net/ipx */


-- 
- Harald Welte <laforge@xxxxxxxxxxxx>               http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime

Attachment: signature.asc
Description: Digital signature

<Prev in Thread] Current Thread [Next in Thread>