Hi!
I've recently had some problems with the neighbour cache on systems
which have large layer-2 networks (such as 10/12 ethernet interfaces,
most of them attached to networks with a /16 netmaks). Yes, I know,
networks of that size is a bad design anyway. But at the same time
there are people using such strange setups ...
The current garbage collection mechanism's ultimate goal is to prevent
ARP floods. It therefore never expires INCOMPLETE entries from the
neighbour cache. I've seen systems in a state which is what I believe
beign completely starved of neighbour cache entries, while gc is unable
to evict anything => increasing gc_thesh2/gc_thresh3 to large values.
As a result, on systems with large networks (think of even less than 16
bit netmasks!) the neighbour table can grow quite significantly,
especially if some jerk runs pktgen with random destinations, and we
start expiring 'real' neighbours in favour of incomplete ones.
This large number of ncache entries is distributed over only 32 buckets,
which is quite unsatisfactory. Especially if we consider that the
distribution within the hash has been reported to be quite sub-optimal[1]
Tim Gardner has already sent two patches for 2.4.x on this some time
ago [1], [2] adressing this issue - but none of them was integrated.
I've now ported [2] to 2.6.9-rc2-bk6, and dropped [1]. Furthermore, I've
added:
- neigh_hash_bits boot parameter (if not specified, auto-sizing
hash based on system ram)
- jenkins hash for ARP hash function, like Dave indicated [3]
- make number of buckets readable via /proc/
The patch is not yet intende for mainline inclusion, since I want to
have feedback on
1) should I try to use jhash for ipv6, too?
2) Dave has indicated that there should be an upper limit for hash
buckets. What is considered a reasonable upper bound, even for very
large systems?
3) What do you think about removing gc_interval in favour of the dynamic
interval calculation, combined with the 'one bucket per gc run'
approach that Tim selected?
4) Shouldn't we set gc_thresh3 automatically based on the netmassk of
the interfaces we have IFF_RUNNING?
5) What is the proposed solution for IPv6, when you have /48 or /64 bit
prefixes and systems become even more vulnerable to neighbour cache
attacks?
6) Since everybody agrees the current scheme of bucket-sizing based on
memory only is a bad idea, do we want to unify this bucket sizing
code, so people have one single knob (boot option) which they can use
to give the system some metrics on how large it should scale all the
network hash tables?
[1] http://oss.sgi.com/projects/netdev/archive/2004-03/msg00265.html
[2] http://oss.sgi.com/projects/netdev/archive/2004-03/msg00250.html
[3] http://oss.sgi.com/projects/netdev/archive/2004-03/msg00279.html
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/net/atm/clip.c linux-2.6.9-rc2-bk6-ncache/net/atm/clip.c
--- linux-2.6.9-rc2-bk6/net/atm/clip.c 2004-09-21 00:23:16.000000000 +0200
+++ linux-2.6.9-rc2-bk6-ncache/net/atm/clip.c 2004-09-20 20:25:48.000000000
+0200
@@ -130,7 +130,7 @@
/*DPRINTK("idle_timer_check\n");*/
write_lock(&clip_tbl.lock);
- for (i = 0; i <= NEIGH_HASHMASK; i++) {
+ for (i = 0; i < clip_tbl.num_hash_buckets; i++) {
struct neighbour **np;
for (np = &clip_tbl.hash_buckets[i]; *np;) {
@@ -341,6 +341,7 @@
return 0;
}
+static struct neigh_table clip_tbl;
static u32 clip_hash(const void *pkey, const struct net_device *dev)
{
u32 hash_val;
@@ -349,7 +350,7 @@
hash_val ^= (hash_val>>16);
hash_val ^= hash_val>>8;
hash_val ^= hash_val>>3;
- hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+ hash_val = (hash_val^dev->ifindex)&(clip_tbl.num_hash_buckets-1);
return hash_val;
}
@@ -894,7 +895,7 @@
{
void *v = NULL;
- for (; state->bucket <= NEIGH_HASHMASK; state->bucket++) {
+ for (; state->bucket < clip_tbl.num_hash_buckets; state->bucket++) {
for (; state->n; state->n = state->n->next) {
v = arp_vcc_walk(state, NEIGH2ENTRY(state->n), &l);
if (v)
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/net/core/neighbour.c
linux-2.6.9-rc2-bk6-ncache/net/core/neighbour.c
--- linux-2.6.9-rc2-bk6/net/core/neighbour.c 2004-09-21 00:23:16.000000000
+0200
+++ linux-2.6.9-rc2-bk6-ncache/net/core/neighbour.c 2004-09-20
22:13:56.000000000 +0200
@@ -12,6 +12,10 @@
*
* Fixes:
* Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
+ *
+ * Changes:
+ * 2004-09-17: Make cache size dynamically sized + configurable
+ * (Tim Gardner <timg@xxxxxxx> & Harald Welte <laforge@xxxxxxxxxxxxx>)
*/
#include <linux/config.h>
@@ -47,6 +51,9 @@
#define NEIGH_PRINTK2 NEIGH_PRINTK
#endif
+static unsigned int neigh_hash_bits;
+module_param(neigh_hash_bits, int, 0400);
+
static void neigh_timer_handler(unsigned long arg);
#ifdef CONFIG_ARPD
static void neigh_app_notify(struct neighbour *n);
@@ -113,7 +120,7 @@
int shrunk = 0;
int i;
- for (i = 0; i <= NEIGH_HASHMASK; i++) {
+ for (i = 0; i < tbl->num_hash_buckets; i++) {
struct neighbour *n, **np;
np = &tbl->hash_buckets[i];
@@ -177,7 +184,7 @@
write_lock_bh(&tbl->lock);
- for (i=0; i <= NEIGH_HASHMASK; i++) {
+ for (i=0; i < tbl->num_hash_buckets; i++) {
struct neighbour *n, **np;
np = &tbl->hash_buckets[i];
@@ -204,7 +211,7 @@
write_lock_bh(&tbl->lock);
- for (i = 0; i <= NEIGH_HASHMASK; i++) {
+ for (i = 0; i < tbl->num_hash_buckets; i++) {
struct neighbour *n, **np = &tbl->hash_buckets[i];
while ((n = *np) != NULL) {
@@ -545,9 +552,8 @@
static void neigh_periodic_timer(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table *)arg;
+ struct neighbour *n, **np;
unsigned long now = jiffies;
- int i;
-
write_lock(&tbl->lock);
@@ -563,41 +569,44 @@
neigh_rand_reach_time(p->base_reachable_time);
}
- for (i = 0; i <= NEIGH_HASHMASK; i++) {
- struct neighbour *n, **np;
+ tbl->curr_hash_bucket &= (tbl->num_hash_buckets-1);
+ np = &tbl->hash_buckets[tbl->curr_hash_bucket++];
- np = &tbl->hash_buckets[i];
- while ((n = *np) != NULL) {
- unsigned state;
+ while ((n = *np) != NULL) {
+ unsigned state;
- write_lock(&n->lock);
+ write_lock(&n->lock);
- state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
- write_unlock(&n->lock);
- goto next_elt;
- }
+ state = n->nud_state;
+ if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+ write_unlock(&n->lock);
+ goto next_elt;
+ }
- if (time_before(n->used, n->confirmed))
- n->used = n->confirmed;
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
- if (atomic_read(&n->refcnt) == 1 &&
- (state == NUD_FAILED ||
- time_after(now, n->used +
n->parms->gc_staletime))) {
- *np = n->next;
- n->dead = 1;
- write_unlock(&n->lock);
- neigh_release(n);
- continue;
- }
+ if (atomic_read(&n->refcnt) == 1 &&
+ (state == NUD_FAILED ||
+ time_after(now, n->used + n->parms->gc_staletime))) {
+ *np = n->next;
+ n->dead = 1;
write_unlock(&n->lock);
+ neigh_release(n);
+ continue;
+ }
+ write_unlock(&n->lock);
next_elt:
- np = &n->next;
- }
+ np = &n->next;
}
-
- mod_timer(&tbl->gc_timer, now + tbl->gc_interval);
+ /* Cycle through all hash buckets every base_reachable_time/2 ticks.
+ * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
+ * base_reachable_time. */
+
+ mod_timer(&tbl->gc_timer, now +
+ ((tbl->parms.base_reachable_time>>1)/(tbl->num_hash_buckets)));
+
write_unlock(&tbl->lock);
}
@@ -1205,6 +1214,37 @@
void neigh_table_init(struct neigh_table *tbl)
{
unsigned long now = jiffies;
+ unsigned int goal = neigh_hash_bits;
+
+ /* Allocate a power of 2 hash buckets for each power of 2 MB of RAM. */
+ if (!goal) {
+ unsigned int ram_mb = (num_physpages*PAGE_SIZE) / (1024*1024);
+ goal = 31;
+ while ((1<<goal) > ram_mb)
+ {
+ goal--;
+ }
+ }
+
+ tbl->hash_buckets = NULL;
+ while (goal && (!tbl->hash_buckets)) {
+ tbl->num_hash_buckets = (1<<goal);
+ tbl->hash_buckets = kmalloc(sizeof(struct neighbour *)
+ *tbl->num_hash_buckets, GFP_ATOMIC);
+ goal--;
+ }
+
+ if (tbl->hash_buckets == NULL)
+ panic("%s: Could not allocate memory for hash buckets "
+ "of table %s.\n", tbl->id, __FUNCTION__);
+ memset(tbl->hash_buckets, 0,
+ sizeof(struct neighbour *)*tbl->num_hash_buckets);
+
+ if (neigh_hash_bits &&
+ (tbl->num_hash_buckets != ((1<<neigh_hash_bits)+1)))
+ printk(KERN_WARNING "%s: Couldn't allocate %u hash buckets, "
+ "did %u instead.\n", __FUNCTION__,
+ (1<<neigh_hash_bits)+1, tbl->num_hash_buckets);
atomic_set(&tbl->parms.refcnt, 1);
INIT_RCU_HEAD(&tbl->parms.rcu_head);
@@ -1224,8 +1264,7 @@
init_timer(&tbl->gc_timer);
tbl->gc_timer.data = (unsigned long)tbl;
tbl->gc_timer.function = neigh_periodic_timer;
- tbl->gc_timer.expires = now + tbl->gc_interval +
- tbl->parms.reachable_time;
+ tbl->gc_timer.expires = now + 1;
add_timer(&tbl->gc_timer);
init_timer(&tbl->proxy_timer);
@@ -1439,7 +1478,7 @@
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
- for (h = 0; h <= NEIGH_HASHMASK; h++) {
+ for (h = 0; h < tbl->num_hash_buckets; h++) {
if (h < s_h)
continue;
if (h > s_h)
@@ -1628,14 +1667,6 @@
.proc_handler = &proc_dointvec_userhz_jiffies,
},
{
- .ctl_name = NET_NEIGH_GC_INTERVAL,
- .procname = "gc_interval",
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_jiffies,
- .strategy = &sysctl_jiffies,
- },
- {
.ctl_name = NET_NEIGH_GC_THRESH1,
.procname = "gc_thresh1",
.maxlen = sizeof(int),
@@ -1656,6 +1687,13 @@
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = NET_NEIGH_NUM_HASH_BUCKETS,
+ .procname = "hash_buckets",
+ .maxlen = sizeof(int),
+ .mode = 0400,
+ .proc_handler = &proc_dointvec,
+ },
},
.neigh_dev = {
{
@@ -1719,10 +1757,13 @@
t->neigh_dev[0].ctl_name = dev->ifindex;
memset(&t->neigh_vars[12], 0, sizeof(ctl_table));
} else {
+ /* If you listen carefully, you can actually hear this
+ * code suck */
t->neigh_vars[12].data = (int *)(p + 1);
t->neigh_vars[13].data = (int *)(p + 1) + 1;
t->neigh_vars[14].data = (int *)(p + 1) + 2;
t->neigh_vars[15].data = (int *)(p + 1) + 3;
+ t->neigh_vars[16].data = (int *)(p + 1) + 4;
}
dev_name = net_sysctl_strdup(dev_name_source);
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/net/decnet/dn_neigh.c
linux-2.6.9-rc2-bk6-ncache/net/decnet/dn_neigh.c
--- linux-2.6.9-rc2-bk6/net/decnet/dn_neigh.c 2004-09-13 07:32:55.000000000
+0200
+++ linux-2.6.9-rc2-bk6-ncache/net/decnet/dn_neigh.c 2004-09-20
20:25:48.000000000 +0200
@@ -128,7 +128,7 @@
hash_val ^= (hash_val >> 10);
hash_val ^= (hash_val >> 3);
- return hash_val & NEIGH_HASHMASK;
+ return hash_val & (dn_neigh_table.num_hash_buckets-1);
}
static int dn_neigh_construct(struct neighbour *neigh)
@@ -526,7 +526,7 @@
read_lock_bh(&tbl->lock);
- for(i = 0; i < NEIGH_HASHMASK; i++) {
+ for(i = 0; i < tbl->num_hash_buckets; i++) {
for(neigh = tbl->hash_buckets[i]; neigh != NULL; neigh =
neigh->next) {
if (neigh->dev != dev)
continue;
@@ -567,7 +567,7 @@
struct neighbour *n = NULL;
for(state->bucket = 0;
- state->bucket <= NEIGH_HASHMASK;
+ state->bucket < dn_neigh_table.num_hash_buckets;
++state->bucket) {
n = dn_neigh_table.hash_buckets[state->bucket];
if (n)
@@ -586,7 +586,7 @@
try_again:
if (n)
goto out;
- if (++state->bucket > NEIGH_HASHMASK)
+ if (++state->bucket >= dn_neigh_table.num_hash_buckets)
goto out;
n = dn_neigh_table.hash_buckets[state->bucket];
goto try_again;
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/net/ipv4/arp.c linux-2.6.9-rc2-bk6-ncache/net/ipv4/arp.c
--- linux-2.6.9-rc2-bk6/net/ipv4/arp.c 2004-09-21 00:23:16.000000000 +0200
+++ linux-2.6.9-rc2-bk6-ncache/net/ipv4/arp.c 2004-09-20 20:31:21.000000000
+0200
@@ -71,6 +71,7 @@
* arp_xmit so intermediate drivers like
* bonding can change the skb before
* sending (e.g. insert 8021q tag).
+ * Harald Welte : convert to make use of jenkins hash
*/
#include <linux/module.h>
@@ -97,6 +98,7 @@
#include <linux/init.h>
#include <linux/net.h>
#include <linux/rcupdate.h>
+#include <linux/jhash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -124,6 +126,9 @@
#include <linux/netfilter_arp.h>
+static u32 arp_hash_rnd;
+static int arp_hash_rnd_initted;
+
/*
* Interface to generic neighbour cache.
*/
@@ -194,7 +199,6 @@
.proxy_qlen = 64,
.locktime = 1 * HZ,
},
- .gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
@@ -223,15 +227,13 @@
static u32 arp_hash(const void *pkey, const struct net_device *dev)
{
- u32 hash_val;
-
- hash_val = *(u32*)pkey;
- hash_val ^= (hash_val>>16);
- hash_val ^= hash_val>>8;
- hash_val ^= hash_val>>3;
- hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+ if (unlikely(!arp_hash_rnd_initted)) {
+ get_random_bytes(&arp_hash_rnd, 4);
+ arp_hash_rnd_initted = 1;
+ }
- return hash_val;
+ return (jhash_2words(*(u32 *)pkey, dev->ifindex,
+ arp_hash_rnd) & arp_tbl.num_hash_buckets-1);
}
static int arp_constructor(struct neighbour *neigh)
@@ -1281,7 +1283,7 @@
state->is_pneigh = 0;
for (state->bucket = 0;
- state->bucket <= NEIGH_HASHMASK;
+ state->bucket < arp_tbl.num_hash_buckets;
++state->bucket) {
n = arp_tbl.hash_buckets[state->bucket];
while (n && !(n->nud_state & ~NUD_NOARP))
@@ -1307,7 +1309,7 @@
if (n)
goto out;
- if (++state->bucket > NEIGH_HASHMASK)
+ if (++state->bucket >= arp_tbl.num_hash_buckets)
goto out;
n = arp_tbl.hash_buckets[state->bucket];
goto try_again;
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/net/ipv6/ndisc.c linux-2.6.9-rc2-bk6-ncache/net/ipv6/ndisc.c
--- linux-2.6.9-rc2-bk6/net/ipv6/ndisc.c 2004-09-21 00:23:16.000000000
+0200
+++ linux-2.6.9-rc2-bk6-ncache/net/ipv6/ndisc.c 2004-09-20 20:25:48.000000000
+0200
@@ -147,7 +147,6 @@
.proxy_delay = (8 * HZ) / 10,
.proxy_qlen = 64,
},
- .gc_interval = 30 * HZ,
.gc_thresh1 = 128,
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
@@ -276,7 +275,7 @@
hash_val ^= (hash_val>>16);
hash_val ^= hash_val>>8;
hash_val ^= hash_val>>3;
- hash_val = (hash_val^dev->ifindex)&NEIGH_HASHMASK;
+ hash_val = (hash_val^dev->ifindex)&(nd_tbl.num_hash_buckets-1);
return hash_val;
}
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/include/net/neighbour.h
linux-2.6.9-rc2-bk6-ncache/include/net/neighbour.h
--- linux-2.6.9-rc2-bk6/include/net/neighbour.h 2004-09-21 00:23:16.000000000
+0200
+++ linux-2.6.9-rc2-bk6-ncache/include/net/neighbour.h 2004-09-20
22:12:50.000000000 +0200
@@ -139,7 +139,6 @@
u8 key[0];
};
-#define NEIGH_HASHMASK 0x1F
#define PNEIGH_HASHMASK 0xF
/*
@@ -160,11 +159,12 @@
void (*proxy_redo)(struct sk_buff *skb);
char *id;
struct neigh_parms parms;
- /* HACK. gc_* shoul follow parms without a gap! */
- int gc_interval;
+ /* UGLY HACK. gc_* and num_hash_buckets must follow parms without a
+ * gap! */
int gc_thresh1;
int gc_thresh2;
int gc_thresh3;
+ int num_hash_buckets;
unsigned long last_flush;
struct timer_list gc_timer;
struct timer_list proxy_timer;
@@ -175,7 +175,8 @@
struct neigh_parms *parms_list;
kmem_cache_t *kmem_cachep;
struct neigh_statistics stats;
- struct neighbour *hash_buckets[NEIGH_HASHMASK+1];
+ struct neighbour **hash_buckets;
+ int curr_hash_bucket; /* For GC */
struct pneigh_entry *phash_buckets[PNEIGH_HASHMASK+1];
};
diff -Nru --exclude-from=/sunbeam/home/laforge/scripts/dontdiff
linux-2.6.9-rc2-bk6/include/linux/sysctl.h
linux-2.6.9-rc2-bk6-ncache/include/linux/sysctl.h
--- linux-2.6.9-rc2-bk6/include/linux/sysctl.h 2004-09-13 07:32:48.000000000
+0200
+++ linux-2.6.9-rc2-bk6-ncache/include/linux/sysctl.h 2004-09-20
22:14:47.000000000 +0200
@@ -494,7 +494,8 @@
NET_NEIGH_GC_INTERVAL=13,
NET_NEIGH_GC_THRESH1=14,
NET_NEIGH_GC_THRESH2=15,
- NET_NEIGH_GC_THRESH3=16
+ NET_NEIGH_GC_THRESH3=16,
+ NET_NEIGH_NUM_HASH_BUCKETS=17
};
/* /proc/sys/net/ipx */
--
- Harald Welte <laforge@xxxxxxxxxxxx> http://www.gnumonks.org/
============================================================================
Programming is like sex: One mistake and you have to support it your lifetime
signature.asc
Description: Digital signature
|