File: linux-2.4-xfs / net / ipv4 / ipvs / ip_vs_conn.c
Revision 1.3, Thu Jan 20 13:59:19 2005 UTC by nathans.longdrop.melbourne.sgi.com
Branch: MAIN
Changes since 1.2: +3 -1 lines
Merge up to 2.4.29.
Merge of 2.4.x-xfs-melb:linux:21231a by kenmcd.
/*
* IPVS An implementation of the IP virtual server support for the
* LINUX operating system. IPVS is now implemented as a module
* over the Netfilter framework. IPVS can be used to build a
* high-performance and highly available server based on a
* cluster of servers.
*
* Version: $Id: ip_vs_conn.c,v 1.28.2.5 2003/08/09 13:27:08 wensong Exp $
*
* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
* Peter Kese <peter.kese@ijs.si>
* Julian Anastasov <ja@ssi.bg>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
* with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
*
* Changes:
*
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/ip.h>
#include <linux/tcp.h> /* for tcphdr */
#include <linux/in.h>
#include <linux/proc_fs.h> /* for proc_net_* */
#include <asm/softirq.h> /* for local_bh_* */
#include <net/ip.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h> /* for icmp_send */
#include <net/route.h> /* for ip_route_output */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <net/ip_vs.h>
/*
 * Connection hash table: for input and output packet lookups of IPVS connections
*/
static struct list_head *ip_vs_conn_tab;
/* SLAB cache for IPVS connections */
static kmem_cache_t *ip_vs_conn_cachep;
/* counter for current IPVS connections */
static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
/* counter for no-client-port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd;
/*
* Fine locking granularity for big connection hash table
*/
#define CT_LOCKARRAY_BITS 4
#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
struct ip_vs_aligned_lock
{
rwlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
static inline void ct_read_lock(unsigned key)
{
read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_read_unlock(unsigned key)
{
read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_lock(unsigned key)
{
write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock(unsigned key)
{
write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_read_lock_bh(unsigned key)
{
read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_read_unlock_bh(unsigned key)
{
read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_lock_bh(unsigned key)
{
write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock_bh(unsigned key)
{
write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
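/*
 * Illustration (derived from the helpers above): a hash key's low
 * CT_LOCKARRAY_BITS (4) bits select one of the 16 rwlocks, so buckets
 * whose indices differ only in the upper bits -- e.g. 0x005 and 0x015 --
 * share the same lock.  This keeps the lock array small while still
 * spreading contention across the big connection table.
 */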
/*
* Returns hash value for IPVS connection entry
*/
static unsigned
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
& IP_VS_CONN_TAB_MASK;
}
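/*
 * Note: the jhash result is masked with IP_VS_CONN_TAB_MASK, so the key
 * indexes ip_vs_conn_tab directly.  The table size is defined outside this
 * file from the IP_VS_TAB_BITS config option (typically 12 by default,
 * i.e. 4096 buckets).
 */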
/*
* Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
* returns bool success.
*/
static int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
unsigned hash;
int ret;
/* Hash by protocol, client address and port */
hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
ct_write_lock(hash);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
ret = 1;
} else {
IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
"called from %p\n", __builtin_return_address(0));
ret = 0;
}
ct_write_unlock(hash);
return ret;
}
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
* returns bool success.
*/
static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
unsigned hash;
int ret;
/* unhash it and decrease its reference counter */
hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
ct_write_lock(hash);
if (cp->flags & IP_VS_CONN_F_HASHED) {
list_del(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
} else
ret = 0;
ct_write_unlock(hash);
return ret;
}
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
* Called for pkts coming from OUTside-to-INside.
* s_addr, s_port: pkt source address (foreign host)
* d_addr, d_port: pkt dest address (load balancer)
*/
static inline struct ip_vs_conn *__ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
unsigned hash;
struct ip_vs_conn *cp;
struct list_head *l,*e;
hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
l = &ip_vs_conn_tab[hash];
ct_read_lock(hash);
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
if (s_addr==cp->caddr && s_port==cp->cport &&
d_port==cp->vport && d_addr==cp->vaddr &&
protocol==cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ct_read_unlock(hash);
return cp;
}
}
ct_read_unlock(hash);
return NULL;
}
struct ip_vs_conn *ip_vs_conn_in_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
struct ip_vs_conn *cp;
cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
cp?"hit":"not hit");
return cp;
}
/*
* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
* Called for pkts coming from inside-to-OUTside.
* s_addr, s_port: pkt source address (inside host)
* d_addr, d_port: pkt dest address (foreign host)
*/
struct ip_vs_conn *ip_vs_conn_out_get
(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
{
unsigned hash;
struct ip_vs_conn *cp, *ret=NULL;
struct list_head *l,*e;
/*
* Check for "full" addressed entries
*/
hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
l = &ip_vs_conn_tab[hash];
ct_read_lock(hash);
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
if (d_addr == cp->caddr && d_port == cp->cport &&
s_port == cp->dport && s_addr == cp->daddr &&
protocol == cp->protocol) {
/* HIT */
atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
ct_read_unlock(hash);
IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
ret?"hit":"not hit");
return ret;
}
/*
* Put back the conn and restart its timer with its timeout
*/
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* reset it to expire after its timeout */
mod_timer(&cp->timer, jiffies+cp->timeout);
__ip_vs_conn_put(cp);
}
/*
* Timeout table[state]
*/
struct ip_vs_timeout_table vs_timeout_table = {
ATOMIC_INIT(0), /* refcnt */
0, /* scale */
{
[IP_VS_S_NONE] = 30*60*HZ,
[IP_VS_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_S_SYN_SENT] = 2*60*HZ,
[IP_VS_S_SYN_RECV] = 1*60*HZ,
[IP_VS_S_FIN_WAIT] = 2*60*HZ,
[IP_VS_S_TIME_WAIT] = 2*60*HZ,
[IP_VS_S_CLOSE] = 10*HZ,
[IP_VS_S_CLOSE_WAIT] = 60*HZ,
[IP_VS_S_LAST_ACK] = 30*HZ,
[IP_VS_S_LISTEN] = 2*60*HZ,
[IP_VS_S_SYNACK] = 120*HZ,
[IP_VS_S_UDP] = 5*60*HZ,
[IP_VS_S_ICMP] = 1*60*HZ,
[IP_VS_S_LAST] = 2*HZ,
}, /* timeout */
};
struct ip_vs_timeout_table vs_timeout_table_dos = {
ATOMIC_INIT(0), /* refcnt */
0, /* scale */
{
[IP_VS_S_NONE] = 15*60*HZ,
[IP_VS_S_ESTABLISHED] = 8*60*HZ,
[IP_VS_S_SYN_SENT] = 60*HZ,
[IP_VS_S_SYN_RECV] = 10*HZ,
[IP_VS_S_FIN_WAIT] = 60*HZ,
[IP_VS_S_TIME_WAIT] = 60*HZ,
[IP_VS_S_CLOSE] = 10*HZ,
[IP_VS_S_CLOSE_WAIT] = 60*HZ,
[IP_VS_S_LAST_ACK] = 30*HZ,
[IP_VS_S_LISTEN] = 2*60*HZ,
[IP_VS_S_SYNACK] = 100*HZ,
[IP_VS_S_UDP] = 3*60*HZ,
[IP_VS_S_ICMP] = 1*60*HZ,
[IP_VS_S_LAST] = 2*HZ,
}, /* timeout */
};
/*
* Timeout table to use for the VS entries
* If NULL we use the default table (vs_timeout_table).
* Under flood attack we switch to vs_timeout_table_dos
*/
static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;
static const char * state_name_table[IP_VS_S_LAST+1] = {
[IP_VS_S_NONE] = "NONE",
[IP_VS_S_ESTABLISHED] = "ESTABLISHED",
[IP_VS_S_SYN_SENT] = "SYN_SENT",
[IP_VS_S_SYN_RECV] = "SYN_RECV",
[IP_VS_S_FIN_WAIT] = "FIN_WAIT",
[IP_VS_S_TIME_WAIT] = "TIME_WAIT",
[IP_VS_S_CLOSE] = "CLOSE",
[IP_VS_S_CLOSE_WAIT] = "CLOSE_WAIT",
[IP_VS_S_LAST_ACK] = "LAST_ACK",
[IP_VS_S_LISTEN] = "LISTEN",
[IP_VS_S_SYNACK] = "SYNACK",
[IP_VS_S_UDP] = "UDP",
[IP_VS_S_ICMP] = "ICMP",
[IP_VS_S_LAST] = "BUG!",
};
#define sNO IP_VS_S_NONE
#define sES IP_VS_S_ESTABLISHED
#define sSS IP_VS_S_SYN_SENT
#define sSR IP_VS_S_SYN_RECV
#define sFW IP_VS_S_FIN_WAIT
#define sTW IP_VS_S_TIME_WAIT
#define sCL IP_VS_S_CLOSE
#define sCW IP_VS_S_CLOSE_WAIT
#define sLA IP_VS_S_LAST_ACK
#define sLI IP_VS_S_LISTEN
#define sSA IP_VS_S_SYNACK
struct vs_tcp_states_t {
int next_state[IP_VS_S_LAST]; /* should be _LAST_TCP */
};
const char * ip_vs_state_name(int state)
{
if (state >= IP_VS_S_LAST)
return "ERR!";
return state_name_table[state] ? state_name_table[state] : "?";
}
static struct vs_tcp_states_t vs_tcp_states [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
/* OUTPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
static struct vs_tcp_states_t vs_tcp_states_dos [] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
/* OUTPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
/* INPUT-ONLY */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;
void ip_vs_secure_tcp_set(int on)
{
if (on) {
ip_vs_state_table = vs_tcp_states_dos;
ip_vs_timeout_table = &vs_timeout_table_dos;
} else {
ip_vs_state_table = vs_tcp_states;
ip_vs_timeout_table = &vs_timeout_table;
}
}
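/*
 * Note: ip_vs_secure_tcp_set() is normally driven by the IPVS defense
 * logic elsewhere in the IPVS code (the secure_tcp sysctl and its
 * memory-threshold checks), switching to the DoS tables when the box
 * appears to be under a flood attack.
 */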
static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
{
/*
* [0-3]: input states, [4-7]: output, [8-11] input only states.
*/
if (th->rst)
return state_off+3;
if (th->syn)
return state_off+0;
if (th->fin)
return state_off+1;
if (th->ack)
return state_off+2;
return -1;
}
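/*
 * Example walk-through (using the default vs_tcp_states table above):
 * an ACK seen on the input path gives state_off 0 and index 2; if the
 * connection is currently in SYN_RECV, the "ack" input row maps sSR to
 * sES, so the connection moves to ESTABLISHED.
 */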
static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
{
struct ip_vs_timeout_table *vstim = cp->timeout_table;
/*
* Use default timeout table if no specific for this entry
*/
if (!vstim)
vstim = &vs_timeout_table;
cp->timeout = vstim->timeout[cp->state=state];
if (vstim->scale) {
int scale = vstim->scale;
if (scale<0)
cp->timeout >>= -scale;
else if (scale > 0)
cp->timeout <<= scale;
}
return state;
}
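/*
 * Illustration of the scale field: a positive scale left-shifts every
 * timeout (scale 1 doubles them), a negative scale right-shifts them
 * (scale -1 halves them).  E.g. with scale -1 the default ESTABLISHED
 * timeout of 15 minutes becomes 7.5 minutes.
 */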
static inline int
vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
{
int state_idx;
int new_state = IP_VS_S_CLOSE;
/*
* Update state offset to INPUT_ONLY if necessary
* or delete NO_OUTPUT flag if output packet detected
*/
if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
if (state_off == VS_STATE_OUTPUT)
cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
else
state_off = VS_STATE_INPUT_ONLY;
}
if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
state_off, state_idx);
goto tcp_state_out;
}
new_state = ip_vs_state_table[state_idx].next_state[cp->state];
tcp_state_out:
if (new_state != cp->state) {
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
"%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
ip_vs_proto_name(cp->protocol),
(state_off==VS_STATE_OUTPUT)?"output ":"input ",
th->syn? 'S' : '.',
th->fin? 'F' : '.',
th->ack? 'A' : '.',
th->rst? 'R' : '.',
NIPQUAD(cp->daddr), ntohs(cp->dport),
NIPQUAD(cp->caddr), ntohs(cp->cport),
ip_vs_state_name(cp->state),
ip_vs_state_name(new_state),
atomic_read(&cp->refcnt));
if (dest) {
if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
(new_state != IP_VS_S_ESTABLISHED)) {
atomic_dec(&dest->activeconns);
atomic_inc(&dest->inactconns);
cp->flags |= IP_VS_CONN_F_INACTIVE;
} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
(new_state == IP_VS_S_ESTABLISHED)) {
atomic_inc(&dest->activeconns);
atomic_dec(&dest->inactconns);
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
}
return vs_set_state_timeout(cp, new_state);
}
/*
* Handle state transitions
*/
int ip_vs_set_state(struct ip_vs_conn *cp,
int state_off, struct iphdr *iph, void *tp)
{
int ret;
spin_lock(&cp->lock);
switch (iph->protocol) {
case IPPROTO_TCP:
ret = vs_tcp_state(cp, state_off, tp);
break;
case IPPROTO_UDP:
ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
break;
case IPPROTO_ICMP:
ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
break;
default:
ret = -1;
}
spin_unlock(&cp->lock);
return ret;
}
/*
 * Set the LISTEN timeout. (ip_vs_conn_put will set up the timer)
*/
int ip_vs_conn_listen(struct ip_vs_conn *cp)
{
vs_set_state_timeout(cp, IP_VS_S_LISTEN);
return cp->timeout;
}
/*
* Bypass transmitter
 * Let packets bypass the destination when the destination is not
 * available; it may only be used in a transparent cache cluster.
*/
static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph = skb->nh.iph;
u8 tos = iph->tos;
int mtu;
EnterFunction(10);
if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
"dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
goto tx_error_icmp;
}
/* MTU checking */
mtu = rt->u.dst.pmtu;
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
goto tx_error;
}
/* update checksum because skb might be defragmented */
ip_send_check(iph);
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
goto tx_error;
}
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
skb->nfcache |= NFC_IPVS_PROPERTY;
ip_send(skb);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* NULL transmitter (do nothing except return NF_ACCEPT)
*/
static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
return NF_ACCEPT;
}
/*
* NAT transmitter (only for outside-to-inside nat forwarding)
*/
static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
union ip_vs_tphdr h;
int ihl;
unsigned short size;
int mtu;
EnterFunction(10);
/*
	 * If it has an ip_vs_app helper, the helper may change the payload,
	 * so it needs full checksum checking and checksum recalculation.
	 * If not, only header fields (such as the IP address and port number)
	 * are changed, so it is fast to do an incremental checksum update
	 * and let the destination host do the final checksum checking.
*/
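	/*
	 * Rough sketch of the fast path: since only 32-bit addresses and
	 * 16-bit ports change, ip_vs_fast_check_update() can apply the
	 * RFC 1624 incremental update (HC' = ~(~HC + ~m + m')) instead of
	 * re-summing the whole TCP/UDP payload.
	 */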
if (cp->app && skb_is_nonlinear(skb)
&& skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
iph = skb->nh.iph;
ihl = iph->ihl << 2;
h.raw = (char*) iph + ihl;
size = ntohs(iph->tot_len) - ihl;
	/* do TCP/UDP checksum checking if it has an application helper */
if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
switch (skb->ip_summed) {
		case CHECKSUM_NONE:
			skb->csum = csum_partial(h.raw, size, 0);
			/* fall through and verify the checksum just computed */
		case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
iph->protocol, skb->csum)) {
IP_VS_DBG_RL("Incoming failed %s checksum "
"from %d.%d.%d.%d (size=%d)!\n",
ip_vs_proto_name(iph->protocol),
NIPQUAD(iph->saddr),
size);
goto tx_error;
}
break;
default:
/* CHECKSUM_UNNECESSARY */
break;
}
}
/*
	 * Check if it is a no_cport connection ...
*/
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
if (ip_vs_conn_unhash(cp)) {
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
atomic_dec(&ip_vs_conn_no_cport_cnt);
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
cp->cport = h.portp[0];
IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
}
spin_unlock(&cp->lock);
/* hash on new dport */
ip_vs_conn_hash(cp);
}
}
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
goto tx_error_icmp;
/* MTU checking */
mtu = rt->u.dst.pmtu;
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
goto tx_error;
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
return NF_DROP;
/* mangle the packet */
iph->daddr = cp->daddr;
h.portp[1] = cp->dport;
/*
	 * Attempt an ip_vs_app call.
	 * It will fix up ip_vs_conn and the packet's ack_seq as needed.
*/
if (ip_vs_app_pkt_in(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
}
/*
* Adjust TCP/UDP checksums
*/
if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
/* Only port and addr are changed, do fast csum update */
ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
cp->vport, cp->dport, iph->protocol);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
switch (iph->protocol) {
case IPPROTO_TCP:
h.th->check = 0;
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
csum_partial(h.raw, size, 0));
break;
case IPPROTO_UDP:
h.uh->check = 0;
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
csum_partial(h.raw, size, 0));
if (h.uh->check == 0)
h.uh->check = 0xFFFF;
break;
}
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
ip_send_check(iph);
IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
NIPQUAD(iph->daddr), ntohs(h.portp[1]));
	/* FIXME: when the application helper enlarges the packet and the
	   length becomes larger than the MTU of the outgoing device, there
	   will still be an MTU problem. */
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
skb->nfcache |= NFC_IPVS_PROPERTY;
ip_send(skb);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* IP Tunneling transmitter
*
* This function encapsulates the packet in a new IP packet, its
* destination will be set to cp->daddr. Most code of this function
* is taken from ipip.c.
*
 * It is used in a VS/TUN cluster. The load balancer selects a real
 * server from the cluster based on a scheduling algorithm,
 * encapsulates the request packet and forwards it to the selected
 * server. For example, all real servers are configured with
 * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 * the encapsulated packet, it decapsulates the packet, processes
 * the request and returns the response packets directly to the client
 * without passing through the load balancer. This can greatly increase
 * the scalability of the virtual server.
*/
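/*
 * For illustration, with client address CIP, virtual address VIP and real
 * server address RIP, the encapsulated packet built here looks roughly
 * like:
 *
 *   [ outer IP: saddr = local, daddr = RIP, proto = IPPROTO_IPIP ]
 *   [ inner IP: saddr = CIP, daddr = VIP ][ TCP/UDP header + data ]
 */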
static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
struct rtable *rt; /* Route to the other host */
struct net_device *tdev; /* Device to other host */
struct iphdr *old_iph = skb->nh.iph;
u8 tos = old_iph->tos;
u16 df = old_iph->frag_off;
struct iphdr *iph; /* Our new IP header */
int max_headroom; /* The extra header space needed */
int mtu;
EnterFunction(10);
if (skb->protocol != __constant_htons(ETH_P_IP)) {
IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
"ETH_P_IP: %d, skb protocol: %d\n",
__constant_htons(ETH_P_IP), skb->protocol);
goto tx_error;
}
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
goto tx_error_icmp;
tdev = rt->u.dst.dev;
mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
if (mtu < 68) {
ip_rt_put(rt);
IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
goto tx_error;
}
if (skb->dst && mtu < skb->dst->pmtu)
skb->dst->pmtu = mtu;
df |= (old_iph->frag_off&__constant_htons(IP_DF));
if ((old_iph->frag_off&__constant_htons(IP_DF))
&& mtu < ntohs(old_iph->tot_len)) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
goto tx_error;
}
/* update checksum because skb might be defragmented */
ip_send_check(old_iph);
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
if (skb_headroom(skb) < max_headroom
|| skb_cloned(skb) || skb_shared(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
return NF_DROP;
}
kfree_skb(skb);
skb = new_skb;
old_iph = skb->nh.iph;
}
skb->h.raw = skb->nh.raw;
skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/*
* Push down and install the IPIP header.
*/
iph = skb->nh.iph;
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = IPPROTO_IPIP;
iph->tos = tos;
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->ttl = old_iph->ttl;
iph->tot_len = htons(skb->len);
ip_select_ident(iph, &rt->u.dst, NULL);
ip_send_check(iph);
skb->ip_summed = CHECKSUM_NONE;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
skb->nfcache |= NFC_IPVS_PROPERTY;
ip_send(skb);
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* Direct Routing transmitter
*/
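/*
 * Note: VS/DR leaves the IP header untouched and simply re-routes the
 * packet to the chosen real server on the local link (the new dst entry
 * supplies the link-layer destination).  The real server is typically
 * configured with the virtual IP on a non-ARPing interface (e.g. a
 * loopback alias) so that it accepts the packet locally.
 */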
static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph = skb->nh.iph;
int mtu;
EnterFunction(10);
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
goto tx_error_icmp;
/* MTU checking */
mtu = rt->u.dst.pmtu;
if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
goto tx_error;
}
/* update checksum because skb might be defragmented */
ip_send_check(iph);
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
goto tx_error;
}
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
#endif /* CONFIG_NETFILTER_DEBUG */
skb->nfcache |= NFC_IPVS_PROPERTY;
ip_send(skb);
#if 0000
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
do_ip_send);
#endif
LeaveFunction(10);
return NF_STOLEN;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
}
/*
* Bind a connection entry with the corresponding packet_xmit.
* Called by ip_vs_conn_new.
*/
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
switch (IP_VS_FWD_METHOD(cp)) {
case IP_VS_CONN_F_MASQ:
cp->packet_xmit = ip_vs_nat_xmit;
break;
case IP_VS_CONN_F_TUNNEL:
cp->packet_xmit = ip_vs_tunnel_xmit;
break;
case IP_VS_CONN_F_DROUTE:
cp->packet_xmit = ip_vs_dr_xmit;
break;
case IP_VS_CONN_F_LOCALNODE:
cp->packet_xmit = ip_vs_null_xmit;
break;
case IP_VS_CONN_F_BYPASS:
cp->packet_xmit = ip_vs_bypass_xmit;
break;
}
}
/*
* Bind a connection entry with a virtual service destination
* Called just after a new connection entry is created.
*/
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
/* if dest is NULL, then return directly */
if (!dest)
return;
/* Increase the refcnt counter of the dest */
atomic_inc(&dest->refcnt);
/* Bind with the destination and its corresponding transmitter */
cp->flags |= atomic_read(&dest->conn_flags);
cp->dest = dest;
IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
"d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
NIPQUAD(cp->daddr), ntohs(cp->dport),
ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
cp->flags, atomic_read(&cp->refcnt),
atomic_read(&dest->refcnt));
}
/*
* Unbind a connection entry with its VS destination
* Called by the ip_vs_conn_expire function.
*/
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest = cp->dest;
/* if dest is NULL, then return directly */
if (!dest)
return;
IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
"v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
"s:%s flg:%X cnt:%d destcnt:%d",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
NIPQUAD(cp->daddr), ntohs(cp->dport),
ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
cp->flags, atomic_read(&cp->refcnt),
atomic_read(&dest->refcnt));
/*
* Decrease the inactconns or activeconns counter
* if it is not a connection template ((cp->cport!=0)
* || (cp->flags & IP_VS_CONN_F_NO_CPORT)).
*/
if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
if (cp->flags & IP_VS_CONN_F_INACTIVE) {
atomic_dec(&dest->inactconns);
} else {
atomic_dec(&dest->activeconns);
}
}
/*
* Simply decrease the refcnt of the dest, because the
* dest will be either in service's destination list
* or in the trash.
*/
atomic_dec(&dest->refcnt);
}
/*
* Checking if the destination of a connection template is available.
* If available, return 1, otherwise invalidate this connection
* template and return 0.
*/
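/*
 * (A connection template is an entry created for persistent services: it
 * typically has cport == 0 and is used to make further connections from
 * the same client stick to the same real server.  With the
 * expire_quiescent_template sysctl set, templates pointing to a
 * zero-weight destination are also treated as unavailable.)
 */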
int ip_vs_check_template(struct ip_vs_conn *ct)
{
struct ip_vs_dest *dest = ct->dest;
/*
* Checking the dest server status.
*/
if ((dest == NULL) ||
!(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
(sysctl_ip_vs_expire_quiescent_template &&
(atomic_read(&dest->weight) == 0))) {
IP_VS_DBG(9, "check_template: dest not available for "
"protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
"-> d:%u.%u.%u.%u:%d\n",
ip_vs_proto_name(ct->protocol),
NIPQUAD(ct->caddr), ntohs(ct->cport),
NIPQUAD(ct->vaddr), ntohs(ct->vport),
NIPQUAD(ct->daddr), ntohs(ct->dport));
		/*
		 * Invalidate the connection template: rehash it with
		 * impossible ports so that further lookups cannot match it.
		 */
if (ct->cport) {
if (ip_vs_conn_unhash(ct)) {
ct->dport = 65535;
ct->vport = 65535;
ct->cport = 0;
ip_vs_conn_hash(ct);
}
}
/*
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
atomic_dec(&ct->refcnt);
return 0;
}
return 1;
}
static inline void
ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
{
atomic_inc(&vstim->refcnt);
cp->timeout_table = vstim;
}
static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
{
struct ip_vs_timeout_table *vstim = cp->timeout_table;
if (!vstim)
return;
cp->timeout_table = NULL;
atomic_dec(&vstim->refcnt);
}
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
if (cp->timeout_table)
cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
else
cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];
/*
* hey, I'm using it
*/
atomic_inc(&cp->refcnt);
/*
* do I control anybody?
*/
if (atomic_read(&cp->n_control))
goto expire_later;
/*
* unhash it if it is hashed in the conn table
*/
if (!ip_vs_conn_unhash(cp))
goto expire_later;
/*
	 * refcnt==1 implies I'm the only referrer
*/
if (likely(atomic_read(&cp->refcnt) == 1)) {
/* make sure that there is no timer on it now */
if (timer_pending(&cp->timer))
del_timer(&cp->timer);
/* does anybody control me? */
if (cp->control)
ip_vs_control_del(cp);
ip_vs_unbind_dest(cp);
ip_vs_unbind_app(cp);
ip_vs_timeout_detach(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
atomic_dec(&ip_vs_conn_count);
kmem_cache_free(ip_vs_conn_cachep, cp);
return;
}
/* hash it back to the table */
ip_vs_conn_hash(cp);
expire_later:
IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
atomic_read(&cp->refcnt)-1,
atomic_read(&cp->n_control));
ip_vs_conn_put(cp);
}
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
cp->timeout = 0;
mod_timer(&cp->timer, jiffies);
__ip_vs_conn_put(cp);
}
/*
* Create a new connection entry and hash it into the ip_vs_conn_tab.
*/
struct ip_vs_conn *
ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
__u32 daddr, __u16 dport, unsigned flags,
struct ip_vs_dest *dest)
{
struct ip_vs_conn *cp;
cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
return NULL;
}
memset(cp, 0, sizeof(*cp));
INIT_LIST_HEAD(&cp->c_list);
init_timer(&cp->timer);
cp->timer.data = (unsigned long)cp;
cp->timer.function = ip_vs_conn_expire;
ip_vs_timeout_attach(cp, ip_vs_timeout_table);
cp->protocol = proto;
cp->caddr = caddr;
cp->cport = cport;
cp->vaddr = vaddr;
cp->vport = vport;
cp->daddr = daddr;
cp->dport = dport;
cp->flags = flags;
cp->app_data = NULL;
cp->control = NULL;
cp->lock = SPIN_LOCK_UNLOCKED;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
atomic_inc(&ip_vs_conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind its application helper (only for VS/NAT) if any */
ip_vs_bind_app(cp);
/* Bind the connection with a destination server */
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
vs_set_state_timeout(cp, IP_VS_S_NONE);
/* Bind its packet transmitter */
ip_vs_bind_xmit(cp);
	/*
	 * Mark the entry as referenced by the current thread before hashing
	 * it into the table, so that another thread running
	 * ip_vs_random_dropentry cannot drop this entry.
	 */
atomic_set(&cp->refcnt, 1);
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
return cp;
}
/*
* /proc/net/ip_vs_conn entries
*/
static int
ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
{
off_t pos=0;
int idx, len=0;
char temp[70];
struct ip_vs_conn *cp;
struct list_head *l, *e;
pos = 128;
if (pos > offset) {
len += sprintf(buffer+len, "%-127s\n",
"Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires");
}
for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
		/*
		 * The lock is actually only needed in the next loop;
		 * we are called from userspace, so bottom halves must be
		 * disabled.
		 */
ct_read_lock_bh(idx);
l = &ip_vs_conn_tab[idx];
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
pos += 128;
if (pos <= offset)
continue;
sprintf(temp,
"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
ip_vs_proto_name(cp->protocol),
ntohl(cp->caddr), ntohs(cp->cport),
ntohl(cp->vaddr), ntohs(cp->vport),
ntohl(cp->daddr), ntohs(cp->dport),
ip_vs_state_name(cp->state),
(cp->timer.expires-jiffies)/HZ);
len += sprintf(buffer+len, "%-127s\n", temp);
if (pos >= offset+length) {
ct_read_unlock_bh(idx);
goto done;
}
}
ct_read_unlock_bh(idx);
}
done:
*start = buffer+len-(pos-offset); /* Start of wanted data */
len = pos-offset;
if (len > length)
len = length;
if (len < 0)
len = 0;
return len;
}
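/*
 * For example (addresses and ports in hex, expire time in seconds), a
 * single output line might look like:
 *
 *   TCP C0A80001 0400 C0A80164 0050 C0A8010A 0050 ESTABLISHED     897
 */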
/*
* Randomly drop connection entries before running out of memory
*/
static inline int todrop_entry(struct ip_vs_conn *cp)
{
/*
* The drop rate array needs tuning for real environments.
* Called from timer bh only => no locking
*/
static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
static char todrop_counter[9] = {0};
int i;
	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
	   This will leave enough time for a normal connection to get
	   through. */
	if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
		return 0;
	/* Don't drop the entry if its number of incoming packets is not
	   in the range [0, 8] */
i = atomic_read(&cp->in_pkts);
if (i > 8 || i < 0) return 0;
if (!todrop_rate[i]) return 0;
if (--todrop_counter[i] > 0) return 0;
todrop_counter[i] = todrop_rate[i];
return 1;
}
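/*
 * With the table above, candidates that have seen only one incoming
 * packet are dropped every time they are visited, while entries with
 * eight packets are dropped roughly once in eight visits; entries with
 * zero or more than eight incoming packets are never dropped here.
 */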
void ip_vs_random_dropentry(void)
{
int idx;
struct ip_vs_conn *cp;
struct list_head *l,*e;
struct ip_vs_conn *ct;
/*
* Randomly scan 1/32 of the whole table every second
*/
for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;
/*
* Lock is actually needed in this loop.
*/
ct_write_lock(hash);
l = &ip_vs_conn_tab[hash];
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
/* connection template */
continue;
switch(cp->state) {
case IP_VS_S_SYN_RECV:
case IP_VS_S_SYNACK:
break;
case IP_VS_S_ESTABLISHED:
case IP_VS_S_UDP:
if (todrop_entry(cp))
break;
continue;
default:
continue;
}
/*
* Drop the entry, and drop its ct if not referenced
*/
atomic_inc(&cp->refcnt);
ct_write_unlock(hash);
if ((ct = cp->control))
atomic_inc(&ct->refcnt);
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (ct) {
IP_VS_DBG(4, "del conn template\n");
ip_vs_conn_expire_now(ct);
}
ct_write_lock(hash);
}
ct_write_unlock(hash);
}
}
/*
* Flush all the connection entries in the ip_vs_conn_tab
*/
static void ip_vs_conn_flush(void)
{
int idx;
struct ip_vs_conn *cp;
struct list_head *l,*e;
struct ip_vs_conn *ct;
flush_again:
for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
/*
* Lock is actually needed in this loop.
*/
ct_write_lock_bh(idx);
l = &ip_vs_conn_tab[idx];
for (e=l->next; e!=l; e=e->next) {
cp = list_entry(e, struct ip_vs_conn, c_list);
atomic_inc(&cp->refcnt);
ct_write_unlock(idx);
if ((ct = cp->control))
atomic_inc(&ct->refcnt);
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
if (ct) {
IP_VS_DBG(4, "del conn template\n");
ip_vs_conn_expire_now(ct);
}
ct_write_lock(idx);
}
ct_write_unlock_bh(idx);
}
	/* the counter may not be zero, because some conn entries may still
	   be handled by the slow timer handler, or are unhashed but still
	   referenced */
if (atomic_read(&ip_vs_conn_count) != 0) {
schedule();
goto flush_again;
}
}
int ip_vs_conn_init(void)
{
int idx;
/*
* Allocate the connection hash table and initialize its list heads
*/
ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
if (!ip_vs_conn_tab)
return -ENOMEM;
IP_VS_INFO("Connection hash table configured "
"(size=%d, memory=%ldKbytes)\n",
IP_VS_CONN_TAB_SIZE,
(long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
sizeof(struct ip_vs_conn));
for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
}
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
}
/* Allocate ip_vs_conn slab cache */
ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
sizeof(struct ip_vs_conn), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!ip_vs_conn_cachep) {
vfree(ip_vs_conn_tab);
return -ENOMEM;
}
proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);
/* calculate the random value for connection hash */
get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
return 0;
}
void ip_vs_conn_cleanup(void)
{
/* flush all the connection entries first */
ip_vs_conn_flush();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
proc_net_remove("ip_vs_conn");
vfree(ip_vs_conn_tab);
}