netdev
[Top] [All Lists]

Re: [PATCH 2.6]: keep fragment queues private to each user

To: "David S. Miller" <davem@xxxxxxxxxxxxx>
Subject: Re: [PATCH 2.6]: keep fragment queues private to each user
From: Patrick McHardy <kaber@xxxxxxxxx>
Date: Thu, 27 Jan 2005 00:30:56 +0100
Cc: netdev@xxxxxxxxxxx
In-reply-to: <20050125214158.4be3798c.davem@davemloft.net>
References: <41F5D1CB.9050804@trash.net> <20050125214158.4be3798c.davem@davemloft.net>
Sender: netdev-bounce@xxxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.7.5) Gecko/20050106 Debian/1.7.5-1
David S. Miller wrote:

On Tue, 25 Jan 2005 05:57:47 +0100
Patrick McHardy <kaber@xxxxxxxxx> wrote:

This patch keeps fragment queues private to each ip_defrag user to avoid
skbs jumping between different callers. It shouldn't change any wanted
behaviour, the only questionable one was ip_call_ra_chain, but the RA
option is included in each fragment. If you're fine with the patch I'm going
to send a 2.4 version later.


Please fix the CONFIG_SMP build. You use some variable "user" in ip_frag_intern(), yet don't add this as an argument or anything like that.

Sorry, new patch attached. I'll send the 2.4 patch later tonight.

Regards
Patrick

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2005/01/27 00:20:54+01:00 kaber@xxxxxxxxxxxx 
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv4/netfilter/ip_nat_standalone.c
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +1 -1
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv4/netfilter/ip_conntrack_standalone.c
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +4 -7
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv4/netfilter/ip_conntrack_core.c
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +2 -9
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv4/ipvs/ip_vs_core.c
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +11 -8
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv4/ip_input.c
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +2 -2
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv4/ip_fragment.c
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +13 -20
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# include/net/ip.h
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +14 -3
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# include/linux/netfilter_ipv4/ip_conntrack.h
#   2005/01/27 00:20:46+01:00 kaber@xxxxxxxxxxxx +1 -2
#   [IPV4]: Keep fragment queues private to each user
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
diff -Nru a/include/linux/netfilter_ipv4/ip_conntrack.h 
b/include/linux/netfilter_ipv4/ip_conntrack.h
--- a/include/linux/netfilter_ipv4/ip_conntrack.h       2005-01-27 00:22:21 
+01:00
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h       2005-01-27 00:22:21 
+01:00
@@ -262,10 +262,9 @@
 /* Fake conntrack entry for untracked connections */
 extern struct ip_conntrack ip_conntrack_untracked;
 
-extern int ip_ct_no_defrag;
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb);
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user);
 
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 extern void
diff -Nru a/include/net/ip.h b/include/net/ip.h
--- a/include/net/ip.h  2005-01-27 00:22:21 +01:00
+++ b/include/net/ip.h  2005-01-27 00:22:21 +01:00
@@ -286,9 +286,20 @@
 /*
  *     Functions provided by ip_fragment.o
  */
- 
-struct sk_buff *ip_defrag(struct sk_buff *skb);
-extern void ipfrag_flush(void);
+
+enum ip_defrag_users
+{
+       IP_DEFRAG_LOCAL_DELIVER,
+       IP_DEFRAG_CALL_RA_CHAIN,
+       IP_DEFRAG_CONNTRACK_IN,
+       IP_DEFRAG_CONNTRACK_OUT,
+       IP_DEFRAG_NAT_OUT,
+       IP_DEFRAG_VS_IN,
+       IP_DEFRAG_VS_OUT,
+       IP_DEFRAG_VS_FWD
+};
+
+struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user);
 extern int ip_frag_nqueues;
 extern atomic_t ip_frag_mem;
 
diff -Nru a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
--- a/net/ipv4/ip_fragment.c    2005-01-27 00:22:21 +01:00
+++ b/net/ipv4/ip_fragment.c    2005-01-27 00:22:21 +01:00
@@ -73,6 +73,7 @@
 struct ipq {
        struct ipq      *next;          /* linked list pointers                 
*/
        struct list_head lru_list;      /* lru list member                      
*/
+       u32             user;
        u32             saddr;
        u32             daddr;
        u16             id;
@@ -243,13 +244,13 @@
 /* Memory limiting on fragments.  Evictor trashes the oldest 
  * fragment queue until we are back under the threshold.
  */
-static void __ip_evictor(int threshold)
+static void ip_evictor(void)
 {
        struct ipq *qp;
        struct list_head *tmp;
        int work;
 
-       work = atomic_read(&ip_frag_mem) - threshold;
+       work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
        if (work <= 0)
                return;
 
@@ -274,11 +275,6 @@
        }
 }
 
-static inline void ip_evictor(void)
-{
-       __ip_evictor(sysctl_ipfrag_low_thresh);
-}
-
 /*
  * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
  */
@@ -325,7 +321,8 @@
                if(qp->id == qp_in->id          &&
                   qp->saddr == qp_in->saddr    &&
                   qp->daddr == qp_in->daddr    &&
-                  qp->protocol == qp_in->protocol) {
+                  qp->protocol == qp_in->protocol &&
+                  qp->user == qp_in->user) {
                        atomic_inc(&qp->refcnt);
                        write_unlock(&ipfrag_lock);
                        qp_in->last_in |= COMPLETE;
@@ -352,7 +349,7 @@
 }
 
 /* Add an entry to the 'ipq' queue for a newly received IP datagram. */
-static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph)
+static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
 {
        struct ipq *qp;
 
@@ -364,6 +361,7 @@
        qp->id = iph->id;
        qp->saddr = iph->saddr;
        qp->daddr = iph->daddr;
+       qp->user = user;
        qp->len = 0;
        qp->meat = 0;
        qp->fragments = NULL;
@@ -386,7 +384,7 @@
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
-static inline struct ipq *ip_find(struct iphdr *iph)
+static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
 {
        __u16 id = iph->id;
        __u32 saddr = iph->saddr;
@@ -400,7 +398,8 @@
                if(qp->id == id         &&
                   qp->saddr == saddr   &&
                   qp->daddr == daddr   &&
-                  qp->protocol == protocol) {
+                  qp->protocol == protocol &&
+                  qp->user == user) {
                        atomic_inc(&qp->refcnt);
                        read_unlock(&ipfrag_lock);
                        return qp;
@@ -408,7 +407,7 @@
        }
        read_unlock(&ipfrag_lock);
 
-       return ip_frag_create(hash, iph);
+       return ip_frag_create(hash, iph, user);
 }
 
 /* Add new segment to existing queue. */
@@ -642,7 +641,7 @@
 }
 
 /* Process an incoming IP datagram fragment. */
-struct sk_buff *ip_defrag(struct sk_buff *skb)
+struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
 {
        struct iphdr *iph = skb->nh.iph;
        struct ipq *qp;
@@ -657,7 +656,7 @@
        dev = skb->dev;
 
        /* Lookup (or create) queue header */
-       if ((qp = ip_find(iph)) != NULL) {
+       if ((qp = ip_find(iph, user)) != NULL) {
                struct sk_buff *ret = NULL;
 
                spin_lock(&qp->lock);
@@ -689,10 +688,4 @@
        add_timer(&ipfrag_secret_timer);
 }
 
-void ipfrag_flush(void)
-{
-       __ip_evictor(0);
-}
-
 EXPORT_SYMBOL(ip_defrag);
-EXPORT_SYMBOL(ipfrag_flush);
diff -Nru a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
--- a/net/ipv4/ip_input.c       2005-01-27 00:22:21 +01:00
+++ b/net/ipv4/ip_input.c       2005-01-27 00:22:21 +01:00
@@ -172,7 +172,7 @@
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
-                               skb = ip_defrag(skb);
+                               skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
                                if (skb == NULL) {
                                        read_unlock(&ip_ra_lock);
                                        return 1;
@@ -273,7 +273,7 @@
         */
 
        if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
-               skb = ip_defrag(skb);
+               skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
                if (!skb)
                        return 0;
        }
diff -Nru a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
--- a/net/ipv4/ipvs/ip_vs_core.c        2005-01-27 00:22:21 +01:00
+++ b/net/ipv4/ipvs/ip_vs_core.c        2005-01-27 00:22:21 +01:00
@@ -544,9 +544,9 @@
 }
 
 static inline struct sk_buff *
-ip_vs_gather_frags(struct sk_buff *skb)
+ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
-       skb = ip_defrag(skb);
+       skb = ip_defrag(skb, user);
        if (skb)
                ip_send_check(skb->nh.iph);
        return skb;
@@ -620,7 +620,7 @@
 
        /* reassemble IP fragments */
        if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
-               skb = ip_vs_gather_frags(skb);
+               skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
                if (!skb)
                        return NF_STOLEN;
                *pskb = skb;
@@ -759,7 +759,7 @@
        /* reassemble IP fragments */
        if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
                     !pp->dont_defrag)) {
-               skb = ip_vs_gather_frags(skb);
+               skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
                if (!skb)
                        return NF_STOLEN;
                iph = skb->nh.iph;
@@ -839,7 +839,8 @@
  *     forward to the right destination host if relevant.
  *     Currently handles error types - unreachable, quench, ttl exceeded.
  */
-static int ip_vs_in_icmp(struct sk_buff **pskb, int *related)
+static int 
+ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
 {
        struct sk_buff *skb = *pskb;
        struct iphdr *iph;
@@ -853,7 +854,9 @@
 
        /* reassemble IP fragments */
        if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
-               skb = ip_vs_gather_frags(skb);
+               skb = ip_vs_gather_frags(skb,
+                                        hooknum == NF_IP_LOCAL_IN ?
+                                        IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
                if (!skb)
                        return NF_STOLEN;
                *pskb = skb;
@@ -962,7 +965,7 @@
 
        iph = skb->nh.iph;
        if (unlikely(iph->protocol == IPPROTO_ICMP)) {
-               int related, verdict = ip_vs_in_icmp(pskb, &related);
+               int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
 
                if (related)
                        return verdict;
@@ -1057,7 +1060,7 @@
        if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
                return NF_ACCEPT;
 
-       return ip_vs_in_icmp(pskb, &r);
+       return ip_vs_in_icmp(pskb, &r, hooknum);
 }
 
 
diff -Nru a/net/ipv4/netfilter/ip_conntrack_core.c 
b/net/ipv4/netfilter/ip_conntrack_core.c
--- a/net/ipv4/netfilter/ip_conntrack_core.c    2005-01-27 00:22:21 +01:00
+++ b/net/ipv4/netfilter/ip_conntrack_core.c    2005-01-27 00:22:21 +01:00
@@ -936,29 +936,22 @@
        }
 }
 
-int ip_ct_no_defrag;
-
 /* Returns new sk_buff, or NULL */
 struct sk_buff *
-ip_ct_gather_frags(struct sk_buff *skb)
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
        struct sock *sk = skb->sk;
 #ifdef CONFIG_NETFILTER_DEBUG
        unsigned int olddebug = skb->nf_debug;
 #endif
 
-       if (unlikely(ip_ct_no_defrag)) {
-               kfree_skb(skb);
-               return NULL;
-       }
-
        if (sk) {
                sock_hold(sk);
                skb_orphan(skb);
        }
 
        local_bh_disable(); 
-       skb = ip_defrag(skb);
+       skb = ip_defrag(skb, user);
        local_bh_enable();
 
        if (!skb) {
diff -Nru a/net/ipv4/netfilter/ip_conntrack_standalone.c 
b/net/ipv4/netfilter/ip_conntrack_standalone.c
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c      2005-01-27 00:22:21 
+01:00
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c      2005-01-27 00:22:21 
+01:00
@@ -391,7 +391,10 @@
 
        /* Gather fragments. */
        if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
-               *pskb = ip_ct_gather_frags(*pskb);
+               *pskb = ip_ct_gather_frags(*pskb,
+                                          hooknum == NF_IP_PRE_ROUTING ? 
+                                          IP_DEFRAG_CONNTRACK_IN :
+                                          IP_DEFRAG_CONNTRACK_OUT);
                if (!*pskb)
                        return NF_STOLEN;
        }
@@ -823,12 +826,6 @@
  cleanup_defraglocalops:
        nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
  cleanup_defragops:
-       /* Frag queues may hold fragments with skb->dst == NULL */
-       ip_ct_no_defrag = 1;
-       synchronize_net();
-       local_bh_disable();
-       ipfrag_flush();
-       local_bh_enable();
        nf_unregister_hook(&ip_conntrack_defrag_ops);
  cleanup_proc_stat:
 #ifdef CONFIG_PROC_FS
diff -Nru a/net/ipv4/netfilter/ip_nat_standalone.c 
b/net/ipv4/netfilter/ip_nat_standalone.c
--- a/net/ipv4/netfilter/ip_nat_standalone.c    2005-01-27 00:22:21 +01:00
+++ b/net/ipv4/netfilter/ip_nat_standalone.c    2005-01-27 00:22:21 +01:00
@@ -195,7 +195,7 @@
           I'm starting to have nightmares about fragments.  */
 
        if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
-               *pskb = ip_ct_gather_frags(*pskb);
+               *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT);
 
                if (!*pskb)
                        return NF_STOLEN;
<Prev in Thread] Current Thread [Next in Thread>