netdev
[Top] [All Lists]

Re: nfmark routing in ip_route_output()

To: kuznet@xxxxxxxxxxxxx
Subject: Re: nfmark routing in ip_route_output()
From: Rusty Russell <rusty@xxxxxxxxxxxxxxxx>
Date: Thu, 31 Aug 2000 22:29:11 +1100
Cc: netdev@xxxxxxxxxxx, ges@xxxxxxxxxx, netfilter@xxxxxxxxxxxxx
In-reply-to: Your message of "Sun, 13 Aug 2000 20:30:21 +0400." <200008131630.UAA04346@xxxxxxxxxxxxx>
Sender: owner-netdev@xxxxxxxxxxx
In message <200008131630.UAA04346@xxxxxxxxxxxxx> you write:
> If you do not want to depend on skb, add new function using
> rt_key as argument. You may even replace ip_route_output()
> with this new function everywhere, it will be a bit slower,
> but it is worth to do, because has lots of useful applications
> not bound to nfmark.

OK.  This is a minimal source-level change, so I don't break the routing
code this close to 2.4.0.

Responsibility for rerouting is now handled by the netfilter module which
alters the packet: this fixes my major design mistake, and removes
route_me_harder from the IP stack code.

It works (netfilter testsuite/00netfilter/10localmangle.sh): you can
now change the mark for LOCAL_OUT packets and they get rerouted as users
want.

Rusty.

diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/include/net/route.h 
working-2.4.0-test8-1/include/net/route.h
--- linux-2.4.0-test8-1/include/net/route.h     Wed Aug 30 19:40:05 2000
+++ working-2.4.0-test8-1/include/net/route.h   Wed Aug 30 23:51:06 2000
@@ -94,12 +94,13 @@
 
 extern struct ip_rt_acct *ip_rt_acct;
 
+struct in_device;
 extern void            ip_rt_init(void);
 extern void            ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
                                       u32 src, u8 tos, struct net_device *dev);
 extern void            ip_rt_advice(struct rtable **rp, int advice);
 extern void            rt_cache_flush(int how);
-extern int             ip_route_output(struct rtable **, u32 dst, u32 src, u32 
tos, int oif);
+extern int             ip_route_output_key(struct rtable **, const struct 
rt_key *key);
 extern int             ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 
tos, struct net_device *devin);
 extern unsigned short  ip_rt_frag_needed(struct iphdr *iph, unsigned short 
new_mtu);
 extern void            ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu);
@@ -110,6 +111,15 @@
 extern int             ip_rt_ioctl(unsigned int cmd, void *arg);
 extern void            ip_rt_get_source(u8 *src, struct rtable *rt);
 extern int             ip_rt_dump(struct sk_buff *skb,  struct 
netlink_callback *cb);
+
+/* Deprecated: use ip_route_output_key directly */
+extern __inline__ int ip_route_output(struct rtable **rp,
+                                     u32 daddr, u32 saddr, u32 tos, int oif)
+{
+       struct rt_key key = { dst:daddr, src:saddr, oif:oif, tos:tos };
+
+       return ip_route_output_key(rp, &key);
+}
 
 
 extern __inline__ void ip_rt_put(struct rtable * rt)
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/ipv4/route.c 
working-2.4.0-test8-1/net/ipv4/route.c
--- linux-2.4.0-test8-1/net/ipv4/route.c        Sun Aug 27 15:11:01 2000
+++ working-2.4.0-test8-1/net/ipv4/route.c      Wed Aug 30 23:14:18 2000
@@ -1610,7 +1610,7 @@
  * Major route resolver routine.
  */
 
-int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, 
int oif)
+int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
 {
        struct rt_key key;
        struct fib_result res;
@@ -1620,25 +1620,31 @@
        unsigned hash;
        int free_res = 0;
        int err;
+       u32 tos;
 
-       tos &= IPTOS_RT_MASK|RTO_ONLINK;
-       key.dst = daddr;
-       key.src = saddr;
+       tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK);
+       key.dst = oldkey->dst;
+       key.src = oldkey->src;
        key.tos = tos&IPTOS_RT_MASK;
        key.iif = loopback_dev.ifindex;
-       key.oif = oif;
+       key.oif = oldkey->oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       key.fwmark = oldkey->fwmark;
+#endif
        key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
        res.fi = NULL;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r = NULL;
 #endif
 
-       if (saddr) {
-               if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
+       if (oldkey->src) {
+               if (MULTICAST(oldkey->src)
+                   || BADCLASS(oldkey->src)
+                   || ZERONET(oldkey->src))
                        return -EINVAL;
 
                /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-               dev_out = ip_dev_find(saddr);
+               dev_out = ip_dev_find(oldkey->src);
                if (dev_out == NULL)
                        return -EINVAL;
 
@@ -1650,8 +1656,8 @@
                      of another iface. --ANK
                 */
 
-               if (oif == 0 &&
-                       (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
+               if (oldkey->oif == 0
+                   && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
                        /* Special hack: user can direct multicasts
                           and limited broadcast via necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -1674,8 +1680,8 @@
                        dev_put(dev_out);
                dev_out = NULL;
        }
-       if (oif) {
-               dev_out = dev_get_by_index(oif);
+       if (oldkey->oif) {
+               dev_out = dev_get_by_index(oldkey->oif);
                if (dev_out == NULL)
                        return -ENODEV;
                if (__in_dev_get(dev_out) == NULL) {
@@ -1683,15 +1689,15 @@
                        return -ENODEV; /* Wrong error code */
                }
 
-               if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
+               if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
                        if (!key.src)
                                key.src = inet_select_addr(dev_out, 0, 
RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!key.src) {
-                       if (MULTICAST(daddr))
+                       if (MULTICAST(oldkey->dst))
                                key.src = inet_select_addr(dev_out, 0, 
key.scope);
-                       else if (!daddr)
+                       else if (!oldkey->dst)
                                key.src = inet_select_addr(dev_out, 0, 
RT_SCOPE_HOST);
                }
        }
@@ -1712,7 +1718,7 @@
 
        if (fib_lookup(&key, &res)) {
                res.fi = NULL;
-               if (oif) {
+               if (oldkey->oif) {
                        /* Apparently, routing tables are wrong. Assume,
                           that the destination is on link.
 
@@ -1800,7 +1806,7 @@
        } else if (res.type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST|RTCF_LOCAL;
                read_lock(&inetdev_lock);
-               if (!__in_dev_get(dev_out) || 
!ip_check_mc(__in_dev_get(dev_out), daddr))
+               if (!__in_dev_get(dev_out) || 
!ip_check_mc(__in_dev_get(dev_out), oldkey->dst))
                        flags &= ~RTCF_LOCAL;
                read_unlock(&inetdev_lock);
                /* If multicast route do not exist use
@@ -1819,18 +1825,21 @@
 
        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags= DST_HOST;
-       rth->key.dst    = daddr;
+       rth->key.dst    = oldkey->dst;
        rth->key.tos    = tos;
-       rth->key.src    = saddr;
+       rth->key.src    = oldkey->src;
        rth->key.iif    = 0;
-       rth->key.oif    = oif;
+       rth->key.oif    = oldkey->oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+       rth->key.fwmark = oldkey->fwmark;
+#endif
        rth->rt_dst     = key.dst;
        rth->rt_src     = key.src;
 #ifdef CONFIG_IP_ROUTE_NAT
        rth->rt_dst_map = key.dst;
        rth->rt_src_map = key.src;
 #endif
-       rth->rt_iif     = oif ? : dev_out->ifindex;
+       rth->rt_iif     = oldkey->oif ? : dev_out->ifindex;
        rth->u.dst.dev  = dev_out;
        dev_hold(dev_out);
        rth->rt_gateway = key.dst;
@@ -1850,7 +1859,7 @@
                if (res.type == RTN_MULTICAST) {
                        struct in_device *in_dev = in_dev_get(dev_out);
                        if (in_dev) {
-                               if (IN_DEV_MFORWARD(in_dev) && 
!LOCAL_MCAST(daddr)) {
+                               if (IN_DEV_MFORWARD(in_dev) && 
!LOCAL_MCAST(oldkey->dst)) {
                                        rth->u.dst.input = ip_mr_input;
                                        rth->u.dst.output = ip_mc_output;
                                }
@@ -1864,7 +1873,7 @@
 
        rth->rt_flags = flags;
 
-       hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
+       hash = rt_hash_code(oldkey->dst, oldkey->src^(oldkey->oif<<5), tos);
        err = rt_intern_hash(hash, rth, rp);
 done:
        if (free_res)
@@ -1881,21 +1890,24 @@
        goto done;
 }
 
-int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
+int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
 {
        unsigned hash;
        struct rtable *rth;
 
-       hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
+       hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
 
        read_lock_bh(&rt_hash_table[hash].lock);
        for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
-               if (rth->key.dst == daddr &&
-                   rth->key.src == saddr &&
+               if (rth->key.dst == key->dst &&
+                   rth->key.src == key->src &&
                    rth->key.iif == 0 &&
-                   rth->key.oif == oif &&
-                   !((rth->key.tos^tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
-                   ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
+                   rth->key.oif == key->oif &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+                   rth->key.fwmark == key->fwmark &&
+#endif
+                   !((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
+                   ((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
                ) {
                        rth->u.dst.lastuse = jiffies;
                        dst_hold(&rth->u.dst);
@@ -1907,8 +1919,8 @@
        }
        read_unlock_bh(&rt_hash_table[hash].lock);
 
-       return ip_route_output_slow(rp, daddr, saddr, tos, oif);
-}
+       return ip_route_output_slow(rp, key);
+}      
 
 #ifdef CONFIG_RTNETLINK
 
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/netsyms.c 
working-2.4.0-test8-1/net/netsyms.c
--- linux-2.4.0-test8-1/net/netsyms.c   Sun Aug 27 15:11:01 2000
+++ working-2.4.0-test8-1/net/netsyms.c Wed Aug 30 23:22:35 2000
@@ -212,7 +212,7 @@
 EXPORT_SYMBOL(inetdev_lock);
 EXPORT_SYMBOL(inet_add_protocol);
 EXPORT_SYMBOL(inet_del_protocol);
-EXPORT_SYMBOL(ip_route_output);
+EXPORT_SYMBOL(ip_route_output_key);
 EXPORT_SYMBOL(ip_route_input);
 EXPORT_SYMBOL(icmp_send);
 EXPORT_SYMBOL(icmp_reply);
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/ipv4/igmp.c 
working-2.4.0-test8-1/net/ipv4/igmp.c
--- linux-2.4.0-test8-1/net/ipv4/igmp.c Sat Aug 12 00:23:39 2000
+++ working-2.4.0-test8-1/net/ipv4/igmp.c       Wed Aug 30 23:18:04 2000
@@ -184,7 +184,10 @@
 
 #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
 
-static inline int igmp_send_report2(struct sk_buff *skb)
+/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
+   changes route */
+static inline int
+output_maybe_reroute(struct sk_buff *skb)
 {
        return skb->dst->output(skb);
 }
@@ -247,7 +250,7 @@
        ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
 
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
-                      igmp_send_report2);
+                      output_maybe_reroute);
 }
 
 
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/ipv4/ip_output.c 
working-2.4.0-test8-1/net/ipv4/ip_output.c
--- linux-2.4.0-test8-1/net/ipv4/ip_output.c    Tue Aug 29 14:39:28 2000
+++ working-2.4.0-test8-1/net/ipv4/ip_output.c  Wed Aug 30 23:18:15 2000
@@ -107,42 +107,11 @@
        return 0;
 }
 
-#ifdef CONFIG_NETFILTER
-/* To preserve the cute illusion that a locally-generated packet can
-   be mangled before routing, we actually reroute if a hook altered
-   the packet. -RR */
-static int route_me_harder(struct sk_buff *skb)
-{
-       struct iphdr *iph = skb->nh.iph;
-       struct rtable *rt;
-
-       if (ip_route_output(&rt, iph->daddr, iph->saddr,
-                           RT_TOS(iph->tos) | RTO_CONN,
-                           skb->sk ? skb->sk->bound_dev_if : 0)) {
-               printk("route_me_harder: No more route.\n");
-               return -EINVAL;
-       }
-
-       /* Drop old route. */
-       dst_release(skb->dst);
-
-       skb->dst = &rt->u.dst;
-       return 0;
-}
-#endif
-
-/* Do route recalc if netfilter changes skb. */
+/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
+   changes route */
 static inline int
 output_maybe_reroute(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER
-       if (skb->nfcache & NFC_ALTERED) {
-               if (route_me_harder(skb) != 0) {
-                       kfree_skb(skb);
-                       return -EINVAL;
-               }
-       }
-#endif
        return skb->dst->output(skb);
 }
 
@@ -311,25 +280,6 @@
        struct rtable *rt = (struct rtable *)skb->dst;
        struct net_device *dev;
        struct iphdr *iph = skb->nh.iph;
-
-#ifdef CONFIG_NETFILTER
-       /* BLUE-PEN-FOR-ALEXEY.  I don't understand; you mean I can't
-           hold the route as I pass the packet to userspace? -- RR
-
-          You may hold it, if you really hold it. F.e. if netfilter
-          does not destroy handed skb with skb->dst attached, it
-          will be held. When it was stored in info->arg, then
-          it was not held apparently. Now (without second arg) it is evident,
-          that it is clean.                               --ANK
-        */
-       if (rt==NULL || (skb->nfcache & NFC_ALTERED)) {
-               if (route_me_harder(skb) != 0) {
-                       kfree_skb(skb);
-                       return -EHOSTUNREACH;
-               }
-               rt = (struct rtable *)skb->dst;
-       }
-#endif
 
        dev = rt->u.dst.dev;
 
diff -urN -X /tmp/file5G2Cgt --minimal 
linux-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c 
working-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c
--- linux-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c  Fri Jul 28 
21:36:46 2000
+++ working-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c        Wed Aug 
30 21:31:11 2000
@@ -161,6 +161,31 @@
        return ip_nat_fn(hooknum, pskb, in, out, okfn);
 }
 
+/* FIXME: change in oif may mean change in hh_len.  Check and realloc
+   --RR */
+static int
+route_me_harder(struct sk_buff *skb)
+{
+       struct iphdr *iph = skb->nh.iph;
+       struct rtable *rt;
+       struct rt_key key = { dst:iph->daddr,
+                             src:iph->saddr,
+                             oif:skb->sk ? skb->sk->bound_dev_if : 0,
+                             tos:RT_TOS(iph->tos)|RTO_CONN,
+                             fwmark:skb->nfmark };
+
+       if (ip_route_output_key(&rt, &key) != 0) {
+               printk("route_me_harder: No more route.\n");
+               return -EINVAL;
+       }
+
+       /* Drop old route. */
+       dst_release(skb->dst);
+
+       skb->dst = &rt->u.dst;
+       return 0;
+}
+
 static unsigned int
 ip_nat_local_fn(unsigned int hooknum,
                struct sk_buff **pskb,
@@ -168,12 +193,23 @@
                const struct net_device *out,
                int (*okfn)(struct sk_buff *))
 {
+       u_int32_t saddr, daddr;
+       unsigned int ret;
+
        /* root is playing with raw sockets. */
        if ((*pskb)->len < sizeof(struct iphdr)
            || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
                return NF_ACCEPT;
 
-       return ip_nat_fn(hooknum, pskb, in, out, okfn);
+       saddr = (*pskb)->nh.iph->saddr;
+       daddr = (*pskb)->nh.iph->daddr;
+
+       ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+       if (ret != NF_DROP && ret != NF_STOLEN
+           && ((*pskb)->nh.iph->saddr != saddr
+               || (*pskb)->nh.iph->daddr != daddr))
+               return route_me_harder(*pskb) == 0 ? ret : NF_DROP;
+       return ret;
 }
 
 /* We must be after connection tracking and before packet filtering. */
diff -urN -X /tmp/file5G2Cgt --minimal 
linux-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c 
working-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c
--- linux-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c     Tue May 23 
02:32:57 2000
+++ working-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c   Wed Aug 30 
23:51:40 2000
@@ -5,6 +5,11 @@
  */
 #include <linux/module.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
 
 #define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
 
@@ -86,6 +91,31 @@
        return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
 }
 
+/* FIXME: change in oif may mean change in hh_len.  Check and realloc
+   --RR */
+static int
+route_me_harder(struct sk_buff *skb)
+{
+       struct iphdr *iph = skb->nh.iph;
+       struct rtable *rt;
+       struct rt_key key = { dst:iph->daddr,
+                             src:iph->saddr,
+                             oif:skb->sk ? skb->sk->bound_dev_if : 0,
+                             tos:RT_TOS(iph->tos)|RTO_CONN,
+                             fwmark:skb->nfmark };
+
+       if (ip_route_output_key(&rt, &key) != 0) {
+               printk("route_me_harder: No more route.\n");
+               return -EINVAL;
+       }
+
+       /* Drop old route. */
+       dst_release(skb->dst);
+
+       skb->dst = &rt->u.dst;
+       return 0;
+}
+
 static unsigned int
 ipt_local_out_hook(unsigned int hook,
                   struct sk_buff **pskb,
@@ -93,6 +123,11 @@
                   const struct net_device *out,
                   int (*okfn)(struct sk_buff *))
 {
+       unsigned int ret;
+       u_int8_t tos;
+       u_int32_t saddr, daddr;
+       unsigned long nfmark;
+
        /* root is playing with raw sockets. */
        if ((*pskb)->len < sizeof(struct iphdr)
            || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
@@ -101,7 +136,22 @@
                return NF_ACCEPT;
        }
 
-       return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+       /* Save things which could affect route */
+       nfmark = (*pskb)->nfmark;
+       saddr = (*pskb)->nh.iph->saddr;
+       daddr = (*pskb)->nh.iph->daddr;
+       tos = (*pskb)->nh.iph->tos;
+
+       ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+       /* Reroute for ANY change. */
+       if (ret != NF_DROP && ret != NF_STOLEN
+           && ((*pskb)->nh.iph->saddr != saddr
+               || (*pskb)->nh.iph->daddr != daddr
+               || (*pskb)->nfmark != nfmark
+               || (*pskb)->nh.iph->tos != tos))
+               return route_me_harder(*pskb) == 0 ? ret : NF_DROP;
+
+       return ret;
 }
 
 static struct nf_hook_ops ipt_ops[]

--
Hacking time.

<Prev in Thread] Current Thread [Next in Thread>