In message <200008131630.UAA04346@xxxxxxxxxxxxx> you write:
> If you do not want to depend on skb, add new function using
> rt_key as argument. You may even replace ip_route_output()
> with this new function everywhere, it will be a bit slower,
> but it is worth to do, because has lots of useful applications
> not bound to nfmark.
OK. This is a minimal source-level change, so that I don't break the
routing code this close to 2.4.0.
Responsibility for rerouting now lies with whichever netfilter module
alters the packet: this fixes my major design mistake, and removes
route_me_harder from the IP stack code.
It works (netfilter testsuite/00netfilter/10localmangle.sh): you can
now change the mark for LOCAL_OUT packets and they get rerouted as
users expect.
Rusty.
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/include/net/route.h
working-2.4.0-test8-1/include/net/route.h
--- linux-2.4.0-test8-1/include/net/route.h Wed Aug 30 19:40:05 2000
+++ working-2.4.0-test8-1/include/net/route.h Wed Aug 30 23:51:06 2000
@@ -94,12 +94,13 @@
extern struct ip_rt_acct *ip_rt_acct;
+struct in_device;
extern void ip_rt_init(void);
extern void ip_rt_redirect(u32 old_gw, u32 dst, u32 new_gw,
u32 src, u8 tos, struct net_device *dev);
extern void ip_rt_advice(struct rtable **rp, int advice);
extern void rt_cache_flush(int how);
-extern int ip_route_output(struct rtable **, u32 dst, u32 src, u32
tos, int oif);
+extern int ip_route_output_key(struct rtable **, const struct
rt_key *key);
extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8
tos, struct net_device *devin);
extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short
new_mtu);
extern void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu);
@@ -110,6 +111,15 @@
extern int ip_rt_ioctl(unsigned int cmd, void *arg);
extern void ip_rt_get_source(u8 *src, struct rtable *rt);
extern int ip_rt_dump(struct sk_buff *skb, struct
netlink_callback *cb);
+
+/* Deprecated: use ip_route_output_key directly */
+extern __inline__ int ip_route_output(struct rtable **rp,
+ u32 daddr, u32 saddr, u32 tos, int oif)
+{
+ struct rt_key key = { dst:daddr, src:saddr, oif:oif, tos:tos };
+
+ return ip_route_output_key(rp, &key);
+}
extern __inline__ void ip_rt_put(struct rtable * rt)
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/ipv4/route.c
working-2.4.0-test8-1/net/ipv4/route.c
--- linux-2.4.0-test8-1/net/ipv4/route.c Sun Aug 27 15:11:01 2000
+++ working-2.4.0-test8-1/net/ipv4/route.c Wed Aug 30 23:14:18 2000
@@ -1610,7 +1610,7 @@
* Major route resolver routine.
*/
-int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos,
int oif)
+int ip_route_output_slow(struct rtable **rp, const struct rt_key *oldkey)
{
struct rt_key key;
struct fib_result res;
@@ -1620,25 +1620,31 @@
unsigned hash;
int free_res = 0;
int err;
+ u32 tos;
- tos &= IPTOS_RT_MASK|RTO_ONLINK;
- key.dst = daddr;
- key.src = saddr;
+ tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK);
+ key.dst = oldkey->dst;
+ key.src = oldkey->src;
key.tos = tos&IPTOS_RT_MASK;
key.iif = loopback_dev.ifindex;
- key.oif = oif;
+ key.oif = oldkey->oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ key.fwmark = oldkey->fwmark;
+#endif
key.scope = (tos&RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
res.r = NULL;
#endif
- if (saddr) {
- if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr))
+ if (oldkey->src) {
+ if (MULTICAST(oldkey->src)
+ || BADCLASS(oldkey->src)
+ || ZERONET(oldkey->src))
return -EINVAL;
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
- dev_out = ip_dev_find(saddr);
+ dev_out = ip_dev_find(oldkey->src);
if (dev_out == NULL)
return -EINVAL;
@@ -1650,8 +1656,8 @@
of another iface. --ANK
*/
- if (oif == 0 &&
- (MULTICAST(daddr) || daddr == 0xFFFFFFFF)) {
+ if (oldkey->oif == 0
+ && (MULTICAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF)) {
/* Special hack: user can direct multicasts
and limited broadcast via necessary interface
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -1674,8 +1680,8 @@
dev_put(dev_out);
dev_out = NULL;
}
- if (oif) {
- dev_out = dev_get_by_index(oif);
+ if (oldkey->oif) {
+ dev_out = dev_get_by_index(oldkey->oif);
if (dev_out == NULL)
return -ENODEV;
if (__in_dev_get(dev_out) == NULL) {
@@ -1683,15 +1689,15 @@
return -ENODEV; /* Wrong error code */
}
- if (LOCAL_MCAST(daddr) || daddr == 0xFFFFFFFF) {
+ if (LOCAL_MCAST(oldkey->dst) || oldkey->dst == 0xFFFFFFFF) {
if (!key.src)
key.src = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
goto make_route;
}
if (!key.src) {
- if (MULTICAST(daddr))
+ if (MULTICAST(oldkey->dst))
key.src = inet_select_addr(dev_out, 0,
key.scope);
- else if (!daddr)
+ else if (!oldkey->dst)
key.src = inet_select_addr(dev_out, 0,
RT_SCOPE_HOST);
}
}
@@ -1712,7 +1718,7 @@
if (fib_lookup(&key, &res)) {
res.fi = NULL;
- if (oif) {
+ if (oldkey->oif) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -1800,7 +1806,7 @@
} else if (res.type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST|RTCF_LOCAL;
read_lock(&inetdev_lock);
- if (!__in_dev_get(dev_out) ||
!ip_check_mc(__in_dev_get(dev_out), daddr))
+ if (!__in_dev_get(dev_out) ||
!ip_check_mc(__in_dev_get(dev_out), oldkey->dst))
flags &= ~RTCF_LOCAL;
read_unlock(&inetdev_lock);
/* If multicast route do not exist use
@@ -1819,18 +1825,21 @@
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
- rth->key.dst = daddr;
+ rth->key.dst = oldkey->dst;
rth->key.tos = tos;
- rth->key.src = saddr;
+ rth->key.src = oldkey->src;
rth->key.iif = 0;
- rth->key.oif = oif;
+ rth->key.oif = oldkey->oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = oldkey->fwmark;
+#endif
rth->rt_dst = key.dst;
rth->rt_src = key.src;
#ifdef CONFIG_IP_ROUTE_NAT
rth->rt_dst_map = key.dst;
rth->rt_src_map = key.src;
#endif
- rth->rt_iif = oif ? : dev_out->ifindex;
+ rth->rt_iif = oldkey->oif ? : dev_out->ifindex;
rth->u.dst.dev = dev_out;
dev_hold(dev_out);
rth->rt_gateway = key.dst;
@@ -1850,7 +1859,7 @@
if (res.type == RTN_MULTICAST) {
struct in_device *in_dev = in_dev_get(dev_out);
if (in_dev) {
- if (IN_DEV_MFORWARD(in_dev) &&
!LOCAL_MCAST(daddr)) {
+ if (IN_DEV_MFORWARD(in_dev) &&
!LOCAL_MCAST(oldkey->dst)) {
rth->u.dst.input = ip_mr_input;
rth->u.dst.output = ip_mc_output;
}
@@ -1864,7 +1873,7 @@
rth->rt_flags = flags;
- hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
+ hash = rt_hash_code(oldkey->dst, oldkey->src^(oldkey->oif<<5), tos);
err = rt_intern_hash(hash, rth, rp);
done:
if (free_res)
@@ -1881,21 +1890,24 @@
goto done;
}
-int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
+int ip_route_output_key(struct rtable **rp, const struct rt_key *key)
{
unsigned hash;
struct rtable *rth;
- hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
+ hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
read_lock_bh(&rt_hash_table[hash].lock);
for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
- if (rth->key.dst == daddr &&
- rth->key.src == saddr &&
+ if (rth->key.dst == key->dst &&
+ rth->key.src == key->src &&
rth->key.iif == 0 &&
- rth->key.oif == oif &&
- !((rth->key.tos^tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
- ((tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
+ rth->key.oif == key->oif &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark == key->fwmark &&
+#endif
+ !((rth->key.tos^key->tos)&(IPTOS_RT_MASK|RTO_ONLINK)) &&
+ ((key->tos&RTO_TPROXY) || !(rth->rt_flags&RTCF_TPROXY))
) {
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
@@ -1907,8 +1919,8 @@
}
read_unlock_bh(&rt_hash_table[hash].lock);
- return ip_route_output_slow(rp, daddr, saddr, tos, oif);
-}
+ return ip_route_output_slow(rp, key);
+}
#ifdef CONFIG_RTNETLINK
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/netsyms.c
working-2.4.0-test8-1/net/netsyms.c
--- linux-2.4.0-test8-1/net/netsyms.c Sun Aug 27 15:11:01 2000
+++ working-2.4.0-test8-1/net/netsyms.c Wed Aug 30 23:22:35 2000
@@ -212,7 +212,7 @@
EXPORT_SYMBOL(inetdev_lock);
EXPORT_SYMBOL(inet_add_protocol);
EXPORT_SYMBOL(inet_del_protocol);
-EXPORT_SYMBOL(ip_route_output);
+EXPORT_SYMBOL(ip_route_output_key);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(icmp_send);
EXPORT_SYMBOL(icmp_reply);
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/ipv4/igmp.c
working-2.4.0-test8-1/net/ipv4/igmp.c
--- linux-2.4.0-test8-1/net/ipv4/igmp.c Sat Aug 12 00:23:39 2000
+++ working-2.4.0-test8-1/net/ipv4/igmp.c Wed Aug 30 23:18:04 2000
@@ -184,7 +184,10 @@
#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
-static inline int igmp_send_report2(struct sk_buff *skb)
+/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
+ changes route */
+static inline int
+output_maybe_reroute(struct sk_buff *skb)
{
return skb->dst->output(skb);
}
@@ -247,7 +250,7 @@
ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- igmp_send_report2);
+ output_maybe_reroute);
}
diff -urN -X /tmp/file5G2Cgt --minimal linux-2.4.0-test8-1/net/ipv4/ip_output.c
working-2.4.0-test8-1/net/ipv4/ip_output.c
--- linux-2.4.0-test8-1/net/ipv4/ip_output.c Tue Aug 29 14:39:28 2000
+++ working-2.4.0-test8-1/net/ipv4/ip_output.c Wed Aug 30 23:18:15 2000
@@ -107,42 +107,11 @@
return 0;
}
-#ifdef CONFIG_NETFILTER
-/* To preserve the cute illusion that a locally-generated packet can
- be mangled before routing, we actually reroute if a hook altered
- the packet. -RR */
-static int route_me_harder(struct sk_buff *skb)
-{
- struct iphdr *iph = skb->nh.iph;
- struct rtable *rt;
-
- if (ip_route_output(&rt, iph->daddr, iph->saddr,
- RT_TOS(iph->tos) | RTO_CONN,
- skb->sk ? skb->sk->bound_dev_if : 0)) {
- printk("route_me_harder: No more route.\n");
- return -EINVAL;
- }
-
- /* Drop old route. */
- dst_release(skb->dst);
-
- skb->dst = &rt->u.dst;
- return 0;
-}
-#endif
-
-/* Do route recalc if netfilter changes skb. */
+/* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
+ changes route */
static inline int
output_maybe_reroute(struct sk_buff *skb)
{
-#ifdef CONFIG_NETFILTER
- if (skb->nfcache & NFC_ALTERED) {
- if (route_me_harder(skb) != 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
- }
-#endif
return skb->dst->output(skb);
}
@@ -311,25 +280,6 @@
struct rtable *rt = (struct rtable *)skb->dst;
struct net_device *dev;
struct iphdr *iph = skb->nh.iph;
-
-#ifdef CONFIG_NETFILTER
- /* BLUE-PEN-FOR-ALEXEY. I don't understand; you mean I can't
- hold the route as I pass the packet to userspace? -- RR
-
- You may hold it, if you really hold it. F.e. if netfilter
- does not destroy handed skb with skb->dst attached, it
- will be held. When it was stored in info->arg, then
- it was not held apparently. Now (without second arg) it is evident,
- that it is clean. --ANK
- */
- if (rt==NULL || (skb->nfcache & NFC_ALTERED)) {
- if (route_me_harder(skb) != 0) {
- kfree_skb(skb);
- return -EHOSTUNREACH;
- }
- rt = (struct rtable *)skb->dst;
- }
-#endif
dev = rt->u.dst.dev;
diff -urN -X /tmp/file5G2Cgt --minimal
linux-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c
working-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c
--- linux-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c Fri Jul 28
21:36:46 2000
+++ working-2.4.0-test8-1/net/ipv4/netfilter/ip_nat_standalone.c Wed Aug
30 21:31:11 2000
@@ -161,6 +161,31 @@
return ip_nat_fn(hooknum, pskb, in, out, okfn);
}
+/* FIXME: change in oif may mean change in hh_len. Check and realloc
+ --RR */
+static int
+route_me_harder(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct rtable *rt;
+ struct rt_key key = { dst:iph->daddr,
+ src:iph->saddr,
+ oif:skb->sk ? skb->sk->bound_dev_if : 0,
+ tos:RT_TOS(iph->tos)|RTO_CONN,
+ fwmark:skb->nfmark };
+
+ if (ip_route_output_key(&rt, &key) != 0) {
+ printk("route_me_harder: No more route.\n");
+ return -EINVAL;
+ }
+
+ /* Drop old route. */
+ dst_release(skb->dst);
+
+ skb->dst = &rt->u.dst;
+ return 0;
+}
+
static unsigned int
ip_nat_local_fn(unsigned int hooknum,
struct sk_buff **pskb,
@@ -168,12 +193,23 @@
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ u_int32_t saddr, daddr;
+ unsigned int ret;
+
/* root is playing with raw sockets. */
if ((*pskb)->len < sizeof(struct iphdr)
|| (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
return NF_ACCEPT;
- return ip_nat_fn(hooknum, pskb, in, out, okfn);
+ saddr = (*pskb)->nh.iph->saddr;
+ daddr = (*pskb)->nh.iph->daddr;
+
+ ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+ if (ret != NF_DROP && ret != NF_STOLEN
+ && ((*pskb)->nh.iph->saddr != saddr
+ || (*pskb)->nh.iph->daddr != daddr))
+ return route_me_harder(*pskb) == 0 ? ret : NF_DROP;
+ return ret;
}
/* We must be after connection tracking and before packet filtering. */
diff -urN -X /tmp/file5G2Cgt --minimal
linux-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c
working-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c
--- linux-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c Tue May 23
02:32:57 2000
+++ working-2.4.0-test8-1/net/ipv4/netfilter/iptable_mangle.c Wed Aug 30
23:51:40 2000
@@ -5,6 +5,11 @@
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
@@ -86,6 +91,31 @@
return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
}
+/* FIXME: change in oif may mean change in hh_len. Check and realloc
+ --RR */
+static int
+route_me_harder(struct sk_buff *skb)
+{
+ struct iphdr *iph = skb->nh.iph;
+ struct rtable *rt;
+ struct rt_key key = { dst:iph->daddr,
+ src:iph->saddr,
+ oif:skb->sk ? skb->sk->bound_dev_if : 0,
+ tos:RT_TOS(iph->tos)|RTO_CONN,
+ fwmark:skb->nfmark };
+
+ if (ip_route_output_key(&rt, &key) != 0) {
+ printk("route_me_harder: No more route.\n");
+ return -EINVAL;
+ }
+
+ /* Drop old route. */
+ dst_release(skb->dst);
+
+ skb->dst = &rt->u.dst;
+ return 0;
+}
+
static unsigned int
ipt_local_out_hook(unsigned int hook,
struct sk_buff **pskb,
@@ -93,6 +123,11 @@
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ unsigned int ret;
+ u_int8_t tos;
+ u_int32_t saddr, daddr;
+ unsigned long nfmark;
+
/* root is playing with raw sockets. */
if ((*pskb)->len < sizeof(struct iphdr)
|| (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
@@ -101,7 +136,22 @@
return NF_ACCEPT;
}
- return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+ /* Save things which could affect route */
+ nfmark = (*pskb)->nfmark;
+ saddr = (*pskb)->nh.iph->saddr;
+ daddr = (*pskb)->nh.iph->daddr;
+ tos = (*pskb)->nh.iph->tos;
+
+ ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+ /* Reroute for ANY change. */
+ if (ret != NF_DROP && ret != NF_STOLEN
+ && ((*pskb)->nh.iph->saddr != saddr
+ || (*pskb)->nh.iph->daddr != daddr
+ || (*pskb)->nfmark != nfmark
+ || (*pskb)->nh.iph->tos != tos))
+ return route_me_harder(*pskb) == 0 ? ret : NF_DROP;
+
+ return ret;
}
static struct nf_hook_ops ipt_ops[]
--
Hacking time.
|