This patch lets you set the fwmark for locally-originated packets on a
per-socket basis. This means that with a little application tweaking
(add an ioctl call) you can control packet routing on a per-socket
basis. Select QoS on each connection, load-balance by hand, slice,
dice!
I've written a patch to do this. It does:
1. Add ioctls (SIOCSFWMARK / SIOCGFWMARK) to set and get the fwmark for a socket
2. Make sure that the fwmark is passed to the routing functions
2a. Add new route function ip_route_output_sk which fetches necessary
data out of sk (currently sk->bound_dev_if and sk->nfmark) and
stuffs it into a route key
2b. Convert relevant calls to ip_route_output to use
ip_route_output_sk
2c. Convert ip_route_connect to use ip_route_output_sk
3. Change ip_queue_xmit to copy sk->nfmark into skb->nfmark on
outgoing packets (only when sk->nfmark is non-zero)
4. (unrelated bonus patch) Initialize key correctly in fib_frontend;
the old "key.foo = bar...." lines didn't initialize fwmark. The
new initializer zeros all unused fields
Question 1: Do I want to hook in at ip_queue_xmit, or is there a better
place?
Question 2: Do I want to send this to some other mailing list (like
linux-net@vger)?
-david
patches follow
diff -X ~/dontdiff -Naur linux-2.4.16/include/linux/sockios.h
linux-2.4.16-fwmark/include/linux/sockios.h
--- linux-2.4.16/include/linux/sockios.h Tue Dec 11 14:47:10 2001
+++ linux-2.4.16-fwmark/include/linux/sockios.h Sat Dec 8 14:48:17 2001
@@ -105,6 +105,13 @@
#define SIOCGIFVLAN 0x8982 /* 802.1Q VLAN support */
#define SIOCSIFVLAN 0x8983 /* Set 802.1Q VLAN options */
+/* Set netfilter fwmark on packets for this connection */
+#define SIOCSFWMARK 0x8984 /* Set netfilter fwmark on packets
from this cxn */
+#define SIOCGFWMARK 0x8985
+
+
+
+
/* bonding calls */
#define SIOCBONDENSLAVE 0x8990 /* enslave a device to the bond
*/
diff -X ~/dontdiff -Naur linux-2.4.16/include/net/route.h
linux-2.4.16-fwmark/include/net/route.h
--- linux-2.4.16/include/net/route.h Tue Dec 11 14:47:27 2001
+++ linux-2.4.16-fwmark/include/net/route.h Tue Dec 11 12:17:18 2001
@@ -27,6 +27,7 @@
#include <linux/config.h>
#include <net/dst.h>
#include <net/inetpeer.h>
+#include <net/sock.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
#include <linux/route.h>
@@ -140,6 +141,17 @@
return ip_route_output_key(rp, &key);
}
+static inline int ip_route_output_sk(struct rtable **rp,
+ u32 daddr, u32 saddr, u32 tos, const
struct sock *sk)
+{
+ struct rt_key key = { dst:daddr, src:saddr, oif:sk->bound_dev_if,
tos:tos,
+#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE)
+ fwmark:sk->nfmark,
+#endif
+ };
+ return ip_route_output_key(rp, &key);
+}
+
static inline void ip_rt_put(struct rtable * rt)
{
@@ -156,17 +168,17 @@
return ip_tos2prio[IPTOS_TOS(tos)>>1];
}
-static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32
tos, int oif)
+static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32
tos, const struct sock *sk)
{
int err;
- err = ip_route_output(rp, dst, src, tos, oif);
+ err = ip_route_output_sk(rp, dst, src, tos, sk);
if (err || (dst && src))
return err;
dst = (*rp)->rt_dst;
src = (*rp)->rt_src;
ip_rt_put(*rp);
*rp = NULL;
- return ip_route_output(rp, dst, src, tos, oif);
+ return ip_route_output_sk(rp, dst, src, tos, sk);
}
extern void rt_bind_peer(struct rtable *rt, int create);
diff -X ~/dontdiff -Naur linux-2.4.16/include/net/sock.h
linux-2.4.16-fwmark/include/net/sock.h
--- linux-2.4.16/include/net/sock.h Tue Dec 11 14:47:28 2001
+++ linux-2.4.16-fwmark/include/net/sock.h Tue Dec 11 12:17:05 2001
@@ -602,6 +602,10 @@
long rcvtimeo;
long sndtimeo;
+#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE)
+ int nfmark; /* Set nfmark on outgoing
packets if non-zero */
+#endif
+
#ifdef CONFIG_FILTER
/* Socket Filtering Instructions */
struct sk_filter *filter;
diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/af_inet.c
linux-2.4.16-fwmark/net/ipv4/af_inet.c
--- linux-2.4.16/net/ipv4/af_inet.c Tue Dec 11 14:48:09 2001
+++ linux-2.4.16-fwmark/net/ipv4/af_inet.c Mon Dec 10 16:50:07 2001
@@ -931,6 +931,23 @@
#endif
return -ENOPKG;
+#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE)
+ case SIOCSFWMARK:
+ err = get_user(sk->nfmark,(int *) arg);
+ if (err) {
+ return err;
+ }
+ sk_dst_reset(sk);
+ break;
+ case SIOCGFWMARK:
+ err = put_user(sk->nfmark,(int *) arg);
+ if (err) {
+ return err;
+ }
+ break;
+#endif
+
+
default:
if ((cmd >= SIOCDEVPRIVATE) &&
(cmd <= (SIOCDEVPRIVATE + 15)))
diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/fib_frontend.c
linux-2.4.16-fwmark/net/ipv4/fib_frontend.c
--- linux-2.4.16/net/ipv4/fib_frontend.c Tue Dec 11 14:48:10 2001
+++ linux-2.4.16-fwmark/net/ipv4/fib_frontend.c Sat Dec 8 14:47:25 2001
@@ -207,17 +207,10 @@
struct net_device *dev, u32 *spec_dst, u32 *itag)
{
struct in_device *in_dev;
- struct rt_key key;
+ struct rt_key key = { dst:src, src:dst, tos:tos,
oif:0,iif:oif,scope:RT_SCOPE_UNIVERSE};
struct fib_result res;
int no_addr, rpf;
int ret;
-
- key.dst = src;
- key.src = dst;
- key.tos = tos;
- key.oif = 0;
- key.iif = oif;
- key.scope = RT_SCOPE_UNIVERSE;
no_addr = rpf = 0;
read_lock(&inetdev_lock);
diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/ip_output.c
linux-2.4.16-fwmark/net/ipv4/ip_output.c
--- linux-2.4.16/net/ipv4/ip_output.c Tue Dec 11 14:48:14 2001
+++ linux-2.4.16-fwmark/net/ipv4/ip_output.c Sat Dec 8 14:47:25 2001
@@ -345,6 +345,12 @@
struct rtable *rt;
struct iphdr *iph;
+#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE)
+ if (sk->nfmark) {
+ skb->nfmark=sk->nfmark;
+ }
+#endif
+
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
@@ -366,9 +372,9 @@
* keep trying until route appears or the connection times
itself
* out.
*/
- if (ip_route_output(&rt, daddr, sk->saddr,
+ if (ip_route_output_sk(&rt, daddr, sk->saddr,
RT_CONN_FLAGS(sk),
- sk->bound_dev_if))
+ sk))
goto no_route;
__sk_dst_set(sk, &rt->u.dst);
sk->route_caps = rt->u.dst.dev->features;
@@ -964,6 +970,7 @@
daddr = replyopts.opt.faddr;
}
+ /* XXX should this use sk->oif ? */
if (ip_route_output(&rt, daddr, rt->rt_spec_dst,
RT_TOS(skb->nh.iph->tos), 0))
return;
diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/tcp_ipv4.c
linux-2.4.16-fwmark/net/ipv4/tcp_ipv4.c
--- linux-2.4.16/net/ipv4/tcp_ipv4.c Tue Dec 11 14:48:27 2001
+++ linux-2.4.16-fwmark/net/ipv4/tcp_ipv4.c Sat Dec 8 14:47:25 2001
@@ -667,7 +667,7 @@
}
tmp = ip_route_connect(&rt, nexthop, sk->saddr,
- RT_CONN_FLAGS(sk), sk->bound_dev_if);
+ RT_CONN_FLAGS(sk), sk);
if (tmp < 0)
return tmp;
@@ -1150,11 +1150,11 @@
struct ip_options *opt;
opt = req->af.v4_req.opt;
- if(ip_route_output(&rt, ((opt && opt->srr) ?
+ if(ip_route_output_sk(&rt, ((opt && opt->srr) ?
opt->faddr :
req->af.v4_req.rmt_addr),
req->af.v4_req.loc_addr,
- RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
+ RT_CONN_FLAGS(sk), sk)) {
IP_INC_STATS_BH(IpOutNoRoutes);
return NULL;
}
@@ -1733,7 +1733,7 @@
/* Query new route. */
err = ip_route_connect(&rt, daddr, 0,
RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
- sk->bound_dev_if);
+ sk);
if (err)
return err;
@@ -1781,8 +1781,8 @@
if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
daddr = sk->protinfo.af_inet.opt->faddr;
- err = ip_route_output(&rt, daddr, sk->saddr,
- RT_CONN_FLAGS(sk), sk->bound_dev_if);
+ err = ip_route_output_sk(&rt, daddr, sk->saddr,
+ RT_CONN_FLAGS(sk), sk);
if (!err) {
__sk_dst_set(sk, &rt->u.dst);
sk->route_caps = rt->u.dst.dev->features;
diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/udp.c
linux-2.4.16-fwmark/net/ipv4/udp.c
--- linux-2.4.16/net/ipv4/udp.c Tue Dec 11 14:48:29 2001
+++ linux-2.4.16-fwmark/net/ipv4/udp.c Sat Dec 8 14:47:25 2001
@@ -724,7 +724,7 @@
sk_dst_reset(sk);
err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr,
- RT_CONN_FLAGS(sk), sk->bound_dev_if);
+ RT_CONN_FLAGS(sk), sk);
if (err)
return err;
if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) {
|