netdev
[Top] [All Lists]

[PATCH 2.6]: Fix suboptimal fragment sizing for last fragment

To: "David S. Miller" <davem@xxxxxxxxxx>
Subject: [PATCH 2.6]: Fix suboptimal fragment sizing for last fragment
From: Patrick McHardy <kaber@xxxxxxxxx>
Date: Thu, 02 Sep 2004 20:36:13 +0200
Cc: yoshfuji@xxxxxxxxxxxxxx, Herbert Xu <herbert@xxxxxxxxxx>, netdev@xxxxxxxxxxx
Sender: netdev-bounce@xxxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040413 Debian/1.6-5
Yoshifuji's recent fragment patch prevents unnecessary fragmentation
when the data can be kept in a single packet, but only for the first
packet. When fragmenting, all fragments are still truncated to
multiples of 8 and we might end up creating an unnecessary fragment.

This dump shows the problem (MTU 1499):

172.16.1.123.32771 > 172.16.195.3.4135: udp 2937 (frag 7066:1472@0+)
172.16.1.123 > 172.16.195.3: udp (frag 7066:1472@1472+)
172.16.1.123 > 172.16.195.3: udp (frag 7066:1@2944)

This patch always builds mtu sized fragments and truncates the previous
fragment to a multiple of 8 bytes when allocating a new one. With the
patch the dump looks like this:


172.16.1.123.32772 > 172.16.195.3.4135: udp 2937 (frag 49641:1472@0+)
172.16.1.123 > 172.16.195.3: udp (frag 49641:1473@1472)

Regards
Patrick

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/02 17:35:32+02:00 kaber@xxxxxxxxxxxx 
#   [IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment
#   
#   Signed-off-by: Patrick McHardy <kaber@xxxxxxxxx>
# 
# net/ipv6/ip6_output.c
#   2004/09/02 17:35:14+02:00 kaber@xxxxxxxxxxxx +13 -22
#   [IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment
# 
# net/ipv4/ip_output.c
#   2004/09/02 17:35:14+02:00 kaber@xxxxxxxxxxxx +20 -49
#   [IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment
# 
diff -Nru a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
--- a/net/ipv4/ip_output.c      2004-09-02 17:39:01 +02:00
+++ b/net/ipv4/ip_output.c      2004-09-02 17:39:01 +02:00
@@ -735,10 +735,10 @@
        int hh_len;
        int exthdrlen;
        int mtu;
-       int copy = 0;
+       int copy;
        int err;
        int offset = 0;
-       unsigned int maxfraglen, fragheaderlen, fraggap = 0;
+       unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
 
        if (flags&MSG_PROBE)
@@ -781,6 +781,7 @@
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 
        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+       maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, 
mtu-exthdrlen);
@@ -788,26 +789,11 @@
        }
 
        /*
-        * Let's try using as much space as possible to avoid generating
-        * additional unnecessary small fragment of length 
-        * (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
-        * -- yoshfuji
-        */
-       if (fragheaderlen + inet->cork.length + length <= mtu)
-               maxfraglen = mtu;
-       else
-               maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-
-       if (fragheaderlen + inet->cork.length <= mtu &&
-           fragheaderlen + inet->cork.length + length > mtu)
-               fraggap = 1;
-
-       /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it won't be fragmented in the future.
         */
        if (transhdrlen &&
-           length + fragheaderlen <= maxfraglen &&
+           length + fragheaderlen <= mtu &&
            
rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
            !exthdrlen)
                csummode = CHECKSUM_HW;
@@ -821,34 +807,33 @@
         * adding appropriate IP header.
         */
 
-       if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
-               fraggap = 0;
+       if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;
-       }
 
        while (length > 0) {
-               if ((copy = maxfraglen - skb->len) <= 0) {
+               if ((copy = mtu - skb->len) <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
+                       unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
-                       BUG_TRAP(fraggap || copy == 0);
+                       BUG_TRAP(copy == 0);
 
 alloc_new_skb:
                        skb_prev = skb;
+                       fraggap = 0;
+                       if (skb_prev)
+                               fraggap = mtu - maxfraglen;
 
-                       if (fraggap)
-                               fraggap = -copy;
-
-                       datalen = maxfraglen - fragheaderlen;
+                       datalen = mtu - fragheaderlen;
                        if (datalen > length + fraggap)
                                datalen = length + fraggap;
 
                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) && 
                            !(rt->u.dst.dev->features&NETIF_F_SG))
-                               alloclen = maxfraglen;
+                               alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;
 
@@ -913,7 +898,6 @@
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
-                       fraggap = 0;
                        csummode = CHECKSUM_NONE;
 
                        /*
@@ -1006,7 +990,7 @@
        int mtu;
        int len;
        int err;
-       unsigned int maxfraglen, fragheaderlen, fraggap = 0;
+       unsigned int maxfraglen, fragheaderlen, fraggap;
 
        if (inet->hdrincl)
                return -EPERM;
@@ -1028,27 +1012,13 @@
        mtu = inet->cork.fragsize;
 
        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+       maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }
 
-       /*
-        * Let's try using as much space as possible to avoid generating
-        * additional unnecessary small fragment of length 
-        * (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
-        * -- yoshfuji
-        */
-       if (fragheaderlen + inet->cork.length + size <= mtu)
-               maxfraglen = mtu;
-       else
-               maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-
-       if (fragheaderlen + inet->cork.length <= mtu &&
-           fragheaderlen + inet->cork.length + size > mtu)
-               fraggap = 1;
-
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;
 
@@ -1056,17 +1026,18 @@
 
        while (size > 0) {
                int i;
-               if ((len = maxfraglen - skb->len) <= 0) {
+               if ((len = mtu - skb->len) <= 0) {
                        struct sk_buff *skb_prev;
                        char *data;
                        struct iphdr *iph;
                        int alloclen;
 
-                       BUG_TRAP(fraggap || len == 0);
+                       BUG_TRAP(len == 0);
 
                        skb_prev = skb;
-                       if (fraggap)
-                               fraggap = -len;
+                       fraggap = 0;
+                       if (skb_prev)
+                               fraggap = mtu - maxfraglen;
 
                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
diff -Nru a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
--- a/net/ipv6/ip6_output.c     2004-09-02 17:39:01 +02:00
+++ b/net/ipv6/ip6_output.c     2004-09-02 17:39:01 +02:00
@@ -814,11 +814,11 @@
        struct inet_opt *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct sk_buff *skb;
-       unsigned int maxfraglen, fragheaderlen, fraggap = 0;
+       unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int hh_len;
        int mtu;
-       int copy = 0;
+       int copy;
        int err;
        int offset = 0;
        int csummode = CHECKSUM_NONE;
@@ -867,6 +867,7 @@
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 
        fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
+       maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 
sizeof(struct frag_hdr);
 
        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (inet->cork.length + length > sizeof(struct ipv6hdr) + 
IPV6_MAXPLEN - fragheaderlen) {
@@ -883,46 +884,37 @@
         *
         * Note that we may need to "move" the data from the tail of
         * of the buffer to the new fragment when we split 
-        * the message at the first time.
+        * the message.
         *
         * FIXME: It may be fragmented into multiple chunks 
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji 
         */
-       if (fragheaderlen + inet->cork.length + length <= mtu)
-               maxfraglen = mtu;
-       else
-               maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen 
-                            - sizeof(struct frag_hdr);
-
-       if (fragheaderlen + inet->cork.length <= mtu &&
-           fragheaderlen + inet->cork.length + length > mtu)
-               fraggap = 1;
 
        inet->cork.length += length;
 
-       if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
-               fraggap = 0;
+       if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;
-       }
 
        while (length > 0) {
-               if ((copy = maxfraglen - skb->len) <= 0) {
+               if ((copy = mtu - skb->len) <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
+                       unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
-                       BUG_TRAP(fraggap || copy == 0);
+                       BUG_TRAP(copy == 0);
 alloc_new_skb:
                        skb_prev = skb;
 
                        /* There's no room in the current skb */
-                       if (fraggap)
-                               fraggap = -copy;
+                       fraggap = 0;
+                       if (skb_prev)
+                               fraggap = mtu - maxfraglen;
 
-                       datalen = maxfraglen - fragheaderlen;
+                       datalen = mtu - fragheaderlen;
 
                        if (datalen > length + fraggap)
                                datalen = length + fraggap;
@@ -930,7 +922,7 @@
                        fraglen = datalen + fragheaderlen;
                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
-                               alloclen = maxfraglen;
+                               alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;
 
@@ -1005,7 +997,6 @@
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
-                       fraggap = 0;
                        csummode = CHECKSUM_NONE;
 
                        /*
<Prev in Thread] Current Thread [Next in Thread>