netdev
[Top] [All Lists]

[PATCH] Prioritized Accept Queues with Preemption Capability

To: netdev@xxxxxxxxxxx, linux-net@xxxxxxxxxxxxxxx, kuznet@xxxxxxxxxxxxx
Subject: [PATCH] Prioritized Accept Queues with Preemption Capability
From: Sridhar Samudrala <samudrala@xxxxxxxxxx>
Date: Mon, 17 Sep 2001 19:06:12 -0700 (PDT)
Cc: dmfreim@xxxxxxxxxx
Sender: owner-netdev@xxxxxxxxxxx
The following patch is an enhancement to a mechanism called Prioritized Accept
Queues (PAQ) that can be used to prioritize incoming connection requests on a
socket based on the source/dest IP addresses and ports. We posted the
original patch in July. This enhancement introduces a way to preempt low
priority connections from the accept queue in order to avoid starvation of
higher priority connections when the accept queue is filled with lower priority
connections. This idea is based on the comments and suggestions made by Alexey.

Documentation on how to use this patch, and test results showing an
improvement in connection rate for higher priority classes, can be found at our
project website:
        http://oss.software.ibm.com/qos

We would appreciate any comments or suggestions.

Thanks
Sridhar

---------------------------
Sridhar Samudrala
IBM Linux Technology Centre
samudrala@xxxxxxxxxx

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
diff -urN -X dontdiff linux-2.4.9/Documentation/Configure.help 
linux-2.4.9-ppaq/Documentation/Configure.help
--- linux-2.4.9/Documentation/Configure.help    Sun Aug 12 10:51:41 2001
+++ linux-2.4.9-ppaq/Documentation/Configure.help       Thu Sep 13 18:15:37 2001
@@ -1955,6 +1955,14 @@
   If you want to compile it as a module, say M here and read
   Documentation/modules.txt.  If unsure, say `N'.
 
+Prioritized Accept Queue (EXPERIMENTAL)
+CONFIG_PRIO_ACCEPTQ
+  When enabled, this option allows you to set priorities to incoming
+  connection requests using the rules created by the iptables MARK target
+  option. The nfmark field set by the rules is used as a priority value
+  when the connection is added to accept queue. The priority value can 
+  range between 0-7 with 0 being the highest priority and 7 the lowest. 
+  
 Packet filtering
 CONFIG_IP_NF_FILTER
   Packet filtering defines a table `filter', which has a series of
diff -urN -X dontdiff linux-2.4.9/include/net/sock.h 
linux-2.4.9-ppaq/include/net/sock.h
--- linux-2.4.9/include/net/sock.h      Wed Aug 15 14:21:32 2001
+++ linux-2.4.9-ppaq/include/net/sock.h Thu Sep 13 18:32:38 2001
@@ -239,6 +239,11 @@
 #define pppoe_relay    proto.pppoe.relay
 #endif
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+/* Priorities range from 0-7 */
+#define MAX_ACCEPTQ_PRIO        7
+#endif
+
 /* This defines a selective acknowledgement block. */
 struct tcp_sack_block {
        __u32   start_seq;
@@ -409,7 +414,11 @@
 
        /* FIFO of established children */
        struct open_request     *accept_queue;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       struct open_request     *accept_queue_tail[MAX_ACCEPTQ_PRIO];
+#else
        struct open_request     *accept_queue_tail;
+#endif
 
        int                     write_pending;  /* A write to socket waits to 
start. */
 
diff -urN -X dontdiff linux-2.4.9/include/net/tcp.h 
linux-2.4.9-ppaq/include/net/tcp.h
--- linux-2.4.9/include/net/tcp.h       Wed Aug 15 14:26:33 2001
+++ linux-2.4.9-ppaq/include/net/tcp.h  Thu Sep 13 18:42:25 2001
@@ -519,6 +519,9 @@
                struct tcp_v6_open_req v6_req;
 #endif
        } af;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       int acceptq_prio;
+#endif
 };
 
 /* SLAB cache for open requests. */
@@ -1572,10 +1575,33 @@
                                         struct sock *child)
 {
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       int prio = req->acceptq_prio;
+       int prev_prio;
+#endif
 
        req->sk = child;
        tcp_acceptq_added(sk);
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+       if (!tp->accept_queue_tail[prio]) {
+               for (prev_prio = prio - 1; prev_prio >= 0; prev_prio--)
+                       if (tp->accept_queue_tail[prev_prio])
+                               break;
+               tp->accept_queue_tail[prio] = req;
+               if (prev_prio >= 0) {
+                       req->dl_next = 
tp->accept_queue_tail[prev_prio]->dl_next;
+                       tp->accept_queue_tail[prev_prio]->dl_next = req; 
+               } else {
+                       req->dl_next = tp->accept_queue;
+                       tp->accept_queue = req;
+               }
+       } else {
+               req->dl_next = tp->accept_queue_tail[prio]->dl_next;
+               tp->accept_queue_tail[prio]->dl_next = req;
+               tp->accept_queue_tail[prio] = req;
+       }
+#else
        if (!tp->accept_queue_tail) {
                tp->accept_queue = req;
        } else {
@@ -1583,6 +1609,7 @@
        }
        tp->accept_queue_tail = req;
        req->dl_next = NULL;
+#endif
 }
 
 struct tcp_listen_opt
@@ -1649,6 +1676,10 @@
                                        struct tcp_opt *tp,
                                        struct sk_buff *skb)
 {
+#ifdef CONFIG_PRIO_ACCEPTQ
+       int nfmark = (int)skb->nfmark;
+#endif
+
        req->rcv_wnd = 0;               /* So that tcp_send_synack() knows! */
        req->rcv_isn = TCP_SKB_CB(skb)->seq;
        req->mss = tp->mss_clamp;
@@ -1660,6 +1691,9 @@
        req->acked = 0;
        req->ecn_ok = 0;
        req->rmt_port = skb->h.th->source;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       req->acceptq_prio = (nfmark < 0) ? 0 : ((nfmark > MAX_ACCEPTQ_PRIO) ? 
MAX_ACCEPTQ_PRIO : nfmark);
+#endif
 }
 
 #define TCP_MEM_QUANTUM        ((int)PAGE_SIZE)
diff -urN -X dontdiff linux-2.4.9/net/ipv4/netfilter/Config.in 
linux-2.4.9-ppaq/net/ipv4/netfilter/Config.in
--- linux-2.4.9/net/ipv4/netfilter/Config.in    Tue Mar  6 22:44:16 2001
+++ linux-2.4.9-ppaq/net/ipv4/netfilter/Config.in       Thu Sep 13 18:15:38 2001
@@ -27,6 +27,7 @@
   if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
     dep_tristate '  Unclean match support (EXPERIMENTAL)' 
CONFIG_IP_NF_MATCH_UNCLEAN $CONFIG_IP_NF_IPTABLES
     dep_tristate '  Owner match support (EXPERIMENTAL)' 
CONFIG_IP_NF_MATCH_OWNER $CONFIG_IP_NF_IPTABLES
+    bool '  Prioritized Accept Queues (EXPERIMENTAL)' CONFIG_PRIO_ACCEPTQ
   fi
 # The targets
   dep_tristate '  Packet filtering' CONFIG_IP_NF_FILTER $CONFIG_IP_NF_IPTABLES 
diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp.c linux-2.4.9-ppaq/net/ipv4/tcp.c
--- linux-2.4.9/net/ipv4/tcp.c  Wed Aug 15 01:22:17 2001
+++ linux-2.4.9-ppaq/net/ipv4/tcp.c     Thu Sep 13 18:15:38 2001
@@ -529,7 +529,12 @@
 
        sk->max_ack_backlog = 0;
        sk->ack_backlog = 0;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       tp->accept_queue = NULL;
+       memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * 
(MAX_ACCEPTQ_PRIO + 1)));
+#else
        tp->accept_queue = tp->accept_queue_tail = NULL;
+#endif
        tp->syn_wait_lock = RW_LOCK_UNLOCKED;
        tcp_delack_init(tp);
 
@@ -588,7 +593,12 @@
        write_lock_bh(&tp->syn_wait_lock);
        tp->listen_opt =NULL;
        write_unlock_bh(&tp->syn_wait_lock);
+#ifdef CONFIG_PRIO_ACCEPTQ
+       tp->accept_queue = NULL;
+       memset(tp->accept_queue_tail, 0, (sizeof(struct open_request *) * 
(MAX_ACCEPTQ_PRIO + 1)));
+#else
        tp->accept_queue = tp->accept_queue_tail = NULL;
+#endif
 
        if (lopt->qlen) {
                for (i=0; i<TCP_SYNQ_HSIZE; i++) {
@@ -2109,6 +2119,9 @@
        struct open_request *req;
        struct sock *newsk;
        int error;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       int prio;
+#endif
 
        lock_sock(sk); 
 
@@ -2134,8 +2147,17 @@
        }
 
        req = tp->accept_queue;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       tp->accept_queue = req->dl_next;
+       for (prio = 0; prio <= MAX_ACCEPTQ_PRIO; prio++)
+               if (req == tp->accept_queue_tail[prio]) {
+                       tp->accept_queue_tail[prio] = NULL;
+                       break;
+               }
+#else
        if ((tp->accept_queue = req->dl_next) == NULL)
                tp->accept_queue_tail = NULL;
+#endif
 
        newsk = req->sk;
        tcp_acceptq_removed(sk);
diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp_ipv4.c 
linux-2.4.9-ppaq/net/ipv4/tcp_ipv4.c
--- linux-2.4.9/net/ipv4/tcp_ipv4.c     Wed Apr 25 14:57:39 2001
+++ linux-2.4.9-ppaq/net/ipv4/tcp_ipv4.c        Mon Sep 17 18:49:01 2001
@@ -1262,6 +1262,21 @@
        tcp_v4_send_reset
 };
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+static struct open_request *low_prio_req_in_acceptq(struct sock *sk, int prio)
+{
+       struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+       struct open_request *low_prio_req = NULL;
+       int tmp_prio;
+
+       for (tmp_prio = MAX_ACCEPTQ_PRIO; tmp_prio > prio; tmp_prio--) 
+               if ((low_prio_req = tp->accept_queue_tail[tmp_prio])) 
+                       break;  
+
+       return (low_prio_req);
+}
+#endif
+
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_opt tp;
@@ -1299,7 +1314,19 @@
         * clogging syn queue with openreqs with exponentially increasing
         * timeout.
         */
+#ifdef CONFIG_PRIO_ACCEPTQ
+       /* With Prioritized Accept Queue, a new condition is added so that an 
+        * incoming SYN is dropped only if there are no lower priority 
+        * connection requests in the acceptq. This is to avoid starvation of 
+        * higher priority connection requests in the presence of persistent 
low         
+        * priority connections filling up the acceptq. 
+        */
+       if (tcp_acceptq_is_full(sk) && 
+               !(low_prio_req_in_acceptq(sk, (int)skb->nfmark)) && 
+                       tcp_synq_young(sk) > 1)
+#else
        if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+#endif
                goto drop;
 
        req = tcp_openreq_alloc();
@@ -1407,6 +1434,62 @@
        return 0;
 }
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+/* removes the req(last one) from sk's accept queue. */
+static void remove_openreq_from_acceptq(struct sock *sk, struct open_request 
*req)
+{
+       struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+       struct open_request *tmp_req = NULL; 
+       struct open_request *prev_req;
+       int req_prio = req->acceptq_prio;
+       int prio;
+
+       /* find the last req in the next higher priority class */
+       for (prio = req_prio-1; prio >= 0; prio--)
+               if ((tmp_req = tp->accept_queue_tail[prio]))
+                       break;
+
+       /* no higher priority class, start scanning from the start of acceptq */
+       if (!tmp_req)
+               tmp_req = tp->accept_queue;
+
+       prev_req = tmp_req;
+
+       /* find the prev req */
+       for (; tmp_req ; tmp_req = tmp_req->dl_next) {
+               if (tmp_req == req)
+                       break;
+               prev_req = tmp_req;
+       }
+
+       if (prev_req) {
+               prev_req->dl_next = NULL;
+               if (prev_req->acceptq_prio == req_prio) 
+                       tp->accept_queue_tail[req_prio] = prev_req;
+               else
+                       tp->accept_queue_tail[req_prio] = NULL;
+       } else {
+               BUG_TRAP(prev_req != NULL);
+       }
+}
+
+/* remove lreq from accept queue and add it to syn table */ 
+static void preempt_low_prio_req(struct sock *sk, struct open_request *lreq)
+{
+       struct sock *lsk;
+
+       lsk = lreq->sk;
+       lreq->sk = NULL;
+       remove_openreq_from_acceptq(sk, lreq);
+       tcp_acceptq_removed(sk);
+       tcp_v4_synq_add(sk, lreq);
+       tcp_unhash(lsk);
+       tcp_set_state(lsk, TCP_CLOSE);
+       sock_orphan(lsk);
+       atomic_inc(&tcp_orphan_count);
+       tcp_destroy_sock(lsk);
+}
+#endif /* CONFIG_PRIO_ACCEPTQ */
 
 /* 
  * The three way handshake has completed - we got a valid synack - 
@@ -1419,8 +1502,35 @@
        struct tcp_opt *newtp;
        struct sock *newsk;
 
+#ifdef CONFIG_PRIO_ACCEPTQ
+       if (tcp_acceptq_is_full(sk)) {
+               struct open_request *low_prio_req;
+
+               /* if there is a lower priority req in acceptq and we haven't 
+                * acked any received data on the associated socket, move it to 
+                * syn table so that the incoming higher priority req can be 
+                * accepted. 
+                */
+               if ((low_prio_req = low_prio_req_in_acceptq(sk, 
req->acceptq_prio))) {
+                       struct sock *lsk = low_prio_req->sk;
+                       struct tcp_opt *tp = &lsk->tp_pinfo.af_tcp;
+
+                       bh_lock_sock(lsk);
+                       /* we haven't acked any data received */
+                       if (tp->rcv_wup == (low_prio_req->rcv_isn + 1)) { 
+                               preempt_low_prio_req(sk, low_prio_req); 
+                               bh_unlock_sock(lsk);
+                       } else {
+                               bh_unlock_sock(lsk);
+                               goto exit_overflow;
+                       }
+               } else
+                       goto exit_overflow;
+       }
+#else
        if (tcp_acceptq_is_full(sk))
                goto exit_overflow;
+#endif
 
        if (dst == NULL &&
            (dst = tcp_v4_route_req(sk, req)) == NULL)
diff -urN -X dontdiff linux-2.4.9/net/ipv4/tcp_minisocks.c 
linux-2.4.9-ppaq/net/ipv4/tcp_minisocks.c
--- linux-2.4.9/net/ipv4/tcp_minisocks.c        Wed Aug 15 01:22:17 2001
+++ linux-2.4.9-ppaq/net/ipv4/tcp_minisocks.c   Mon Sep 17 18:49:18 2001
@@ -734,7 +734,12 @@
                newtp->num_sacks = 0;
                newtp->urg_data = 0;
                newtp->listen_opt = NULL;
+#ifdef CONFIG_PRIO_ACCEPTQ
+               newtp->accept_queue = NULL;
+               memset(newtp->accept_queue_tail, 0, (sizeof(struct open_request 
*) * (MAX_ACCEPTQ_PRIO + 1)));
+#else
                newtp->accept_queue = newtp->accept_queue_tail = NULL;
+#endif
                /* Deinitialize syn_wait_lock to trap illegal accesses. */
                memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
 
@@ -802,6 +807,9 @@
        int paws_reject = 0;
        struct tcp_opt ttp;
        struct sock *child;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       struct open_request *r1, *r2;
+#endif
 
        ttp.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -913,10 +921,24 @@
         * ESTABLISHED STATE. If it will be dropped after
         * socket is created, wait for troubles.
         */
+#ifdef CONFIG_PRIO_ACCEPTQ
+       r1 = *prev;
+#endif
        child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
        if (child == NULL)
                goto listen_overflow;
+#ifdef CONFIG_PRIO_ACCEPTQ
+       r2 = *prev;
 
+       /* With Prioritized Accept Queues, it is possible that prev pointer can 
+        * change in the above call to syn_recv_sock(). This can happen if an 
+        * openreq is preempted and moved from acceptq to syn table and it 
+        * hashes to the same bucket as 'req' and 'req' is the first entry in 
+        * the hash bucket. If so, prev needs to be updated.
+        */ 
+       if ((req == r1) && (r1 != r2))
+               prev = &r2->dl_next;
+#endif
        tcp_synq_unlink(tp, req, prev);
        tcp_synq_removed(sk, req);
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


<Prev in Thread] Current Thread [Next in Thread>
  • [PATCH] Prioritized Accept Queues with Preemption Capability, Sridhar Samudrala <=