A somewhat related thought, while we are at it...

It would arguably make sense to let a NIC advertise the maximum TSO
size it is willing to support, rather than assume/enforce 64k. Some
NICs already support a larger maximum today, but it is even more
important to let a NIC limit TSO to a smaller size.

One likely scenario where this is desirable is a system with highly
fragmented memory. There, the number of physical fragments per TSO
frame can stay so high that it is cheaper (on a given platform) to
copy the frame than to DMA it. At that point 64k TSO becomes a
liability and it makes sense to cap it.

The TSO "sweet spot" is still captured anyway - at least on fast
networks, going from 1.5k to 9k typically doubles throughput, while
going from 9k to 64k adds no more than another 10% (plus a little
free CPU, but not much).
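
To make that concrete, here is a minimal sketch of the kind of hook I
have in mind - illustration only: "tso_max_size" is a made-up field,
nothing like it exists in the tree today, and the clamp would really
belong wherever xmit_size_goal ends up being computed:

/* Hypothetical: a driver for which DMA setup cost exceeds copy cost
 * on heavily fragmented frames could set, say,
 * dev->tso_max_size = 16 * 1024 at probe time, and the stack would
 * clamp its TSO packet-building goal to that instead of assuming 64k.
 */
static inline unsigned int tcp_tso_size_goal(struct tcp_sock *tp,
					     struct net_device *dev,
					     unsigned int mss_now)
{
	unsigned int goal = 65535 - tp->af_specific->net_header_len -
			    tp->ext_header_len - tp->tcp_header_len;

	/* Respect the device ceiling, if the driver set one. */
	if (dev && dev->tso_max_size && goal > dev->tso_max_size)
		goal = dev->tso_max_size;

	/* Keep the goal an exact multiple of the real MSS. */
	goal -= goal % mss_now;

	return max(goal, mss_now);
}
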
> -----Original Message-----
> From: netdev-bounce@xxxxxxxxxxx
> [mailto:netdev-bounce@xxxxxxxxxxx] On Behalf Of David S. Miller
> Sent: Tuesday, May 17, 2005 7:24 PM
> To: netdev@xxxxxxxxxxx
> Subject: [PATCH] Super TSO
>
>
> Ok, this implements the idea Herbert and I kicked around last week.
>
> Basically, we build as large TSO segments as humanly possible.
> Then we slice and dice them to fit the congestion and send
> windows at transmit time.
>
> It works remarkably well. Even application limited cases
> like "scp" behave nicely.
>
> During straight streaming tests like "bw_tcp", I see full
> "65535 MOD mss" sized frames going out to my onboard tg3 card
> once the congestion and send window grow large enough.
>
> I did some cross-continent (San Francisco --> east coast of
> US) transfers to make sure TSO stays on when packet drops
> occur and that performance doesn't suffer compared to TSO
> being off. It all looks fine so far.
>
> There were many bugs discovered along the way. For example,
> the old code would use the TSO mss to do nagle calculations.
> It also would mess up the packet sizing for TSO when SACK
> blocks were being advertised.
>
> So we don't have to calculate this "(65535 - headers) % mss"
> thing all the time, I keep a cache of three pieces of state.
> MSS, number of SACKS being advertized, and if TSO is enabled
> or not. If any of these change, we recalculate the packet
> building goal size.
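
[To put numbers on that goal, assuming plain IPv4 with no IP options
and TCP timestamps, i.e. a 20-byte network header and a 32-byte TCP
header (my numbers, not from the patch):

	headers        = 20 + 32             = 52
	mss_now        = 1500 - 52           = 1448
	raw goal       = 65535 - 52          = 65483
	65483 % 1448                         = 323
	xmit_size_goal = 65483 - 323 = 65160 = 45 * mss_now

so the large frame handed to the card is always an exact multiple of
the on-the-wire MSS.]
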
>
> I moved all of the "should we send" logic privately into the
> file net/ipv4/tcp_output.c, and I am sure there are many
> simplifications possible. There are several spurious
> tcp_current_mss() calls scattered about in certain code paths
> as well. For example, some places would call it in order to
> decide whether to call tcp_write_xmit() or not, then
> tcp_write_xmit() does it once more. But there are call sites
> that do not do this, so tcp_write_xmit() has to do it for those cases.
>
> Comments?
>
> --- 1/include/linux/tcp.h.~1~ 2005-05-16 22:29:56.000000000 -0700
> +++ 2/include/linux/tcp.h 2005-05-17 11:15:46.000000000 -0700
> @@ -280,13 +280,15 @@ struct tcp_sock {
> __u32 snd_wnd; /* The window we expect to
> receive */
> __u32 max_window; /* Maximal window ever seen
> from peer */
> __u32 pmtu_cookie; /* Last pmtu seen by socket
> */
> - __u32 mss_cache; /* Cached effective mss, not
> including SACKS */
> - __u16 mss_cache_std; /* Like mss_cache, but without TSO */
> + __u16 mss_cache; /* Cached effective mss, not
> including SACKS */
> + __u16 xmit_size_goal; /* Goal for segmenting output
> packets */
> + __u32 xmit_size_cache;/* Cache for keeping
> xmit_size_goal uptodate */
> __u16 ext_header_len; /* Network protocol overhead
> (IP/IPv6 options) */
> __u8 ca_state; /* State of fast-retransmit
> machine */
> __u8 retransmits; /* Number of unrecovered RTO
> timeouts. */
>
> __u16 advmss; /* Advertised MSS
> */
> + __u16 __pad0;
> __u32 window_clamp; /* Maximal window to advertise
> */
> __u32 rcv_ssthresh; /* Current window clamp
> */
>
> --- 1/include/net/tcp.h.~1~ 2005-05-16 22:29:56.000000000 -0700
> +++ 2/include/net/tcp.h 2005-05-17 16:44:32.000000000 -0700
> @@ -817,11 +817,18 @@ static inline int tcp_ack_scheduled(stru
> return tp->ack.pending&TCP_ACK_SCHED;
> }
>
> -static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp)
> +static __inline__ void tcp_dec_quickack_mode(struct tcp_sock *tp,
> +unsigned int pkts)
> {
> - if (tp->ack.quick && --tp->ack.quick == 0) {
> - /* Leaving quickack mode we deflate ATO. */
> - tp->ack.ato = TCP_ATO_MIN;
> + if (tp->ack.quick) {
> + if (pkts > tp->ack.quick)
> + tp->ack.quick = 0;
> + else
> + tp->ack.quick -= pkts;
> +
> + if (!tp->ack.quick) {
> + /* Leaving quickack mode we deflate ATO. */
> + tp->ack.ato = TCP_ATO_MIN;
> + }
> }
> }
>
> @@ -939,7 +946,14 @@ extern __u32 cookie_v4_init_sequence(str
>
> /* tcp_output.c */
>
> -extern int tcp_write_xmit(struct sock *, int nonagle);
> +extern void __tcp_data_snd_check(struct sock *sk, struct
> sk_buff *skb);
> +extern void __tcp_push_pending_frames(struct sock *sk,
> + struct tcp_sock *tp,
> + unsigned int cur_mss,
> + int nonagle);
> +extern int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp);
> +extern int tcp_tso_mince(struct sock *sk, struct tcp_sock *tp,
> + struct sk_buff *skb);
> extern int tcp_retransmit_skb(struct sock *, struct sk_buff
> *); extern void tcp_xmit_retransmit_queue(struct sock *);
> extern void tcp_simple_retransmit(struct sock *); @@ -951,7
> +965,7 @@ extern int tcp_write_wakeup(struct sock extern
> void tcp_send_fin(struct sock *sk); extern void
> tcp_send_active_reset(struct sock *sk, int priority); extern
> int tcp_send_synack(struct sock *); -extern void
> tcp_push_one(struct sock *, unsigned mss_now);
> +extern void tcp_push_one(struct sock *, unsigned int mss_now);
> extern void tcp_send_ack(struct sock *sk); extern void
> tcp_send_delayed_ack(struct sock *sk);
>
> @@ -1054,7 +1068,7 @@ static inline void
> tcp_reset_xmit_timer( static inline void
> tcp_initialize_rcv_mss(struct sock *sk) {
> struct tcp_sock *tp = tcp_sk(sk);
> - unsigned int hint = min(tp->advmss, tp->mss_cache_std);
> + unsigned int hint = min(tp->advmss, tp->mss_cache);
>
> hint = min(hint, tp->rcv_wnd/2);
> hint = min(hint, TCP_MIN_RCVMSS);
> @@ -1353,23 +1367,23 @@ static inline void tcp_cwnd_validate(str }
>
> /* Set slow start threshould and cwnd not falling to slow
> start */ -static inline void __tcp_enter_cwr(struct tcp_sock *tp)
> +static inline void __tcp_enter_cwr(struct tcp_sock *tp, unsigned int
> +pkts)
> {
> tp->undo_marker = 0;
> tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
> tp->snd_cwnd = min(tp->snd_cwnd,
> - tcp_packets_in_flight(tp) + 1U);
> + tcp_packets_in_flight(tp) + pkts);
> tp->snd_cwnd_cnt = 0;
> tp->high_seq = tp->snd_nxt;
> tp->snd_cwnd_stamp = tcp_time_stamp;
> TCP_ECN_queue_cwr(tp);
> }
>
> -static inline void tcp_enter_cwr(struct tcp_sock *tp)
> +static inline void tcp_enter_cwr(struct tcp_sock *tp, unsigned int
> +pkts)
> {
> tp->prior_ssthresh = 0;
> if (tp->ca_state < TCP_CA_CWR) {
> - __tcp_enter_cwr(tp);
> + __tcp_enter_cwr(tp, pkts);
> tcp_set_ca_state(tp, TCP_CA_CWR);
> }
> }
> @@ -1397,74 +1411,6 @@ static __inline__ void tcp_minshall_upda
> tp->snd_sml = TCP_SKB_CB(skb)->end_seq; }
>
> -/* Return 0, if packet can be sent now without violation
> Nagle's rules:
> - 1. It is full sized.
> - 2. Or it contains FIN.
> - 3. Or TCP_NODELAY was set.
> - 4. Or TCP_CORK is not set, and all sent packets are ACKed.
> - With Minshall's modification: all sent small packets are ACKed.
> - */
> -
> -static __inline__ int
> -tcp_nagle_check(const struct tcp_sock *tp, const struct
> sk_buff *skb,
> - unsigned mss_now, int nonagle)
> -{
> - return (skb->len < mss_now &&
> - !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
> - ((nonagle&TCP_NAGLE_CORK) ||
> - (!nonagle &&
> - tp->packets_out &&
> - tcp_minshall_check(tp))));
> -}
> -
> -extern void tcp_set_skb_tso_segs(struct sock *, struct sk_buff *);
> -
> -/* This checks if the data bearing packet SKB (usually
> sk->sk_send_head)
> - * should be put on the wire right now.
> - */
> -static __inline__ int tcp_snd_test(struct sock *sk,
> - struct sk_buff *skb,
> - unsigned cur_mss, int nonagle)
> -{
> - struct tcp_sock *tp = tcp_sk(sk);
> - int pkts = tcp_skb_pcount(skb);
> -
> - if (!pkts) {
> - tcp_set_skb_tso_segs(sk, skb);
> - pkts = tcp_skb_pcount(skb);
> - }
> -
> - /* RFC 1122 - section 4.2.3.4
> - *
> - * We must queue if
> - *
> - * a) The right edge of this frame exceeds the window
> - * b) There are packets in flight and we have a
> small segment
> - * [SWS avoidance and Nagle algorithm]
> - * (part of SWS is done on packetization)
> - * Minshall version sounds: there are no _small_
> - * segments in flight. (tcp_nagle_check)
> - * c) We have too many packets 'in flight'
> - *
> - * Don't use the nagle rule for urgent data (or
> - * for the final FIN -DaveM).
> - *
> - * Also, Nagle rule does not apply to frames, which
> - * sit in the middle of queue (they have no chances
> - * to get new data) and if room at tail of skb is
> - * not enough to save something seriously (<32 for now).
> - */
> -
> - /* Don't be strict about the congestion window for the
> - * final FIN frame. -DaveM
> - */
> - return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
> - || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
> - (((tcp_packets_in_flight(tp) + (pkts-1)) <
> tp->snd_cwnd) ||
> - (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
> - !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una +
> tp->snd_wnd));
> -}
> -
> static __inline__ void tcp_check_probe_timer(struct sock
> *sk, struct tcp_sock *tp) {
> if (!tp->packets_out && !tp->pending)
> @@ -1477,42 +1423,12 @@ static __inline__ int tcp_skb_is_last(co
> return skb->next == (struct sk_buff *)&sk->sk_write_queue; }
>
> -/* Push out any pending frames which were held back due to
> - * TCP_CORK or attempt at coalescing tiny packets.
> - * The socket must be locked by the caller.
> - */
> -static __inline__ void __tcp_push_pending_frames(struct sock *sk,
> - struct tcp_sock *tp,
> - unsigned cur_mss,
> - int nonagle)
> -{
> - struct sk_buff *skb = sk->sk_send_head;
> -
> - if (skb) {
> - if (!tcp_skb_is_last(sk, skb))
> - nonagle = TCP_NAGLE_PUSH;
> - if (!tcp_snd_test(sk, skb, cur_mss, nonagle) ||
> - tcp_write_xmit(sk, nonagle))
> - tcp_check_probe_timer(sk, tp);
> - }
> - tcp_cwnd_validate(sk, tp);
> -}
> -
> static __inline__ void tcp_push_pending_frames(struct sock *sk,
> struct tcp_sock *tp)
> {
> __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk,
> 1), tp->nonagle); }
>
> -static __inline__ int tcp_may_send_now(struct sock *sk,
> struct tcp_sock *tp) -{
> - struct sk_buff *skb = sk->sk_send_head;
> -
> - return (skb &&
> - tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
> - tcp_skb_is_last(sk, skb) ?
> TCP_NAGLE_PUSH : tp->nonagle));
> -}
> -
> static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32
> ack, u32 seq) {
> tp->snd_wl1 = seq;
> @@ -1986,7 +1902,7 @@ static inline void
> tcp_westwood_update_r static inline __u32
> __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) {
> return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
> - (__u32) (tp->mss_cache_std),
> + (__u32) (tp->mss_cache),
> 2U);
> }
>
> --- 1/include/net/sock.h.~1~ 2005-05-16 22:29:56.000000000 -0700
> +++ 2/include/net/sock.h 2005-05-17 12:06:44.000000000 -0700
> @@ -1130,13 +1130,16 @@ static inline void
> sk_stream_moderate_sn static inline struct sk_buff
> *sk_stream_alloc_pskb(struct sock *sk,
> int size,
> int mem, int gfp)
> {
> - struct sk_buff *skb = alloc_skb(size +
> sk->sk_prot->max_header, gfp);
> + struct sk_buff *skb;
> + int hdr_len;
>
> + hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header);
> + skb = alloc_skb(size + hdr_len, gfp);
> if (skb) {
> skb->truesize += mem;
> if (sk->sk_forward_alloc >= (int)skb->truesize ||
> sk_stream_mem_schedule(sk, skb->truesize, 0)) {
> - skb_reserve(skb, sk->sk_prot->max_header);
> + skb_reserve(skb, hdr_len);
> return skb;
> }
> __kfree_skb(skb);
> --- 1/net/ipv4/tcp_output.c.~1~ 2005-05-16
> 22:29:56.000000000 -0700
> +++ 2/net/ipv4/tcp_output.c 2005-05-17 19:14:23.000000000 -0700
> @@ -141,11 +141,11 @@ static inline void tcp_event_data_sent(s
> tp->ack.pingpong = 1;
> }
>
> -static __inline__ void tcp_event_ack_sent(struct sock *sk)
> +static __inline__ void tcp_event_ack_sent(struct sock *sk,
> unsigned int
> +pkts)
> {
> struct tcp_sock *tp = tcp_sk(sk);
>
> - tcp_dec_quickack_mode(tp);
> + tcp_dec_quickack_mode(tp, pkts);
> tcp_clear_xmit_timer(sk, TCP_TIME_DACK); }
>
> @@ -361,7 +361,7 @@ static int tcp_transmit_skb(struct sock
> tp->af_specific->send_check(sk, th, skb->len, skb);
>
> if (tcb->flags & TCPCB_FLAG_ACK)
> - tcp_event_ack_sent(sk);
> + tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
>
> if (skb->len != tcp_header_size)
> tcp_event_data_sent(tp, skb, sk);
> @@ -372,7 +372,7 @@ static int tcp_transmit_skb(struct sock
> if (err <= 0)
> return err;
>
> - tcp_enter_cwr(tp);
> + tcp_enter_cwr(tp, tcp_skb_pcount(skb));
>
> /* NET_XMIT_CN is special. It does not guarantee,
> * that this packet is lost. It tells that
> device @@ -419,32 +419,11 @@ static inline void tcp_tso_set_push(stru
> TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; }
>
> -/* Send _single_ skb sitting at the send head. This function requires
> - * true push pending frames to setup probe timer etc.
> - */
> -void tcp_push_one(struct sock *sk, unsigned cur_mss) -{
> - struct tcp_sock *tp = tcp_sk(sk);
> - struct sk_buff *skb = sk->sk_send_head;
> -
> - if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
> - /* Send it out now. */
> - TCP_SKB_CB(skb)->when = tcp_time_stamp;
> - tcp_tso_set_push(skb);
> - if (!tcp_transmit_skb(sk, skb_clone(skb,
> sk->sk_allocation))) {
> - sk->sk_send_head = NULL;
> - tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
> - tcp_packets_out_inc(sk, tp, skb);
> - return;
> - }
> - }
> -}
> -
> void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb) {
> struct tcp_sock *tp = tcp_sk(sk);
>
> - if (skb->len <= tp->mss_cache_std ||
> + if (skb->len <= tp->mss_cache ||
> !(sk->sk_route_caps & NETIF_F_TSO)) {
> /* Avoid the costly divide in the normal
> * non-TSO case.
> @@ -454,10 +433,10 @@ void tcp_set_skb_tso_segs(struct sock *s
> } else {
> unsigned int factor;
>
> - factor = skb->len + (tp->mss_cache_std - 1);
> - factor /= tp->mss_cache_std;
> + factor = skb->len + (tp->mss_cache - 1);
> + factor /= tp->mss_cache;
> skb_shinfo(skb)->tso_segs = factor;
> - skb_shinfo(skb)->tso_size = tp->mss_cache_std;
> + skb_shinfo(skb)->tso_size = tp->mss_cache;
> }
> }
>
> @@ -662,7 +641,7 @@ unsigned int tcp_sync_mss(struct sock *s
>
> /* And store cached results */
> tp->pmtu_cookie = pmtu;
> - tp->mss_cache = tp->mss_cache_std = mss_now;
> + tp->mss_cache = mss_now;
>
> return mss_now;
> }
> @@ -674,59 +653,274 @@ unsigned int tcp_sync_mss(struct sock *s
> * cannot be large. However, taking into account rare use of
> URG, this
> * is not a big flaw.
> */
> +static inline u32 compute_xmit_cache(u32 mss, int sacks, int
> +tso_enabled) {
> + u32 ret = (mss << 16) | sacks;
> +
> + if (tso_enabled)
> + ret |= (1 << 8);
> +
> + return ret;
> +}
>
> -unsigned int tcp_current_mss(struct sock *sk, int large)
> +unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> struct dst_entry *dst = __sk_dst_get(sk);
> - unsigned int do_large, mss_now;
> + u32 mss_now;
> + u32 xmit_cache;
> + int doing_tso = 0;
> +
> + mss_now = tp->mss_cache;
> +
> + if (large_allowed &&
> + (sk->sk_route_caps & NETIF_F_TSO) &&
> + !tp->urg_mode)
> + doing_tso = 1;
>
> - mss_now = tp->mss_cache_std;
> if (dst) {
> u32 mtu = dst_mtu(dst);
> if (mtu != tp->pmtu_cookie)
> mss_now = tcp_sync_mss(sk, mtu);
> }
>
> - do_large = (large &&
> - (sk->sk_route_caps & NETIF_F_TSO) &&
> - !tp->urg_mode);
> -
> - if (do_large) {
> - unsigned int large_mss, factor, limit;
> -
> - large_mss = 65535 - tp->af_specific->net_header_len -
> - tp->ext_header_len - tp->tcp_header_len;
> -
> - if (tp->max_window && large_mss > (tp->max_window>>1))
> - large_mss = max((tp->max_window>>1),
> - 68U - tp->tcp_header_len);
> -
> - factor = large_mss / mss_now;
> -
> - /* Always keep large mss multiple of real mss, but
> - * do not exceed 1/tso_win_divisor of the
> congestion window
> - * so we can keep the ACK clock ticking and minimize
> - * bursting.
> - */
> - limit = tp->snd_cwnd;
> - if (sysctl_tcp_tso_win_divisor)
> - limit /= sysctl_tcp_tso_win_divisor;
> - limit = max(1U, limit);
> - if (factor > limit)
> - factor = limit;
> -
> - tp->mss_cache = mss_now * factor;
> -
> - mss_now = tp->mss_cache;
> - }
> + /* If the MSS, the TSO state, or the number of SACK blocks
> + * changes, we have to recompute tp->xmit_size_goal.
> + */
> + xmit_cache = compute_xmit_cache(mss_now, tp->rx_opt.eff_sacks,
> + doing_tso);
>
> if (tp->rx_opt.eff_sacks)
> mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
> (tp->rx_opt.eff_sacks *
> TCPOLEN_SACK_PERBLOCK));
> +
> + if (tp->xmit_size_cache != xmit_cache) {
> + u16 xmit_size_goal = mss_now;
> +
> + if (doing_tso) {
> + xmit_size_goal = 65535 -
> + tp->af_specific->net_header_len -
> + tp->ext_header_len - tp->tcp_header_len;
> +
> + if (tp->rx_opt.eff_sacks)
> + xmit_size_goal -=
> (TCPOLEN_SACK_BASE_ALIGNED +
> +
> (tp->rx_opt.eff_sacks *
> +
> TCPOLEN_SACK_PERBLOCK));
> +
> + xmit_size_goal -= (xmit_size_goal % mss_now);
> + }
> + tp->xmit_size_goal = xmit_size_goal;
> + tp->xmit_size_cache = xmit_cache;
> + }
> +
> return mss_now;
> }
>
> +/* This must be invoked the first time we consider transmitting
> + * SKB onto the wire.
> + */
> +static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff
> +*skb) {
> + int tso_segs = tcp_skb_pcount(skb);
> +
> + if (!tso_segs) {
> + tcp_set_skb_tso_segs(sk, skb);
> + tso_segs = tcp_skb_pcount(skb);
> + }
> + return tso_segs;
> +}
> +
> +/* Return 0, if packet can be sent now without violation
> Nagle's rules:
> + * 1. It is full sized.
> + * 2. Or it contains FIN.
> + * 3. Or TCP_NODELAY was set.
> + * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
> + * With Minshall's modification: all sent small packets are ACKed.
> + */
> +
> +static inline int tcp_nagle_check(const struct tcp_sock *tp,
> + const struct sk_buff *skb,
> + unsigned mss_now, int nonagle)
> +{
> + return (skb->len < mss_now &&
> + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
> + ((nonagle&TCP_NAGLE_CORK) ||
> + (!nonagle &&
> + tp->packets_out &&
> + tcp_minshall_check(tp))));
> +}
> +
> +/* Return non-zero if the Nagle test allows this packet to be
> + * sent now.
> + */
> +static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff
> +*skb, unsigned int cur_mss, int nonagle) {
> + /* Nagle rule does not apply to frames, which
> + * sit in the middle of queue (they have no chances
> + * to get new data) and if room at tail of skb is
> + * not enough to save something seriously (<32 for now).
> + *
> + * This is implemented in the callers, where they modify
> + * the 'nonagle' argument based upon the location of SKB
> + * in the send queue.
> + */
> + if (nonagle & TCP_NAGLE_PUSH)
> + return 1;
> +
> + /* Don't use the nagle rule for urgent data (or
> + * for the final FIN -DaveM).
> + */
> + if (tp->urg_mode ||
> + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
> + return 1;
> +
> + if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
> + return 1;
> +
> + return 0;
> +}
> +
> +/* Can at least one segment of SKB be sent right now, according
> + * to the congestion window rules? If so, return how many segments
> + * are allowed.
> + */
> +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct
> +sk_buff *skb) {
> + u32 in_flight, cwnd;
> +
> + /* Don't be strict about the congestion window for the
> + * final FIN frame. -DaveM
> + */
> + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
> + return 1;
> +
> + in_flight = tcp_packets_in_flight(tp);
> + cwnd = tp->snd_cwnd;
> + if (in_flight < cwnd)
> + return (cwnd - in_flight);
> +
> + return 0;
> +}
> +
> +/* Does at least the first segment of SKB fit into the congestion
> + * window?
> + */
> +static inline int tcp_snd_wnd_test(struct tcp_sock *tp,
> struct sk_buff
> +*skb, unsigned int cur_mss) {
> + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
> +
> + if (skb->len > cur_mss)
> + end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
> +
> + return !after(end_seq, tp->snd_una + tp->snd_wnd); }
> +
> +/* This checks if the data bearing packet SKB (usually
> + * sk->sk_send_head) should be put on the wire right now. If so, it
> + * returns the number of packets allowed by the congestion window.
> + */
> +static unsigned int tcp_snd_test(struct sock *sk, struct
> sk_buff *skb,
> + unsigned cur_mss, int nonagle)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> + unsigned int cwnd_quota;
> +
> + if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
> + return 0;
> +
> + cwnd_quota = tcp_cwnd_test(tp, skb);
> + if (cwnd_quota &&
> + !tcp_snd_wnd_test(tp, skb, cur_mss))
> + cwnd_quota = 0;
> +
> + return cwnd_quota;
> +}
> +
> +int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp) {
> + struct sk_buff *skb = sk->sk_send_head;
> +
> + return (skb &&
> + tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
> + tcp_skb_is_last(sk, skb) ?
> TCP_NAGLE_PUSH : tp->nonagle)); }
> +
> +/* Trim TSO SKB to LEN bytes, put the remaining data into a
> new packet
> + * which is put after SKB on the list. It is very much like
> + * tcp_fragment() except that it may make several kinds of
> assumptions
> + * in order to speed up the splitting operation. In particular, we
> + * know that all the data is in scatter-gather pages, and that the
> + * packet has never been sent out before (and thus is not cloned).
> + */
> +static int tso_fragment(struct sock *sk, struct sk_buff
> *skb, unsigned
> +int len) {
> + struct sk_buff *buff;
> + int nlen = skb->len - len;
> + u16 flags;
> +
> + /* All of a TSO frame must be composed of paged data. */
> + BUG_ON(skb->len != skb->data_len);
> +
> + buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
> + if (unlikely(buff == NULL))
> + return -ENOMEM;
> + sk_charge_skb(sk, buff);
> +
> + buff->truesize += nlen;
> + skb->truesize -= nlen;
> +
> + /* Correct the sequence numbers. */
> + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
> + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
> + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
> +
> + /* PSH and FIN should only be set in the second packet. */
> + flags = TCP_SKB_CB(skb)->flags;
> + TCP_SKB_CB(skb)->flags = flags &
> ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
> + TCP_SKB_CB(buff)->flags = flags;
> +
> + /* This packet was never sent out yet, so no SACK bits. */
> + TCP_SKB_CB(buff)->sacked = 0;
> +
> + buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
> + skb_split(skb, buff, len);
> +
> + /* Fix up tso_factor for both original and new SKB. */
> + tcp_set_skb_tso_segs(sk, skb);
> + tcp_set_skb_tso_segs(sk, buff);
> +
> + /* Link BUFF into the send queue. */
> + __skb_append(skb, buff);
> +
> + return 0;
> +}
> +
> +/* Split TSO frame SKB into one MSS sized packet, then the rest.
> + * This is called during SACK processing when some SACK information
> + * hits a TSO packet.
> + */
> +int tcp_tso_mince(struct sock *sk, struct tcp_sock *tp,
> struct sk_buff
> +*skb) {
> + unsigned int mss_now = tcp_current_mss(sk, 1);
> +
> + BUG_ON(tcp_skb_pcount(skb) <= 1);
> +
> + /* We cannot use tso_fragment() in this case, because
> + * the packet has been sent once already and thus may
> + * be cloned or have other non-trivial details to deal
> + * with.
> + */
> + return tcp_fragment(sk, skb, mss_now); }
> +
> +static inline unsigned int tso_size_limit(u32 window, unsigned int
> +mss_now, unsigned int cwnd) {
> + u32 cwnd_len;
> +
> + cwnd_len = mss_now * cwnd;
> + return min(window, cwnd_len);
> +}
> +
> /* This routine writes packets to the network. It advances the
> * send_head. This happens as incoming acks open up the remote
> * window for us.
> @@ -734,57 +928,167 @@ unsigned int tcp_current_mss(struct sock
> * Returns 1, if no segments are in flight and we have
> queued segments, but
> * cannot send anything now because of SWS or another problem.
> */
> -int tcp_write_xmit(struct sock *sk, int nonagle)
> +static int tcp_write_xmit(struct sock *sk, int nonagle)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> - unsigned int mss_now;
> + unsigned int mss_now, cwnd_quota, sent_pkts, tso_segs;
> + struct sk_buff *skb;
>
> /* If we are closed, the bytes will have to remain here.
> * In time closedown will finish, we empty the write
> queue and all
> * will be happy.
> */
> - if (sk->sk_state != TCP_CLOSE) {
> - struct sk_buff *skb;
> - int sent_pkts = 0;
> + if (unlikely(sk->sk_state == TCP_CLOSE))
> + return 0;
>
> - /* Account for SACKS, we may need to fragment
> due to this.
> - * It is just like the real MSS changing on us
> midstream.
> - * We also handle things correctly when the
> user adds some
> - * IP options mid-stream. Silly to do, but cover it.
> - */
> - mss_now = tcp_current_mss(sk, 1);
> + /* Account for SACKS, we may need to fragment due to this.
> + * It is just like the real MSS changing on us midstream.
> + * We also handle things correctly when the user adds some
> + * IP options mid-stream. Silly to do, but cover it.
> + */
> + mss_now = tcp_current_mss(sk, 1);
> + skb = sk->sk_send_head;
> + if (unlikely(!skb))
> + return 0;
> +
> + tso_segs = tcp_init_tso_segs(sk, skb);
> + cwnd_quota = tcp_cwnd_test(tp, skb);
> + sent_pkts = 0;
> +
> + while (cwnd_quota) {
> + u32 end_seq, window_seq;
> +
> + if (!tcp_nagle_test(tp, skb, mss_now,
> + (tcp_skb_is_last(sk, skb) ?
> + nonagle : TCP_NAGLE_PUSH)))
> + break;
> +
> + end_seq = TCP_SKB_CB(skb)->end_seq;
> + window_seq = tp->snd_una + tp->snd_wnd;
> + if (skb->len > mss_now)
> + end_seq = TCP_SKB_CB(skb)->seq + mss_now;
> + if (after(end_seq, window_seq))
> + break;
> +
> + BUG_ON(!tso_segs);
> +
> + if (tso_segs > 1) {
> + u32 limit = tso_size_limit(window_seq -
> + TCP_SKB_CB(skb)->seq,
> + mss_now, cwnd_quota);
>
> - while ((skb = sk->sk_send_head) &&
> - tcp_snd_test(sk, skb, mss_now,
> - tcp_skb_is_last(sk, skb) ? nonagle :
> -
> TCP_NAGLE_PUSH)) {
> - if (skb->len > mss_now) {
> - if (tcp_fragment(sk, skb, mss_now))
> + if (skb->len > limit) {
> + if (tso_fragment(sk, skb, limit))
> break;
> }
> -
> - TCP_SKB_CB(skb)->when = tcp_time_stamp;
> - tcp_tso_set_push(skb);
> - if (tcp_transmit_skb(sk, skb_clone(skb,
> GFP_ATOMIC)))
> + } else if (skb->len > mss_now) {
> + if (tcp_fragment(sk, skb, mss_now))
> break;
> + }
>
> - /* Advance the send_head. This one is sent out.
> - * This call will increment packets_out.
> - */
> - update_send_head(sk, tp, skb);
> + TCP_SKB_CB(skb)->when = tcp_time_stamp;
> + tcp_tso_set_push(skb);
> + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
> + break;
>
> - tcp_minshall_update(tp, mss_now, skb);
> - sent_pkts = 1;
> - }
> + /* Advance the send_head. This one is sent out.
> + * This call will increment packets_out.
> + */
> + update_send_head(sk, tp, skb);
>
> - if (sent_pkts) {
> - tcp_cwnd_validate(sk, tp);
> - return 0;
> + tcp_minshall_update(tp, mss_now, skb);
> + sent_pkts++;
> +
> + cwnd_quota -= tcp_skb_pcount(skb);
> + skb = sk->sk_send_head;
> + if (!skb)
> + break;
> + tso_segs = tcp_init_tso_segs(sk, skb);
> + }
> +
> + if (sent_pkts) {
> + tcp_cwnd_validate(sk, tp);
> + return 0;
> + }
> +
> + return !tp->packets_out && sk->sk_send_head; }
> +
> +/* Push out any pending frames which were held back due to
> + * TCP_CORK or attempt at coalescing tiny packets.
> + * The socket must be locked by the caller.
> + */
> +void __tcp_push_pending_frames(struct sock *sk,
> + struct tcp_sock *tp,
> + unsigned int cur_mss,
> + int nonagle)
> +{
> + struct sk_buff *skb = sk->sk_send_head;
> +
> + if (skb) {
> + if (!tcp_skb_is_last(sk, skb))
> + nonagle = TCP_NAGLE_PUSH;
> + if (tcp_write_xmit(sk, nonagle))
> + tcp_check_probe_timer(sk, tp);
> + }
> + tcp_cwnd_validate(sk, tp);
> +}
> +
> +/* As ACKs arrive and the send and congestion windows potentially
> + * open up, we call this to try and make write queue transmit
> + * progress.
> + *
> + * The caller has the socket locked, and has verified that
> + * sk->sk_send_head is not NULL.
> + */
> +void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) {
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> + if (tcp_write_xmit(sk, tp->nonagle))
> + tcp_check_probe_timer(sk, tp);
> +}
> +
> +/* Send _single_ skb sitting at the send head. This function requires
> + * true push pending frames to setup probe timer etc. Caller makes
> + * sure that sk->sk_send_head is non-NULL.
> + */
> +void tcp_push_one(struct sock *sk, unsigned int cur_mss) {
> + struct tcp_sock *tp = tcp_sk(sk);
> + struct sk_buff *skb = sk->sk_send_head;
> + unsigned int tso_segs, cwnd_quota;
> +
> + tso_segs = tcp_init_tso_segs(sk, skb);
> + cwnd_quota = tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH);
> + if (cwnd_quota) {
> + u32 window_seq;
> +
> + window_seq = tp->snd_una + tp->snd_wnd;
> + BUG_ON(!tso_segs);
> +
> + if (tso_segs > 1) {
> + u32 limit = tso_size_limit(window_seq -
> + TCP_SKB_CB(skb)->seq,
> + cur_mss, cwnd_quota);
> +
> + if (skb->len > limit) {
> + if (tso_fragment(sk, skb, limit))
> + return;
> + }
> + } else if (skb->len > cur_mss) {
> + if (tcp_fragment(sk, skb, cur_mss))
> + return;
> }
>
> - return !tp->packets_out && sk->sk_send_head;
> + /* Send it out now. */
> + TCP_SKB_CB(skb)->when = tcp_time_stamp;
> + tcp_tso_set_push(skb);
> + if (!tcp_transmit_skb(sk, skb_clone(skb,
> sk->sk_allocation))) {
> + update_send_head(sk, tp, skb);
> + return;
> + }
> }
> - return 0;
> }
>
> /* This function returns the amount that we can raise the @@
> -1041,12 +1345,6 @@ int tcp_retransmit_skb(struct sock *sk,
> if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
> BUG();
>
> - if (sk->sk_route_caps & NETIF_F_TSO) {
> - sk->sk_route_caps &= ~NETIF_F_TSO;
> - sock_set_flag(sk, SOCK_NO_LARGESEND);
> - tp->mss_cache = tp->mss_cache_std;
> - }
> -
> if (tcp_trim_head(sk, skb, tp->snd_una -
> TCP_SKB_CB(skb)->seq))
> return -ENOMEM;
> }
> @@ -1671,13 +1969,6 @@ int tcp_write_wakeup(struct sock *sk)
> TCP_SKB_CB(skb)->flags |=
> TCPCB_FLAG_PSH;
> if (tcp_fragment(sk, skb, seg_size))
> return -1;
> - /* SWS override triggered
> forced fragmentation.
> - * Disable TSO, the connection
> is too sick. */
> - if (sk->sk_route_caps & NETIF_F_TSO) {
> - sock_set_flag(sk,
> SOCK_NO_LARGESEND);
> - sk->sk_route_caps &=
> ~NETIF_F_TSO;
> - tp->mss_cache =
> tp->mss_cache_std;
> - }
> } else if (!tcp_skb_pcount(skb))
> tcp_set_skb_tso_segs(sk, skb);
>
> --- 1/net/ipv4/tcp.c.~1~ 2005-05-16 22:29:56.000000000 -0700
> +++ 2/net/ipv4/tcp.c 2005-05-17 12:07:26.000000000 -0700
> @@ -634,7 +634,7 @@ static ssize_t do_tcp_sendpages(struct s
> size_t psize, int flags)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> - int mss_now;
> + int mss_now, size_goal;
> int err;
> ssize_t copied;
> long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
> @@ -647,6 +647,7 @@ static ssize_t do_tcp_sendpages(struct s
> clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
>
> mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
> + size_goal = tp->xmit_size_goal;
> copied = 0;
>
> err = -EPIPE;
> @@ -660,7 +661,7 @@ static ssize_t do_tcp_sendpages(struct s
> int offset = poffset % PAGE_SIZE;
> int size = min_t(size_t, psize, PAGE_SIZE - offset);
>
> - if (!sk->sk_send_head || (copy = mss_now -
> skb->len) <= 0) {
> + if (!sk->sk_send_head || (copy = size_goal -
> skb->len) <= 0) {
> new_segment:
> if (!sk_stream_memory_free(sk))
> goto wait_for_sndbuf;
> @@ -671,7 +672,7 @@ new_segment:
> goto wait_for_memory;
>
> skb_entail(sk, tp, skb);
> - copy = mss_now;
> + copy = size_goal;
> }
>
> if (copy > size)
> @@ -712,7 +713,7 @@ new_segment:
> if (!(psize -= copy))
> goto out;
>
> - if (skb->len != mss_now || (flags & MSG_OOB))
> + if (skb->len != size_goal || (flags & MSG_OOB))
> continue;
>
> if (forced_push(tp)) {
> @@ -732,6 +733,7 @@ wait_for_memory:
> goto do_error;
>
> mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
> + size_goal = tp->xmit_size_goal;
> }
>
> out:
> @@ -773,15 +775,11 @@ ssize_t tcp_sendpage(struct socket *sock
>
> static inline int select_size(struct sock *sk, struct
> tcp_sock *tp) {
> - int tmp = tp->mss_cache_std;
> + int tmp = tp->mss_cache;
>
> - if (sk->sk_route_caps & NETIF_F_SG) {
> - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
> + if (sk->sk_route_caps & NETIF_F_SG)
> + tmp = 0;
>
> - if (tmp >= pgbreak &&
> - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
> - tmp = pgbreak;
> - }
> return tmp;
> }
>
> @@ -792,7 +790,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru
> struct tcp_sock *tp = tcp_sk(sk);
> struct sk_buff *skb;
> int iovlen, flags;
> - int mss_now;
> + int mss_now, size_goal;
> int err, copied;
> long timeo;
>
> @@ -811,6 +809,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru
> clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
>
> mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
> + size_goal = tp->xmit_size_goal;
>
> /* Ok commence sending. */
> iovlen = msg->msg_iovlen;
> @@ -833,7 +832,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru
> skb = sk->sk_write_queue.prev;
>
> if (!sk->sk_send_head ||
> - (copy = mss_now - skb->len) <= 0) {
> + (copy = size_goal - skb->len) <= 0) {
>
> new_segment:
> /* Allocate new segment. If the
> interface is SG, @@ -856,7 +855,7 @@ new_segment:
> skb->ip_summed = CHECKSUM_HW;
>
> skb_entail(sk, tp, skb);
> - copy = mss_now;
> + copy = size_goal;
> }
>
> /* Try to append data to the end of
> skb. */ @@ -891,11 +890,6 @@ new_segment:
> tcp_mark_push(tp, skb);
> goto new_segment;
> } else if (page) {
> - /* If page is cached, align
> - * offset to L1 cache boundary
> - */
> - off = (off +
> L1_CACHE_BYTES - 1) &
> - ~(L1_CACHE_BYTES - 1);
> if (off == PAGE_SIZE) {
> put_page(page);
> TCP_PAGE(sk) =
> page = NULL;
> @@ -956,7 +950,7 @@ new_segment:
> if ((seglen -= copy) == 0 && iovlen == 0)
> goto out;
>
> - if (skb->len != mss_now || (flags & MSG_OOB))
> + if (skb->len != size_goal || (flags & MSG_OOB))
> continue;
>
> if (forced_push(tp)) {
> @@ -976,6 +970,7 @@ wait_for_memory:
> goto do_error;
>
> mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
> + size_goal = tp->xmit_size_goal;
> }
> }
>
> @@ -2135,7 +2130,7 @@ void tcp_get_info(struct sock *sk, struc
>
> info->tcpi_rto = jiffies_to_usecs(tp->rto);
> info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
> - info->tcpi_snd_mss = tp->mss_cache_std;
> + info->tcpi_snd_mss = tp->mss_cache;
> info->tcpi_rcv_mss = tp->ack.rcv_mss;
>
> info->tcpi_unacked = tp->packets_out;
> @@ -2185,7 +2180,7 @@ int tcp_getsockopt(struct sock *sk, int
>
> switch (optname) {
> case TCP_MAXSEG:
> - val = tp->mss_cache_std;
> + val = tp->mss_cache;
> if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE
> | TCPF_LISTEN)))
> val = tp->rx_opt.user_mss;
> break;
> --- 1/net/ipv4/tcp_input.c.~1~ 2005-05-16
> 22:29:56.000000000 -0700
> +++ 2/net/ipv4/tcp_input.c 2005-05-17 16:49:26.000000000 -0700
> @@ -805,10 +805,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp,
> __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
>
> if (!cwnd) {
> - if (tp->mss_cache_std > 1460)
> + if (tp->mss_cache > 1460)
> cwnd = 2;
> else
> - cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
> + cwnd = (tp->mss_cache > 1095) ? 3 : 4;
> }
> return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@
> -974,14 +974,6 @@ tcp_sacktag_write_queue(struct sock *sk,
> int flag = 0;
> int i;
>
> - /* So, SACKs for already sent large segments will be lost.
> - * Not good, but alternative is to resegment the queue. */
> - if (sk->sk_route_caps & NETIF_F_TSO) {
> - sk->sk_route_caps &= ~NETIF_F_TSO;
> - sock_set_flag(sk, SOCK_NO_LARGESEND);
> - tp->mss_cache = tp->mss_cache_std;
> - }
> -
> if (!tp->sacked_out)
> tp->fackets_out = 0;
> prior_fackets = tp->fackets_out;
> @@ -1038,6 +1030,15 @@ tcp_sacktag_write_queue(struct sock *sk,
> if(!before(TCP_SKB_CB(skb)->seq, end_seq))
> break;
>
> + /* Even if mincing a TSO frame fails, we
> + * continue anyways. We will end up with
> + * more coarse SACK information, but it is
> + * better than ignoring all the SACK information
> + * altogether.
> + */
> + if (tcp_skb_pcount(skb) > 1)
> + tcp_tso_mince(sk, tp, skb);
> +
> fack_count += tcp_skb_pcount(skb);
>
> in_sack = !after(start_seq,
> TCP_SKB_CB(skb)->seq) && @@ -1142,7 +1143,7 @@
> tcp_sacktag_write_queue(struct sock *sk,
> (IsFack(tp) ||
> !before(lost_retrans,
> TCP_SKB_CB(skb)->ack_seq +
> tp->reordering *
> - tp->mss_cache_std))) {
> + tp->mss_cache))) {
> TCP_SKB_CB(skb)->sacked &=
> ~TCPCB_SACKED_RETRANS;
> tp->retrans_out -= tcp_skb_pcount(skb);
>
> @@ -1782,7 +1783,7 @@ static void tcp_try_to_open(struct sock
> tp->retrans_stamp = 0;
>
> if (flag&FLAG_ECE)
> - tcp_enter_cwr(tp);
> + tcp_enter_cwr(tp, 1);
>
> if (tp->ca_state != TCP_CA_CWR) {
> int state = TCP_CA_Open;
> @@ -2170,7 +2171,7 @@ static void vegas_cong_avoid(struct tcp_
> * is the cwnd during the previous RTT.
> */
> old_wnd = (tp->vegas.beg_snd_nxt -
> tp->vegas.beg_snd_una) /
> - tp->mss_cache_std;
> + tp->mss_cache;
> old_snd_cwnd = tp->vegas.beg_snd_cwnd;
>
> /* Save the extent of the current window so we
> can use this @@ -2799,19 +2800,19 @@ static void
> westwood_dupack_update(struc {
> struct tcp_sock *tp = tcp_sk(sk);
>
> - tp->westwood.accounted += tp->mss_cache_std;
> - tp->westwood.cumul_ack = tp->mss_cache_std;
> + tp->westwood.accounted += tp->mss_cache;
> + tp->westwood.cumul_ack = tp->mss_cache;
> }
>
> static inline int westwood_may_change_cumul(struct tcp_sock *tp) {
> - return (tp->westwood.cumul_ack > tp->mss_cache_std);
> + return (tp->westwood.cumul_ack > tp->mss_cache);
> }
>
> static inline void westwood_partial_update(struct tcp_sock *tp) {
> tp->westwood.accounted -= tp->westwood.cumul_ack;
> - tp->westwood.cumul_ack = tp->mss_cache_std;
> + tp->westwood.cumul_ack = tp->mss_cache;
> }
>
> static inline void westwood_complete_update(struct tcp_sock
> *tp) @@ -3952,7 +3953,7 @@ static void tcp_new_space(struct sock *s
> !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
> !tcp_memory_pressure &&
> atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
> - int sndmem = max_t(u32, tp->rx_opt.mss_clamp,
> tp->mss_cache_std) +
> + int sndmem = max_t(u32, tp->rx_opt.mss_clamp,
> tp->mss_cache) +
> MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
> demanded = max_t(unsigned int, tp->snd_cwnd,
> tp->reordering + 1);
> @@ -3975,16 +3976,6 @@ static inline void tcp_check_space(struc
> }
> }
>
> -static void __tcp_data_snd_check(struct sock *sk, struct
> sk_buff *skb) -{
> - struct tcp_sock *tp = tcp_sk(sk);
> -
> - if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una +
> tp->snd_wnd) ||
> - tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
> - tcp_write_xmit(sk, tp->nonagle))
> - tcp_check_probe_timer(sk, tp);
> -}
> -
> static __inline__ void tcp_data_snd_check(struct sock *sk) {
> struct sk_buff *skb = sk->sk_send_head;
> --- 1/net/ipv4/tcp_ipv4.c.~1~ 2005-05-16 22:29:56.000000000 -0700
> +++ 2/net/ipv4/tcp_ipv4.c 2005-05-17 11:27:14.000000000 -0700
> @@ -2060,7 +2060,8 @@ static int tcp_v4_init_sock(struct sock
> */
> tp->snd_ssthresh = 0x7fffffff; /* Infinity */
> tp->snd_cwnd_clamp = ~0;
> - tp->mss_cache_std = tp->mss_cache = 536;
> + tp->mss_cache = 536;
> + tp->xmit_size_cache = ~0;
>
> tp->reordering = sysctl_tcp_reordering;
>
> --- 1/net/ipv6/tcp_ipv6.c.~1~ 2005-05-16 22:29:56.000000000 -0700
> +++ 2/net/ipv6/tcp_ipv6.c 2005-05-17 11:27:25.000000000 -0700
> @@ -2021,7 +2021,8 @@ static int tcp_v6_init_sock(struct sock
> */
> tp->snd_ssthresh = 0x7fffffff;
> tp->snd_cwnd_clamp = ~0;
> - tp->mss_cache_std = tp->mss_cache = 536;
> + tp->mss_cache = 536;
> + tp->xmit_size_cache = ~0;
>
> tp->reordering = sysctl_tcp_reordering;
>
>
|