/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
* - Real stateful connection tracking
* - Modified state transitions table
* - Window scaling support added
* - SACK support added
*
* Willy Tarreau:
* - State table bugfixes
* - More robust state changes
* - Tuning timer parameters
*
* version 2.2
*/
#include <linux/config.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/spinlock.h>
#include <net/tcp.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/lockhelp.h>
#if 0
#define DEBUGP printk
#define DEBUGP_VARS
#else
#define DEBUGP(format, args...)
#endif
/* Protects conntrack->proto.tcp */
static DECLARE_RWLOCK(tcp_lock);
/* "Be conservative in what you do,
be liberal in what you accept from others."
If it's non-zero, we mark only out of window RST segments as INVALID. */
int ip_ct_tcp_be_liberal = 0;
/* When connection is picked up from the middle, how many packets are required
to pass in each direction when we assume we are in sync - if any side uses
window scaling, we lost the game.
If it is set to zero, we disable picking up already established
connections. */
int ip_ct_tcp_loose = 3;
/* Max number of the retransmitted packets without receiving an (acceptable)
ACK from the destination. If this number is reached, a shorter timer
will be started. */
int ip_ct_tcp_max_retrans = 3;
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR */
static const char *tcp_conntrack_names[] = {
"NONE",
"SYN_SENT",
"SYN_RECV",
"ESTABLISHED",
"FIN_WAIT",
"CLOSE_WAIT",
"LAST_ACK",
"TIME_WAIT",
"CLOSE",
"LISTEN"
};
#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS
unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS;
unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS;
unsigned long ip_ct_tcp_timeout_established = 5 DAYS;
unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS;
unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS;
unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS;
unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS;
unsigned long ip_ct_tcp_timeout_close = 10 SECS;
/* RFC1122 says the R2 limit should be at least 100 seconds.
Linux uses 15 packets as limit, which corresponds
to ~13-30min depending on RTO. */
unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS;
static unsigned long * tcp_timeouts[]
= { NULL, /* TCP_CONNTRACK_NONE */
&ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
&ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
&ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */
&ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */
&ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */
&ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */
&ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */
&ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */
NULL, /* TCP_CONNTRACK_LISTEN */
};
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
#define sES TCP_CONNTRACK_ESTABLISHED
#define sFW TCP_CONNTRACK_FIN_WAIT
#define sCW TCP_CONNTRACK_CLOSE_WAIT
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
#define sLI TCP_CONNTRACK_LISTEN
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE
/* What TCP flags are set from RST/SYN/FIN/ACK. */
enum tcp_bit_set {
TCP_SYN_SET,
TCP_SYNACK_SET,
TCP_FIN_SET,
TCP_ACK_SET,
TCP_RST_SET,
TCP_NONE_SET,
};
/*
* The TCP state transition table needs a few words...
*
* We are the man in the middle. All the packets go through us
* but might get lost in transit to the destination.
* It is assumed that the destinations can't receive segments
* we haven't seen.
*
* The checked segment is in window, but our windows are *not*
* equivalent with the ones of the sender/receiver. We always
* try to guess the state of the current sender.
*
* The meaning of the states are:
*
* NONE: initial state
* SYN_SENT: SYN-only packet seen
* SYN_RECV: SYN-ACK packet seen
* ESTABLISHED: ACK packet seen
* FIN_WAIT: FIN packet seen
* CLOSE_WAIT: ACK seen (after FIN)
* LAST_ACK: FIN seen (after FIN)
* TIME_WAIT: last ACK seen
* CLOSE: closed connection
*
* LISTEN state is not used.
*
* Packets marked as IGNORED (sIG):
* if they may be either invalid or valid
* and the receiver may send back a connection
* closing RST or a SYN/ACK.
*
* Packets marked as INVALID (sIV):
* if they are invalid
* or we do not support the request (simultaneous open)
*/
static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
{
/* ORIGINAL */
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
/*
* sNO -> sSS Initialize a new connection
* sSS -> sSS Retransmitted SYN
* sSR -> sIG Late retransmitted SYN?
* sES -> sIG Error: SYNs in window outside the SYN_SENT state
* are errors. Receiver will reply with RST
* and close the connection.
* Or we are not in sync and hold a dead connection.
* sFW -> sIG
* sCW -> sIG
* sLA -> sIG
* sTW -> sSS Reopened connection (RFC 1122).
* sCL -> sSS
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
/*
* A SYN/ACK from the client is always invalid:
* - either it tries to set up a simultaneous open, which is
* not supported;
* - or the firewall has just been inserted between the two hosts
* during the session set-up. The SYN will be retransmitted
* by the true client (or it'll time out).
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
* sNO -> sIV Too late and no reason to do anything...
* sSS -> sIV Client migth not send FIN in this state:
* we enforce waiting for a SYN/ACK reply first.
* sSR -> sFW Close started.
* sES -> sFW
* sFW -> sLA FIN seen in both directions, waiting for
* the last ACK.
* Migth be a retransmitted FIN as well...
* sCW -> sLA
* sLA -> sLA Retransmitted FIN. Remain in the same state.
* sTW -> sTW
* sCL -> sCL
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
* sNO -> sES Assumed.
* sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
* sSR -> sES Established state is reached.
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
* sCW -> sCW
* sLA -> sTW Last ACK detected.
* sTW -> sTW Retransmitted last ACK. Remain in the same state.
* sCL -> sCL
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
},
{
/* REPLY */
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
/*
* sNO -> sIV Never reached.
* sSS -> sIV Simultaneous open, not supported
* sSR -> sIV Simultaneous open, not supported.
* sES -> sIV Server may not initiate a connection.
* sFW -> sIV
* sCW -> sIV
* sLA -> sIV
* sTW -> sIV Reopened connection, but server may not do it.
* sCL -> sIV
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
/*
* sSS -> sSR Standard open.
* sSR -> sSR Retransmitted SYN/ACK.
* sES -> sIG Late retransmitted SYN/ACK?
* sFW -> sIG
* sCW -> sIG
* sLA -> sIG
* sTW -> sIG
* sCL -> sIG
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
* sSS -> sIV Server might not send FIN in this state.
* sSR -> sFW Close started.
* sES -> sFW
* sFW -> sLA FIN seen in both directions.
* sCW -> sLA
* sLA -> sLA Retransmitted FIN.
* sTW -> sTW
* sCL -> sCL
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*ack*/ { sIV, sIG, sIV, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
* sSS -> sIG Might be a half-open connection.
* sSR -> sIV Simultaneous open.
* sES -> sES :-)
* sFW -> sCW Normal close request answered by ACK.
* sCW -> sCW
* sLA -> sTW Last ACK detected.
* sTW -> sTW Retransmitted last ACK.
* sCL -> sCL
*/
/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */
/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
}
};
static int tcp_pkt_to_tuple(const struct sk_buff *skb,
unsigned int dataoff,
struct ip_conntrack_tuple *tuple)
{
struct tcphdr _hdr, *hp;
/* Actually only need first 8 bytes. */
hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
if (hp == NULL)
return 0;
tuple->src.u.tcp.port = hp->source;
tuple->dst.u.tcp.port = hp->dest;
return 1;
}
static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
const struct ip_conntrack_tuple *orig)
{
tuple->src.u.tcp.port = orig->dst.u.tcp.port;
tuple->dst.u.tcp.port = orig->src.u.tcp.port;
return 1;
}
/* Print out the per-protocol part of the tuple. */
static int tcp_print_tuple(struct seq_file *s,
const struct ip_conntrack_tuple *tuple)
{
return seq_printf(s, "sport=%hu dport=%hu ",
ntohs(tuple->src.u.tcp.port),
ntohs(tuple->dst.u.tcp.port));
}
/* Print out the private part of the conntrack. */
static int tcp_print_conntrack(struct seq_file *s,
const struct ip_conntrack *conntrack)
{
enum tcp_conntrack state;
READ_LOCK(&tcp_lock);
state = conntrack->proto.tcp.state;
READ_UNLOCK(&tcp_lock);
return seq_printf(s, "%s ", tcp_conntrack_names[state]);
}
static unsigned int get_conntrack_index(const struct tcphdr *tcph)
{
if (tcph->rst) return TCP_RST_SET;
else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
else if (tcph->fin) return TCP_FIN_SET;
else if (tcph->ack) return TCP_ACK_SET;
else return TCP_NONE_SET;
}
/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
in IP Filter' by Guido van Rooij.
http://www.nluug.nl/events/sane2000/papers.html
http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
The boundaries and the conditions are slightly changed:
td_maxend = max(sack + max(win,1)) seen in reply packets
td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
td_end = max(seq + len) seen in sent packets
I. Upper bound for valid data: seq + len <= sender.td_maxend
II. Lower bound for valid data: seq >= sender.td_end - receiver.td_maxwin
III. Upper bound for valid ack: sack <= receiver.td_end
IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW
where sack is the highest right edge of sack block found in the packet.
The upper bound limit for a valid ack is not ignored -
we doesn't have to deal with fragments.
*/
static inline __u32 segment_seq_plus_len(__u32 seq,
size_t len,
struct iphdr *iph,
struct tcphdr *tcph)
{
return (seq + len - (iph->ihl + tcph->doff)*4
+ (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
}
/* Fixme: what about big packets? */
#define MAXACKWINCONST 66000
#define MAXACKWINDOW(sender) \
((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \
: MAXACKWINCONST)
/*
* Simplified tcp_parse_options routine from tcp_input.c
*/
static void tcp_options(const struct sk_buff *skb,
struct iphdr *iph,
struct tcphdr *tcph,
struct ip_ct_tcp_state *state)
{
unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
unsigned char *ptr;
int length = (tcph->doff*4) - sizeof(struct tcphdr);
if (!length)
return;
ptr = skb_header_pointer(skb,
(iph->ihl * 4) + sizeof(struct tcphdr),
length, buff);
BUG_ON(ptr == NULL);
state->td_scale =
state->flags = 0;
while (length > 0) {
int opcode=*ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize=*ptr++;
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
break; /* don't parse partial options */
if (opcode == TCPOPT_SACK_PERM
&& opsize == TCPOLEN_SACK_PERM)
state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
else if (opcode == TCPOPT_WINDOW
&& opsize == TCPOLEN_WINDOW) {
state->td_scale = *(u_int8_t *)ptr;
if (state->td_scale > 14) {
/* See RFC1323 */
state->td_scale = 14;
}
state->flags |=
IP_CT_TCP_FLAG_WINDOW_SCALE;
}
ptr += opsize - 2;
length -= opsize;
}
}
}
static void tcp_sack(struct tcphdr *tcph, __u32 *sack)
{
__u32 tmp;
unsigned char *ptr;
int length = (tcph->doff*4) - sizeof(struct tcphdr);
/* Fast path for timestamp-only option */
if (length == TCPOLEN_TSTAMP_ALIGNED*4
&& *(__u32 *)(tcph + 1) ==
__constant_ntohl((TCPOPT_NOP << 24)
| (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8)
| TCPOLEN_TIMESTAMP))
return;
ptr = (unsigned char *)(tcph + 1);
while (length > 0) {
int opcode=*ptr++;
int opsize, i;
switch (opcode) {
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize=*ptr++;
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
break; /* don't parse partial options */
if (opcode == TCPOPT_SACK
&& opsize >= (TCPOLEN_SACK_BASE
+ TCPOLEN_SACK_PERBLOCK)
&& !((opsize - TCPOLEN_SACK_BASE)
% TCPOLEN_SACK_PERBLOCK)) {
for (i = 0;
i < (opsize - TCPOLEN_SACK_BASE);
i += TCPOLEN_SACK_PERBLOCK) {
tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
if (after(tmp, *sack))
*sack = tmp;
}
return;
}
ptr += opsize - 2;
length -= opsize;
}
}
}
static int tcp_in_window(struct ip_ct_tcp *state,
enum ip_conntrack_dir dir,
unsigned int *index,
const struct sk_buff *skb,
struct iphdr *iph,
struct tcphdr *tcph)
{
struct ip_ct_tcp_state *sender = &state->seen[dir];
struct ip_ct_tcp_state *receiver = &state->seen[!dir];
__u32 seq, ack, sack, end, win, swin;
int res;
/*
* Get the required data from the packet.
*/
seq = ntohl(tcph->seq);
ack = sack = ntohl(tcph->ack_seq);
win = ntohs(tcph->window);
end = segment_seq_plus_len(seq, skb->len, iph, tcph);
if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
tcp_sack(tcph, &sack);
DEBUGP("tcp_in_window: START\n");
DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
"seq=%u ack=%u sack=%u win=%u end=%u\n",
NIPQUAD(iph->saddr), ntohs(tcph->source),
NIPQUAD(iph->daddr), ntohs(tcph->dest),
seq, ack, sack, win, end);
DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
sender->td_scale,
receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
receiver->td_scale);
if (sender->td_end == 0) {
/*
* Initialize sender data.
*/
if (tcph->syn && tcph->ack) {
/*
* Outgoing SYN-ACK in reply to a SYN.
*/
sender->td_end =
sender->td_maxend = end;
sender->td_maxwin = (win == 0 ? 1 : win);
tcp_options(skb, iph, tcph, sender);
/*
* RFC 1323:
* Both sides must send the Window Scale option
* to enable window scaling in either direction.
*/
if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
&& receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
sender->td_scale =
receiver->td_scale = 0;
} else {
/*
* We are in the middle of a connection,
* its history is lost for us.
* Let's try to use the data from the packet.
*/
sender->td_end = end;
sender->td_maxwin = (win == 0 ? 1 : win);
sender->td_maxend = end + sender->td_maxwin;
}
} else if (((state->state == TCP_CONNTRACK_SYN_SENT
&& dir == IP_CT_DIR_ORIGINAL)
|| (state->state == TCP_CONNTRACK_SYN_RECV
&& dir == IP_CT_DIR_REPLY))
&& after(end, sender->td_end)) {
/*
* RFC 793: "if a TCP is reinitialized ... then it need
* not wait at all; it must only be sure to use sequence
* numbers larger than those recently used."
*/
sender->td_end =
sender->td_maxend = end;
sender->td_maxwin = (win == 0 ? 1 : win);
tcp_options(skb, iph, tcph, sender);
}
if (!(tcph->ack)) {
/*
* If there is no ACK, just pretend it was set and OK.
*/
ack = sack = receiver->td_end;
} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
(TCP_FLAG_ACK|TCP_FLAG_RST))
&& (ack == 0)) {
/*
* Broken TCP stacks, that set ACK in RST packets as well
* with zero ack value.
*/
ack = sack = receiver->td_end;
}
if (seq == end)
/*
* Packets contains no data: we assume it is valid
* and check the ack value only.
*/
seq = end = sender->td_end;
DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
"seq=%u ack=%u sack =%u win=%u end=%u trim=%u\n",
NIPQUAD(iph->saddr), ntohs(tcph->source),
NIPQUAD(iph->daddr), ntohs(tcph->dest),
seq, ack, sack, win, end,
after(end, sender->td_maxend) && before(seq, sender->td_maxend)
? sender->td_maxend : end);
DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
sender->td_scale,
receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
receiver->td_scale);
/* Ignore data over the right edge of the receiver's window. */
if (after(end, sender->td_maxend) &&
before(seq, sender->td_maxend)) {
end = sender->td_maxend;
if (*index == TCP_FIN_SET)
*index = TCP_ACK_SET;
}
DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
before(end, sender->td_maxend + 1)
|| before(seq, sender->td_maxend + 1),
after(seq, sender->td_end - receiver->td_maxwin - 1)
|| after(end, sender->td_end - receiver->td_maxwin - 1),
before(sack, receiver->td_end + 1),
after(ack, receiver->td_end - MAXACKWINDOW(sender)));
if (sender->loose || receiver->loose ||
(before(end, sender->td_maxend + 1) &&
after(seq, sender->td_end - receiver->td_maxwin - 1) &&
before(sack, receiver->td_end + 1) &&
after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
/*
* Take into account window scaling (RFC 1323).
*/
if (!tcph->syn)
win <<= sender->td_scale;
/*
* Update sender data.
*/
swin = win + (sack - ack);
if (sender->td_maxwin < swin)
sender->td_maxwin = swin;
if (after(end, sender->td_end))
sender->td_end = end;
if (after(sack + win, receiver->td_maxend - 1)) {
receiver->td_maxend = sack + win;
if (win == 0)
receiver->td_maxend++;
}
/*
* Check retransmissions.
*/
if (*index == TCP_ACK_SET) {
if (state->last_dir == dir
&& state->last_seq == seq
&& state->last_ack == ack
&& state->last_end == end)
state->retrans++;
else {
state->last_dir = dir;
state->last_seq = seq;
state->last_ack = ack;
state->last_end = end;
state->retrans = 0;
}
}
/*
* Close the window of disabled window tracking :-)
*/
if (sender->loose)
sender->loose--;
res = 1;
} else {
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: %s ",
before(end, sender->td_maxend + 1) ?
after(seq, sender->td_end - receiver->td_maxwin - 1) ?
before(sack, receiver->td_end + 1) ?
after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
: "ACK is under the lower bound (possibly overly delayed ACK)"
: "ACK is over the upper bound (ACKed data has never seen yet)"
: "SEQ is under the lower bound (retransmitted already ACKed data)"
: "SEQ is over the upper bound (over the window of the receiver)");
res = ip_ct_tcp_be_liberal && !tcph->rst;
}
DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
"receiver end=%u maxend=%u maxwin=%u\n",
res, sender->td_end, sender->td_maxend, sender->td_maxwin,
receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
return res;
}
#ifdef CONFIG_IP_NF_NAT_NEEDED
/* Update sender->td_end after NAT successfully mangled the packet */
void ip_conntrack_tcp_update(struct sk_buff *skb,
struct ip_conntrack *conntrack,
enum ip_conntrack_dir dir)
{
struct iphdr *iph = skb->nh.iph;
struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
__u32 end;
#ifdef DEBUGP_VARS
struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
#endif
end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
WRITE_LOCK(&tcp_lock);
/*
* We have to worry for the ack in the reply packet only...
*/
if (after(end, conntrack->proto.tcp.seen[dir].td_end))
conntrack->proto.tcp.seen[dir].td_end = end;
conntrack->proto.tcp.last_end = end;
WRITE_UNLOCK(&tcp_lock);
DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
sender->td_scale,
receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
receiver->td_scale);
}
#endif
#define TH_FIN 0x01
#define TH_SYN 0x02
#define TH_RST 0x04
#define TH_PUSH 0x08
#define TH_ACK 0x10
#define TH_URG 0x20
#define TH_ECE 0x40
#define TH_CWR 0x80
/* table of valid flag combinations - ECE and CWR are always valid */
static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
{
[TH_SYN] = 1,
[TH_SYN|TH_ACK] = 1,
[TH_RST] = 1,
[TH_RST|TH_ACK] = 1,
[TH_RST|TH_ACK|TH_PUSH] = 1,
[TH_FIN|TH_ACK] = 1,
[TH_ACK] = 1,
[TH_ACK|TH_PUSH] = 1,
[TH_ACK|TH_URG] = 1,
[TH_ACK|TH_URG|TH_PUSH] = 1,
[TH_FIN|TH_ACK|TH_PUSH] = 1,
[TH_FIN|TH_ACK|TH_URG] = 1,
[TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
};
/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
static int tcp_error(struct sk_buff *skb,
enum ip_conntrack_info *ctinfo,
unsigned int hooknum)
{
struct iphdr *iph = skb->nh.iph;
struct tcphdr _tcph, *th;
unsigned int tcplen = skb->len - iph->ihl * 4;
u_int8_t tcpflags;
/* Smaller that minimal TCP header? */
th = skb_header_pointer(skb, iph->ihl * 4,
sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: short packet ");
return -NF_ACCEPT;
}
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
/* Checksum invalid? Ignore.
* We skip checking packets on the outgoing path
* because the semantic of CHECKSUM_HW is different there
* and moreover root might send raw packets.
*/
/* FIXME: Source route IP option packets --RR */
if (hooknum == NF_IP_PRE_ROUTING
&& csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
skb->ip_summed == CHECKSUM_HW ? skb->csum
: skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
/* Check TCP flags. */
tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
return NF_ACCEPT;
}
/* Returns verdict for packet, or -1 for invalid. */
static int tcp_packet(struct ip_conntrack *conntrack,
const struct sk_buff *skb,
enum ip_conntrack_info ctinfo)
{
enum tcp_conntrack new_state, old_state;
enum ip_conntrack_dir dir;
struct iphdr *iph = skb->nh.iph;
struct tcphdr *th, _tcph;
unsigned long timeout;
unsigned int index;
th = skb_header_pointer(skb, iph->ihl * 4,
sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
WRITE_LOCK(&tcp_lock);
old_state = conntrack->proto.tcp.state;
dir = CTINFO2DIR(ctinfo);
index = get_conntrack_index(th);
new_state = tcp_conntracks[dir][index][old_state];
switch (new_state) {
case TCP_CONNTRACK_IGNORE:
/* Either SYN in ORIGINAL
* or SYN/ACK in REPLY
* or ACK in REPLY direction (half-open connection). */
if (index == TCP_SYNACK_SET
&& conntrack->proto.tcp.last_index == TCP_SYN_SET
&& conntrack->proto.tcp.last_dir != dir
&& after(ntohl(th->ack_seq),
conntrack->proto.tcp.last_seq)) {
/* This SYN/ACK acknowledges a SYN that we earlier
* ignored as invalid. This means that the client and
* the server are both in sync, while the firewall is
* not. We kill this session and block the SYN/ACK so
* that the client cannot but retransmit its SYN and
* thus initiate a clean new session.
*/
WRITE_UNLOCK(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: killing out of sync session ");
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
return -NF_DROP;
}
conntrack->proto.tcp.last_index = index;
conntrack->proto.tcp.last_dir = dir;
conntrack->proto.tcp.last_seq = ntohl(th->seq);
WRITE_UNLOCK(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: invalid packet ignored ");
return NF_ACCEPT;
case TCP_CONNTRACK_MAX:
/* Invalid packet */
DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
dir, get_conntrack_index(th),
old_state);
WRITE_UNLOCK(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_SYN_SENT:
if (old_state >= TCP_CONNTRACK_TIME_WAIT) {
/* Attempt to reopen a closed connection.
* Delete this connection and look up again. */
WRITE_UNLOCK(&tcp_lock);
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
return -NF_REPEAT;
}
break;
case TCP_CONNTRACK_CLOSE:
if (index == TCP_RST_SET
&& ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
&& conntrack->proto.tcp.last_index <= TCP_SYNACK_SET)
|| (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
&& conntrack->proto.tcp.last_index == TCP_ACK_SET))
&& after(ntohl(th->ack_seq),
conntrack->proto.tcp.last_seq)) {
/* Ignore RST closing down invalid SYN or ACK
we had let trough. */
WRITE_UNLOCK(&tcp_lock);
if (LOG_INVALID(IPPROTO_TCP))
nf_log_packet(PF_INET, 0, skb, NULL, NULL,
"ip_ct_tcp: invalid RST (ignored) ");
return NF_ACCEPT;
}
/* Just fall trough */
default:
/* Keep compilers happy. */
break;
}
if (!tcp_in_window(&conntrack->proto.tcp, dir, &index,
skb, iph, th)) {
WRITE_UNLOCK(&tcp_lock);
return -NF_ACCEPT;
}
/* From now on we have got in-window packets */
/* If FIN was trimmed off, we don't change state. */
conntrack->proto.tcp.last_index = index;
new_state = tcp_conntracks[dir][index][old_state];
DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
"syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
NIPQUAD(iph->saddr), ntohs(th->source),
NIPQUAD(iph->daddr), ntohs(th->dest),
(th->syn ? 1 : 0), (th->ack ? 1 : 0),
(th->fin ? 1 : 0), (th->rst ? 1 : 0),
old_state, new_state);
conntrack->proto.tcp.state = new_state;
timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
&& *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
WRITE_UNLOCK(&tcp_lock);
if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
/* If only reply is a RST, we can consider ourselves not to
have an established connection: this is a fairly common
problem case, so we can delete the conntrack
immediately. --RR */
if (th->rst) {
if (del_timer(&conntrack->timeout))
conntrack->timeout.function((unsigned long)
conntrack);
return NF_ACCEPT;
}
} else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
&& new_state == TCP_CONNTRACK_ESTABLISHED) {
/* Set ASSURED if we see see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
set_bit(IPS_ASSURED_BIT, &conntrack->status);
}
ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
static int tcp_new(struct ip_conntrack *conntrack,
const struct sk_buff *skb)
{
enum tcp_conntrack new_state;
struct iphdr *iph = skb->nh.iph;
struct tcphdr *th, _tcph;
#ifdef DEBUGP_VARS
struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
#endif
th = skb_header_pointer(skb, iph->ihl * 4,
sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
/* Don't need lock here: this conntrack not in circulation yet */
new_state
= tcp_conntracks[0][get_conntrack_index(th)]
[TCP_CONNTRACK_NONE];
/* Invalid: delete conntrack */
if (new_state >= TCP_CONNTRACK_MAX) {
DEBUGP("ip_ct_tcp: invalid new deleting.\n");
return 0;
}
if (new_state == TCP_CONNTRACK_SYN_SENT) {
/* SYN packet */
conntrack->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
iph, th);
conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
conntrack->proto.tcp.seen[0].td_maxwin = 1;
conntrack->proto.tcp.seen[0].td_maxend =
conntrack->proto.tcp.seen[0].td_end;
tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
conntrack->proto.tcp.seen[1].flags = 0;
conntrack->proto.tcp.seen[0].loose =
conntrack->proto.tcp.seen[1].loose = 0;
} else if (ip_ct_tcp_loose == 0) {
/* Don't try to pick up connections. */
return 0;
} else {
/*
* We are in the middle of a connection,
* its history is lost for us.
* Let's try to use the data from the packet.
*/
conntrack->proto.tcp.seen[0].td_end =
segment_seq_plus_len(ntohl(th->seq), skb->len,
iph, th);
conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
conntrack->proto.tcp.seen[0].td_maxwin = 1;
conntrack->proto.tcp.seen[0].td_maxend =
conntrack->proto.tcp.seen[0].td_end +
conntrack->proto.tcp.seen[0].td_maxwin;
conntrack->proto.tcp.seen[0].td_scale = 0;
/* We assume SACK. Should we assume window scaling too? */
conntrack->proto.tcp.seen[0].flags =
conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
conntrack->proto.tcp.seen[0].loose =
conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
}
conntrack->proto.tcp.seen[1].td_end = 0;
conntrack->proto.tcp.seen[1].td_maxend = 0;
conntrack->proto.tcp.seen[1].td_maxwin = 1;
conntrack->proto.tcp.seen[1].td_scale = 0;
/* tcp_packet will set them */
conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
conntrack->proto.tcp.last_index = TCP_NONE_SET;
DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
"receiver end=%u maxend=%u maxwin=%u scale=%i\n",
sender->td_end, sender->td_maxend, sender->td_maxwin,
sender->td_scale,
receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
receiver->td_scale);
return 1;
}
struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
{
.proto = IPPROTO_TCP,
.name = "tcp",
.pkt_to_tuple = tcp_pkt_to_tuple,
.invert_tuple = tcp_invert_tuple,
.print_tuple = tcp_print_tuple,
.print_conntrack = tcp_print_conntrack,
.packet = tcp_packet,
.new = tcp_new,
.error = tcp_error,
};