From f373b53b5fe67aa4a6f28f921a529cc90f88e79b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Oct 2009 00:16:19 +0000 Subject: tcp: replace ehash_size by ehash_mask Storing the mask (size - 1) instead of the size allows fast path to be a bit faster. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7cda24b53f61..99718703d040 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2000,7 +2000,7 @@ static void *established_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; @@ -2061,10 +2061,10 @@ get_tw: st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ - while (++st->bucket < tcp_hashinfo.ehash_size && + while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; - if (st->bucket >= tcp_hashinfo.ehash_size) + if (st->bucket > tcp_hashinfo.ehash_mask) return NULL; spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); -- cgit v1.2.2 From c720c7e8383aff1cb219bddf474ed89d850336e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Oct 2009 06:30:45 +0000 Subject: inet: rename some inet_sock fields In order to have better cache layouts of struct sock (separate zones for rx/tx paths), we need this preliminary patch. Goal is to transfert fields used at lookup time in the first read-mostly cache line (inside struct sock_common) and move sk_refcnt to a separate cache line (only written by rx path) This patch adds inet_ prefix to daddr, rcv_saddr, dport, num, saddr, sport and id fields. This allows a future patch to define these fields as macros, like sk_refcnt, without name clashes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 70 +++++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 99718703d040..a4a3390a5287 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -165,10 +165,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = inet->opt->faddr; } - tmp = ip_route_connect(&rt, nexthop, inet->saddr, + tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - inet->sport, usin->sin_port, sk, 1); + inet->inet_sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); @@ -183,11 +183,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->opt || !inet->opt->srr) daddr = rt->rt_dst; - if (!inet->saddr) - inet->saddr = rt->rt_src; - inet->rcv_saddr = inet->saddr; + if (!inet->inet_saddr) + inet->inet_saddr = rt->rt_src; + inet->inet_rcv_saddr = inet->inet_saddr; - if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { + if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; @@ -210,8 +210,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } } - inet->dport = usin->sin_port; - inet->daddr = daddr; + inet->inet_dport = usin->sin_port; + inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) @@ -230,7 +230,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; err = ip_route_newports(&rt, IPPROTO_TCP, - inet->sport, inet->dport, sk); + inet->inet_sport, inet->inet_dport, sk); if (err) goto failure; @@ -239,12 +239,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) - tp->write_seq = secure_tcp_sequence_number(inet->saddr, - inet->daddr, - inet->sport, + tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, usin->sin_port); - inet->id = tp->write_seq ^ jiffies; + inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk); rt = NULL; @@ -261,7 +261,7 @@ failure: tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; - inet->dport = 0; + inet->inet_dport = 0; return err; } @@ -520,12 +520,13 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(len, inet->saddr, - inet->daddr, 0); + th->check = ~tcp_v4_check(len, inet->inet_saddr, + inet->inet_daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { - th->check = tcp_v4_check(len, inet->saddr, inet->daddr, + th->check = tcp_v4_check(len, inet->inet_saddr, + inet->inet_daddr, csum_partial(th, th->doff << 2, skb->csum)); @@ -848,7 +849,7 @@ static struct tcp_md5sig_key * struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, struct sock *addr_sk) { - return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr); + return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); } EXPORT_SYMBOL(tcp_v4_md5_lookup); @@ -923,7 +924,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add); static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, u8 *newkey, u8 newkeylen) { - return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr, + return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, newkey, newkeylen); } @@ -1089,8 +1090,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, daddr; if (sk) { - saddr = inet_sk(sk)->saddr; - daddr = inet_sk(sk)->daddr; + saddr = inet_sk(sk)->inet_saddr; + daddr = inet_sk(sk)->inet_daddr; } else if (req) { saddr = inet_rsk(req)->loc_addr; daddr = inet_rsk(req)->rmt_addr; @@ -1380,9 +1381,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->daddr = ireq->rmt_addr; - newinet->rcv_saddr = ireq->loc_addr; - newinet->saddr = ireq->loc_addr; + newinet->inet_daddr = ireq->rmt_addr; + newinet->inet_rcv_saddr = ireq->loc_addr; + newinet->inet_saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; newinet->mc_index = inet_iif(skb); @@ -1390,7 +1391,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; - newinet->id = newtp->write_seq ^ jiffies; + newinet->inet_id = newtp->write_seq ^ jiffies; tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1403,7 +1404,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) { + key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); + if (key != NULL) { /* * We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1412,7 +1414,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); if (newkey != NULL) - tcp_v4_md5_do_add(newsk, newinet->daddr, + tcp_v4_md5_do_add(newsk, newinet->inet_daddr, newkey, key->keylen); newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; } @@ -1711,8 +1713,8 @@ int tcp_v4_remember_stamp(struct sock *sk) struct inet_peer *peer = NULL; int release_it = 0; - if (!rt || rt->rt_dst != inet->daddr) { - peer = inet_getpeer(inet->daddr, 1); + if (!rt || rt->rt_dst != inet->inet_daddr) { + peer = inet_getpeer(inet->inet_daddr, 1); release_it = 1; } else { if (!rt->peer) @@ -2225,7 +2227,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", i, ireq->loc_addr, - ntohs(inet_sk(sk)->sport), + ntohs(inet_sk(sk)->inet_sport), ireq->rmt_addr, ntohs(ireq->rmt_port), TCP_SYN_RECV, @@ -2248,10 +2250,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - __be32 dest = inet->daddr; - __be32 src = inet->rcv_saddr; - __u16 destp = ntohs(inet->dport); - __u16 srcp = ntohs(inet->sport); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; -- cgit v1.2.2 From 022c3f7d82f0f1c68018696f2f027b87b9bb45c2 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Oct 2009 04:15:22 +0000 Subject: Allow tcp_parse_options to consult dst entry We need tcp_parse_options to be aware of dst_entry to take into account per dst_entry TCP options settings Signed-off-by: Gilad Ben-Yossef Sigend-off-by: Ori Finkelman Sigend-off-by: Yony Amit Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a4a3390a5287..657ae334f125 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1257,11 +1257,21 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; #endif + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(sk, skb); + + dst = inet_csk_route_req(sk, req); + if(!dst) + goto drop_and_free; + tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = 536; tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0); + tcp_parse_options(skb, &tmp_opt, 0, dst); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1270,14 +1280,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_openreq_init(req, &tmp_opt, skb); - ireq = inet_rsk(req); - ireq->loc_addr = daddr; - ireq->rmt_addr = saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(sk, skb); - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; + goto drop_and_release; if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); @@ -1302,7 +1306,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && - (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && -- cgit v1.2.2 From bee7ca9ec03a26676ea2b1c28dc4039348eff3e1 Mon Sep 17 00:00:00 2001 From: William Allen Simpson Date: Tue, 10 Nov 2009 09:51:18 +0000 Subject: net: TCP_MSS_DEFAULT, TCP_MSS_DESIRED Define two symbols needed in both kernel and user space. Remove old (somewhat incorrect) kernel variant that wasn't used in most cases. Default should apply to both RMSS and SMSS (RFC2581). Replace numeric constants with defined symbols. Stand-alone patch, originally developed for TCPCT. Signed-off-by: William.Allen.Simpson@gmail.com Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 657ae334f125..cf7f2086e6e0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -217,7 +217,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (inet->opt) inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; - tp->rx_opt.mss_clamp = 536; + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket @@ -1268,7 +1268,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = 536; + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; tcp_parse_options(skb, &tmp_opt, 0, dst); @@ -1815,7 +1815,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; + tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; -- cgit v1.2.2 From 2c1409a0a2b88585ec0c03f1de0aafa178c56313 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Nov 2009 09:33:09 +0000 Subject: inetpeer: Optimize inet_getid() While investigating for network latencies, I found inet_getid() was a contention point for some workloads, as inet_peer_idlock is shared by all inet_getid() users regardless of peers. One way to fix this is to make ip_id_count an atomic_t instead of __u16, and use atomic_add_return(). In order to keep sizeof(struct inet_peer) = 64 on 64bit arches tcp_ts_stamp is also converted to __u32 instead of "unsigned long". Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cf7f2086e6e0..df18ce04f41e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * when trying new connection. */ if (peer != NULL && - peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { + (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } @@ -1308,7 +1308,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { - if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1727,9 +1727,9 @@ int tcp_v4_remember_stamp(struct sock *sk) if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; } if (release_it) @@ -1748,9 +1748,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); -- cgit v1.2.2 From e6b4d11367519bc71729c09d05a126b133c755be Mon Sep 17 00:00:00 2001 From: William Allen Simpson Date: Wed, 2 Dec 2009 18:07:39 +0000 Subject: TCPCT part 1a: add request_values parameter for sending SYNACK Add optional function parameters associated with sending SYNACK. These parameters are not needed after sending SYNACK, and are not used for retransmission. Avoids extending struct tcp_request_sock, and avoids allocating kernel memory. Also affects DCCP as it uses common struct request_sock_ops, but this parameter is currently reserved for future use. Signed-off-by: William.Allen.Simpson@gmail.com Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index df18ce04f41e..649a36d99c73 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -742,8 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -753,7 +754,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req); + skb = tcp_make_synack(sk, dst, req, rvp); if (skb) { struct tcphdr *th = tcp_hdr(skb); @@ -774,9 +775,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, return err; } -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) +static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) { - return __tcp_v4_send_synack(sk, req, NULL); + return __tcp_v4_send_synack(sk, NULL, req, rvp); } /* @@ -1211,13 +1213,13 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct inet_request_sock *ireq; struct tcp_options_received tmp_opt; struct request_sock *req; + struct inet_request_sock *ireq; + struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1337,7 +1339,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) + if (__tcp_v4_send_synack(sk, dst, req, NULL) || want_cookie) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -- cgit v1.2.2 From 435cf559f02ea3a3159eb316f97dc88bdebe9432 Mon Sep 17 00:00:00 2001 From: William Allen Simpson Date: Wed, 2 Dec 2009 18:17:05 +0000 Subject: TCPCT part 1d: define TCP cookie option, extend existing struct's Data structures are carefully composed to require minimal additions. For example, the struct tcp_options_received cookie_plus variable fits between existing 16-bit and 8-bit variables, requiring no additional space (taking alignment into consideration). There are no additions to tcp_request_sock, and only 1 pointer in tcp_sock. This is a significantly revised implementation of an earlier (year-old) patch that no longer applies cleanly, with permission of the original author (Adam Langley): http://thread.gmane.org/gmane.linux.network/102586 The principle difference is using a TCP option to carry the cookie nonce, instead of a user configured offset in the data. This is more flexible and less subject to user configuration error. Such a cookie option has been suggested for many years, and is also useful without SYN data, allowing several related concepts to use the same extension option. "Re: SYN floods (was: does history repeat itself?)", September 9, 1996. http://www.merit.net/mail.archives/nanog/1996-09/msg00235.html "Re: what a new TCP header might look like", May 12, 1998. ftp://ftp.isi.edu/end2end/end2end-interest-1998.mail These functions will also be used in subsequent patches that implement additional features. Requires: TCPCT part 1a: add request_values parameter for sending SYNACK TCPCT part 1b: generate Responder Cookie secret TCPCT part 1c: sysctl_tcp_cookie_size, socket option TCP_COOKIE_TRANSACTIONS Signed-off-by: William.Allen.Simpson@gmail.com Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 649a36d99c73..a2bcac9b388e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1833,6 +1833,19 @@ static int tcp_v4_init_sock(struct sock *sk) tp->af_specific = &tcp_sock_ipv4_specific; #endif + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1886,6 +1899,13 @@ void tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + /* TCP Cookie Transactions */ + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + percpu_counter_dec(&tcp_sockets_allocated); } -- cgit v1.2.2 From 4957faade11b3a278c3b3cade3411ddc20afa791 Mon Sep 17 00:00:00 2001 From: William Allen Simpson Date: Wed, 2 Dec 2009 18:25:27 +0000 Subject: TCPCT part 1g: Responder Cookie => Initiator Parse incoming TCP_COOKIE option(s). Calculate TCP_COOKIE option. Send optional data. This is a significantly revised implementation of an earlier (year-old) patch that no longer applies cleanly, with permission of the original author (Adam Langley): http://thread.gmane.org/gmane.linux.network/102586 Requires: TCPCT part 1a: add request_values parameter for sending SYNACK TCPCT part 1b: generate Responder Cookie secret TCPCT part 1c: sysctl_tcp_cookie_size, socket option TCP_COOKIE_TRANSACTIONS TCPCT part 1d: define TCP cookie option, extend existing struct's TCPCT part 1e: implement socket option TCP_COOKIE_TRANSACTIONS TCPCT part 1f: Initiator Cookie => Responder Signed-off-by: William.Allen.Simpson@gmail.com Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 47 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a2bcac9b388e..59c911f3889d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1213,9 +1213,12 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { + struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; + u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; + struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; @@ -1271,15 +1274,49 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; - tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_release; + + /* Secret recipe starts with IP addresses */ + *mess++ ^= daddr; + *mess++ ^= saddr; - tcp_parse_options(skb, &tmp_opt, 0, dst); + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; + +#ifdef CONFIG_SYN_COOKIES + want_cookie = 0; /* not our kind of cookie */ +#endif + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. */ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_release; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); if (security_inet_conn_request(sk, skb, req)) @@ -1339,7 +1376,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, dst, req, NULL) || want_cookie) + if (__tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || + want_cookie) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); -- cgit v1.2.2 From b099ce2602d806deb41caaa578731848995cdb2a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2009 02:29:09 +0000 Subject: net: Batch inet_twsk_purge This function walks the whole hashtable so there is no point in passing it a network namespace. Instead I purge all timewait sockets from dead network namespaces that I find. If the namespace is one of the once I am trying to purge I am guaranteed no new timewait sockets can be formed so this will get them all. If the namespace is one I am not acting for it might form a few more but I will call inet_twsk_purge again and shortly to get rid of them. In any even if the network namespace is dead timewait sockets are useless. Move the calls of inet_twsk_purge into batch_exit routines so that if I am killing a bunch of namespaces at once I will just call inet_twsk_purge once and save a lot of redundant unnecessary work. My simple 4k network namespace exit test the cleanup time dropped from roughly 8.2s to 1.6s. While the time spent running inet_twsk_purge fell to about 2ms. 1ms for ipv4 and 1ms for ipv6. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 59c911f3889d..fee9aabd5aa1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2529,12 +2529,17 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); +} + +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { - .init = tcp_sk_init, - .exit = tcp_sk_exit, + .init = tcp_sk_init, + .exit = tcp_sk_exit, + .exit_batch = tcp_sk_exit_batch, }; void __init tcp_v4_init(void) -- cgit v1.2.2 From 49d09007879ce7bee36ab453c73e97c00adce884 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Dec 2009 16:06:13 -0800 Subject: tcp: diag: Dont report negative values for rx queue Both netlink and /proc/net/tcp interfaces can report transient negative values for rx queue. ss -> State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB -6 6 127.0.0.1:45956 127.0.0.1:3333 netstat -> tcp 4294967290 6 127.0.0.1:37784 127.0.0.1:3333 ESTABLISHED This is because we dont lock socket while computing tp->rcv_nxt - tp->copied_seq, and another CPU can update copied_seq before rcv_next in RX path. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fee9aabd5aa1..29002ab26e0d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2318,6 +2318,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); + int rx_queue; if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; @@ -2333,12 +2334,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) timer_expires = jiffies; } + if (sk->sk_state == TCP_LISTEN) + rx_queue = sk->sk_ack_backlog; + else + /* + * because we dont lock socket, we might find a transient negative value + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, - sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : - (tp->rcv_nxt - tp->copied_seq), + rx_queue, timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, -- cgit v1.2.2 From 9327f7053e3993c125944fdb137a0618319ef2a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Dec 2009 03:46:54 +0000 Subject: tcp: Fix a connect() race with timewait sockets First patch changes __inet_hash_nolisten() and __inet6_hash() to get a timewait parameter to be able to unhash it from ehash at same time the new socket is inserted in hash. This makes sure timewait socket wont be found by a concurrent writer in __inet_check_established() Reported-by: kapil dakhane Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 29002ab26e0d..15e96030ce47 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1464,7 +1464,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif - __inet_hash_nolisten(newsk); + __inet_hash_nolisten(newsk, NULL); __inet_inherit_port(sk, newsk); return newsk; -- cgit v1.2.2 From bb5b7c11263dbbe78253cd05945a6bf8f55add8e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 15 Dec 2009 20:56:42 -0800 Subject: tcp: Revert per-route SACK/DSACK/TIMESTAMP changes. It creates a regression, triggering badness for SYN_RECV sockets, for example: [19148.022102] Badness at net/ipv4/inet_connection_sock.c:293 [19148.022570] NIP: c02a0914 LR: c02a0904 CTR: 00000000 [19148.023035] REGS: eeecbd30 TRAP: 0700 Not tainted (2.6.32) [19148.023496] MSR: 00029032 CR: 24002442 XER: 00000000 [19148.024012] TASK = eee9a820[1756] 'privoxy' THREAD: eeeca000 This is likely caused by the change in the 'estab' parameter passed to tcp_parse_options() when invoked by the functions in net/ipv4/tcp_minisocks.c But even if that is fixed, the ->conn_request() changes made in this patch series is fundamentally wrong. They try to use the listening socket's 'dst' to probe the route settings. The listening socket doesn't even have a route, and you can't get the right route (the child request one) until much later after we setup all of the state, and it must be done by hand. This stuff really isn't ready, so the best thing to do is a full revert. This reverts the following commits: f55017a93f1a74d50244b1254b9a2bd7ac9bbf7d 022c3f7d82f0f1c68018696f2f027b87b9bb45c2 1aba721eba1d84a2defce45b950272cee1e6c72a cda42ebd67ee5fdf09d7057b5a4584d36fe8a335 345cda2fd695534be5a4494f1b59da9daed33663 dc343475ed062e13fc260acccaab91d7d80fd5b2 05eaade2782fb0c90d3034fd7a7d5a16266182bb 6a2a2d6bf8581216e08be15fcb563cfd6c430e1e Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 15e96030ce47..65b8ebfd078a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1262,20 +1262,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops; #endif - ireq = inet_rsk(req); - ireq->loc_addr = daddr; - ireq->rmt_addr = saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; - ireq->opt = tcp_v4_save_options(sk, skb); - - dst = inet_csk_route_req(sk, req); - if(!dst) - goto drop_and_free; - tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.cookie_plus > 0 && tmp_opt.saw_tstamp && @@ -1319,8 +1309,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); + ireq = inet_rsk(req); + ireq->loc_addr = daddr; + ireq->rmt_addr = saddr; + ireq->no_srccheck = inet_sk(sk)->transparent; + ireq->opt = tcp_v4_save_options(sk, skb); + if (security_inet_conn_request(sk, skb, req)) - goto drop_and_release; + goto drop_and_free; if (!want_cookie) TCP_ECN_create_request(req, tcp_hdr(skb)); @@ -1345,6 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle && + (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && -- cgit v1.2.2 From d218d11133d888f9745802146a50255a4781d37a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 11 Jan 2010 16:28:01 -0800 Subject: tcp: Generalized TTL Security Mechanism This patch adds the kernel portions needed to implement RFC 5082 Generalized TTL Security Mechanism (GTSM). It is a lightweight security measure against forged packets causing DoS attacks (for BGP). This is already implemented the same way in BSD kernels. For the necessary Quagga patch http://www.gossamer-threads.com/lists/quagga/dev/17389 Description from Cisco http://www.cisco.com/en/US/docs/ios/12_3t/12_3t7/feature/guide/gt_btsh.html It does add one byte to each socket structure, but I did a little rearrangement to reuse a hole (on 64 bit), but it does grow the structure on 32 bit This should be documented on ip(4) man page and the Glibc in.h file also needs update. IPV6_MINHOPLIMIT should also be added (although BSD doesn't support that). Only TCP is supported, but could also be added to UDP, DCCP, SCTP if desired. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65b8ebfd078a..382f667238ec 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1649,6 +1649,9 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!sk) goto no_tcp_socket; + if (iph->ttl < inet_sk(sk)->min_ttl) + goto discard_and_relse; + process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; -- cgit v1.2.2 From 72659ecce68588b74f6c46862c2b4cec137d7a5a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Sun, 17 Jan 2010 19:09:39 -0800 Subject: tcp: account SYN-ACK timeouts & retransmissions Currently we don't increment SYN-ACK timeouts & retransmissions although we do increment the same stats for SYN. We seem to have lost the SYN-ACK accounting with the introduction of tcp_syn_recv_timer (commit 2248761e in the netdev-vger-cvs tree). This patch fixes this issue. In the process we also rename the v4/v6 syn/ack retransmit functions for clarity. We also add a new request_socket operations (syn_ack_timeout) so we can keep code in inet_connection_sock.c protocol agnostic. Signed-off-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 382f667238ec..356f544c4c10 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -742,9 +742,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req, - struct request_values *rvp) +static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -775,10 +775,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, return err; } -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, struct request_values *rvp) { - return __tcp_v4_send_synack(sk, NULL, req, rvp); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v4_send_synack(sk, NULL, req, rvp); } /* @@ -1192,10 +1193,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_send_synack, + .rtx_syn_ack = tcp_v4_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG @@ -1373,8 +1375,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, dst, req, - (struct request_values *)&tmp_ext) || + if (tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || want_cookie) goto drop_and_free; -- cgit v1.2.2 From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 17 Jan 2010 03:35:32 +0000 Subject: net: spread __net_init, __net_exit __net_init/__net_exit are apparently not going away, so use them to full extent. In some cases __net_init was removed, because it was called from __net_exit code. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 356f544c4c10..c3588b4fd979 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,12 +2430,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = { }, }; -static int tcp4_proc_init_net(struct net *net) +static int __net_init tcp4_proc_init_net(struct net *net) { return tcp_proc_register(net, &tcp4_seq_afinfo); } -static void tcp4_proc_exit_net(struct net *net) +static void __net_exit tcp4_proc_exit_net(struct net *net) { tcp_proc_unregister(net, &tcp4_seq_afinfo); } -- cgit v1.2.2 From 6b03a53a5ab7ccf2d5d69f96cf1c739c4d2a8fb9 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Thu, 4 Mar 2010 18:01:41 +0000 Subject: tcp: use limited socket backlog Make tcp adapt to the limited socket backlog change. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: "Pekka Savola (ipv6)" Cc: Patrick McHardy Signed-off-by: Zhu Yi Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c3588b4fd979..4baf1943b1bd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1682,8 +1682,10 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else - sk_add_backlog(sk, skb); + } else if (sk_add_backlog_limited(sk, skb)) { + bh_unlock_sock(sk); + goto discard_and_relse; + } bh_unlock_sock(sk); sock_put(sk); -- cgit v1.2.2 From a3a858ff18a72a8d388e31ab0d98f7e944841a62 Mon Sep 17 00:00:00 2001 From: Zhu Yi Date: Thu, 4 Mar 2010 18:01:47 +0000 Subject: net: backlog functions rename sk_add_backlog -> __sk_add_backlog sk_add_backlog_limited -> sk_add_backlog Signed-off-by: Zhu Yi Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4baf1943b1bd..1915f7dc30e6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1682,7 +1682,7 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else if (sk_add_backlog_limited(sk, skb)) { + } else if (sk_add_backlog(sk, skb)) { bh_unlock_sock(sk); goto discard_and_relse; } -- cgit v1.2.2 From 6cce09f87a04797fae5b947ef2626c14a78f0b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 7 Mar 2010 23:21:57 +0000 Subject: tcp: Add SNMP counters for backlog and min_ttl drops Commit 6b03a53a (tcp: use limited socket backlog) added the possibility of dropping frames when backlog queue is full. Commit d218d111 (tcp: Generalized TTL Security Mechanism) added the possibility of dropping frames when TTL is under a given limit. This patch adds new SNMP MIB entries, named TCPBacklogDrop and TCPMinTTLDrop, published in /proc/net/netstat in TcpExt: line netstat -s | egrep "TCPBacklogDrop|TCPMinTTLDrop" TCPBacklogDrop: 0 TCPMinTTLDrop: 0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1915f7dc30e6..8d51d39ad1bb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1651,8 +1651,10 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!sk) goto no_tcp_socket; - if (iph->ttl < inet_sk(sk)->min_ttl) + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; + } process: if (sk->sk_state == TCP_TIME_WAIT) @@ -1682,8 +1684,9 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else if (sk_add_backlog(sk, skb)) { + } else if (unlikely(sk_add_backlog(sk, skb))) { bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } bh_unlock_sock(sk); -- cgit v1.2.2 From bb134d5d9580fc7b945e3bca3c4b263947022966 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Mar 2010 05:55:56 +0000 Subject: tcp: Fix tcp_v4_rcv() Commit d218d111 (tcp: Generalized TTL Security Mechanism) added a bug for TIMEWAIT sockets. We should not test min_ttl for TW sockets. Reported-by: Tetsuo Handa Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8d51d39ad1bb..70df40980a87 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1651,15 +1651,15 @@ int tcp_v4_rcv(struct sk_buff *skb) if (!sk) goto no_tcp_socket; +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } -process: - if (sk->sk_state == TCP_TIME_WAIT) - goto do_time_wait; - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; nf_reset(skb); -- cgit v1.2.2 From 97e3ecd112ba45eb217cddab59f48659bc15d9d0 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 18 Mar 2010 11:27:32 +0000 Subject: TCP: check min TTL on received ICMP packets This adds RFC5082 checks for TTL on received ICMP packets. It adds some security against spoofed ICMP packets disrupting GTSM protected sessions. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 70df40980a87..f4df5f931f36 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -370,6 +370,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (sk->sk_state == TCP_CLOSE) goto out; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); -- cgit v1.2.2 From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Mar 2010 17:04:11 +0900 Subject: include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo Guess-its-ok-by: Christoph Lameter Cc: Ingo Molnar Cc: Lee Schermerhorn --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f4df5f931f36..3c23e70885f4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2