path: root/net/ipv4/tcp_input.c
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c  | 142
1 file changed, 120 insertions(+), 22 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784be7ab3..f240f57b2199 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/kernel.h>
@@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_frto_response __read_mostly;
 int sysctl_tcp_nometrics_save __read_mostly;
 
+int sysctl_tcp_thin_dupack __read_mostly;
+
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
 
@@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
	 * "len" is invariant segment length, including TCP header.
	 */
	len += skb->data - skb_transport_header(skb);
-	if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
	    /* If PSH is not set, packet should be
	     * full sized, provided peer TCP is not badly broken.
	     * This observation (if it is correct 8)) allows
@@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
 	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
 
 	hint = min(hint, tp->rcv_wnd / 2);
-	hint = min(hint, TCP_MIN_RCVMSS);
+	hint = min(hint, TCP_MSS_DEFAULT);
 	hint = max(hint, TCP_MIN_MSS);
 
 	inet_csk(sk)->icsk_ack.rcv_mss = hint;
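For reference, TCP_MIN_RCVMSS and its replacement TCP_MSS_DEFAULT both appear to name the conservative 536-byte default MSS. The arithmetic behind that figure is sketched below with assumed header sizes; this is an illustration, not a quote of include/net/tcp.h, and the identifiers are placeholders.

/* Illustrative derivation of the default MSS; names are placeholders,
 * only the arithmetic is the point.
 */
enum {
	MIN_IPV4_DATAGRAM = 576,  /* every IPv4 host must accept this much */
	IPV4_HDR_LEN      = 20,   /* IP header without options             */
	TCP_HDR_LEN       = 20,   /* TCP header without options            */
	DEFAULT_MSS       = MIN_IPV4_DATAGRAM - IPV4_HDR_LEN - TCP_HDR_LEN, /* 536 */
};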
@@ -2300,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
  * they differ. Since neither occurs due to loss, TCP should really
  * ignore them.
  */
-static inline int tcp_dupack_heurestics(struct tcp_sock *tp)
+static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
 {
 	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
 }
@@ -2425,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk)
 		return 1;
 
 	/* Not-A-Trick#2 : Classic rule... */
-	if (tcp_dupack_heurestics(tp) > tp->reordering)
+	if (tcp_dupack_heuristics(tp) > tp->reordering)
 		return 1;
 
 	/* Trick#3 : when we use RFC2988 timer restart, fast
@@ -2447,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk)
 		return 1;
 	}
 
+	/* If a thin stream is detected, retransmit after the first
+	 * received dupack.  Employ only if SACK is supported, in order
+	 * to avoid a possible corner-case series of spurious retransmissions.
+	 * Use only if there is no unsent data.
+	 */
+	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+	    tcp_is_sack(tp) && !tcp_send_head(sk))
+		return 1;
+
 	return 0;
 }
 
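A rough user-space model of the new rule, for illustration only: the function name, the packets-in-flight threshold and the argument list below are assumptions standing in for tcp_stream_is_thin() and friends, chosen to show why a stream with very few segments in flight can never reach the usual three-dupack fast retransmit.

#include <stdbool.h>

/* Sketch of the thin-stream dupack rule above; not kernel code. */
static bool thin_stream_should_retransmit(int packets_in_flight,
					  int dupacks_seen,
					  bool sack_enabled,
					  bool has_unsent_data,
					  bool thin_dupack_enabled)
{
	/* With fewer than four segments in flight, three dupacks can
	 * never arrive, so fast retransmit would otherwise stall until
	 * the retransmission timeout fires.
	 */
	bool is_thin = packets_in_flight > 0 && packets_in_flight < 4;

	return thin_dupack_enabled && is_thin && dupacks_seen >= 1 &&
	       sack_enabled && !has_unsent_data;
}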
@@ -2499,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 	int err;
 	unsigned int mss;
 
+	if (packets == 0)
+		return;
+
 	WARN_ON(packets > tp->packets_out);
 	if (tp->lost_skb_hint) {
 		skb = tp->lost_skb_hint;
@@ -2717,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk)
 	}
 }
 
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window.  It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumption doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time.  ...It could be that such a segment has only
+ * TCPCB_EVER_RETRANS set at the present time.  It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * The main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static int tcp_any_retrans_done(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (tp->retrans_out)
+		return 1;
+
+	skb = tcp_write_queue_head(sk);
+	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+		return 1;
+
+	return 0;
+}
+
 /* Undo during fast recovery after partial ACK. */
 
 static int tcp_try_undo_partial(struct sock *sk, int acked)
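Read as a rule, tcp_any_retrans_done() reports outstanding retransmissions if either the retrans_out counter is non-zero or the head of the write queue still carries the ever-retransmitted mark. A stand-alone model of that check is sketched below; the struct and field names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stddef.h>

/* Illustrative model only; mirrors the rule of tcp_any_retrans_done(). */
struct segment_model {
	bool ever_retransmitted;  /* plays the role of TCPCB_EVER_RETRANS */
};

static bool any_retrans_done_model(unsigned int retrans_out,
				   const struct segment_model *write_queue_head)
{
	if (retrans_out)
		return true;

	/* A failed second retransmission can leave the counter at zero
	 * while the head segment is still marked as ever retransmitted.
	 */
	if (write_queue_head && write_queue_head->ever_retransmitted)
		return true;

	return false;
}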
@@ -2729,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
 		/* Plain luck! Hole is filled with a delayed
 		 * packet, rather than with a retransmit.
 		 */
-		if (tp->retrans_out == 0)
+		if (!tcp_any_retrans_done(sk))
 			tp->retrans_stamp = 0;
 
 		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
@@ -2788,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	int state = TCP_CA_Open;
 
-	if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+	if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
 		state = TCP_CA_Disorder;
 
 	if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2803,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 
 	tcp_verify_left_out(tp);
 
-	if (!tp->frto_counter && tp->retrans_out == 0)
+	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
@@ -3698,7 +3743,7 @@ old_ack:
  * the fast version below fails.
  */
 void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
-		       int estab)
+		       u8 **hvpp, int estab)
 {
 	unsigned char *ptr;
 	struct tcphdr *th = tcp_hdr(skb);
@@ -3782,7 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 			 */
 			break;
 #endif
-		}
+		case TCPOPT_COOKIE:
+			/* This option is variable length.
+			 */
+			switch (opsize) {
+			case TCPOLEN_COOKIE_BASE:
+				/* not yet implemented */
+				break;
+			case TCPOLEN_COOKIE_PAIR:
+				/* not yet implemented */
+				break;
+			case TCPOLEN_COOKIE_MIN+0:
+			case TCPOLEN_COOKIE_MIN+2:
+			case TCPOLEN_COOKIE_MIN+4:
+			case TCPOLEN_COOKIE_MIN+6:
+			case TCPOLEN_COOKIE_MAX:
+				/* 16-bit multiple */
+				opt_rx->cookie_plus = opsize;
+				*hvpp = ptr;
+			default:
+				/* ignore option */
+				break;
+			};
+			break;
+		};
 
 		ptr += opsize-2;
 		length -= opsize;
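The new case slots into the standard kind/length option walk: opsize is the whole option length, cookie_plus records it, and hvpp is pointed at the bytes just past the two-byte kind/length header. A self-contained sketch of that walk follows; the option number and the helper name are assumptions for illustration, not the constants from net/tcp.h.

#include <stdint.h>
#include <stddef.h>

/* Placeholder values for illustration only. */
#define OPT_EOL     0
#define OPT_NOP     1
#define OPT_COOKIE  253          /* assumed experimental option kind */

/* Walk a TCP options block and report where a variable-length
 * cookie-style option starts and how long it is, mirroring the
 * kind/length loop that tcp_parse_options() uses.
 */
static const uint8_t *find_cookie_opt(const uint8_t *ptr, int length,
				      int *opsize_out)
{
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == OPT_EOL)
			break;
		if (opcode == OPT_NOP) {
			length--;
			continue;
		}
		if (length < 2)
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;          /* malformed option, stop parsing */
		if (opcode == OPT_COOKIE) {
			*opsize_out = opsize;   /* full option length      */
			return ptr;             /* payload after kind/len  */
		}
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}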
@@ -3810,17 +3878,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
  * If it is wrong it falls back on tcp_parse_options().
  */
 static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
-				  struct tcp_sock *tp)
+				  struct tcp_sock *tp, u8 **hvpp)
 {
-	if (th->doff == sizeof(struct tcphdr) >> 2) {
+	/* In the spirit of fast parsing, compare doff directly to constant
+	 * values.  Because equality is used, short doff can be ignored here.
+	 */
+	if (th->doff == (sizeof(*th) / 4)) {
 		tp->rx_opt.saw_tstamp = 0;
 		return 0;
 	} else if (tp->rx_opt.tstamp_ok &&
-		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
 		if (tcp_parse_aligned_timestamp(tp, th))
 			return 1;
 	}
-	tcp_parse_options(skb, &tp->rx_opt, 1);
+	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
 	return 1;
 }
 
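The rewritten comparisons are just arithmetic on th->doff, which counts 32-bit words: a bare TCP header is 20/4 = 5 words, and a header carrying only the aligned timestamp option is (20 + 12)/4 = 8 words. A tiny stand-alone check of those two figures, using a 20-byte stand-in struct rather than the real struct tcphdr:

#include <assert.h>
#include <stdint.h>

/* Stand-in for the wire header layout; 20 bytes like struct tcphdr. */
struct tcp_hdr_model {
	uint16_t source, dest;
	uint32_t seq, ack_seq;
	uint16_t flags_doff, window;
	uint16_t check, urg_ptr;
};

#define TSTAMP_ALIGNED 12   /* NOP, NOP, kind, len + two 32-bit timestamps */

int main(void)
{
	/* doff for a bare header and for header + aligned timestamps */
	assert(sizeof(struct tcp_hdr_model) / 4 == 5);
	assert((sizeof(struct tcp_hdr_model) + TSTAMP_ALIGNED) / 4 == 8);
	return 0;
}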
@@ -4845,11 +4916,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	    /* More than one full frame received... */
-	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
+	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise). Or...
 	      */
-	     && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* We have out of order data. */
@@ -5070,10 +5141,12 @@ out:
 static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 				 struct tcphdr *th, int syn_inerr)
 {
+	u8 *hash_location;
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* RFC1323: H1. Apply PAWS check first. */
-	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+	if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
+	    tp->rx_opt.saw_tstamp &&
 	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5361,11 +5434,13 @@ discard:
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	u8 *hash_location;
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_cookie_values *cvp = tp->cookie_values;
 	int saved_clamp = tp->rx_opt.mss_clamp;
 
-	tcp_parse_options(skb, &tp->rx_opt, 0);
+	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
 
 	if (th->ack) {
 		/* rfc793:
@@ -5462,6 +5537,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * Change state from SYN-SENT only after copied_seq
 		 * is initialized. */
 		tp->copied_seq = tp->rcv_nxt;
+
+		if (cvp != NULL &&
+		    cvp->cookie_pair_size > 0 &&
+		    tp->rx_opt.cookie_plus > 0) {
+			int cookie_size = tp->rx_opt.cookie_plus
+					- TCPOLEN_COOKIE_BASE;
+			int cookie_pair_size = cookie_size
+					     + cvp->cookie_desired;
+
+			/* A cookie extension option was sent and returned.
+			 * Note that each incoming SYNACK replaces the
+			 * Responder cookie.  The initial exchange is most
+			 * fragile, as protection against spoofing relies
+			 * entirely upon the sequence and timestamp (above).
+			 * This replacement strategy allows the correct pair to
+			 * pass through, while any others will be filtered via
+			 * Responder verification later.
+			 */
+			if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
+				memcpy(&cvp->cookie_pair[cvp->cookie_desired],
+				       hash_location, cookie_size);
+				cvp->cookie_pair_size = cookie_pair_size;
+			}
+		}
+
 		smp_mb();
 		tcp_set_state(sk, TCP_ESTABLISHED);
 
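The copy above appends the responder's cookie after the locally desired bytes, guarded by a size check so the assembled pair can never overrun the storage in tcp_cookie_values. A stand-alone model of that bounds-checked append is sketched below; the buffer size and field names are illustrative assumptions.

#include <string.h>
#include <stdbool.h>
#include <stddef.h>

struct cookie_pair_model {
	unsigned char pair[2 * 16];   /* illustrative capacity          */
	int desired;                  /* bytes of our own cookie kept   */
	int pair_size;                /* valid bytes in pair[]          */
};

/* Append the peer's cookie after ours only if the combined pair fits. */
static bool store_cookie_pair(struct cookie_pair_model *cvp,
			      const unsigned char *peer_cookie,
			      int peer_len)
{
	int pair_size = cvp->desired + peer_len;

	if (peer_len <= 0 || (size_t)pair_size > sizeof(cvp->pair))
		return false;

	memcpy(&cvp->pair[cvp->desired], peer_cookie, peer_len);
	cvp->pair_size = pair_size;
	return true;
}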
@@ -5699,11 +5799,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 			/* tcp_ack considers this ACK as duplicate
 			 * and does not calculate rtt.
-			 * Fix it at least with timestamps.
+			 * Force it here.
 			 */
-			if (tp->rx_opt.saw_tstamp &&
-			    tp->rx_opt.rcv_tsecr && !tp->srtt)
-				tcp_ack_saw_tstamp(sk, 0);
+			tcp_ack_update_rtt(sk, 0, 0);
 
 			if (tp->rx_opt.tstamp_ok)
 				tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;