diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 142 |
1 files changed, 120 insertions, 22 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d86784be7ab3..f240f57b2199 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -62,6 +62,7 @@ | |||
62 | */ | 62 | */ |
63 | 63 | ||
64 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
65 | #include <linux/slab.h> | ||
65 | #include <linux/module.h> | 66 | #include <linux/module.h> |
66 | #include <linux/sysctl.h> | 67 | #include <linux/sysctl.h> |
67 | #include <linux/kernel.h> | 68 | #include <linux/kernel.h> |
@@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2; | |||
89 | int sysctl_tcp_frto_response __read_mostly; | 90 | int sysctl_tcp_frto_response __read_mostly; |
90 | int sysctl_tcp_nometrics_save __read_mostly; | 91 | int sysctl_tcp_nometrics_save __read_mostly; |
91 | 92 | ||
93 | int sysctl_tcp_thin_dupack __read_mostly; | ||
94 | |||
92 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 95 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
93 | int sysctl_tcp_abc __read_mostly; | 96 | int sysctl_tcp_abc __read_mostly; |
94 | 97 | ||
@@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) | |||
140 | * "len" is invariant segment length, including TCP header. | 143 | * "len" is invariant segment length, including TCP header. |
141 | */ | 144 | */ |
142 | len += skb->data - skb_transport_header(skb); | 145 | len += skb->data - skb_transport_header(skb); |
143 | if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || | 146 | if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || |
144 | /* If PSH is not set, packet should be | 147 | /* If PSH is not set, packet should be |
145 | * full sized, provided peer TCP is not badly broken. | 148 | * full sized, provided peer TCP is not badly broken. |
146 | * This observation (if it is correct 8)) allows | 149 | * This observation (if it is correct 8)) allows |
@@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) | |||
411 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); | 414 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); |
412 | 415 | ||
413 | hint = min(hint, tp->rcv_wnd / 2); | 416 | hint = min(hint, tp->rcv_wnd / 2); |
414 | hint = min(hint, TCP_MIN_RCVMSS); | 417 | hint = min(hint, TCP_MSS_DEFAULT); |
415 | hint = max(hint, TCP_MIN_MSS); | 418 | hint = max(hint, TCP_MIN_MSS); |
416 | 419 | ||
417 | inet_csk(sk)->icsk_ack.rcv_mss = hint; | 420 | inet_csk(sk)->icsk_ack.rcv_mss = hint; |
@@ -2300,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) | |||
2300 | * they differ. Since neither occurs due to loss, TCP should really | 2303 | * they differ. Since neither occurs due to loss, TCP should really |
2301 | * ignore them. | 2304 | * ignore them. |
2302 | */ | 2305 | */ |
2303 | static inline int tcp_dupack_heurestics(struct tcp_sock *tp) | 2306 | static inline int tcp_dupack_heuristics(struct tcp_sock *tp) |
2304 | { | 2307 | { |
2305 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; | 2308 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; |
2306 | } | 2309 | } |
@@ -2425,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk) | |||
2425 | return 1; | 2428 | return 1; |
2426 | 2429 | ||
2427 | /* Not-A-Trick#2 : Classic rule... */ | 2430 | /* Not-A-Trick#2 : Classic rule... */ |
2428 | if (tcp_dupack_heurestics(tp) > tp->reordering) | 2431 | if (tcp_dupack_heuristics(tp) > tp->reordering) |
2429 | return 1; | 2432 | return 1; |
2430 | 2433 | ||
2431 | /* Trick#3 : when we use RFC2988 timer restart, fast | 2434 | /* Trick#3 : when we use RFC2988 timer restart, fast |
@@ -2447,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk) | |||
2447 | return 1; | 2450 | return 1; |
2448 | } | 2451 | } |
2449 | 2452 | ||
2453 | /* If a thin stream is detected, retransmit after first | ||
2454 | * received dupack. Employ only if SACK is supported in order | ||
2455 | * to avoid possible corner-case series of spurious retransmissions. | ||
2456 | * Use only if there are no unsent data. | ||
2457 | */ | ||
2458 | if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && | ||
2459 | tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && | ||
2460 | tcp_is_sack(tp) && !tcp_send_head(sk)) | ||
2461 | return 1; | ||
2462 | |||
2450 | return 0; | 2463 | return 0; |
2451 | } | 2464 | } |
2452 | 2465 | ||
@@ -2499,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
2499 | int err; | 2512 | int err; |
2500 | unsigned int mss; | 2513 | unsigned int mss; |
2501 | 2514 | ||
2515 | if (packets == 0) | ||
2516 | return; | ||
2517 | |||
2502 | WARN_ON(packets > tp->packets_out); | 2518 | WARN_ON(packets > tp->packets_out); |
2503 | if (tp->lost_skb_hint) { | 2519 | if (tp->lost_skb_hint) { |
2504 | skb = tp->lost_skb_hint; | 2520 | skb = tp->lost_skb_hint; |
@@ -2717,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk) | |||
2717 | } | 2733 | } |
2718 | } | 2734 | } |
2719 | 2735 | ||
2736 | /* We can clear retrans_stamp when there are no retransmissions in the | ||
2737 | * window. It would seem that it is trivially available for us in | ||
2738 | * tp->retrans_out, however, that kind of assumption doesn't consider | ||
2739 | * what will happen if errors occur when sending retransmission for the | ||
2740 | * second time. ...It could be that such segment has only | ||
2741 | * TCPCB_EVER_RETRANS set at the present time. It seems that checking | ||
2742 | * the head skb is enough except for some reneging corner cases that | ||
2743 | * are not worth the effort. | ||
2744 | * | ||
2745 | * Main reason for all this complexity is the fact that connection dying | ||
2746 | * time now depends on the validity of the retrans_stamp, in particular, | ||
2747 | * that successive retransmissions of a segment must not advance | ||
2748 | * retrans_stamp under any conditions. | ||
2749 | */ | ||
2750 | static int tcp_any_retrans_done(struct sock *sk) | ||
2751 | { | ||
2752 | struct tcp_sock *tp = tcp_sk(sk); | ||
2753 | struct sk_buff *skb; | ||
2754 | |||
2755 | if (tp->retrans_out) | ||
2756 | return 1; | ||
2757 | |||
2758 | skb = tcp_write_queue_head(sk); | ||
2759 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | ||
2760 | return 1; | ||
2761 | |||
2762 | return 0; | ||
2763 | } | ||
2764 | |||
2720 | /* Undo during fast recovery after partial ACK. */ | 2765 | /* Undo during fast recovery after partial ACK. */ |
2721 | 2766 | ||
2722 | static int tcp_try_undo_partial(struct sock *sk, int acked) | 2767 | static int tcp_try_undo_partial(struct sock *sk, int acked) |
@@ -2729,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
2729 | /* Plain luck! Hole is filled with delayed | 2774 | /* Plain luck! Hole is filled with delayed |
2730 | * packet, rather than with a retransmit. | 2775 | * packet, rather than with a retransmit. |
2731 | */ | 2776 | */ |
2732 | if (tp->retrans_out == 0) | 2777 | if (!tcp_any_retrans_done(sk)) |
2733 | tp->retrans_stamp = 0; | 2778 | tp->retrans_stamp = 0; |
2734 | 2779 | ||
2735 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); | 2780 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); |
@@ -2788,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk) | |||
2788 | struct tcp_sock *tp = tcp_sk(sk); | 2833 | struct tcp_sock *tp = tcp_sk(sk); |
2789 | int state = TCP_CA_Open; | 2834 | int state = TCP_CA_Open; |
2790 | 2835 | ||
2791 | if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) | 2836 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) |
2792 | state = TCP_CA_Disorder; | 2837 | state = TCP_CA_Disorder; |
2793 | 2838 | ||
2794 | if (inet_csk(sk)->icsk_ca_state != state) { | 2839 | if (inet_csk(sk)->icsk_ca_state != state) { |
@@ -2803,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) | |||
2803 | 2848 | ||
2804 | tcp_verify_left_out(tp); | 2849 | tcp_verify_left_out(tp); |
2805 | 2850 | ||
2806 | if (!tp->frto_counter && tp->retrans_out == 0) | 2851 | if (!tp->frto_counter && !tcp_any_retrans_done(sk)) |
2807 | tp->retrans_stamp = 0; | 2852 | tp->retrans_stamp = 0; |
2808 | 2853 | ||
2809 | if (flag & FLAG_ECE) | 2854 | if (flag & FLAG_ECE) |
@@ -3698,7 +3743,7 @@ old_ack: | |||
3698 | * the fast version below fails. | 3743 | * the fast version below fails. |
3699 | */ | 3744 | */ |
3700 | void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, | 3745 | void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, |
3701 | int estab) | 3746 | u8 **hvpp, int estab) |
3702 | { | 3747 | { |
3703 | unsigned char *ptr; | 3748 | unsigned char *ptr; |
3704 | struct tcphdr *th = tcp_hdr(skb); | 3749 | struct tcphdr *th = tcp_hdr(skb); |
@@ -3782,7 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, | |||
3782 | */ | 3827 | */ |
3783 | break; | 3828 | break; |
3784 | #endif | 3829 | #endif |
3785 | } | 3830 | case TCPOPT_COOKIE: |
3831 | /* This option is variable length. | ||
3832 | */ | ||
3833 | switch (opsize) { | ||
3834 | case TCPOLEN_COOKIE_BASE: | ||
3835 | /* not yet implemented */ | ||
3836 | break; | ||
3837 | case TCPOLEN_COOKIE_PAIR: | ||
3838 | /* not yet implemented */ | ||
3839 | break; | ||
3840 | case TCPOLEN_COOKIE_MIN+0: | ||
3841 | case TCPOLEN_COOKIE_MIN+2: | ||
3842 | case TCPOLEN_COOKIE_MIN+4: | ||
3843 | case TCPOLEN_COOKIE_MIN+6: | ||
3844 | case TCPOLEN_COOKIE_MAX: | ||
3845 | /* 16-bit multiple */ | ||
3846 | opt_rx->cookie_plus = opsize; | ||
3847 | *hvpp = ptr; | ||
3848 | default: | ||
3849 | /* ignore option */ | ||
3850 | break; | ||
3851 | }; | ||
3852 | break; | ||
3853 | }; | ||
3786 | 3854 | ||
3787 | ptr += opsize-2; | 3855 | ptr += opsize-2; |
3788 | length -= opsize; | 3856 | length -= opsize; |
@@ -3810,17 +3878,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) | |||
3810 | * If it is wrong it falls back on tcp_parse_options(). | 3878 | * If it is wrong it falls back on tcp_parse_options(). |
3811 | */ | 3879 | */ |
3812 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, | 3880 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, |
3813 | struct tcp_sock *tp) | 3881 | struct tcp_sock *tp, u8 **hvpp) |
3814 | { | 3882 | { |
3815 | if (th->doff == sizeof(struct tcphdr) >> 2) { | 3883 | /* In the spirit of fast parsing, compare doff directly to constant |
3884 | * values. Because equality is used, short doff can be ignored here. | ||
3885 | */ | ||
3886 | if (th->doff == (sizeof(*th) / 4)) { | ||
3816 | tp->rx_opt.saw_tstamp = 0; | 3887 | tp->rx_opt.saw_tstamp = 0; |
3817 | return 0; | 3888 | return 0; |
3818 | } else if (tp->rx_opt.tstamp_ok && | 3889 | } else if (tp->rx_opt.tstamp_ok && |
3819 | th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { | 3890 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { |
3820 | if (tcp_parse_aligned_timestamp(tp, th)) | 3891 | if (tcp_parse_aligned_timestamp(tp, th)) |
3821 | return 1; | 3892 | return 1; |
3822 | } | 3893 | } |
3823 | tcp_parse_options(skb, &tp->rx_opt, 1); | 3894 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); |
3824 | return 1; | 3895 | return 1; |
3825 | } | 3896 | } |
3826 | 3897 | ||
@@ -4845,11 +4916,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
4845 | struct tcp_sock *tp = tcp_sk(sk); | 4916 | struct tcp_sock *tp = tcp_sk(sk); |
4846 | 4917 | ||
4847 | /* More than one full frame received... */ | 4918 | /* More than one full frame received... */ |
4848 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss | 4919 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && |
4849 | /* ... and right edge of window advances far enough. | 4920 | /* ... and right edge of window advances far enough. |
4850 | * (tcp_recvmsg() will send ACK otherwise). Or... | 4921 | * (tcp_recvmsg() will send ACK otherwise). Or... |
4851 | */ | 4922 | */ |
4852 | && __tcp_select_window(sk) >= tp->rcv_wnd) || | 4923 | __tcp_select_window(sk) >= tp->rcv_wnd) || |
4853 | /* We ACK each frame or... */ | 4924 | /* We ACK each frame or... */ |
4854 | tcp_in_quickack_mode(sk) || | 4925 | tcp_in_quickack_mode(sk) || |
4855 | /* We have out of order data. */ | 4926 | /* We have out of order data. */ |
@@ -5070,10 +5141,12 @@ out: | |||
5070 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | 5141 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
5071 | struct tcphdr *th, int syn_inerr) | 5142 | struct tcphdr *th, int syn_inerr) |
5072 | { | 5143 | { |
5144 | u8 *hash_location; | ||
5073 | struct tcp_sock *tp = tcp_sk(sk); | 5145 | struct tcp_sock *tp = tcp_sk(sk); |
5074 | 5146 | ||
5075 | /* RFC1323: H1. Apply PAWS check first. */ | 5147 | /* RFC1323: H1. Apply PAWS check first. */ |
5076 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 5148 | if (tcp_fast_parse_options(skb, th, tp, &hash_location) && |
5149 | tp->rx_opt.saw_tstamp && | ||
5077 | tcp_paws_discard(sk, skb)) { | 5150 | tcp_paws_discard(sk, skb)) { |
5078 | if (!th->rst) { | 5151 | if (!th->rst) { |
5079 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 5152 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
@@ -5361,11 +5434,13 @@ discard: | |||
5361 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5434 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
5362 | struct tcphdr *th, unsigned len) | 5435 | struct tcphdr *th, unsigned len) |
5363 | { | 5436 | { |
5364 | struct tcp_sock *tp = tcp_sk(sk); | 5437 | u8 *hash_location; |
5365 | struct inet_connection_sock *icsk = inet_csk(sk); | 5438 | struct inet_connection_sock *icsk = inet_csk(sk); |
5439 | struct tcp_sock *tp = tcp_sk(sk); | ||
5440 | struct tcp_cookie_values *cvp = tp->cookie_values; | ||
5366 | int saved_clamp = tp->rx_opt.mss_clamp; | 5441 | int saved_clamp = tp->rx_opt.mss_clamp; |
5367 | 5442 | ||
5368 | tcp_parse_options(skb, &tp->rx_opt, 0); | 5443 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); |
5369 | 5444 | ||
5370 | if (th->ack) { | 5445 | if (th->ack) { |
5371 | /* rfc793: | 5446 | /* rfc793: |
@@ -5462,6 +5537,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
5462 | * Change state from SYN-SENT only after copied_seq | 5537 | * Change state from SYN-SENT only after copied_seq |
5463 | * is initialized. */ | 5538 | * is initialized. */ |
5464 | tp->copied_seq = tp->rcv_nxt; | 5539 | tp->copied_seq = tp->rcv_nxt; |
5540 | |||
5541 | if (cvp != NULL && | ||
5542 | cvp->cookie_pair_size > 0 && | ||
5543 | tp->rx_opt.cookie_plus > 0) { | ||
5544 | int cookie_size = tp->rx_opt.cookie_plus | ||
5545 | - TCPOLEN_COOKIE_BASE; | ||
5546 | int cookie_pair_size = cookie_size | ||
5547 | + cvp->cookie_desired; | ||
5548 | |||
5549 | /* A cookie extension option was sent and returned. | ||
5550 | * Note that each incoming SYNACK replaces the | ||
5551 | * Responder cookie. The initial exchange is most | ||
5552 | * fragile, as protection against spoofing relies | ||
5553 | * entirely upon the sequence and timestamp (above). | ||
5554 | * This replacement strategy allows the correct pair to | ||
5555 | * pass through, while any others will be filtered via | ||
5556 | * Responder verification later. | ||
5557 | */ | ||
5558 | if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { | ||
5559 | memcpy(&cvp->cookie_pair[cvp->cookie_desired], | ||
5560 | hash_location, cookie_size); | ||
5561 | cvp->cookie_pair_size = cookie_pair_size; | ||
5562 | } | ||
5563 | } | ||
5564 | |||
5465 | smp_mb(); | 5565 | smp_mb(); |
5466 | tcp_set_state(sk, TCP_ESTABLISHED); | 5566 | tcp_set_state(sk, TCP_ESTABLISHED); |
5467 | 5567 | ||
@@ -5699,11 +5799,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
5699 | 5799 | ||
5700 | /* tcp_ack considers this ACK as duplicate | 5800 | /* tcp_ack considers this ACK as duplicate |
5701 | * and does not calculate rtt. | 5801 | * and does not calculate rtt. |
5702 | * Fix it at least with timestamps. | 5802 | * Force it here. |
5703 | */ | 5803 | */ |
5704 | if (tp->rx_opt.saw_tstamp && | 5804 | tcp_ack_update_rtt(sk, 0, 0); |
5705 | tp->rx_opt.rcv_tsecr && !tp->srtt) | ||
5706 | tcp_ack_saw_tstamp(sk, 0); | ||
5707 | 5805 | ||
5708 | if (tp->rx_opt.tstamp_ok) | 5806 | if (tp->rx_opt.tstamp_ok) |
5709 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5807 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |