diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
| -rw-r--r-- | net/ipv4/tcp_input.c | 142 |
1 files changed, 120 insertions, 22 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d86784be7ab3..f240f57b2199 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -62,6 +62,7 @@ | |||
| 62 | */ | 62 | */ |
| 63 | 63 | ||
| 64 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
| 65 | #include <linux/slab.h> | ||
| 65 | #include <linux/module.h> | 66 | #include <linux/module.h> |
| 66 | #include <linux/sysctl.h> | 67 | #include <linux/sysctl.h> |
| 67 | #include <linux/kernel.h> | 68 | #include <linux/kernel.h> |
| @@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2; | |||
| 89 | int sysctl_tcp_frto_response __read_mostly; | 90 | int sysctl_tcp_frto_response __read_mostly; |
| 90 | int sysctl_tcp_nometrics_save __read_mostly; | 91 | int sysctl_tcp_nometrics_save __read_mostly; |
| 91 | 92 | ||
| 93 | int sysctl_tcp_thin_dupack __read_mostly; | ||
| 94 | |||
| 92 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; | 95 | int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; |
| 93 | int sysctl_tcp_abc __read_mostly; | 96 | int sysctl_tcp_abc __read_mostly; |
| 94 | 97 | ||
| @@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) | |||
| 140 | * "len" is invariant segment length, including TCP header. | 143 | * "len" is invariant segment length, including TCP header. |
| 141 | */ | 144 | */ |
| 142 | len += skb->data - skb_transport_header(skb); | 145 | len += skb->data - skb_transport_header(skb); |
| 143 | if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || | 146 | if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || |
| 144 | /* If PSH is not set, packet should be | 147 | /* If PSH is not set, packet should be |
| 145 | * full sized, provided peer TCP is not badly broken. | 148 | * full sized, provided peer TCP is not badly broken. |
| 146 | * This observation (if it is correct 8)) allows | 149 | * This observation (if it is correct 8)) allows |
| @@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) | |||
| 411 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); | 414 | unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); |
| 412 | 415 | ||
| 413 | hint = min(hint, tp->rcv_wnd / 2); | 416 | hint = min(hint, tp->rcv_wnd / 2); |
| 414 | hint = min(hint, TCP_MIN_RCVMSS); | 417 | hint = min(hint, TCP_MSS_DEFAULT); |
| 415 | hint = max(hint, TCP_MIN_MSS); | 418 | hint = max(hint, TCP_MIN_MSS); |
| 416 | 419 | ||
| 417 | inet_csk(sk)->icsk_ack.rcv_mss = hint; | 420 | inet_csk(sk)->icsk_ack.rcv_mss = hint; |
| @@ -2300,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) | |||
| 2300 | * they differ. Since neither occurs due to loss, TCP should really | 2303 | * they differ. Since neither occurs due to loss, TCP should really |
| 2301 | * ignore them. | 2304 | * ignore them. |
| 2302 | */ | 2305 | */ |
| 2303 | static inline int tcp_dupack_heurestics(struct tcp_sock *tp) | 2306 | static inline int tcp_dupack_heuristics(struct tcp_sock *tp) |
| 2304 | { | 2307 | { |
| 2305 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; | 2308 | return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; |
| 2306 | } | 2309 | } |
| @@ -2425,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk) | |||
| 2425 | return 1; | 2428 | return 1; |
| 2426 | 2429 | ||
| 2427 | /* Not-A-Trick#2 : Classic rule... */ | 2430 | /* Not-A-Trick#2 : Classic rule... */ |
| 2428 | if (tcp_dupack_heurestics(tp) > tp->reordering) | 2431 | if (tcp_dupack_heuristics(tp) > tp->reordering) |
| 2429 | return 1; | 2432 | return 1; |
| 2430 | 2433 | ||
| 2431 | /* Trick#3 : when we use RFC2988 timer restart, fast | 2434 | /* Trick#3 : when we use RFC2988 timer restart, fast |
| @@ -2447,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk) | |||
| 2447 | return 1; | 2450 | return 1; |
| 2448 | } | 2451 | } |
| 2449 | 2452 | ||
| 2453 | /* If a thin stream is detected, retransmit after first | ||
| 2454 | * received dupack. Employ only if SACK is supported in order | ||
| 2455 | * to avoid possible corner-case series of spurious retransmissions. | ||
| 2456 | * Use only if there are no unsent data. | ||
| 2457 | */ | ||
| 2458 | if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && | ||
| 2459 | tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && | ||
| 2460 | tcp_is_sack(tp) && !tcp_send_head(sk)) | ||
| 2461 | return 1; | ||
| 2462 | |||
| 2450 | return 0; | 2463 | return 0; |
| 2451 | } | 2464 | } |
| 2452 | 2465 | ||
| @@ -2499,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) | |||
| 2499 | int err; | 2512 | int err; |
| 2500 | unsigned int mss; | 2513 | unsigned int mss; |
| 2501 | 2514 | ||
| 2515 | if (packets == 0) | ||
| 2516 | return; | ||
| 2517 | |||
| 2502 | WARN_ON(packets > tp->packets_out); | 2518 | WARN_ON(packets > tp->packets_out); |
| 2503 | if (tp->lost_skb_hint) { | 2519 | if (tp->lost_skb_hint) { |
| 2504 | skb = tp->lost_skb_hint; | 2520 | skb = tp->lost_skb_hint; |
| @@ -2717,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk) | |||
| 2717 | } | 2733 | } |
| 2718 | } | 2734 | } |
| 2719 | 2735 | ||
| 2736 | /* We can clear retrans_stamp when there are no retransmissions in the | ||
| 2737 | * window. It would seem that it is trivially available for us in | ||
| 2738 | * tp->retrans_out, however, that kind of assumption doesn't consider | ||
| 2739 | * what will happen if errors occur when sending retransmission for the | ||
| 2740 | * second time. ...It could be that such segment has only | ||
| 2741 | * TCPCB_EVER_RETRANS set at the present time. It seems that checking | ||
| 2742 | * the head skb is enough except for some reneging corner cases that | ||
| 2743 | * are not worth the effort. | ||
| 2744 | * | ||
| 2745 | * Main reason for all this complexity is the fact that connection dying | ||
| 2746 | * time now depends on the validity of the retrans_stamp, in particular, | ||
| 2747 | * that successive retransmissions of a segment must not advance | ||
| 2748 | * retrans_stamp under any conditions. | ||
| 2749 | */ | ||
| 2750 | static int tcp_any_retrans_done(struct sock *sk) | ||
| 2751 | { | ||
| 2752 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2753 | struct sk_buff *skb; | ||
| 2754 | |||
| 2755 | if (tp->retrans_out) | ||
| 2756 | return 1; | ||
| 2757 | |||
| 2758 | skb = tcp_write_queue_head(sk); | ||
| 2759 | if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) | ||
| 2760 | return 1; | ||
| 2761 | |||
| 2762 | return 0; | ||
| 2763 | } | ||
| 2764 | |||
| 2720 | /* Undo during fast recovery after partial ACK. */ | 2765 | /* Undo during fast recovery after partial ACK. */ |
| 2721 | 2766 | ||
| 2722 | static int tcp_try_undo_partial(struct sock *sk, int acked) | 2767 | static int tcp_try_undo_partial(struct sock *sk, int acked) |
| @@ -2729,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) | |||
| 2729 | /* Plain luck! Hole is filled with delayed | 2774 | /* Plain luck! Hole is filled with delayed |
| 2730 | * packet, rather than with a retransmit. | 2775 | * packet, rather than with a retransmit. |
| 2731 | */ | 2776 | */ |
| 2732 | if (tp->retrans_out == 0) | 2777 | if (!tcp_any_retrans_done(sk)) |
| 2733 | tp->retrans_stamp = 0; | 2778 | tp->retrans_stamp = 0; |
| 2734 | 2779 | ||
| 2735 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); | 2780 | tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); |
| @@ -2788,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk) | |||
| 2788 | struct tcp_sock *tp = tcp_sk(sk); | 2833 | struct tcp_sock *tp = tcp_sk(sk); |
| 2789 | int state = TCP_CA_Open; | 2834 | int state = TCP_CA_Open; |
| 2790 | 2835 | ||
| 2791 | if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) | 2836 | if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) |
| 2792 | state = TCP_CA_Disorder; | 2837 | state = TCP_CA_Disorder; |
| 2793 | 2838 | ||
| 2794 | if (inet_csk(sk)->icsk_ca_state != state) { | 2839 | if (inet_csk(sk)->icsk_ca_state != state) { |
| @@ -2803,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) | |||
| 2803 | 2848 | ||
| 2804 | tcp_verify_left_out(tp); | 2849 | tcp_verify_left_out(tp); |
| 2805 | 2850 | ||
| 2806 | if (!tp->frto_counter && tp->retrans_out == 0) | 2851 | if (!tp->frto_counter && !tcp_any_retrans_done(sk)) |
| 2807 | tp->retrans_stamp = 0; | 2852 | tp->retrans_stamp = 0; |
| 2808 | 2853 | ||
| 2809 | if (flag & FLAG_ECE) | 2854 | if (flag & FLAG_ECE) |
| @@ -3698,7 +3743,7 @@ old_ack: | |||
| 3698 | * the fast version below fails. | 3743 | * the fast version below fails. |
| 3699 | */ | 3744 | */ |
| 3700 | void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, | 3745 | void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, |
| 3701 | int estab) | 3746 | u8 **hvpp, int estab) |
| 3702 | { | 3747 | { |
| 3703 | unsigned char *ptr; | 3748 | unsigned char *ptr; |
| 3704 | struct tcphdr *th = tcp_hdr(skb); | 3749 | struct tcphdr *th = tcp_hdr(skb); |
| @@ -3782,7 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, | |||
| 3782 | */ | 3827 | */ |
| 3783 | break; | 3828 | break; |
| 3784 | #endif | 3829 | #endif |
| 3785 | } | 3830 | case TCPOPT_COOKIE: |
| 3831 | /* This option is variable length. | ||
| 3832 | */ | ||
| 3833 | switch (opsize) { | ||
| 3834 | case TCPOLEN_COOKIE_BASE: | ||
| 3835 | /* not yet implemented */ | ||
| 3836 | break; | ||
| 3837 | case TCPOLEN_COOKIE_PAIR: | ||
| 3838 | /* not yet implemented */ | ||
| 3839 | break; | ||
| 3840 | case TCPOLEN_COOKIE_MIN+0: | ||
| 3841 | case TCPOLEN_COOKIE_MIN+2: | ||
| 3842 | case TCPOLEN_COOKIE_MIN+4: | ||
| 3843 | case TCPOLEN_COOKIE_MIN+6: | ||
| 3844 | case TCPOLEN_COOKIE_MAX: | ||
| 3845 | /* 16-bit multiple */ | ||
| 3846 | opt_rx->cookie_plus = opsize; | ||
| 3847 | *hvpp = ptr; | ||
| 3848 | default: | ||
| 3849 | /* ignore option */ | ||
| 3850 | break; | ||
| 3851 | }; | ||
| 3852 | break; | ||
| 3853 | }; | ||
| 3786 | 3854 | ||
| 3787 | ptr += opsize-2; | 3855 | ptr += opsize-2; |
| 3788 | length -= opsize; | 3856 | length -= opsize; |
| @@ -3810,17 +3878,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) | |||
| 3810 | * If it is wrong it falls back on tcp_parse_options(). | 3878 | * If it is wrong it falls back on tcp_parse_options(). |
| 3811 | */ | 3879 | */ |
| 3812 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, | 3880 | static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, |
| 3813 | struct tcp_sock *tp) | 3881 | struct tcp_sock *tp, u8 **hvpp) |
| 3814 | { | 3882 | { |
| 3815 | if (th->doff == sizeof(struct tcphdr) >> 2) { | 3883 | /* In the spirit of fast parsing, compare doff directly to constant |
| 3884 | * values. Because equality is used, short doff can be ignored here. | ||
| 3885 | */ | ||
| 3886 | if (th->doff == (sizeof(*th) / 4)) { | ||
| 3816 | tp->rx_opt.saw_tstamp = 0; | 3887 | tp->rx_opt.saw_tstamp = 0; |
| 3817 | return 0; | 3888 | return 0; |
| 3818 | } else if (tp->rx_opt.tstamp_ok && | 3889 | } else if (tp->rx_opt.tstamp_ok && |
| 3819 | th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { | 3890 | th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { |
| 3820 | if (tcp_parse_aligned_timestamp(tp, th)) | 3891 | if (tcp_parse_aligned_timestamp(tp, th)) |
| 3821 | return 1; | 3892 | return 1; |
| 3822 | } | 3893 | } |
| 3823 | tcp_parse_options(skb, &tp->rx_opt, 1); | 3894 | tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); |
| 3824 | return 1; | 3895 | return 1; |
| 3825 | } | 3896 | } |
| 3826 | 3897 | ||
| @@ -4845,11 +4916,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) | |||
| 4845 | struct tcp_sock *tp = tcp_sk(sk); | 4916 | struct tcp_sock *tp = tcp_sk(sk); |
| 4846 | 4917 | ||
| 4847 | /* More than one full frame received... */ | 4918 | /* More than one full frame received... */ |
| 4848 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss | 4919 | if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && |
| 4849 | /* ... and right edge of window advances far enough. | 4920 | /* ... and right edge of window advances far enough. |
| 4850 | * (tcp_recvmsg() will send ACK otherwise). Or... | 4921 | * (tcp_recvmsg() will send ACK otherwise). Or... |
| 4851 | */ | 4922 | */ |
| 4852 | && __tcp_select_window(sk) >= tp->rcv_wnd) || | 4923 | __tcp_select_window(sk) >= tp->rcv_wnd) || |
| 4853 | /* We ACK each frame or... */ | 4924 | /* We ACK each frame or... */ |
| 4854 | tcp_in_quickack_mode(sk) || | 4925 | tcp_in_quickack_mode(sk) || |
| 4855 | /* We have out of order data. */ | 4926 | /* We have out of order data. */ |
| @@ -5070,10 +5141,12 @@ out: | |||
| 5070 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | 5141 | static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, |
| 5071 | struct tcphdr *th, int syn_inerr) | 5142 | struct tcphdr *th, int syn_inerr) |
| 5072 | { | 5143 | { |
| 5144 | u8 *hash_location; | ||
| 5073 | struct tcp_sock *tp = tcp_sk(sk); | 5145 | struct tcp_sock *tp = tcp_sk(sk); |
| 5074 | 5146 | ||
| 5075 | /* RFC1323: H1. Apply PAWS check first. */ | 5147 | /* RFC1323: H1. Apply PAWS check first. */ |
| 5076 | if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && | 5148 | if (tcp_fast_parse_options(skb, th, tp, &hash_location) && |
| 5149 | tp->rx_opt.saw_tstamp && | ||
| 5077 | tcp_paws_discard(sk, skb)) { | 5150 | tcp_paws_discard(sk, skb)) { |
| 5078 | if (!th->rst) { | 5151 | if (!th->rst) { |
| 5079 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); | 5152 | NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); |
| @@ -5361,11 +5434,13 @@ discard: | |||
| 5361 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | 5434 | static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, |
| 5362 | struct tcphdr *th, unsigned len) | 5435 | struct tcphdr *th, unsigned len) |
| 5363 | { | 5436 | { |
| 5364 | struct tcp_sock *tp = tcp_sk(sk); | 5437 | u8 *hash_location; |
| 5365 | struct inet_connection_sock *icsk = inet_csk(sk); | 5438 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 5439 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 5440 | struct tcp_cookie_values *cvp = tp->cookie_values; | ||
| 5366 | int saved_clamp = tp->rx_opt.mss_clamp; | 5441 | int saved_clamp = tp->rx_opt.mss_clamp; |
| 5367 | 5442 | ||
| 5368 | tcp_parse_options(skb, &tp->rx_opt, 0); | 5443 | tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); |
| 5369 | 5444 | ||
| 5370 | if (th->ack) { | 5445 | if (th->ack) { |
| 5371 | /* rfc793: | 5446 | /* rfc793: |
| @@ -5462,6 +5537,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5462 | * Change state from SYN-SENT only after copied_seq | 5537 | * Change state from SYN-SENT only after copied_seq |
| 5463 | * is initialized. */ | 5538 | * is initialized. */ |
| 5464 | tp->copied_seq = tp->rcv_nxt; | 5539 | tp->copied_seq = tp->rcv_nxt; |
| 5540 | |||
| 5541 | if (cvp != NULL && | ||
| 5542 | cvp->cookie_pair_size > 0 && | ||
| 5543 | tp->rx_opt.cookie_plus > 0) { | ||
| 5544 | int cookie_size = tp->rx_opt.cookie_plus | ||
| 5545 | - TCPOLEN_COOKIE_BASE; | ||
| 5546 | int cookie_pair_size = cookie_size | ||
| 5547 | + cvp->cookie_desired; | ||
| 5548 | |||
| 5549 | /* A cookie extension option was sent and returned. | ||
| 5550 | * Note that each incoming SYNACK replaces the | ||
| 5551 | * Responder cookie. The initial exchange is most | ||
| 5552 | * fragile, as protection against spoofing relies | ||
| 5553 | * entirely upon the sequence and timestamp (above). | ||
| 5554 | * This replacement strategy allows the correct pair to | ||
| 5555 | * pass through, while any others will be filtered via | ||
| 5556 | * Responder verification later. | ||
| 5557 | */ | ||
| 5558 | if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { | ||
| 5559 | memcpy(&cvp->cookie_pair[cvp->cookie_desired], | ||
| 5560 | hash_location, cookie_size); | ||
| 5561 | cvp->cookie_pair_size = cookie_pair_size; | ||
| 5562 | } | ||
| 5563 | } | ||
| 5564 | |||
| 5465 | smp_mb(); | 5565 | smp_mb(); |
| 5466 | tcp_set_state(sk, TCP_ESTABLISHED); | 5566 | tcp_set_state(sk, TCP_ESTABLISHED); |
| 5467 | 5567 | ||
| @@ -5699,11 +5799,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 5699 | 5799 | ||
| 5700 | /* tcp_ack considers this ACK as duplicate | 5800 | /* tcp_ack considers this ACK as duplicate |
| 5701 | * and does not calculate rtt. | 5801 | * and does not calculate rtt. |
| 5702 | * Fix it at least with timestamps. | 5802 | * Force it here. |
| 5703 | */ | 5803 | */ |
| 5704 | if (tp->rx_opt.saw_tstamp && | 5804 | tcp_ack_update_rtt(sk, 0, 0); |
| 5705 | tp->rx_opt.rcv_tsecr && !tp->srtt) | ||
| 5706 | tcp_ack_saw_tstamp(sk, 0); | ||
| 5707 | 5805 | ||
| 5708 | if (tp->rx_opt.tstamp_ok) | 5806 | if (tp->rx_opt.tstamp_ok) |
| 5709 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 5807 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
