Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c	180
1 file changed, 95 insertions(+), 85 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28e029632493..2549b29b062d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
  */
 
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/kernel.h>
@@ -77,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
 int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
 int sysctl_tcp_ecn __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_ecn);
 int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
@@ -89,6 +93,8 @@ int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_frto_response __read_mostly;
 int sysctl_tcp_nometrics_save __read_mostly;
 
+int sysctl_tcp_thin_dupack __read_mostly;
+
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_abc __read_mostly;
 
@@ -176,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
 	icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 }
 
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	tcp_incr_quickack(sk);
@@ -253,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk)
 	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
 		     sizeof(struct sk_buff);
 
-	if (sk->sk_sndbuf < 3 * sndmem)
-		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+	if (sk->sk_sndbuf < 3 * sndmem) {
+		sk->sk_sndbuf = 3 * sndmem;
+		if (sk->sk_sndbuf > sysctl_tcp_wmem[2])
+			sk->sk_sndbuf = sysctl_tcp_wmem[2];
+	}
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -390,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk)
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_memory_pressure &&
-	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
 	}
@@ -416,15 +425,16 @@ void tcp_initialize_rcv_mss(struct sock *sk)
 
 	inet_csk(sk)->icsk_ack.rcv_mss = hint;
 }
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
 
 /* Receiver "autotuning" code.
  *
  * The algorithm for RTT estimation w/o timestamps is based on
  * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ * <http://public.lanl.gov/radiant/pubs.html#DRS>
  *
  * More detail on this code can be found at
- * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * <http://staff.psc.edu/jheffner/>,
  * though this reference is out of date. A new paper
  * is pending.
  */
@@ -724,7 +734,7 @@ void tcp_update_metrics(struct sock *sk)
 		 * Reset our results.
 		 */
 		if (!(dst_metric_locked(dst, RTAX_RTT)))
-			dst->metrics[RTAX_RTT - 1] = 0;
+			dst_metric_set(dst, RTAX_RTT, 0);
 		return;
 	}
 
@@ -766,57 +776,48 @@ void tcp_update_metrics(struct sock *sk)
 			if (dst_metric(dst, RTAX_SSTHRESH) &&
 			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
 			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
-				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
 			if (!dst_metric_locked(dst, RTAX_CWND) &&
 			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
-				dst->metrics[RTAX_CWND - 1] = tp->snd_cwnd;
+				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
 		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
 			   icsk->icsk_ca_state == TCP_CA_Open) {
 			/* Cong. avoidance phase, cwnd is reliable. */
 			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
-				dst->metrics[RTAX_SSTHRESH-1] =
-					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
+				dst_metric_set(dst, RTAX_SSTHRESH,
+					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
 			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_cwnd) >> 1;
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_cwnd) >> 1);
 		} else {
 			/* Else slow start did not finish, cwnd is non-sense,
 			   ssthresh may be also invalid.
 			 */
 			if (!dst_metric_locked(dst, RTAX_CWND))
-				dst->metrics[RTAX_CWND-1] = (dst_metric(dst, RTAX_CWND) + tp->snd_ssthresh) >> 1;
+				dst_metric_set(dst, RTAX_CWND,
+					       (dst_metric(dst, RTAX_CWND) +
+						tp->snd_ssthresh) >> 1);
 			if (dst_metric(dst, RTAX_SSTHRESH) &&
 			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
 			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
-				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
 		}
 
 		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
 			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
 			    tp->reordering != sysctl_tcp_reordering)
-				dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
 		}
 	}
 }
 
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- *	The RFC specifies a window of no more than 4380 bytes
- *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
- *	is a bit misleading because they use a clamp at 4380 bytes
- *	rather than use a multiplier in the relevant range.
- */
 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-	if (!cwnd) {
-		if (tp->mss_cache > 1460)
-			cwnd = 2;
-		else
-			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
-	}
+	if (!cwnd)
+		cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
@@ -915,25 +916,20 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
 	}
 	tcp_set_rto(sk);
-	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
-		goto reset;
-
-cwnd:
-	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-	return;
-
+	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
 reset:
 	/* Play conservative. If timestamps are not
 	 * supported, TCP will fail to recalculate correct
 	 * rtt, if initial rto is too small. FORGET ALL AND RESET!
 	 */
 	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
 		tp->srtt = 0;
 		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
 		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+	}
 	}
-	goto cwnd;
+	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
 static void tcp_update_reordering(struct sock *sk, const int metric,
939static void tcp_update_reordering(struct sock *sk, const int metric, 935static void tcp_update_reordering(struct sock *sk, const int metric,
@@ -2307,7 +2303,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
 
 static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 {
-	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+	return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
 }
 
 static inline int tcp_head_timedout(struct sock *sk)
@@ -2447,6 +2443,16 @@ static int tcp_time_to_recover(struct sock *sk)
 		return 1;
 	}
 
+	/* If a thin stream is detected, retransmit after first
+	 * received dupack. Employ only if SACK is supported in order
+	 * to avoid possible corner-case series of spurious retransmissions
+	 * Use only if there are no unsent data.
+	 */
+	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+	    tcp_is_sack(tp) && !tcp_send_head(sk))
+		return 1;
+
 	return 0;
 }
 
@@ -2491,7 +2497,7 @@ static void tcp_timeout_skbs(struct sock *sk)
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -2503,6 +2509,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 	if (tp->lost_skb_hint) {
 		skb = tp->lost_skb_hint;
 		cnt = tp->lost_cnt_hint;
+		/* Head already handled? */
+		if (mark_head && skb != tcp_write_queue_head(sk))
+			return;
 	} else {
 		skb = tcp_write_queue_head(sk);
 		cnt = 0;
@@ -2525,7 +2534,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 		cnt += tcp_skb_pcount(skb);
 
 		if (cnt > packets) {
-			if (tcp_is_sack(tp) || (oldcnt >= packets))
+			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
+			    (oldcnt >= packets))
 				break;
 
 			mss = skb_shinfo(skb)->gso_size;
@@ -2536,6 +2546,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 		}
 
 		tcp_skb_mark_lost(tp, skb);
+
+		if (mark_head)
+			break;
 	}
 	tcp_verify_left_out(tp);
 }
@@ -2547,17 +2560,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1);
+		tcp_mark_head_lost(sk, 1, 1);
 	} else if (tcp_is_fack(tp)) {
 		int lost = tp->fackets_out - tp->reordering;
 		if (lost <= 0)
 			lost = 1;
-		tcp_mark_head_lost(sk, lost);
+		tcp_mark_head_lost(sk, lost, 0);
 	} else {
 		int sacked_upto = tp->sacked_out - tp->reordering;
-		if (sacked_upto < fast_rexmit)
-			sacked_upto = fast_rexmit;
-		tcp_mark_head_lost(sk, sacked_upto);
+		if (sacked_upto >= 0)
+			tcp_mark_head_lost(sk, sacked_upto, 0);
+		else if (fast_rexmit)
+			tcp_mark_head_lost(sk, 1, 1);
 	}
 
 	tcp_timeout_skbs(sk);
@@ -2623,7 +2637,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 	if (sk->sk_family == AF_INET) {
 		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       &inet->daddr, ntohs(inet->dport),
+		       &inet->inet_daddr, ntohs(inet->inet_dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
@@ -2633,7 +2647,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       &np->daddr, ntohs(inet->dport),
+		       &np->daddr, ntohs(inet->inet_dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
@@ -2866,7 +2880,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
 		       icsk->icsk_mtup.probe_size;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
 
 	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 	icsk->icsk_mtup.probe_size = 0;
@@ -2922,6 +2936,7 @@ void tcp_simple_retransmit(struct sock *sk)
 	}
 	tcp_xmit_retransmit_queue(sk);
 }
+EXPORT_SYMBOL(tcp_simple_retransmit);
 
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
@@ -2962,7 +2977,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	    before(tp->snd_una, tp->high_seq) &&
 	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
-		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
 	}
 
@@ -3270,7 +3285,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			 * connection startup slow start one packet too
 			 * quickly.  This is severely frowned upon behavior.
 			 */
-			if (!(scb->flags & TCPCB_FLAG_SYN)) {
+			if (!(scb->flags & TCPHDR_SYN)) {
 				flag |= FLAG_DATA_ACKED;
 			} else {
 				flag |= FLAG_SYN_ACKED;
@@ -3390,8 +3405,8 @@ static void tcp_ack_probe(struct sock *sk)
 
 static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
-	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
-		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
 static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3408,9 +3423,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
 					const u32 ack, const u32 ack_seq,
 					const u32 nwin)
 {
-	return (after(ack, tp->snd_una) ||
+	return	after(ack, tp->snd_una) ||
 		after(ack_seq, tp->snd_wl1) ||
-		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
 }
 
 /* Update our send window.
@@ -3694,7 +3709,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
-		dst_confirm(sk->sk_dst_cache);
+		dst_confirm(__sk_dst_get(sk));
 
 	return 1;
 
@@ -3829,18 +3844,20 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
 					/* 16-bit multiple */
 					opt_rx->cookie_plus = opsize;
 					*hvpp = ptr;
+					break;
 				default:
 					/* ignore option */
 					break;
-				};
+				}
 				break;
-			};
+			}
 
 			ptr += opsize-2;
 			length -= opsize;
 		}
 	}
 }
+EXPORT_SYMBOL(tcp_parse_options);
 
 static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
 {
@@ -3907,13 +3924,14 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
 			if (opsize < 2 || opsize > length)
 				return NULL;
 			if (opcode == TCPOPT_MD5SIG)
-				return ptr;
+				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
 		}
 		ptr += opsize - 2;
 		length -= opsize;
 	}
 	return NULL;
 }
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
 #endif
 
 static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -4024,6 +4042,8 @@ static void tcp_reset(struct sock *sk)
 	default:
 		sk->sk_err = ECONNRESET;
 	}
+	/* This barrier is coupled with smp_rmb() in tcp_poll() */
+	smp_wmb();
 
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk->sk_error_report(sk);
@@ -4303,7 +4323,7 @@ static void tcp_ofo_queue(struct sock *sk)
 		}
 
 		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
-			SOCK_DEBUG(sk, "ofo packet was already received \n");
+			SOCK_DEBUG(sk, "ofo packet was already received\n");
 			__skb_unlink(skb, &tp->out_of_order_queue);
 			__kfree_skb(skb);
 			continue;
@@ -4351,6 +4371,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
 		goto drop;
 
+	skb_dst_drop(skb);
 	__skb_pull(skb, th->doff * 4);
 
 	TCP_ECN_accept_cwr(tp, skb);
@@ -4842,7 +4863,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
-	if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
@@ -5414,6 +5435,7 @@ discard:
 	__kfree_skb(skb);
 	return 0;
 }
+EXPORT_SYMBOL(tcp_rcv_established);
 
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
@@ -5783,11 +5805,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 			/* tcp_ack considers this ACK as duplicate
 			 * and does not calculate rtt.
-			 * Fix it at least with timestamps.
+			 * Force it here.
 			 */
-			if (tp->rx_opt.saw_tstamp &&
-			    tp->rx_opt.rcv_tsecr && !tp->srtt)
-				tcp_ack_saw_tstamp(sk, 0);
+			tcp_ack_update_rtt(sk, 0, 0);
 
 			if (tp->rx_opt.tstamp_ok)
 				tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -5819,7 +5839,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tp->snd_una == tp->write_seq) {
 			tcp_set_state(sk, TCP_FIN_WAIT2);
 			sk->sk_shutdown |= SEND_SHUTDOWN;
-			dst_confirm(sk->sk_dst_cache);
+			dst_confirm(__sk_dst_get(sk));
 
 			if (!sock_flag(sk, SOCK_DEAD))
 				/* Wake up lingering close() */
@@ -5915,14 +5935,4 @@ discard:
 	}
 	return 0;
 }
-
-EXPORT_SYMBOL(sysctl_tcp_ecn);
-EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-EXPORT_SYMBOL(tcp_parse_options);
-#ifdef CONFIG_TCP_MD5SIG
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
-#endif
-EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);