path: root/net/ipv4/tcp_input.c
author		James Morris <james.l.morris@oracle.com>	2014-11-19 05:32:12 -0500
committer	James Morris <james.l.morris@oracle.com>	2014-11-19 05:32:12 -0500
commit		b10778a00d40b3d9fdaaf5891e802794781ff71c (patch)
tree		6ba4cbac86eecedc3f30650e7f764ecf00c83898 /net/ipv4/tcp_input.c
parent		594081ee7145cc30a3977cb4e218f81213b63dc5 (diff)
parent		bfe01a5ba2490f299e1d2d5508cbbbadd897bbe9 (diff)
Merge commit 'v3.17' into next
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--	net/ipv4/tcp_input.c | 211
1 file changed, 187 insertions(+), 24 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 40639c288dc2..a906e0200ff2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -74,6 +74,7 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <net/netdma.h>
+#include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -1904,16 +1905,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	bool new_recovery = false;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1936,11 @@ void tcp_enter_loss(struct sock *sk, int how)
 		tcp_reset_reno_sack(tp);
 
 	tp->undo_marker = tp->snd_una;
-	if (how) {
+
+	skb = tcp_write_queue_head(sk);
+	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
 		tp->fackets_out = 0;
 	}
@@ -1948,7 +1954,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 			tp->undo_marker = 0;
 
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1987,21 @@ void tcp_enter_loss(struct sock *sk, int how)
  * remembered SACKs do not reflect real state of receiver i.e.
  * receiver _host_ is heavily congested (or buggy).
  *
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
  */
 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
 {
 	if (flag & FLAG_SACK_RENEGING) {
-		struct inet_connection_sock *icsk = inet_csk(sk);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		struct tcp_sock *tp = tcp_sk(sk);
+		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+					  msecs_to_jiffies(10));
 
-		tcp_enter_loss(sk, 1);
-		icsk->icsk_retransmits++;
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  icsk->icsk_rto, TCP_RTO_MAX);
+					  delay, TCP_RTO_MAX);
 		return true;
 	}
 	return false;
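The delay in this hunk works because tp->srtt_us stores eight times the smoothed RTT, in microseconds; shifting right by 4 therefore yields SRTT/2, which is then floored at 10 ms. A minimal sketch of the same arithmetic, using a hypothetical helper name purely for illustration:

/* Hypothetical helper mirroring the delay above: srtt_us holds 8 * SRTT
 * in microseconds, so (srtt_us >> 4) is SRTT/2; the result is floored
 * at 10 ms before arming the ICSK_TIME_RETRANS timer.
 */
static unsigned long sack_reneg_delay(u32 srtt_us)
{
	unsigned long half_rtt = usecs_to_jiffies(srtt_us >> 4);

	return max(half_rtt, msecs_to_jiffies(10));
}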
@@ -2475,7 +2483,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
  * losses and/or application stalls), do not perform any further cwnd
  * reductions, but instead slow start up to ssthresh.
  */
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2485,8 +2493,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
 	tp->prior_cwnd = tp->snd_cwnd;
 	tp->prr_delivered = 0;
 	tp->prr_out = 0;
-	if (set_ssthresh)
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
 	TCP_ECN_queue_cwr(tp);
 }
 
@@ -2528,14 +2535,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
 }
 
 /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->prior_ssthresh = 0;
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		tp->undo_marker = 0;
-		tcp_init_cwnd_reduction(sk, set_ssthresh);
+		tcp_init_cwnd_reduction(sk);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 	}
 }
@@ -2564,7 +2571,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
-		tcp_enter_cwr(sk, 1);
+		tcp_enter_cwr(sk);
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
@@ -2670,7 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tcp_init_cwnd_reduction(sk, true);
+		tcp_init_cwnd_reduction(sk);
 	}
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
@@ -2680,7 +2687,6 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
  */
 static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool recovered = !before(tp->snd_una, tp->high_seq);
 
@@ -2706,12 +2712,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 
 	if (recovered) {
 		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
-		icsk->icsk_retransmits = 0;
 		tcp_try_undo_recovery(sk);
 		return;
 	}
-	if (flag & FLAG_DATA_ACKED)
-		icsk->icsk_retransmits = 0;
 	if (tcp_is_reno(tp)) {
 		/* A Reno DUPACK means new data in F-RTO step 2.b above are
 		 * delivered. Lower inflight to clock out (re)tranmissions.
@@ -3043,10 +3046,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 	first_ackt.v64 = 0;
 
 	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		u8 sacked = scb->sacked;
 		u32 acked_pcount;
 
+		if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
+		    between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
 		/* Determine how many packets and what bytes were acked, tso and else */
 		if (after(scb->end_seq, tp->snd_una)) {
 			if (tcp_skb_pcount(skb) == 1 ||
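This hunk emits an SCM_TSTAMP_ACK timestamp for any skb whose shared info carries SKBTX_ACK_TSTAMP once its bytes are cumulatively acked. Userspace opts in via SO_TIMESTAMPING; a minimal sketch, assuming v3.17-era linux/net_tstamp.h flags, might look like:

/* Sketch only: request software TX ACK timestamps on a connected TCP
 * socket. The kernel path added above then calls __skb_tstamp_tx() with
 * SCM_TSTAMP_ACK when the matching bytes are acked; the timestamp is
 * read back from the socket's error queue.
 */
#include <linux/net_tstamp.h>
#include <sys/socket.h>

static int enable_ack_timestamps(int fd)
{
	int val = SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE;

	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
}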
@@ -3346,7 +3354,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 		tp->tlp_high_seq = 0;
 		/* Don't reduce cwnd if DSACK arrives for TLP retrans. */
 		if (!(flag & FLAG_DSACKING_ACK)) {
-			tcp_init_cwnd_reduction(sk, true);
+			tcp_init_cwnd_reduction(sk);
 			tcp_set_ca_state(sk, TCP_CA_CWR);
 			tcp_end_cwnd_reduction(sk);
 			tcp_try_keep_open(sk);
@@ -3393,8 +3401,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
 
-	if (after(ack, prior_snd_una))
+	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
+		icsk->icsk_retransmits = 0;
+	}
 
 	prior_fackets = tp->fackets_out;
 
@@ -5877,3 +5887,156 @@ discard:
 	return 0;
 }
 EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	if (family == AF_INET)
+		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+			       &ireq->ir_rmt_addr, port);
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (family == AF_INET6)
+		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
+			       &ireq->ir_v6_rmt_addr, port);
+#endif
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+		     const struct tcp_request_sock_ops *af_ops,
+		     struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_options_received tmp_opt;
+	struct request_sock *req;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = NULL;
+	__u32 isn = TCP_SKB_CB(skb)->when;
+	bool want_cookie = false, fastopen;
+	struct flowi fl;
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	int err;
+
+
+	/* TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if ((sysctl_tcp_syncookies == 2 ||
+	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+		if (!want_cookie)
+			goto drop;
+	}
+
+
+	/* Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+		goto drop;
+	}
+
+	req = inet_reqsk_alloc(rsk_ops);
+	if (!req)
+		goto drop;
+
+	tcp_rsk(req)->af_specific = af_ops;
+
+	tcp_clear_options(&tmp_opt);
+	tmp_opt.mss_clamp = af_ops->mss_clamp;
+	tmp_opt.user_mss = tp->rx_opt.user_mss;
+	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+	if (want_cookie && !tmp_opt.saw_tstamp)
+		tcp_clear_options(&tmp_opt);
+
+	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+	tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+	af_ops->init_req(req, sk, skb);
+
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
+	if (!want_cookie || tmp_opt.tstamp_ok)
+		TCP_ECN_create_request(req, skb, sock_net(sk));
+
+	if (want_cookie) {
+		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
+	} else if (!isn) {
+		/* VJ's idea. We save last timestamp seen
+		 * from the destination in peer table, when entering
+		 * state TIME-WAIT, and check against it before
+		 * accepting new connection request.
+		 *
+		 * If "isn" is not zero, this request hit alive
+		 * timewait bucket, so that all the necessary checks
+		 * are made in the function processing timewait state.
+		 */
+		if (tcp_death_row.sysctl_tw_recycle) {
+			bool strict;
+
+			dst = af_ops->route_req(sk, &fl, req, &strict);
+
+			if (dst && strict &&
+			    !tcp_peer_is_proven(req, dst, true,
+						tmp_opt.saw_tstamp)) {
+				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+				goto drop_and_release;
+			}
+		}
+		/* Kill the following clause, if you dislike this way. */
+		else if (!sysctl_tcp_syncookies &&
+			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (sysctl_max_syn_backlog >> 2)) &&
+			 !tcp_peer_is_proven(req, dst, false,
+					     tmp_opt.saw_tstamp)) {
+			/* Without syncookies last quarter of
+			 * backlog is filled with destinations,
+			 * proven to be alive.
+			 * It means that we continue to communicate
+			 * to destinations, already remembered
+			 * to the moment of synflood.
+			 */
+			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+				    rsk_ops->family);
+			goto drop_and_release;
+		}
+
+		isn = af_ops->init_seq(skb);
+	}
+	if (!dst) {
+		dst = af_ops->route_req(sk, &fl, req, NULL);
+		if (!dst)
+			goto drop_and_free;
+	}
+
+	tcp_rsk(req)->snt_isn = isn;
+	tcp_openreq_init_rwin(req, sk, dst);
+	fastopen = !want_cookie &&
+		   tcp_try_fastopen(sk, skb, req, &foc, dst);
+	err = af_ops->send_synack(sk, dst, &fl, req,
+				  skb_get_queue_mapping(skb), &foc);
+	if (!fastopen) {
+		if (err || want_cookie)
+			goto drop_and_free;
+
+		tcp_rsk(req)->listener = NULL;
+		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+	}
+
+	return 0;
+
+drop_and_release:
+	dst_release(dst);
+drop_and_free:
+	reqsk_free(req);
+drop:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
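The new tcp_conn_request() factors the formerly duplicated IPv4/IPv6 SYN handling behind the af_ops callbacks used above (init_req, route_req, init_seq, send_synack, queue_hash_add). For orientation only, the IPv4 listener side is expected to delegate to it roughly as in the sketch below; this caller is not part of this diff and paraphrases the tcp_ipv4.c wiring of that era rather than quoting it.

/* Sketch of the IPv4 caller (not in this diff): filter out
 * broadcast/multicast SYNs, then hand off to the protocol-independent
 * tcp_conn_request() with the IPv4 request_sock callback tables.
 */
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return 0;
}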