aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Nandita Dukkipati <nanditad@google.com> 2013-03-11 06:00:44 -0400
committer David S. Miller <davem@davemloft.net> 2013-03-12 08:30:34 -0400
commit 9b717a8d245075ffb8e95a2dfb4ee97ce4747457 (patch)
tree 08e1ee37c89b11e4c08734c671a2427edb942944
parent 6ba8a3b19e764b6a65e4030ab0999be50c291e6c (diff)
tcp: TLP loss detection.
This is the second of the TLP patch series; it augments the basic TLP algorithm with a loss detection scheme.

This patch implements a mechanism for loss detection when a Tail Loss Probe (TLP) retransmission plugs a hole, thereby masking packet loss from the sender. The loss detection algorithm relies on counting TLP dupacks, as outlined in Sec. 3 of:
http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01

The basic idea is: the sender keeps track of a TLP "episode" upon retransmission of a TLP packet. An episode ends when the sender receives an ACK above the SND.NXT that was current when the TLP packet was retransmitted (tracked by tlp_high_seq). We want to make sure that, before the episode ends, the sender receives a "TLP dupack", indicating that the TLP retransmission was unnecessary, so there was no loss/hole that needed plugging. If the sender gets no TLP dupack before the end of the episode, then it reduces ssthresh and the congestion window, because the TLP packet arriving at the receiver probably plugged a hole.

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/linux/tcp.h 1
-rw-r--r-- include/uapi/linux/snmp.h 1
-rw-r--r-- net/ipv4/proc.c 1
-rw-r--r-- net/ipv4/tcp_input.c 39
-rw-r--r-- net/ipv4/tcp_minisocks.c 1
-rw-r--r-- net/ipv4/tcp_output.c 9
-rw-r--r-- net/ipv4/tcp_timer.c 2
7 files changed, 54 insertions, 0 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 01860d74555c..763c108ee03d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -204,6 +204,7 @@ struct tcp_sock {
204 syn_data:1, /* SYN includes data */ 204 syn_data:1, /* SYN includes data */
205 syn_fastopen:1, /* SYN includes Fast Open option */ 205 syn_fastopen:1, /* SYN includes Fast Open option */
206 syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ 206 syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
207 u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
207 208
208/* RTT measurement */ 209/* RTT measurement */
209 u32 srtt; /* smoothed round trip time << 3 */ 210 u32 srtt; /* smoothed round trip time << 3 */
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 290bed6b085f..e00013a1debc 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -203,6 +203,7 @@ enum
203 LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ 203 LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */
204 LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ 204 LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */
205 LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ 205 LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */
206 LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */
206 LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ 207 LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */
207 LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ 208 LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */
208 LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ 209 LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4c35911d935f..b6f2ea174898 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -225,6 +225,7 @@ static const struct snmp_mib snmp4_net_list[] = {
225 SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), 225 SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
226 SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), 226 SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
227 SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), 227 SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
228 SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
228 SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), 229 SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
229 SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), 230 SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
230 SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), 231 SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b794f89ac1f2..836d74dd0187 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2682,6 +2682,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2682 struct tcp_sock *tp = tcp_sk(sk); 2682 struct tcp_sock *tp = tcp_sk(sk);
2683 2683
2684 tp->high_seq = tp->snd_nxt; 2684 tp->high_seq = tp->snd_nxt;
2685 tp->tlp_high_seq = 0;
2685 tp->snd_cwnd_cnt = 0; 2686 tp->snd_cwnd_cnt = 0;
2686 tp->prior_cwnd = tp->snd_cwnd; 2687 tp->prior_cwnd = tp->snd_cwnd;
2687 tp->prr_delivered = 0; 2688 tp->prr_delivered = 0;
@@ -3569,6 +3570,38 @@ static void tcp_send_challenge_ack(struct sock *sk)
3569 } 3570 }
3570} 3571}
3571 3572
3573/* This routine deals with acks during a TLP episode.
3574 * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
3575 */
3576static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3577{
3578 struct tcp_sock *tp = tcp_sk(sk);
3579 bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
3580 !(flag & (FLAG_SND_UNA_ADVANCED |
3581 FLAG_NOT_DUP | FLAG_DATA_SACKED));
3582
3583 /* Mark the end of TLP episode on receiving TLP dupack or when
3584 * ack is after tlp_high_seq.
3585 */
3586 if (is_tlp_dupack) {
3587 tp->tlp_high_seq = 0;
3588 return;
3589 }
3590
3591 if (after(ack, tp->tlp_high_seq)) {
3592 tp->tlp_high_seq = 0;
3593 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3594 if (!(flag & FLAG_DSACKING_ACK)) {
3595 tcp_init_cwnd_reduction(sk, true);
3596 tcp_set_ca_state(sk, TCP_CA_CWR);
3597 tcp_end_cwnd_reduction(sk);
3598 tcp_set_ca_state(sk, TCP_CA_Open);
3599 NET_INC_STATS_BH(sock_net(sk),
3600 LINUX_MIB_TCPLOSSPROBERECOVERY);
3601 }
3602 }
3603}
3604
3572/* This routine deals with incoming acks, but not outgoing ones. */ 3605/* This routine deals with incoming acks, but not outgoing ones. */
3573static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3606static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3574{ 3607{
@@ -3676,6 +3709,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3676 tcp_cong_avoid(sk, ack, prior_in_flight); 3709 tcp_cong_avoid(sk, ack, prior_in_flight);
3677 } 3710 }
3678 3711
3712 if (tp->tlp_high_seq)
3713 tcp_process_tlp_ack(sk, ack, flag);
3714
3679 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3715 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3680 struct dst_entry *dst = __sk_dst_get(sk); 3716 struct dst_entry *dst = __sk_dst_get(sk);
3681 if (dst) 3717 if (dst)
@@ -3697,6 +3733,9 @@ no_queue:
3697 */ 3733 */
3698 if (tcp_send_head(sk)) 3734 if (tcp_send_head(sk))
3699 tcp_ack_probe(sk); 3735 tcp_ack_probe(sk);
3736
3737 if (tp->tlp_high_seq)
3738 tcp_process_tlp_ack(sk, ack, flag);
3700 return 1; 3739 return 1;
3701 3740
3702invalid_ack: 3741invalid_ack:
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b83a49cc3816..4bdb09fca401 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -440,6 +440,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
440 newtp->fackets_out = 0; 440 newtp->fackets_out = 0;
441 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 441 newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
442 tcp_enable_early_retrans(newtp); 442 tcp_enable_early_retrans(newtp);
443 newtp->tlp_high_seq = 0;
443 444
444 /* So many TCP implementations out there (incorrectly) count the 445 /* So many TCP implementations out there (incorrectly) count the
445 * initial SYN frame in their delayed-ACK and congestion control 446 * initial SYN frame in their delayed-ACK and congestion control
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index beb63dbc85f5..8e7742f0b5d2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2132,6 +2132,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
2132 */ 2132 */
2133void tcp_send_loss_probe(struct sock *sk) 2133void tcp_send_loss_probe(struct sock *sk)
2134{ 2134{
2135 struct tcp_sock *tp = tcp_sk(sk);
2135 struct sk_buff *skb; 2136 struct sk_buff *skb;
2136 int pcount; 2137 int pcount;
2137 int mss = tcp_current_mss(sk); 2138 int mss = tcp_current_mss(sk);
@@ -2142,6 +2143,10 @@ void tcp_send_loss_probe(struct sock *sk)
2142 goto rearm_timer; 2143 goto rearm_timer;
2143 } 2144 }
2144 2145
2146 /* At most one outstanding TLP retransmission. */
2147 if (tp->tlp_high_seq)
2148 goto rearm_timer;
2149
2145 /* Retransmit last segment. */ 2150 /* Retransmit last segment. */
2146 skb = tcp_write_queue_tail(sk); 2151 skb = tcp_write_queue_tail(sk);
2147 if (WARN_ON(!skb)) 2152 if (WARN_ON(!skb))
@@ -2164,6 +2169,10 @@ void tcp_send_loss_probe(struct sock *sk)
2164 if (skb->len > 0) 2169 if (skb->len > 0)
2165 err = __tcp_retransmit_skb(sk, skb); 2170 err = __tcp_retransmit_skb(sk, skb);
2166 2171
2172 /* Record snd_nxt for loss detection. */
2173 if (likely(!err))
2174 tp->tlp_high_seq = tp->snd_nxt;
2175
2167rearm_timer: 2176rearm_timer:
2168 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 2177 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2169 inet_csk(sk)->icsk_rto, 2178 inet_csk(sk)->icsk_rto,
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ecd61d54147f..eeccf795e917 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -356,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk)
356 356
357 WARN_ON(tcp_write_queue_empty(sk)); 357 WARN_ON(tcp_write_queue_empty(sk));
358 358
359 tp->tlp_high_seq = 0;
360
359 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && 361 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
360 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { 362 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
361 /* Receiver dastardly shrinks window. Our retransmits 363 /* Receiver dastardly shrinks window. Our retransmits