aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Yuchung Cheng <ycheng@google.com> 2018-07-18 16:56:36 -0400
committer David S. Miller <davem@davemloft.net> 2018-07-20 17:32:23 -0400
commit a0496ef2c23b3b180902dd185d0d63ccbc624cf8 (patch)
tree 14637b238b127759e4ccccf6f05fb89aaa6772ca
parent 27cde44a259c380a3c09066fc4b42de7dde9b1ad (diff)
tcp: do not delay ACK in DCTCP upon CE status change
Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change
has to be sent immediately so the sender can respond quickly:

"""
When receiving packets, the CE codepoint MUST be processed as follows:

1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to
   true and send an immediate ACK.

2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE
   to false and send an immediate ACK.
"""

Previously the DCTCP implementation could continue to delay the ACK. This
patch fixes that to implement the RFC by forcing an immediate ACK.

Tested with this packetdrill script provided by Larry Brakmo:

0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
0.000 bind(3, ..., ...) = 0
0.000 listen(3, 1) = 0

0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
0.110 < [ect0] . 1:1(0) ack 1 win 257
0.200 accept(3, ..., ...) = 4
+0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0

0.200 < [ect0] . 1:1001(1000) ack 1 win 257
0.200 > [ect01] . 1:1(0) ack 1001

0.200 write(4, ..., 1) = 1
0.200 > [ect01] P. 1:2(1) ack 1001

0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
+0.005 < [ce] . 2001:3001(1000) ack 2 win 257

+0.000 > [ect01] . 2:2(0) ack 2001
// Previously the ACK below would be delayed by 40ms
+0.000 > [ect01] E. 2:2(0) ack 3001

+0.500 < F. 9501:9501(0) ack 4 win 257

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- include/net/tcp.h | 1
-rw-r--r-- net/ipv4/tcp_dctcp.c | 30
-rw-r--r-- net/ipv4/tcp_input.c | 3
3 files changed, 21 insertions, 13 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a08de496d1b2..25116ec02087 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
342 struct pipe_inode_info *pipe, size_t len, 342 struct pipe_inode_info *pipe, size_t len,
343 unsigned int flags); 343 unsigned int flags);
344 344
345void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
345static inline void tcp_dec_quickack_mode(struct sock *sk, 346static inline void tcp_dec_quickack_mode(struct sock *sk,
346 const unsigned int pkts) 347 const unsigned int pkts)
347{ 348{
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 078328afbfe3..8b637f9f23a2 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -129,12 +129,15 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
129 struct dctcp *ca = inet_csk_ca(sk); 129 struct dctcp *ca = inet_csk_ca(sk);
130 struct tcp_sock *tp = tcp_sk(sk); 130 struct tcp_sock *tp = tcp_sk(sk);
131 131
132 /* State has changed from CE=0 to CE=1 and delayed 132 if (!ca->ce_state) {
133 * ACK has not sent yet. 133 /* State has changed from CE=0 to CE=1, force an immediate
134 */ 134 * ACK to reflect the new CE state. If an ACK was delayed,
135 if (!ca->ce_state && 135 * send that first to reflect the prior CE state.
136 inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) 136 */
137 __tcp_send_ack(sk, ca->prior_rcv_nxt); 137 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
138 __tcp_send_ack(sk, ca->prior_rcv_nxt);
139 tcp_enter_quickack_mode(sk, 1);
140 }
138 141
139 ca->prior_rcv_nxt = tp->rcv_nxt; 142 ca->prior_rcv_nxt = tp->rcv_nxt;
140 ca->ce_state = 1; 143 ca->ce_state = 1;
@@ -147,12 +150,15 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
147 struct dctcp *ca = inet_csk_ca(sk); 150 struct dctcp *ca = inet_csk_ca(sk);
148 struct tcp_sock *tp = tcp_sk(sk); 151 struct tcp_sock *tp = tcp_sk(sk);
149 152
150 /* State has changed from CE=1 to CE=0 and delayed 153 if (ca->ce_state) {
151 * ACK has not sent yet. 154 /* State has changed from CE=1 to CE=0, force an immediate
152 */ 155 * ACK to reflect the new CE state. If an ACK was delayed,
153 if (ca->ce_state && 156 * send that first to reflect the prior CE state.
154 inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) 157 */
155 __tcp_send_ack(sk, ca->prior_rcv_nxt); 158 if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
159 __tcp_send_ack(sk, ca->prior_rcv_nxt);
160 tcp_enter_quickack_mode(sk, 1);
161 }
156 162
157 ca->prior_rcv_nxt = tp->rcv_nxt; 163 ca->prior_rcv_nxt = tp->rcv_nxt;
158 ca->ce_state = 0; 164 ca->ce_state = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8e5522c6833a..6bade06aaf72 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -215,7 +215,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
215 icsk->icsk_ack.quick = quickacks; 215 icsk->icsk_ack.quick = quickacks;
216} 216}
217 217
218static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) 218void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
219{ 219{
220 struct inet_connection_sock *icsk = inet_csk(sk); 220 struct inet_connection_sock *icsk = inet_csk(sk);
221 221
@@ -223,6 +223,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
223 icsk->icsk_ack.pingpong = 0; 223 icsk->icsk_ack.pingpong = 0;
224 icsk->icsk_ack.ato = TCP_ATO_MIN; 224 icsk->icsk_ack.ato = TCP_ATO_MIN;
225} 225}
226EXPORT_SYMBOL(tcp_enter_quickack_mode);
226 227
227/* Send ACKs quickly, if "quick" count is not exhausted 228/* Send ACKs quickly, if "quick" count is not exhausted
228 * and the session is not interactive. 229 * and the session is not interactive.