aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-01-29 01:19:09 -0500
committerDavid S. Miller <davem@davemloft.net>2015-01-29 01:19:09 -0500
commit95224ac1801cbfadc2c587be15fded69a13c4e3b (patch)
tree59f39d492be4f8ee202ca1bc1cb50612ed033edc
parent59343cd7c4809cf7598789e1cd14563780ae4239 (diff)
parentd6b1a8a92a1417f8859a6937d2e6ffe2dfab4e6d (diff)
Merge branch 'tcp_stretch_acks'
Neal Cardwell says: ==================== fix stretch ACK bugs in TCP CUBIC and Reno This patch series fixes the TCP CUBIC and Reno congestion control modules to properly handle stretch ACKs in their respective additive increase modes, and in the transitions from slow start to additive increase. This finishes the project started by commit 9f9843a751d0a2057 ("tcp: properly handle stretch acks in slow start"), which fixed behavior for TCP congestion control when handling stretch ACKs in slow start mode. Motivation: In the Jan 2015 netdev thread 'BW regression after "tcp: refine TSO autosizing"', Eyal Perry documented a regression that Eric Dumazet determined was caused by improper handling of TCP stretch ACKs. Background: LRO, GRO, delayed ACKs, and middleboxes can cause "stretch ACKs" that cover more than the RFC-specified maximum of 2 packets. These stretch ACKs can cause serious performance shortfalls in common congestion control algorithms, like Reno and CUBIC, which were designed and tuned years ago with receiver hosts that were not using LRO or GRO, and were instead ACKing every other packet. Testing: at Google we have been using this approach for handling stretch ACKs for CUBIC datacenter and Internet traffic for several years, with good results. v2: * fixed return type of tcp_slow_start() to be u32 instead of int ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/tcp.h4
-rw-r--r--net/ipv4/tcp_bic.c2
-rw-r--r--net/ipv4/tcp_cong.c32
-rw-r--r--net/ipv4/tcp_cubic.c39
-rw-r--r--net/ipv4/tcp_scalable.c3
-rw-r--r--net/ipv4/tcp_veno.c2
-rw-r--r--net/ipv4/tcp_yeah.c2
7 files changed, 44 insertions, 40 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf76f..9d9111ef43ae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -834,8 +834,8 @@ void tcp_get_available_congestion_control(char *buf, size_t len);
834void tcp_get_allowed_congestion_control(char *buf, size_t len); 834void tcp_get_allowed_congestion_control(char *buf, size_t len);
835int tcp_set_allowed_congestion_control(char *allowed); 835int tcp_set_allowed_congestion_control(char *allowed);
836int tcp_set_congestion_control(struct sock *sk, const char *name); 836int tcp_set_congestion_control(struct sock *sk, const char *name);
837void tcp_slow_start(struct tcp_sock *tp, u32 acked); 837u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
838void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); 838void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
839 839
840u32 tcp_reno_ssthresh(struct sock *sk); 840u32 tcp_reno_ssthresh(struct sock *sk);
841void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); 841void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index bb395d46a389..c037644eafb7 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
150 tcp_slow_start(tp, acked); 150 tcp_slow_start(tp, acked);
151 else { 151 else {
152 bictcp_update(ca, tp->snd_cwnd); 152 bictcp_update(ca, tp->snd_cwnd);
153 tcp_cong_avoid_ai(tp, ca->cnt); 153 tcp_cong_avoid_ai(tp, ca->cnt, 1);
154 } 154 }
155} 155}
156 156
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0dd16bc..8670e68e2ce6 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -291,26 +291,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
291 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and 291 * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
292 * returns the leftover acks to adjust cwnd in congestion avoidance mode. 292 * returns the leftover acks to adjust cwnd in congestion avoidance mode.
293 */ 293 */
294void tcp_slow_start(struct tcp_sock *tp, u32 acked) 294u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
295{ 295{
296 u32 cwnd = tp->snd_cwnd + acked; 296 u32 cwnd = tp->snd_cwnd + acked;
297 297
298 if (cwnd > tp->snd_ssthresh) 298 if (cwnd > tp->snd_ssthresh)
299 cwnd = tp->snd_ssthresh + 1; 299 cwnd = tp->snd_ssthresh + 1;
300 acked -= cwnd - tp->snd_cwnd;
300 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); 301 tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
302
303 return acked;
301} 304}
302EXPORT_SYMBOL_GPL(tcp_slow_start); 305EXPORT_SYMBOL_GPL(tcp_slow_start);
303 306
304/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ 307/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
305void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) 308 * for every packet that was ACKed.
309 */
310void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
306{ 311{
312 tp->snd_cwnd_cnt += acked;
307 if (tp->snd_cwnd_cnt >= w) { 313 if (tp->snd_cwnd_cnt >= w) {
308 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 314 u32 delta = tp->snd_cwnd_cnt / w;
309 tp->snd_cwnd++; 315
310 tp->snd_cwnd_cnt = 0; 316 tp->snd_cwnd_cnt -= delta * w;
311 } else { 317 tp->snd_cwnd += delta;
312 tp->snd_cwnd_cnt++;
313 } 318 }
319 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
314} 320}
315EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); 321EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
316 322
@@ -329,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
329 return; 335 return;
330 336
331 /* In "safe" area, increase. */ 337 /* In "safe" area, increase. */
332 if (tp->snd_cwnd <= tp->snd_ssthresh) 338 if (tp->snd_cwnd <= tp->snd_ssthresh) {
333 tcp_slow_start(tp, acked); 339 acked = tcp_slow_start(tp, acked);
340 if (!acked)
341 return;
342 }
334 /* In dangerous area, increase slowly. */ 343 /* In dangerous area, increase slowly. */
335 else 344 tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
336 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
337} 345}
338EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 346EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
339 347
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 6b6002416a73..4b276d1ed980 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -93,9 +93,7 @@ struct bictcp {
93 u32 epoch_start; /* beginning of an epoch */ 93 u32 epoch_start; /* beginning of an epoch */
94 u32 ack_cnt; /* number of acks */ 94 u32 ack_cnt; /* number of acks */
95 u32 tcp_cwnd; /* estimated tcp cwnd */ 95 u32 tcp_cwnd; /* estimated tcp cwnd */
96#define ACK_RATIO_SHIFT 4 96 u16 unused;
97#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
98 u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
99 u8 sample_cnt; /* number of samples to decide curr_rtt */ 97 u8 sample_cnt; /* number of samples to decide curr_rtt */
100 u8 found; /* the exit point is found? */ 98 u8 found; /* the exit point is found? */
101 u32 round_start; /* beginning of each round */ 99 u32 round_start; /* beginning of each round */
@@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca)
114 ca->bic_K = 0; 112 ca->bic_K = 0;
115 ca->delay_min = 0; 113 ca->delay_min = 0;
116 ca->epoch_start = 0; 114 ca->epoch_start = 0;
117 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
118 ca->ack_cnt = 0; 115 ca->ack_cnt = 0;
119 ca->tcp_cwnd = 0; 116 ca->tcp_cwnd = 0;
120 ca->found = 0; 117 ca->found = 0;
@@ -205,23 +202,30 @@ static u32 cubic_root(u64 a)
205/* 202/*
206 * Compute congestion window to use. 203 * Compute congestion window to use.
207 */ 204 */
208static inline void bictcp_update(struct bictcp *ca, u32 cwnd) 205static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
209{ 206{
210 u32 delta, bic_target, max_cnt; 207 u32 delta, bic_target, max_cnt;
211 u64 offs, t; 208 u64 offs, t;
212 209
213 ca->ack_cnt++; /* count the number of ACKs */ 210 ca->ack_cnt += acked; /* count the number of ACKed packets */
214 211
215 if (ca->last_cwnd == cwnd && 212 if (ca->last_cwnd == cwnd &&
216 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) 213 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
217 return; 214 return;
218 215
216 /* The CUBIC function can update ca->cnt at most once per jiffy.
217 * On all cwnd reduction events, ca->epoch_start is set to 0,
218 * which will force a recalculation of ca->cnt.
219 */
220 if (ca->epoch_start && tcp_time_stamp == ca->last_time)
221 goto tcp_friendliness;
222
219 ca->last_cwnd = cwnd; 223 ca->last_cwnd = cwnd;
220 ca->last_time = tcp_time_stamp; 224 ca->last_time = tcp_time_stamp;
221 225
222 if (ca->epoch_start == 0) { 226 if (ca->epoch_start == 0) {
223 ca->epoch_start = tcp_time_stamp; /* record beginning */ 227 ca->epoch_start = tcp_time_stamp; /* record beginning */
224 ca->ack_cnt = 1; /* start counting */ 228 ca->ack_cnt = acked; /* start counting */
225 ca->tcp_cwnd = cwnd; /* syn with cubic */ 229 ca->tcp_cwnd = cwnd; /* syn with cubic */
226 230
227 if (ca->last_max_cwnd <= cwnd) { 231 if (ca->last_max_cwnd <= cwnd) {
@@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
283 if (ca->last_max_cwnd == 0 && ca->cnt > 20) 287 if (ca->last_max_cwnd == 0 && ca->cnt > 20)
284 ca->cnt = 20; /* increase cwnd 5% per RTT */ 288 ca->cnt = 20; /* increase cwnd 5% per RTT */
285 289
290tcp_friendliness:
286 /* TCP Friendly */ 291 /* TCP Friendly */
287 if (tcp_friendliness) { 292 if (tcp_friendliness) {
288 u32 scale = beta_scale; 293 u32 scale = beta_scale;
@@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
301 } 306 }
302 } 307 }
303 308
304 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
305 if (ca->cnt == 0) /* cannot be zero */ 309 if (ca->cnt == 0) /* cannot be zero */
306 ca->cnt = 1; 310 ca->cnt = 1;
307} 311}
@@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
317 if (tp->snd_cwnd <= tp->snd_ssthresh) { 321 if (tp->snd_cwnd <= tp->snd_ssthresh) {
318 if (hystart && after(ack, ca->end_seq)) 322 if (hystart && after(ack, ca->end_seq))
319 bictcp_hystart_reset(sk); 323 bictcp_hystart_reset(sk);
320 tcp_slow_start(tp, acked); 324 acked = tcp_slow_start(tp, acked);
321 } else { 325 if (!acked)
322 bictcp_update(ca, tp->snd_cwnd); 326 return;
323 tcp_cong_avoid_ai(tp, ca->cnt);
324 } 327 }
328 bictcp_update(ca, tp->snd_cwnd, acked);
329 tcp_cong_avoid_ai(tp, ca->cnt, acked);
325} 330}
326 331
327static u32 bictcp_recalc_ssthresh(struct sock *sk) 332static u32 bictcp_recalc_ssthresh(struct sock *sk)
@@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay)
411 */ 416 */
412static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) 417static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
413{ 418{
414 const struct inet_connection_sock *icsk = inet_csk(sk);
415 const struct tcp_sock *tp = tcp_sk(sk); 419 const struct tcp_sock *tp = tcp_sk(sk);
416 struct bictcp *ca = inet_csk_ca(sk); 420 struct bictcp *ca = inet_csk_ca(sk);
417 u32 delay; 421 u32 delay;
418 422
419 if (icsk->icsk_ca_state == TCP_CA_Open) {
420 u32 ratio = ca->delayed_ack;
421
422 ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
423 ratio += cnt;
424
425 ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
426 }
427
428 /* Some calls are for duplicates without timetamps */ 423 /* Some calls are for duplicates without timetamps */
429 if (rtt_us < 0) 424 if (rtt_us < 0)
430 return; 425 return;
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 6824afb65d93..333bcb2415ff 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
25 if (tp->snd_cwnd <= tp->snd_ssthresh) 25 if (tp->snd_cwnd <= tp->snd_ssthresh)
26 tcp_slow_start(tp, acked); 26 tcp_slow_start(tp, acked);
27 else 27 else
28 tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); 28 tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
29 1);
29} 30}
30 31
31static u32 tcp_scalable_ssthresh(struct sock *sk) 32static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index a4d2d2d88dca..112151eeee45 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
159 /* In the "non-congestive state", increase cwnd 159 /* In the "non-congestive state", increase cwnd
160 * every rtt. 160 * every rtt.
161 */ 161 */
162 tcp_cong_avoid_ai(tp, tp->snd_cwnd); 162 tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
163 } else { 163 } else {
164 /* In the "congestive state", increase cwnd 164 /* In the "congestive state", increase cwnd
165 * every other rtt. 165 * every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index cd7273218598..17d35662930d 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
92 92
93 } else { 93 } else {
94 /* Reno */ 94 /* Reno */
95 tcp_cong_avoid_ai(tp, tp->snd_cwnd); 95 tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
96 } 96 }
97 97
98 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. 98 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.