path: root/net/ipv4/tcp_input.c
author     Eric Dumazet <edumazet@google.com>      2016-08-17 17:17:09 -0400
committer  David S. Miller <davem@davemloft.net>   2016-08-19 02:36:07 -0400
commit     36a6503feddadbbad415fb3891e80f94c10a9b21
tree       2c42e7d2101b14c87d5e4867f32de41ccd8e2ee6  /net/ipv4/tcp_input.c
parent     e2d8f646c79f26e094bfaf9b21be614d1e148a67
tcp: refine tcp_prune_ofo_queue() to not drop all packets
Over the years, the TCP BDP has increased a lot, and is typically on the order of ~10 Mbytes with the help of clever congestion control modules.

In the presence of packet losses, TCP stores incoming packets in an out-of-order queue, and the number of skbs sitting there waiting for the missing packets to arrive can match the BDP (~10 Mbytes).

In some cases, TCP needs to make room for incoming skbs, and the current strategy can simply remove all skbs in the out-of-order queue as a last resort, incurring a huge penalty for both receiver and sender. Unfortunately these 'last resort' events are quite frequent, forcing the sender to send all packets again, stalling the flow and wasting a lot of resources.

This patch cleans only part of the out-of-order queue in order to meet the memory constraints.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: C. Stephen Gun <csg@google.com>
Cc: Van Jacobson <vanj@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
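The strategy described above -- dropping the highest-sequence skbs from the tail of the out-of-order queue until receive-memory accounting is back under budget, rather than purging the queue wholesale -- can be sketched in ordinary userspace C. The sketch below is an illustration only: struct pkt, struct ofo_queue, prune_ofo_tail() and the byte budget are invented stand-ins for the kernel's sk_buff queue, tcp_drop() and sk_rmem_alloc/sk_rcvbuf accounting, not the actual implementation (which follows in the diff).

/* Illustrative sketch only: a userspace model of "prune from the tail
 * until under budget".  struct pkt, struct ofo_queue and the helpers
 * below are invented for this example and are NOT the kernel's data
 * structures.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	unsigned int seq;		/* start sequence of this segment */
	size_t len;			/* payload bytes it accounts for */
	struct pkt *prev, *next;
};

struct ofo_queue {
	struct pkt *head, *tail;	/* kept in ascending seq order */
	size_t rmem_alloc;		/* bytes currently accounted */
};

static void enqueue(struct ofo_queue *q, unsigned int seq, size_t len)
{
	struct pkt *p = calloc(1, sizeof(*p));

	p->seq = seq;
	p->len = len;
	p->prev = q->tail;
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
	q->rmem_alloc += len;
}

/* Unlink the highest-sequence packet (the tail); NULL if queue is empty. */
static struct pkt *dequeue_tail(struct ofo_queue *q)
{
	struct pkt *p = q->tail;

	if (!p)
		return NULL;
	q->tail = p->prev;
	if (q->tail)
		q->tail->next = NULL;
	else
		q->head = NULL;
	return p;
}

/* Drop packets from the tail until rmem_alloc fits the budget again.
 * Returns true if the queue has shrunk, false if it was already empty,
 * mirroring the contract of the patched tcp_prune_ofo_queue() but with
 * none of the kernel's SACK or MIB bookkeeping.
 */
static bool prune_ofo_tail(struct ofo_queue *q, size_t rmem_budget)
{
	struct pkt *p;

	if (!q->head)
		return false;

	while ((p = dequeue_tail(q)) != NULL) {
		q->rmem_alloc -= p->len;
		free(p);
		/* Stop as soon as we are under budget: the low-sequence
		 * packets are kept so the hole after the next expected
		 * sequence can still be filled.
		 */
		if (q->rmem_alloc <= rmem_budget)
			break;
	}
	return true;
}

int main(void)
{
	struct ofo_queue q = { 0 };

	for (unsigned int i = 0; i < 8; i++)
		enqueue(&q, 1000 + i * 1460, 1460);

	/* Budget of ~4 segments: only the tail half of the queue is dropped. */
	prune_ofo_tail(&q, 4 * 1460);

	for (struct pkt *p = q.head; p; p = p->next)
		printf("kept seq %u\n", p->seq);
	return 0;
}

Dropping from the tail keeps the lowest-sequence segments, so the hole immediately after the next expected sequence still has a chance to be filled and already-received data need not be retransmitted.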
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 47
1 file changed, 28 insertions(+), 19 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ebf45b38bc3..8cd02c0b056c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, skb, size)) {
+		while (!sk_rmem_schedule(sk, skb, size)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
-
-			if (!sk_rmem_schedule(sk, skb, size))
-				return -1;
 		}
 	}
 	return 0;
@@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 }
 
 /*
- * Purge the out-of-order queue.
- * Return true if queue was pruned.
+ * Clean the out-of-order queue to make room.
+ * We drop high sequences packets to :
+ * 1) Let a chance for holes to be filled.
+ * 2) not add too big latencies if thousands of packets sit there.
+ *    (But if application shrinks SO_RCVBUF, we could still end up
+ *     freeing whole queue here)
+ *
+ * Return true if queue has shrunk.
  */
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	bool res = false;
+	struct sk_buff *skb;
 
-	if (!skb_queue_empty(&tp->out_of_order_queue)) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-		__skb_queue_purge(&tp->out_of_order_queue);
+	if (skb_queue_empty(&tp->out_of_order_queue))
+		return false;
 
-		/* Reset SACK state. A conforming SACK implementation will
-		 * do the same at a timeout based retransmit. When a connection
-		 * is in a sad state like this, we care only about integrity
-		 * of the connection not performance.
-		 */
-		if (tp->rx_opt.sack_ok)
-			tcp_sack_reset(&tp->rx_opt);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+
+	while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
+		tcp_drop(sk, skb);
 		sk_mem_reclaim(sk);
-		res = true;
-	}
-	return res;
+		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+		    !tcp_under_memory_pressure(sk))
+			break;
+	}
+
+	/* Reset SACK state. A conforming SACK implementation will
+	 * do the same at a timeout based retransmit. When a connection
+	 * is in a sad state like this, we care only about integrity
+	 * of the connection not performance.
+	 */
+	if (tp->rx_opt.sack_ok)
+		tcp_sack_reset(&tp->rx_opt);
+	return true;
 }
 
 /* Reduce allocated memory if we can, trying to get
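For completeness, the first hunk's caller-side change can be modelled the same way: tcp_try_rmem_schedule() now retries the reservation after every partial prune instead of purging once and retrying once. The following sketch uses an invented byte-budget model (rmem_alloc, ofo_bytes, try_reserve(), prune_some()) purely to show the control flow; it is not kernel code.

/* Sketch of the caller-side retry loop from the first hunk.  The byte
 * budget below (rmem_alloc, ofo_bytes, rmem_limit) and the two helpers
 * are invented for illustration; in the kernel the equivalent steps are
 * sk_rmem_schedule() and tcp_prune_ofo_queue().
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static size_t rmem_alloc = 9000;	/* bytes already charged to the socket */
static size_t ofo_bytes  = 6000;	/* portion held by out-of-order segments */
static const size_t rmem_limit = 10000;

static bool try_reserve(size_t size)
{
	if (rmem_alloc + size > rmem_limit)
		return false;
	rmem_alloc += size;
	return true;
}

/* Free roughly one segment of out-of-order data per call; return false
 * once there is nothing left to prune.
 */
static bool prune_some(void)
{
	size_t chunk = ofo_bytes < 1500 ? ofo_bytes : 1500;

	if (!chunk)
		return false;
	ofo_bytes -= chunk;
	rmem_alloc -= chunk;
	return true;
}

/* 0 on success, -1 if the reservation cannot be satisfied even after
 * pruning everything prunable -- the shape of the patched
 * tcp_try_rmem_schedule() fallback path.
 */
static int rmem_schedule_with_pruning(size_t size)
{
	while (!try_reserve(size)) {
		if (!prune_some())
			return -1;
	}
	return 0;
}

int main(void)
{
	printf("reserve 4000 bytes: %s\n",
	       rmem_schedule_with_pruning(4000) == 0 ? "ok" : "failed");
	printf("out-of-order bytes still queued: %zu\n", ofo_bytes);
	return 0;
}

The loop terminates either when the reservation finally fits or when pruning reports that nothing more can be freed, which is exactly when returning -1 (and eventually dropping the incoming packet) is the only option left.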