path: root/net/ipv4/tcp_input.c
author     Eric Dumazet <edumazet@google.com>      2016-08-17 17:17:09 -0400
committer  David S. Miller <davem@davemloft.net>   2016-08-19 02:36:07 -0400
commit     36a6503feddadbbad415fb3891e80f94c10a9b21
tree       2c42e7d2101b14c87d5e4867f32de41ccd8e2ee6  /net/ipv4/tcp_input.c
parent     e2d8f646c79f26e094bfaf9b21be614d1e148a67
tcp: refine tcp_prune_ofo_queue() to not drop all packets
Over the years, the TCP BDP has increased a lot, and is typically on the order of ~10 Mbytes with the help of clever congestion control modules.

In the presence of packet losses, TCP stores incoming packets in an out-of-order queue, and the number of skbs sitting there waiting for the missing packets to arrive can match the BDP (~10 Mbytes).

In some cases, TCP needs to make room for incoming skbs, and the current strategy can simply remove all skbs in the out-of-order queue as a last resort, incurring a huge penalty for both receiver and sender. Unfortunately these 'last resort' events are quite frequent, forcing the sender to send all packets again, stalling the flow and wasting a lot of resources.

This patch cleans only part of the out-of-order queue in order to meet the memory constraints.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: C. Stephen Gun <csg@google.com>
Cc: Van Jacobson <vanj@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
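The strategy described above -- dropping the highest-sequence skbs from the tail of the out-of-order queue until receive-memory accounting is back under budget, rather than purging the queue wholesale -- can be sketched in ordinary userspace C. The sketch below is an illustration only: struct pkt, struct ofo_queue, prune_ofo_tail() and the byte budget are invented stand-ins for the kernel's sk_buff queue, tcp_drop() and sk_rmem_alloc/sk_rcvbuf accounting, not the actual implementation (which follows in the diff).

/* Illustrative sketch only: a userspace model of "prune from the tail
 * until under budget".  struct pkt, struct ofo_queue and the helpers
 * below are invented for this example and are NOT the kernel's data
 * structures.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	unsigned int seq;		/* start sequence of this segment */
	size_t len;			/* payload bytes it accounts for */
	struct pkt *prev, *next;
};

struct ofo_queue {
	struct pkt *head, *tail;	/* kept in ascending seq order */
	size_t rmem_alloc;		/* bytes currently accounted */
};

static void enqueue(struct ofo_queue *q, unsigned int seq, size_t len)
{
	struct pkt *p = calloc(1, sizeof(*p));

	p->seq = seq;
	p->len = len;
	p->prev = q->tail;
	if (q->tail)
		q->tail->next = p;
	else
		q->head = p;
	q->tail = p;
	q->rmem_alloc += len;
}

/* Unlink the highest-sequence packet (the tail); NULL if queue is empty. */
static struct pkt *dequeue_tail(struct ofo_queue *q)
{
	struct pkt *p = q->tail;

	if (!p)
		return NULL;
	q->tail = p->prev;
	if (q->tail)
		q->tail->next = NULL;
	else
		q->head = NULL;
	return p;
}

/* Drop packets from the tail until rmem_alloc fits the budget again.
 * Returns true if the queue has shrunk, false if it was already empty,
 * mirroring the contract of the patched tcp_prune_ofo_queue() but with
 * none of the kernel's SACK or MIB bookkeeping.
 */
static bool prune_ofo_tail(struct ofo_queue *q, size_t rmem_budget)
{
	struct pkt *p;

	if (!q->head)
		return false;

	while ((p = dequeue_tail(q)) != NULL) {
		q->rmem_alloc -= p->len;
		free(p);
		/* Stop as soon as we are under budget: the low-sequence
		 * packets are kept so the hole after the next expected
		 * sequence can still be filled.
		 */
		if (q->rmem_alloc <= rmem_budget)
			break;
	}
	return true;
}

int main(void)
{
	struct ofo_queue q = { 0 };

	for (unsigned int i = 0; i < 8; i++)
		enqueue(&q, 1000 + i * 1460, 1460);

	/* Budget of ~4 segments: only the tail half of the queue is dropped. */
	prune_ofo_tail(&q, 4 * 1460);

	for (struct pkt *p = q.head; p; p = p->next)
		printf("kept seq %u\n", p->seq);
	return 0;
}

Dropping from the tail keeps the lowest-sequence segments, so the hole immediately after the next expected sequence still has a chance to be filled and already-received data need not be retransmitted.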
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--  net/ipv4/tcp_input.c | 47
1 file changed, 28 insertions(+), 19 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3ebf45b38bc3..8cd02c0b056c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4392,12 +4392,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, skb, size)) {
+		while (!sk_rmem_schedule(sk, skb, size)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
-
-			if (!sk_rmem_schedule(sk, skb, size))
-				return -1;
 		}
 	}
 	return 0;
@@ -4874,29 +4871,41 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 }
 
 /*
- * Purge the out-of-order queue.
- * Return true if queue was pruned.
+ * Clean the out-of-order queue to make room.
+ * We drop high sequences packets to :
+ * 1) Let a chance for holes to be filled.
+ * 2) not add too big latencies if thousands of packets sit there.
+ *    (But if application shrinks SO_RCVBUF, we could still end up
+ *     freeing whole queue here)
+ *
+ * Return true if queue has shrunk.
  */
 static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	bool res = false;
+	struct sk_buff *skb;
 
-	if (!skb_queue_empty(&tp->out_of_order_queue)) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
-		__skb_queue_purge(&tp->out_of_order_queue);
+	if (skb_queue_empty(&tp->out_of_order_queue))
+		return false;
 
-		/* Reset SACK state. A conforming SACK implementation will
-		 * do the same at a timeout based retransmit. When a connection
-		 * is in a sad state like this, we care only about integrity
-		 * of the connection not performance.
-		 */
-		if (tp->rx_opt.sack_ok)
-			tcp_sack_reset(&tp->rx_opt);
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+
+	while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue)) != NULL) {
+		tcp_drop(sk, skb);
 		sk_mem_reclaim(sk);
-		res = true;
-	}
-	return res;
+		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+		    !tcp_under_memory_pressure(sk))
+			break;
+	}
+
+	/* Reset SACK state. A conforming SACK implementation will
+	 * do the same at a timeout based retransmit. When a connection
+	 * is in a sad state like this, we care only about integrity
+	 * of the connection not performance.
+	 */
+	if (tp->rx_opt.sack_ok)
+		tcp_sack_reset(&tp->rx_opt);
+	return true;
 }
 
 /* Reduce allocated memory if we can, trying to get
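For completeness, the first hunk's caller-side change can be modelled the same way: tcp_try_rmem_schedule() now retries the reservation after every partial prune instead of purging once and retrying once. The following sketch uses an invented byte-budget model (rmem_alloc, ofo_bytes, try_reserve(), prune_some()) purely to show the control flow; it is not kernel code.

/* Sketch of the caller-side retry loop from the first hunk.  The byte
 * budget below (rmem_alloc, ofo_bytes, rmem_limit) and the two helpers
 * are invented for illustration; in the kernel the equivalent steps are
 * sk_rmem_schedule() and tcp_prune_ofo_queue().
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static size_t rmem_alloc = 9000;	/* bytes already charged to the socket */
static size_t ofo_bytes  = 6000;	/* portion held by out-of-order segments */
static const size_t rmem_limit = 10000;

static bool try_reserve(size_t size)
{
	if (rmem_alloc + size > rmem_limit)
		return false;
	rmem_alloc += size;
	return true;
}

/* Free roughly one segment of out-of-order data per call; return false
 * once there is nothing left to prune.
 */
static bool prune_some(void)
{
	size_t chunk = ofo_bytes < 1500 ? ofo_bytes : 1500;

	if (!chunk)
		return false;
	ofo_bytes -= chunk;
	rmem_alloc -= chunk;
	return true;
}

/* 0 on success, -1 if the reservation cannot be satisfied even after
 * pruning everything prunable -- the shape of the patched
 * tcp_try_rmem_schedule() fallback path.
 */
static int rmem_schedule_with_pruning(size_t size)
{
	while (!try_reserve(size)) {
		if (!prune_some())
			return -1;
	}
	return 0;
}

int main(void)
{
	printf("reserve 4000 bytes: %s\n",
	       rmem_schedule_with_pruning(4000) == 0 ? "ok" : "failed");
	printf("out-of-order bytes still queued: %zu\n", ofo_bytes);
	return 0;
}

The loop terminates either when the reservation finally fits or when pruning reports that nothing more can be freed, which is exactly when returning -1 (and eventually dropping the incoming packet) is the only option left.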