path: root/net/ipv4
author    Eric Dumazet <edumazet@google.com>    2015-04-23 13:42:39 -0400
committer David S. Miller <davem@davemloft.net>    2015-04-24 11:06:48 -0400
commit    845704a535e9b3c76448f52af1b70e4422ea03fd (patch)
tree      3a14cdcb01442f0f4d61a3161a9ac508bbdb3732 /net/ipv4
parent    e4b6c30375e83b92d9c3e9b9d853417e8cc74006 (diff)
tcp: avoid looping in tcp_send_fin()
Presence of an unbound loop in tcp_send_fin() had always been hard to explain when analyzing crash dumps involving gigantic dying processes with millions of sockets.

Let's try a different strategy: in case of memory pressure, try to add the FIN flag to the last packet in the write queue, even if that packet was already sent. The TCP stack will be able to deliver this FIN after a timeout event. Note that because this FIN is delivered by a retransmit, it also carries a Push flag given our current implementation.

By checking sk_under_memory_pressure(), we anticipate that cooking many FIN packets might deplete tcp memory.

In case we could not allocate a packet, even with a __GFP_WAIT allocation, then not sending a FIN seems quite reasonable if it allows us to get rid of this socket, free memory, and not block the process from eventually doing other useful work.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
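The decision flow described above can be condensed into a small user-space model for clarity. This is an illustrative sketch only, not the kernel code: fin_decision(), its parameters, and the enum values are invented names for this example and do not exist in the kernel.

/* Illustrative user-space model of the FIN-sending decision described in
 * the commit message. All identifiers here are invented for the sketch.
 */
#include <stdbool.h>
#include <stdio.h>

enum fin_action {
	FIN_ON_UNSENT_TAIL,	/* tack FIN onto the not-yet-sent tail skb */
	FIN_ON_SENT_TAIL,	/* tack FIN onto an already-sent tail skb; a later
				 * retransmit (after a timeout) delivers it */
	FIN_NEW_SKB,		/* allocate a dedicated FIN packet */
	FIN_NONE,		/* no memory and nothing to coalesce with: give up */
};

static enum fin_action fin_decision(bool have_tail, bool tail_unsent,
				    bool mem_pressure, bool alloc_ok)
{
	if (have_tail && (tail_unsent || mem_pressure))
		return tail_unsent ? FIN_ON_UNSENT_TAIL : FIN_ON_SENT_TAIL;
	if (alloc_ok)
		return FIN_NEW_SKB;
	/* Allocation failed even though the caller may block: fall back to
	 * coalescing with a sent tail skb if one exists, otherwise close
	 * the flow without a FIN rather than looping forever.
	 */
	return have_tail ? FIN_ON_SENT_TAIL : FIN_NONE;
}

int main(void)
{
	/* Under memory pressure with an already-sent tail skb, the FIN is
	 * coalesced and will ride a later retransmit.
	 */
	if (fin_decision(true, false, true, false) == FIN_ON_SENT_TAIL)
		puts("FIN coalesced onto sent tail skb");
	return 0;
}

The key design point, visible in the diff below, is that a FIN coalesced onto an already-sent skb is not transmitted immediately; tp->snd_nxt is advanced so the retransmit machinery naturally delivers it after a timeout.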
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/tcp_output.c  50
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2ade67b7cdb0..a369e8a70b2c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2814,7 +2814,8 @@ begin_fwd:
 
 /* We allow to exceed memory limits for FIN packets to expedite
  * connection tear down and (memory) recovery.
- * Otherwise tcp_send_fin() could loop forever.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
  */
 static void sk_forced_wmem_schedule(struct sock *sk, int size)
 {
@@ -2827,33 +2828,40 @@ static void sk_forced_wmem_schedule(struct sock *sk, int size)
 	sk_memory_allocated_add(sk, amt, &status);
 }
 
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
  */
 void tcp_send_fin(struct sock *sk)
 {
+	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = tcp_write_queue_tail(sk);
-	int mss_now;
 
-	/* Optimization, tack on the FIN if we have a queue of
-	 * unsent frames. But be careful about outgoing SACKS
-	 * and IP options.
+	/* Optimization, tack on the FIN if we have one skb in write queue and
+	 * this skb was not yet sent, or we are under memory pressure.
+	 * Note: in the latter case, FIN packet will be sent after a timeout,
+	 * as TCP stack thinks it has already been transmitted.
 	 */
-	mss_now = tcp_current_mss(sk);
-
-	if (tcp_send_head(sk)) {
-		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
-		TCP_SKB_CB(skb)->end_seq++;
+	if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
+		if (!tcp_send_head(sk)) {
+			/* This means tskb was already sent.
+			 * Pretend we included the FIN on previous transmit.
+			 * We need to set tp->snd_nxt to the value it would have
+			 * if FIN had been sent. This is because retransmit path
+			 * does not change tp->snd_nxt.
+			 */
+			tp->snd_nxt++;
+			return;
+		}
 	} else {
-		/* Socket is locked, keep trying until memory is available. */
-		for (;;) {
-			skb = alloc_skb_fclone(MAX_TCP_HEADER,
-					       sk->sk_allocation);
-			if (skb)
-				break;
-			yield();
+		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+		if (unlikely(!skb)) {
+			if (tskb)
+				goto coalesce;
+			return;
 		}
 		skb_reserve(skb, MAX_TCP_HEADER);
 		sk_forced_wmem_schedule(sk, skb->truesize);
@@ -2862,7 +2870,7 @@ void tcp_send_fin(struct sock *sk)
				     TCPHDR_ACK | TCPHDR_FIN);
 		tcp_queue_skb(sk, skb);
 	}
-	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
 }
 
 /* We get here when a process closes a file descriptor (either due to