path: root/net/ipv4/tcp_output.c
author      Eric Dumazet <eric.dumazet@gmail.com>          2012-07-11 01:50:31 -0400
committer   David S. Miller <davem@davemloft.net>          2012-07-11 21:12:59 -0400
commit      46d3ceabd8d98ed0ad10f20c595ca784e34786c5 (patch)
tree        771200292431be56c6ebcb23af9206bc03d40e65 /net/ipv4/tcp_output.c
parent      2100844ca9d7055d5cddce2f8ed13af94c01f85b (diff)
tcp: TCP Small Queues
This introduces TSQ (TCP Small Queues).

TSQ's goal is to reduce the number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem.

sk->sk_wmem_alloc is not allowed to grow above a given limit, allowing no
more than ~128KB [1] per tcp socket in the qdisc/dev layers at a given time.

TSO packets are sized/capped to half the limit, so that we have two TSO
packets in flight, allowing better bandwidth use.

As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2: it can help to reduce latencies
of high prio packets, by having smaller TSO packets.

This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send
following frames when skb_orphan() [2] is called for the already queued skbs.

Results on my dev machines (tg3/ixgbe nics) are really impressive, using
standard pfifo_fast, and with or without TSO/GSO. With no reduction of
nominal bandwidth, buffering per bulk sender drops to:
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)

I no longer have 4 MBytes backlogged in the qdisc by a single netperf
session, and socket autotuning on both sides no longer uses 4 MBytes.

As the skb destructor cannot restart xmit itself (the qdisc lock might be
taken at this point), we delegate the work to a tasklet. We use one tasklet
per cpu for performance reasons.

If the tasklet finds a socket owned by the user, it sets the TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(), to
eventually send new segments.

[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable

[2] skb_orphan() is usually called at TX completion time, but some drivers
call it in their start_xmit() handler. These drivers should at least use
BQL, or else a single TCP session can still fill the whole NIC TX ring,
since TSQ will have no effect.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--      net/ipv4/tcp_output.c      154
1 file changed, 153 insertions(+), 1 deletion(-)
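
As a quick check of the sizing rule described in the changelog (TSO packets
are capped to half of tcp_limit_output_bytes, so two of them fit under the
limit at any time):

    131072 / 2 = 65536 bytes per TSO packet (the default, matching the standard gso max)
     40000 / 2 = 20000 bytes per TSO packet (the tuned example above, smaller bursts and lower latency for high prio packets)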
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e28..03854abfd9d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
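
The hunk above adds the sysctl backing the /proc/sys/net/ipv4/tcp_limit_output_bytes
tunable from [1]. For reference, a minimal userspace sketch (assumptions: plain C,
root privileges for the write, and the 40000-byte value reused from the changelog
example) that reads the current limit and then lowers it:

/* Read and adjust /proc/sys/net/ipv4/tcp_limit_output_bytes.
 * Error handling is kept to the bare minimum.
 */
#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/ipv4/tcp_limit_output_bytes";
        char buf[32];
        FILE *f = fopen(path, "r");

        if (!f || !fgets(buf, sizeof(buf), f)) {
                perror("read tcp_limit_output_bytes");
                return 1;
        }
        fclose(f);
        printf("current limit: %s", buf);

        f = fopen(path, "w");           /* requires root */
        if (!f || fprintf(f, "40000\n") < 0) {
                perror("write tcp_limit_output_bytes");
                return 1;
        }
        fclose(f);
        return 0;
}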
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+                           int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ's goal is to keep a small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
+ * the skb needs to be reallocated in a driver.
+ * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmit from the skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+        struct tasklet_struct   tasklet;
+        struct list_head        head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head, because tcp_wfree() might
+ * interrupt us (non-NAPI drivers).
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+        struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+        LIST_HEAD(list);
+        unsigned long flags;
+        struct list_head *q, *n;
+        struct tcp_sock *tp;
+        struct sock *sk;
+
+        local_irq_save(flags);
+        list_splice_init(&tsq->head, &list);
+        local_irq_restore(flags);
+
+        list_for_each_safe(q, n, &list) {
+                tp = list_entry(q, struct tcp_sock, tsq_node);
+                list_del(&tp->tsq_node);
+
+                sk = (struct sock *)tp;
+                bh_lock_sock(sk);
+
+                if (!sock_owned_by_user(sk)) {
+                        if ((1 << sk->sk_state) &
+                            (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+                             TCPF_CLOSING | TCPF_CLOSE_WAIT))
+                                tcp_write_xmit(sk,
+                                               tcp_current_mss(sk),
+                                               0, 0,
+                                               GFP_ATOMIC);
+                } else {
+                        /* defer the work to tcp_release_cb() */
+                        set_bit(TSQ_OWNED, &tp->tsq_flags);
+                }
+                bh_unlock_sock(sk);
+
+                clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+                sk_free(sk);
+        }
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+                if ((1 << sk->sk_state) &
+                    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+                     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+                        tcp_write_xmit(sk,
+                                       tcp_current_mss(sk),
+                                       0, 0,
+                                       GFP_ATOMIC);
+        }
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+        int i;
+
+        for_each_possible_cpu(i) {
+                struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+                INIT_LIST_HEAD(&tsq->head);
+                tasklet_init(&tsq->tasklet,
+                             tcp_tasklet_func,
+                             (unsigned long)tsq);
+        }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold the qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+        struct sock *sk = skb->sk;
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+            !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+                unsigned long flags;
+                struct tsq_tasklet *tsq;
+
+                /* Keep a ref on the socket.
+                 * This last ref will be released in tcp_tasklet_func()
+                 */
+                atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+                /* queue this socket to the tasklet queue */
+                local_irq_save(flags);
+                tsq = &__get_cpu_var(tsq_tasklet);
+                list_add(&tp->tsq_node, &tsq->head);
+                tasklet_schedule(&tsq->tasklet);
+                local_irq_restore(flags);
+        } else {
+                sock_wfree(skb);
+        }
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
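
To make the reference-counting trick in tcp_wfree() above easier to follow:
tcp_transmit_skb() charges skb->truesize to sk->sk_wmem_alloc, the destructor
gives back truesize - 1 so one byte stays charged as the tasklet's reference
on the socket, and sk_free() in tcp_tasklet_func() drops that last byte.
A simplified userspace model (illustration only, not kernel code; names are
made up, and the unit held by the socket itself is ignored):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long wmem;                /* stands in for sk->sk_wmem_alloc */

static void transmit(long truesize)     /* tcp_transmit_skb(): charge the skb */
{
        atomic_fetch_add(&wmem, truesize);
}

static void wfree(long truesize)        /* tcp_wfree(): skb freed, socket queued */
{
        /* give back all but one byte: that byte is the tasklet's reference */
        atomic_fetch_sub(&wmem, truesize - 1);
}

static void tasklet_done(void)          /* sk_free() in tcp_tasklet_func() */
{
        atomic_fetch_sub(&wmem, 1);     /* drop the last reference */
}

int main(void)
{
        transmit(4096);                 /* one skb sitting in qdisc/device */
        wfree(4096);                    /* skb consumed, socket handed to tasklet */
        printf("after destructor: %ld byte still charged\n", atomic_load(&wmem));
        tasklet_done();
        printf("after tasklet:    %ld bytes charged\n", atomic_load(&wmem));
        return 0;
}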
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
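
The destructor switch above is also why footnote [2] of the changelog matters:
a driver that orphans the skb in its start_xmit() runs tcp_wfree() immediately,
uncharging sk->sk_wmem_alloc long before TX completion, so the limit check
added below never accounts for packets sitting in the TX ring. A hypothetical
fragment (example_start_xmit is made up for illustration; such drivers should
at least implement BQL):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical driver fragment, for illustration only. */
static netdev_tx_t example_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        skb_orphan(skb);        /* releases the TSQ budget right away */

        /* ... map the buffer and post a descriptor to the TX ring ... */

        return NETDEV_TX_OK;
}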
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,