| field | value | date |
|---|---|---|
| author | Eric Dumazet <eric.dumazet@gmail.com> | 2012-07-11 01:50:31 -0400 |
| committer | David S. Miller <davem@davemloft.net> | 2012-07-11 21:12:59 -0400 |
| commit | 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 | |
| tree | 771200292431be56c6ebcb23af9206bc03d40e65 | |
| parent | 2100844ca9d7055d5cddce2f8ed13af94c01f85b | |
tcp: TCP Small Queues
This introduces TSQ (TCP Small Queues).
TSQ's goal is to reduce the number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc is not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in the qdisc/dev layers
at a given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard GSO max limit (65536) to 40000/2: having smaller TSO packets
can help reduce latencies of high-prio packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without any reduction of nominal bandwidth, we get a reduction of
buffering per bulk sender:
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in the qdisc by a single netperf
session, and socket autotuning on both sides no longer uses 4 MBytes.
As the skb destructor cannot restart xmit itself (the qdisc lock might
be held at this point), we delegate the work to a tasklet. We use one
tasklet per cpu for performance reasons.
If the tasklet finds a socket owned by the user, it sets the TSQ_OWNED
flag. This flag is tested in a new protocol method called from
release_sock(), to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Documentation/networking/ip-sysctl.txt | 14 |
| -rw-r--r-- | include/linux/tcp.h | 9 |
| -rw-r--r-- | include/net/sock.h | 2 |
| -rw-r--r-- | include/net/tcp.h | 4 |
| -rw-r--r-- | net/core/sock.c | 4 |
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 |
| -rw-r--r-- | net/ipv4/tcp.c | 6 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 |
| -rw-r--r-- | net/ipv4/tcp_minisocks.c | 1 |
| -rw-r--r-- | net/ipv4/tcp_output.c | 154 |
| -rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |

11 files changed, 202 insertions(+), 1 deletion(-)
```diff
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79e9b05..e20c17a7d34e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_limit_output_bytes - INTEGER
+	Controls TCP Small Queue limit per tcp socket.
+	TCP bulk sender tends to increase packets in flight until it
+	gets losses notifications. With SNDBUF autotuning, this can
+	result in a large amount of packets queued in qdisc/device
+	on the local machine, hurting latency of other flows, for
+	typical pfifo_fast qdiscs.
+	tcp_limit_output_bytes limits the number of bytes on qdisc
+	or device to reduce artificial RTT/cwnd and reduce bufferbloat.
+	Note: For GSO/TSO enabled flows, we try to have at least two
+	packets in flight. Reducing tcp_limit_output_bytes might also
+	reduce the size of individual GSO packet (64KB being the max)
+	Default: 131072
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
```
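The tunable documented above is a plain integer exposed through procfs. As a quick illustration (a sketch, not part of this patch; it assumes a kernel carrying this commit, otherwise the file does not exist), a userspace program can read it and derive the TSO size cap described in the commit message:

```c
/* Userspace sketch (not part of this patch): read the TSQ limit added
 * by this commit and print the resulting TSO size goal cap (half the
 * limit, so two TSO packets fit in flight).
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_limit_output_bytes", "r");
	long limit;

	if (!f) {
		perror("tcp_limit_output_bytes"); /* kernel lacks this patch? */
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%ld", &limit) != 1) {
		fclose(f);
		return EXIT_FAILURE;
	}
	fclose(f);
	printf("TSQ limit: %ld bytes (TSO size goal capped at %ld)\n",
	       limit, limit / 2);
	return EXIT_SUCCESS;
}
```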
```diff
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2de9cf46f9fc..1888169e07c7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -339,6 +339,9 @@ struct tcp_sock {
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 
+	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	unsigned long	tsq_flags;
+
 	/* Data for direct copy to user */
 	struct {
 		struct sk_buff_head	prequeue;
@@ -494,6 +497,12 @@ struct tcp_sock {
 	struct tcp_cookie_values  *cookie_values;
 };
 
+enum tsq_flags {
+	TSQ_THROTTLED,
+	TSQ_QUEUED,
+	TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
```
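The three tsq_flags bits added above are manipulated with the kernel's atomic bit operations (set_bit, test_and_set_bit, test_and_clear_bit). A userspace analogue of the handshake between the xmit path and the skb destructor (an illustrative sketch using C11 atomics, not kernel code):

```c
/* Userspace analogue (not kernel code) of how the tsq_flags bits
 * cooperate. The kernel bit helpers are modeled with C11
 * atomic_fetch_or/atomic_fetch_and on an unsigned long.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { TSQ_THROTTLED, TSQ_QUEUED, TSQ_OWNED };

static _Atomic unsigned long tsq_flags;

static bool test_and_set_bit(int nr, _Atomic unsigned long *flags)
{
	unsigned long mask = 1UL << nr;
	return atomic_fetch_or(flags, mask) & mask;
}

static bool test_and_clear_bit(int nr, _Atomic unsigned long *flags)
{
	unsigned long mask = 1UL << nr;
	return atomic_fetch_and(flags, ~mask) & mask;
}

int main(void)
{
	/* xmit path hits the byte limit and throttles the flow */
	test_and_set_bit(TSQ_THROTTLED, &tsq_flags);

	/* later, an skb destructor fires: only a caller that sees
	 * THROTTLED set and QUEUED clear enqueues the socket */
	if (test_and_clear_bit(TSQ_THROTTLED, &tsq_flags) &&
	    !test_and_set_bit(TSQ_QUEUED, &tsq_flags))
		printf("socket queued to the tasklet exactly once\n");
	return 0;
}
```

The test_and_set_bit(TSQ_QUEUED) guard is what guarantees a socket is queued to the tasklet at most once, no matter how many skb destructors fire before the tasklet runs.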
```diff
diff --git a/include/net/sock.h b/include/net/sock.h
index dcb54a0793ec..88de092df50f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -858,6 +858,8 @@ struct proto {
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
 
+	void		(*release_cb)(struct sock *sk);
+
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
```
```diff
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3618fefae049..439984b9af49 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_limit_output_bytes;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -321,6 +322,8 @@ extern struct proto tcp_prot;
 
 extern void tcp_init_mem(struct net *net);
 
+extern void tcp_tasklet_init(void);
+
 extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
@@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			size_t size);
 extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
+extern void tcp_release_cb(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
```
```diff
diff --git a/net/core/sock.c b/net/core/sock.c
index 929bdcc2383b..24039ac12426 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk)
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
+
+	if (sk->sk_prot->release_cb)
+		sk->sk_prot->release_cb(sk);
+
 	sk->sk_lock.owned = 0;
 	if (waitqueue_active(&sk->sk_lock.wq))
 		wake_up(&sk->sk_lock.wq);
```
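release_cb gives a protocol one callback while the socket lock is still owned, which is exactly where TSQ drains the work the tasklet had to defer. A minimal pthread analogue of this "run deferred work just before unlocking" pattern (an illustrative sketch, not kernel code):

```c
/* Userspace analogue (not kernel code) of the release_cb hook added
 * above: the lock owner drains work that other contexts could not do
 * themselves because the lock was held, just before releasing it.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool deferred_pending; /* analogue of the TSQ_OWNED bit */

static void release_cb(void)
{
	printf("running work deferred while the lock was owned\n");
}

static void release_sock_like(void)
{
	/* still holding the lock here, like release_sock() */
	if (deferred_pending) {
		deferred_pending = false;
		release_cb();
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_mutex_lock(&lock);
	deferred_pending = true; /* in the kernel, set by the tasklet */
	release_sock_like();
	return 0;
}
```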
```diff
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 12aa0c5867c4..70730f7aeafe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_limit_output_bytes",
+		.data		= &sysctl_tcp_limit_output_bytes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
```
```diff
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d902da96d154..4252cd8f39fd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk)
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
+	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
@@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 			   inet_csk(sk)->icsk_ext_hdr_len -
 			   tp->tcp_header_len);
 
+		/* TSQ : try to have two TSO segments in flight */
+		xmit_size_goal = min_t(u32, xmit_size_goal,
+				       sysctl_tcp_limit_output_bytes >> 1);
+
 		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
 		/* We try hard to avoid divides here */
@@ -3574,4 +3579,5 @@ void __init tcp_init(void)
 	tcp_secret_primary = &tcp_secret_one;
 	tcp_secret_retiring = &tcp_secret_two;
 	tcp_secret_secondary = &tcp_secret_two;
+	tcp_tasklet_init();
 }
```
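The min_t() clamp in tcp_xmit_size_goal() above is where the commit message's "40000 reduces GSO to 20000" example comes from. A standalone computation (a sketch using the values quoted in the commit message):

```c
/* Standalone sketch of the clamp added to tcp_xmit_size_goal() above:
 * the TSO size goal is capped at half the TSQ limit so two packets fit.
 */
#include <stdio.h>

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int gso_max = 65536;		/* standard GSO max */
	int limits[] = { 131072, 40000 };	/* default, and the example */

	for (int i = 0; i < 2; i++) {
		unsigned int goal = min_u32(gso_max, (unsigned int)limits[i] >> 1);
		printf("tcp_limit_output_bytes=%d -> size goal %u\n",
		       limits[i], goal);
	}
	/* 131072 -> 65536 (the default leaves GSO untouched);
	 * 40000  -> 20000 (the commit message's example). */
	return 0;
}
```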
```diff
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ddefd39ac0cf..01545a3fc0f2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2588,6 +2588,7 @@ struct proto tcp_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
```
```diff
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 65608863fdee..c66f2ede160e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
 
 		tcp_prequeue_init(newtp);
+		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
 
```
```diff
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e28..03854abfd9d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ's goal is to keep a small number of skbs per tcp flow in the tx
+ * queues (qdisc+dev), to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It is important that tcp_wfree() can be replaced by sock_wfree() in the
+ * event the skb needs to be reallocated in a driver.
+ * The invariant is skb->truesize subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmit from the skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head, because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TSQ_OWNED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			tcp_write_xmit(sk,
+				       tcp_current_mss(sk),
+				       0, 0,
+				       GFP_ATOMIC);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold the qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on the socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to the tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
```
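One subtlety in tcp_wfree() above: atomic_sub(skb->truesize - 1, ...) releases the skb's whole charge except one unit, so sk->sk_wmem_alloc stays non-zero and the socket cannot be freed before tcp_tasklet_func() drops the last unit via sk_free(). A userspace analogue of this keep-one-reference handoff (an illustrative sketch, not kernel code), shown before the remaining tcp_output.c hunks:

```c
/* Userspace sketch (not kernel code) of tcp_wfree()'s refcount trick:
 * on free, drop all but one unit of the accounted weight, hand the
 * object to a deferred worker, and let the worker drop the final unit
 * (sk_free() in the kernel) once it has tried to transmit more.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_sock {
	_Atomic int wmem_alloc; /* analogue of sk->sk_wmem_alloc */
};

static void sk_free_like(struct fake_sock *sk)
{
	if (atomic_fetch_sub(&sk->wmem_alloc, 1) == 1)
		printf("last reference gone, socket freed\n");
}

static void tasklet_func_like(struct fake_sock *sk)
{
	printf("tasklet: try to xmit more, then drop the kept reference\n");
	sk_free_like(sk);
}

int main(void)
{
	struct fake_sock sk = { .wmem_alloc = 0 };
	int truesize = 4096;

	atomic_fetch_add(&sk.wmem_alloc, truesize); /* skb charged at xmit */

	/* destructor: keep exactly one unit so the socket cannot vanish
	 * before the deferred worker runs */
	atomic_fetch_sub(&sk.wmem_alloc, truesize - 1);
	tasklet_func_like(&sk);
	return 0;
}
```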
```diff
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts for skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
```
```diff
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 61175cb2478f..70458a9cd837 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
```
