path: root/net/ipv4/tcp_output.c
author      Eric Dumazet <eric.dumazet@gmail.com>          2012-07-11 01:50:31 -0400
committer   David S. Miller <davem@davemloft.net>          2012-07-11 21:12:59 -0400
commit      46d3ceabd8d98ed0ad10f20c595ca784e34786c5 (patch)
tree        771200292431be56c6ebcb23af9206bc03d40e65 /net/ipv4/tcp_output.c
parent      2100844ca9d7055d5cddce2f8ed13af94c01f85b (diff)
tcp: TCP Small Queues
This introduces TSQ (TCP Small Queues).

TSQ's goal is to reduce the number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem.

sk->sk_wmem_alloc is not allowed to grow above a given limit, allowing no
more than ~128KB [1] per tcp socket in the qdisc/dev layers at a given time.

TSO packets are sized/capped to half the limit, so that we have two TSO
packets in flight, allowing better bandwidth use.

As a side effect, setting the limit to 40000 automatically reduces the
standard gso max limit (65536) to 40000/2: it can help to reduce latencies
of high prio packets, by having smaller TSO packets.

This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send
following frames when skb_orphan() [2] is called for the already queued skbs.

Results on my dev machines (tg3/ixgbe nics) are really impressive, using
standard pfifo_fast, and with or without TSO/GSO. With no reduction of
nominal bandwidth, buffering per bulk sender drops to:
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)

I no longer have 4 MBytes backlogged in the qdisc by a single netperf
session, and socket autotuning on both sides no longer uses 4 MBytes.

As the skb destructor cannot restart xmit itself (the qdisc lock might be
taken at this point), we delegate the work to a tasklet. We use one tasklet
per cpu for performance reasons.

If the tasklet finds a socket owned by the user, it sets the TSQ_OWNED flag.
This flag is tested in a new protocol method called from release_sock(), to
eventually send new segments.

[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable

[2] skb_orphan() is usually called at TX completion time, but some drivers
call it in their start_xmit() handler. These drivers should at least use
BQL, or else a single TCP session can still fill the whole NIC TX ring,
since TSQ will have no effect.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--      net/ipv4/tcp_output.c      154
1 file changed, 153 insertions(+), 1 deletion(-)
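
As a quick check of the sizing rule described in the changelog (TSO packets
are capped to half of tcp_limit_output_bytes, so two of them fit under the
limit at any time):

    131072 / 2 = 65536 bytes per TSO packet (the default, matching the standard gso max)
     40000 / 2 = 20000 bytes per TSO packet (the tuned example above, smaller bursts and lower latency for high prio packets)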
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e28..03854abfd9d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
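
The hunk above adds the sysctl backing the /proc/sys/net/ipv4/tcp_limit_output_bytes
tunable from [1]. For reference, a minimal userspace sketch (assumptions: plain C,
root privileges for the write, and the 40000-byte value reused from the changelog
example) that reads the current limit and then lowers it:

/* Read and adjust /proc/sys/net/ipv4/tcp_limit_output_bytes.
 * Error handling is kept to the bare minimum.
 */
#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/net/ipv4/tcp_limit_output_bytes";
        char buf[32];
        FILE *f = fopen(path, "r");

        if (!f || !fgets(buf, sizeof(buf), f)) {
                perror("read tcp_limit_output_bytes");
                return 1;
        }
        fclose(f);
        printf("current limit: %s", buf);

        f = fopen(path, "w");           /* requires root */
        if (!f || fprintf(f, "40000\n") < 0) {
                perror("write tcp_limit_output_bytes");
                return 1;
        }
        fclose(f);
        return 0;
}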
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+                           int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ's goal is to keep a small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It's important that tcp_wfree() can be replaced by sock_wfree() in the event
+ * the skb needs to be reallocated in a driver.
+ * The invariant is that skb->truesize is subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmit from the skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+        struct tasklet_struct   tasklet;
+        struct list_head        head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head, because tcp_wfree() might
+ * interrupt us (non-NAPI drivers).
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+        struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+        LIST_HEAD(list);
+        unsigned long flags;
+        struct list_head *q, *n;
+        struct tcp_sock *tp;
+        struct sock *sk;
+
+        local_irq_save(flags);
+        list_splice_init(&tsq->head, &list);
+        local_irq_restore(flags);
+
+        list_for_each_safe(q, n, &list) {
+                tp = list_entry(q, struct tcp_sock, tsq_node);
+                list_del(&tp->tsq_node);
+
+                sk = (struct sock *)tp;
+                bh_lock_sock(sk);
+
+                if (!sock_owned_by_user(sk)) {
+                        if ((1 << sk->sk_state) &
+                            (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+                             TCPF_CLOSING | TCPF_CLOSE_WAIT))
+                                tcp_write_xmit(sk,
+                                               tcp_current_mss(sk),
+                                               0, 0,
+                                               GFP_ATOMIC);
+                } else {
+                        /* defer the work to tcp_release_cb() */
+                        set_bit(TSQ_OWNED, &tp->tsq_flags);
+                }
+                bh_unlock_sock(sk);
+
+                clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+                sk_free(sk);
+        }
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+                if ((1 << sk->sk_state) &
+                    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+                     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+                        tcp_write_xmit(sk,
+                                       tcp_current_mss(sk),
+                                       0, 0,
+                                       GFP_ATOMIC);
+        }
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+        int i;
+
+        for_each_possible_cpu(i) {
+                struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+                INIT_LIST_HEAD(&tsq->head);
+                tasklet_init(&tsq->tasklet,
+                             tcp_tasklet_func,
+                             (unsigned long)tsq);
+        }
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold the qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+        struct sock *sk = skb->sk;
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+            !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+                unsigned long flags;
+                struct tsq_tasklet *tsq;
+
+                /* Keep a ref on the socket.
+                 * This last ref will be released in tcp_tasklet_func()
+                 */
+                atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+                /* queue this socket to the tasklet queue */
+                local_irq_save(flags);
+                tsq = &__get_cpu_var(tsq_tasklet);
+                list_add(&tp->tsq_node, &tsq->head);
+                tasklet_schedule(&tsq->tasklet);
+                local_irq_restore(flags);
+        } else {
+                sock_wfree(skb);
+        }
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
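
To make the reference-counting trick in tcp_wfree() above easier to follow:
tcp_transmit_skb() charges skb->truesize to sk->sk_wmem_alloc, the destructor
gives back truesize - 1 so one byte stays charged as the tasklet's reference
on the socket, and sk_free() in tcp_tasklet_func() drops that last byte.
A simplified userspace model (illustration only, not kernel code; names are
made up, and the unit held by the socket itself is ignored):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long wmem;                /* stands in for sk->sk_wmem_alloc */

static void transmit(long truesize)     /* tcp_transmit_skb(): charge the skb */
{
        atomic_fetch_add(&wmem, truesize);
}

static void wfree(long truesize)        /* tcp_wfree(): skb freed, socket queued */
{
        /* give back all but one byte: that byte is the tasklet's reference */
        atomic_fetch_sub(&wmem, truesize - 1);
}

static void tasklet_done(void)          /* sk_free() in tcp_tasklet_func() */
{
        atomic_fetch_sub(&wmem, 1);     /* drop the last reference */
}

int main(void)
{
        transmit(4096);                 /* one skb sitting in qdisc/device */
        wfree(4096);                    /* skb consumed, socket handed to tasklet */
        printf("after destructor: %ld byte still charged\n", atomic_load(&wmem));
        tasklet_done();
        printf("after tasklet:    %ld bytes charged\n", atomic_load(&wmem));
        return 0;
}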
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
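
The destructor switch above is also why footnote [2] of the changelog matters:
a driver that orphans the skb in its start_xmit() runs tcp_wfree() immediately,
uncharging sk->sk_wmem_alloc long before TX completion, so the limit check
added below never accounts for packets sitting in the TX ring. A hypothetical
fragment (example_start_xmit is made up for illustration; such drivers should
at least implement BQL):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical driver fragment, for illustration only. */
static netdev_tx_t example_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        skb_orphan(skb);        /* releases the TSQ budget right away */

        /* ... map the buffer and post a descriptor to the TX ring ... */

        return NETDEV_TX_OK;
}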
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,