author		Eric Dumazet <eric.dumazet@gmail.com>	2012-07-11 01:50:31 -0400
committer	David S. Miller <davem@davemloft.net>	2012-07-11 21:12:59 -0400
commit		46d3ceabd8d98ed0ad10f20c595ca784e34786c5
tree		771200292431be56c6ebcb23af9206bc03d40e65 /net/ipv4/tcp_output.c
parent		2100844ca9d7055d5cddce2f8ed13af94c01f85b
tcp: TCP Small Queues
This introduces TSQ (TCP Small Queues).
TSQ's goal is to reduce the number of TCP packets in xmit queues (qdisc &
device queues), in order to reduce RTT and cwnd bias, which are part of the
bufferbloat problem.
sk->sk_wmem_alloc is not allowed to grow above a given limit, so that no
more than ~128KB [1] per tcp socket sits in the qdisc/dev layers at a
given time.
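In tcp_write_xmit() (last hunk of the diff below) this boils down to a
per-skb test of the following shape; this is a paraphrase of the patch for
readability, not an additional hunk:

	/* Stop feeding the qdisc/device once this socket already has
	 * tcp_limit_output_bytes worth of skb truesize down there, and
	 * remember we were throttled so tcp_wfree() can reschedule us.
	 */
	if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
		set_bit(TSQ_THROTTLED, &tp->tsq_flags);
		break;	/* transmit resumes via tcp_wfree() -> tasklet */
	}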
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically caps TSO packets
below the standard gso max limit (65536) at 40000/2 = 20000 bytes: smaller
TSO packets can help reduce the latency of high-priority packets.
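As a rough illustration of that sizing rule (the helper name below is
hypothetical and only illustrates the arithmetic; the actual capping is not
part of the tcp_output.c hunks shown here):

	/* Hypothetical helper: keep two TSO packets under the per-socket
	 * output limit (uses the kernel min() macro).
	 */
	static u32 tso_size_goal_capped(u32 gso_max_size, u32 limit_output_bytes)
	{
		/* e.g. min(65536, 40000 / 2) = 20000 bytes per TSO packet */
		return min(gso_max_size, limit_output_bytes / 2);
	}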
To do this, we divert sock_wfree() to a tcp_wfree() handler, so that
following frames can be queued/sent when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe NICs) are really impressive, using
the standard pfifo_fast qdisc, with or without TSO/GSO.
With no reduction of nominal bandwidth, buffering per bulk sender drops to:
< 1 ms on Gbit (instead of 50 ms with TSO)
< 8 ms on 100 Mbit (instead of 132 ms)
A single netperf session no longer backlogs 4 MBytes in the qdisc, and
socket autotuning on both sides no longer grows to 4 MBytes.
As the skb destructor cannot restart xmit itself (the qdisc lock might be
held at this point), we delegate the work to a tasklet, using one tasklet
per cpu for performance reasons.
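The tasklet queue relies on two new per-socket fields (tsq_node and
tsq_flags, used throughout the diff below); they are assumed to be added to
struct tcp_sock in include/linux/tcp.h, outside the tcp_output.c diff shown
here, roughly as follows (a reconstruction, not the exact hunk):

	/* Assumed additions to include/linux/tcp.h */
	enum tsq_flags {
		TSQ_THROTTLED,	/* tcp_write_xmit() hit tcp_limit_output_bytes */
		TSQ_QUEUED,	/* socket already sits on a tsq_tasklet queue */
		TSQ_OWNED,	/* tasklet found socket owned by user,
				 * work deferred to tcp_release_cb()
				 */
	};

	struct tcp_sock {
		/* ... */
		unsigned long	tsq_flags;
		struct list_head tsq_node;	/* anchor in tsq_tasklet.head list */
		/* ... */
	};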
If the tasklet finds a socket owned by the user, it sets the TSQ_OWNED
flag. This flag is tested in a new protocol method called from
release_sock(), to eventually send new segments.
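The wiring of that protocol method is assumed to look roughly like this
(struct proto and release_sock() are outside the tcp_output.c diff below,
and the release_cb field name is an assumption):

	/* Assumed hook in struct proto (include/net/sock.h) */
	struct proto {
		/* ... */
		void	(*release_cb)(struct sock *sk);
		/* ... */
	};

	/* Assumed call site in release_sock() (net/core/sock.c), run after the
	 * backlog has been processed; for TCP this resolves to tcp_release_cb().
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);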
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	154
1 file changed, 153 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e28..03854abfd9d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume. Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
+ * to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It's important tcp_wfree() can be replaced by sock_wfree() in the event skb
+ * needs to be reallocated in a driver.
+ * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
+ *
+ * Since transmit from skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TSQ_OWNED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			tcp_write_xmit(sk,
+				       tcp_current_mss(sk),
+				       0, 0,
+				       GFP_ATOMIC);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,