| field | value | date |
|---|---|---|
| author | Eric Dumazet <eric.dumazet@gmail.com> | 2012-07-11 01:50:31 -0400 |
| committer | David S. Miller <davem@davemloft.net> | 2012-07-11 21:12:59 -0400 |
| commit | 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 | |
| tree | 771200292431be56c6ebcb23af9206bc03d40e65 | |
| parent | 2100844ca9d7055d5cddce2f8ed13af94c01f85b | |
tcp: TCP Small Queues
This introduces TSQ (TCP Small Queues).
TSQ's goal is to reduce the number of TCP packets in xmit queues (qdisc &
device queues), to reduce RTT and cwnd bias, part of the bufferbloat
problem.
sk->sk_wmem_alloc is not allowed to grow above a given limit,
allowing no more than ~128KB [1] per tcp socket in the qdisc/dev layers
at a given time.
TSO packets are sized/capped to half the limit, so that we have two
TSO packets in flight, allowing better bandwidth use.
As a side effect, setting the limit to 40000 automatically reduces the
standard GSO max limit (65536) to 40000/2: having smaller TSO packets
can help reduce latencies of high-prio packets.
This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() [2] is called for the
already queued skbs.
Results on my dev machines (tg3/ixgbe nics) are really impressive,
using standard pfifo_fast, and with or without TSO/GSO.
Without any reduction of nominal bandwidth, we get a reduction of
buffering per bulk sender:
< 1ms on Gbit (instead of 50ms with TSO)
< 8ms on 100Mbit (instead of 132 ms)
I no longer have 4 MBytes backlogged in the qdisc by a single netperf
session, and socket autotuning on both sides no longer uses 4 MBytes.
As the skb destructor cannot restart xmit itself (the qdisc lock might
be held at this point), we delegate the work to a tasklet. We use one
tasklet per cpu for performance reasons.
If the tasklet finds a socket owned by the user, it sets the TSQ_OWNED
flag. This flag is tested in a new protocol method called from
release_sock(), to eventually send new segments.
[1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable
[2] skb_orphan() is usually called at TX completion time,
but some drivers call it in their start_xmit() handler.
These drivers should at least use BQL, or else a single TCP
session can still fill the whole NIC TX ring, since TSQ will
have no effect.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Documentation/networking/ip-sysctl.txt | 14 |
| -rw-r--r-- | include/linux/tcp.h | 9 |
| -rw-r--r-- | include/net/sock.h | 2 |
| -rw-r--r-- | include/net/tcp.h | 4 |
| -rw-r--r-- | net/core/sock.c | 4 |
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 7 |
| -rw-r--r-- | net/ipv4/tcp.c | 6 |
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 1 |
| -rw-r--r-- | net/ipv4/tcp_minisocks.c | 1 |
| -rw-r--r-- | net/ipv4/tcp_output.c | 154 |
| -rw-r--r-- | net/ipv6/tcp_ipv6.c | 1 |

11 files changed, 202 insertions(+), 1 deletion(-)
```diff
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 47b6c79e9b05..e20c17a7d34e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -551,6 +551,20 @@ tcp_thin_dupack - BOOLEAN
 	Documentation/networking/tcp-thin.txt
 	Default: 0
 
+tcp_limit_output_bytes - INTEGER
+	Controls TCP Small Queue limit per tcp socket.
+	TCP bulk sender tends to increase packets in flight until it
+	gets losses notifications. With SNDBUF autotuning, this can
+	result in a large amount of packets queued in qdisc/device
+	on the local machine, hurting latency of other flows, for
+	typical pfifo_fast qdiscs.
+	tcp_limit_output_bytes limits the number of bytes on qdisc
+	or device to reduce artificial RTT/cwnd and reduce bufferbloat.
+	Note: For GSO/TSO enabled flows, we try to have at least two
+	packets in flight. Reducing tcp_limit_output_bytes might also
+	reduce the size of individual GSO packet (64KB being the max)
+	Default: 131072
+
 UDP variables:
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
```
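The tunable documented above is a plain integer exposed through procfs. As a quick illustration (a sketch, not part of this patch; it assumes a kernel carrying this commit, otherwise the file does not exist), a userspace program can read it and derive the TSO size cap described in the commit message:

```c
/* Userspace sketch (not part of this patch): read the TSQ limit added
 * by this commit and print the resulting TSO size goal cap (half the
 * limit, so two TSO packets fit in flight).
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_limit_output_bytes", "r");
	long limit;

	if (!f) {
		perror("tcp_limit_output_bytes"); /* kernel lacks this patch? */
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%ld", &limit) != 1) {
		fclose(f);
		return EXIT_FAILURE;
	}
	fclose(f);
	printf("TSQ limit: %ld bytes (TSO size goal capped at %ld)\n",
	       limit, limit / 2);
	return EXIT_SUCCESS;
}
```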
```diff
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2de9cf46f9fc..1888169e07c7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -339,6 +339,9 @@ struct tcp_sock {
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
 	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
 
+	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	unsigned long	tsq_flags;
+
 	/* Data for direct copy to user */
 	struct {
 		struct sk_buff_head	prequeue;
@@ -494,6 +497,12 @@ struct tcp_sock {
 	struct tcp_cookie_values  *cookie_values;
 };
 
+enum tsq_flags {
+	TSQ_THROTTLED,
+	TSQ_QUEUED,
+	TSQ_OWNED, /* tcp_tasklet_func() found socket was locked */
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
```
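The three tsq_flags bits added above are manipulated with the kernel's atomic bit operations (set_bit, test_and_set_bit, test_and_clear_bit). A userspace analogue of the handshake between the xmit path and the skb destructor (an illustrative sketch using C11 atomics, not kernel code):

```c
/* Userspace analogue (not kernel code) of how the tsq_flags bits
 * cooperate. The kernel bit helpers are modeled with C11
 * atomic_fetch_or/atomic_fetch_and on an unsigned long.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { TSQ_THROTTLED, TSQ_QUEUED, TSQ_OWNED };

static _Atomic unsigned long tsq_flags;

static bool test_and_set_bit(int nr, _Atomic unsigned long *flags)
{
	unsigned long mask = 1UL << nr;
	return atomic_fetch_or(flags, mask) & mask;
}

static bool test_and_clear_bit(int nr, _Atomic unsigned long *flags)
{
	unsigned long mask = 1UL << nr;
	return atomic_fetch_and(flags, ~mask) & mask;
}

int main(void)
{
	/* xmit path hits the byte limit and throttles the flow */
	test_and_set_bit(TSQ_THROTTLED, &tsq_flags);

	/* later, an skb destructor fires: only a caller that sees
	 * THROTTLED set and QUEUED clear enqueues the socket */
	if (test_and_clear_bit(TSQ_THROTTLED, &tsq_flags) &&
	    !test_and_set_bit(TSQ_QUEUED, &tsq_flags))
		printf("socket queued to the tasklet exactly once\n");
	return 0;
}
```

The test_and_set_bit(TSQ_QUEUED) guard is what guarantees a socket is queued to the tasklet at most once, no matter how many skb destructors fire before the tasklet runs.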
```diff
diff --git a/include/net/sock.h b/include/net/sock.h
index dcb54a0793ec..88de092df50f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -858,6 +858,8 @@ struct proto {
 	int			(*backlog_rcv) (struct sock *sk,
 						struct sk_buff *skb);
 
+	void		(*release_cb)(struct sock *sk);
+
 	/* Keeping track of sk's, looking them up, and port selection methods. */
 	void			(*hash)(struct sock *sk);
 	void			(*unhash)(struct sock *sk);
```
```diff
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3618fefae049..439984b9af49 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -253,6 +253,7 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
+extern int sysctl_tcp_limit_output_bytes;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -321,6 +322,8 @@ extern struct proto tcp_prot;
 
 extern void tcp_init_mem(struct net *net);
 
+extern void tcp_tasklet_init(void);
+
 extern void tcp_v4_err(struct sk_buff *skb, u32);
 
 extern void tcp_shutdown (struct sock *sk, int how);
@@ -334,6 +337,7 @@ extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			size_t size);
 extern int tcp_sendpage(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
+extern void tcp_release_cb(struct sock *sk);
 extern int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 const struct tcphdr *th, unsigned int len);
```
```diff
diff --git a/net/core/sock.c b/net/core/sock.c
index 929bdcc2383b..24039ac12426 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk)
 	spin_lock_bh(&sk->sk_lock.slock);
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
+
+	if (sk->sk_prot->release_cb)
+		sk->sk_prot->release_cb(sk);
+
 	sk->sk_lock.owned = 0;
 	if (waitqueue_active(&sk->sk_lock.wq))
 		wake_up(&sk->sk_lock.wq);
```
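release_cb gives a protocol one callback while the socket lock is still owned, which is exactly where TSQ drains the work the tasklet had to defer. A minimal pthread analogue of this "run deferred work just before unlocking" pattern (an illustrative sketch, not kernel code):

```c
/* Userspace analogue (not kernel code) of the release_cb hook added
 * above: the lock owner drains work that other contexts could not do
 * themselves because the lock was held, just before releasing it.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool deferred_pending; /* analogue of the TSQ_OWNED bit */

static void release_cb(void)
{
	printf("running work deferred while the lock was owned\n");
}

static void release_sock_like(void)
{
	/* still holding the lock here, like release_sock() */
	if (deferred_pending) {
		deferred_pending = false;
		release_cb();
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_mutex_lock(&lock);
	deferred_pending = true; /* in the kernel, set by the tasklet */
	release_sock_like();
	return 0;
}
```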
```diff
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 12aa0c5867c4..70730f7aeafe 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "tcp_limit_output_bytes",
+		.data		= &sysctl_tcp_limit_output_bytes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 #ifdef CONFIG_NET_DMA
 	{
 		.procname	= "tcp_dma_copybreak",
```
```diff
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d902da96d154..4252cd8f39fd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -376,6 +376,7 @@ void tcp_init_sock(struct sock *sk)
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
 	tcp_prequeue_init(tp);
+	INIT_LIST_HEAD(&tp->tsq_node);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
 	tp->mdev = TCP_TIMEOUT_INIT;
@@ -796,6 +797,10 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 			   inet_csk(sk)->icsk_ext_hdr_len -
 			   tp->tcp_header_len);
 
+		/* TSQ : try to have two TSO segments in flight */
+		xmit_size_goal = min_t(u32, xmit_size_goal,
+				       sysctl_tcp_limit_output_bytes >> 1);
+
 		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
 		/* We try hard to avoid divides here */
@@ -3574,4 +3579,5 @@ void __init tcp_init(void)
 	tcp_secret_primary = &tcp_secret_one;
 	tcp_secret_retiring = &tcp_secret_two;
 	tcp_secret_secondary = &tcp_secret_two;
+	tcp_tasklet_init();
 }
```
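The min_t() clamp in tcp_xmit_size_goal() above is where the commit message's "40000 reduces GSO to 20000" example comes from. A standalone computation (a sketch using the values quoted in the commit message):

```c
/* Standalone sketch of the clamp added to tcp_xmit_size_goal() above:
 * the TSO size goal is capped at half the TSQ limit so two packets fit.
 */
#include <stdio.h>

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int gso_max = 65536;		/* standard GSO max */
	int limits[] = { 131072, 40000 };	/* default, and the example */

	for (int i = 0; i < 2; i++) {
		unsigned int goal = min_u32(gso_max, (unsigned int)limits[i] >> 1);
		printf("tcp_limit_output_bytes=%d -> size goal %u\n",
		       limits[i], goal);
	}
	/* 131072 -> 65536 (the default leaves GSO untouched);
	 * 40000  -> 20000 (the commit message's example). */
	return 0;
}
```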
```diff
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ddefd39ac0cf..01545a3fc0f2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2588,6 +2588,7 @@ struct proto tcp_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v4_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
```
```diff
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 65608863fdee..c66f2ede160e 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -424,6 +424,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			treq->snt_isn + 1 + tcp_s_data_size(oldtp);
 
 		tcp_prequeue_init(newtp);
+		INIT_LIST_HEAD(&newtp->tsq_node);
 
 		tcp_init_wl(newtp, treq->rcv_isn);
 
```
```diff
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e51e28..03854abfd9d8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -50,6 +50,9 @@ int sysctl_tcp_retrans_collapse __read_mostly = 1;
  */
 int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
+/* Default TSQ limit of two TSO segments */
+int sysctl_tcp_limit_output_bytes __read_mostly = 131072;
+
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
@@ -65,6 +68,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +788,140 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+
+/* TCP SMALL QUEUES (TSQ)
+ *
+ * TSQ's goal is to keep a small number of skbs per tcp flow in the tx
+ * queues (qdisc+dev), to reduce RTT and bufferbloat.
+ * We do this using a special skb destructor (tcp_wfree).
+ *
+ * It is important that tcp_wfree() can be replaced by sock_wfree() in the
+ * event the skb needs to be reallocated in a driver.
+ * The invariant is skb->truesize subtracted from sk->sk_wmem_alloc.
+ *
+ * Since transmit from the skb destructor is forbidden, we use a tasklet
+ * to process all sockets that eventually need to send more skbs.
+ * We use one tasklet per cpu, with its own queue of sockets.
+ */
+struct tsq_tasklet {
+	struct tasklet_struct	tasklet;
+	struct list_head	head; /* queue of tcp sockets */
+};
+static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+
+/*
+ * One tasklet per cpu tries to send more skbs.
+ * We run in tasklet context but need to disable irqs when
+ * transferring tsq->head, because tcp_wfree() might
+ * interrupt us (non NAPI drivers)
+ */
+static void tcp_tasklet_func(unsigned long data)
+{
+	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+	LIST_HEAD(list);
+	unsigned long flags;
+	struct list_head *q, *n;
+	struct tcp_sock *tp;
+	struct sock *sk;
+
+	local_irq_save(flags);
+	list_splice_init(&tsq->head, &list);
+	local_irq_restore(flags);
+
+	list_for_each_safe(q, n, &list) {
+		tp = list_entry(q, struct tcp_sock, tsq_node);
+		list_del(&tp->tsq_node);
+
+		sk = (struct sock *)tp;
+		bh_lock_sock(sk);
+
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+			     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* defer the work to tcp_release_cb() */
+			set_bit(TSQ_OWNED, &tp->tsq_flags);
+		}
+		bh_unlock_sock(sk);
+
+		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
+		sk_free(sk);
+	}
+}
+
+/**
+ * tcp_release_cb - tcp release_sock() callback
+ * @sk: socket
+ *
+ * called from release_sock() to perform protocol dependent
+ * actions before socket release.
+ */
+void tcp_release_cb(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_OWNED, &tp->tsq_flags)) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 |
+		     TCPF_CLOSING | TCPF_CLOSE_WAIT))
+			tcp_write_xmit(sk,
+				       tcp_current_mss(sk),
+				       0, 0,
+				       GFP_ATOMIC);
+	}
+}
+EXPORT_SYMBOL(tcp_release_cb);
+
+void __init tcp_tasklet_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+
+		INIT_LIST_HEAD(&tsq->head);
+		tasklet_init(&tsq->tasklet,
+			     tcp_tasklet_func,
+			     (unsigned long)tsq);
+	}
+}
+
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ * We can't xmit new skbs from this context, as we might already
+ * hold the qdisc lock.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
+	    !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) {
+		unsigned long flags;
+		struct tsq_tasklet *tsq;
+
+		/* Keep a ref on the socket.
+		 * This last ref will be released in tcp_tasklet_func()
+		 */
+		atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc);
+
+		/* queue this socket to the tasklet queue */
+		local_irq_save(flags);
+		tsq = &__get_cpu_var(tsq_tasklet);
+		list_add(&tp->tsq_node, &tsq->head);
+		tasklet_schedule(&tsq->tasklet);
+		local_irq_restore(flags);
+	} else {
+		sock_wfree(skb);
+	}
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
```
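One subtlety in tcp_wfree() above: atomic_sub(skb->truesize - 1, ...) releases the skb's whole charge except one unit, so sk->sk_wmem_alloc stays non-zero and the socket cannot be freed before tcp_tasklet_func() drops the last unit via sk_free(). A userspace analogue of this keep-one-reference handoff (an illustrative sketch, not kernel code), shown before the remaining tcp_output.c hunks:

```c
/* Userspace sketch (not kernel code) of tcp_wfree()'s refcount trick:
 * on free, drop all but one unit of the accounted weight, hand the
 * object to a deferred worker, and let the worker drop the final unit
 * (sk_free() in the kernel) once it has tried to transmit more.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_sock {
	_Atomic int wmem_alloc; /* analogue of sk->sk_wmem_alloc */
};

static void sk_free_like(struct fake_sock *sk)
{
	if (atomic_fetch_sub(&sk->wmem_alloc, 1) == 1)
		printf("last reference gone, socket freed\n");
}

static void tasklet_func_like(struct fake_sock *sk)
{
	printf("tasklet: try to xmit more, then drop the kept reference\n");
	sk_free_like(sk);
}

int main(void)
{
	struct fake_sock sk = { .wmem_alloc = 0 };
	int truesize = 4096;

	atomic_fetch_add(&sk.wmem_alloc, truesize); /* skb charged at xmit */

	/* destructor: keep exactly one unit so the socket cannot vanish
	 * before the deferred worker runs */
	atomic_fetch_sub(&sk.wmem_alloc, truesize - 1);
	tasklet_func_like(&sk);
	return 0;
}
```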
```diff
@@ -844,7 +983,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
+			  tcp_wfree : sock_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
@@ -1780,6 +1924,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -1800,6 +1945,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
+		/* TSQ : sk_wmem_alloc accounts for skb truesize,
+		 * including skb overhead. But that's OK.
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= sysctl_tcp_limit_output_bytes) {
+			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			break;
+		}
 		limit = mss_now;
 		if (tso_segs > 1 && !tcp_urg_mode(tp))
 			limit = tcp_mss_split_point(sk, skb, mss_now,
```
```diff
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 61175cb2478f..70458a9cd837 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1970,6 +1970,7 @@ struct proto tcpv6_prot = {
 	.sendmsg		= tcp_sendmsg,
 	.sendpage		= tcp_sendpage,
 	.backlog_rcv		= tcp_v6_do_rcv,
+	.release_cb		= tcp_release_cb,
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
```
