aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author Eric Dumazet <edumazet@google.com> 2012-09-23 19:04:42 -0400
committer David S. Miller <davem@davemloft.net> 2012-09-24 16:31:37 -0400
commit 5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
tree fb7660173338a45c27d610eb59ba20cf5c2b91b8 /include
parent b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg() operations.

This page is used to build fragments for skbs.

It's done to increase the probability of coalescing small write() calls into single segments in skbs still in the write queue (not yet sent).

But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It's also quite inefficient for building TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more than wanted.

This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure.

(up to 32768 bytes per frag, that's order-3 pages on x86)

This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub fragments, since some arches have PAGE_SIZE=65536.

Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r-- include/linux/sched.h 3
-rw-r--r-- include/net/inet_sock.h 4
-rw-r--r-- include/net/sock.h 27
3 files changed, 19 insertions, 15 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b8c86648a2f..a8e2413f6bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1530,6 +1530,9 @@ struct task_struct {
1530 * cache last used pipe for splice 1530 * cache last used pipe for splice
1531 */ 1531 */
1532 struct pipe_inode_info *splice_pipe; 1532 struct pipe_inode_info *splice_pipe;
1533
1534 struct page_frag task_frag;
1535
1533#ifdef CONFIG_TASK_DELAY_ACCT 1536#ifdef CONFIG_TASK_DELAY_ACCT
1534 struct task_delay_info *delays; 1537 struct task_delay_info *delays;
1535#endif 1538#endif
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 613cfa40167..256c1ed2d69 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -101,10 +101,8 @@ struct inet_cork {
101 __be32 addr; 101 __be32 addr;
102 struct ip_options *opt; 102 struct ip_options *opt;
103 unsigned int fragsize; 103 unsigned int fragsize;
104 struct dst_entry *dst;
105 int length; /* Total length of all frames */ 104 int length; /* Total length of all frames */
106 struct page *page; 105 struct dst_entry *dst;
107 u32 off;
108 u8 tx_flags; 106 u8 tx_flags;
109}; 107};
110 108
diff --git a/include/net/sock.h b/include/net/sock.h
index 84bdaeca131..f036493b9a6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -247,8 +247,7 @@ struct cg_proto;
247 * @sk_stamp: time stamp of last packet received 247 * @sk_stamp: time stamp of last packet received
248 * @sk_socket: Identd and reporting IO signals 248 * @sk_socket: Identd and reporting IO signals
249 * @sk_user_data: RPC layer private data 249 * @sk_user_data: RPC layer private data
250 * @sk_sndmsg_page: cached page for sendmsg 250 * @sk_frag: cached page frag
251 * @sk_sndmsg_off: cached offset for sendmsg
252 * @sk_peek_off: current peek_offset value 251 * @sk_peek_off: current peek_offset value
253 * @sk_send_head: front of stuff to transmit 252 * @sk_send_head: front of stuff to transmit
254 * @sk_security: used by security modules 253 * @sk_security: used by security modules
@@ -362,9 +361,8 @@ struct sock {
362 ktime_t sk_stamp; 361 ktime_t sk_stamp;
363 struct socket *sk_socket; 362 struct socket *sk_socket;
364 void *sk_user_data; 363 void *sk_user_data;
365 struct page *sk_sndmsg_page; 364 struct page_frag sk_frag;
366 struct sk_buff *sk_send_head; 365 struct sk_buff *sk_send_head;
367 __u32 sk_sndmsg_off;
368 __s32 sk_peek_off; 366 __s32 sk_peek_off;
369 int sk_write_pending; 367 int sk_write_pending;
370#ifdef CONFIG_SECURITY 368#ifdef CONFIG_SECURITY
@@ -2034,18 +2032,23 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
2034 2032
2035struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); 2033struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
2036 2034
2037static inline struct page *sk_stream_alloc_page(struct sock *sk) 2035/**
2036 * sk_page_frag - return an appropriate page_frag
2037 * @sk: socket
2038 *
2039 * If socket allocation mode allows current thread to sleep, it means its
2040 * safe to use the per task page_frag instead of the per socket one.
2041 */
2042static inline struct page_frag *sk_page_frag(struct sock *sk)
2038{ 2043{
2039 struct page *page = NULL; 2044 if (sk->sk_allocation & __GFP_WAIT)
2045 return &current->task_frag;
2040 2046
2041 page = alloc_pages(sk->sk_allocation, 0); 2047 return &sk->sk_frag;
2042 if (!page) {
2043 sk_enter_memory_pressure(sk);
2044 sk_stream_moderate_sndbuf(sk);
2045 }
2046 return page;
2047} 2048}
2048 2049
2050extern bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
2051
2049/* 2052/*
2050 * Default write policy as shown to user space via poll/select/SIGIO 2053 * Default write policy as shown to user space via poll/select/SIGIO
2051 */ 2054 */