aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric Dumazet <edumazet@google.com> 2012-09-23 19:04:42 -0400
committer: David S. Miller <davem@davemloft.net> 2012-09-24 16:31:37 -0400
commit: 5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
tree: fb7660173338a45c27d610eb59ba20cf5c2b91b8
parent: b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg() operations.

This page is used to build fragments for skbs.

It's done to increase the probability of coalescing small write() calls into single segments in skbs still in the write queue (not yet sent).

But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It's also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more than wanted.

This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure.

(up to 32768 bytes per frag, that's order-3 pages on x86)

This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub fragments, since some arches have PAGE_SIZE=65536.

Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, mellanox mlx4).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  include/linux/sched.h     3
-rw-r--r--  include/net/inet_sock.h   4
-rw-r--r--  include/net/sock.h       27
-rw-r--r--  kernel/exit.c             3
-rw-r--r--  kernel/fork.c             1
-rw-r--r--  net/core/skbuff.c        37
-rw-r--r--  net/core/sock.c          49
-rw-r--r--  net/ipv4/ip_output.c     70
-rw-r--r--  net/ipv4/raw.c           19
-rw-r--r--  net/ipv4/tcp.c           79
-rw-r--r--  net/ipv4/tcp_ipv4.c       8
-rw-r--r--  net/ipv6/ip6_output.c    65
-rw-r--r--  net/sched/em_meta.c       2
13 files changed, 167 insertions, 200 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b8c86648a2f9..a8e2413f6bc3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1530,6 +1530,9 @@ struct task_struct {
1530 * cache last used pipe for splice 1530 * cache last used pipe for splice
1531 */ 1531 */
1532 struct pipe_inode_info *splice_pipe; 1532 struct pipe_inode_info *splice_pipe;
1533
1534 struct page_frag task_frag;
1535
1533#ifdef CONFIG_TASK_DELAY_ACCT 1536#ifdef CONFIG_TASK_DELAY_ACCT
1534 struct task_delay_info *delays; 1537 struct task_delay_info *delays;
1535#endif 1538#endif
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 613cfa401672..256c1ed2d69a 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -101,10 +101,8 @@ struct inet_cork {
101 __be32 addr; 101 __be32 addr;
102 struct ip_options *opt; 102 struct ip_options *opt;
103 unsigned int fragsize; 103 unsigned int fragsize;
104 struct dst_entry *dst;
105 int length; /* Total length of all frames */ 104 int length; /* Total length of all frames */
106 struct page *page; 105 struct dst_entry *dst;
107 u32 off;
108 u8 tx_flags; 106 u8 tx_flags;
109}; 107};
110 108
diff --git a/include/net/sock.h b/include/net/sock.h
index 84bdaeca1314..f036493b9a61 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -247,8 +247,7 @@ struct cg_proto;
247 * @sk_stamp: time stamp of last packet received 247 * @sk_stamp: time stamp of last packet received
248 * @sk_socket: Identd and reporting IO signals 248 * @sk_socket: Identd and reporting IO signals
249 * @sk_user_data: RPC layer private data 249 * @sk_user_data: RPC layer private data
250 * @sk_sndmsg_page: cached page for sendmsg 250 * @sk_frag: cached page frag
251 * @sk_sndmsg_off: cached offset for sendmsg
252 * @sk_peek_off: current peek_offset value 251 * @sk_peek_off: current peek_offset value
253 * @sk_send_head: front of stuff to transmit 252 * @sk_send_head: front of stuff to transmit
254 * @sk_security: used by security modules 253 * @sk_security: used by security modules
@@ -362,9 +361,8 @@ struct sock {
362 ktime_t sk_stamp; 361 ktime_t sk_stamp;
363 struct socket *sk_socket; 362 struct socket *sk_socket;
364 void *sk_user_data; 363 void *sk_user_data;
365 struct page *sk_sndmsg_page; 364 struct page_frag sk_frag;
366 struct sk_buff *sk_send_head; 365 struct sk_buff *sk_send_head;
367 __u32 sk_sndmsg_off;
368 __s32 sk_peek_off; 366 __s32 sk_peek_off;
369 int sk_write_pending; 367 int sk_write_pending;
370#ifdef CONFIG_SECURITY 368#ifdef CONFIG_SECURITY
@@ -2034,18 +2032,23 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
2034 2032
2035struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); 2033struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp);
2036 2034
2037static inline struct page *sk_stream_alloc_page(struct sock *sk) 2035/**
2036 * sk_page_frag - return an appropriate page_frag
2037 * @sk: socket
2038 *
2039 * If socket allocation mode allows current thread to sleep, it means its
2040 * safe to use the per task page_frag instead of the per socket one.
2041 */
2042static inline struct page_frag *sk_page_frag(struct sock *sk)
2038{ 2043{
2039 struct page *page = NULL; 2044 if (sk->sk_allocation & __GFP_WAIT)
2045 return &current->task_frag;
2040 2046
2041 page = alloc_pages(sk->sk_allocation, 0); 2047 return &sk->sk_frag;
2042 if (!page) {
2043 sk_enter_memory_pressure(sk);
2044 sk_stream_moderate_sndbuf(sk);
2045 }
2046 return page;
2047} 2048}
2048 2049
2050extern bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
2051
2049/* 2052/*
2050 * Default write policy as shown to user space via poll/select/SIGIO 2053 * Default write policy as shown to user space via poll/select/SIGIO
2051 */ 2054 */
diff --git a/kernel/exit.c b/kernel/exit.c
index f65345f9e5bb..42f25952edd9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1046,6 +1046,9 @@ void do_exit(long code)
1046 if (tsk->splice_pipe) 1046 if (tsk->splice_pipe)
1047 __free_pipe_info(tsk->splice_pipe); 1047 __free_pipe_info(tsk->splice_pipe);
1048 1048
1049 if (tsk->task_frag.page)
1050 put_page(tsk->task_frag.page);
1051
1049 validate_creds_for_do_exit(tsk); 1052 validate_creds_for_do_exit(tsk);
1050 1053
1051 preempt_disable(); 1054 preempt_disable();
diff --git a/kernel/fork.c b/kernel/fork.c
index 2c8857e12855..01565b9ce0f3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
330 tsk->btrace_seq = 0; 330 tsk->btrace_seq = 0;
331#endif 331#endif
332 tsk->splice_pipe = NULL; 332 tsk->splice_pipe = NULL;
333 tsk->task_frag.page = NULL;
333 334
334 account_kernel_stack(ti, 1); 335 account_kernel_stack(ti, 1);
335 336
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fe00d1208167..2ede3cfa8ffa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1655,38 +1655,19 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
1655 unsigned int *offset, 1655 unsigned int *offset,
1656 struct sk_buff *skb, struct sock *sk) 1656 struct sk_buff *skb, struct sock *sk)
1657{ 1657{
1658 struct page *p = sk->sk_sndmsg_page; 1658 struct page_frag *pfrag = sk_page_frag(sk);
1659 unsigned int off;
1660 1659
1661 if (!p) { 1660 if (!sk_page_frag_refill(sk, pfrag))
1662new_page: 1661 return NULL;
1663 p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
1664 if (!p)
1665 return NULL;
1666
1667 off = sk->sk_sndmsg_off = 0;
1668 /* hold one ref to this page until it's full */
1669 } else {
1670 unsigned int mlen;
1671
1672 /* If we are the only user of the page, we can reset offset */
1673 if (page_count(p) == 1)
1674 sk->sk_sndmsg_off = 0;
1675 off = sk->sk_sndmsg_off;
1676 mlen = PAGE_SIZE - off;
1677 if (mlen < 64 && mlen < *len) {
1678 put_page(p);
1679 goto new_page;
1680 }
1681 1662
1682 *len = min_t(unsigned int, *len, mlen); 1663 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
1683 }
1684 1664
1685 memcpy(page_address(p) + off, page_address(page) + *offset, *len); 1665 memcpy(page_address(pfrag->page) + pfrag->offset,
1686 sk->sk_sndmsg_off += *len; 1666 page_address(page) + *offset, *len);
1687 *offset = off; 1667 *offset = pfrag->offset;
1668 pfrag->offset += *len;
1688 1669
1689 return p; 1670 return pfrag->page;
1690} 1671}
1691 1672
1692static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 1673static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
diff --git a/net/core/sock.c b/net/core/sock.c
index 2693f7649222..727114cd6f7e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1744,6 +1744,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1744} 1744}
1745EXPORT_SYMBOL(sock_alloc_send_skb); 1745EXPORT_SYMBOL(sock_alloc_send_skb);
1746 1746
1747/* On 32bit arches, an skb frag is limited to 2^15 */
1748#define SKB_FRAG_PAGE_ORDER get_order(32768)
1749
1750bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1751{
1752 int order;
1753
1754 if (pfrag->page) {
1755 if (atomic_read(&pfrag->page->_count) == 1) {
1756 pfrag->offset = 0;
1757 return true;
1758 }
1759 if (pfrag->offset < pfrag->size)
1760 return true;
1761 put_page(pfrag->page);
1762 }
1763
1764 /* We restrict high order allocations to users that can afford to wait */
1765 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1766
1767 do {
1768 gfp_t gfp = sk->sk_allocation;
1769
1770 if (order)
1771 gfp |= __GFP_COMP | __GFP_NOWARN;
1772 pfrag->page = alloc_pages(gfp, order);
1773 if (likely(pfrag->page)) {
1774 pfrag->offset = 0;
1775 pfrag->size = PAGE_SIZE << order;
1776 return true;
1777 }
1778 } while (--order >= 0);
1779
1780 sk_enter_memory_pressure(sk);
1781 sk_stream_moderate_sndbuf(sk);
1782 return false;
1783}
1784EXPORT_SYMBOL(sk_page_frag_refill);
1785
1747static void __lock_sock(struct sock *sk) 1786static void __lock_sock(struct sock *sk)
1748 __releases(&sk->sk_lock.slock) 1787 __releases(&sk->sk_lock.slock)
1749 __acquires(&sk->sk_lock.slock) 1788 __acquires(&sk->sk_lock.slock)
@@ -2173,8 +2212,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
2173 sk->sk_error_report = sock_def_error_report; 2212 sk->sk_error_report = sock_def_error_report;
2174 sk->sk_destruct = sock_def_destruct; 2213 sk->sk_destruct = sock_def_destruct;
2175 2214
2176 sk->sk_sndmsg_page = NULL; 2215 sk->sk_frag.page = NULL;
2177 sk->sk_sndmsg_off = 0; 2216 sk->sk_frag.offset = 0;
2178 sk->sk_peek_off = -1; 2217 sk->sk_peek_off = -1;
2179 2218
2180 sk->sk_peer_pid = NULL; 2219 sk->sk_peer_pid = NULL;
@@ -2417,6 +2456,12 @@ void sk_common_release(struct sock *sk)
2417 xfrm_sk_free_policy(sk); 2456 xfrm_sk_free_policy(sk);
2418 2457
2419 sk_refcnt_debug_release(sk); 2458 sk_refcnt_debug_release(sk);
2459
2460 if (sk->sk_frag.page) {
2461 put_page(sk->sk_frag.page);
2462 sk->sk_frag.page = NULL;
2463 }
2464
2420 sock_put(sk); 2465 sock_put(sk);
2421} 2466}
2422EXPORT_SYMBOL(sk_common_release); 2467EXPORT_SYMBOL(sk_common_release);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a5beab1dc958..24a29a39e9a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -793,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
793 struct flowi4 *fl4, 793 struct flowi4 *fl4,
794 struct sk_buff_head *queue, 794 struct sk_buff_head *queue,
795 struct inet_cork *cork, 795 struct inet_cork *cork,
796 struct page_frag *pfrag,
796 int getfrag(void *from, char *to, int offset, 797 int getfrag(void *from, char *to, int offset,
797 int len, int odd, struct sk_buff *skb), 798 int len, int odd, struct sk_buff *skb),
798 void *from, int length, int transhdrlen, 799 void *from, int length, int transhdrlen,
@@ -987,47 +988,30 @@ alloc_new_skb:
987 } 988 }
988 } else { 989 } else {
989 int i = skb_shinfo(skb)->nr_frags; 990 int i = skb_shinfo(skb)->nr_frags;
990 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
991 struct page *page = cork->page;
992 int off = cork->off;
993 unsigned int left;
994
995 if (page && (left = PAGE_SIZE - off) > 0) {
996 if (copy >= left)
997 copy = left;
998 if (page != skb_frag_page(frag)) {
999 if (i == MAX_SKB_FRAGS) {
1000 err = -EMSGSIZE;
1001 goto error;
1002 }
1003 skb_fill_page_desc(skb, i, page, off, 0);
1004 skb_frag_ref(skb, i);
1005 frag = &skb_shinfo(skb)->frags[i];
1006 }
1007 } else if (i < MAX_SKB_FRAGS) {
1008 if (copy > PAGE_SIZE)
1009 copy = PAGE_SIZE;
1010 page = alloc_pages(sk->sk_allocation, 0);
1011 if (page == NULL) {
1012 err = -ENOMEM;
1013 goto error;
1014 }
1015 cork->page = page;
1016 cork->off = 0;
1017 991
1018 skb_fill_page_desc(skb, i, page, 0, 0); 992 err = -ENOMEM;
1019 frag = &skb_shinfo(skb)->frags[i]; 993 if (!sk_page_frag_refill(sk, pfrag))
1020 } else {
1021 err = -EMSGSIZE;
1022 goto error;
1023 }
1024 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1025 offset, copy, skb->len, skb) < 0) {
1026 err = -EFAULT;
1027 goto error; 994 goto error;
995
996 if (!skb_can_coalesce(skb, i, pfrag->page,
997 pfrag->offset)) {
998 err = -EMSGSIZE;
999 if (i == MAX_SKB_FRAGS)
1000 goto error;
1001
1002 __skb_fill_page_desc(skb, i, pfrag->page,
1003 pfrag->offset, 0);
1004 skb_shinfo(skb)->nr_frags = ++i;
1005 get_page(pfrag->page);
1028 } 1006 }
1029 cork->off += copy; 1007 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1030 skb_frag_size_add(frag, copy); 1008 if (getfrag(from,
1009 page_address(pfrag->page) + pfrag->offset,
1010 offset, copy, skb->len, skb) < 0)
1011 goto error_efault;
1012
1013 pfrag->offset += copy;
1014 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1031 skb->len += copy; 1015 skb->len += copy;
1032 skb->data_len += copy; 1016 skb->data_len += copy;
1033 skb->truesize += copy; 1017 skb->truesize += copy;
@@ -1039,6 +1023,8 @@ alloc_new_skb:
1039 1023
1040 return 0; 1024 return 0;
1041 1025
1026error_efault:
1027 err = -EFAULT;
1042error: 1028error:
1043 cork->length -= length; 1029 cork->length -= length;
1044 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); 1030 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1079,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1079 cork->dst = &rt->dst; 1065 cork->dst = &rt->dst;
1080 cork->length = 0; 1066 cork->length = 0;
1081 cork->tx_flags = ipc->tx_flags; 1067 cork->tx_flags = ipc->tx_flags;
1082 cork->page = NULL;
1083 cork->off = 0;
1084 1068
1085 return 0; 1069 return 0;
1086} 1070}
@@ -1117,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1117 transhdrlen = 0; 1101 transhdrlen = 0;
1118 } 1102 }
1119 1103
1120 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag, 1104 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1105 sk_page_frag(sk), getfrag,
1121 from, length, transhdrlen, flags); 1106 from, length, transhdrlen, flags);
1122} 1107}
1123 1108
@@ -1439,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
1439 if (err) 1424 if (err)
1440 return ERR_PTR(err); 1425 return ERR_PTR(err);
1441 1426
1442 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag, 1427 err = __ip_append_data(sk, fl4, &queue, &cork,
1428 &current->task_frag, getfrag,
1443 from, length, transhdrlen, flags); 1429 from, length, transhdrlen, flags);
1444 if (err) { 1430 if (err) {
1445 __ip_flush_pending_frames(sk, &queue, &cork); 1431 __ip_flush_pending_frames(sk, &queue, &cork);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f2425785d40a..a80740ba4248 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,18 +131,23 @@ found:
131 * 0 - deliver 131 * 0 - deliver
132 * 1 - block 132 * 1 - block
133 */ 133 */
134static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) 134static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
135{ 135{
136 int type; 136 struct icmphdr _hdr;
137 137 const struct icmphdr *hdr;
138 if (!pskb_may_pull(skb, sizeof(struct icmphdr))) 138
139 pr_err("icmp_filter skb_transport_offset %d data-head %ld len %d/%d\n",
140 skb_transport_offset(skb), skb->data - skb->head, skb->len, skb->data_len);
141 hdr = skb_header_pointer(skb, skb_transport_offset(skb),
142 sizeof(_hdr), &_hdr);
143 pr_err("head %p data %p hdr %p type %d\n", skb->head, skb->data, hdr, hdr ? hdr->type : -1);
144 if (!hdr)
139 return 1; 145 return 1;
140 146
141 type = icmp_hdr(skb)->type; 147 if (hdr->type < 32) {
142 if (type < 32) {
143 __u32 data = raw_sk(sk)->filter.data; 148 __u32 data = raw_sk(sk)->filter.data;
144 149
145 return ((1 << type) & data) != 0; 150 return ((1U << hdr->type) & data) != 0;
146 } 151 }
147 152
148 /* Do not block unknown ICMP types */ 153 /* Do not block unknown ICMP types */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7b1e940393cf..72ea4752f21b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1150,78 +1150,43 @@ new_segment:
1150 if (err) 1150 if (err)
1151 goto do_fault; 1151 goto do_fault;
1152 } else { 1152 } else {
1153 bool merge = false; 1153 bool merge = true;
1154 int i = skb_shinfo(skb)->nr_frags; 1154 int i = skb_shinfo(skb)->nr_frags;
1155 struct page *page = sk->sk_sndmsg_page; 1155 struct page_frag *pfrag = sk_page_frag(sk);
1156 int off; 1156
1157 1157 if (!sk_page_frag_refill(sk, pfrag))
1158 if (page && page_count(page) == 1) 1158 goto wait_for_memory;
1159 sk->sk_sndmsg_off = 0; 1159
1160 1160 if (!skb_can_coalesce(skb, i, pfrag->page,
1161 off = sk->sk_sndmsg_off; 1161 pfrag->offset)) {
1162 1162 if (i == MAX_SKB_FRAGS || !sg) {
1163 if (skb_can_coalesce(skb, i, page, off) && 1163 tcp_mark_push(tp, skb);
1164 off != PAGE_SIZE) { 1164 goto new_segment;
1165 /* We can extend the last page
1166 * fragment. */
1167 merge = true;
1168 } else if (i == MAX_SKB_FRAGS || !sg) {
1169 /* Need to add new fragment and cannot
1170 * do this because interface is non-SG,
1171 * or because all the page slots are
1172 * busy. */
1173 tcp_mark_push(tp, skb);
1174 goto new_segment;
1175 } else if (page) {
1176 if (off == PAGE_SIZE) {
1177 put_page(page);
1178 sk->sk_sndmsg_page = page = NULL;
1179 off = 0;
1180 } 1165 }
1181 } else 1166 merge = false;
1182 off = 0; 1167 }
1183 1168
1184 if (copy > PAGE_SIZE - off) 1169 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1185 copy = PAGE_SIZE - off;
1186 1170
1187 if (!sk_wmem_schedule(sk, copy)) 1171 if (!sk_wmem_schedule(sk, copy))
1188 goto wait_for_memory; 1172 goto wait_for_memory;
1189 1173
1190 if (!page) {
1191 /* Allocate new cache page. */
1192 if (!(page = sk_stream_alloc_page(sk)))
1193 goto wait_for_memory;
1194 }
1195
1196 /* Time to copy data. We are close to
1197 * the end! */
1198 err = skb_copy_to_page_nocache(sk, from, skb, 1174 err = skb_copy_to_page_nocache(sk, from, skb,
1199 page, off, copy); 1175 pfrag->page,
1200 if (err) { 1176 pfrag->offset,
1201 /* If this page was new, give it to the 1177 copy);
1202 * socket so it does not get leaked. 1178 if (err)
1203 */
1204 if (!sk->sk_sndmsg_page) {
1205 sk->sk_sndmsg_page = page;
1206 sk->sk_sndmsg_off = 0;
1207 }
1208 goto do_error; 1179 goto do_error;
1209 }
1210 1180
1211 /* Update the skb. */ 1181 /* Update the skb. */
1212 if (merge) { 1182 if (merge) {
1213 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1183 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1214 } else { 1184 } else {
1215 skb_fill_page_desc(skb, i, page, off, copy); 1185 skb_fill_page_desc(skb, i, pfrag->page,
1216 if (sk->sk_sndmsg_page) { 1186 pfrag->offset, copy);
1217 get_page(page); 1187 get_page(pfrag->page);
1218 } else if (off + copy < PAGE_SIZE) {
1219 get_page(page);
1220 sk->sk_sndmsg_page = page;
1221 }
1222 } 1188 }
1223 1189 pfrag->offset += copy;
1224 sk->sk_sndmsg_off = off + copy;
1225 } 1190 }
1226 1191
1227 if (!copied) 1192 if (!copied)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0a7e020f16b5..93406c583f43 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2200,14 +2200,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
2200 if (inet_csk(sk)->icsk_bind_hash) 2200 if (inet_csk(sk)->icsk_bind_hash)
2201 inet_put_port(sk); 2201 inet_put_port(sk);
2202 2202
2203 /*
2204 * If sendmsg cached page exists, toss it.
2205 */
2206 if (sk->sk_sndmsg_page) {
2207 __free_page(sk->sk_sndmsg_page);
2208 sk->sk_sndmsg_page = NULL;
2209 }
2210
2211 /* TCP Cookie Transactions */ 2203 /* TCP Cookie Transactions */
2212 if (tp->cookie_values != NULL) { 2204 if (tp->cookie_values != NULL) {
2213 kref_put(&tp->cookie_values->kref, 2205 kref_put(&tp->cookie_values->kref,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3dd4a37488d5..aece3e792f84 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1279,8 +1279,6 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1279 if (dst_allfrag(rt->dst.path)) 1279 if (dst_allfrag(rt->dst.path))
1280 cork->flags |= IPCORK_ALLFRAG; 1280 cork->flags |= IPCORK_ALLFRAG;
1281 cork->length = 0; 1281 cork->length = 0;
1282 sk->sk_sndmsg_page = NULL;
1283 sk->sk_sndmsg_off = 0;
1284 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len; 1282 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1285 length += exthdrlen; 1283 length += exthdrlen;
1286 transhdrlen += exthdrlen; 1284 transhdrlen += exthdrlen;
@@ -1504,48 +1502,31 @@ alloc_new_skb:
1504 } 1502 }
1505 } else { 1503 } else {
1506 int i = skb_shinfo(skb)->nr_frags; 1504 int i = skb_shinfo(skb)->nr_frags;
1507 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; 1505 struct page_frag *pfrag = sk_page_frag(sk);
1508 struct page *page = sk->sk_sndmsg_page;
1509 int off = sk->sk_sndmsg_off;
1510 unsigned int left;
1511
1512 if (page && (left = PAGE_SIZE - off) > 0) {
1513 if (copy >= left)
1514 copy = left;
1515 if (page != skb_frag_page(frag)) {
1516 if (i == MAX_SKB_FRAGS) {
1517 err = -EMSGSIZE;
1518 goto error;
1519 }
1520 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1521 skb_frag_ref(skb, i);
1522 frag = &skb_shinfo(skb)->frags[i];
1523 }
1524 } else if(i < MAX_SKB_FRAGS) {
1525 if (copy > PAGE_SIZE)
1526 copy = PAGE_SIZE;
1527 page = alloc_pages(sk->sk_allocation, 0);
1528 if (page == NULL) {
1529 err = -ENOMEM;
1530 goto error;
1531 }
1532 sk->sk_sndmsg_page = page;
1533 sk->sk_sndmsg_off = 0;
1534 1506
1535 skb_fill_page_desc(skb, i, page, 0, 0); 1507 err = -ENOMEM;
1536 frag = &skb_shinfo(skb)->frags[i]; 1508 if (!sk_page_frag_refill(sk, pfrag))
1537 } else {
1538 err = -EMSGSIZE;
1539 goto error; 1509 goto error;
1510
1511 if (!skb_can_coalesce(skb, i, pfrag->page,
1512 pfrag->offset)) {
1513 err = -EMSGSIZE;
1514 if (i == MAX_SKB_FRAGS)
1515 goto error;
1516
1517 __skb_fill_page_desc(skb, i, pfrag->page,
1518 pfrag->offset, 0);
1519 skb_shinfo(skb)->nr_frags = ++i;
1520 get_page(pfrag->page);
1540 } 1521 }
1522 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1541 if (getfrag(from, 1523 if (getfrag(from,
1542 skb_frag_address(frag) + skb_frag_size(frag), 1524 page_address(pfrag->page) + pfrag->offset,
1543 offset, copy, skb->len, skb) < 0) { 1525 offset, copy, skb->len, skb) < 0)
1544 err = -EFAULT; 1526 goto error_efault;
1545 goto error; 1527
1546 } 1528 pfrag->offset += copy;
1547 sk->sk_sndmsg_off += copy; 1529 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1548 skb_frag_size_add(frag, copy);
1549 skb->len += copy; 1530 skb->len += copy;
1550 skb->data_len += copy; 1531 skb->data_len += copy;
1551 skb->truesize += copy; 1532 skb->truesize += copy;
@@ -1554,7 +1535,11 @@ alloc_new_skb:
1554 offset += copy; 1535 offset += copy;
1555 length -= copy; 1536 length -= copy;
1556 } 1537 }
1538
1557 return 0; 1539 return 0;
1540
1541error_efault:
1542 err = -EFAULT;
1558error: 1543error:
1559 cork->length -= length; 1544 cork->length -= length;
1560 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1545 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 4ab6e3325573..7c3de6ffa516 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -461,7 +461,7 @@ META_COLLECTOR(int_sk_sndtimeo)
461META_COLLECTOR(int_sk_sendmsg_off) 461META_COLLECTOR(int_sk_sendmsg_off)
462{ 462{
463 SKIP_NONLOCAL(skb); 463 SKIP_NONLOCAL(skb);
464 dst->value = skb->sk->sk_sndmsg_off; 464 dst->value = skb->sk->sk_frag.offset;
465} 465}
466 466
467META_COLLECTOR(int_sk_write_pend) 467META_COLLECTOR(int_sk_write_pend)