author	Eric Dumazet <edumazet@google.com>	2012-09-23 19:04:42 -0400
committer	David S. Miller <davem@davemloft.net>	2012-09-24 16:31:37 -0400
commit	5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
tree	fb7660173338a45c27d610eb59ba20cf5c2b91b8	/net/ipv4/tcp.c
parent	b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per-socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs.

It is done to increase the probability of coalescing small write() calls into single segments in skbs still in the write queue (not yet sent), but it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It is also quite inefficient for building 64KB TSO packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more often than wanted.

This patch adds a per-task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (Up to 32768 bytes per frag, that's order-3 pages on x86.)

This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG-enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub-fragments, since some arches have PAGE_SIZE = 65536.

Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, mellanox mlx4).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
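To make the idea concrete, here is a minimal user-space sketch of the allocation pattern the commit message describes: one reusable per-task buffer, a large 32768-byte chunk preferred with a single 4096-byte page as the fallback under memory pressure, and callers that carve off what they need and advance an offset. The struct task_frag, frag_refill() and frag_copy() names are invented for this sketch; the kernel works on real pages through struct page_frag, sk_page_frag() and sk_page_frag_refill(), as seen in the diff below.

/*
 * Hypothetical user-space illustration of a per-task frag allocator.
 * Not kernel code: malloc() stands in for the page allocator.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct task_frag {
	char *buf;      /* backing buffer (stands in for pfrag->page) */
	size_t size;    /* total size of the current buffer */
	size_t offset;  /* first free byte, like pfrag->offset */
};

/* Make sure the frag has room; prefer a 32 KB chunk, fall back to 4 KB. */
static int frag_refill(struct task_frag *f)
{
	if (f->buf && f->offset < f->size)
		return 1;                /* still has room, reuse it */

	free(f->buf);
	f->size = 32768;                 /* try the big (order-3) chunk first */
	f->buf = malloc(f->size);
	if (!f->buf) {
		f->size = 4096;          /* memory pressure: one page */
		f->buf = malloc(f->size);
		if (!f->buf)
			return 0;
	}
	f->offset = 0;
	return 1;
}

/* Copy data into the frag, mirroring the pfrag->offset += copy pattern. */
static size_t frag_copy(struct task_frag *f, const void *data, size_t len)
{
	size_t copy;

	if (!frag_refill(f))
		return 0;
	copy = len < f->size - f->offset ? len : f->size - f->offset;
	memcpy(f->buf + f->offset, data, copy);
	f->offset += copy;
	return copy;
}

int main(void)
{
	struct task_frag frag = { 0 };
	const char msg[] = "hello";
	size_t done = frag_copy(&frag, msg, sizeof(msg));

	printf("copied %zu bytes, offset now %zu of %zu\n",
	       done, frag.offset, frag.size);
	free(frag.buf);
	return 0;
}

With 32768-byte frags, a 64KB TSO skb needs only 2 page fragments instead of 16 order-0 pages, which is where the 8x reduction in map/unmap work mentioned above comes from.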
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	79
1 file changed, 22 insertions, 57 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7b1e940393cf..72ea4752f21b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1150,78 +1150,43 @@ new_segment:
 			if (err)
 				goto do_fault;
 		} else {
-			bool merge = false;
+			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
-			struct page *page = sk->sk_sndmsg_page;
-			int off;
-
-			if (page && page_count(page) == 1)
-				sk->sk_sndmsg_off = 0;
-
-			off = sk->sk_sndmsg_off;
-
-			if (skb_can_coalesce(skb, i, page, off) &&
-			    off != PAGE_SIZE) {
-				/* We can extend the last page
-				 * fragment. */
-				merge = true;
-			} else if (i == MAX_SKB_FRAGS || !sg) {
-				/* Need to add new fragment and cannot
-				 * do this because interface is non-SG,
-				 * or because all the page slots are
-				 * busy. */
-				tcp_mark_push(tp, skb);
-				goto new_segment;
-			} else if (page) {
-				if (off == PAGE_SIZE) {
-					put_page(page);
-					sk->sk_sndmsg_page = page = NULL;
-					off = 0;
-				}
-			} else
-				off = 0;
+			struct page_frag *pfrag = sk_page_frag(sk);
+
+			if (!sk_page_frag_refill(sk, pfrag))
+				goto wait_for_memory;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				if (i == MAX_SKB_FRAGS || !sg) {
+					tcp_mark_push(tp, skb);
+					goto new_segment;
+				}
+				merge = false;
+			}
 
-			if (copy > PAGE_SIZE - off)
-				copy = PAGE_SIZE - off;
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
 			if (!sk_wmem_schedule(sk, copy))
 				goto wait_for_memory;
 
-			if (!page) {
-				/* Allocate new cache page. */
-				if (!(page = sk_stream_alloc_page(sk)))
-					goto wait_for_memory;
-			}
-
-			/* Time to copy data. We are close to
-			 * the end! */
 			err = skb_copy_to_page_nocache(sk, from, skb,
-						       page, off, copy);
-			if (err) {
-				/* If this page was new, give it to the
-				 * socket so it does not get leaked.
-				 */
-				if (!sk->sk_sndmsg_page) {
-					sk->sk_sndmsg_page = page;
-					sk->sk_sndmsg_off = 0;
-				}
+						       pfrag->page,
+						       pfrag->offset,
+						       copy);
+			if (err)
 				goto do_error;
-			}
 
 			/* Update the skb. */
 			if (merge) {
 				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 			} else {
-				skb_fill_page_desc(skb, i, page, off, copy);
-				if (sk->sk_sndmsg_page) {
-					get_page(page);
-				} else if (off + copy < PAGE_SIZE) {
-					get_page(page);
-					sk->sk_sndmsg_page = page;
-				}
+				skb_fill_page_desc(skb, i, pfrag->page,
+						   pfrag->offset, copy);
+				get_page(pfrag->page);
 			}
-
-			sk->sk_sndmsg_off = off + copy;
+			pfrag->offset += copy;
 		}
 
 		if (!copied)