author     Eric Dumazet <edumazet@google.com>  2012-09-23 19:04:42 -0400
committer  David S. Miller <davem@davemloft.net>  2012-09-24 16:31:37 -0400
commit     5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
tree       fb7660173338a45c27d610eb59ba20cf5c2b91b8 /net/ipv4
parent     b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per-socket, order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs.

This is done to increase the probability of coalescing small write() calls into single segments in skbs that are still in the write queue (not yet sent). But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It is also quite inefficient for building TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more often than wanted.

This patch adds a per-task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (Up to 32768 bytes per frag, that's order-3 pages on x86.)

This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG-enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this by splitting a fragment into sub-fragments, since some arches have PAGE_SIZE = 65536.

Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, mellanox mlx4).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
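For orientation, the consumer pattern this patch introduces looks roughly like the sketch below, condensed from the tcp_sendmsg() hunk in net/ipv4/tcp.c further down. The helper name append_to_page_frag() is hypothetical; the sk_wmem_schedule() accounting and the !sg / new_segment handling are omitted. sk_page_frag() and sk_page_frag_refill() are the new helpers added elsewhere in this patch (outside the net/ipv4 portion shown here).

/*
 * Hypothetical helper: append "copy" bytes of user data to skb,
 * drawing space from the per-task (or per-socket) page_frag.
 */
static int append_to_page_frag(struct sock *sk, struct sk_buff *skb,
			       char __user *from, int copy)
{
	int i = skb_shinfo(skb)->nr_frags;
	struct page_frag *pfrag = sk_page_frag(sk);
	bool merge = true;
	int err;

	/* Refill, preferring a high-order page, falling back under memory pressure */
	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	/* Reuse the last skb fragment if it ends exactly at pfrag->offset */
	if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) {
		if (i == MAX_SKB_FRAGS)
			return -EMSGSIZE;
		merge = false;
	}

	copy = min_t(int, copy, pfrag->size - pfrag->offset);

	/* Copy user data into the frag page and charge the socket */
	err = skb_copy_to_page_nocache(sk, from, skb, pfrag->page,
				       pfrag->offset, copy);
	if (err)
		return err;

	if (merge) {
		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
	} else {
		skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy);
		get_page(pfrag->page);	/* the new frag holds its own reference */
	}
	pfrag->offset += copy;
	return copy;
}

The key property is that pfrag->offset only moves forward within the current page, so successive small writes from the same task land back to back; skb_can_coalesce() then usually succeeds and no new fragment (or extra page reference) is needed.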
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/ip_output.c  70
-rw-r--r--  net/ipv4/raw.c        19
-rw-r--r--  net/ipv4/tcp.c        79
-rw-r--r--  net/ipv4/tcp_ipv4.c    8
4 files changed, 62 insertions(+), 114 deletions(-)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a5beab1dc958..24a29a39e9a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -793,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
 			    struct flowi4 *fl4,
 			    struct sk_buff_head *queue,
 			    struct inet_cork *cork,
+			    struct page_frag *pfrag,
 			    int getfrag(void *from, char *to, int offset,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
@@ -987,47 +988,30 @@ alloc_new_skb:
 			}
 		} else {
 			int i = skb_shinfo(skb)->nr_frags;
-			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-			struct page *page = cork->page;
-			int off = cork->off;
-			unsigned int left;
-
-			if (page && (left = PAGE_SIZE - off) > 0) {
-				if (copy >= left)
-					copy = left;
-				if (page != skb_frag_page(frag)) {
-					if (i == MAX_SKB_FRAGS) {
-						err = -EMSGSIZE;
-						goto error;
-					}
-					skb_fill_page_desc(skb, i, page, off, 0);
-					skb_frag_ref(skb, i);
-					frag = &skb_shinfo(skb)->frags[i];
-				}
-			} else if (i < MAX_SKB_FRAGS) {
-				if (copy > PAGE_SIZE)
-					copy = PAGE_SIZE;
-				page = alloc_pages(sk->sk_allocation, 0);
-				if (page == NULL) {
-					err = -ENOMEM;
-					goto error;
-				}
-				cork->page = page;
-				cork->off = 0;
 
-				skb_fill_page_desc(skb, i, page, 0, 0);
-				frag = &skb_shinfo(skb)->frags[i];
-			} else {
-				err = -EMSGSIZE;
-				goto error;
-			}
-			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
-				    offset, copy, skb->len, skb) < 0) {
-				err = -EFAULT;
+			err = -ENOMEM;
+			if (!sk_page_frag_refill(sk, pfrag))
 				goto error;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				err = -EMSGSIZE;
+				if (i == MAX_SKB_FRAGS)
+					goto error;
+
+				__skb_fill_page_desc(skb, i, pfrag->page,
+						     pfrag->offset, 0);
+				skb_shinfo(skb)->nr_frags = ++i;
+				get_page(pfrag->page);
 			}
-			cork->off += copy;
-			skb_frag_size_add(frag, copy);
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+			if (getfrag(from,
+				    page_address(pfrag->page) + pfrag->offset,
+				    offset, copy, skb->len, skb) < 0)
+				goto error_efault;
+
+			pfrag->offset += copy;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 			skb->len += copy;
 			skb->data_len += copy;
 			skb->truesize += copy;
@@ -1039,6 +1023,8 @@ alloc_new_skb:
 
 	return 0;
 
+error_efault:
+	err = -EFAULT;
 error:
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1079,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->tx_flags = ipc->tx_flags;
-	cork->page = NULL;
-	cork->off = 0;
 
 	return 0;
 }
@@ -1117,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
 		transhdrlen = 0;
 	}
 
-	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
+	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+				sk_page_frag(sk), getfrag,
 				from, length, transhdrlen, flags);
 }
 
@@ -1439,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 	if (err)
 		return ERR_PTR(err);
 
-	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
+	err = __ip_append_data(sk, fl4, &queue, &cork,
+			       &current->task_frag, getfrag,
 			       from, length, transhdrlen, flags);
 	if (err) {
 		__ip_flush_pending_frames(sk, &queue, &cork);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index f2425785d40a..a80740ba4248 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,18 +131,23 @@ found:
  * 0 - deliver
  * 1 - block
  */
-static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
 {
-	int type;
-
-	if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+	struct icmphdr _hdr;
+	const struct icmphdr *hdr;
+
+	pr_err("icmp_filter skb_transport_offset %d data-head %ld len %d/%d\n",
+	       skb_transport_offset(skb), skb->data - skb->head, skb->len, skb->data_len);
+	hdr = skb_header_pointer(skb, skb_transport_offset(skb),
+				 sizeof(_hdr), &_hdr);
+	pr_err("head %p data %p hdr %p type %d\n", skb->head, skb->data, hdr, hdr ? hdr->type : -1);
+	if (!hdr)
 		return 1;
 
-	type = icmp_hdr(skb)->type;
-	if (type < 32) {
+	if (hdr->type < 32) {
 		__u32 data = raw_sk(sk)->filter.data;
 
-		return ((1 << type) & data) != 0;
+		return ((1U << hdr->type) & data) != 0;
 	}
 
 	/* Do not block unknown ICMP types */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7b1e940393cf..72ea4752f21b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1150,78 +1150,43 @@ new_segment:
 			if (err)
 				goto do_fault;
 		} else {
-			bool merge = false;
+			bool merge = true;
 			int i = skb_shinfo(skb)->nr_frags;
-			struct page *page = sk->sk_sndmsg_page;
-			int off;
-
-			if (page && page_count(page) == 1)
-				sk->sk_sndmsg_off = 0;
-
-			off = sk->sk_sndmsg_off;
-
-			if (skb_can_coalesce(skb, i, page, off) &&
-			    off != PAGE_SIZE) {
-				/* We can extend the last page
-				 * fragment. */
-				merge = true;
-			} else if (i == MAX_SKB_FRAGS || !sg) {
-				/* Need to add new fragment and cannot
-				 * do this because interface is non-SG,
-				 * or because all the page slots are
-				 * busy. */
-				tcp_mark_push(tp, skb);
-				goto new_segment;
-			} else if (page) {
-				if (off == PAGE_SIZE) {
-					put_page(page);
-					sk->sk_sndmsg_page = page = NULL;
-					off = 0;
+			struct page_frag *pfrag = sk_page_frag(sk);
+
+			if (!sk_page_frag_refill(sk, pfrag))
+				goto wait_for_memory;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				if (i == MAX_SKB_FRAGS || !sg) {
+					tcp_mark_push(tp, skb);
+					goto new_segment;
 				}
-			} else
-				off = 0;
+				merge = false;
+			}
 
-			if (copy > PAGE_SIZE - off)
-				copy = PAGE_SIZE - off;
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
 
 			if (!sk_wmem_schedule(sk, copy))
 				goto wait_for_memory;
 
-			if (!page) {
-				/* Allocate new cache page. */
-				if (!(page = sk_stream_alloc_page(sk)))
-					goto wait_for_memory;
-			}
-
-			/* Time to copy data. We are close to
-			 * the end! */
 			err = skb_copy_to_page_nocache(sk, from, skb,
-						       page, off, copy);
-			if (err) {
-				/* If this page was new, give it to the
-				 * socket so it does not get leaked.
-				 */
-				if (!sk->sk_sndmsg_page) {
-					sk->sk_sndmsg_page = page;
-					sk->sk_sndmsg_off = 0;
-				}
+						       pfrag->page,
+						       pfrag->offset,
+						       copy);
+			if (err)
 				goto do_error;
-			}
 
 			/* Update the skb. */
 			if (merge) {
 				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 			} else {
-				skb_fill_page_desc(skb, i, page, off, copy);
-				if (sk->sk_sndmsg_page) {
-					get_page(page);
-				} else if (off + copy < PAGE_SIZE) {
-					get_page(page);
-					sk->sk_sndmsg_page = page;
-				}
+				skb_fill_page_desc(skb, i, pfrag->page,
+						   pfrag->offset, copy);
+				get_page(pfrag->page);
 			}
-
-			sk->sk_sndmsg_off = off + copy;
+			pfrag->offset += copy;
 		}
 
 		if (!copied)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0a7e020f16b5..93406c583f43 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2200,14 +2200,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	if (inet_csk(sk)->icsk_bind_hash)
 		inet_put_port(sk);
 
-	/*
-	 * If sendmsg cached page exists, toss it.
-	 */
-	if (sk->sk_sndmsg_page) {
-		__free_page(sk->sk_sndmsg_page);
-		sk->sk_sndmsg_page = NULL;
-	}
-
 	/* TCP Cookie Transactions */
 	if (tp->cookie_values != NULL) {
 		kref_put(&tp->cookie_values->kref,