author	Neil Horman <nhorman@tuxdriver.com>	2010-11-16 13:26:47 -0500
committer	David S. Miller <davem@davemloft.net>	2010-11-16 13:26:47 -0500
commit	0e3125c755445664f00ad036e4fc2cd32fd52877 (patch)
tree	b26db97e3239324ac16b13e299e43b7bf2b9560c /net/packet
parent	020f01ebd04f3429c32586d90598c9f59e54ca7d (diff)
packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
Version 4 of this patch. Change notes:

1) Removed extra memset. Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)

Summary:

It was shown to me recently that systems under high load were driven very deep into swap when tcpdump was run. The reason this happened was because the AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user-space application to specify how many entries an AF_PACKET socket will have and how large each entry will be. It seems the default setting for tcpdump is to set the ring buffer to 32 entries of 64 KB each, which implies 32 order-5 allocations. That's difficult under good circumstances, and horrid under memory pressure.

I thought it would be good to make that a bit more usable. I was going to do a simple conversion of the ring buffer from contiguous pages to iovecs, but unfortunately the metadata which AF_PACKET places in these buffers can easily span a page boundary, and given that these buffers get mapped into user space, and the data layout doesn't easily allow for a change to padding between frames to avoid that, a simple iovec change would break user-space ABI consistency.

So instead, this patch adds a three-tiered mechanism to the af_packet set_ring socket option. It attempts to allocate memory in the following order:

1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without digging into swap
2) Using vmalloc
3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as needed to get the memory

The effect is that we don't disturb the system as much when we're under load, while still being able to conduct tcpdumps effectively.

Tested successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
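For reference, the ring that this allocation path serves is requested from user space with the PACKET_RX_RING socket option and then mmap'd. The following is a minimal sketch, not tcpdump's actual code; the 32 x 64 KiB sizing is taken from the figures quoted above, and the frame size and error handling are illustrative assumptions. It shows which setsockopt() call ends up in packet_set_ring()/alloc_pg_vec(), and why packet_mmap() must insert pages one at a time.

/* Sketch: size and map an AF_PACKET RX ring (requires CAP_NET_RAW). */
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	struct tpacket_req req = {
		.tp_block_size = 64 * 1024,              /* one block = one pg_vec entry, a high-order alloc before this patch */
		.tp_block_nr   = 32,                     /* 32 blocks, as in the tcpdump default cited above */
		.tp_frame_size = 2048,                   /* illustrative frame size */
		.tp_frame_nr   = 32 * (64 * 1024 / 2048),/* must equal frames_per_block * block_nr */
	};

	/* This call is what reaches packet_set_ring() and alloc_pg_vec() in the kernel. */
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("setsockopt(PACKET_RX_RING)");
		return 1;
	}

	/* The ring is then mapped into user space; packet_mmap() inserts the
	 * backing pages one by one, which is why vmalloc'd blocks need
	 * vmalloc_to_page() rather than virt_to_page(). */
	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	munmap(ring, len);
	close(fd);
	return 0;
}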
Diffstat (limited to 'net/packet')
-rw-r--r--	net/packet/af_packet.c	85
1 file changed, 69 insertions, 16 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8298e676f5a0..20964560a0ed 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -163,8 +164,14 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);
 
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+	char *buffer;
+	unsigned char flags;
+};
+
 struct packet_ring_buffer {
-	char			**pg_vec;
+	struct pgv		*pg_vec;
 	unsigned int		head;
 	unsigned int		frames_per_block;
 	unsigned int		frame_size;
@@ -283,7 +290,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	pg_vec_pos = position / rb->frames_per_block;
 	frame_offset = position % rb->frames_per_block;
 
-	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+	h.raw = rb->pg_vec[pg_vec_pos].buffer +
+		(frame_offset * rb->frame_size);
 
 	if (status != __packet_get_status(po, h.raw))
 		return NULL;
@@ -2325,37 +2333,74 @@ static const struct vm_operations_struct packet_mmap_ops = {
 	.close	= packet_mm_close,
 };
 
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+			unsigned int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
-		if (likely(pg_vec[i]))
-			free_pages((unsigned long) pg_vec[i], order);
+		if (likely(pg_vec[i].buffer)) {
+			if (pg_vec[i].flags & PGV_FROM_VMALLOC)
+				vfree(pg_vec[i].buffer);
+			else
+				free_pages((unsigned long)pg_vec[i].buffer,
+					   order);
+			pg_vec[i].buffer = NULL;
+		}
 	}
 	kfree(pg_vec);
 }
 
-static inline char *alloc_one_pg_vec_page(unsigned long order)
+static inline char *alloc_one_pg_vec_page(unsigned long order,
+					  unsigned char *flags)
 {
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+	char *buffer = NULL;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+			  __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
+
+	buffer = (char *) __get_free_pages(gfp_flags, order);
+
+	if (buffer)
+		return buffer;
+
+	/*
+	 * __get_free_pages failed, fall back to vmalloc
+	 */
+	*flags |= PGV_FROM_VMALLOC;
+	buffer = vmalloc((1 << order) * PAGE_SIZE);
 
-	return (char *) __get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * vmalloc failed, lets dig into swap here
+	 */
+	*flags = 0;
+	gfp_flags &= ~__GFP_NORETRY;
+	buffer = (char *)__get_free_pages(gfp_flags, order);
+	if (buffer)
+		return buffer;
+
+	/*
+	 * complete and utter failure
+	 */
+	return NULL;
 }
 
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 {
 	unsigned int block_nr = req->tp_block_nr;
-	char **pg_vec;
+	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
 
 	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (unlikely(!pg_vec[i]))
+		pg_vec[i].buffer = alloc_one_pg_vec_page(order,
+							 &pg_vec[i].flags);
+		if (unlikely(!pg_vec[i].buffer))
 			goto out_free_pgvec;
 	}
 
@@ -2364,6 +2409,7 @@ out:
 
 out_free_pgvec:
 	free_pg_vec(pg_vec, order, block_nr);
+	kfree(pg_vec);
 	pg_vec = NULL;
 	goto out;
 }
@@ -2371,7 +2417,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
-	char **pg_vec = NULL;
+	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
@@ -2533,15 +2579,22 @@ static int packet_mmap(struct file *file, struct socket *sock,
 			continue;
 
 		for (i = 0; i < rb->pg_vec_len; i++) {
-			struct page *page = virt_to_page(rb->pg_vec[i]);
+			struct page *page;
+			void *kaddr = rb->pg_vec[i].buffer;
 			int pg_num;
 
 			for (pg_num = 0; pg_num < rb->pg_vec_pages;
-					pg_num++, page++) {
+					pg_num++) {
+				if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
+					page = vmalloc_to_page(kaddr);
+				else
+					page = virt_to_page(kaddr);
+
 				err = vm_insert_page(vma, start, page);
 				if (unlikely(err))
 					goto out;
 				start += PAGE_SIZE;
+				kaddr += PAGE_SIZE;
 			}
 		}
 	}