path: root/mm
author    Nick Piggin <npiggin@suse.de>    2011-03-22 19:30:36 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2011-03-22 20:44:00 -0400
commit    89699605fe7cfd8611900346f61cb6cbf179b10a (patch)
tree      5970232526d3b3715704bb6f90742e033802c3aa /mm
parent    ef0a5e80f56f6409e957e7117da9551c3d3ff239 (diff)
mm: vmap area cache
Provide a free area cache for the vmalloc virtual address allocator, based
on the algorithm used by the user virtual memory allocator.

This reduces the number of rbtree operations and linear traversals over
the vmap extents needed to find a free area, by starting off at the last
point that a free area was found.

The free area cache is reset if areas are freed behind it, or if we are
searching for a smaller area or alignment than last time.  So allocation
patterns are not changed (verified by corner-case and random test cases in
userspace testing).

This solves a regression caused by lazy vunmap TLB purging introduced in
db64fe02 (mm: rewrite vmap layer).  That patch will leave extents in the
vmap allocator after they are vunmapped, until a significant number
accumulate that can be flushed in a single batch.  So in a workload that
vmallocs/vfrees frequently, a chain of extents builds up from the
VMALLOC_START address, which has to be iterated over each time (giving an
O(n) type of behaviour).

After this patch, the search will start from where it left off, giving
closer to an amortized O(1).

This is verified to solve the regressions reported by Steven in GFS2, and
by Avi in KVM.

Hugh's update:

: I tried out the recent mmotm, and on one machine was fortunate to hit
: the BUG_ON(first->va_start < addr) which seems to have been stalling
: your vmap area cache patch ever since May.

: I can get you addresses etc, I did dump a few out; but once I stared
: at them, it was easier just to look at the code: and I cannot see how
: you would be so sure that first->va_start < addr, once you've done
: that addr = ALIGN(max(...), align) above, if align is over 0x1000
: (align was 0x8000 or 0x4000 in the cases I hit: ioremaps like Steve).

: I originally got around it by just changing the
:		if (first->va_start < addr) {
: to
:		while (first->va_start < addr) {
: without thinking about it any further; but that seemed unsatisfactory,
: why would we want to loop here when we've got another very similar
: loop just below it?

: I am never going to admit how long I've spent trying to grasp your
: "while (n)" rbtree loop just above this, the one with the peculiar
:		if (!first && tmp->va_start < addr + size)
: in.  That's unfamiliar to me, I'm guessing it's designed to save a
: subsequent rb_next() in a few circumstances (at risk of then setting
: a wrong cached_hole_size?); but they did appear few to me, and I didn't
: feel I could sign off something with that in when I don't grasp it,
: and it seems responsible for extra code and mistaken BUG_ON below it.

: I've reverted to the familiar rbtree loop that find_vma() does (but
: with va_end >= addr as you had, to respect the additional guard page):
: and then (given that cached_hole_size starts out 0) I don't see the
: need for any complications below it.  If you do want to keep that loop
: as you had it, please add a comment to explain what it's trying to do,
: and where addr is relative to first when you emerge from it.

: Aren't your tests "size <= cached_hole_size" and
: "addr + size > first->va_start" forgetting the guard page we want
: before the next area?  I've changed those.

: I have not changed your many "addr + size - 1 < addr" overflow tests,
: but have since come to wonder, shouldn't they be "addr + size < addr"
: tests - won't the vend checks go wrong if addr + size is 0?
: I have added a few comments - Wolfgang Wander's 2.6.13 description of
: 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Avoiding mmap fragmentation
: helped me a lot, perhaps a pointer to that would be good too.  And I found
: it easier to understand when I renamed cached_start slightly and moved the
: overflow label down.

: This patch would go after your mm-vmap-area-cache.patch in mmotm.
: Trivially, nobody is going to get that BUG_ON with this patch, and it
: appears to work fine on my machines; but I have not given it anything like
: the testing you did on your original, and may have broken all the
: performance you were aiming for.  Please take a look and test it out
: integrate with yours if you're satisfied - thanks.

[akpm@linux-foundation.org: add locking comment]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Reported-and-tested-by: Steven Whitehouse <swhiteho@redhat.com>
Reported-and-tested-by: Avi Kivity <avi@redhat.com>
Tested-by: "Barry J. Marson" <bmarson@redhat.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
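To make the search described above concrete, here is a minimal userspace
model of the free-area-cache idea.  This is not the kernel code: it uses a
flat, append-only array instead of the rbtree/list pair, omits the
cached_hole_size optimisation and freeing entirely, and the names (struct
extent, alloc_range, cached_index, GUARD) are made up for illustration.  It
shows the two behaviours the changelog relies on: a repeated-allocation
workload resumes from just above the last allocation instead of rescanning
from the start of the range, and the cache is dropped when a request is
more permissive (lower vstart or smaller alignment) than the one that
populated it.

#include <stdio.h>

#define VSTART	0x1000UL
#define VEND	0x100000UL
#define GUARD	0x1000UL	/* one guard page kept between areas */

struct extent { unsigned long start, end; };

static struct extent areas[128];	/* allocated extents, sorted by start */
static int nr_areas;

/* cache state, as in the patch: where to resume, and for which parameters */
static int cached_index = -1;		/* stands in for free_vmap_cache */
static unsigned long cached_vstart, cached_align;

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

static long alloc_range(unsigned long size, unsigned long align,
			unsigned long vstart)
{
	unsigned long addr;
	int i;

	/* more permissive request than last time: the cache is not usable */
	if (cached_index < 0 || vstart < cached_vstart || align < cached_align)
		cached_index = -1;
	cached_vstart = vstart;
	cached_align = align;

	if (cached_index >= 0) {
		/* warm start: resume just above the last area we allocated */
		addr = align_up(areas[cached_index].end + GUARD, align);
		i = cached_index + 1;
	} else {
		/* cold start: scan every extent from vstart, O(n) */
		addr = align_up(vstart, align);
		i = 0;
	}

	/* walk the remaining extents until the hole at addr is big enough */
	for (; i < nr_areas && addr + size >= areas[i].start; i++)
		addr = align_up(areas[i].end + GUARD, align);

	if (addr + size > VEND || nr_areas == 128)
		return -1;

	areas[nr_areas].start = addr;
	areas[nr_areas].end = addr + size;
	cached_index = nr_areas++;	/* remember where to resume next time */
	return (long)addr;
}

int main(void)
{
	/* an alloc-heavy pattern: each search starts where the last one ended */
	for (int i = 0; i < 5; i++)
		printf("alloc #%d -> %#lx\n", i,
		       (unsigned long)alloc_range(0x2000, 0x1000, VSTART));
	return 0;
}

Each allocation lands one guard page above the previous one without
revisiting earlier extents, which is the amortized O(1) behaviour the
changelog claims for the warm path.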
Diffstat (limited to 'mm')
-rw-r--r--   mm/vmalloc.c   156
1 file changed, 104 insertions(+), 52 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f9b166732e70..fc77adabb5e3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -261,8 +261,15 @@ struct vmap_area {
 };
 
 static DEFINE_SPINLOCK(vmap_area_lock);
-static struct rb_root vmap_area_root = RB_ROOT;
 static LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+
+/* The vmap cache globals are protected by vmap_area_lock */
+static struct rb_node *free_vmap_cache;
+static unsigned long cached_hole_size;
+static unsigned long cached_vstart;
+static unsigned long cached_align;
+
 static unsigned long vmap_area_pcpu_hole;
 
 static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	struct rb_node *n;
 	unsigned long addr;
 	int purged = 0;
+	struct vmap_area *first;
 
 	BUG_ON(!size);
 	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(!is_power_of_2(align));
 
 	va = kmalloc_node(sizeof(struct vmap_area),
 			gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 		return ERR_PTR(-ENOMEM);
 
 retry:
-	addr = ALIGN(vstart, align);
-
 	spin_lock(&vmap_area_lock);
-	if (addr + size - 1 < addr)
-		goto overflow;
+	/*
+	 * Invalidate cache if we have more permissive parameters.
+	 * cached_hole_size notes the largest hole noticed _below_
+	 * the vmap_area cached in free_vmap_cache: if size fits
+	 * into that hole, we want to scan from vstart to reuse
+	 * the hole instead of allocating above free_vmap_cache.
+	 * Note that __free_vmap_area may update free_vmap_cache
+	 * without updating cached_hole_size or cached_align.
+	 */
+	if (!free_vmap_cache ||
+			size < cached_hole_size ||
+			vstart < cached_vstart ||
+			align < cached_align) {
+nocache:
+		cached_hole_size = 0;
+		free_vmap_cache = NULL;
+	}
+	/* record if we encounter less permissive parameters */
+	cached_vstart = vstart;
+	cached_align = align;
+
+	/* find starting point for our search */
+	if (free_vmap_cache) {
+		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+		addr = ALIGN(first->va_end + PAGE_SIZE, align);
+		if (addr < vstart)
+			goto nocache;
+		if (addr + size - 1 < addr)
+			goto overflow;
+
+	} else {
+		addr = ALIGN(vstart, align);
+		if (addr + size - 1 < addr)
+			goto overflow;
 
-	/* XXX: could have a last_hole cache */
-	n = vmap_area_root.rb_node;
-	if (n) {
-		struct vmap_area *first = NULL;
+		n = vmap_area_root.rb_node;
+		first = NULL;
 
-		do {
+		while (n) {
 			struct vmap_area *tmp;
 			tmp = rb_entry(n, struct vmap_area, rb_node);
 			if (tmp->va_end >= addr) {
-				if (!first && tmp->va_start < addr + size)
-					first = tmp;
-				n = n->rb_left;
-			} else {
 				first = tmp;
+				if (tmp->va_start <= addr)
+					break;
+				n = n->rb_left;
+			} else
 				n = n->rb_right;
-			}
-		} while (n);
+		}
 
 		if (!first)
 			goto found;
-
-		if (first->va_end < addr) {
-			n = rb_next(&first->rb_node);
-			if (n)
-				first = rb_entry(n, struct vmap_area, rb_node);
-			else
-				goto found;
-		}
-
-		while (addr + size > first->va_start && addr + size <= vend) {
-			addr = ALIGN(first->va_end + PAGE_SIZE, align);
-			if (addr + size - 1 < addr)
-				goto overflow;
-
-			n = rb_next(&first->rb_node);
-			if (n)
-				first = rb_entry(n, struct vmap_area, rb_node);
-			else
-				goto found;
-		}
 	}
-found:
-	if (addr + size > vend) {
-overflow:
-		spin_unlock(&vmap_area_lock);
-		if (!purged) {
-			purge_vmap_area_lazy();
-			purged = 1;
-			goto retry;
-		}
-		if (printk_ratelimit())
-			printk(KERN_WARNING
-				"vmap allocation for size %lu failed: "
-				"use vmalloc=<size> to increase size.\n", size);
-		kfree(va);
-		return ERR_PTR(-EBUSY);
+
+	/* from the starting point, walk areas until a suitable hole is found */
+	while (addr + size >= first->va_start && addr + size <= vend) {
+		if (addr + cached_hole_size < first->va_start)
+			cached_hole_size = first->va_start - addr;
+		addr = ALIGN(first->va_end + PAGE_SIZE, align);
+		if (addr + size - 1 < addr)
+			goto overflow;
+
+		n = rb_next(&first->rb_node);
+		if (n)
+			first = rb_entry(n, struct vmap_area, rb_node);
+		else
+			goto found;
 	}
 
-	BUG_ON(addr & (align-1));
+found:
+	if (addr + size > vend)
+		goto overflow;
 
 	va->va_start = addr;
 	va->va_end = addr + size;
 	va->flags = 0;
 	__insert_vmap_area(va);
+	free_vmap_cache = &va->rb_node;
 	spin_unlock(&vmap_area_lock);
 
+	BUG_ON(va->va_start & (align-1));
+	BUG_ON(va->va_start < vstart);
+	BUG_ON(va->va_end > vend);
+
 	return va;
+
+overflow:
+	spin_unlock(&vmap_area_lock);
+	if (!purged) {
+		purge_vmap_area_lazy();
+		purged = 1;
+		goto retry;
+	}
+	if (printk_ratelimit())
+		printk(KERN_WARNING
+			"vmap allocation for size %lu failed: "
+			"use vmalloc=<size> to increase size.\n", size);
+	kfree(va);
+	return ERR_PTR(-EBUSY);
 }
 
 static void rcu_free_va(struct rcu_head *head)
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
 static void __free_vmap_area(struct vmap_area *va)
 {
 	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+
+	if (free_vmap_cache) {
+		if (va->va_end < cached_vstart) {
+			free_vmap_cache = NULL;
+		} else {
+			struct vmap_area *cache;
+			cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+			if (va->va_start <= cache->va_start) {
+				free_vmap_cache = rb_prev(&va->rb_node);
+				/*
+				 * We don't try to update cached_hole_size or
+				 * cached_align, but it won't go very wrong.
+				 */
+			}
+		}
+	}
 	rb_erase(&va->rb_node, &vmap_area_root);
 	RB_CLEAR_NODE(&va->rb_node);
 	list_del_rcu(&va->list);
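For the cache handling that the last hunk adds to __free_vmap_area, a
similarly hedged userspace sketch follows.  It is not the kernel code:
plain structs with prev pointers stand in for the rbtree and rb_prev(),
and struct node / free_area / free_cache are invented names.  The cases
mirror the hunk: a free whose range ends below cached_vstart drops the
cache, a free at or below the cached area steps the cache back one node,
and a free above the cached area leaves the cache untouched.

#include <stdio.h>

struct node {
	unsigned long va_start, va_end;
	struct node *prev;		/* stands in for rb_prev() */
};

static struct node *free_cache;		/* stands in for free_vmap_cache */
static unsigned long cached_vstart;

static void free_area(struct node *va)
{
	if (free_cache) {
		if (va->va_end < cached_vstart) {
			/* freed below the range we cache for: start over */
			free_cache = NULL;
		} else if (va->va_start <= free_cache->va_start) {
			/* freed at/below the cached area: resume one node earlier */
			free_cache = va->prev;
		}
		/* freed above the cached area: the cache stays valid */
	}
	/* ...the real function then unlinks va from the rbtree and list... */
}

int main(void)
{
	/* three areas at 0x10000, 0x20000, 0x30000; the cache points at the last */
	struct node a = { 0x10000, 0x12000, NULL };
	struct node b = { 0x20000, 0x22000, &a };
	struct node c = { 0x30000, 0x32000, &b };

	cached_vstart = 0x10000;
	free_cache = &c;

	free_area(&b);		/* below the cached area: cache steps back to a */
	printf("cache now at %#lx\n", free_cache ? free_cache->va_start : 0UL);

	free_area(&a);		/* at the cached area: cache steps back to NULL */
	printf("cache now at %#lx\n", free_cache ? free_cache->va_start : 0UL);
	return 0;
}

Stepping the cache back on free is what keeps later allocations able to
reuse the hole left behind, without ever letting free_vmap_cache point at
an area that no longer exists.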