mm: vmap area cache

Provide a free area cache for the vmalloc virtual address allocator, based on the algorithm used by the user virtual memory allocator. This reduces the number of rbtree operations and linear traversals over the vmap extents in order to find a free area, by starting off at the last point that a free area was found. The free area cache is reset if areas are freed behind it, or if we are searching for a smaller area or alignment than last time. So allocation patterns are not changed (verified by corner-case and random test cases in userspace testing). This solves a regression caused by lazy vunmap TLB purging introduced in db64fe02 (mm: rewrite vmap layer). That patch will leave extents in the vmap allocator after they are vunmapped, and until a significant number accumulate that can be flushed in a single batch. So in a workload that vmalloc/vfree frequently, a chain of extents will build up from VMALLOC_START address, which have to be iterated over each time (giving an O(n) type of behaviour). After this patch, the search will start from where it left off, giving closer to an amortized O(1). This is verified to solve regressions reported Steven in GFS2, and Avi in KVM. Hugh's update: : I tried out the recent mmotm, and on one machine was fortunate to hit : the BUG_ON(first->va_start < addr) which seems to have been stalling : your vmap area cache patch ever since May. : I can get you addresses etc, I did dump a few out; but once I stared : at them, it was easier just to look at the code: and I cannot see how : you would be so sure that first->va_start < addr, once you've done : that addr = ALIGN(max(...), align) above, if align is over 0x1000 : (align was 0x8000 or 0x4000 in the cases I hit: ioremaps like Steve). : I originally got around it by just changing the : if (first->va_start < addr) { : to : while (first->va_start < addr) { : without thinking about it any further; but that seemed unsatisfactory, : why would we want to loop here when we've got another very similar : loop just below it? : I am never going to admit how long I've spent trying to grasp your : "while (n)" rbtree loop just above this, the one with the peculiar : if (!first && tmp->va_start < addr + size) : in. That's unfamiliar to me, I'm guessing it's designed to save a : subsequent rb_next() in a few circumstances (at risk of then setting : a wrong cached_hole_size?); but they did appear few to me, and I didn't : feel I could sign off something with that in when I don't grasp it, : and it seems responsible for extra code and mistaken BUG_ON below it. : I've reverted to the familiar rbtree loop that find_vma() does (but : with va_end >= addr as you had, to respect the additional guard page): : and then (given that cached_hole_size starts out 0) I don't see the : need for any complications below it. If you do want to keep that loop : as you had it, please add a comment to explain what it's trying to do, : and where addr is relative to first when you emerge from it. : Aren't your tests "size <= cached_hole_size" and : "addr + size > first->va_start" forgetting the guard page we want : before the next area? I've changed those. : I have not changed your many "addr + size - 1 < addr" overflow tests, : but have since come to wonder, shouldn't they be "addr + size < addr" : tests - won't the vend checks go wrong if addr + size is 0? : I have added a few comments - Wolfgang Wander's 2.6.13 description of : 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Avoiding mmap fragmentation : helped me a lot, perhaps a pointer to that would be good too. And I found : it easier to understand when I renamed cached_start slightly and moved the : overflow label down. : This patch would go after your mm-vmap-area-cache.patch in mmotm. : Trivially, nobody is going to get that BUG_ON with this patch, and it : appears to work fine on my machines; but I have not given it anything like : the testing you did on your original, and may have broken all the : performance you were aiming for. Please take a look and test it out : integrate with yours if you're satisfied - thanks. [akpm@linux-foundation.org: add locking comment] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Hugh Dickins <hughd@google.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Reported-and-tested-by: Steven Whitehouse <swhiteho@redhat.com> Reported-and-tested-by: Avi Kivity <avi@redhat.com> Tested-by: "Barry J. Marson" <bmarson@redhat.com> Cc: Prarit Bhargava <prarit@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Nick Piggin <npiggin@suse.de> 2011-03-22 19:30:36 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-03-22 20:44:00 -0400
commit: 89699605fe7cfd8611900346f61cb6cbf179b10a (patch)
tree: 5970232526d3b3715704bb6f90742e033802c3aa /mm
parent: ef0a5e80f56f6409e957e7117da9551c3d3ff239 (diff)
1 files changed, 104 insertions, 52 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f9b166732e70..fc77adabb5e3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -261,8 +261,15 @@ struct vmap_area {
 };
 static DEFINE_SPINLOCK(vmap_area_lock);
-static struct rb_root vmap_area_root = RB_ROOT;
 static LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+/* The vmap cache globals are protected by vmap_area_lock */
+static struct rb_node *free_vmap_cache;
+static unsigned long cached_hole_size;
+static unsigned long cached_vstart;
+static unsigned long cached_align;
 static unsigned long vmap_area_pcpu_hole;
 static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
        struct rb_node *n;
        unsigned long addr;
        int purged = 0;
+        struct vmap_area *first;
        BUG_ON(!size);
        BUG_ON(size & ~PAGE_MASK);
+        BUG_ON(!is_power_of_2(align));
        va = kmalloc_node(sizeof(struct vmap_area),
                        gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
                return ERR_PTR(-ENOMEM);
 retry:
-        addr = ALIGN(vstart, align);
        spin_lock(&vmap_area_lock);
-        if (addr + size - 1 < addr)
+        /*
-                goto overflow;
+         * Invalidate cache if we have more permissive parameters.
+         * cached_hole_size notes the largest hole noticed _below_
+         * the vmap_area cached in free_vmap_cache: if size fits
+         * into that hole, we want to scan from vstart to reuse
+         * the hole instead of allocating above free_vmap_cache.
+         * Note that __free_vmap_area may update free_vmap_cache
+         * without updating cached_hole_size or cached_align.
+         */
+        if (!free_vmap_cache ||
+                        size < cached_hole_size ||
+                        vstart < cached_vstart ||
+                        align < cached_align) {
+nocache:
+                cached_hole_size = 0;
+                free_vmap_cache = NULL;
+        }
+        /* record if we encounter less permissive parameters */
+        cached_vstart = vstart;
+        cached_align = align;
+        /* find starting point for our search */
+        if (free_vmap_cache) {
+                first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+                addr = ALIGN(first->va_end + PAGE_SIZE, align);
+                if (addr < vstart)
+                        goto nocache;
+                if (addr + size - 1 < addr)
+                        goto overflow;
+        } else {
+                addr = ALIGN(vstart, align);
+                if (addr + size - 1 < addr)
+                        goto overflow;
-        /* XXX: could have a last_hole cache */
+                n = vmap_area_root.rb_node;
-        n = vmap_area_root.rb_node;
+                first = NULL;
-        if (n) {
-                struct vmap_area *first = NULL;
-                do {
+                while (n) {
                        struct vmap_area *tmp;
                        tmp = rb_entry(n, struct vmap_area, rb_node);
                        if (tmp->va_end >= addr) {
-                                if (!first && tmp->va_start < addr + size)
-                                        first = tmp;
-                                n = n->rb_left;
-                        } else {
                                first = tmp;
+                                if (tmp->va_start <= addr)
+                                        break;
+                                n = n->rb_left;
+                        } else
                                n = n->rb_right;
-                        }
+                }
-                } while (n);
                if (!first)
                        goto found;
-                if (first->va_end < addr) {
-                        n = rb_next(&first->rb_node);
-                        if (n)
-                                first = rb_entry(n, struct vmap_area, rb_node);
-                        else
-                                goto found;
-                }
-                while (addr + size > first->va_start && addr + size <= vend) {
-                        addr = ALIGN(first->va_end + PAGE_SIZE, align);
-                        if (addr + size - 1 < addr)
-                                goto overflow;
-                        n = rb_next(&first->rb_node);
-                        if (n)
-                                first = rb_entry(n, struct vmap_area, rb_node);
-                        else
-                                goto found;
-                }
        }
-found:
-        if (addr + size > vend) {
+        /* from the starting point, walk areas until a suitable hole is found */
-overflow:
+        while (addr + size >= first->va_start && addr + size <= vend) {
-                spin_unlock(&vmap_area_lock);
+                if (addr + cached_hole_size < first->va_start)
-                if (!purged) {
+                        cached_hole_size = first->va_start - addr;
-                        purge_vmap_area_lazy();
+                addr = ALIGN(first->va_end + PAGE_SIZE, align);
-                        purged = 1;
+                if (addr + size - 1 < addr)
-                        goto retry;
+                        goto overflow;
-                }
-                if (printk_ratelimit())
+                n = rb_next(&first->rb_node);
-                        printk(KERN_WARNING
+                if (n)
-                                "vmap allocation for size %lu failed: "
+                        first = rb_entry(n, struct vmap_area, rb_node);
-                                "use vmalloc=<size> to increase size.\n", size);
+                else
-                kfree(va);
+                        goto found;
-                return ERR_PTR(-EBUSY);
        }
-        BUG_ON(addr & (align-1));
+found:
+        if (addr + size > vend)
+                goto overflow;
        va->va_start = addr;
        va->va_end = addr + size;
        va->flags = 0;
        __insert_vmap_area(va);
+        free_vmap_cache = &va->rb_node;
        spin_unlock(&vmap_area_lock);
+        BUG_ON(va->va_start & (align-1));
+        BUG_ON(va->va_start < vstart);
+        BUG_ON(va->va_end > vend);
        return va;
+overflow:
+        spin_unlock(&vmap_area_lock);
+        if (!purged) {
+                purge_vmap_area_lazy();
+                purged = 1;
+                goto retry;
+        }
+        if (printk_ratelimit())
+                printk(KERN_WARNING
+                        "vmap allocation for size %lu failed: "
+                        "use vmalloc=<size> to increase size.\n", size);
+        kfree(va);
+        return ERR_PTR(-EBUSY);
 }
 static void rcu_free_va(struct rcu_head *head)
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
 static void __free_vmap_area(struct vmap_area *va)
 {
        BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+        if (free_vmap_cache) {
+                if (va->va_end < cached_vstart) {
+                        free_vmap_cache = NULL;
+                } else {
+                        struct vmap_area *cache;
+                        cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+                        if (va->va_start <= cache->va_start) {
+                                free_vmap_cache = rb_prev(&va->rb_node);
+                                /*
+                                 * We don't try to update cached_hole_size or
+                                 * cached_align, but it won't go very wrong.
+                                 */
+                        }
+                }
+        }
        rb_erase(&va->rb_node, &vmap_area_root);
        RB_CLEAR_NODE(&va->rb_node);
        list_del_rcu(&va->list);
author	Nick Piggin <npiggin@suse.de>	2011-03-22 19:30:36 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-03-22 20:44:00 -0400
commit	89699605fe7cfd8611900346f61cb6cbf179b10a (patch)
tree	5970232526d3b3715704bb6f90742e033802c3aa /mm
parent	ef0a5e80f56f6409e957e7117da9551c3d3ff239 (diff)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f9b166732e70..fc77adabb5e3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c
@@ -261,8 +261,15 @@ struct vmap_area {
261	};	261	};
262		262
263	static DEFINE_SPINLOCK(vmap_area_lock);	263	static DEFINE_SPINLOCK(vmap_area_lock);
264	static struct rb_root vmap_area_root = RB_ROOT;
265	static LIST_HEAD(vmap_area_list);	264	static LIST_HEAD(vmap_area_list);
		265	static struct rb_root vmap_area_root = RB_ROOT;
		266
		267	/* The vmap cache globals are protected by vmap_area_lock */
		268	static struct rb_node *free_vmap_cache;
		269	static unsigned long cached_hole_size;
		270	static unsigned long cached_vstart;
		271	static unsigned long cached_align;
		272
266	static unsigned long vmap_area_pcpu_hole;	273	static unsigned long vmap_area_pcpu_hole;
267		274
268	static struct vmap_area *__find_vmap_area(unsigned long addr)	275	static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
331	struct rb_node *n;	338	struct rb_node *n;
332	unsigned long addr;	339	unsigned long addr;
333	int purged = 0;	340	int purged = 0;
		341	struct vmap_area *first;
334		342
335	BUG_ON(!size);	343	BUG_ON(!size);
336	BUG_ON(size & ~PAGE_MASK);	344	BUG_ON(size & ~PAGE_MASK);
		345	BUG_ON(!is_power_of_2(align));
337		346
338	va = kmalloc_node(sizeof(struct vmap_area),	347	va = kmalloc_node(sizeof(struct vmap_area),
339	gfp_mask & GFP_RECLAIM_MASK, node);	348	gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
341	return ERR_PTR(-ENOMEM);	350	return ERR_PTR(-ENOMEM);
342		351
343	retry:	352	retry:
344	addr = ALIGN(vstart, align);
345
346	spin_lock(&vmap_area_lock);	353	spin_lock(&vmap_area_lock);
347	if (addr + size - 1 < addr)	354	/*
348	goto overflow;	355	* Invalidate cache if we have more permissive parameters.
		356	* cached_hole_size notes the largest hole noticed _below_
		357	* the vmap_area cached in free_vmap_cache: if size fits
		358	* into that hole, we want to scan from vstart to reuse
		359	* the hole instead of allocating above free_vmap_cache.
		360	* Note that __free_vmap_area may update free_vmap_cache
		361	* without updating cached_hole_size or cached_align.
		362	*/
		363	if (!free_vmap_cache \|\|
		364	size < cached_hole_size \|\|
		365	vstart < cached_vstart \|\|
		366	align < cached_align) {
		367	nocache:
		368	cached_hole_size = 0;
		369	free_vmap_cache = NULL;
		370	}
		371	/* record if we encounter less permissive parameters */
		372	cached_vstart = vstart;
		373	cached_align = align;
		374
		375	/* find starting point for our search */
		376	if (free_vmap_cache) {
		377	first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		378	addr = ALIGN(first->va_end + PAGE_SIZE, align);
		379	if (addr < vstart)
		380	goto nocache;
		381	if (addr + size - 1 < addr)
		382	goto overflow;
		383
		384	} else {
		385	addr = ALIGN(vstart, align);
		386	if (addr + size - 1 < addr)
		387	goto overflow;
349		388
350	/* XXX: could have a last_hole cache */	389	n = vmap_area_root.rb_node;
351	n = vmap_area_root.rb_node;	390	first = NULL;
352	if (n) {
353	struct vmap_area *first = NULL;
354		391
355	do {	392	while (n) {
356	struct vmap_area *tmp;	393	struct vmap_area *tmp;
357	tmp = rb_entry(n, struct vmap_area, rb_node);	394	tmp = rb_entry(n, struct vmap_area, rb_node);
358	if (tmp->va_end >= addr) {	395	if (tmp->va_end >= addr) {
359	if (!first && tmp->va_start < addr + size)
360	first = tmp;
361	n = n->rb_left;
362	} else {
363	first = tmp;	396	first = tmp;
		397	if (tmp->va_start <= addr)
		398	break;
		399	n = n->rb_left;
		400	} else
364	n = n->rb_right;	401	n = n->rb_right;
365	}	402	}
366	} while (n);
367		403
368	if (!first)	404	if (!first)
369	goto found;	405	goto found;
370
371	if (first->va_end < addr) {
372	n = rb_next(&first->rb_node);
373	if (n)
374	first = rb_entry(n, struct vmap_area, rb_node);
375	else
376	goto found;
377	}
378
379	while (addr + size > first->va_start && addr + size <= vend) {
380	addr = ALIGN(first->va_end + PAGE_SIZE, align);
381	if (addr + size - 1 < addr)
382	goto overflow;
383
384	n = rb_next(&first->rb_node);
385	if (n)
386	first = rb_entry(n, struct vmap_area, rb_node);
387	else
388	goto found;
389	}
390	}	406	}
391	found:	407
392	if (addr + size > vend) {	408	/* from the starting point, walk areas until a suitable hole is found */
393	overflow:	409	while (addr + size >= first->va_start && addr + size <= vend) {
394	spin_unlock(&vmap_area_lock);	410	if (addr + cached_hole_size < first->va_start)
395	if (!purged) {	411	cached_hole_size = first->va_start - addr;
396	purge_vmap_area_lazy();	412	addr = ALIGN(first->va_end + PAGE_SIZE, align);
397	purged = 1;	413	if (addr + size - 1 < addr)
398	goto retry;	414	goto overflow;
399	}	415
400	if (printk_ratelimit())	416	n = rb_next(&first->rb_node);
401	printk(KERN_WARNING	417	if (n)
402	"vmap allocation for size %lu failed: "	418	first = rb_entry(n, struct vmap_area, rb_node);
403	"use vmalloc=<size> to increase size.\n", size);	419	else
404	kfree(va);	420	goto found;
405	return ERR_PTR(-EBUSY);
406	}	421	}
407		422
408	BUG_ON(addr & (align-1));	423	found:
		424	if (addr + size > vend)
		425	goto overflow;
409		426
410	va->va_start = addr;	427	va->va_start = addr;
411	va->va_end = addr + size;	428	va->va_end = addr + size;
412	va->flags = 0;	429	va->flags = 0;
413	__insert_vmap_area(va);	430	__insert_vmap_area(va);
		431	free_vmap_cache = &va->rb_node;
414	spin_unlock(&vmap_area_lock);	432	spin_unlock(&vmap_area_lock);
415		433
		434	BUG_ON(va->va_start & (align-1));
		435	BUG_ON(va->va_start < vstart);
		436	BUG_ON(va->va_end > vend);
		437
416	return va;	438	return va;
		439
		440	overflow:
		441	spin_unlock(&vmap_area_lock);
		442	if (!purged) {
		443	purge_vmap_area_lazy();
		444	purged = 1;
		445	goto retry;
		446	}
		447	if (printk_ratelimit())
		448	printk(KERN_WARNING
		449	"vmap allocation for size %lu failed: "
		450	"use vmalloc=<size> to increase size.\n", size);
		451	kfree(va);
		452	return ERR_PTR(-EBUSY);
417	}	453	}
418		454
419	static void rcu_free_va(struct rcu_head *head)	455	static void rcu_free_va(struct rcu_head *head)
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
426	static void __free_vmap_area(struct vmap_area *va)	462	static void __free_vmap_area(struct vmap_area *va)
427	{	463	{
428	BUG_ON(RB_EMPTY_NODE(&va->rb_node));	464	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
		465
		466	if (free_vmap_cache) {
		467	if (va->va_end < cached_vstart) {
		468	free_vmap_cache = NULL;
		469	} else {
		470	struct vmap_area *cache;
		471	cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		472	if (va->va_start <= cache->va_start) {
		473	free_vmap_cache = rb_prev(&va->rb_node);
		474	/*
		475	* We don't try to update cached_hole_size or
		476	* cached_align, but it won't go very wrong.
		477	*/
		478	}
		479	}
		480	}
429	rb_erase(&va->rb_node, &vmap_area_root);	481	rb_erase(&va->rb_node, &vmap_area_root);
430	RB_CLEAR_NODE(&va->rb_node);	482	RB_CLEAR_NODE(&va->rb_node);
431	list_del_rcu(&va->list);	483	list_del_rcu(&va->list);