author    Vladimir Davydov <vdavydov@parallels.com>    2015-02-12 17:59:41 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2015-02-12 21:54:10 -0500
commit    832f37f5d5f5c7281880c21eb09508750b67f540
tree      9d0102615b2169659d156f633ba7a304ad27e590
parent    2788cf0c401c268b4819c5407493a8769b7007aa
slub: never fail to shrink cache
SLUB's version of __kmem_cache_shrink() not only removes empty slabs, but also tries to rearrange the partial lists to place the slabs filled up most at the head to cope with fragmentation. To achieve that, it allocates a temporary array of lists used to sort slabs by the number of objects in use. If the allocation fails, the whole procedure is aborted.

This is unacceptable for the kernel memory accounting extension of the memory cgroup, where we want to make sure that kmem_cache_shrink() successfully discarded empty slabs. Although the allocation failure is utterly unlikely with the current page allocator implementation, which retries GFP_KERNEL allocations of order <= 2 infinitely, it is better not to rely on that.

This patch therefore makes __kmem_cache_shrink() allocate the array on the stack instead of calling kmalloc, which may fail. The array size is chosen to be equal to 32, because most SLUB caches store no more than 32 objects per slab page. Slab pages with <= 32 free objects are sorted using the array by the number of objects in use and promoted to the head of the partial list, while slab pages with > 32 free objects are left at the end of the list without any ordering imposed on them.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
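For readers who want to see the bucketing policy in isolation, below is a minimal, self-contained user-space sketch of the idea the patch implements with on-stack list_head buckets. The names and types here (fake_slab, shrink_partial, a singly linked list instead of the kernel's list_head) are hypothetical stand-ins, not kernel code: empty slabs are dropped, slabs with at most SHRINK_PROMOTE_MAX free objects are bucketed by free count and spliced back fullest-first, and sparser slabs keep their relative order at the tail.

#include <stdio.h>

#define SHRINK_PROMOTE_MAX 32

/* Hypothetical stand-in for a partial slab page. */
struct fake_slab {
        int objects;            /* total objects in the slab   */
        int inuse;              /* objects currently allocated */
        struct fake_slab *next; /* singly linked partial list  */
};

/*
 * Rebuild a partial list the way the patch does: drop empty slabs,
 * bucket slabs with <= SHRINK_PROMOTE_MAX free objects by their free
 * count, splice the buckets back fullest-first, and leave sparser
 * slabs unordered at the tail.
 */
static struct fake_slab *shrink_partial(struct fake_slab *partial)
{
        struct fake_slab *promote[SHRINK_PROMOTE_MAX] = { 0 };
        struct fake_slab *tail[SHRINK_PROMOTE_MAX] = { 0 };
        struct fake_slab *rest = NULL, *rest_tail = NULL;
        struct fake_slab *slab, *next;
        int i;

        for (slab = partial; slab; slab = next) {
                int free = slab->objects - slab->inuse;

                next = slab->next;
                slab->next = NULL;

                if (free <= 0)
                        continue;       /* full slabs never sit on the partial list */
                if (free == slab->objects)
                        continue;       /* empty slab: would be discarded */

                if (free <= SHRINK_PROMOTE_MAX) {
                        /* append to the bucket for this free count */
                        if (tail[free - 1])
                                tail[free - 1]->next = slab;
                        else
                                promote[free - 1] = slab;
                        tail[free - 1] = slab;
                } else {
                        /* too sparse: keep at the tail, order unchanged */
                        if (rest_tail)
                                rest_tail->next = slab;
                        else
                                rest = slab;
                        rest_tail = slab;
                }
        }

        /* Splice buckets back so the fullest slabs (fewest free) come first. */
        for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) {
                if (!promote[i])
                        continue;
                tail[i]->next = rest;
                rest = promote[i];
        }
        return rest;
}

int main(void)
{
        struct fake_slab slabs[] = {
                { 32, 31, NULL },       /*  1 free: ends up first        */
                { 32,  0, NULL },       /* empty: dropped                */
                { 32,  4, NULL },       /* 28 free: promoted after ...   */
                { 32, 20, NULL },       /* 12 free: ... this one         */
        };
        struct fake_slab *head = NULL, *p;
        int i;

        for (i = 3; i >= 0; i--) {
                slabs[i].next = head;
                head = &slabs[i];
        }
        for (p = shrink_partial(head); p; p = p->next)
                printf("inuse=%2d free=%2d\n", p->inuse, p->objects - p->inuse);
        return 0;
}

The sketch prints the surviving slabs ordered by fewest free objects first, mirroring how the patch makes new allocations hit the fullest slabs and lets the sparsest ones drain.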
-rw-r--r--    mm/slub.c    58
1 file changed, 31 insertions(+), 27 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 1e5a4636cb23..d97b692165d2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3358,11 +3358,12 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+#define SHRINK_PROMOTE_MAX 32
+
 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
  *
  * The slabs with the least items are placed last. This results in them
  * being allocated from last increasing the chance that the last objects
@@ -3375,51 +3376,57 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 	struct kmem_cache_node *n;
 	struct page *page;
 	struct page *t;
-	int objects = oo_objects(s->max);
-	struct list_head *slabs_by_inuse =
-		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+	struct list_head discard;
+	struct list_head promote[SHRINK_PROMOTE_MAX];
 	unsigned long flags;
 
-	if (!slabs_by_inuse)
-		return -ENOMEM;
-
 	flush_all(s);
 	for_each_kmem_cache_node(s, node, n) {
 		if (!n->nr_partial)
 			continue;
 
-		for (i = 0; i < objects; i++)
-			INIT_LIST_HEAD(slabs_by_inuse + i);
+		INIT_LIST_HEAD(&discard);
+		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+			INIT_LIST_HEAD(promote + i);
 
 		spin_lock_irqsave(&n->list_lock, flags);
 
 		/*
-		 * Build lists indexed by the items in use in each slab.
+		 * Build lists of slabs to discard or promote.
 		 *
 		 * Note that concurrent frees may occur while we hold the
 		 * list_lock. page->inuse here is the upper limit.
 		 */
 		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			list_move(&page->lru, slabs_by_inuse + page->inuse);
-			if (!page->inuse)
+			int free = page->objects - page->inuse;
+
+			/* Do not reread page->inuse */
+			barrier();
+
+			/* We do not keep full slabs on the list */
+			BUG_ON(free <= 0);
+
+			if (free == page->objects) {
+				list_move(&page->lru, &discard);
 				n->nr_partial--;
+			} else if (free <= SHRINK_PROMOTE_MAX)
+				list_move(&page->lru, promote + free - 1);
 		}
 
 		/*
-		 * Rebuild the partial list with the slabs filled up most
-		 * first and the least used slabs at the end.
+		 * Promote the slabs filled up most to the head of the
+		 * partial list.
 		 */
-		for (i = objects - 1; i > 0; i--)
-			list_splice(slabs_by_inuse + i, n->partial.prev);
+		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+			list_splice(promote + i, &n->partial);
 
 		spin_unlock_irqrestore(&n->list_lock, flags);
 
 		/* Release empty slabs */
-		list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+		list_for_each_entry_safe(page, t, &discard, lru)
 			discard_slab(s, page);
 	}
 
-	kfree(slabs_by_inuse);
 	return 0;
 }
 
@@ -4686,12 +4693,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
 static ssize_t shrink_store(struct kmem_cache *s,
 			const char *buf, size_t length)
 {
-	if (buf[0] == '1') {
-		int rc = kmem_cache_shrink(s);
-
-		if (rc)
-			return rc;
-	} else
+	if (buf[0] == '1')
+		kmem_cache_shrink(s);
+	else
 		return -EINVAL;
 	return length;
 }
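Usage note (illustrative, not part of the patch): SLUB exposes the attribute handled by shrink_store() as /sys/kernel/slab/<cache>/shrink, and with this change a write of "1" no longer has an allocation-failure path. A minimal sketch of triggering it from user space follows; the cache name "kmalloc-64" is only an example and may differ on a given kernel, and the write normally requires root.

#include <stdio.h>

int main(void)
{
        /* Example cache name; pick any directory under /sys/kernel/slab/. */
        const char *path = "/sys/kernel/slab/kmalloc-64/shrink";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        /* Writing '1' reaches shrink_store() -> kmem_cache_shrink(). */
        if (fputs("1", f) == EOF || fclose(f) != 0) {
                perror(path);
                return 1;
        }
        return 0;
}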