author    Joonsoo Kim <iamjoonsoo.kim@lge.com>    2014-06-23 16:22:06 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-06-23 19:47:44 -0400
commit    03787301420376ae41fbaf4267f4a6253d152ac5 (patch)
tree      e9a8e5b0c70868c8ba4612d23e1faa7dd2674b86
parent    f00cdc6df7d7cfcabb5b740911e6788cb0802bdb (diff)
slab: fix oops when reading /proc/slab_allocators
Commit b1cb0982bdd6 ("change the management method of free objects of the slab")
introduced a bug in the slab leak detector ('/proc/slab_allocators'). The
detector works as follows:

1. traverse all objects on all slabs
2. determine whether each object is active or not
3. if active, print who allocated the object

That commit changed how free objects are managed, so the logic that decides
whether an object is active changed as well. Previously, objects in the cpu
caches were treated as inactive; after that commit, they are mistakenly
treated as active. This causes a kernel oops when DEBUG_PAGEALLOC is enabled.

With DEBUG_PAGEALLOC, kernel_map_pages() is used to detect corruption of free
slab memory: the page table mapping for an object is removed when the object
is freed and restored when it becomes active. When the slab leak detector
examines an object in a cpu cache, it mistakenly considers it active and
tries to read the object's memory to retrieve the caller of the allocation.
At that point no page table mapping for the object exists, so the oops
occurs.

Following is the oops reported by Dave. It blew up when something tried to
read /proc/slab_allocators (just cat it, and you should see the oops below):

  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in: [snip...]
  CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131
  task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000
  RIP: 0010:[<ffffffffaa1a8f4a>]  [<ffffffffaa1a8f4a>] handle_slab+0x8a/0x180
  RSP: 0018:ffff880076925de0  EFLAGS: 00010002
  RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7
  RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000
  RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000
  R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0
  FS:  00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0
  DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602
  Call Trace:
    leaks_show+0xce/0x240
    seq_read+0x28e/0x490
    proc_reg_read+0x3d/0x80
    vfs_read+0x9b/0x160
    SyS_read+0x58/0xb0
    tracesys+0xd4/0xd9
  Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46
  RIP  handle_slab+0x8a/0x180

To fix the problem, I introduce an object status buffer on each slab. With
it, we can track object status precisely, so the slab leak detector no
longer accesses objects that are actually free, and no kernel oops occurs.
The memory overhead of this fix is incurred only with
CONFIG_DEBUG_SLAB_LEAK, which is used mainly for debugging, so the overhead
is not a significant problem.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-by: Dave Jones <davej@redhat.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
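To make the layout concrete: the patch appends one status byte per object
directly after the freelist index array in each slab's management area, and
every freelist-size computation grows accordingly. The sketch below is a
minimal userspace rendering of that arithmetic; the freelist_idx_t typedef,
the ALIGN macro, and the sample geometry are stand-ins for the kernel's own
definitions, not part of the patch.

    #include <stdio.h>
    #include <stddef.h>

    typedef unsigned char freelist_idx_t;  /* stand-in for the kernel typedef */

    /* round x up to the next multiple of a (a power of two), like the kernel's ALIGN() */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

    /* mirrors the patched calculate_freelist_size(): index array, plus one
     * status byte per object when leak debugging is on, then alignment padding */
    static size_t calculate_freelist_size(int nr_objs, size_t align, int debug_slab_leak)
    {
        size_t freelist_size = nr_objs * sizeof(freelist_idx_t);

        if (debug_slab_leak)
            freelist_size += nr_objs * sizeof(char);
        if (align)
            freelist_size = ALIGN(freelist_size, align);
        return freelist_size;
    }

    int main(void)
    {
        int num = 60;      /* hypothetical objects per slab */
        size_t align = 8;  /* hypothetical cache alignment */

        printf("index array only:  %zu bytes\n", calculate_freelist_size(num, align, 0));
        printf("with status bytes: %zu bytes\n", calculate_freelist_size(num, align, 1));
        return 0;
    }

With this example geometry the management area grows from 64 to 120 bytes,
i.e. one extra byte per object plus alignment, which is the overhead the
commit message refers to.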
-rw-r--r--  mm/slab.c | 90
1 file changed, 71 insertions(+), 19 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 9ca3b87edabc..3070b929a1bf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+	int freelist_size;
+	char *status;
+	struct kmem_cache *cachep = page->slab_cache;
+
+	freelist_size = cachep->num * sizeof(freelist_idx_t);
+	status = (char *)page->freelist + freelist_size;
+	status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+	int freelist_size;
+	char *status;
+	struct kmem_cache *cachep = page->slab_cache;
+
+	freelist_size = cachep->num * sizeof(freelist_idx_t);
+	status = (char *)page->freelist + freelist_size;
+
+	return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -576,12 +609,30 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
+static size_t calculate_freelist_size(int nr_objs, size_t align)
+{
+	size_t freelist_size;
+
+	freelist_size = nr_objs * sizeof(freelist_idx_t);
+	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+		freelist_size += nr_objs * sizeof(char);
+
+	if (align)
+		freelist_size = ALIGN(freelist_size, align);
+
+	return freelist_size;
+}
+
 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
 				size_t idx_size, size_t align)
 {
 	int nr_objs;
+	size_t remained_size;
 	size_t freelist_size;
+	int extra_space = 0;
 
+	if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+		extra_space = sizeof(char);
 	/*
 	 * Ignore padding for the initial guess. The padding
 	 * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@ static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
 	 * into the memory allocation when taking the padding
 	 * into account.
 	 */
-	nr_objs = slab_size / (buffer_size + idx_size);
+	nr_objs = slab_size / (buffer_size + idx_size + extra_space);
 
 	/*
 	 * This calculated number will be either the right
 	 * amount, or one greater than what we want.
 	 */
-	freelist_size = slab_size - nr_objs * buffer_size;
-	if (freelist_size < ALIGN(nr_objs * idx_size, align))
+	remained_size = slab_size - nr_objs * buffer_size;
+	freelist_size = calculate_freelist_size(nr_objs, align);
+	if (remained_size < freelist_size)
 		nr_objs--;
 
 	return nr_objs;
@@ -635,7 +687,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 	} else {
 		nr_objs = calculate_nr_objs(slab_size, buffer_size,
 					sizeof(freelist_idx_t), align);
-		mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
+		mgmt_size = calculate_freelist_size(nr_objs, align);
 	}
 	*num = nr_objs;
 	*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2041,13 +2093,16 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 			break;
 
 		if (flags & CFLGS_OFF_SLAB) {
+			size_t freelist_size_per_obj = sizeof(freelist_idx_t);
 			/*
 			 * Max number of objs-per-slab for caches which
 			 * use off-slab slabs. Needed to avoid a possible
 			 * looping condition in cache_grow().
 			 */
+			if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+				freelist_size_per_obj += sizeof(char);
 			offslab_limit = size;
-			offslab_limit /= sizeof(freelist_idx_t);
+			offslab_limit /= freelist_size_per_obj;
 
 			if (num > offslab_limit)
 				break;
@@ -2294,8 +2349,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 	if (!cachep->num)
 		return -E2BIG;
 
-	freelist_size =
-		ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
+	freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -2308,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
 	if (flags & CFLGS_OFF_SLAB) {
 		/* really off slab. No need for manual alignment */
-		freelist_size = cachep->num * sizeof(freelist_idx_t);
+		freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
 	/* If we're going to use the generic kernel_map_pages()
@@ -2612,6 +2666,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		if (cachep->ctor)
 			cachep->ctor(objp);
 #endif
+		set_obj_status(page, i, OBJECT_FREE);
 		set_free_obj(page, i, i);
 	}
 }
@@ -2820,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	BUG_ON(objnr >= cachep->num);
 	BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+	set_obj_status(page, objnr, OBJECT_FREE);
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2953,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 				gfp_t flags, void *objp, unsigned long caller)
 {
+	struct page *page;
+
 	if (!objp)
 		return objp;
 	if (cachep->flags & SLAB_POISON) {
@@ -2983,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
+
+	page = virt_to_head_page(objp);
+	set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
 		cachep->ctor(objp);
@@ -4219,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
 				struct page *page)
 {
 	void *p;
-	int i, j;
+	int i;
 
 	if (n[0] == n[1])
 		return;
 	for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-		bool active = true;
-
-		for (j = page->active; j < c->num; j++) {
-			/* Skip freed item */
-			if (get_free_obj(page, j) == i) {
-				active = false;
-				break;
-			}
-		}
-		if (!active)
+		if (get_obj_status(page, i) != OBJECT_ACTIVE)
 			continue;
 
 		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
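
As a closing illustration, here is a self-contained mock of the mechanism the
diff adds: status bytes living right after the freelist indices, flipped to
OBJECT_ACTIVE on allocation and back to OBJECT_FREE on free, so the leak scan
tests each object in O(1) instead of walking the free list. The mock_slab
structure and its fixed geometry are hypothetical stand-ins for struct page
and the kmem_cache fields, not kernel code.

    #include <stdio.h>

    #define NUM_OBJS 8

    typedef unsigned char freelist_idx_t;

    enum { OBJECT_FREE = 0, OBJECT_ACTIVE = 1 };

    /* mock of the on-slab management area: index array, then status bytes
     * (in the kernel, status sits at page->freelist + num * sizeof(freelist_idx_t)) */
    struct mock_slab {
        freelist_idx_t freelist[NUM_OBJS];
        char status[NUM_OBJS];
        unsigned int active;  /* number of objects handed out */
    };

    static void init_slab(struct mock_slab *s)
    {
        for (int i = 0; i < NUM_OBJS; i++) {
            s->freelist[i] = (freelist_idx_t)i;
            s->status[i] = OBJECT_FREE;  /* mirrors set_obj_status() in cache_init_objs() */
        }
        s->active = 0;
    }

    static int alloc_obj(struct mock_slab *s)
    {
        int idx = s->freelist[s->active++];
        s->status[idx] = OBJECT_ACTIVE;  /* mirrors cache_alloc_debugcheck_after() */
        return idx;
    }

    static void free_obj(struct mock_slab *s, int idx)
    {
        s->status[idx] = OBJECT_FREE;    /* mirrors cache_free_debugcheck() */
        s->freelist[--s->active] = (freelist_idx_t)idx;
    }

    /* the leak scan: reads the status byte directly, never dereferencing
     * objects whose backing pages DEBUG_PAGEALLOC may have unmapped */
    static void scan_slab(const struct mock_slab *s)
    {
        for (int i = 0; i < NUM_OBJS; i++)
            if (s->status[i] == OBJECT_ACTIVE)
                printf("object %d is active\n", i);
    }

    int main(void)
    {
        struct mock_slab s;

        init_slab(&s);
        int a = alloc_obj(&s);
        alloc_obj(&s);   /* second object stays active */
        free_obj(&s, a);
        scan_slab(&s);   /* reports only the second object */
        return 0;
    }

Contrast this with the removed handle_slab() logic above, which had to scan
the tail of the freelist for every object and, worse, could misclassify
objects sitting in the cpu caches as active.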