aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2013-12-17 09:27:08 -0500
committerIngo Molnar <mingo@kernel.org>2013-12-17 09:27:08 -0500
commitbb799d3b980eb803ca2da4a4eefbd9308f8d988a (patch)
tree69fbe0cd6d47b23a50f5e1d87bf7489532fae149 /mm
parent919fc6e34831d1c2b58bfb5ae261dc3facc9b269 (diff)
parent319e2e3f63c348a9b66db4667efa73178e18b17d (diff)
Merge tag 'v3.13-rc4' into core/locking
Merge Linux 3.13-rc4, to refresh this rather old tree with the latest fixes. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c12
-rw-r--r--mm/hugetlb.c51
-rw-r--r--mm/memcontrol.c41
-rw-r--r--mm/memory.c7
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c48
-rw-r--r--mm/shmem.c36
-rw-r--r--mm/slab.c571
-rw-r--r--mm/slub.c45
-rw-r--r--mm/swap.c143
10 files changed, 482 insertions, 474 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bccd5a628ea6..33a5dc492810 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1481 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1481 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1482 VM_BUG_ON(!pmd_none(*new_pmd)); 1482 VM_BUG_ON(!pmd_none(*new_pmd));
1483 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); 1483 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1484 if (new_ptl != old_ptl) 1484 if (new_ptl != old_ptl) {
1485 pgtable_t pgtable;
1486
1487 /*
1488 * Move preallocated PTE page table if new_pmd is on
1489 * different PMD page table.
1490 */
1491 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1492 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1493
1485 spin_unlock(new_ptl); 1494 spin_unlock(new_ptl);
1495 }
1486 spin_unlock(old_ptl); 1496 spin_unlock(old_ptl);
1487 } 1497 }
1488out: 1498out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7d57af21f49e..dee6cf4e6d34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -476,40 +476,6 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
476 return 0; 476 return 0;
477} 477}
478 478
479static void copy_gigantic_page(struct page *dst, struct page *src)
480{
481 int i;
482 struct hstate *h = page_hstate(src);
483 struct page *dst_base = dst;
484 struct page *src_base = src;
485
486 for (i = 0; i < pages_per_huge_page(h); ) {
487 cond_resched();
488 copy_highpage(dst, src);
489
490 i++;
491 dst = mem_map_next(dst, dst_base, i);
492 src = mem_map_next(src, src_base, i);
493 }
494}
495
496void copy_huge_page(struct page *dst, struct page *src)
497{
498 int i;
499 struct hstate *h = page_hstate(src);
500
501 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
502 copy_gigantic_page(dst, src);
503 return;
504 }
505
506 might_sleep();
507 for (i = 0; i < pages_per_huge_page(h); i++) {
508 cond_resched();
509 copy_highpage(dst + i, src + i);
510 }
511}
512
513static void enqueue_huge_page(struct hstate *h, struct page *page) 479static void enqueue_huge_page(struct hstate *h, struct page *page)
514{ 480{
515 int nid = page_to_nid(page); 481 int nid = page_to_nid(page);
@@ -736,6 +702,23 @@ int PageHuge(struct page *page)
736} 702}
737EXPORT_SYMBOL_GPL(PageHuge); 703EXPORT_SYMBOL_GPL(PageHuge);
738 704
705/*
706 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
707 * normal or transparent huge pages.
708 */
709int PageHeadHuge(struct page *page_head)
710{
711 compound_page_dtor *dtor;
712
713 if (!PageHead(page_head))
714 return 0;
715
716 dtor = get_compound_page_dtor(page_head);
717
718 return dtor == free_huge_page;
719}
720EXPORT_SYMBOL_GPL(PageHeadHuge);
721
739pgoff_t __basepage_index(struct page *page) 722pgoff_t __basepage_index(struct page *page)
740{ 723{
741 struct page *page_head = compound_head(page); 724 struct page *page_head = compound_head(page);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1a0ae6e11b8..bf5e89457149 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2694,7 +2694,10 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2694 goto bypass; 2694 goto bypass;
2695 2695
2696 if (unlikely(task_in_memcg_oom(current))) 2696 if (unlikely(task_in_memcg_oom(current)))
2697 goto bypass; 2697 goto nomem;
2698
2699 if (gfp_mask & __GFP_NOFAIL)
2700 oom = false;
2698 2701
2699 /* 2702 /*
2700 * We always charge the cgroup the mm_struct belongs to. 2703 * We always charge the cgroup the mm_struct belongs to.
@@ -6352,6 +6355,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6352static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 6355static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6353{ 6356{
6354 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6357 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6358 /*
6359 * XXX: css_offline() would be where we should reparent all
6360 * memory to prepare the cgroup for destruction. However,
6361 * memcg does not do css_tryget() and res_counter charging
6362 * under the same RCU lock region, which means that charging
6363 * could race with offlining. Offlining only happens to
6364 * cgroups with no tasks in them but charges can show up
6365 * without any tasks from the swapin path when the target
6366 * memcg is looked up from the swapout record and not from the
6367 * current task as it usually is. A race like this can leak
6368 * charges and put pages with stale cgroup pointers into
6369 * circulation:
6370 *
6371 * #0 #1
6372 * lookup_swap_cgroup_id()
6373 * rcu_read_lock()
6374 * mem_cgroup_lookup()
6375 * css_tryget()
6376 * rcu_read_unlock()
6377 * disable css_tryget()
6378 * call_rcu()
6379 * offline_css()
6380 * reparent_charges()
6381 * res_counter_charge()
6382 * css_put()
6383 * css_free()
6384 * pc->mem_cgroup = dead memcg
6385 * add page to lru
6386 *
6387 * The bulk of the charges are still moved in offline_css() to
6388 * avoid pinning a lot of pages in case a long-term reference
6389 * like a swapout record is deferring the css_free() to long
6390 * after offlining. But this makes sure we catch any charges
6391 * made after offlining:
6392 */
6393 mem_cgroup_reparent_charges(memcg);
6355 6394
6356 memcg_destroy_kmem(memcg); 6395 memcg_destroy_kmem(memcg);
6357 __mem_cgroup_free(memcg); 6396 __mem_cgroup_free(memcg);
diff --git a/mm/memory.c b/mm/memory.c
index 0409e8f43fa0..5d9025f3b3e1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4272,13 +4272,6 @@ void copy_user_huge_page(struct page *dst, struct page *src,
4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4273 4273
4274#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS 4274#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
4275static struct kmem_cache *page_ptl_cachep;
4276void __init ptlock_cache_init(void)
4277{
4278 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4279 SLAB_PANIC, NULL);
4280}
4281
4282bool ptlock_alloc(struct page *page) 4275bool ptlock_alloc(struct page *page)
4283{ 4276{
4284 spinlock_t *ptl; 4277 spinlock_t *ptl;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c4403cdf3433..eca4a3129129 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2950,7 +2950,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2950 return; 2950 return;
2951 } 2951 }
2952 2952
2953 p += snprintf(p, maxlen, policy_modes[mode]); 2953 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2954 2954
2955 if (flags & MPOL_MODE_FLAGS) { 2955 if (flags & MPOL_MODE_FLAGS) {
2956 p += snprintf(p, buffer + maxlen - p, "="); 2956 p += snprintf(p, buffer + maxlen - p, "=");
diff --git a/mm/migrate.c b/mm/migrate.c
index 316e720a2023..bb940045fe85 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -442,6 +442,54 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
442} 442}
443 443
444/* 444/*
445 * Gigantic pages are so large that we do not guarantee that page++ pointer
446 * arithmetic will work across the entire page. We need something more
447 * specialized.
448 */
449static void __copy_gigantic_page(struct page *dst, struct page *src,
450 int nr_pages)
451{
452 int i;
453 struct page *dst_base = dst;
454 struct page *src_base = src;
455
456 for (i = 0; i < nr_pages; ) {
457 cond_resched();
458 copy_highpage(dst, src);
459
460 i++;
461 dst = mem_map_next(dst, dst_base, i);
462 src = mem_map_next(src, src_base, i);
463 }
464}
465
466static void copy_huge_page(struct page *dst, struct page *src)
467{
468 int i;
469 int nr_pages;
470
471 if (PageHuge(src)) {
472 /* hugetlbfs page */
473 struct hstate *h = page_hstate(src);
474 nr_pages = pages_per_huge_page(h);
475
476 if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
477 __copy_gigantic_page(dst, src, nr_pages);
478 return;
479 }
480 } else {
481 /* thp page */
482 BUG_ON(!PageTransHuge(src));
483 nr_pages = hpage_nr_pages(src);
484 }
485
486 for (i = 0; i < nr_pages; i++) {
487 cond_resched();
488 copy_highpage(dst + i, src + i);
489 }
490}
491
492/*
445 * Copy the page to its new location 493 * Copy the page to its new location
446 */ 494 */
447void migrate_page_copy(struct page *newpage, struct page *page) 495void migrate_page_copy(struct page *newpage, struct page *page)
diff --git a/mm/shmem.c b/mm/shmem.c
index 8297623fcaed..902a14842b74 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2918,13 +2918,8 @@ static struct dentry_operations anon_ops = {
2918 .d_dname = simple_dname 2918 .d_dname = simple_dname
2919}; 2919};
2920 2920
2921/** 2921static struct file *__shmem_file_setup(const char *name, loff_t size,
2922 * shmem_file_setup - get an unlinked file living in tmpfs 2922 unsigned long flags, unsigned int i_flags)
2923 * @name: name for dentry (to be seen in /proc/<pid>/maps
2924 * @size: size to be set for the file
2925 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2926 */
2927struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2928{ 2923{
2929 struct file *res; 2924 struct file *res;
2930 struct inode *inode; 2925 struct inode *inode;
@@ -2957,6 +2952,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2957 if (!inode) 2952 if (!inode)
2958 goto put_dentry; 2953 goto put_dentry;
2959 2954
2955 inode->i_flags |= i_flags;
2960 d_instantiate(path.dentry, inode); 2956 d_instantiate(path.dentry, inode);
2961 inode->i_size = size; 2957 inode->i_size = size;
2962 clear_nlink(inode); /* It is unlinked */ 2958 clear_nlink(inode); /* It is unlinked */
@@ -2977,6 +2973,32 @@ put_memory:
2977 shmem_unacct_size(flags, size); 2973 shmem_unacct_size(flags, size);
2978 return res; 2974 return res;
2979} 2975}
2976
2977/**
2978 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
2979 * kernel internal. There will be NO LSM permission checks against the
2980 * underlying inode. So users of this interface must do LSM checks at a
2981 * higher layer. The one user is the big_key implementation. LSM checks
2982 * are provided at the key level rather than the inode level.
2983 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2984 * @size: size to be set for the file
2985 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2986 */
2987struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
2988{
2989 return __shmem_file_setup(name, size, flags, S_PRIVATE);
2990}
2991
2992/**
2993 * shmem_file_setup - get an unlinked file living in tmpfs
2995 * @name: name for dentry (to be seen in /proc/<pid>/maps)
2995 * @size: size to be set for the file
2996 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2997 */
2998struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2999{
3000 return __shmem_file_setup(name, size, flags, 0);
3001}
2980EXPORT_SYMBOL_GPL(shmem_file_setup); 3002EXPORT_SYMBOL_GPL(shmem_file_setup);
2981 3003
2982/** 3004/**
diff --git a/mm/slab.c b/mm/slab.c
index 0c8967bb2018..eb043bf05f4c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -164,72 +164,6 @@
164static bool pfmemalloc_active __read_mostly; 164static bool pfmemalloc_active __read_mostly;
165 165
166/* 166/*
167 * kmem_bufctl_t:
168 *
169 * Bufctl's are used for linking objs within a slab
170 * linked offsets.
171 *
172 * This implementation relies on "struct page" for locating the cache &
173 * slab an object belongs to.
174 * This allows the bufctl structure to be small (one int), but limits
175 * the number of objects a slab (not a cache) can contain when off-slab
176 * bufctls are used. The limit is the size of the largest general cache
177 * that does not use off-slab slabs.
178 * For 32bit archs with 4 kB pages, is this 56.
179 * This is not serious, as it is only for large objects, when it is unwise
180 * to have too many per slab.
181 * Note: This limit can be raised by introducing a general cache whose size
182 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
183 */
184
185typedef unsigned int kmem_bufctl_t;
186#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
187#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
188#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
189#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
190
191/*
192 * struct slab_rcu
193 *
194 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
195 * arrange for kmem_freepages to be called via RCU. This is useful if
196 * we need to approach a kernel structure obliquely, from its address
197 * obtained without the usual locking. We can lock the structure to
198 * stabilize it and check it's still at the given address, only if we
199 * can be sure that the memory has not been meanwhile reused for some
200 * other kind of object (which our subsystem's lock might corrupt).
201 *
202 * rcu_read_lock before reading the address, then rcu_read_unlock after
203 * taking the spinlock within the structure expected at that address.
204 */
205struct slab_rcu {
206 struct rcu_head head;
207 struct kmem_cache *cachep;
208 void *addr;
209};
210
211/*
212 * struct slab
213 *
214 * Manages the objs in a slab. Placed either at the beginning of mem allocated
215 * for a slab, or allocated from an general cache.
216 * Slabs are chained into three list: fully used, partial, fully free slabs.
217 */
218struct slab {
219 union {
220 struct {
221 struct list_head list;
222 unsigned long colouroff;
223 void *s_mem; /* including colour offset */
224 unsigned int inuse; /* num of objs active in slab */
225 kmem_bufctl_t free;
226 unsigned short nodeid;
227 };
228 struct slab_rcu __slab_cover_slab_rcu;
229 };
230};
231
232/*
233 * struct array_cache 167 * struct array_cache
234 * 168 *
235 * Purpose: 169 * Purpose:
@@ -456,18 +390,10 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
456 return page->slab_cache; 390 return page->slab_cache;
457} 391}
458 392
459static inline struct slab *virt_to_slab(const void *obj) 393static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
460{
461 struct page *page = virt_to_head_page(obj);
462
463 VM_BUG_ON(!PageSlab(page));
464 return page->slab_page;
465}
466
467static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
468 unsigned int idx) 394 unsigned int idx)
469{ 395{
470 return slab->s_mem + cache->size * idx; 396 return page->s_mem + cache->size * idx;
471} 397}
472 398
473/* 399/*
@@ -477,9 +403,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
477 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 403 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
478 */ 404 */
479static inline unsigned int obj_to_index(const struct kmem_cache *cache, 405static inline unsigned int obj_to_index(const struct kmem_cache *cache,
480 const struct slab *slab, void *obj) 406 const struct page *page, void *obj)
481{ 407{
482 u32 offset = (obj - slab->s_mem); 408 u32 offset = (obj - page->s_mem);
483 return reciprocal_divide(offset, cache->reciprocal_buffer_size); 409 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
484} 410}
485 411
@@ -641,7 +567,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
641 567
642static size_t slab_mgmt_size(size_t nr_objs, size_t align) 568static size_t slab_mgmt_size(size_t nr_objs, size_t align)
643{ 569{
644 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 570 return ALIGN(nr_objs * sizeof(unsigned int), align);
645} 571}
646 572
647/* 573/*
@@ -660,8 +586,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
660 * on it. For the latter case, the memory allocated for a 586 * on it. For the latter case, the memory allocated for a
661 * slab is used for: 587 * slab is used for:
662 * 588 *
663 * - The struct slab 589 * - One unsigned int for each object
664 * - One kmem_bufctl_t for each object
665 * - Padding to respect alignment of @align 590 * - Padding to respect alignment of @align
666 * - @buffer_size bytes for each object 591 * - @buffer_size bytes for each object
667 * 592 *
@@ -674,8 +599,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
674 mgmt_size = 0; 599 mgmt_size = 0;
675 nr_objs = slab_size / buffer_size; 600 nr_objs = slab_size / buffer_size;
676 601
677 if (nr_objs > SLAB_LIMIT)
678 nr_objs = SLAB_LIMIT;
679 } else { 602 } else {
680 /* 603 /*
681 * Ignore padding for the initial guess. The padding 604 * Ignore padding for the initial guess. The padding
@@ -685,8 +608,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
685 * into the memory allocation when taking the padding 608 * into the memory allocation when taking the padding
686 * into account. 609 * into account.
687 */ 610 */
688 nr_objs = (slab_size - sizeof(struct slab)) / 611 nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
689 (buffer_size + sizeof(kmem_bufctl_t));
690 612
691 /* 613 /*
692 * This calculated number will be either the right 614 * This calculated number will be either the right
@@ -696,9 +618,6 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
696 > slab_size) 618 > slab_size)
697 nr_objs--; 619 nr_objs--;
698 620
699 if (nr_objs > SLAB_LIMIT)
700 nr_objs = SLAB_LIMIT;
701
702 mgmt_size = slab_mgmt_size(nr_objs, align); 621 mgmt_size = slab_mgmt_size(nr_objs, align);
703 } 622 }
704 *num = nr_objs; 623 *num = nr_objs;
@@ -829,10 +748,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
829 return nc; 748 return nc;
830} 749}
831 750
832static inline bool is_slab_pfmemalloc(struct slab *slabp) 751static inline bool is_slab_pfmemalloc(struct page *page)
833{ 752{
834 struct page *page = virt_to_page(slabp->s_mem);
835
836 return PageSlabPfmemalloc(page); 753 return PageSlabPfmemalloc(page);
837} 754}
838 755
@@ -841,23 +758,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
841 struct array_cache *ac) 758 struct array_cache *ac)
842{ 759{
843 struct kmem_cache_node *n = cachep->node[numa_mem_id()]; 760 struct kmem_cache_node *n = cachep->node[numa_mem_id()];
844 struct slab *slabp; 761 struct page *page;
845 unsigned long flags; 762 unsigned long flags;
846 763
847 if (!pfmemalloc_active) 764 if (!pfmemalloc_active)
848 return; 765 return;
849 766
850 spin_lock_irqsave(&n->list_lock, flags); 767 spin_lock_irqsave(&n->list_lock, flags);
851 list_for_each_entry(slabp, &n->slabs_full, list) 768 list_for_each_entry(page, &n->slabs_full, lru)
852 if (is_slab_pfmemalloc(slabp)) 769 if (is_slab_pfmemalloc(page))
853 goto out; 770 goto out;
854 771
855 list_for_each_entry(slabp, &n->slabs_partial, list) 772 list_for_each_entry(page, &n->slabs_partial, lru)
856 if (is_slab_pfmemalloc(slabp)) 773 if (is_slab_pfmemalloc(page))
857 goto out; 774 goto out;
858 775
859 list_for_each_entry(slabp, &n->slabs_free, list) 776 list_for_each_entry(page, &n->slabs_free, lru)
860 if (is_slab_pfmemalloc(slabp)) 777 if (is_slab_pfmemalloc(page))
861 goto out; 778 goto out;
862 779
863 pfmemalloc_active = false; 780 pfmemalloc_active = false;
@@ -897,8 +814,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
897 */ 814 */
898 n = cachep->node[numa_mem_id()]; 815 n = cachep->node[numa_mem_id()];
899 if (!list_empty(&n->slabs_free) && force_refill) { 816 if (!list_empty(&n->slabs_free) && force_refill) {
900 struct slab *slabp = virt_to_slab(objp); 817 struct page *page = virt_to_head_page(objp);
901 ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); 818 ClearPageSlabPfmemalloc(page);
902 clear_obj_pfmemalloc(&objp); 819 clear_obj_pfmemalloc(&objp);
903 recheck_pfmemalloc_active(cachep, ac); 820 recheck_pfmemalloc_active(cachep, ac);
904 return objp; 821 return objp;
@@ -1099,8 +1016,7 @@ static void drain_alien_cache(struct kmem_cache *cachep,
1099 1016
1100static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1017static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1101{ 1018{
1102 struct slab *slabp = virt_to_slab(objp); 1019 int nodeid = page_to_nid(virt_to_page(objp));
1103 int nodeid = slabp->nodeid;
1104 struct kmem_cache_node *n; 1020 struct kmem_cache_node *n;
1105 struct array_cache *alien = NULL; 1021 struct array_cache *alien = NULL;
1106 int node; 1022 int node;
@@ -1111,7 +1027,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1111 * Make sure we are not freeing a object from another node to the array 1027 * Make sure we are not freeing a object from another node to the array
1112 * cache on this cpu. 1028 * cache on this cpu.
1113 */ 1029 */
1114 if (likely(slabp->nodeid == node)) 1030 if (likely(nodeid == node))
1115 return 0; 1031 return 0;
1116 1032
1117 n = cachep->node[node]; 1033 n = cachep->node[node];
@@ -1512,6 +1428,8 @@ void __init kmem_cache_init(void)
1512{ 1428{
1513 int i; 1429 int i;
1514 1430
1431 BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
1432 sizeof(struct rcu_head));
1515 kmem_cache = &kmem_cache_boot; 1433 kmem_cache = &kmem_cache_boot;
1516 setup_node_pointer(kmem_cache); 1434 setup_node_pointer(kmem_cache);
1517 1435
@@ -1687,7 +1605,7 @@ static noinline void
1687slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1605slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1688{ 1606{
1689 struct kmem_cache_node *n; 1607 struct kmem_cache_node *n;
1690 struct slab *slabp; 1608 struct page *page;
1691 unsigned long flags; 1609 unsigned long flags;
1692 int node; 1610 int node;
1693 1611
@@ -1706,15 +1624,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1706 continue; 1624 continue;
1707 1625
1708 spin_lock_irqsave(&n->list_lock, flags); 1626 spin_lock_irqsave(&n->list_lock, flags);
1709 list_for_each_entry(slabp, &n->slabs_full, list) { 1627 list_for_each_entry(page, &n->slabs_full, lru) {
1710 active_objs += cachep->num; 1628 active_objs += cachep->num;
1711 active_slabs++; 1629 active_slabs++;
1712 } 1630 }
1713 list_for_each_entry(slabp, &n->slabs_partial, list) { 1631 list_for_each_entry(page, &n->slabs_partial, lru) {
1714 active_objs += slabp->inuse; 1632 active_objs += page->active;
1715 active_slabs++; 1633 active_slabs++;
1716 } 1634 }
1717 list_for_each_entry(slabp, &n->slabs_free, list) 1635 list_for_each_entry(page, &n->slabs_free, lru)
1718 num_slabs++; 1636 num_slabs++;
1719 1637
1720 free_objects += n->free_objects; 1638 free_objects += n->free_objects;
@@ -1736,19 +1654,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1736 * did not request dmaable memory, we might get it, but that 1654 * did not request dmaable memory, we might get it, but that
1737 * would be relatively rare and ignorable. 1655 * would be relatively rare and ignorable.
1738 */ 1656 */
1739static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1657static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
1658 int nodeid)
1740{ 1659{
1741 struct page *page; 1660 struct page *page;
1742 int nr_pages; 1661 int nr_pages;
1743 int i;
1744
1745#ifndef CONFIG_MMU
1746 /*
1747 * Nommu uses slab's for process anonymous memory allocations, and thus
1748 * requires __GFP_COMP to properly refcount higher order allocations
1749 */
1750 flags |= __GFP_COMP;
1751#endif
1752 1662
1753 flags |= cachep->allocflags; 1663 flags |= cachep->allocflags;
1754 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1664 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1772,12 +1682,9 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1772 else 1682 else
1773 add_zone_page_state(page_zone(page), 1683 add_zone_page_state(page_zone(page),
1774 NR_SLAB_UNRECLAIMABLE, nr_pages); 1684 NR_SLAB_UNRECLAIMABLE, nr_pages);
1775 for (i = 0; i < nr_pages; i++) { 1685 __SetPageSlab(page);
1776 __SetPageSlab(page + i); 1686 if (page->pfmemalloc)
1777 1687 SetPageSlabPfmemalloc(page);
1778 if (page->pfmemalloc)
1779 SetPageSlabPfmemalloc(page + i);
1780 }
1781 memcg_bind_pages(cachep, cachep->gfporder); 1688 memcg_bind_pages(cachep, cachep->gfporder);
1782 1689
1783 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1690 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1789,17 +1696,15 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1789 kmemcheck_mark_unallocated_pages(page, nr_pages); 1696 kmemcheck_mark_unallocated_pages(page, nr_pages);
1790 } 1697 }
1791 1698
1792 return page_address(page); 1699 return page;
1793} 1700}
1794 1701
1795/* 1702/*
1796 * Interface to system's page release. 1703 * Interface to system's page release.
1797 */ 1704 */
1798static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1705static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
1799{ 1706{
1800 unsigned long i = (1 << cachep->gfporder); 1707 const unsigned long nr_freed = (1 << cachep->gfporder);
1801 struct page *page = virt_to_page(addr);
1802 const unsigned long nr_freed = i;
1803 1708
1804 kmemcheck_free_shadow(page, cachep->gfporder); 1709 kmemcheck_free_shadow(page, cachep->gfporder);
1805 1710
@@ -1809,27 +1714,28 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1809 else 1714 else
1810 sub_zone_page_state(page_zone(page), 1715 sub_zone_page_state(page_zone(page),
1811 NR_SLAB_UNRECLAIMABLE, nr_freed); 1716 NR_SLAB_UNRECLAIMABLE, nr_freed);
1812 while (i--) { 1717
1813 BUG_ON(!PageSlab(page)); 1718 BUG_ON(!PageSlab(page));
1814 __ClearPageSlabPfmemalloc(page); 1719 __ClearPageSlabPfmemalloc(page);
1815 __ClearPageSlab(page); 1720 __ClearPageSlab(page);
1816 page++; 1721 page_mapcount_reset(page);
1817 } 1722 page->mapping = NULL;
1818 1723
1819 memcg_release_pages(cachep, cachep->gfporder); 1724 memcg_release_pages(cachep, cachep->gfporder);
1820 if (current->reclaim_state) 1725 if (current->reclaim_state)
1821 current->reclaim_state->reclaimed_slab += nr_freed; 1726 current->reclaim_state->reclaimed_slab += nr_freed;
1822 free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); 1727 __free_memcg_kmem_pages(page, cachep->gfporder);
1823} 1728}
1824 1729
1825static void kmem_rcu_free(struct rcu_head *head) 1730static void kmem_rcu_free(struct rcu_head *head)
1826{ 1731{
1827 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1732 struct kmem_cache *cachep;
1828 struct kmem_cache *cachep = slab_rcu->cachep; 1733 struct page *page;
1829 1734
1830 kmem_freepages(cachep, slab_rcu->addr); 1735 page = container_of(head, struct page, rcu_head);
1831 if (OFF_SLAB(cachep)) 1736 cachep = page->slab_cache;
1832 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1737
1738 kmem_freepages(cachep, page);
1833} 1739}
1834 1740
1835#if DEBUG 1741#if DEBUG
@@ -1978,19 +1884,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1978 /* Print some data about the neighboring objects, if they 1884 /* Print some data about the neighboring objects, if they
1979 * exist: 1885 * exist:
1980 */ 1886 */
1981 struct slab *slabp = virt_to_slab(objp); 1887 struct page *page = virt_to_head_page(objp);
1982 unsigned int objnr; 1888 unsigned int objnr;
1983 1889
1984 objnr = obj_to_index(cachep, slabp, objp); 1890 objnr = obj_to_index(cachep, page, objp);
1985 if (objnr) { 1891 if (objnr) {
1986 objp = index_to_obj(cachep, slabp, objnr - 1); 1892 objp = index_to_obj(cachep, page, objnr - 1);
1987 realobj = (char *)objp + obj_offset(cachep); 1893 realobj = (char *)objp + obj_offset(cachep);
1988 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1894 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1989 realobj, size); 1895 realobj, size);
1990 print_objinfo(cachep, objp, 2); 1896 print_objinfo(cachep, objp, 2);
1991 } 1897 }
1992 if (objnr + 1 < cachep->num) { 1898 if (objnr + 1 < cachep->num) {
1993 objp = index_to_obj(cachep, slabp, objnr + 1); 1899 objp = index_to_obj(cachep, page, objnr + 1);
1994 realobj = (char *)objp + obj_offset(cachep); 1900 realobj = (char *)objp + obj_offset(cachep);
1995 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1901 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1996 realobj, size); 1902 realobj, size);
@@ -2001,11 +1907,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
2001#endif 1907#endif
2002 1908
2003#if DEBUG 1909#if DEBUG
2004static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 1910static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1911 struct page *page)
2005{ 1912{
2006 int i; 1913 int i;
2007 for (i = 0; i < cachep->num; i++) { 1914 for (i = 0; i < cachep->num; i++) {
2008 void *objp = index_to_obj(cachep, slabp, i); 1915 void *objp = index_to_obj(cachep, page, i);
2009 1916
2010 if (cachep->flags & SLAB_POISON) { 1917 if (cachep->flags & SLAB_POISON) {
2011#ifdef CONFIG_DEBUG_PAGEALLOC 1918#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -2030,7 +1937,8 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2030 } 1937 }
2031} 1938}
2032#else 1939#else
2033static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) 1940static void slab_destroy_debugcheck(struct kmem_cache *cachep,
1941 struct page *page)
2034{ 1942{
2035} 1943}
2036#endif 1944#endif
@@ -2044,23 +1952,34 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2044 * Before calling the slab must have been unlinked from the cache. The 1952 * Before calling the slab must have been unlinked from the cache. The
2045 * cache-lock is not held/needed. 1953 * cache-lock is not held/needed.
2046 */ 1954 */
2047static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1955static void slab_destroy(struct kmem_cache *cachep, struct page *page)
2048{ 1956{
2049 void *addr = slabp->s_mem - slabp->colouroff; 1957 void *freelist;
2050 1958
2051 slab_destroy_debugcheck(cachep, slabp); 1959 freelist = page->freelist;
1960 slab_destroy_debugcheck(cachep, page);
2052 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1961 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
2053 struct slab_rcu *slab_rcu; 1962 struct rcu_head *head;
1963
1964 /*
1965 * RCU free overloads the RCU head over the LRU.
1966 * slab_page has been overloeaded over the LRU,
1967 * however it is not used from now on so that
1968 * we can use it safely.
1969 */
1970 head = (void *)&page->rcu_head;
1971 call_rcu(head, kmem_rcu_free);
2054 1972
2055 slab_rcu = (struct slab_rcu *)slabp;
2056 slab_rcu->cachep = cachep;
2057 slab_rcu->addr = addr;
2058 call_rcu(&slab_rcu->head, kmem_rcu_free);
2059 } else { 1973 } else {
2060 kmem_freepages(cachep, addr); 1974 kmem_freepages(cachep, page);
2061 if (OFF_SLAB(cachep))
2062 kmem_cache_free(cachep->slabp_cache, slabp);
2063 } 1975 }
1976
1977 /*
1978 * From now on, we don't use freelist
1979 * although actual page can be freed in rcu context
1980 */
1981 if (OFF_SLAB(cachep))
1982 kmem_cache_free(cachep->freelist_cache, freelist);
2064} 1983}
2065 1984
2066/** 1985/**
@@ -2097,8 +2016,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2097 * use off-slab slabs. Needed to avoid a possible 2016 * use off-slab slabs. Needed to avoid a possible
2098 * looping condition in cache_grow(). 2017 * looping condition in cache_grow().
2099 */ 2018 */
2100 offslab_limit = size - sizeof(struct slab); 2019 offslab_limit = size;
2101 offslab_limit /= sizeof(kmem_bufctl_t); 2020 offslab_limit /= sizeof(unsigned int);
2102 2021
2103 if (num > offslab_limit) 2022 if (num > offslab_limit)
2104 break; 2023 break;
@@ -2220,7 +2139,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2220int 2139int
2221__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) 2140__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2222{ 2141{
2223 size_t left_over, slab_size, ralign; 2142 size_t left_over, freelist_size, ralign;
2224 gfp_t gfp; 2143 gfp_t gfp;
2225 int err; 2144 int err;
2226 size_t size = cachep->size; 2145 size_t size = cachep->size;
@@ -2339,22 +2258,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2339 if (!cachep->num) 2258 if (!cachep->num)
2340 return -E2BIG; 2259 return -E2BIG;
2341 2260
2342 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2261 freelist_size =
2343 + sizeof(struct slab), cachep->align); 2262 ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
2344 2263
2345 /* 2264 /*
2346 * If the slab has been placed off-slab, and we have enough space then 2265 * If the slab has been placed off-slab, and we have enough space then
2347 * move it on-slab. This is at the expense of any extra colouring. 2266 * move it on-slab. This is at the expense of any extra colouring.
2348 */ 2267 */
2349 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2268 if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
2350 flags &= ~CFLGS_OFF_SLAB; 2269 flags &= ~CFLGS_OFF_SLAB;
2351 left_over -= slab_size; 2270 left_over -= freelist_size;
2352 } 2271 }
2353 2272
2354 if (flags & CFLGS_OFF_SLAB) { 2273 if (flags & CFLGS_OFF_SLAB) {
2355 /* really off slab. No need for manual alignment */ 2274 /* really off slab. No need for manual alignment */
2356 slab_size = 2275 freelist_size = cachep->num * sizeof(unsigned int);
2357 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2358 2276
2359#ifdef CONFIG_PAGE_POISONING 2277#ifdef CONFIG_PAGE_POISONING
2360 /* If we're going to use the generic kernel_map_pages() 2278 /* If we're going to use the generic kernel_map_pages()
@@ -2371,16 +2289,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2371 if (cachep->colour_off < cachep->align) 2289 if (cachep->colour_off < cachep->align)
2372 cachep->colour_off = cachep->align; 2290 cachep->colour_off = cachep->align;
2373 cachep->colour = left_over / cachep->colour_off; 2291 cachep->colour = left_over / cachep->colour_off;
2374 cachep->slab_size = slab_size; 2292 cachep->freelist_size = freelist_size;
2375 cachep->flags = flags; 2293 cachep->flags = flags;
2376 cachep->allocflags = 0; 2294 cachep->allocflags = __GFP_COMP;
2377 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2295 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2378 cachep->allocflags |= GFP_DMA; 2296 cachep->allocflags |= GFP_DMA;
2379 cachep->size = size; 2297 cachep->size = size;
2380 cachep->reciprocal_buffer_size = reciprocal_value(size); 2298 cachep->reciprocal_buffer_size = reciprocal_value(size);
2381 2299
2382 if (flags & CFLGS_OFF_SLAB) { 2300 if (flags & CFLGS_OFF_SLAB) {
2383 cachep->slabp_cache = kmalloc_slab(slab_size, 0u); 2301 cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
2384 /* 2302 /*
2385 * This is a possibility for one of the malloc_sizes caches. 2303 * This is a possibility for one of the malloc_sizes caches.
2386 * But since we go off slab only for object size greater than 2304 * But since we go off slab only for object size greater than
@@ -2388,7 +2306,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2388 * this should not happen at all. 2306 * this should not happen at all.
2389 * But leave a BUG_ON for some lucky dude. 2307 * But leave a BUG_ON for some lucky dude.
2390 */ 2308 */
2391 BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); 2309 BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
2392 } 2310 }
2393 2311
2394 err = setup_cpu_cache(cachep, gfp); 2312 err = setup_cpu_cache(cachep, gfp);
@@ -2494,7 +2412,7 @@ static int drain_freelist(struct kmem_cache *cache,
2494{ 2412{
2495 struct list_head *p; 2413 struct list_head *p;
2496 int nr_freed; 2414 int nr_freed;
2497 struct slab *slabp; 2415 struct page *page;
2498 2416
2499 nr_freed = 0; 2417 nr_freed = 0;
2500 while (nr_freed < tofree && !list_empty(&n->slabs_free)) { 2418 while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
@@ -2506,18 +2424,18 @@ static int drain_freelist(struct kmem_cache *cache,
2506 goto out; 2424 goto out;
2507 } 2425 }
2508 2426
2509 slabp = list_entry(p, struct slab, list); 2427 page = list_entry(p, struct page, lru);
2510#if DEBUG 2428#if DEBUG
2511 BUG_ON(slabp->inuse); 2429 BUG_ON(page->active);
2512#endif 2430#endif
2513 list_del(&slabp->list); 2431 list_del(&page->lru);
2514 /* 2432 /*
2515 * Safe to drop the lock. The slab is no longer linked 2433 * Safe to drop the lock. The slab is no longer linked
2516 * to the cache. 2434 * to the cache.
2517 */ 2435 */
2518 n->free_objects -= cache->num; 2436 n->free_objects -= cache->num;
2519 spin_unlock_irq(&n->list_lock); 2437 spin_unlock_irq(&n->list_lock);
2520 slab_destroy(cache, slabp); 2438 slab_destroy(cache, page);
2521 nr_freed++; 2439 nr_freed++;
2522 } 2440 }
2523out: 2441out:
@@ -2600,52 +2518,42 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2600 * descriptors in kmem_cache_create, we search through the malloc_sizes array. 2518 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2601 * If we are creating a malloc_sizes cache here it would not be visible to 2519 * If we are creating a malloc_sizes cache here it would not be visible to
2602 * kmem_find_general_cachep till the initialization is complete. 2520 * kmem_find_general_cachep till the initialization is complete.
2603 * Hence we cannot have slabp_cache same as the original cache. 2521 * Hence we cannot have freelist_cache same as the original cache.
2604 */ 2522 */
2605static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2523static void *alloc_slabmgmt(struct kmem_cache *cachep,
2606 int colour_off, gfp_t local_flags, 2524 struct page *page, int colour_off,
2607 int nodeid) 2525 gfp_t local_flags, int nodeid)
2608{ 2526{
2609 struct slab *slabp; 2527 void *freelist;
2528 void *addr = page_address(page);
2610 2529
2611 if (OFF_SLAB(cachep)) { 2530 if (OFF_SLAB(cachep)) {
2612 /* Slab management obj is off-slab. */ 2531 /* Slab management obj is off-slab. */
2613 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2532 freelist = kmem_cache_alloc_node(cachep->freelist_cache,
2614 local_flags, nodeid); 2533 local_flags, nodeid);
2615 /* 2534 if (!freelist)
2616 * If the first object in the slab is leaked (it's allocated
2617 * but no one has a reference to it), we want to make sure
2618 * kmemleak does not treat the ->s_mem pointer as a reference
2619 * to the object. Otherwise we will not report the leak.
2620 */
2621 kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2622 local_flags);
2623 if (!slabp)
2624 return NULL; 2535 return NULL;
2625 } else { 2536 } else {
2626 slabp = objp + colour_off; 2537 freelist = addr + colour_off;
2627 colour_off += cachep->slab_size; 2538 colour_off += cachep->freelist_size;
2628 } 2539 }
2629 slabp->inuse = 0; 2540 page->active = 0;
2630 slabp->colouroff = colour_off; 2541 page->s_mem = addr + colour_off;
2631 slabp->s_mem = objp + colour_off; 2542 return freelist;
2632 slabp->nodeid = nodeid;
2633 slabp->free = 0;
2634 return slabp;
2635} 2543}
2636 2544
2637static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2545static inline unsigned int *slab_freelist(struct page *page)
2638{ 2546{
2639 return (kmem_bufctl_t *) (slabp + 1); 2547 return (unsigned int *)(page->freelist);
2640} 2548}
2641 2549
2642static void cache_init_objs(struct kmem_cache *cachep, 2550static void cache_init_objs(struct kmem_cache *cachep,
2643 struct slab *slabp) 2551 struct page *page)
2644{ 2552{
2645 int i; 2553 int i;
2646 2554
2647 for (i = 0; i < cachep->num; i++) { 2555 for (i = 0; i < cachep->num; i++) {
2648 void *objp = index_to_obj(cachep, slabp, i); 2556 void *objp = index_to_obj(cachep, page, i);
2649#if DEBUG 2557#if DEBUG
2650 /* need to poison the objs? */ 2558 /* need to poison the objs? */
2651 if (cachep->flags & SLAB_POISON) 2559 if (cachep->flags & SLAB_POISON)
@@ -2681,9 +2589,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
2681 if (cachep->ctor) 2589 if (cachep->ctor)
2682 cachep->ctor(objp); 2590 cachep->ctor(objp);
2683#endif 2591#endif
2684 slab_bufctl(slabp)[i] = i + 1; 2592 slab_freelist(page)[i] = i;
2685 } 2593 }
2686 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2687} 2594}
2688 2595
2689static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2596static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
@@ -2696,41 +2603,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2696 } 2603 }
2697} 2604}
2698 2605
2699static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2606static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
2700 int nodeid) 2607 int nodeid)
2701{ 2608{
2702 void *objp = index_to_obj(cachep, slabp, slabp->free); 2609 void *objp;
2703 kmem_bufctl_t next;
2704 2610
2705 slabp->inuse++; 2611 objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
2706 next = slab_bufctl(slabp)[slabp->free]; 2612 page->active++;
2707#if DEBUG 2613#if DEBUG
2708 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2614 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2709 WARN_ON(slabp->nodeid != nodeid);
2710#endif 2615#endif
2711 slabp->free = next;
2712 2616
2713 return objp; 2617 return objp;
2714} 2618}
2715 2619
2716static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2620static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
2717 void *objp, int nodeid) 2621 void *objp, int nodeid)
2718{ 2622{
2719 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2623 unsigned int objnr = obj_to_index(cachep, page, objp);
2720
2721#if DEBUG 2624#if DEBUG
2625 unsigned int i;
2626
2722 /* Verify that the slab belongs to the intended node */ 2627 /* Verify that the slab belongs to the intended node */
2723 WARN_ON(slabp->nodeid != nodeid); 2628 WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
2724 2629
2725 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2630 /* Verify double free bug */
2726 printk(KERN_ERR "slab: double free detected in cache " 2631 for (i = page->active; i < cachep->num; i++) {
2727 "'%s', objp %p\n", cachep->name, objp); 2632 if (slab_freelist(page)[i] == objnr) {
2728 BUG(); 2633 printk(KERN_ERR "slab: double free detected in cache "
2634 "'%s', objp %p\n", cachep->name, objp);
2635 BUG();
2636 }
2729 } 2637 }
2730#endif 2638#endif
2731 slab_bufctl(slabp)[objnr] = slabp->free; 2639 page->active--;
2732 slabp->free = objnr; 2640 slab_freelist(page)[page->active] = objnr;
2733 slabp->inuse--;
2734} 2641}
2735 2642
2736/* 2643/*
@@ -2738,23 +2645,11 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2738 * for the slab allocator to be able to lookup the cache and slab of a 2645 * for the slab allocator to be able to lookup the cache and slab of a
2739 * virtual address for kfree, ksize, and slab debugging. 2646 * virtual address for kfree, ksize, and slab debugging.
2740 */ 2647 */
2741static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2648static void slab_map_pages(struct kmem_cache *cache, struct page *page,
2742 void *addr) 2649 void *freelist)
2743{ 2650{
2744 int nr_pages; 2651 page->slab_cache = cache;
2745 struct page *page; 2652 page->freelist = freelist;
2746
2747 page = virt_to_page(addr);
2748
2749 nr_pages = 1;
2750 if (likely(!PageCompound(page)))
2751 nr_pages <<= cache->gfporder;
2752
2753 do {
2754 page->slab_cache = cache;
2755 page->slab_page = slab;
2756 page++;
2757 } while (--nr_pages);
2758} 2653}
2759 2654
2760/* 2655/*
@@ -2762,9 +2657,9 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2762 * kmem_cache_alloc() when there are no active objs left in a cache. 2657 * kmem_cache_alloc() when there are no active objs left in a cache.
2763 */ 2658 */
2764static int cache_grow(struct kmem_cache *cachep, 2659static int cache_grow(struct kmem_cache *cachep,
2765 gfp_t flags, int nodeid, void *objp) 2660 gfp_t flags, int nodeid, struct page *page)
2766{ 2661{
2767 struct slab *slabp; 2662 void *freelist;
2768 size_t offset; 2663 size_t offset;
2769 gfp_t local_flags; 2664 gfp_t local_flags;
2770 struct kmem_cache_node *n; 2665 struct kmem_cache_node *n;
@@ -2805,20 +2700,20 @@ static int cache_grow(struct kmem_cache *cachep,
2805 * Get mem for the objs. Attempt to allocate a physical page from 2700 * Get mem for the objs. Attempt to allocate a physical page from
2806 * 'nodeid'. 2701 * 'nodeid'.
2807 */ 2702 */
2808 if (!objp) 2703 if (!page)
2809 objp = kmem_getpages(cachep, local_flags, nodeid); 2704 page = kmem_getpages(cachep, local_flags, nodeid);
2810 if (!objp) 2705 if (!page)
2811 goto failed; 2706 goto failed;
2812 2707
2813 /* Get slab management. */ 2708 /* Get slab management. */
2814 slabp = alloc_slabmgmt(cachep, objp, offset, 2709 freelist = alloc_slabmgmt(cachep, page, offset,
2815 local_flags & ~GFP_CONSTRAINT_MASK, nodeid); 2710 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2816 if (!slabp) 2711 if (!freelist)
2817 goto opps1; 2712 goto opps1;
2818 2713
2819 slab_map_pages(cachep, slabp, objp); 2714 slab_map_pages(cachep, page, freelist);
2820 2715
2821 cache_init_objs(cachep, slabp); 2716 cache_init_objs(cachep, page);
2822 2717
2823 if (local_flags & __GFP_WAIT) 2718 if (local_flags & __GFP_WAIT)
2824 local_irq_disable(); 2719 local_irq_disable();
@@ -2826,13 +2721,13 @@ static int cache_grow(struct kmem_cache *cachep,
2826 spin_lock(&n->list_lock); 2721 spin_lock(&n->list_lock);
2827 2722
2828 /* Make slab active. */ 2723 /* Make slab active. */
2829 list_add_tail(&slabp->list, &(n->slabs_free)); 2724 list_add_tail(&page->lru, &(n->slabs_free));
2830 STATS_INC_GROWN(cachep); 2725 STATS_INC_GROWN(cachep);
2831 n->free_objects += cachep->num; 2726 n->free_objects += cachep->num;
2832 spin_unlock(&n->list_lock); 2727 spin_unlock(&n->list_lock);
2833 return 1; 2728 return 1;
2834opps1: 2729opps1:
2835 kmem_freepages(cachep, objp); 2730 kmem_freepages(cachep, page);
2836failed: 2731failed:
2837 if (local_flags & __GFP_WAIT) 2732 if (local_flags & __GFP_WAIT)
2838 local_irq_disable(); 2733 local_irq_disable();
@@ -2880,9 +2775,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2880static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2775static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2881 unsigned long caller) 2776 unsigned long caller)
2882{ 2777{
2883 struct page *page;
2884 unsigned int objnr; 2778 unsigned int objnr;
2885 struct slab *slabp; 2779 struct page *page;
2886 2780
2887 BUG_ON(virt_to_cache(objp) != cachep); 2781 BUG_ON(virt_to_cache(objp) != cachep);
2888 2782
@@ -2890,8 +2784,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2890 kfree_debugcheck(objp); 2784 kfree_debugcheck(objp);
2891 page = virt_to_head_page(objp); 2785 page = virt_to_head_page(objp);
2892 2786
2893 slabp = page->slab_page;
2894
2895 if (cachep->flags & SLAB_RED_ZONE) { 2787 if (cachep->flags & SLAB_RED_ZONE) {
2896 verify_redzone_free(cachep, objp); 2788 verify_redzone_free(cachep, objp);
2897 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2789 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
@@ -2900,14 +2792,11 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2900 if (cachep->flags & SLAB_STORE_USER) 2792 if (cachep->flags & SLAB_STORE_USER)
2901 *dbg_userword(cachep, objp) = (void *)caller; 2793 *dbg_userword(cachep, objp) = (void *)caller;
2902 2794
2903 objnr = obj_to_index(cachep, slabp, objp); 2795 objnr = obj_to_index(cachep, page, objp);
2904 2796
2905 BUG_ON(objnr >= cachep->num); 2797 BUG_ON(objnr >= cachep->num);
2906 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2798 BUG_ON(objp != index_to_obj(cachep, page, objnr));
2907 2799
2908#ifdef CONFIG_DEBUG_SLAB_LEAK
2909 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2910#endif
2911 if (cachep->flags & SLAB_POISON) { 2800 if (cachep->flags & SLAB_POISON) {
2912#ifdef CONFIG_DEBUG_PAGEALLOC 2801#ifdef CONFIG_DEBUG_PAGEALLOC
2913 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2802 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2924,33 +2813,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2924 return objp; 2813 return objp;
2925} 2814}
2926 2815
2927static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2928{
2929 kmem_bufctl_t i;
2930 int entries = 0;
2931
2932 /* Check slab's freelist to see if this obj is there. */
2933 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2934 entries++;
2935 if (entries > cachep->num || i >= cachep->num)
2936 goto bad;
2937 }
2938 if (entries != cachep->num - slabp->inuse) {
2939bad:
2940 printk(KERN_ERR "slab: Internal list corruption detected in "
2941 "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
2942 cachep->name, cachep->num, slabp, slabp->inuse,
2943 print_tainted());
2944 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
2945 sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
2946 1);
2947 BUG();
2948 }
2949}
2950#else 2816#else
2951#define kfree_debugcheck(x) do { } while(0) 2817#define kfree_debugcheck(x) do { } while(0)
2952#define cache_free_debugcheck(x,objp,z) (objp) 2818#define cache_free_debugcheck(x,objp,z) (objp)
2953#define check_slabp(x,y) do { } while(0)
2954#endif 2819#endif
2955 2820
2956static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, 2821static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
@@ -2989,7 +2854,7 @@ retry:
2989 2854
2990 while (batchcount > 0) { 2855 while (batchcount > 0) {
2991 struct list_head *entry; 2856 struct list_head *entry;
2992 struct slab *slabp; 2857 struct page *page;
2993 /* Get slab alloc is to come from. */ 2858 /* Get slab alloc is to come from. */
2994 entry = n->slabs_partial.next; 2859 entry = n->slabs_partial.next;
2995 if (entry == &n->slabs_partial) { 2860 if (entry == &n->slabs_partial) {
@@ -2999,8 +2864,7 @@ retry:
2999 goto must_grow; 2864 goto must_grow;
3000 } 2865 }
3001 2866
3002 slabp = list_entry(entry, struct slab, list); 2867 page = list_entry(entry, struct page, lru);
3003 check_slabp(cachep, slabp);
3004 check_spinlock_acquired(cachep); 2868 check_spinlock_acquired(cachep);
3005 2869
3006 /* 2870 /*
@@ -3008,24 +2872,23 @@ retry:
3008 * there must be at least one object available for 2872 * there must be at least one object available for
3009 * allocation. 2873 * allocation.
3010 */ 2874 */
3011 BUG_ON(slabp->inuse >= cachep->num); 2875 BUG_ON(page->active >= cachep->num);
3012 2876
3013 while (slabp->inuse < cachep->num && batchcount--) { 2877 while (page->active < cachep->num && batchcount--) {
3014 STATS_INC_ALLOCED(cachep); 2878 STATS_INC_ALLOCED(cachep);
3015 STATS_INC_ACTIVE(cachep); 2879 STATS_INC_ACTIVE(cachep);
3016 STATS_SET_HIGH(cachep); 2880 STATS_SET_HIGH(cachep);
3017 2881
3018 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, 2882 ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
3019 node)); 2883 node));
3020 } 2884 }
3021 check_slabp(cachep, slabp);
3022 2885
3023 /* move slabp to correct slabp list: */ 2886 /* move slabp to correct slabp list: */
3024 list_del(&slabp->list); 2887 list_del(&page->lru);
3025 if (slabp->free == BUFCTL_END) 2888 if (page->active == cachep->num)
3026 list_add(&slabp->list, &n->slabs_full); 2889 list_add(&page->list, &n->slabs_full);
3027 else 2890 else
3028 list_add(&slabp->list, &n->slabs_partial); 2891 list_add(&page->list, &n->slabs_partial);
3029 } 2892 }
3030 2893
3031must_grow: 2894must_grow:
@@ -3097,16 +2960,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3097 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2960 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
3098 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2961 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
3099 } 2962 }
3100#ifdef CONFIG_DEBUG_SLAB_LEAK
3101 {
3102 struct slab *slabp;
3103 unsigned objnr;
3104
3105 slabp = virt_to_head_page(objp)->slab_page;
3106 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3107 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3108 }
3109#endif
3110 objp += obj_offset(cachep); 2963 objp += obj_offset(cachep);
3111 if (cachep->ctor && cachep->flags & SLAB_POISON) 2964 if (cachep->ctor && cachep->flags & SLAB_POISON)
3112 cachep->ctor(objp); 2965 cachep->ctor(objp);
@@ -3248,18 +3101,20 @@ retry:
3248 * We may trigger various forms of reclaim on the allowed 3101 * We may trigger various forms of reclaim on the allowed
3249 * set and go into memory reserves if necessary. 3102 * set and go into memory reserves if necessary.
3250 */ 3103 */
3104 struct page *page;
3105
3251 if (local_flags & __GFP_WAIT) 3106 if (local_flags & __GFP_WAIT)
3252 local_irq_enable(); 3107 local_irq_enable();
3253 kmem_flagcheck(cache, flags); 3108 kmem_flagcheck(cache, flags);
3254 obj = kmem_getpages(cache, local_flags, numa_mem_id()); 3109 page = kmem_getpages(cache, local_flags, numa_mem_id());
3255 if (local_flags & __GFP_WAIT) 3110 if (local_flags & __GFP_WAIT)
3256 local_irq_disable(); 3111 local_irq_disable();
3257 if (obj) { 3112 if (page) {
3258 /* 3113 /*
3259 * Insert into the appropriate per node queues 3114 * Insert into the appropriate per node queues
3260 */ 3115 */
3261 nid = page_to_nid(virt_to_page(obj)); 3116 nid = page_to_nid(page);
3262 if (cache_grow(cache, flags, nid, obj)) { 3117 if (cache_grow(cache, flags, nid, page)) {
3263 obj = ____cache_alloc_node(cache, 3118 obj = ____cache_alloc_node(cache,
3264 flags | GFP_THISNODE, nid); 3119 flags | GFP_THISNODE, nid);
3265 if (!obj) 3120 if (!obj)
@@ -3288,7 +3143,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3288 int nodeid) 3143 int nodeid)
3289{ 3144{
3290 struct list_head *entry; 3145 struct list_head *entry;
3291 struct slab *slabp; 3146 struct page *page;
3292 struct kmem_cache_node *n; 3147 struct kmem_cache_node *n;
3293 void *obj; 3148 void *obj;
3294 int x; 3149 int x;
@@ -3308,26 +3163,24 @@ retry:
3308 goto must_grow; 3163 goto must_grow;
3309 } 3164 }
3310 3165
3311 slabp = list_entry(entry, struct slab, list); 3166 page = list_entry(entry, struct page, lru);
3312 check_spinlock_acquired_node(cachep, nodeid); 3167 check_spinlock_acquired_node(cachep, nodeid);
3313 check_slabp(cachep, slabp);
3314 3168
3315 STATS_INC_NODEALLOCS(cachep); 3169 STATS_INC_NODEALLOCS(cachep);
3316 STATS_INC_ACTIVE(cachep); 3170 STATS_INC_ACTIVE(cachep);
3317 STATS_SET_HIGH(cachep); 3171 STATS_SET_HIGH(cachep);
3318 3172
3319 BUG_ON(slabp->inuse == cachep->num); 3173 BUG_ON(page->active == cachep->num);
3320 3174
3321 obj = slab_get_obj(cachep, slabp, nodeid); 3175 obj = slab_get_obj(cachep, page, nodeid);
3322 check_slabp(cachep, slabp);
3323 n->free_objects--; 3176 n->free_objects--;
3324 /* move slabp to correct slabp list: */ 3177 /* move slabp to correct slabp list: */
3325 list_del(&slabp->list); 3178 list_del(&page->lru);
3326 3179
3327 if (slabp->free == BUFCTL_END) 3180 if (page->active == cachep->num)
3328 list_add(&slabp->list, &n->slabs_full); 3181 list_add(&page->lru, &n->slabs_full);
3329 else 3182 else
3330 list_add(&slabp->list, &n->slabs_partial); 3183 list_add(&page->lru, &n->slabs_partial);
3331 3184
3332 spin_unlock(&n->list_lock); 3185 spin_unlock(&n->list_lock);
3333 goto done; 3186 goto done;
@@ -3477,23 +3330,21 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3477 3330
3478 for (i = 0; i < nr_objects; i++) { 3331 for (i = 0; i < nr_objects; i++) {
3479 void *objp; 3332 void *objp;
3480 struct slab *slabp; 3333 struct page *page;
3481 3334
3482 clear_obj_pfmemalloc(&objpp[i]); 3335 clear_obj_pfmemalloc(&objpp[i]);
3483 objp = objpp[i]; 3336 objp = objpp[i];
3484 3337
3485 slabp = virt_to_slab(objp); 3338 page = virt_to_head_page(objp);
3486 n = cachep->node[node]; 3339 n = cachep->node[node];
3487 list_del(&slabp->list); 3340 list_del(&page->lru);
3488 check_spinlock_acquired_node(cachep, node); 3341 check_spinlock_acquired_node(cachep, node);
3489 check_slabp(cachep, slabp); 3342 slab_put_obj(cachep, page, objp, node);
3490 slab_put_obj(cachep, slabp, objp, node);
3491 STATS_DEC_ACTIVE(cachep); 3343 STATS_DEC_ACTIVE(cachep);
3492 n->free_objects++; 3344 n->free_objects++;
3493 check_slabp(cachep, slabp);
3494 3345
3495 /* fixup slab chains */ 3346 /* fixup slab chains */
3496 if (slabp->inuse == 0) { 3347 if (page->active == 0) {
3497 if (n->free_objects > n->free_limit) { 3348 if (n->free_objects > n->free_limit) {
3498 n->free_objects -= cachep->num; 3349 n->free_objects -= cachep->num;
3499 /* No need to drop any previously held 3350 /* No need to drop any previously held
@@ -3502,16 +3353,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3502 * a different cache, refer to comments before 3353 * a different cache, refer to comments before
3503 * alloc_slabmgmt. 3354 * alloc_slabmgmt.
3504 */ 3355 */
3505 slab_destroy(cachep, slabp); 3356 slab_destroy(cachep, page);
3506 } else { 3357 } else {
3507 list_add(&slabp->list, &n->slabs_free); 3358 list_add(&page->lru, &n->slabs_free);
3508 } 3359 }
3509 } else { 3360 } else {
3510 /* Unconditionally move a slab to the end of the 3361 /* Unconditionally move a slab to the end of the
3511 * partial list on free - maximum time for the 3362 * partial list on free - maximum time for the
3512 * other objects to be freed, too. 3363 * other objects to be freed, too.
3513 */ 3364 */
3514 list_add_tail(&slabp->list, &n->slabs_partial); 3365 list_add_tail(&page->lru, &n->slabs_partial);
3515 } 3366 }
3516 } 3367 }
3517} 3368}
@@ -3551,10 +3402,10 @@ free_done:
3551 3402
3552 p = n->slabs_free.next; 3403 p = n->slabs_free.next;
3553 while (p != &(n->slabs_free)) { 3404 while (p != &(n->slabs_free)) {
3554 struct slab *slabp; 3405 struct page *page;
3555 3406
3556 slabp = list_entry(p, struct slab, list); 3407 page = list_entry(p, struct page, lru);
3557 BUG_ON(slabp->inuse); 3408 BUG_ON(page->active);
3558 3409
3559 i++; 3410 i++;
3560 p = p->next; 3411 p = p->next;
@@ -4158,7 +4009,7 @@ out:
4158#ifdef CONFIG_SLABINFO 4009#ifdef CONFIG_SLABINFO
4159void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 4010void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4160{ 4011{
4161 struct slab *slabp; 4012 struct page *page;
4162 unsigned long active_objs; 4013 unsigned long active_objs;
4163 unsigned long num_objs; 4014 unsigned long num_objs;
4164 unsigned long active_slabs = 0; 4015 unsigned long active_slabs = 0;
@@ -4178,23 +4029,23 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4178 check_irq_on(); 4029 check_irq_on();
4179 spin_lock_irq(&n->list_lock); 4030 spin_lock_irq(&n->list_lock);
4180 4031
4181 list_for_each_entry(slabp, &n->slabs_full, list) { 4032 list_for_each_entry(page, &n->slabs_full, lru) {
4182 if (slabp->inuse != cachep->num && !error) 4033 if (page->active != cachep->num && !error)
4183 error = "slabs_full accounting error"; 4034 error = "slabs_full accounting error";
4184 active_objs += cachep->num; 4035 active_objs += cachep->num;
4185 active_slabs++; 4036 active_slabs++;
4186 } 4037 }
4187 list_for_each_entry(slabp, &n->slabs_partial, list) { 4038 list_for_each_entry(page, &n->slabs_partial, lru) {
4188 if (slabp->inuse == cachep->num && !error) 4039 if (page->active == cachep->num && !error)
4189 error = "slabs_partial inuse accounting error"; 4040 error = "slabs_partial accounting error";
4190 if (!slabp->inuse && !error) 4041 if (!page->active && !error)
4191 error = "slabs_partial/inuse accounting error"; 4042 error = "slabs_partial accounting error";
4192 active_objs += slabp->inuse; 4043 active_objs += page->active;
4193 active_slabs++; 4044 active_slabs++;
4194 } 4045 }
4195 list_for_each_entry(slabp, &n->slabs_free, list) { 4046 list_for_each_entry(page, &n->slabs_free, lru) {
4196 if (slabp->inuse && !error) 4047 if (page->active && !error)
4197 error = "slabs_free/inuse accounting error"; 4048 error = "slabs_free accounting error";
4198 num_slabs++; 4049 num_slabs++;
4199 } 4050 }
4200 free_objects += n->free_objects; 4051 free_objects += n->free_objects;
@@ -4346,15 +4197,27 @@ static inline int add_caller(unsigned long *n, unsigned long v)
4346 return 1; 4197 return 1;
4347} 4198}
4348 4199
4349static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) 4200static void handle_slab(unsigned long *n, struct kmem_cache *c,
4201 struct page *page)
4350{ 4202{
4351 void *p; 4203 void *p;
4352 int i; 4204 int i, j;
4205
4353 if (n[0] == n[1]) 4206 if (n[0] == n[1])
4354 return; 4207 return;
4355 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { 4208 for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
4356 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4209 bool active = true;
4210
4211 for (j = page->active; j < c->num; j++) {
4212 /* Skip freed item */
4213 if (slab_freelist(page)[j] == i) {
4214 active = false;
4215 break;
4216 }
4217 }
4218 if (!active)
4357 continue; 4219 continue;
4220
4358 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4221 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4359 return; 4222 return;
4360 } 4223 }
@@ -4379,7 +4242,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4379static int leaks_show(struct seq_file *m, void *p) 4242static int leaks_show(struct seq_file *m, void *p)
4380{ 4243{
4381 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); 4244 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4382 struct slab *slabp; 4245 struct page *page;
4383 struct kmem_cache_node *n; 4246 struct kmem_cache_node *n;
4384 const char *name; 4247 const char *name;
4385 unsigned long *x = m->private; 4248 unsigned long *x = m->private;
@@ -4403,10 +4266,10 @@ static int leaks_show(struct seq_file *m, void *p)
4403 check_irq_on(); 4266 check_irq_on();
4404 spin_lock_irq(&n->list_lock); 4267 spin_lock_irq(&n->list_lock);
4405 4268
4406 list_for_each_entry(slabp, &n->slabs_full, list) 4269 list_for_each_entry(page, &n->slabs_full, lru)
4407 handle_slab(x, cachep, slabp); 4270 handle_slab(x, cachep, page);
4408 list_for_each_entry(slabp, &n->slabs_partial, list) 4271 list_for_each_entry(page, &n->slabs_partial, lru)
4409 handle_slab(x, cachep, slabp); 4272 handle_slab(x, cachep, page);
4410 spin_unlock_irq(&n->list_lock); 4273 spin_unlock_irq(&n->list_lock);
4411 } 4274 }
4412 name = cachep->name; 4275 name = cachep->name;
diff --git a/mm/slub.c b/mm/slub.c
index 7e8bd8d828bc..545a170ebf9f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
155/* 155/*
156 * Maximum number of desirable partial slabs. 156 * Maximum number of desirable partial slabs.
157 * The existence of more partial slabs makes kmem_cache_shrink 157 * The existence of more partial slabs makes kmem_cache_shrink
158 * sort the partial list by the number of objects in the. 158 * sort the partial list by the number of objects in use.
159 */ 159 */
160#define MAX_PARTIAL 10 160#define MAX_PARTIAL 10
161 161
@@ -933,6 +933,16 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
933 * Hooks for other subsystems that check memory allocations. In a typical 933 * Hooks for other subsystems that check memory allocations. In a typical
934 * production configuration these hooks all should produce no code at all. 934 * production configuration these hooks all should produce no code at all.
935 */ 935 */
936static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
937{
938 kmemleak_alloc(ptr, size, 1, flags);
939}
940
941static inline void kfree_hook(const void *x)
942{
943 kmemleak_free(x);
944}
945
936static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 946static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
937{ 947{
938 flags &= gfp_allowed_mask; 948 flags &= gfp_allowed_mask;
@@ -1217,8 +1227,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,
1217 /* 1227 /*
1218 * Enable debugging if selected on the kernel commandline. 1228 * Enable debugging if selected on the kernel commandline.
1219 */ 1229 */
1220 if (slub_debug && (!slub_debug_slabs || 1230 if (slub_debug && (!slub_debug_slabs || (name &&
1221 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) 1231 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
1222 flags |= slub_debug; 1232 flags |= slub_debug;
1223 1233
1224 return flags; 1234 return flags;
@@ -1260,13 +1270,30 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1260static inline void dec_slabs_node(struct kmem_cache *s, int node, 1270static inline void dec_slabs_node(struct kmem_cache *s, int node,
1261 int objects) {} 1271 int objects) {}
1262 1272
1273static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1274{
1275 kmemleak_alloc(ptr, size, 1, flags);
1276}
1277
1278static inline void kfree_hook(const void *x)
1279{
1280 kmemleak_free(x);
1281}
1282
1263static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1283static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1264 { return 0; } 1284 { return 0; }
1265 1285
1266static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1286static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1267 void *object) {} 1287 void *object)
1288{
1289 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
1290 flags & gfp_allowed_mask);
1291}
1268 1292
1269static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1293static inline void slab_free_hook(struct kmem_cache *s, void *x)
1294{
1295 kmemleak_free_recursive(x, s->flags);
1296}
1270 1297
1271#endif /* CONFIG_SLUB_DEBUG */ 1298#endif /* CONFIG_SLUB_DEBUG */
1272 1299
@@ -2829,8 +2856,8 @@ static struct kmem_cache *kmem_cache_node;
2829 * slab on the node for this slabcache. There are no concurrent accesses 2856 * slab on the node for this slabcache. There are no concurrent accesses
2830 * possible. 2857 * possible.
2831 * 2858 *
2832 * Note that this function only works on the kmalloc_node_cache 2859 * Note that this function only works on the kmem_cache_node
2833 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2860 * when allocating for the kmem_cache_node. This is used for bootstrapping
2834 * memory on a fresh node that has no slab structures yet. 2861 * memory on a fresh node that has no slab structures yet.
2835 */ 2862 */
2836static void early_kmem_cache_node_alloc(int node) 2863static void early_kmem_cache_node_alloc(int node)
@@ -3272,7 +3299,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3272 if (page) 3299 if (page)
3273 ptr = page_address(page); 3300 ptr = page_address(page);
3274 3301
3275 kmemleak_alloc(ptr, size, 1, flags); 3302 kmalloc_large_node_hook(ptr, size, flags);
3276 return ptr; 3303 return ptr;
3277} 3304}
3278 3305
@@ -3336,7 +3363,7 @@ void kfree(const void *x)
3336 page = virt_to_head_page(x); 3363 page = virt_to_head_page(x);
3337 if (unlikely(!PageSlab(page))) { 3364 if (unlikely(!PageSlab(page))) {
3338 BUG_ON(!PageCompound(page)); 3365 BUG_ON(!PageCompound(page));
3339 kmemleak_free(x); 3366 kfree_hook(x);
3340 __free_memcg_kmem_pages(page, compound_order(page)); 3367 __free_memcg_kmem_pages(page, compound_order(page));
3341 return; 3368 return;
3342 } 3369 }
diff --git a/mm/swap.c b/mm/swap.c
index 7a9f80d451f5..84b26aaabd03 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page)
82 82
83static void put_compound_page(struct page *page) 83static void put_compound_page(struct page *page)
84{ 84{
85 /*
86 * hugetlbfs pages cannot be split from under us. If this is a
87 * hugetlbfs page, check refcount on head page and release the page if
88 * the refcount becomes zero.
89 */
90 if (PageHuge(page)) {
91 page = compound_head(page);
92 if (put_page_testzero(page))
93 __put_compound_page(page);
94
95 return;
96 }
97
98 if (unlikely(PageTail(page))) { 85 if (unlikely(PageTail(page))) {
99 /* __split_huge_page_refcount can run under us */ 86 /* __split_huge_page_refcount can run under us */
100 struct page *page_head = compound_trans_head(page); 87 struct page *page_head = compound_trans_head(page);
@@ -111,14 +98,31 @@ static void put_compound_page(struct page *page)
111 * still hot on arches that do not support 98 * still hot on arches that do not support
112 * this_cpu_cmpxchg_double(). 99 * this_cpu_cmpxchg_double().
113 */ 100 */
114 if (PageSlab(page_head)) { 101 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
115 if (PageTail(page)) { 102 if (likely(PageTail(page))) {
103 /*
104 * __split_huge_page_refcount
105 * cannot race here.
106 */
107 VM_BUG_ON(!PageHead(page_head));
108 atomic_dec(&page->_mapcount);
116 if (put_page_testzero(page_head)) 109 if (put_page_testzero(page_head))
117 VM_BUG_ON(1); 110 VM_BUG_ON(1);
118 111 if (put_page_testzero(page_head))
119 atomic_dec(&page->_mapcount); 112 __put_compound_page(page_head);
120 goto skip_lock_tail; 113 return;
121 } else 114 } else
115 /*
116 * __split_huge_page_refcount
117 * run before us, "page" was a
118 * THP tail. The split
119 * page_head has been freed
120 * and reallocated as slab or
121 * hugetlbfs page of smaller
122 * order (only possible if
123 * reallocated as slab on
124 * x86).
125 */
122 goto skip_lock; 126 goto skip_lock;
123 } 127 }
124 /* 128 /*
@@ -132,8 +136,27 @@ static void put_compound_page(struct page *page)
132 /* __split_huge_page_refcount run before us */ 136 /* __split_huge_page_refcount run before us */
133 compound_unlock_irqrestore(page_head, flags); 137 compound_unlock_irqrestore(page_head, flags);
134skip_lock: 138skip_lock:
135 if (put_page_testzero(page_head)) 139 if (put_page_testzero(page_head)) {
136 __put_single_page(page_head); 140 /*
141 * The head page may have been
142 * freed and reallocated as a
143 * compound page of smaller
144 * order and then freed again.
145 * All we know is that it
146 * cannot have become: a THP
147 * page, a compound page of
148 * higher order, a tail page.
149 * That is because we still
150 * hold the refcount of the
151 * split THP tail and
152 * page_head was the THP head
153 * before the split.
154 */
155 if (PageHead(page_head))
156 __put_compound_page(page_head);
157 else
158 __put_single_page(page_head);
159 }
137out_put_single: 160out_put_single:
138 if (put_page_testzero(page)) 161 if (put_page_testzero(page))
139 __put_single_page(page); 162 __put_single_page(page);
@@ -155,7 +178,6 @@ out_put_single:
155 VM_BUG_ON(atomic_read(&page->_count) != 0); 178 VM_BUG_ON(atomic_read(&page->_count) != 0);
156 compound_unlock_irqrestore(page_head, flags); 179 compound_unlock_irqrestore(page_head, flags);
157 180
158skip_lock_tail:
159 if (put_page_testzero(page_head)) { 181 if (put_page_testzero(page_head)) {
160 if (PageHead(page_head)) 182 if (PageHead(page_head))
161 __put_compound_page(page_head); 183 __put_compound_page(page_head);
@@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page)
198 * proper PT lock that already serializes against 220 * proper PT lock that already serializes against
199 * split_huge_page(). 221 * split_huge_page().
200 */ 222 */
223 unsigned long flags;
201 bool got = false; 224 bool got = false;
202 struct page *page_head; 225 struct page *page_head = compound_trans_head(page);
203
204 /*
205 * If this is a hugetlbfs page it cannot be split under us. Simply
206 * increment refcount for the head page.
207 */
208 if (PageHuge(page)) {
209 page_head = compound_head(page);
210 atomic_inc(&page_head->_count);
211 got = true;
212 } else {
213 unsigned long flags;
214 226
215 page_head = compound_trans_head(page); 227 if (likely(page != page_head && get_page_unless_zero(page_head))) {
216 if (likely(page != page_head && 228 /* Ref to put_compound_page() comment. */
217 get_page_unless_zero(page_head))) { 229 if (PageSlab(page_head) || PageHeadHuge(page_head)) {
218
219 /* Ref to put_compound_page() comment. */
220 if (PageSlab(page_head)) {
221 if (likely(PageTail(page))) {
222 __get_page_tail_foll(page, false);
223 return true;
224 } else {
225 put_page(page_head);
226 return false;
227 }
228 }
229
230 /*
231 * page_head wasn't a dangling pointer but it
232 * may not be a head page anymore by the time
233 * we obtain the lock. That is ok as long as it
234 * can't be freed from under us.
235 */
236 flags = compound_lock_irqsave(page_head);
237 /* here __split_huge_page_refcount won't run anymore */
238 if (likely(PageTail(page))) { 230 if (likely(PageTail(page))) {
231 /*
232 * This is a hugetlbfs page or a slab
233 * page. __split_huge_page_refcount
234 * cannot race here.
235 */
236 VM_BUG_ON(!PageHead(page_head));
239 __get_page_tail_foll(page, false); 237 __get_page_tail_foll(page, false);
240 got = true; 238 return true;
241 } 239 } else {
242 compound_unlock_irqrestore(page_head, flags); 240 /*
243 if (unlikely(!got)) 241 * __split_huge_page_refcount run
242 * before us, "page" was a THP
243 * tail. The split page_head has been
244 * freed and reallocated as slab or
245 * hugetlbfs page of smaller order
246 * (only possible if reallocated as
247 * slab on x86).
248 */
244 put_page(page_head); 249 put_page(page_head);
250 return false;
251 }
252 }
253
254 /*
255 * page_head wasn't a dangling pointer but it
256 * may not be a head page anymore by the time
257 * we obtain the lock. That is ok as long as it
258 * can't be freed from under us.
259 */
260 flags = compound_lock_irqsave(page_head);
261 /* here __split_huge_page_refcount won't run anymore */
262 if (likely(PageTail(page))) {
263 __get_page_tail_foll(page, false);
264 got = true;
245 } 265 }
266 compound_unlock_irqrestore(page_head, flags);
267 if (unlikely(!got))
268 put_page(page_head);
246 } 269 }
247 return got; 270 return got;
248} 271}