Diffstat (limited to 'mm')
-rw-r--r--   mm/memory.c        7
-rw-r--r--   mm/mempolicy.c   167
-rw-r--r--   mm/oom_kill.c      1
-rw-r--r--   mm/page_alloc.c    6
-rw-r--r--   mm/rmap.c         51
-rw-r--r--   mm/shmem.c         8
-rw-r--r--   mm/slab.c        639
-rw-r--r--   mm/swap_state.c    1
-rw-r--r--   mm/swapfile.c     16
-rw-r--r--   mm/vmscan.c      343
10 files changed, 898 insertions, 341 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 7a11ddd5060f..2bee1f21aa8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1871,6 +1871,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto out;
 
         entry = pte_to_swp_entry(orig_pte);
+again:
         page = lookup_swap_cache(entry);
         if (!page) {
                 swapin_readahead(entry, address, vma);
@@ -1894,6 +1895,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
         mark_page_accessed(page);
         lock_page(page);
+        if (!PageSwapCache(page)) {
+                /* Page migration has occured */
+                unlock_page(page);
+                page_cache_release(page);
+                goto again;
+        }
 
         /*
          * Back out if somebody else already faulted in this pte.
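The hunk above closes a race between swapin and page migration: after lock_page() the page may already have been migrated and removed from the swap cache, so do_swap_page() now rechecks PageSwapCache() and restarts the lookup. A minimal sketch of that retry pattern in isolation (the helper name is invented for the example; the calls are the same ones the hunk uses):

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>

/*
 * Look up a swap-cache page and return it locked, retrying if migration
 * pulled it out of the swap cache while we slept on the page lock.
 */
static struct page *lookup_swap_page_locked(swp_entry_t entry)
{
        struct page *page;

again:
        page = lookup_swap_cache(entry);        /* takes a reference, may be NULL */
        if (!page)
                return NULL;

        lock_page(page);
        if (!PageSwapCache(page)) {
                /* the page was migrated while we waited for the lock */
                unlock_page(page);
                page_cache_release(page);
                goto again;
        }
        return page;
}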
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73790188b0eb..27da6d5c77ba 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)  /* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)   /* Gather statistics */
 
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -543,24 +546,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
         }
 }
 
-static int swap_pages(struct list_head *pagelist)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+                        struct vm_area_struct *vma, int dest)
 {
+        LIST_HEAD(newlist);
         LIST_HEAD(moved);
         LIST_HEAD(failed);
-        int n;
+        int err = 0;
+        int nr_pages;
+        struct page *page;
+        struct list_head *p;
 
-        n = migrate_pages(pagelist, NULL, &moved, &failed);
-        putback_lru_pages(&failed);
-        putback_lru_pages(&moved);
+redo:
+        nr_pages = 0;
+        list_for_each(p, pagelist) {
+                if (vma)
+                        page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+                else
+                        page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
-        return n;
+                if (!page) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                list_add(&page->lru, &newlist);
+                nr_pages++;
+                if (nr_pages > MIGRATE_CHUNK_SIZE);
+                        break;
+        }
+        err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+        putback_lru_pages(&moved);      /* Call release pages instead ?? */
+
+        if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+                goto redo;
+out:
+        /* Return leftover allocated pages */
+        while (!list_empty(&newlist)) {
+                page = list_entry(newlist.next, struct page, lru);
+                list_del(&page->lru);
+                __free_page(page);
+        }
+        list_splice(&failed, pagelist);
+        if (err < 0)
+                return err;
+
+        /* Calculate number of leftover pages */
+        nr_pages = 0;
+        list_for_each(p, pagelist)
+                nr_pages++;
+        return nr_pages;
+}
+
+/*
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+        nodemask_t nmask;
+        LIST_HEAD(pagelist);
+        int err = 0;
+
+        nodes_clear(nmask);
+        node_set(source, nmask);
+
+        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+        if (!list_empty(&pagelist)) {
+                err = migrate_pages_to(&pagelist, NULL, dest);
+                if (!list_empty(&pagelist))
+                        putback_lru_pages(&pagelist);
+        }
+        return err;
 }
 
 /*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
  *
  * Returns the number of page that could not be moved.
  */
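migrate_pages_to() above allocates at most MIGRATE_CHUNK_SIZE destination pages per pass and loops via the redo label until the source list is drained, which bounds how much new memory is pinned at once. The destination of each allocation depends on the caller: with a vma the new page follows that vma's memory policy, otherwise it is forced onto node dest. An illustrative helper extracting just that choice (the function name is invented for the example; both allocator calls appear verbatim in the hunk):

#include <linux/mm.h>
#include <linux/gfp.h>

static struct page *alloc_migration_target(struct vm_area_struct *vma, int dest)
{
        if (vma)        /* obey the vma's memory policy */
                return alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
        /* no vma: place the copy on the requested node */
        return alloc_pages_node(dest, GFP_HIGHUSER, 0);
}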
@@ -568,22 +638,76 @@ int do_migrate_pages(struct mm_struct *mm,
         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 {
         LIST_HEAD(pagelist);
-        int count = 0;
-        nodemask_t nodes;
+        int busy = 0;
+        int err = 0;
+        nodemask_t tmp;
 
-        nodes_andnot(nodes, *from_nodes, *to_nodes);
+        down_read(&mm->mmap_sem);
 
-        down_read(&mm->mmap_sem);
-        check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same.  If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient.  As we go, we remember the
+ * most recent <s, d> pair that moved (s != d).  If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scannng from_tmp, we at least have the
+ * most recent <s, d> pair that moved.  If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
 
-        if (!list_empty(&pagelist)) {
-                count = swap_pages(&pagelist);
-                putback_lru_pages(&pagelist);
+        tmp = *from_nodes;
+        while (!nodes_empty(tmp)) {
+                int s,d;
+                int source = -1;
+                int dest = 0;
+
+                for_each_node_mask(s, tmp) {
+                        d = node_remap(s, *from_nodes, *to_nodes);
+                        if (s == d)
+                                continue;
+
+                        source = s;     /* Node moved. Memorize */
+                        dest = d;
+
+                        /* dest not in remaining from nodes? */
+                        if (!node_isset(dest, tmp))
+                                break;
+                }
+                if (source == -1)
+                        break;
+
+                node_clear(source, tmp);
+                err = migrate_to_node(mm, source, dest, flags);
+                if (err > 0)
+                        busy += err;
+                if (err < 0)
+                        break;
         }
 
         up_read(&mm->mmap_sem);
-        return count;
+        if (err < 0)
+                return err;
+        return busy;
 }
 
 long do_mbind(unsigned long start, unsigned long len,
@@ -643,8 +767,9 @@ long do_mbind(unsigned long start, unsigned long len,
                 int nr_failed = 0;
 
                 err = mbind_range(vma, start, end, new);
+
                 if (!list_empty(&pagelist))
-                        nr_failed = swap_pages(&pagelist);
+                        nr_failed = migrate_pages_to(&pagelist, vma, -1);
 
                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                         err = -EIO;
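A worked example of the pair selection described in the comment above (illustrative, not from the patch): with from_nodes = {0,1} and to_nodes = {1,2}, node_remap() maps 0 -> 1 and 1 -> 2. Both pairs move, but 1 -> 2 is taken first because its destination (node 2) is not in the remaining source set, so node 1 is drained before anything is migrated onto it; the next iteration then moves 0 -> 1. If from_nodes and to_nodes were identical, every node would map to itself, source would stay -1, and the loop would end with nothing left to migrate.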
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 14bd4ec79597..b05ab8f2a562 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -271,6 +271,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
         if (printk_ratelimit()) {
                 printk("oom-killer: gfp_mask=0x%x, order=%d\n",
                         gfp_mask, order);
+                dump_stack();
                 show_mem();
         }
 
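The only change here is the added dump_stack(): the OOM report keeps its printk_ratelimit() throttle but now records the call chain that entered the allocator alongside show_mem(). The same rate-limited diagnostic pattern, as a sketch rather than the patch code:

        if (printk_ratelimit()) {
                printk(KERN_WARNING "allocation failure diagnostics\n");
                dump_stack();   /* which path asked for memory */
                show_mem();     /* system-wide memory state */
        }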
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df54e2fc8ee0..44b4eb4202d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1799,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
         memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
-static int __meminit zone_batchsize(struct zone *zone)
+static int __cpuinit zone_batchsize(struct zone *zone)
 {
         int batch;
 
@@ -1893,7 +1893,7 @@ static struct per_cpu_pageset
  * Dynamically allocate memory for the
  * per cpu pageset array in struct zone.
  */
-static int __meminit process_zones(int cpu)
+static int __cpuinit process_zones(int cpu)
 {
         struct zone *zone, *dzone;
 
@@ -1934,7 +1934,7 @@ static inline void free_zone_pagesets(int cpu)
         }
 }
 
-static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
+static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
                 unsigned long action,
                 void *hcpu)
 {
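These three helpers are reached only from the CPU-hotplug path (pageset_cpuup_callback() is registered as a CPU notifier), so they are re-annotated from __meminit to __cpuinit and get discarded when CPU hotplug is not configured rather than when memory hotplug is not. A hedged sketch of the surrounding pattern, with invented names (the notifier_block initializer style mirrors the one used for cpucache_notifier in mm/slab.c below):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit example_cpuup_callback(struct notifier_block *nfb,
                                            unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        if (action == CPU_UP_PREPARE)
                printk(KERN_DEBUG "preparing per-cpu data for cpu %ld\n", cpu);
        return NOTIFY_OK;
}

static struct notifier_block example_cpu_notifier = { &example_cpuup_callback, NULL, 0 };
/* registered once at boot with register_cpu_notifier(&example_cpu_notifier) */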
diff --git a/mm/rmap.c b/mm/rmap.c
index d85a99d28c03..df2c41c2a9a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -52,6 +52,7 @@
 #include <linux/init.h>
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
+#include <linux/module.h>
 
 #include <asm/tlbflush.h>
 
@@ -205,6 +206,36 @@ out:
         return anon_vma;
 }
 
+#ifdef CONFIG_MIGRATION
+/*
+ * Remove an anonymous page from swap replacing the swap pte's
+ * through real pte's pointing to valid pages and then releasing
+ * the page from the swap cache.
+ *
+ * Must hold page lock on page.
+ */
+void remove_from_swap(struct page *page)
+{
+        struct anon_vma *anon_vma;
+        struct vm_area_struct *vma;
+
+        if (!PageAnon(page) || !PageSwapCache(page))
+                return;
+
+        anon_vma = page_lock_anon_vma(page);
+        if (!anon_vma)
+                return;
+
+        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+                remove_vma_swap(vma, page);
+
+        spin_unlock(&anon_vma->lock);
+
+        delete_from_swap_cache(page);
+}
+EXPORT_SYMBOL(remove_from_swap);
+#endif
+
 /*
  * At what user virtual address is page expected in vma?
  */
@@ -541,7 +572,8 @@ void page_remove_rmap(struct page *page)
  * Subfunctions of try_to_unmap: try_to_unmap_one called
  * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
  */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+                                int ignore_refs)
 {
         struct mm_struct *mm = vma->vm_mm;
         unsigned long address;
@@ -564,7 +596,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
          * skipped over this mm) then we should reactivate it.
          */
         if ((vma->vm_flags & VM_LOCKED) ||
-                        ptep_clear_flush_young(vma, address, pte)) {
+                        (ptep_clear_flush_young(vma, address, pte)
+                        && !ignore_refs)) {
                 ret = SWAP_FAIL;
                 goto out_unmap;
         }
@@ -698,7 +731,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
         pte_unmap_unlock(pte - 1, ptl);
 }
 
-static int try_to_unmap_anon(struct page *page)
+static int try_to_unmap_anon(struct page *page, int ignore_refs)
 {
         struct anon_vma *anon_vma;
         struct vm_area_struct *vma;
@@ -709,7 +742,7 @@ static int try_to_unmap_anon(struct page *page)
                 return ret;
 
         list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-                ret = try_to_unmap_one(page, vma);
+                ret = try_to_unmap_one(page, vma, ignore_refs);
                 if (ret == SWAP_FAIL || !page_mapped(page))
                         break;
         }
@@ -726,7 +759,7 @@ static int try_to_unmap_anon(struct page *page)
  *
  * This function is only called from try_to_unmap for object-based pages.
  */
-static int try_to_unmap_file(struct page *page)
+static int try_to_unmap_file(struct page *page, int ignore_refs)
 {
         struct address_space *mapping = page->mapping;
         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -740,7 +773,7 @@ static int try_to_unmap_file(struct page *page)
 
         spin_lock(&mapping->i_mmap_lock);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-                ret = try_to_unmap_one(page, vma);
+                ret = try_to_unmap_one(page, vma, ignore_refs);
                 if (ret == SWAP_FAIL || !page_mapped(page))
                         goto out;
         }
@@ -825,16 +858,16 @@ out:
  * SWAP_AGAIN   - we missed a mapping, try again later
  * SWAP_FAIL    - the page is unswappable
  */
-int try_to_unmap(struct page *page)
+int try_to_unmap(struct page *page, int ignore_refs)
 {
         int ret;
 
         BUG_ON(!PageLocked(page));
 
         if (PageAnon(page))
-                ret = try_to_unmap_anon(page);
+                ret = try_to_unmap_anon(page, ignore_refs);
         else
-                ret = try_to_unmap_file(page);
+                ret = try_to_unmap_file(page, ignore_refs);
 
         if (!page_mapped(page))
                 ret = SWAP_SUCCESS;
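With the extra ignore_refs argument, try_to_unmap() can be told to unmap a page even if ptep_clear_flush_young() reports it was recently referenced; reclaim keeps the old behaviour by passing 0, while a migration-style caller passes 1 because it must empty the mappings regardless of activity. Illustrative callers (not from the patch) of the new signature:

#include <linux/rmap.h>

static int unmap_for_reclaim(struct page *page)
{
        return try_to_unmap(page, 0);   /* back off if the pte was young */
}

static int unmap_for_migration(struct page *page)
{
        return try_to_unmap(page, 1);   /* unmap even recently used pages */
}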
diff --git a/mm/shmem.c b/mm/shmem.c
index ce501bce1c2e..f7ac7b812f92 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1028,6 +1028,14 @@ repeat:
                         page_cache_release(swappage);
                         goto repeat;
                 }
+                if (!PageSwapCache(swappage)) {
+                        /* Page migration has occured */
+                        shmem_swp_unmap(entry);
+                        spin_unlock(&info->lock);
+                        unlock_page(swappage);
+                        page_cache_release(swappage);
+                        goto repeat;
+                }
                 if (PageWriteback(swappage)) {
                         shmem_swp_unmap(entry);
                         spin_unlock(&info->lock);
diff --git a/mm/slab.c b/mm/slab.c
index 6f8495e2185b..71370256a7eb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -55,7 +55,7 @@
  *
  * SMP synchronization:
  *  constructors and destructors are called without any locking.
- *  Several members in kmem_cache_t and struct slab never change, they
+ *  Several members in struct kmem_cache and struct slab never change, they
  *      are accessed without any locking.
  *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  *  and local interrupts are disabled so slab code is preempt-safe.
@@ -244,7 +244,7 @@ struct slab {
  */
 struct slab_rcu {
         struct rcu_head head;
-        kmem_cache_t *cachep;
+        struct kmem_cache *cachep;
         void *addr;
 };
 
@@ -316,6 +316,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
  */
 static __always_inline int index_of(const size_t size)
 {
+        extern void __bad_size(void);
+
         if (__builtin_constant_p(size)) {
                 int i = 0;
 
@@ -326,19 +328,16 @@ static __always_inline int index_of(const size_t size)
                         i++;
 #include "linux/kmalloc_sizes.h"
 #undef CACHE
-                {
-                        extern void __bad_size(void);
-                        __bad_size();
-                }
+                __bad_size();
         } else
-                BUG();
+                __bad_size();
         return 0;
 }
 
 #define INDEX_AC index_of(sizeof(struct arraycache_init))
 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
 
-static inline void kmem_list3_init(struct kmem_list3 *parent)
+static void kmem_list3_init(struct kmem_list3 *parent)
 {
         INIT_LIST_HEAD(&parent->slabs_full);
         INIT_LIST_HEAD(&parent->slabs_partial);
@@ -364,7 +363,7 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
         } while (0)
 
 /*
- * kmem_cache_t
+ * struct kmem_cache
  *
  * manages a cache.
  */
@@ -375,7 +374,7 @@ struct kmem_cache {
         unsigned int batchcount;
         unsigned int limit;
         unsigned int shared;
-        unsigned int objsize;
+        unsigned int buffer_size;
 /* 2) touched by every alloc & free from the backend */
         struct kmem_list3 *nodelists[MAX_NUMNODES];
         unsigned int flags;             /* constant flags */
@@ -392,15 +391,15 @@ struct kmem_cache {
         size_t colour;                  /* cache colouring range */
         unsigned int colour_off;        /* colour offset */
         unsigned int colour_next;       /* cache colouring */
-        kmem_cache_t *slabp_cache;
+        struct kmem_cache *slabp_cache;
         unsigned int slab_size;
         unsigned int dflags;            /* dynamic flags */
 
         /* constructor func */
-        void (*ctor) (void *, kmem_cache_t *, unsigned long);
+        void (*ctor) (void *, struct kmem_cache *, unsigned long);
 
         /* de-constructor func */
-        void (*dtor) (void *, kmem_cache_t *, unsigned long);
+        void (*dtor) (void *, struct kmem_cache *, unsigned long);
 
 /* 4) cache creation/removal */
         const char *name;
@@ -423,8 +422,14 @@ struct kmem_cache {
         atomic_t freemiss;
 #endif
 #if DEBUG
-        int dbghead;
-        int reallen;
+        /*
+         * If debugging is enabled, then the allocator can add additional
+         * fields and/or padding to every object. buffer_size contains the total
+         * object size including these internal fields, the following two
+         * variables contain the offset to the user object and its size.
+         */
+        int obj_offset;
+        int obj_size;
 #endif
 };
 
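The renamed debug fields separate the user-visible object from the allocator's bookkeeping: obj_size is what the caller asked for, obj_offset is where that object starts inside the slot, and buffer_size (renamed from objsize elsewhere in the struct) is the full slot. As a worked example with illustrative numbers, not taken from the patch: a 100-byte object in a SLAB_RED_ZONE cache on a machine with BYTES_PER_WORD == 4 keeps obj_size == 100, gets obj_offset == 4 so it sits behind the leading red-zone word, and needs buffer_size >= 108 to hold both red-zone words; dbg_redzone1() and dbg_redzone2() in the next hunk read those words back from exactly these offsets.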
@@ -495,50 +500,50 @@ struct kmem_cache {
 
 /* memory layout of objects:
  * 0            : objp
- * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
+ * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
  *              the end of an object is aligned with the end of the real
  *              allocation. Catches writes behind the end of the allocation.
- * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
+ * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
  *              redzone word.
- * cachep->dbghead: The real object.
- * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ * cachep->obj_offset: The real object.
+ * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
  */
-static int obj_dbghead(kmem_cache_t *cachep)
+static int obj_offset(struct kmem_cache *cachep)
 {
-        return cachep->dbghead;
+        return cachep->obj_offset;
 }
 
-static int obj_reallen(kmem_cache_t *cachep)
+static int obj_size(struct kmem_cache *cachep)
 {
-        return cachep->reallen;
+        return cachep->obj_size;
 }
 
-static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
+static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
 {
         BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
-        return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
+        return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
 }
 
-static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
+static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
 {
         BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
         if (cachep->flags & SLAB_STORE_USER)
-                return (unsigned long *)(objp + cachep->objsize -
+                return (unsigned long *)(objp + cachep->buffer_size -
                                          2 * BYTES_PER_WORD);
-        return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
+        return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
 }
 
-static void **dbg_userword(kmem_cache_t *cachep, void *objp)
+static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 {
         BUG_ON(!(cachep->flags & SLAB_STORE_USER));
-        return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
+        return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
 }
 
 #else
 
-#define obj_dbghead(x)                  0
-#define obj_reallen(cachep)             (cachep->objsize)
+#define obj_offset(x)                   0
+#define obj_size(cachep)                (cachep->buffer_size)
 #define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 #define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 #define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
@@ -591,6 +596,18 @@ static inline struct slab *page_get_slab(struct page *page)
         return (struct slab *)page->lru.prev;
 }
 
+static inline struct kmem_cache *virt_to_cache(const void *obj)
+{
+        struct page *page = virt_to_page(obj);
+        return page_get_cache(page);
+}
+
+static inline struct slab *virt_to_slab(const void *obj)
+{
+        struct page *page = virt_to_page(obj);
+        return page_get_slab(page);
+}
+
 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
 struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
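virt_to_cache() and virt_to_slab() just bundle the virt_to_page() lookup with the existing page_get_cache()/page_get_slab() accessors, so later hunks can go from an object pointer straight to its cache or slab descriptor. A sketch of a caller, with an invented helper name:

static unsigned int object_slot_size(const void *objp)
{
        struct kmem_cache *cachep = virt_to_cache(objp);

        return cachep->buffer_size;     /* full per-slot footprint, debug fields included */
}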
@@ -619,16 +636,16 @@ static struct arraycache_init initarray_generic =
     { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 
 /* internal cache of cache description objs */
-static kmem_cache_t cache_cache = {
+static struct kmem_cache cache_cache = {
         .batchcount = 1,
         .limit = BOOT_CPUCACHE_ENTRIES,
         .shared = 1,
-        .objsize = sizeof(kmem_cache_t),
+        .buffer_size = sizeof(struct kmem_cache),
         .flags = SLAB_NO_REAP,
         .spinlock = SPIN_LOCK_UNLOCKED,
         .name = "kmem_cache",
 #if DEBUG
-        .reallen = sizeof(kmem_cache_t),
+        .obj_size = sizeof(struct kmem_cache),
 #endif
 };
 
@@ -657,17 +674,17 @@ static enum {
 
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
-static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
-static void enable_cpucache(kmem_cache_t *cachep);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
+static void enable_cpucache(struct kmem_cache *cachep);
 static void cache_reap(void *unused);
-static int __node_shrink(kmem_cache_t *cachep, int node);
+static int __node_shrink(struct kmem_cache *cachep, int node);
 
-static inline struct array_cache *ac_data(kmem_cache_t *cachep)
+static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 {
         return cachep->array[smp_processor_id()];
 }
 
-static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
+static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
 {
         struct cache_sizes *csizep = malloc_sizes;
 
@@ -691,43 +708,80 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
         return csizep->cs_cachep;
 }
 
-kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
+struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 {
         return __find_general_cachep(size, gfpflags);
 }
 EXPORT_SYMBOL(kmem_find_general_cachep);
 
-/* Cal the num objs, wastage, and bytes left over for a given slab size. */
-static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
-                           int flags, size_t *left_over, unsigned int *num)
+static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 {
-        int i;
-        size_t wastage = PAGE_SIZE << gfporder;
-        size_t extra = 0;
-        size_t base = 0;
+        return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
+}
 
-        if (!(flags & CFLGS_OFF_SLAB)) {
-                base = sizeof(struct slab);
-                extra = sizeof(kmem_bufctl_t);
-        }
-        i = 0;
-        while (i * size + ALIGN(base + i * extra, align) <= wastage)
-                i++;
-        if (i > 0)
-                i--;
+/* Calculate the number of objects and left-over bytes for a given
+   buffer size. */
+static void cache_estimate(unsigned long gfporder, size_t buffer_size,
+                           size_t align, int flags, size_t *left_over,
+                           unsigned int *num)
+{
+        int nr_objs;
+        size_t mgmt_size;
+        size_t slab_size = PAGE_SIZE << gfporder;
+
+        /*
+         * The slab management structure can be either off the slab or
+         * on it. For the latter case, the memory allocated for a
+         * slab is used for:
+         *
+         * - The struct slab
+         * - One kmem_bufctl_t for each object
+         * - Padding to respect alignment of @align
+         * - @buffer_size bytes for each object
+         *
+         * If the slab management structure is off the slab, then the
+         * alignment will already be calculated into the size. Because
+         * the slabs are all pages aligned, the objects will be at the
+         * correct alignment when allocated.
+         */
+        if (flags & CFLGS_OFF_SLAB) {
+                mgmt_size = 0;
+                nr_objs = slab_size / buffer_size;
 
-        if (i > SLAB_LIMIT)
-                i = SLAB_LIMIT;
+                if (nr_objs > SLAB_LIMIT)
+                        nr_objs = SLAB_LIMIT;
+        } else {
+                /*
+                 * Ignore padding for the initial guess. The padding
+                 * is at most @align-1 bytes, and @buffer_size is at
+                 * least @align. In the worst case, this result will
+                 * be one greater than the number of objects that fit
+                 * into the memory allocation when taking the padding
+                 * into account.
+                 */
+                nr_objs = (slab_size - sizeof(struct slab)) /
+                          (buffer_size + sizeof(kmem_bufctl_t));
+
+                /*
+                 * This calculated number will be either the right
+                 * amount, or one greater than what we want.
+                 */
+                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
+                       > slab_size)
+                        nr_objs--;
 
-        *num = i;
-        wastage -= i * size;
-        wastage -= ALIGN(base + i * extra, align);
-        *left_over = wastage;
+                if (nr_objs > SLAB_LIMIT)
+                        nr_objs = SLAB_LIMIT;
+
+                mgmt_size = slab_mgmt_size(nr_objs, align);
+        }
+        *num = nr_objs;
+        *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
 }
 
 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 
-static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
+static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
 {
         printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
                function, cachep->name, msg);
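A worked example of the rewritten estimate, using illustrative sizes rather than anything from the patch: for an on-slab cache with slab_size = 4096 (order 0), buffer_size = 256, align = 32, and assuming sizeof(struct slab) == 28 and sizeof(kmem_bufctl_t) == 4, the initial guess is nr_objs = (4096 - 28) / (256 + 4) = 15. slab_mgmt_size(15, 32) = ALIGN(28 + 15*4, 32) = 96, and 96 + 15*256 = 3936 <= 4096, so the guess stands; cache_estimate() returns *num = 15 and *left_over = 4096 - 3840 - 96 = 160 bytes available for cache colouring.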
@@ -774,9 +828,9 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 }
 
 #ifdef CONFIG_NUMA
-static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
 
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit)
 {
         struct array_cache **ac_ptr;
         int memsize = sizeof(void *) * MAX_NUMNODES;
@@ -803,7 +857,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
         return ac_ptr;
 }
 
-static inline void free_alien_cache(struct array_cache **ac_ptr)
+static void free_alien_cache(struct array_cache **ac_ptr)
 {
         int i;
 
@@ -816,8 +870,8 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
         kfree(ac_ptr);
 }
 
-static inline void __drain_alien_cache(kmem_cache_t *cachep,
+static void __drain_alien_cache(struct kmem_cache *cachep,
                                 struct array_cache *ac, int node)
 {
         struct kmem_list3 *rl3 = cachep->nodelists[node];
 
@@ -829,7 +883,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep,
         }
 }
 
-static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
+static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
 {
         int i = 0;
         struct array_cache *ac;
@@ -854,7 +908,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                                     unsigned long action, void *hcpu)
 {
         long cpu = (long)hcpu;
-        kmem_cache_t *cachep;
+        struct kmem_cache *cachep;
         struct kmem_list3 *l3 = NULL;
         int node = cpu_to_node(cpu);
         int memsize = sizeof(struct kmem_list3);
@@ -992,7 +1046,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
+static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
 {
         struct kmem_list3 *ptr;
 
@@ -1032,14 +1086,14 @@ void __init kmem_cache_init(void)
 
         /* Bootstrap is tricky, because several objects are allocated
          * from caches that do not exist yet:
-         * 1) initialize the cache_cache cache: it contains the kmem_cache_t
+         * 1) initialize the cache_cache cache: it contains the struct kmem_cache
          *    structures of all caches, except cache_cache itself: cache_cache
          *    is statically allocated.
          *    Initially an __init data area is used for the head array and the
          *    kmem_list3 structures, it's replaced with a kmalloc allocated
          *    array at the end of the bootstrap.
          * 2) Create the first kmalloc cache.
-         *    The kmem_cache_t for the new cache is allocated normally.
+         *    The struct kmem_cache for the new cache is allocated normally.
          *    An __init data area is used for the head array.
          * 3) Create the remaining kmalloc caches, with minimally sized
          *    head arrays.
@@ -1057,9 +1111,9 @@ void __init kmem_cache_init(void)
         cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
         cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
 
-        cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
+        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-        cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
+        cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
                        &left_over, &cache_cache.num);
         if (!cache_cache.num)
                 BUG();
@@ -1132,8 +1186,8 @@ void __init kmem_cache_init(void)
                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
                 local_irq_disable();
-                BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
-                memcpy(ptr, ac_data(&cache_cache),
+                BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
+                memcpy(ptr, cpu_cache_get(&cache_cache),
                        sizeof(struct arraycache_init));
                 cache_cache.array[smp_processor_id()] = ptr;
                 local_irq_enable();
@@ -1141,9 +1195,9 @@ void __init kmem_cache_init(void)
                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
                 local_irq_disable();
-                BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
+                BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
                        != &initarray_generic.cache);
-                memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
+                memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
                        sizeof(struct arraycache_init));
                 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
                     ptr;
@@ -1170,7 +1224,7 @@ void __init kmem_cache_init(void)
 
         /* 6) resize the head arrays to their final sizes */
         {
-                kmem_cache_t *cachep;
+                struct kmem_cache *cachep;
                 mutex_lock(&cache_chain_mutex);
                 list_for_each_entry(cachep, &cache_chain, next)
                         enable_cpucache(cachep);
@@ -1181,7 +1235,7 @@ void __init kmem_cache_init(void)
         g_cpucache_up = FULL;
 
         /* Register a cpu startup notifier callback
-         * that initializes ac_data for all new cpus
+         * that initializes cpu_cache_get for all new cpus
          */
         register_cpu_notifier(&cpucache_notifier);
 
@@ -1213,7 +1267,7 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
         struct page *page;
         void *addr;
@@ -1239,7 +1293,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 /*
  * Interface to system's page release.
  */
-static void kmem_freepages(kmem_cache_t *cachep, void *addr)
+static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 {
         unsigned long i = (1 << cachep->gfporder);
         struct page *page = virt_to_page(addr);
@@ -1261,7 +1315,7 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
 static void kmem_rcu_free(struct rcu_head *head)
 {
         struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
-        kmem_cache_t *cachep = slab_rcu->cachep;
+        struct kmem_cache *cachep = slab_rcu->cachep;
 
         kmem_freepages(cachep, slab_rcu->addr);
         if (OFF_SLAB(cachep))
@@ -1271,12 +1325,12 @@ static void kmem_rcu_free(struct rcu_head *head)
 #if DEBUG
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
+static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
                             unsigned long caller)
 {
-        int size = obj_reallen(cachep);
+        int size = obj_size(cachep);
 
-        addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
+        addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
 
         if (size < 5 * sizeof(unsigned long))
                 return;
@@ -1304,10 +1358,10 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
 }
 #endif
 
-static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
+static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
 {
-        int size = obj_reallen(cachep);
-        addr = &((char *)addr)[obj_dbghead(cachep)];
+        int size = obj_size(cachep);
+        addr = &((char *)addr)[obj_offset(cachep)];
 
         memset(addr, val, size);
         *(unsigned char *)(addr + size - 1) = POISON_END;
@@ -1326,7 +1380,7 @@ static void dump_line(char *data, int offset, int limit)
 
 #if DEBUG
 
-static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
+static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
 {
         int i, size;
         char *realobj;
@@ -1344,8 +1398,8 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
                        (unsigned long)*dbg_userword(cachep, objp));
                 printk("\n");
         }
-        realobj = (char *)objp + obj_dbghead(cachep);
-        size = obj_reallen(cachep);
+        realobj = (char *)objp + obj_offset(cachep);
+        size = obj_size(cachep);
         for (i = 0; i < size && lines; i += 16, lines--) {
                 int limit;
                 limit = 16;
@@ -1355,14 +1409,14 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
         }
 }
 
-static void check_poison_obj(kmem_cache_t *cachep, void *objp)
+static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 {
         char *realobj;
         int size, i;
         int lines = 0;
 
-        realobj = (char *)objp + obj_dbghead(cachep);
-        size = obj_reallen(cachep);
+        realobj = (char *)objp + obj_offset(cachep);
+        size = obj_size(cachep);
 
         for (i = 0; i < size; i++) {
                 char exp = POISON_FREE;
@@ -1395,20 +1449,20 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
                 /* Print some data about the neighboring objects, if they
                  * exist:
                  */
-                struct slab *slabp = page_get_slab(virt_to_page(objp));
+                struct slab *slabp = virt_to_slab(objp);
                 int objnr;
 
-                objnr = (objp - slabp->s_mem) / cachep->objsize;
+                objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
                 if (objnr) {
-                        objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
-                        realobj = (char *)objp + obj_dbghead(cachep);
+                        objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
+                        realobj = (char *)objp + obj_offset(cachep);
                         printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
                                realobj, size);
                         print_objinfo(cachep, objp, 2);
                 }
                 if (objnr + 1 < cachep->num) {
-                        objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
-                        realobj = (char *)objp + obj_dbghead(cachep);
+                        objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
+                        realobj = (char *)objp + obj_offset(cachep);
                         printk(KERN_ERR "Next obj: start=%p, len=%d\n",
                                realobj, size);
                         print_objinfo(cachep, objp, 2);
@@ -1417,25 +1471,23 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
 }
 #endif
 
-/* Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache.
- * The cache-lock is not held/needed.
+#if DEBUG
+/**
+ * slab_destroy_objs - call the registered destructor for each object in
+ *      a slab that is to be destroyed.
  */
-static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
+static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
-        void *addr = slabp->s_mem - slabp->colouroff;
-
-#if DEBUG
         int i;
         for (i = 0; i < cachep->num; i++) {
-                void *objp = slabp->s_mem + cachep->objsize * i;
+                void *objp = slabp->s_mem + cachep->buffer_size * i;
 
                 if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-                        if ((cachep->objsize % PAGE_SIZE) == 0
+                        if ((cachep->buffer_size % PAGE_SIZE) == 0
                             && OFF_SLAB(cachep))
                                 kernel_map_pages(virt_to_page(objp),
-                                                 cachep->objsize / PAGE_SIZE,
+                                                 cachep->buffer_size / PAGE_SIZE,
                                                  1);
                         else
                                 check_poison_obj(cachep, objp);
@@ -1452,18 +1504,32 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
                                    "was overwritten");
                 }
                 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
-                        (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
+                        (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
         }
+}
 #else
+static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
+{
         if (cachep->dtor) {
                 int i;
                 for (i = 0; i < cachep->num; i++) {
-                        void *objp = slabp->s_mem + cachep->objsize * i;
+                        void *objp = slabp->s_mem + cachep->buffer_size * i;
                         (cachep->dtor) (objp, cachep, 0);
                 }
         }
+}
 #endif
 
+/**
+ * Destroy all the objs in a slab, and release the mem back to the system.
+ * Before calling the slab must have been unlinked from the cache.
+ * The cache-lock is not held/needed.
+ */
+static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
+{
+        void *addr = slabp->s_mem - slabp->colouroff;
+
+        slab_destroy_objs(cachep, slabp);
         if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
                 struct slab_rcu *slab_rcu;
 
@@ -1478,9 +1544,9 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
         }
 }
 
-/* For setting up all the kmem_list3s for cache whose objsize is same
+/* For setting up all the kmem_list3s for cache whose buffer_size is same
    as size of kmem_list3. */
-static inline void set_up_list3s(kmem_cache_t *cachep, int index)
+static void set_up_list3s(struct kmem_cache *cachep, int index)
 {
         int node;
 
@@ -1493,15 +1559,20 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
 }
 
 /**
- * calculate_slab_order - calculate size (page order) of slabs and the number
- * of objects per slab.
+ * calculate_slab_order - calculate size (page order) of slabs
+ * @cachep: pointer to the cache that is being created
+ * @size: size of objects to be created in this cache.
+ * @align: required alignment for the objects.
+ * @flags: slab allocation flags
+ *
+ * Also calculates the number of objects per slab.
  *
  * This could be made much more intelligent.  For now, try to avoid using
  * high order pages for slabs.  When the gfp() functions are more friendly
  * towards high-order requests, this should be changed.
  */
-static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
-                size_t align, gfp_t flags)
+static inline size_t calculate_slab_order(struct kmem_cache *cachep,
+                        size_t size, size_t align, unsigned long flags)
 {
         size_t left_over = 0;
 
@@ -1572,13 +1643,13 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
  */
-kmem_cache_t *
+struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-        unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
-        void (*dtor)(void*, kmem_cache_t *, unsigned long))
+        unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
+        void (*dtor)(void*, struct kmem_cache *, unsigned long))
 {
         size_t left_over, slab_size, ralign;
-        kmem_cache_t *cachep = NULL;
+        struct kmem_cache *cachep = NULL;
         struct list_head *p;
 
         /*
@@ -1596,7 +1667,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         mutex_lock(&cache_chain_mutex);
 
         list_for_each(p, &cache_chain) {
-                kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
+                struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
                 mm_segment_t old_fs = get_fs();
                 char tmp;
                 int res;
@@ -1611,7 +1682,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                 set_fs(old_fs);
                 if (res) {
                         printk("SLAB: cache with size %d has lost its name\n",
-                               pc->objsize);
+                               pc->buffer_size);
                         continue;
                 }
 
@@ -1696,20 +1767,20 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         align = ralign;
 
         /* Get cache's description obj. */
-        cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
+        cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
         if (!cachep)
                 goto oops;
-        memset(cachep, 0, sizeof(kmem_cache_t));
+        memset(cachep, 0, sizeof(struct kmem_cache));
 
 #if DEBUG
-        cachep->reallen = size;
+        cachep->obj_size = size;
 
         if (flags & SLAB_RED_ZONE) {
                 /* redzoning only works with word aligned caches */
                 align = BYTES_PER_WORD;
 
                 /* add space for red zone words */
-                cachep->dbghead += BYTES_PER_WORD;
+                cachep->obj_offset += BYTES_PER_WORD;
                 size += 2 * BYTES_PER_WORD;
         }
         if (flags & SLAB_STORE_USER) {
@@ -1722,8 +1793,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         }
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
         if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
-            && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
-                cachep->dbghead += PAGE_SIZE - size;
+            && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
+                cachep->obj_offset += PAGE_SIZE - size;
                 size = PAGE_SIZE;
         }
 #endif
@@ -1786,7 +1857,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         if (flags & SLAB_CACHE_DMA)
                 cachep->gfpflags |= GFP_DMA;
         spin_lock_init(&cachep->spinlock);
-        cachep->objsize = size;
+        cachep->buffer_size = size;
 
         if (flags & CFLGS_OFF_SLAB)
                 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -1843,11 +1914,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                         jiffies + REAPTIMEOUT_LIST3 +
                         ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
-                BUG_ON(!ac_data(cachep));
-                ac_data(cachep)->avail = 0;
-                ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-                ac_data(cachep)->batchcount = 1;
-                ac_data(cachep)->touched = 0;
+                BUG_ON(!cpu_cache_get(cachep));
+                cpu_cache_get(cachep)->avail = 0;
+                cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+                cpu_cache_get(cachep)->batchcount = 1;
+                cpu_cache_get(cachep)->touched = 0;
                 cachep->batchcount = 1;
                 cachep->limit = BOOT_CPUCACHE_ENTRIES;
         }
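kmem_cache_create() and its constructor/destructor callbacks now spell out struct kmem_cache instead of the kmem_cache_t typedef. A hedged usage sketch against that signature (cache and type names are invented; the six-argument form with a NULL destructor matches the prototype in this patch):

#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>

struct foo {
        int a;
        int b;
};

static void foo_ctor(void *obj, struct kmem_cache *cachep, unsigned long flags)
{
        struct foo *f = obj;

        f->a = 0;
        f->b = 0;
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, 0, foo_ctor, NULL);
        return foo_cachep ? 0 : -ENOMEM;
}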
@@ -1875,7 +1946,7 @@ static void check_irq_on(void)
1875 BUG_ON(irqs_disabled()); 1946 BUG_ON(irqs_disabled());
1876} 1947}
1877 1948
1878static void check_spinlock_acquired(kmem_cache_t *cachep) 1949static void check_spinlock_acquired(struct kmem_cache *cachep)
1879{ 1950{
1880#ifdef CONFIG_SMP 1951#ifdef CONFIG_SMP
1881 check_irq_off(); 1952 check_irq_off();
@@ -1883,7 +1954,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep)
1883#endif 1954#endif
1884} 1955}
1885 1956
1886static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) 1957static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
1887{ 1958{
1888#ifdef CONFIG_SMP 1959#ifdef CONFIG_SMP
1889 check_irq_off(); 1960 check_irq_off();
@@ -1916,24 +1987,24 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1916 preempt_enable(); 1987 preempt_enable();
1917} 1988}
1918 1989
1919static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 1990static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
1920 int force, int node); 1991 int force, int node);
1921 1992
1922static void do_drain(void *arg) 1993static void do_drain(void *arg)
1923{ 1994{
1924 kmem_cache_t *cachep = (kmem_cache_t *) arg; 1995 struct kmem_cache *cachep = (struct kmem_cache *) arg;
1925 struct array_cache *ac; 1996 struct array_cache *ac;
1926 int node = numa_node_id(); 1997 int node = numa_node_id();
1927 1998
1928 check_irq_off(); 1999 check_irq_off();
1929 ac = ac_data(cachep); 2000 ac = cpu_cache_get(cachep);
1930 spin_lock(&cachep->nodelists[node]->list_lock); 2001 spin_lock(&cachep->nodelists[node]->list_lock);
1931 free_block(cachep, ac->entry, ac->avail, node); 2002 free_block(cachep, ac->entry, ac->avail, node);
1932 spin_unlock(&cachep->nodelists[node]->list_lock); 2003 spin_unlock(&cachep->nodelists[node]->list_lock);
1933 ac->avail = 0; 2004 ac->avail = 0;
1934} 2005}
1935 2006
1936static void drain_cpu_caches(kmem_cache_t *cachep) 2007static void drain_cpu_caches(struct kmem_cache *cachep)
1937{ 2008{
1938 struct kmem_list3 *l3; 2009 struct kmem_list3 *l3;
1939 int node; 2010 int node;
@@ -1954,7 +2025,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
1954 spin_unlock_irq(&cachep->spinlock); 2025 spin_unlock_irq(&cachep->spinlock);
1955} 2026}
1956 2027
1957static int __node_shrink(kmem_cache_t *cachep, int node) 2028static int __node_shrink(struct kmem_cache *cachep, int node)
1958{ 2029{
1959 struct slab *slabp; 2030 struct slab *slabp;
1960 struct kmem_list3 *l3 = cachep->nodelists[node]; 2031 struct kmem_list3 *l3 = cachep->nodelists[node];
@@ -1983,7 +2054,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1983 return ret; 2054 return ret;
1984} 2055}
1985 2056
1986static int __cache_shrink(kmem_cache_t *cachep) 2057static int __cache_shrink(struct kmem_cache *cachep)
1987{ 2058{
1988 int ret = 0, i = 0; 2059 int ret = 0, i = 0;
1989 struct kmem_list3 *l3; 2060 struct kmem_list3 *l3;
@@ -2009,7 +2080,7 @@ static int __cache_shrink(kmem_cache_t *cachep)
2009 * Releases as many slabs as possible for a cache. 2080 * Releases as many slabs as possible for a cache.
2010 * To help debugging, a zero exit status indicates all slabs were released. 2081 * To help debugging, a zero exit status indicates all slabs were released.
2011 */ 2082 */
2012int kmem_cache_shrink(kmem_cache_t *cachep) 2083int kmem_cache_shrink(struct kmem_cache *cachep)
2013{ 2084{
2014 if (!cachep || in_interrupt()) 2085 if (!cachep || in_interrupt())
2015 BUG(); 2086 BUG();
@@ -2022,7 +2093,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2022 * kmem_cache_destroy - delete a cache 2093 * kmem_cache_destroy - delete a cache
2023 * @cachep: the cache to destroy 2094 * @cachep: the cache to destroy
2024 * 2095 *
2025 * Remove a kmem_cache_t object from the slab cache. 2096 * Remove a struct kmem_cache object from the slab cache.
2026 * Returns 0 on success. 2097 * Returns 0 on success.
2027 * 2098 *
2028 * It is expected this function will be called by a module when it is 2099 * It is expected this function will be called by a module when it is
@@ -2035,7 +2106,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2035 * The caller must guarantee that no one will allocate memory from the cache 2106 * The caller must guarantee that no one will allocate memory from the cache
2036 * during the kmem_cache_destroy(). 2107 * during the kmem_cache_destroy().
2037 */ 2108 */
2038int kmem_cache_destroy(kmem_cache_t *cachep) 2109int kmem_cache_destroy(struct kmem_cache *cachep)
2039{ 2110{
2040 int i; 2111 int i;
2041 struct kmem_list3 *l3; 2112 struct kmem_list3 *l3;
@@ -2086,7 +2157,7 @@ int kmem_cache_destroy(kmem_cache_t *cachep)
2086EXPORT_SYMBOL(kmem_cache_destroy); 2157EXPORT_SYMBOL(kmem_cache_destroy);
2087 2158
2088/* Get the memory for a slab management obj. */ 2159/* Get the memory for a slab management obj. */
2089static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2160static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2090 int colour_off, gfp_t local_flags) 2161 int colour_off, gfp_t local_flags)
2091{ 2162{
2092 struct slab *slabp; 2163 struct slab *slabp;
@@ -2112,13 +2183,13 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2112 return (kmem_bufctl_t *) (slabp + 1); 2183 return (kmem_bufctl_t *) (slabp + 1);
2113} 2184}
2114 2185
2115static void cache_init_objs(kmem_cache_t *cachep, 2186static void cache_init_objs(struct kmem_cache *cachep,
2116 struct slab *slabp, unsigned long ctor_flags) 2187 struct slab *slabp, unsigned long ctor_flags)
2117{ 2188{
2118 int i; 2189 int i;
2119 2190
2120 for (i = 0; i < cachep->num; i++) { 2191 for (i = 0; i < cachep->num; i++) {
2121 void *objp = slabp->s_mem + cachep->objsize * i; 2192 void *objp = slabp->s_mem + cachep->buffer_size * i;
2122#if DEBUG 2193#if DEBUG
2123 /* need to poison the objs? */ 2194 /* need to poison the objs? */
2124 if (cachep->flags & SLAB_POISON) 2195 if (cachep->flags & SLAB_POISON)
@@ -2136,7 +2207,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2136 * Otherwise, deadlock. They must also be threaded. 2207 * Otherwise, deadlock. They must also be threaded.
2137 */ 2208 */
2138 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2209 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2139 cachep->ctor(objp + obj_dbghead(cachep), cachep, 2210 cachep->ctor(objp + obj_offset(cachep), cachep,
2140 ctor_flags); 2211 ctor_flags);
2141 2212
2142 if (cachep->flags & SLAB_RED_ZONE) { 2213 if (cachep->flags & SLAB_RED_ZONE) {
@@ -2147,10 +2218,10 @@ static void cache_init_objs(kmem_cache_t *cachep,
2147 slab_error(cachep, "constructor overwrote the" 2218 slab_error(cachep, "constructor overwrote the"
2148 " start of an object"); 2219 " start of an object");
2149 } 2220 }
2150 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2221 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2151 && cachep->flags & SLAB_POISON) 2222 && cachep->flags & SLAB_POISON)
2152 kernel_map_pages(virt_to_page(objp), 2223 kernel_map_pages(virt_to_page(objp),
2153 cachep->objsize / PAGE_SIZE, 0); 2224 cachep->buffer_size / PAGE_SIZE, 0);
2154#else 2225#else
2155 if (cachep->ctor) 2226 if (cachep->ctor)
2156 cachep->ctor(objp, cachep, ctor_flags); 2227 cachep->ctor(objp, cachep, ctor_flags);
@@ -2161,7 +2232,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2161 slabp->free = 0; 2232 slabp->free = 0;
2162} 2233}
2163 2234
2164static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) 2235static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2165{ 2236{
2166 if (flags & SLAB_DMA) { 2237 if (flags & SLAB_DMA) {
2167 if (!(cachep->gfpflags & GFP_DMA)) 2238 if (!(cachep->gfpflags & GFP_DMA))
@@ -2172,7 +2243,43 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
2172 } 2243 }
2173} 2244}
2174 2245
2175static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2246static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
2247{
2248 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
2249 kmem_bufctl_t next;
2250
2251 slabp->inuse++;
2252 next = slab_bufctl(slabp)[slabp->free];
2253#if DEBUG
2254 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2255 WARN_ON(slabp->nodeid != nodeid);
2256#endif
2257 slabp->free = next;
2258
2259 return objp;
2260}
2261
2262static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
2263 int nodeid)
2264{
2265 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
2266
2267#if DEBUG
2268 /* Verify that the slab belongs to the intended node */
2269 WARN_ON(slabp->nodeid != nodeid);
2270
2271 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2272 printk(KERN_ERR "slab: double free detected in cache "
2273 "'%s', objp %p\n", cachep->name, objp);
2274 BUG();
2275 }
2276#endif
2277 slab_bufctl(slabp)[objnr] = slabp->free;
2278 slabp->free = objnr;
2279 slabp->inuse--;
2280}
2281
2282static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
2176{ 2283{
2177 int i; 2284 int i;
2178 struct page *page; 2285 struct page *page;
@@ -2191,7 +2298,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2191 * Grow (by 1) the number of slabs within a cache. This is called by 2298 * Grow (by 1) the number of slabs within a cache. This is called by
2192 * kmem_cache_alloc() when there are no active objs left in a cache. 2299 * kmem_cache_alloc() when there are no active objs left in a cache.
2193 */ 2300 */
2194static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2301static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2195{ 2302{
2196 struct slab *slabp; 2303 struct slab *slabp;
2197 void *objp; 2304 void *objp;
@@ -2302,14 +2409,14 @@ static void kfree_debugcheck(const void *objp)
2302 } 2409 }
2303} 2410}
2304 2411
2305static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2412static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2306 void *caller) 2413 void *caller)
2307{ 2414{
2308 struct page *page; 2415 struct page *page;
2309 unsigned int objnr; 2416 unsigned int objnr;
2310 struct slab *slabp; 2417 struct slab *slabp;
2311 2418
2312 objp -= obj_dbghead(cachep); 2419 objp -= obj_offset(cachep);
2313 kfree_debugcheck(objp); 2420 kfree_debugcheck(objp);
2314 page = virt_to_page(objp); 2421 page = virt_to_page(objp);
2315 2422
@@ -2341,31 +2448,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2341 if (cachep->flags & SLAB_STORE_USER) 2448 if (cachep->flags & SLAB_STORE_USER)
2342 *dbg_userword(cachep, objp) = caller; 2449 *dbg_userword(cachep, objp) = caller;
2343 2450
2344 objnr = (objp - slabp->s_mem) / cachep->objsize; 2451 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2345 2452
2346 BUG_ON(objnr >= cachep->num); 2453 BUG_ON(objnr >= cachep->num);
2347 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); 2454 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
2348 2455
2349 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2456 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2350 /* Need to call the slab's constructor so the 2457 /* Need to call the slab's constructor so the
2351 * caller can perform a verify of its state (debugging). 2458 * caller can perform a verify of its state (debugging).
2352 * Called without the cache-lock held. 2459 * Called without the cache-lock held.
2353 */ 2460 */
2354 cachep->ctor(objp + obj_dbghead(cachep), 2461 cachep->ctor(objp + obj_offset(cachep),
2355 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2462 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2356 } 2463 }
2357 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2464 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2358 /* we want to cache poison the object, 2465 /* we want to cache poison the object,
2359 * call the destruction callback 2466 * call the destruction callback
2360 */ 2467 */
2361 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); 2468 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2362 } 2469 }
2363 if (cachep->flags & SLAB_POISON) { 2470 if (cachep->flags & SLAB_POISON) {
2364#ifdef CONFIG_DEBUG_PAGEALLOC 2471#ifdef CONFIG_DEBUG_PAGEALLOC
2365 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2472 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2366 store_stackinfo(cachep, objp, (unsigned long)caller); 2473 store_stackinfo(cachep, objp, (unsigned long)caller);
2367 kernel_map_pages(virt_to_page(objp), 2474 kernel_map_pages(virt_to_page(objp),
2368 cachep->objsize / PAGE_SIZE, 0); 2475 cachep->buffer_size / PAGE_SIZE, 0);
2369 } else { 2476 } else {
2370 poison_obj(cachep, objp, POISON_FREE); 2477 poison_obj(cachep, objp, POISON_FREE);
2371 } 2478 }
@@ -2376,7 +2483,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2376 return objp; 2483 return objp;
2377} 2484}
2378 2485
2379static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2486static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2380{ 2487{
2381 kmem_bufctl_t i; 2488 kmem_bufctl_t i;
2382 int entries = 0; 2489 int entries = 0;
@@ -2409,14 +2516,14 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2409#define check_slabp(x,y) do { } while(0) 2516#define check_slabp(x,y) do { } while(0)
2410#endif 2517#endif
2411 2518
2412static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) 2519static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2413{ 2520{
2414 int batchcount; 2521 int batchcount;
2415 struct kmem_list3 *l3; 2522 struct kmem_list3 *l3;
2416 struct array_cache *ac; 2523 struct array_cache *ac;
2417 2524
2418 check_irq_off(); 2525 check_irq_off();
2419 ac = ac_data(cachep); 2526 ac = cpu_cache_get(cachep);
2420 retry: 2527 retry:
2421 batchcount = ac->batchcount; 2528 batchcount = ac->batchcount;
2422 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2529 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2461,22 +2568,12 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2461 check_slabp(cachep, slabp); 2568 check_slabp(cachep, slabp);
2462 check_spinlock_acquired(cachep); 2569 check_spinlock_acquired(cachep);
2463 while (slabp->inuse < cachep->num && batchcount--) { 2570 while (slabp->inuse < cachep->num && batchcount--) {
2464 kmem_bufctl_t next;
2465 STATS_INC_ALLOCED(cachep); 2571 STATS_INC_ALLOCED(cachep);
2466 STATS_INC_ACTIVE(cachep); 2572 STATS_INC_ACTIVE(cachep);
2467 STATS_SET_HIGH(cachep); 2573 STATS_SET_HIGH(cachep);
2468 2574
2469 /* get obj pointer */ 2575 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2470 ac->entry[ac->avail++] = slabp->s_mem + 2576 numa_node_id());
2471 slabp->free * cachep->objsize;
2472
2473 slabp->inuse++;
2474 next = slab_bufctl(slabp)[slabp->free];
2475#if DEBUG
2476 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2477 WARN_ON(numa_node_id() != slabp->nodeid);
2478#endif
2479 slabp->free = next;
2480 } 2577 }
2481 check_slabp(cachep, slabp); 2578 check_slabp(cachep, slabp);
2482 2579
@@ -2498,7 +2595,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2498 x = cache_grow(cachep, flags, numa_node_id()); 2595 x = cache_grow(cachep, flags, numa_node_id());
2499 2596
2500 // cache_grow can reenable interrupts, then ac could change. 2597 // cache_grow can reenable interrupts, then ac could change.
2501 ac = ac_data(cachep); 2598 ac = cpu_cache_get(cachep);
2502 if (!x && ac->avail == 0) // no objects in sight? abort 2599 if (!x && ac->avail == 0) // no objects in sight? abort
2503 return NULL; 2600 return NULL;
2504 2601
@@ -2510,7 +2607,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2510} 2607}
2511 2608
2512static inline void 2609static inline void
2513cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) 2610cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2514{ 2611{
2515 might_sleep_if(flags & __GFP_WAIT); 2612 might_sleep_if(flags & __GFP_WAIT);
2516#if DEBUG 2613#if DEBUG
@@ -2519,16 +2616,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2519} 2616}
2520 2617
2521#if DEBUG 2618#if DEBUG
2522static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, 2619static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
2523 void *objp, void *caller) 2620 void *objp, void *caller)
2524{ 2621{
2525 if (!objp) 2622 if (!objp)
2526 return objp; 2623 return objp;
2527 if (cachep->flags & SLAB_POISON) { 2624 if (cachep->flags & SLAB_POISON) {
2528#ifdef CONFIG_DEBUG_PAGEALLOC 2625#ifdef CONFIG_DEBUG_PAGEALLOC
2529 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2626 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2530 kernel_map_pages(virt_to_page(objp), 2627 kernel_map_pages(virt_to_page(objp),
2531 cachep->objsize / PAGE_SIZE, 1); 2628 cachep->buffer_size / PAGE_SIZE, 1);
2532 else 2629 else
2533 check_poison_obj(cachep, objp); 2630 check_poison_obj(cachep, objp);
2534#else 2631#else
@@ -2553,7 +2650,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2553 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2650 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2554 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2651 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2555 } 2652 }
2556 objp += obj_dbghead(cachep); 2653 objp += obj_offset(cachep);
2557 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2654 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2558 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2655 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2559 2656
@@ -2568,7 +2665,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2568#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2665#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2569#endif 2666#endif
2570 2667
2571static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2668static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2572{ 2669{
2573 void *objp; 2670 void *objp;
2574 struct array_cache *ac; 2671 struct array_cache *ac;
@@ -2583,7 +2680,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2583#endif 2680#endif
2584 2681
2585 check_irq_off(); 2682 check_irq_off();
2586 ac = ac_data(cachep); 2683 ac = cpu_cache_get(cachep);
2587 if (likely(ac->avail)) { 2684 if (likely(ac->avail)) {
2588 STATS_INC_ALLOCHIT(cachep); 2685 STATS_INC_ALLOCHIT(cachep);
2589 ac->touched = 1; 2686 ac->touched = 1;
@@ -2595,7 +2692,8 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2595 return objp; 2692 return objp;
2596} 2693}
2597 2694
2598static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2695static __always_inline void *
2696__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2599{ 2697{
2600 unsigned long save_flags; 2698 unsigned long save_flags;
2601 void *objp; 2699 void *objp;
@@ -2606,7 +2704,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2606 objp = ____cache_alloc(cachep, flags); 2704 objp = ____cache_alloc(cachep, flags);
2607 local_irq_restore(save_flags); 2705 local_irq_restore(save_flags);
2608 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2706 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2609 __builtin_return_address(0)); 2707 caller);
2610 prefetchw(objp); 2708 prefetchw(objp);
2611 return objp; 2709 return objp;
2612} 2710}
@@ -2615,13 +2713,12 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2615/* 2713/*
2616 * An interface to enable slab creation on nodeid 2714 * An interface to enable slab creation on nodeid
2617 */ 2715 */
2618static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2716static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2619{ 2717{
2620 struct list_head *entry; 2718 struct list_head *entry;
2621 struct slab *slabp; 2719 struct slab *slabp;
2622 struct kmem_list3 *l3; 2720 struct kmem_list3 *l3;
2623 void *obj; 2721 void *obj;
2624 kmem_bufctl_t next;
2625 int x; 2722 int x;
2626 2723
2627 l3 = cachep->nodelists[nodeid]; 2724 l3 = cachep->nodelists[nodeid];
@@ -2647,14 +2744,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2647 2744
2648 BUG_ON(slabp->inuse == cachep->num); 2745 BUG_ON(slabp->inuse == cachep->num);
2649 2746
2650 /* get obj pointer */ 2747 obj = slab_get_obj(cachep, slabp, nodeid);
2651 obj = slabp->s_mem + slabp->free * cachep->objsize;
2652 slabp->inuse++;
2653 next = slab_bufctl(slabp)[slabp->free];
2654#if DEBUG
2655 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2656#endif
2657 slabp->free = next;
2658 check_slabp(cachep, slabp); 2748 check_slabp(cachep, slabp);
2659 l3->free_objects--; 2749 l3->free_objects--;
2660 /* move slabp to correct slabp list: */ 2750 /* move slabp to correct slabp list: */
@@ -2685,7 +2775,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2685/* 2775/*
2686 * Caller needs to acquire correct kmem_list's list_lock 2776 * Caller needs to acquire correct kmem_list's list_lock
2687 */ 2777 */
2688static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, 2778static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
2689 int node) 2779 int node)
2690{ 2780{
2691 int i; 2781 int i;
@@ -2694,29 +2784,14 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2694 for (i = 0; i < nr_objects; i++) { 2784 for (i = 0; i < nr_objects; i++) {
2695 void *objp = objpp[i]; 2785 void *objp = objpp[i];
2696 struct slab *slabp; 2786 struct slab *slabp;
2697 unsigned int objnr;
2698 2787
2699 slabp = page_get_slab(virt_to_page(objp)); 2788 slabp = virt_to_slab(objp);
2700 l3 = cachep->nodelists[node]; 2789 l3 = cachep->nodelists[node];
2701 list_del(&slabp->list); 2790 list_del(&slabp->list);
2702 objnr = (objp - slabp->s_mem) / cachep->objsize;
2703 check_spinlock_acquired_node(cachep, node); 2791 check_spinlock_acquired_node(cachep, node);
2704 check_slabp(cachep, slabp); 2792 check_slabp(cachep, slabp);
2705 2793 slab_put_obj(cachep, slabp, objp, node);
2706#if DEBUG
2707 /* Verify that the slab belongs to the intended node */
2708 WARN_ON(slabp->nodeid != node);
2709
2710 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2711 printk(KERN_ERR "slab: double free detected in cache "
2712 "'%s', objp %p\n", cachep->name, objp);
2713 BUG();
2714 }
2715#endif
2716 slab_bufctl(slabp)[objnr] = slabp->free;
2717 slabp->free = objnr;
2718 STATS_DEC_ACTIVE(cachep); 2794 STATS_DEC_ACTIVE(cachep);
2719 slabp->inuse--;
2720 l3->free_objects++; 2795 l3->free_objects++;
2721 check_slabp(cachep, slabp); 2796 check_slabp(cachep, slabp);
2722 2797
@@ -2738,7 +2813,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2738 } 2813 }
2739} 2814}
2740 2815
2741static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2816static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2742{ 2817{
2743 int batchcount; 2818 int batchcount;
2744 struct kmem_list3 *l3; 2819 struct kmem_list3 *l3;
@@ -2797,9 +2872,9 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2797 * 2872 *
2798 * Called with disabled ints. 2873 * Called with disabled ints.
2799 */ 2874 */
2800static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2875static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2801{ 2876{
2802 struct array_cache *ac = ac_data(cachep); 2877 struct array_cache *ac = cpu_cache_get(cachep);
2803 2878
2804 check_irq_off(); 2879 check_irq_off();
2805 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2880 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
@@ -2810,7 +2885,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2810#ifdef CONFIG_NUMA 2885#ifdef CONFIG_NUMA
2811 { 2886 {
2812 struct slab *slabp; 2887 struct slab *slabp;
2813 slabp = page_get_slab(virt_to_page(objp)); 2888 slabp = virt_to_slab(objp);
2814 if (unlikely(slabp->nodeid != numa_node_id())) { 2889 if (unlikely(slabp->nodeid != numa_node_id())) {
2815 struct array_cache *alien = NULL; 2890 struct array_cache *alien = NULL;
2816 int nodeid = slabp->nodeid; 2891 int nodeid = slabp->nodeid;
@@ -2856,9 +2931,9 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2856 * Allocate an object from this cache. The flags are only relevant 2931 * Allocate an object from this cache. The flags are only relevant
2857 * if the cache has no available objects. 2932 * if the cache has no available objects.
2858 */ 2933 */
2859void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2934void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2860{ 2935{
2861 return __cache_alloc(cachep, flags); 2936 return __cache_alloc(cachep, flags, __builtin_return_address(0));
2862} 2937}
2863EXPORT_SYMBOL(kmem_cache_alloc); 2938EXPORT_SYMBOL(kmem_cache_alloc);
2864 2939
@@ -2876,12 +2951,12 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2876 * 2951 *
2877 * Currently only used for dentry validation. 2952 * Currently only used for dentry validation.
2878 */ 2953 */
2879int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2954int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
2880{ 2955{
2881 unsigned long addr = (unsigned long)ptr; 2956 unsigned long addr = (unsigned long)ptr;
2882 unsigned long min_addr = PAGE_OFFSET; 2957 unsigned long min_addr = PAGE_OFFSET;
2883 unsigned long align_mask = BYTES_PER_WORD - 1; 2958 unsigned long align_mask = BYTES_PER_WORD - 1;
2884 unsigned long size = cachep->objsize; 2959 unsigned long size = cachep->buffer_size;
2885 struct page *page; 2960 struct page *page;
2886 2961
2887 if (unlikely(addr < min_addr)) 2962 if (unlikely(addr < min_addr))
@@ -2917,32 +2992,23 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2917 * New and improved: it will now make sure that the object gets 2992 * New and improved: it will now make sure that the object gets
2918 * put on the correct node list so that there is no false sharing. 2993 * put on the correct node list so that there is no false sharing.
2919 */ 2994 */
2920void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2995void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2921{ 2996{
2922 unsigned long save_flags; 2997 unsigned long save_flags;
2923 void *ptr; 2998 void *ptr;
2924 2999
2925 if (nodeid == -1)
2926 return __cache_alloc(cachep, flags);
2927
2928 if (unlikely(!cachep->nodelists[nodeid])) {
2929 /* Fall back to __cache_alloc if we run into trouble */
2930 printk(KERN_WARNING
2931 "slab: not allocating in inactive node %d for cache %s\n",
2932 nodeid, cachep->name);
2933 return __cache_alloc(cachep, flags);
2934 }
2935
2936 cache_alloc_debugcheck_before(cachep, flags); 3000 cache_alloc_debugcheck_before(cachep, flags);
2937 local_irq_save(save_flags); 3001 local_irq_save(save_flags);
2938 if (nodeid == numa_node_id()) 3002
3003 if (nodeid == -1 || nodeid == numa_node_id() ||
3004 !cachep->nodelists[nodeid])
2939 ptr = ____cache_alloc(cachep, flags); 3005 ptr = ____cache_alloc(cachep, flags);
2940 else 3006 else
2941 ptr = __cache_alloc_node(cachep, flags, nodeid); 3007 ptr = __cache_alloc_node(cachep, flags, nodeid);
2942 local_irq_restore(save_flags); 3008 local_irq_restore(save_flags);
2943 ptr = 3009
2944 cache_alloc_debugcheck_after(cachep, flags, ptr, 3010 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
2945 __builtin_return_address(0)); 3011 __builtin_return_address(0));
2946 3012
2947 return ptr; 3013 return ptr;
2948} 3014}
@@ -2950,7 +3016,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
2950 3016
2951void *kmalloc_node(size_t size, gfp_t flags, int node) 3017void *kmalloc_node(size_t size, gfp_t flags, int node)
2952{ 3018{
2953 kmem_cache_t *cachep; 3019 struct kmem_cache *cachep;
2954 3020
2955 cachep = kmem_find_general_cachep(size, flags); 3021 cachep = kmem_find_general_cachep(size, flags);
2956 if (unlikely(cachep == NULL)) 3022 if (unlikely(cachep == NULL))
@@ -2981,9 +3047,10 @@ EXPORT_SYMBOL(kmalloc_node);
2981 * platforms. For example, on i386, it means that the memory must come 3047 * platforms. For example, on i386, it means that the memory must come
2982 * from the first 16MB. 3048 * from the first 16MB.
2983 */ 3049 */
2984void *__kmalloc(size_t size, gfp_t flags) 3050static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3051 void *caller)
2985{ 3052{
2986 kmem_cache_t *cachep; 3053 struct kmem_cache *cachep;
2987 3054
2988 /* If you want to save a few bytes .text space: replace 3055 /* If you want to save a few bytes .text space: replace
2989 * __ with kmem_. 3056 * __ with kmem_.
@@ -2993,10 +3060,27 @@ void *__kmalloc(size_t size, gfp_t flags)
2993 cachep = __find_general_cachep(size, flags); 3060 cachep = __find_general_cachep(size, flags);
2994 if (unlikely(cachep == NULL)) 3061 if (unlikely(cachep == NULL))
2995 return NULL; 3062 return NULL;
2996 return __cache_alloc(cachep, flags); 3063 return __cache_alloc(cachep, flags, caller);
3064}
3065
3066#ifndef CONFIG_DEBUG_SLAB
3067
3068void *__kmalloc(size_t size, gfp_t flags)
3069{
3070 return __do_kmalloc(size, flags, NULL);
2997} 3071}
2998EXPORT_SYMBOL(__kmalloc); 3072EXPORT_SYMBOL(__kmalloc);
2999 3073
3074#else
3075
3076void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3077{
3078 return __do_kmalloc(size, flags, caller);
3079}
3080EXPORT_SYMBOL(__kmalloc_track_caller);
3081
3082#endif
3083
3000#ifdef CONFIG_SMP 3084#ifdef CONFIG_SMP
3001/** 3085/**
3002 * __alloc_percpu - allocate one copy of the object for every present 3086 * __alloc_percpu - allocate one copy of the object for every present
@@ -3054,7 +3138,7 @@ EXPORT_SYMBOL(__alloc_percpu);
3054 * Free an object which was previously allocated from this 3138 * Free an object which was previously allocated from this
3055 * cache. 3139 * cache.
3056 */ 3140 */
3057void kmem_cache_free(kmem_cache_t *cachep, void *objp) 3141void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3058{ 3142{
3059 unsigned long flags; 3143 unsigned long flags;
3060 3144
@@ -3075,15 +3159,15 @@ EXPORT_SYMBOL(kmem_cache_free);
3075 */ 3159 */
3076void kfree(const void *objp) 3160void kfree(const void *objp)
3077{ 3161{
3078 kmem_cache_t *c; 3162 struct kmem_cache *c;
3079 unsigned long flags; 3163 unsigned long flags;
3080 3164
3081 if (unlikely(!objp)) 3165 if (unlikely(!objp))
3082 return; 3166 return;
3083 local_irq_save(flags); 3167 local_irq_save(flags);
3084 kfree_debugcheck(objp); 3168 kfree_debugcheck(objp);
3085 c = page_get_cache(virt_to_page(objp)); 3169 c = virt_to_cache(objp);
3086 mutex_debug_check_no_locks_freed(objp, obj_reallen(c)); 3170 mutex_debug_check_no_locks_freed(objp, obj_size(c));
3087 __cache_free(c, (void *)objp); 3171 __cache_free(c, (void *)objp);
3088 local_irq_restore(flags); 3172 local_irq_restore(flags);
3089} 3173}
@@ -3112,13 +3196,13 @@ void free_percpu(const void *objp)
3112EXPORT_SYMBOL(free_percpu); 3196EXPORT_SYMBOL(free_percpu);
3113#endif 3197#endif
3114 3198
3115unsigned int kmem_cache_size(kmem_cache_t *cachep) 3199unsigned int kmem_cache_size(struct kmem_cache *cachep)
3116{ 3200{
3117 return obj_reallen(cachep); 3201 return obj_size(cachep);
3118} 3202}
3119EXPORT_SYMBOL(kmem_cache_size); 3203EXPORT_SYMBOL(kmem_cache_size);
3120 3204
3121const char *kmem_cache_name(kmem_cache_t *cachep) 3205const char *kmem_cache_name(struct kmem_cache *cachep)
3122{ 3206{
3123 return cachep->name; 3207 return cachep->name;
3124} 3208}
@@ -3127,7 +3211,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3127/* 3211/*
3128 * This initializes kmem_list3 for all nodes. 3212 * This initializes kmem_list3 for all nodes.
3129 */ 3213 */
3130static int alloc_kmemlist(kmem_cache_t *cachep) 3214static int alloc_kmemlist(struct kmem_cache *cachep)
3131{ 3215{
3132 int node; 3216 int node;
3133 struct kmem_list3 *l3; 3217 struct kmem_list3 *l3;
@@ -3183,7 +3267,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3183} 3267}
3184 3268
3185struct ccupdate_struct { 3269struct ccupdate_struct {
3186 kmem_cache_t *cachep; 3270 struct kmem_cache *cachep;
3187 struct array_cache *new[NR_CPUS]; 3271 struct array_cache *new[NR_CPUS];
3188}; 3272};
3189 3273
@@ -3193,13 +3277,13 @@ static void do_ccupdate_local(void *info)
3193 struct array_cache *old; 3277 struct array_cache *old;
3194 3278
3195 check_irq_off(); 3279 check_irq_off();
3196 old = ac_data(new->cachep); 3280 old = cpu_cache_get(new->cachep);
3197 3281
3198 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3282 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3199 new->new[smp_processor_id()] = old; 3283 new->new[smp_processor_id()] = old;
3200} 3284}
3201 3285
3202static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3286static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
3203 int shared) 3287 int shared)
3204{ 3288{
3205 struct ccupdate_struct new; 3289 struct ccupdate_struct new;
@@ -3245,7 +3329,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3245 return 0; 3329 return 0;
3246} 3330}
3247 3331
3248static void enable_cpucache(kmem_cache_t *cachep) 3332static void enable_cpucache(struct kmem_cache *cachep)
3249{ 3333{
3250 int err; 3334 int err;
3251 int limit, shared; 3335 int limit, shared;
@@ -3258,13 +3342,13 @@ static void enable_cpucache(kmem_cache_t *cachep)
3258 * The numbers are guessed, we should auto-tune as described by 3342 * The numbers are guessed, we should auto-tune as described by
3259 * Bonwick. 3343 * Bonwick.
3260 */ 3344 */
3261 if (cachep->objsize > 131072) 3345 if (cachep->buffer_size > 131072)
3262 limit = 1; 3346 limit = 1;
3263 else if (cachep->objsize > PAGE_SIZE) 3347 else if (cachep->buffer_size > PAGE_SIZE)
3264 limit = 8; 3348 limit = 8;
3265 else if (cachep->objsize > 1024) 3349 else if (cachep->buffer_size > 1024)
3266 limit = 24; 3350 limit = 24;
3267 else if (cachep->objsize > 256) 3351 else if (cachep->buffer_size > 256)
3268 limit = 54; 3352 limit = 54;
3269 else 3353 else
3270 limit = 120; 3354 limit = 120;
@@ -3279,7 +3363,7 @@ static void enable_cpucache(kmem_cache_t *cachep)
3279 */ 3363 */
3280 shared = 0; 3364 shared = 0;
3281#ifdef CONFIG_SMP 3365#ifdef CONFIG_SMP
3282 if (cachep->objsize <= PAGE_SIZE) 3366 if (cachep->buffer_size <= PAGE_SIZE)
3283 shared = 8; 3367 shared = 8;
3284#endif 3368#endif
3285 3369
@@ -3297,7 +3381,7 @@ static void enable_cpucache(kmem_cache_t *cachep)
3297 cachep->name, -err); 3381 cachep->name, -err);
3298} 3382}
3299 3383
3300static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 3384static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
3301 int force, int node) 3385 int force, int node)
3302{ 3386{
3303 int tofree; 3387 int tofree;
@@ -3342,12 +3426,12 @@ static void cache_reap(void *unused)
3342 } 3426 }
3343 3427
3344 list_for_each(walk, &cache_chain) { 3428 list_for_each(walk, &cache_chain) {
3345 kmem_cache_t *searchp; 3429 struct kmem_cache *searchp;
3346 struct list_head *p; 3430 struct list_head *p;
3347 int tofree; 3431 int tofree;
3348 struct slab *slabp; 3432 struct slab *slabp;
3349 3433
3350 searchp = list_entry(walk, kmem_cache_t, next); 3434 searchp = list_entry(walk, struct kmem_cache, next);
3351 3435
3352 if (searchp->flags & SLAB_NO_REAP) 3436 if (searchp->flags & SLAB_NO_REAP)
3353 goto next; 3437 goto next;
@@ -3359,7 +3443,7 @@ static void cache_reap(void *unused)
3359 drain_alien_cache(searchp, l3); 3443 drain_alien_cache(searchp, l3);
3360 spin_lock_irq(&l3->list_lock); 3444 spin_lock_irq(&l3->list_lock);
3361 3445
3362 drain_array_locked(searchp, ac_data(searchp), 0, 3446 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
3363 numa_node_id()); 3447 numa_node_id());
3364 3448
3365 if (time_after(l3->next_reap, jiffies)) 3449 if (time_after(l3->next_reap, jiffies))
@@ -3450,15 +3534,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
3450 if (p == &cache_chain) 3534 if (p == &cache_chain)
3451 return NULL; 3535 return NULL;
3452 } 3536 }
3453 return list_entry(p, kmem_cache_t, next); 3537 return list_entry(p, struct kmem_cache, next);
3454} 3538}
3455 3539
3456static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3540static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3457{ 3541{
3458 kmem_cache_t *cachep = p; 3542 struct kmem_cache *cachep = p;
3459 ++*pos; 3543 ++*pos;
3460 return cachep->next.next == &cache_chain ? NULL 3544 return cachep->next.next == &cache_chain ? NULL
3461 : list_entry(cachep->next.next, kmem_cache_t, next); 3545 : list_entry(cachep->next.next, struct kmem_cache, next);
3462} 3546}
3463 3547
3464static void s_stop(struct seq_file *m, void *p) 3548static void s_stop(struct seq_file *m, void *p)
@@ -3468,7 +3552,7 @@ static void s_stop(struct seq_file *m, void *p)
3468 3552
3469static int s_show(struct seq_file *m, void *p) 3553static int s_show(struct seq_file *m, void *p)
3470{ 3554{
3471 kmem_cache_t *cachep = p; 3555 struct kmem_cache *cachep = p;
3472 struct list_head *q; 3556 struct list_head *q;
3473 struct slab *slabp; 3557 struct slab *slabp;
3474 unsigned long active_objs; 3558 unsigned long active_objs;
@@ -3528,7 +3612,7 @@ static int s_show(struct seq_file *m, void *p)
3528 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3612 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3529 3613
3530 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3614 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3531 name, active_objs, num_objs, cachep->objsize, 3615 name, active_objs, num_objs, cachep->buffer_size,
3532 cachep->num, (1 << cachep->gfporder)); 3616 cachep->num, (1 << cachep->gfporder));
3533 seq_printf(m, " : tunables %4u %4u %4u", 3617 seq_printf(m, " : tunables %4u %4u %4u",
3534 cachep->limit, cachep->batchcount, cachep->shared); 3618 cachep->limit, cachep->batchcount, cachep->shared);
@@ -3618,7 +3702,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3618 mutex_lock(&cache_chain_mutex); 3702 mutex_lock(&cache_chain_mutex);
3619 res = -EINVAL; 3703 res = -EINVAL;
3620 list_for_each(p, &cache_chain) { 3704 list_for_each(p, &cache_chain) {
3621 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3705 struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
3706 next);
3622 3707
3623 if (!strcmp(cachep->name, kbuf)) { 3708 if (!strcmp(cachep->name, kbuf)) {
3624 if (limit < 1 || 3709 if (limit < 1 ||
@@ -3656,5 +3741,5 @@ unsigned int ksize(const void *objp)
3656 if (unlikely(objp == NULL)) 3741 if (unlikely(objp == NULL))
3657 return 0; 3742 return 0;
3658 3743
3659 return obj_reallen(page_get_cache(virt_to_page(objp))); 3744 return obj_size(virt_to_cache(objp));
3660} 3745}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7b09ac503fec..db8a3d3e1636 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,6 +27,7 @@ static struct address_space_operations swap_aops = {
27 .writepage = swap_writepage, 27 .writepage = swap_writepage,
28 .sync_page = block_sync_page, 28 .sync_page = block_sync_page,
29 .set_page_dirty = __set_page_dirty_nobuffers, 29 .set_page_dirty = __set_page_dirty_nobuffers,
30 .migratepage = migrate_page,
30}; 31};
31 32
32static struct backing_dev_info swap_backing_dev_info = { 33static struct backing_dev_info swap_backing_dev_info = {
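The swap_state.c hunk is the consumer side of the new address_space_operations hook: swap-backed pages now advertise migrate_page() as their ->migratepage method. As a hedged illustration only, another mapping that keeps no PagePrivate state could wire up the same generic helper, while one that must never move could point at fail_migrate_page() from the vmscan.c changes below; example_aops is an invented name and the header placement of the declarations is assumed.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>	/* assumed to declare migrate_page()/fail_migrate_page() */

static struct address_space_operations example_aops = {
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.migratepage	= migrate_page,	/* generic copy-based migration */
};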
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1e69c30d203..1f9cf0d073b8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -554,6 +554,15 @@ static int unuse_mm(struct mm_struct *mm,
554 return 0; 554 return 0;
555} 555}
556 556
557#ifdef CONFIG_MIGRATION
558int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
559{
560 swp_entry_t entry = { .val = page_private(page) };
561
562 return unuse_vma(vma, entry, page);
563}
564#endif
565
557/* 566/*
558 * Scan swap_map from current position to next entry still in use. 567 * Scan swap_map from current position to next entry still in use.
559 * Recycle to start on reaching the end, returning 0 when empty. 568 * Recycle to start on reaching the end, returning 0 when empty.
@@ -646,6 +655,7 @@ static int try_to_unuse(unsigned int type)
646 */ 655 */
647 swap_map = &si->swap_map[i]; 656 swap_map = &si->swap_map[i];
648 entry = swp_entry(type, i); 657 entry = swp_entry(type, i);
658again:
649 page = read_swap_cache_async(entry, NULL, 0); 659 page = read_swap_cache_async(entry, NULL, 0);
650 if (!page) { 660 if (!page) {
651 /* 661 /*
@@ -680,6 +690,12 @@ static int try_to_unuse(unsigned int type)
680 wait_on_page_locked(page); 690 wait_on_page_locked(page);
681 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
682 lock_page(page); 692 lock_page(page);
693 if (!PageSwapCache(page)) {
694 /* Page migration has occurred */
695 unlock_page(page);
696 page_cache_release(page);
697 goto again;
698 }
683 wait_on_page_writeback(page); 699 wait_on_page_writeback(page);
684 700
685 /* 701 /*
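try_to_unuse() now rechecks PageSwapCache after lock_page(): a cleared bit means migration replaced the page while this thread slept on the lock, so the stale reference is dropped and the lookup is retried. A hedged sketch of that idiom as a standalone helper, for illustration only; lookup_stable_swap_page() is an invented name.

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>

static struct page *lookup_stable_swap_page(swp_entry_t entry)
{
	struct page *page;

again:
	page = read_swap_cache_async(entry, NULL, 0);
	if (!page)
		return NULL;

	lock_page(page);
	if (!PageSwapCache(page)) {
		/* Migration moved the data; retry against the new location */
		unlock_page(page);
		page_cache_release(page);
		goto again;
	}
	return page;	/* locked, with the reference from the swap cache lookup */
}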
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34b61a70c7..5a610804cd06 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -477,7 +477,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
477 * processes. Try to unmap it here. 477 * processes. Try to unmap it here.
478 */ 478 */
479 if (page_mapped(page) && mapping) { 479 if (page_mapped(page) && mapping) {
480 switch (try_to_unmap(page)) { 480 /*
481 * No unmapping if we do not swap
482 */
483 if (!sc->may_swap)
484 goto keep_locked;
485
486 switch (try_to_unmap(page, 0)) {
481 case SWAP_FAIL: 487 case SWAP_FAIL:
482 goto activate_locked; 488 goto activate_locked;
483 case SWAP_AGAIN: 489 case SWAP_AGAIN:
@@ -492,7 +498,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
492 goto keep_locked; 498 goto keep_locked;
493 if (!may_enter_fs) 499 if (!may_enter_fs)
494 goto keep_locked; 500 goto keep_locked;
495 if (laptop_mode && !sc->may_writepage) 501 if (!sc->may_writepage)
496 goto keep_locked; 502 goto keep_locked;
497 503
498 /* Page is dirty, try to write it out here */ 504 /* Page is dirty, try to write it out here */
@@ -609,6 +615,15 @@ int putback_lru_pages(struct list_head *l)
609} 615}
610 616
611/* 617/*
618 * Non-migratable page
619 */
620int fail_migrate_page(struct page *newpage, struct page *page)
621{
622 return -EIO;
623}
624EXPORT_SYMBOL(fail_migrate_page);
625
626/*
612 * swapout a single page 627 * swapout a single page
613 * page is locked upon entry, unlocked on exit 628 * page is locked upon entry, unlocked on exit
614 */ 629 */
@@ -617,7 +632,7 @@ static int swap_page(struct page *page)
617 struct address_space *mapping = page_mapping(page); 632 struct address_space *mapping = page_mapping(page);
618 633
619 if (page_mapped(page) && mapping) 634 if (page_mapped(page) && mapping)
620 if (try_to_unmap(page) != SWAP_SUCCESS) 635 if (try_to_unmap(page, 0) != SWAP_SUCCESS)
621 goto unlock_retry; 636 goto unlock_retry;
622 637
623 if (PageDirty(page)) { 638 if (PageDirty(page)) {
@@ -653,6 +668,167 @@ unlock_retry:
653retry: 668retry:
654 return -EAGAIN; 669 return -EAGAIN;
655} 670}
671EXPORT_SYMBOL(swap_page);
672
673/*
674 * Page migration was first developed in the context of the memory hotplug
675 * project. The main authors of the migration code are:
676 *
677 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
678 * Hirokazu Takahashi <taka@valinux.co.jp>
679 * Dave Hansen <haveblue@us.ibm.com>
680 * Christoph Lameter <clameter@sgi.com>
681 */
682
683/*
684 * Remove references for a page and establish the new page with the correct
685 * basic settings to be able to stop accesses to the page.
686 */
687int migrate_page_remove_references(struct page *newpage,
688 struct page *page, int nr_refs)
689{
690 struct address_space *mapping = page_mapping(page);
691 struct page **radix_pointer;
692
693 /*
694 * Avoid doing any of the following work if the page count
695 * indicates that the page is in use or truncate has removed
696 * the page.
697 */
698 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
699 return 1;
700
701 /*
702 * Establish swap ptes for anonymous pages or destroy pte
703 * maps for files.
704 *
705 * In order to reestablish file backed mappings the fault handlers
706 * will take the radix tree_lock which may then be used to stop
707 * processes from accessing this page until the new page is ready.
708 *
709 * A process accessing via a swap pte (an anonymous page) will take a
710 * page_lock on the old page which will block the process until the
711 * migration attempt is complete. At that time the PageSwapCache bit
712 * will be examined. If the page was migrated then the PageSwapCache
713 * bit will be clear and the operation to retrieve the page will be
714 * retried which will find the new page in the radix tree. Then a new
715 * direct mapping may be generated based on the radix tree contents.
716 *
717 * If the page was not migrated then the PageSwapCache bit
718 * is still set and the operation may continue.
719 */
720 try_to_unmap(page, 1);
721
722 /*
723 * Give up if we were unable to remove all mappings.
724 */
725 if (page_mapcount(page))
726 return 1;
727
728 write_lock_irq(&mapping->tree_lock);
729
730 radix_pointer = (struct page **)radix_tree_lookup_slot(
731 &mapping->page_tree,
732 page_index(page));
733
734 if (!page_mapping(page) || page_count(page) != nr_refs ||
735 *radix_pointer != page) {
736 write_unlock_irq(&mapping->tree_lock);
737 return 1;
738 }
739
740 /*
741 * Now we know that no one else is looking at the page.
742 *
743 * Certain minimal information about a page must be available
744 * in order for other subsystems to properly handle the page if they
745 * find it through the radix tree update before we are finished
746 * copying the page.
747 */
748 get_page(newpage);
749 newpage->index = page->index;
750 newpage->mapping = page->mapping;
751 if (PageSwapCache(page)) {
752 SetPageSwapCache(newpage);
753 set_page_private(newpage, page_private(page));
754 }
755
756 *radix_pointer = newpage;
757 __put_page(page);
758 write_unlock_irq(&mapping->tree_lock);
759
760 return 0;
761}
762EXPORT_SYMBOL(migrate_page_remove_references);
763
764/*
765 * Copy the page to its new location
766 */
767void migrate_page_copy(struct page *newpage, struct page *page)
768{
769 copy_highpage(newpage, page);
770
771 if (PageError(page))
772 SetPageError(newpage);
773 if (PageReferenced(page))
774 SetPageReferenced(newpage);
775 if (PageUptodate(page))
776 SetPageUptodate(newpage);
777 if (PageActive(page))
778 SetPageActive(newpage);
779 if (PageChecked(page))
780 SetPageChecked(newpage);
781 if (PageMappedToDisk(page))
782 SetPageMappedToDisk(newpage);
783
784 if (PageDirty(page)) {
785 clear_page_dirty_for_io(page);
786 set_page_dirty(newpage);
787 }
788
789 ClearPageSwapCache(page);
790 ClearPageActive(page);
791 ClearPagePrivate(page);
792 set_page_private(page, 0);
793 page->mapping = NULL;
794
795 /*
796 * If any waiters have accumulated on the new page then
797 * wake them up.
798 */
799 if (PageWriteback(newpage))
800 end_page_writeback(newpage);
801}
802EXPORT_SYMBOL(migrate_page_copy);
803
804/*
805 * Common logic to directly migrate a single page suitable for
806 * pages that do not use PagePrivate.
807 *
808 * Pages are locked upon entry and exit.
809 */
810int migrate_page(struct page *newpage, struct page *page)
811{
812 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
813
814 if (migrate_page_remove_references(newpage, page, 2))
815 return -EAGAIN;
816
817 migrate_page_copy(newpage, page);
818
819 /*
820 * Remove auxiliary swap entries and replace
821 * them with real ptes.
822 *
823 * Note that a real pte entry will allow processes that are not
824 * waiting on the page lock to use the new page via the page tables
825 * before the new page is unlocked.
826 */
827 remove_from_swap(newpage);
828 return 0;
829}
830EXPORT_SYMBOL(migrate_page);
831
656/* 832/*
657 * migrate_pages 833 * migrate_pages
658 * 834 *
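migrate_page_remove_references() and migrate_page_copy() are exported so that an address space can compose its own ->migratepage. The following is a hedged sketch of such a method for a file-backed mapping without PagePrivate data, mirroring the generic migrate_page() above; example_migratepage and example_migrations are invented names, and remove_from_swap() is omitted on the assumption that a plain file page carries no auxiliary swap entries.

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/swap.h>	/* assumed to declare the migration helpers */
#include <asm/atomic.h>

static atomic_t example_migrations = ATOMIC_INIT(0);

static int example_migratepage(struct page *newpage, struct page *page)
{
	BUG_ON(PageWriteback(page));	/* migrate_pages() waits for writeback */

	/* nr_refs == 2: the isolated-LRU reference plus the radix tree slot */
	if (migrate_page_remove_references(newpage, page, 2))
		return -EAGAIN;

	migrate_page_copy(newpage, page);
	atomic_inc(&example_migrations);	/* hypothetical private statistic */
	return 0;
}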
@@ -666,11 +842,6 @@ retry:
666 * are movable anymore because to has become empty 842 * are movable anymore because to has become empty
667 * or no retryable pages exist anymore. 843 * or no retryable pages exist anymore.
668 * 844 *
669 * SIMPLIFIED VERSION: This implementation of migrate_pages
670 * is only swapping out pages and never touches the second
671 * list. The direct migration patchset
672 * extends this function to avoid the use of swap.
673 *
674 * Return: Number of pages not migrated when "to" ran empty. 845 * Return: Number of pages not migrated when "to" ran empty.
675 */ 846 */
676int migrate_pages(struct list_head *from, struct list_head *to, 847int migrate_pages(struct list_head *from, struct list_head *to,
@@ -691,6 +862,9 @@ redo:
691 retry = 0; 862 retry = 0;
692 863
693 list_for_each_entry_safe(page, page2, from, lru) { 864 list_for_each_entry_safe(page, page2, from, lru) {
865 struct page *newpage = NULL;
866 struct address_space *mapping;
867
694 cond_resched(); 868 cond_resched();
695 869
696 rc = 0; 870 rc = 0;
@@ -698,6 +872,9 @@ redo:
698 /* page was freed from under us. So we are done. */ 872 /* page was freed from under us. So we are done. */
699 goto next; 873 goto next;
700 874
875 if (to && list_empty(to))
876 break;
877
701 /* 878 /*
702 * Skip locked pages during the first two passes to give the 879 * Skip locked pages during the first two passes to give the
703 * functions holding the lock time to release the page. Later we 880 * functions holding the lock time to release the page. Later we
@@ -734,12 +911,69 @@ redo:
734 } 911 }
735 } 912 }
736 913
914 if (!to) {
915 rc = swap_page(page);
916 goto next;
917 }
918
919 newpage = lru_to_page(to);
920 lock_page(newpage);
921
737 /* 922 /*
738 * Page is properly locked and writeback is complete. 923 * Pages are properly locked and writeback is complete.
739 * Try to migrate the page. 924 * Try to migrate the page.
740 */ 925 */
741 rc = swap_page(page); 926 mapping = page_mapping(page);
742 goto next; 927 if (!mapping)
928 goto unlock_both;
929
930 if (mapping->a_ops->migratepage) {
931 rc = mapping->a_ops->migratepage(newpage, page);
932 goto unlock_both;
933 }
934
935 /*
936 * Trigger writeout if page is dirty
937 */
938 if (PageDirty(page)) {
939 switch (pageout(page, mapping)) {
940 case PAGE_KEEP:
941 case PAGE_ACTIVATE:
942 goto unlock_both;
943
944 case PAGE_SUCCESS:
945 unlock_page(newpage);
946 goto next;
947
948 case PAGE_CLEAN:
949 ; /* try to migrate the page below */
950 }
951 }
952 /*
953 * If we have no buffer or can release the buffer
954 * then do a simple migration.
955 */
956 if (!page_has_buffers(page) ||
957 try_to_release_page(page, GFP_KERNEL)) {
958 rc = migrate_page(newpage, page);
959 goto unlock_both;
960 }
961
962 /*
963 * On early passes with mapped pages simply
964 * retry. There may be a lock held for some
965 * buffers that may go away. Later
966 * swap them out.
967 */
968 if (pass > 4) {
969 unlock_page(newpage);
970 newpage = NULL;
971 rc = swap_page(page);
972 goto next;
973 }
974
975unlock_both:
976 unlock_page(newpage);
743 977
744unlock_page: 978unlock_page:
745 unlock_page(page); 979 unlock_page(page);
@@ -752,7 +986,10 @@ next:
752 list_move(&page->lru, failed); 986 list_move(&page->lru, failed);
753 nr_failed++; 987 nr_failed++;
754 } else { 988 } else {
755 /* Success */ 989 if (newpage) {
990 /* Successful migration. Return page to LRU */
991 move_to_lru(newpage);
992 }
756 list_move(&page->lru, moved); 993 list_move(&page->lru, moved);
757 } 994 }
758 } 995 }
@@ -1170,7 +1407,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1170 int i; 1407 int i;
1171 1408
1172 sc.gfp_mask = gfp_mask; 1409 sc.gfp_mask = gfp_mask;
1173 sc.may_writepage = 0; 1410 sc.may_writepage = !laptop_mode;
1174 sc.may_swap = 1; 1411 sc.may_swap = 1;
1175 1412
1176 inc_page_state(allocstall); 1413 inc_page_state(allocstall);
@@ -1273,7 +1510,7 @@ loop_again:
1273 total_scanned = 0; 1510 total_scanned = 0;
1274 total_reclaimed = 0; 1511 total_reclaimed = 0;
1275 sc.gfp_mask = GFP_KERNEL; 1512 sc.gfp_mask = GFP_KERNEL;
1276 sc.may_writepage = 0; 1513 sc.may_writepage = !laptop_mode;
1277 sc.may_swap = 1; 1514 sc.may_swap = 1;
1278 sc.nr_mapped = read_page_state(nr_mapped); 1515 sc.nr_mapped = read_page_state(nr_mapped);
1279 1516
@@ -1586,40 +1823,61 @@ module_init(kswapd_init)
1586 */ 1823 */
1587int zone_reclaim_mode __read_mostly; 1824int zone_reclaim_mode __read_mostly;
1588 1825
1826#define RECLAIM_OFF 0
1827#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1828#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1829#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1830#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1831
1589/* 1832/*
1590 * Minimum time between zone reclaim scans 1833 * Minimum time between zone reclaim scans
1591 */ 1834 */
1592#define ZONE_RECLAIM_INTERVAL HZ/2 1835int zone_reclaim_interval __read_mostly = 30*HZ;
1836
1837/*
1838 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1839 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1840 * a zone.
1841 */
1842#define ZONE_RECLAIM_PRIORITY 4
1843
1593/* 1844/*
1594 * Try to free up some pages from this zone through reclaim. 1845 * Try to free up some pages from this zone through reclaim.
1595 */ 1846 */
1596int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1847int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1597{ 1848{
1598 int nr_pages = 1 << order; 1849 int nr_pages;
1599 struct task_struct *p = current; 1850 struct task_struct *p = current;
1600 struct reclaim_state reclaim_state; 1851 struct reclaim_state reclaim_state;
1601 struct scan_control sc = { 1852 struct scan_control sc;
1602 .gfp_mask = gfp_mask, 1853 cpumask_t mask;
1603 .may_writepage = 0, 1854 int node_id;
1604 .may_swap = 0, 1855
1605 .nr_mapped = read_page_state(nr_mapped), 1856 if (time_before(jiffies,
1606 .nr_scanned = 0, 1857 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1607 .nr_reclaimed = 0, 1858 return 0;
1608 .priority = 0
1609 };
1610 1859
1611 if (!(gfp_mask & __GFP_WAIT) || 1860 if (!(gfp_mask & __GFP_WAIT) ||
1612 zone->zone_pgdat->node_id != numa_node_id() ||
1613 zone->all_unreclaimable || 1861 zone->all_unreclaimable ||
1614 atomic_read(&zone->reclaim_in_progress) > 0) 1862 atomic_read(&zone->reclaim_in_progress) > 0)
1615 return 0; 1863 return 0;
1616 1864
1617 if (time_before(jiffies, 1865 node_id = zone->zone_pgdat->node_id;
1618 zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) 1866 mask = node_to_cpumask(node_id);
1619 return 0; 1867 if (!cpus_empty(mask) && node_id != numa_node_id())
1868 return 0;
1869
1870 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1871 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1872 sc.nr_scanned = 0;
1873 sc.nr_reclaimed = 0;
1874 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1875 sc.nr_mapped = read_page_state(nr_mapped);
1876 sc.gfp_mask = gfp_mask;
1620 1877
1621 disable_swap_token(); 1878 disable_swap_token();
1622 1879
1880 nr_pages = 1 << order;
1623 if (nr_pages > SWAP_CLUSTER_MAX) 1881 if (nr_pages > SWAP_CLUSTER_MAX)
1624 sc.swap_cluster_max = nr_pages; 1882 sc.swap_cluster_max = nr_pages;
1625 else 1883 else
@@ -1629,14 +1887,37 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1629 p->flags |= PF_MEMALLOC; 1887 p->flags |= PF_MEMALLOC;
1630 reclaim_state.reclaimed_slab = 0; 1888 reclaim_state.reclaimed_slab = 0;
1631 p->reclaim_state = &reclaim_state; 1889 p->reclaim_state = &reclaim_state;
1632 shrink_zone(zone, &sc); 1890
1891 /*
1892 * Free memory by calling shrink zone with increasing priorities
1893 * until we have enough memory freed.
1894 */
1895 do {
1896 sc.priority--;
1897 shrink_zone(zone, &sc);
1898
1899 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
1900
1901 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1902 /*
1903 * shrink_slab does not currently allow us to determine
1904 * how many pages were freed in the zone. So we just
1905 * shake the slab and then go offnode for a single allocation.
1906 *
1907 * shrink_slab will free memory on all zones and may take
1908 * a long time.
1909 */
1910 shrink_slab(sc.nr_scanned, gfp_mask, order);
1911 sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */
1912 }
1913
1633 p->reclaim_state = NULL; 1914 p->reclaim_state = NULL;
1634 current->flags &= ~PF_MEMALLOC; 1915 current->flags &= ~PF_MEMALLOC;
1635 1916
1636 if (sc.nr_reclaimed == 0) 1917 if (sc.nr_reclaimed == 0)
1637 zone->last_unsuccessful_zone_reclaim = jiffies; 1918 zone->last_unsuccessful_zone_reclaim = jiffies;
1638 1919
1639 return sc.nr_reclaimed > nr_pages; 1920 return sc.nr_reclaimed >= nr_pages;
1640} 1921}
1641#endif 1922#endif
1642 1923
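Taken together, the zone_reclaim() rewrite replaces the single shrink_zone() call with a back-off plus escalation scheme: bail out while the last failure is younger than zone_reclaim_interval, derive may_writepage/may_swap from the RECLAIM_* bits, walk the priority down from ZONE_RECLAIM_PRIORITY until nr_pages are freed, and optionally shake the slab as a last resort. The toy model below, in plain C with invented names (reclaim_model, model_shrink_zone), only restates the back-off check and the decreasing-priority loop to make the control flow explicit; it is not kernel code.

#include <stdbool.h>

struct reclaim_model {
	unsigned long now, last_failure, interval;	/* back-off state */
	int priority;					/* scan granularity, lower = more pages */
	long reclaimed, wanted;				/* page counts */
};

/* Stand-in for shrink_zone(): a lower priority scans (and frees) more. */
static long model_shrink_zone(struct reclaim_model *m)
{
	return m->wanted / (m->priority + 1);
}

static bool model_zone_reclaim(struct reclaim_model *m)
{
	if (m->now - m->last_failure < m->interval)
		return false;			/* still backing off after a failure */

	do {
		m->priority--;			/* escalate: scan a larger fraction */
		m->reclaimed += model_shrink_zone(m);
	} while (m->reclaimed < m->wanted && m->priority > 0);

	if (m->reclaimed == 0)
		m->last_failure = m->now;	/* arm the back-off timer */

	return m->reclaimed >= m->wanted;
}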