aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/hugetlb.c9
-rw-r--r--mm/memory.c7
-rw-r--r--mm/mempolicy.c169
-rw-r--r--mm/oom_kill.c1
-rw-r--r--mm/page_alloc.c16
-rw-r--r--mm/rmap.c51
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c813
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c16
-rw-r--r--mm/vmscan.c343
11 files changed, 1028 insertions, 406 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b21d78c941b5..ceb3ebb3c399 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -444,6 +444,15 @@ retry:
444 page = alloc_huge_page(vma, address); 444 page = alloc_huge_page(vma, address);
445 if (!page) { 445 if (!page) {
446 hugetlb_put_quota(mapping); 446 hugetlb_put_quota(mapping);
447 /*
448 * No huge pages available. So this is an OOM
449 * condition but we do not want to trigger the OOM
450 * killer, so we return VM_FAULT_SIGBUS.
451 *
452 * A program using hugepages may fault with Bus Error
453 * because no huge pages are available in the cpuset, per
454 * memory policy or because all are in use!
455 */
447 goto out; 456 goto out;
448 } 457 }
449 458
diff --git a/mm/memory.c b/mm/memory.c
index 7a11ddd5060f..2bee1f21aa8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1871,6 +1871,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1871 goto out; 1871 goto out;
1872 1872
1873 entry = pte_to_swp_entry(orig_pte); 1873 entry = pte_to_swp_entry(orig_pte);
1874again:
1874 page = lookup_swap_cache(entry); 1875 page = lookup_swap_cache(entry);
1875 if (!page) { 1876 if (!page) {
1876 swapin_readahead(entry, address, vma); 1877 swapin_readahead(entry, address, vma);
@@ -1894,6 +1895,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1894 1895
1895 mark_page_accessed(page); 1896 mark_page_accessed(page);
1896 lock_page(page); 1897 lock_page(page);
1898 if (!PageSwapCache(page)) {
1899 /* Page migration has occurred */
1900 unlock_page(page);
1901 page_cache_release(page);
1902 goto again;
1903 }
1897 1904
1898 /* 1905 /*
1899 * Back out if somebody else already faulted in this pte. 1906 * Back out if somebody else already faulted in this pte.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73790188b0eb..3bd7fb7e4b75 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 97
98/* The number of pages to migrate per call to migrate_pages() */
99#define MIGRATE_CHUNK_SIZE 256
100
98static kmem_cache_t *policy_cache; 101static kmem_cache_t *policy_cache;
99static kmem_cache_t *sn_cache; 102static kmem_cache_t *sn_cache;
100 103
@@ -543,24 +546,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
543 } 546 }
544} 547}
545 548
546static int swap_pages(struct list_head *pagelist) 549/*
550 * Migrate the list 'pagelist' of pages to a certain destination.
551 *
552 * Specify destination with either non-NULL vma or dest >= 0
553 * Return the number of pages not migrated or error code
554 */
555static int migrate_pages_to(struct list_head *pagelist,
556 struct vm_area_struct *vma, int dest)
547{ 557{
558 LIST_HEAD(newlist);
548 LIST_HEAD(moved); 559 LIST_HEAD(moved);
549 LIST_HEAD(failed); 560 LIST_HEAD(failed);
550 int n; 561 int err = 0;
562 int nr_pages;
563 struct page *page;
564 struct list_head *p;
551 565
552 n = migrate_pages(pagelist, NULL, &moved, &failed); 566redo:
553 putback_lru_pages(&failed); 567 nr_pages = 0;
554 putback_lru_pages(&moved); 568 list_for_each(p, pagelist) {
569 if (vma)
570 page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
571 else
572 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
555 573
556 return n; 574 if (!page) {
575 err = -ENOMEM;
576 goto out;
577 }
578 list_add(&page->lru, &newlist);
579 nr_pages++;
580 if (nr_pages > MIGRATE_CHUNK_SIZE) /* chunk is full: stop allocating and migrate this batch */
581 break;
582 }
583 err = migrate_pages(pagelist, &newlist, &moved, &failed);
584
585 putback_lru_pages(&moved); /* Call release pages instead ?? */
586
587 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
588 goto redo;
589out:
590 /* Return leftover allocated pages */
591 while (!list_empty(&newlist)) {
592 page = list_entry(newlist.next, struct page, lru);
593 list_del(&page->lru);
594 __free_page(page);
595 }
596 list_splice(&failed, pagelist);
597 if (err < 0)
598 return err;
599
600 /* Calculate number of leftover pages */
601 nr_pages = 0;
602 list_for_each(p, pagelist)
603 nr_pages++;
604 return nr_pages;
557} 605}
558 606
559/* 607/*
560 * For now migrate_pages simply swaps out the pages from nodes that are in 608 * Migrate pages from one node to a target node.
561 * the source set but not in the target set. In the future, we would 609 * Returns error or the number of pages not migrated.
562 * want a function that moves pages between the two nodesets in such 610 */
563 * a way as to preserve the physical layout as much as possible. 611int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
612{
613 nodemask_t nmask;
614 LIST_HEAD(pagelist);
615 int err = 0;
616
617 nodes_clear(nmask);
618 node_set(source, nmask);
619
620 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
621 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
622
623 if (!list_empty(&pagelist)) {
624 err = migrate_pages_to(&pagelist, NULL, dest);
625 if (!list_empty(&pagelist))
626 putback_lru_pages(&pagelist);
627 }
628 return err;
629}
630
631/*
632 * Move pages between the two nodesets so as to preserve the physical
633 * layout as much as possible.
564 * 634 *
565 * Returns the number of page that could not be moved. 635 * Returns the number of page that could not be moved.
566 */ 636 */
@@ -568,22 +638,76 @@ int do_migrate_pages(struct mm_struct *mm,
568 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 638 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
569{ 639{
570 LIST_HEAD(pagelist); 640 LIST_HEAD(pagelist);
571 int count = 0; 641 int busy = 0;
572 nodemask_t nodes; 642 int err = 0;
643 nodemask_t tmp;
573 644
574 nodes_andnot(nodes, *from_nodes, *to_nodes); 645 down_read(&mm->mmap_sem);
575 646
576 down_read(&mm->mmap_sem); 647/*
577 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, 648 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
578 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 649 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
650 * bit in 'tmp', and return that <source, dest> pair for migration.
651 * The pair of nodemasks 'to' and 'from' define the map.
652 *
653 * If no pair of bits is found that way, fallback to picking some
654 * pair of 'source' and 'dest' bits that are not the same. If the
655 * 'source' and 'dest' bits are the same, this represents a node
656 * that will be migrating to itself, so no pages need move.
657 *
658 * If no bits are left in 'tmp', or if all remaining bits left
659 * in 'tmp' correspond to the same bit in 'to', return false
660 * (nothing left to migrate).
661 *
662 * This lets us pick a pair of nodes to migrate between, such that
663 * if possible the dest node is not already occupied by some other
664 * source node, minimizing the risk of overloading the memory on a
665 * node that would happen if we migrated incoming memory to a node
666 * before migrating outgoing memory from that same node.
667 *
668 * A single scan of tmp is sufficient. As we go, we remember the
669 * most recent <s, d> pair that moved (s != d). If we find a pair
670 * that not only moved, but what's better, moved to an empty slot
671 * (d is not set in tmp), then we break out then, with that pair.
672 * Otherwise when we finish scannng from_tmp, we at least have the
673 * most recent <s, d> pair that moved. If we get all the way through
674 * the scan of tmp without finding any node that moved, much less
675 * moved to an empty node, then there is nothing left worth migrating.
676 */
579 677
580 if (!list_empty(&pagelist)) { 678 tmp = *from_nodes;
581 count = swap_pages(&pagelist); 679 while (!nodes_empty(tmp)) {
582 putback_lru_pages(&pagelist); 680 int s,d;
681 int source = -1;
682 int dest = 0;
683
684 for_each_node_mask(s, tmp) {
685 d = node_remap(s, *from_nodes, *to_nodes);
686 if (s == d)
687 continue;
688
689 source = s; /* Node moved. Memorize */
690 dest = d;
691
692 /* dest not in remaining from nodes? */
693 if (!node_isset(dest, tmp))
694 break;
695 }
696 if (source == -1)
697 break;
698
699 node_clear(source, tmp);
700 err = migrate_to_node(mm, source, dest, flags);
701 if (err > 0)
702 busy += err;
703 if (err < 0)
704 break;
583 } 705 }
584 706
585 up_read(&mm->mmap_sem); 707 up_read(&mm->mmap_sem);
586 return count; 708 if (err < 0)
709 return err;
710 return busy;
587} 711}
588 712
589long do_mbind(unsigned long start, unsigned long len, 713long do_mbind(unsigned long start, unsigned long len,
@@ -643,8 +767,9 @@ long do_mbind(unsigned long start, unsigned long len,
643 int nr_failed = 0; 767 int nr_failed = 0;
644 768
645 err = mbind_range(vma, start, end, new); 769 err = mbind_range(vma, start, end, new);
770
646 if (!list_empty(&pagelist)) 771 if (!list_empty(&pagelist))
647 nr_failed = swap_pages(&pagelist); 772 nr_failed = migrate_pages_to(&pagelist, vma, -1);
648 773
649 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 774 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
650 err = -EIO; 775 err = -EIO;
@@ -1034,6 +1159,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1034 return interleave_nodes(pol); 1159 return interleave_nodes(pol);
1035} 1160}
1036 1161
1162#ifdef CONFIG_HUGETLBFS
1037/* Return a zonelist suitable for a huge page allocation. */ 1163/* Return a zonelist suitable for a huge page allocation. */
1038struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) 1164struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1039{ 1165{
@@ -1047,6 +1173,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1047 } 1173 }
1048 return zonelist_policy(GFP_HIGHUSER, pol); 1174 return zonelist_policy(GFP_HIGHUSER, pol);
1049} 1175}
1176#endif
1050 1177
1051/* Allocate a page in interleaved policy. 1178/* Allocate a page in interleaved policy.
1052 Own path because it needs to do special accounting. */ 1179 Own path because it needs to do special accounting. */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 14bd4ec79597..b05ab8f2a562 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -271,6 +271,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
271 if (printk_ratelimit()) { 271 if (printk_ratelimit()) {
272 printk("oom-killer: gfp_mask=0x%x, order=%d\n", 272 printk("oom-killer: gfp_mask=0x%x, order=%d\n",
273 gfp_mask, order); 273 gfp_mask, order);
274 dump_stack();
274 show_mem(); 275 show_mem();
275 } 276 }
276 277
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df54e2fc8ee0..dde04ff4be31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1213{ 1213{
1214 int cpu = 0; 1214 int cpu = 0;
1215 1215
1216 memset(ret, 0, sizeof(*ret)); 1216 memset(ret, 0, nr * sizeof(unsigned long));
1217 cpus_and(*cpumask, *cpumask, cpu_online_map); 1217 cpus_and(*cpumask, *cpumask, cpu_online_map);
1218 1218
1219 cpu = first_cpu(*cpumask); 1219 cpu = first_cpu(*cpumask);
1220 while (cpu < NR_CPUS) { 1220 while (cpu < NR_CPUS) {
1221 unsigned long *in, *out, off; 1221 unsigned long *in, *out, off;
1222 1222
1223 if (!cpu_isset(cpu, *cpumask))
1224 continue;
1225
1223 in = (unsigned long *)&per_cpu(page_states, cpu); 1226 in = (unsigned long *)&per_cpu(page_states, cpu);
1224 1227
1225 cpu = next_cpu(cpu, *cpumask); 1228 cpu = next_cpu(cpu, *cpumask);
1226 1229
1227 if (cpu < NR_CPUS) 1230 if (likely(cpu < NR_CPUS))
1228 prefetch(&per_cpu(page_states, cpu)); 1231 prefetch(&per_cpu(page_states, cpu));
1229 1232
1230 out = (unsigned long *)ret; 1233 out = (unsigned long *)ret;
@@ -1799,7 +1802,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1799 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1802 memmap_init_zone((size), (nid), (zone), (start_pfn))
1800#endif 1803#endif
1801 1804
1802static int __meminit zone_batchsize(struct zone *zone) 1805static int __cpuinit zone_batchsize(struct zone *zone)
1803{ 1806{
1804 int batch; 1807 int batch;
1805 1808
@@ -1886,14 +1889,13 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1886 * not check if the processor is online before following the pageset pointer. 1889 * not check if the processor is online before following the pageset pointer.
1887 * Other parts of the kernel may not check if the zone is available. 1890 * Other parts of the kernel may not check if the zone is available.
1888 */ 1891 */
1889static struct per_cpu_pageset 1892static struct per_cpu_pageset boot_pageset[NR_CPUS];
1890 boot_pageset[NR_CPUS];
1891 1893
1892/* 1894/*
1893 * Dynamically allocate memory for the 1895 * Dynamically allocate memory for the
1894 * per cpu pageset array in struct zone. 1896 * per cpu pageset array in struct zone.
1895 */ 1897 */
1896static int __meminit process_zones(int cpu) 1898static int __cpuinit process_zones(int cpu)
1897{ 1899{
1898 struct zone *zone, *dzone; 1900 struct zone *zone, *dzone;
1899 1901
@@ -1934,7 +1936,7 @@ static inline void free_zone_pagesets(int cpu)
1934 } 1936 }
1935} 1937}
1936 1938
1937static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, 1939static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1938 unsigned long action, 1940 unsigned long action,
1939 void *hcpu) 1941 void *hcpu)
1940{ 1942{
diff --git a/mm/rmap.c b/mm/rmap.c
index d85a99d28c03..df2c41c2a9a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -52,6 +52,7 @@
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/rmap.h> 53#include <linux/rmap.h>
54#include <linux/rcupdate.h> 54#include <linux/rcupdate.h>
55#include <linux/module.h>
55 56
56#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
57 58
@@ -205,6 +206,36 @@ out:
205 return anon_vma; 206 return anon_vma;
206} 207}
207 208
209#ifdef CONFIG_MIGRATION
210/*
211 * Remove an anonymous page from swap replacing the swap pte's
212 * through real pte's pointing to valid pages and then releasing
213 * the page from the swap cache.
214 *
215 * Must hold page lock on page.
216 */
217void remove_from_swap(struct page *page)
218{
219 struct anon_vma *anon_vma;
220 struct vm_area_struct *vma;
221
222 if (!PageAnon(page) || !PageSwapCache(page))
223 return;
224
225 anon_vma = page_lock_anon_vma(page);
226 if (!anon_vma)
227 return;
228
229 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
230 remove_vma_swap(vma, page);
231
232 spin_unlock(&anon_vma->lock);
233
234 delete_from_swap_cache(page);
235}
236EXPORT_SYMBOL(remove_from_swap);
237#endif
238
208/* 239/*
209 * At what user virtual address is page expected in vma? 240 * At what user virtual address is page expected in vma?
210 */ 241 */
@@ -541,7 +572,8 @@ void page_remove_rmap(struct page *page)
541 * Subfunctions of try_to_unmap: try_to_unmap_one called 572 * Subfunctions of try_to_unmap: try_to_unmap_one called
542 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 573 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
543 */ 574 */
544static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) 575static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
576 int ignore_refs)
545{ 577{
546 struct mm_struct *mm = vma->vm_mm; 578 struct mm_struct *mm = vma->vm_mm;
547 unsigned long address; 579 unsigned long address;
@@ -564,7 +596,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
564 * skipped over this mm) then we should reactivate it. 596 * skipped over this mm) then we should reactivate it.
565 */ 597 */
566 if ((vma->vm_flags & VM_LOCKED) || 598 if ((vma->vm_flags & VM_LOCKED) ||
567 ptep_clear_flush_young(vma, address, pte)) { 599 (ptep_clear_flush_young(vma, address, pte)
600 && !ignore_refs)) {
568 ret = SWAP_FAIL; 601 ret = SWAP_FAIL;
569 goto out_unmap; 602 goto out_unmap;
570 } 603 }
@@ -698,7 +731,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
698 pte_unmap_unlock(pte - 1, ptl); 731 pte_unmap_unlock(pte - 1, ptl);
699} 732}
700 733
701static int try_to_unmap_anon(struct page *page) 734static int try_to_unmap_anon(struct page *page, int ignore_refs)
702{ 735{
703 struct anon_vma *anon_vma; 736 struct anon_vma *anon_vma;
704 struct vm_area_struct *vma; 737 struct vm_area_struct *vma;
@@ -709,7 +742,7 @@ static int try_to_unmap_anon(struct page *page)
709 return ret; 742 return ret;
710 743
711 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 744 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
712 ret = try_to_unmap_one(page, vma); 745 ret = try_to_unmap_one(page, vma, ignore_refs);
713 if (ret == SWAP_FAIL || !page_mapped(page)) 746 if (ret == SWAP_FAIL || !page_mapped(page))
714 break; 747 break;
715 } 748 }
@@ -726,7 +759,7 @@ static int try_to_unmap_anon(struct page *page)
726 * 759 *
727 * This function is only called from try_to_unmap for object-based pages. 760 * This function is only called from try_to_unmap for object-based pages.
728 */ 761 */
729static int try_to_unmap_file(struct page *page) 762static int try_to_unmap_file(struct page *page, int ignore_refs)
730{ 763{
731 struct address_space *mapping = page->mapping; 764 struct address_space *mapping = page->mapping;
732 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 765 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -740,7 +773,7 @@ static int try_to_unmap_file(struct page *page)
740 773
741 spin_lock(&mapping->i_mmap_lock); 774 spin_lock(&mapping->i_mmap_lock);
742 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 775 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
743 ret = try_to_unmap_one(page, vma); 776 ret = try_to_unmap_one(page, vma, ignore_refs);
744 if (ret == SWAP_FAIL || !page_mapped(page)) 777 if (ret == SWAP_FAIL || !page_mapped(page))
745 goto out; 778 goto out;
746 } 779 }
@@ -825,16 +858,16 @@ out:
825 * SWAP_AGAIN - we missed a mapping, try again later 858 * SWAP_AGAIN - we missed a mapping, try again later
826 * SWAP_FAIL - the page is unswappable 859 * SWAP_FAIL - the page is unswappable
827 */ 860 */
828int try_to_unmap(struct page *page) 861int try_to_unmap(struct page *page, int ignore_refs)
829{ 862{
830 int ret; 863 int ret;
831 864
832 BUG_ON(!PageLocked(page)); 865 BUG_ON(!PageLocked(page));
833 866
834 if (PageAnon(page)) 867 if (PageAnon(page))
835 ret = try_to_unmap_anon(page); 868 ret = try_to_unmap_anon(page, ignore_refs);
836 else 869 else
837 ret = try_to_unmap_file(page); 870 ret = try_to_unmap_file(page, ignore_refs);
838 871
839 if (!page_mapped(page)) 872 if (!page_mapped(page))
840 ret = SWAP_SUCCESS; 873 ret = SWAP_SUCCESS;
diff --git a/mm/shmem.c b/mm/shmem.c
index ce501bce1c2e..f7ac7b812f92 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1028,6 +1028,14 @@ repeat:
1028 page_cache_release(swappage); 1028 page_cache_release(swappage);
1029 goto repeat; 1029 goto repeat;
1030 } 1030 }
1031 if (!PageSwapCache(swappage)) {
1032 /* Page migration has occurred */
1033 shmem_swp_unmap(entry);
1034 spin_unlock(&info->lock);
1035 unlock_page(swappage);
1036 page_cache_release(swappage);
1037 goto repeat;
1038 }
1031 if (PageWriteback(swappage)) { 1039 if (PageWriteback(swappage)) {
1032 shmem_swp_unmap(entry); 1040 shmem_swp_unmap(entry);
1033 spin_unlock(&info->lock); 1041 spin_unlock(&info->lock);
diff --git a/mm/slab.c b/mm/slab.c
index 6f8495e2185b..d66c2b0d9715 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -55,7 +55,7 @@
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
57 * constructors and destructors are called without any locking. 57 * constructors and destructors are called without any locking.
58 * Several members in kmem_cache_t and struct slab never change, they 58 * Several members in struct kmem_cache and struct slab never change, they
59 * are accessed without any locking. 59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe. 61 * and local interrupts are disabled so slab code is preempt-safe.
@@ -244,7 +244,7 @@ struct slab {
244 */ 244 */
245struct slab_rcu { 245struct slab_rcu {
246 struct rcu_head head; 246 struct rcu_head head;
247 kmem_cache_t *cachep; 247 struct kmem_cache *cachep;
248 void *addr; 248 void *addr;
249}; 249};
250 250
@@ -294,6 +294,7 @@ struct kmem_list3 {
294 unsigned long next_reap; 294 unsigned long next_reap;
295 int free_touched; 295 int free_touched;
296 unsigned int free_limit; 296 unsigned int free_limit;
297 unsigned int colour_next; /* Per-node cache coloring */
297 spinlock_t list_lock; 298 spinlock_t list_lock;
298 struct array_cache *shared; /* shared per node */ 299 struct array_cache *shared; /* shared per node */
299 struct array_cache **alien; /* on other nodes */ 300 struct array_cache **alien; /* on other nodes */
@@ -316,6 +317,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
316 */ 317 */
317static __always_inline int index_of(const size_t size) 318static __always_inline int index_of(const size_t size)
318{ 319{
320 extern void __bad_size(void);
321
319 if (__builtin_constant_p(size)) { 322 if (__builtin_constant_p(size)) {
320 int i = 0; 323 int i = 0;
321 324
@@ -326,25 +329,23 @@ static __always_inline int index_of(const size_t size)
326 i++; 329 i++;
327#include "linux/kmalloc_sizes.h" 330#include "linux/kmalloc_sizes.h"
328#undef CACHE 331#undef CACHE
329 { 332 __bad_size();
330 extern void __bad_size(void);
331 __bad_size();
332 }
333 } else 333 } else
334 BUG(); 334 __bad_size();
335 return 0; 335 return 0;
336} 336}
337 337
338#define INDEX_AC index_of(sizeof(struct arraycache_init)) 338#define INDEX_AC index_of(sizeof(struct arraycache_init))
339#define INDEX_L3 index_of(sizeof(struct kmem_list3)) 339#define INDEX_L3 index_of(sizeof(struct kmem_list3))
340 340
341static inline void kmem_list3_init(struct kmem_list3 *parent) 341static void kmem_list3_init(struct kmem_list3 *parent)
342{ 342{
343 INIT_LIST_HEAD(&parent->slabs_full); 343 INIT_LIST_HEAD(&parent->slabs_full);
344 INIT_LIST_HEAD(&parent->slabs_partial); 344 INIT_LIST_HEAD(&parent->slabs_partial);
345 INIT_LIST_HEAD(&parent->slabs_free); 345 INIT_LIST_HEAD(&parent->slabs_free);
346 parent->shared = NULL; 346 parent->shared = NULL;
347 parent->alien = NULL; 347 parent->alien = NULL;
348 parent->colour_next = 0;
348 spin_lock_init(&parent->list_lock); 349 spin_lock_init(&parent->list_lock);
349 parent->free_objects = 0; 350 parent->free_objects = 0;
350 parent->free_touched = 0; 351 parent->free_touched = 0;
@@ -364,7 +365,7 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
364 } while (0) 365 } while (0)
365 366
366/* 367/*
367 * kmem_cache_t 368 * struct kmem_cache
368 * 369 *
369 * manages a cache. 370 * manages a cache.
370 */ 371 */
@@ -375,7 +376,7 @@ struct kmem_cache {
375 unsigned int batchcount; 376 unsigned int batchcount;
376 unsigned int limit; 377 unsigned int limit;
377 unsigned int shared; 378 unsigned int shared;
378 unsigned int objsize; 379 unsigned int buffer_size;
379/* 2) touched by every alloc & free from the backend */ 380/* 2) touched by every alloc & free from the backend */
380 struct kmem_list3 *nodelists[MAX_NUMNODES]; 381 struct kmem_list3 *nodelists[MAX_NUMNODES];
381 unsigned int flags; /* constant flags */ 382 unsigned int flags; /* constant flags */
@@ -391,16 +392,15 @@ struct kmem_cache {
391 392
392 size_t colour; /* cache colouring range */ 393 size_t colour; /* cache colouring range */
393 unsigned int colour_off; /* colour offset */ 394 unsigned int colour_off; /* colour offset */
394 unsigned int colour_next; /* cache colouring */ 395 struct kmem_cache *slabp_cache;
395 kmem_cache_t *slabp_cache;
396 unsigned int slab_size; 396 unsigned int slab_size;
397 unsigned int dflags; /* dynamic flags */ 397 unsigned int dflags; /* dynamic flags */
398 398
399 /* constructor func */ 399 /* constructor func */
400 void (*ctor) (void *, kmem_cache_t *, unsigned long); 400 void (*ctor) (void *, struct kmem_cache *, unsigned long);
401 401
402 /* de-constructor func */ 402 /* de-constructor func */
403 void (*dtor) (void *, kmem_cache_t *, unsigned long); 403 void (*dtor) (void *, struct kmem_cache *, unsigned long);
404 404
405/* 4) cache creation/removal */ 405/* 4) cache creation/removal */
406 const char *name; 406 const char *name;
@@ -423,8 +423,14 @@ struct kmem_cache {
423 atomic_t freemiss; 423 atomic_t freemiss;
424#endif 424#endif
425#if DEBUG 425#if DEBUG
426 int dbghead; 426 /*
427 int reallen; 427 * If debugging is enabled, then the allocator can add additional
428 * fields and/or padding to every object. buffer_size contains the total
429 * object size including these internal fields, the following two
430 * variables contain the offset to the user object and its size.
431 */
432 int obj_offset;
433 int obj_size;
428#endif 434#endif
429}; 435};
430 436
@@ -495,50 +501,50 @@ struct kmem_cache {
495 501
496/* memory layout of objects: 502/* memory layout of objects:
497 * 0 : objp 503 * 0 : objp
498 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that 504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
499 * the end of an object is aligned with the end of the real 505 * the end of an object is aligned with the end of the real
500 * allocation. Catches writes behind the end of the allocation. 506 * allocation. Catches writes behind the end of the allocation.
501 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: 507 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
502 * redzone word. 508 * redzone word.
503 * cachep->dbghead: The real object. 509 * cachep->obj_offset: The real object.
504 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
505 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
506 */ 512 */
507static int obj_dbghead(kmem_cache_t *cachep) 513static int obj_offset(struct kmem_cache *cachep)
508{ 514{
509 return cachep->dbghead; 515 return cachep->obj_offset;
510} 516}
511 517
512static int obj_reallen(kmem_cache_t *cachep) 518static int obj_size(struct kmem_cache *cachep)
513{ 519{
514 return cachep->reallen; 520 return cachep->obj_size;
515} 521}
516 522
517static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) 523static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
518{ 524{
519 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 525 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
520 return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); 526 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
521} 527}
522 528
523static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) 529static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
524{ 530{
525 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 531 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
526 if (cachep->flags & SLAB_STORE_USER) 532 if (cachep->flags & SLAB_STORE_USER)
527 return (unsigned long *)(objp + cachep->objsize - 533 return (unsigned long *)(objp + cachep->buffer_size -
528 2 * BYTES_PER_WORD); 534 2 * BYTES_PER_WORD);
529 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); 535 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
530} 536}
531 537
532static void **dbg_userword(kmem_cache_t *cachep, void *objp) 538static void **dbg_userword(struct kmem_cache *cachep, void *objp)
533{ 539{
534 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 540 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
535 return (void **)(objp + cachep->objsize - BYTES_PER_WORD); 541 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
536} 542}
537 543
538#else 544#else
539 545
540#define obj_dbghead(x) 0 546#define obj_offset(x) 0
541#define obj_reallen(cachep) (cachep->objsize) 547#define obj_size(cachep) (cachep->buffer_size)
542#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 548#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
543#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 549#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
544#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 550#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -591,6 +597,18 @@ static inline struct slab *page_get_slab(struct page *page)
591 return (struct slab *)page->lru.prev; 597 return (struct slab *)page->lru.prev;
592} 598}
593 599
600static inline struct kmem_cache *virt_to_cache(const void *obj)
601{
602 struct page *page = virt_to_page(obj);
603 return page_get_cache(page);
604}
605
606static inline struct slab *virt_to_slab(const void *obj)
607{
608 struct page *page = virt_to_page(obj);
609 return page_get_slab(page);
610}
611
594/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 612/* These are the default caches for kmalloc. Custom caches can have other sizes. */
595struct cache_sizes malloc_sizes[] = { 613struct cache_sizes malloc_sizes[] = {
596#define CACHE(x) { .cs_size = (x) }, 614#define CACHE(x) { .cs_size = (x) },
@@ -619,16 +637,16 @@ static struct arraycache_init initarray_generic =
619 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 637 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
620 638
621/* internal cache of cache description objs */ 639/* internal cache of cache description objs */
622static kmem_cache_t cache_cache = { 640static struct kmem_cache cache_cache = {
623 .batchcount = 1, 641 .batchcount = 1,
624 .limit = BOOT_CPUCACHE_ENTRIES, 642 .limit = BOOT_CPUCACHE_ENTRIES,
625 .shared = 1, 643 .shared = 1,
626 .objsize = sizeof(kmem_cache_t), 644 .buffer_size = sizeof(struct kmem_cache),
627 .flags = SLAB_NO_REAP, 645 .flags = SLAB_NO_REAP,
628 .spinlock = SPIN_LOCK_UNLOCKED, 646 .spinlock = SPIN_LOCK_UNLOCKED,
629 .name = "kmem_cache", 647 .name = "kmem_cache",
630#if DEBUG 648#if DEBUG
631 .reallen = sizeof(kmem_cache_t), 649 .obj_size = sizeof(struct kmem_cache),
632#endif 650#endif
633}; 651};
634 652
@@ -657,17 +675,17 @@ static enum {
657 675
658static DEFINE_PER_CPU(struct work_struct, reap_work); 676static DEFINE_PER_CPU(struct work_struct, reap_work);
659 677
660static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); 678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
661static void enable_cpucache(kmem_cache_t *cachep); 679static void enable_cpucache(struct kmem_cache *cachep);
662static void cache_reap(void *unused); 680static void cache_reap(void *unused);
663static int __node_shrink(kmem_cache_t *cachep, int node); 681static int __node_shrink(struct kmem_cache *cachep, int node);
664 682
665static inline struct array_cache *ac_data(kmem_cache_t *cachep) 683static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
666{ 684{
667 return cachep->array[smp_processor_id()]; 685 return cachep->array[smp_processor_id()];
668} 686}
669 687
670static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) 688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
671{ 689{
672 struct cache_sizes *csizep = malloc_sizes; 690 struct cache_sizes *csizep = malloc_sizes;
673 691
@@ -691,43 +709,80 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
691 return csizep->cs_cachep; 709 return csizep->cs_cachep;
692} 710}
693 711
694kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 712struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
695{ 713{
696 return __find_general_cachep(size, gfpflags); 714 return __find_general_cachep(size, gfpflags);
697} 715}
698EXPORT_SYMBOL(kmem_find_general_cachep); 716EXPORT_SYMBOL(kmem_find_general_cachep);
699 717
700/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 718static size_t slab_mgmt_size(size_t nr_objs, size_t align)
701static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
702 int flags, size_t *left_over, unsigned int *num)
703{ 719{
704 int i; 720 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
705 size_t wastage = PAGE_SIZE << gfporder; 721}
706 size_t extra = 0;
707 size_t base = 0;
708 722
709 if (!(flags & CFLGS_OFF_SLAB)) { 723/* Calculate the number of objects and left-over bytes for a given
710 base = sizeof(struct slab); 724 buffer size. */
711 extra = sizeof(kmem_bufctl_t); 725static void cache_estimate(unsigned long gfporder, size_t buffer_size,
712 } 726 size_t align, int flags, size_t *left_over,
713 i = 0; 727 unsigned int *num)
714 while (i * size + ALIGN(base + i * extra, align) <= wastage) 728{
715 i++; 729 int nr_objs;
716 if (i > 0) 730 size_t mgmt_size;
717 i--; 731 size_t slab_size = PAGE_SIZE << gfporder;
732
733 /*
734 * The slab management structure can be either off the slab or
735 * on it. For the latter case, the memory allocated for a
736 * slab is used for:
737 *
738 * - The struct slab
739 * - One kmem_bufctl_t for each object
740 * - Padding to respect alignment of @align
741 * - @buffer_size bytes for each object
742 *
743 * If the slab management structure is off the slab, then the
744 * alignment will already be calculated into the size. Because
745 * the slabs are all page aligned, the objects will be at the
746 * correct alignment when allocated.
747 */
748 if (flags & CFLGS_OFF_SLAB) {
749 mgmt_size = 0;
750 nr_objs = slab_size / buffer_size;
751
752 if (nr_objs > SLAB_LIMIT)
753 nr_objs = SLAB_LIMIT;
754 } else {
755 /*
756 * Ignore padding for the initial guess. The padding
757 * is at most @align-1 bytes, and @buffer_size is at
758 * least @align. In the worst case, this result will
759 * be one greater than the number of objects that fit
760 * into the memory allocation when taking the padding
761 * into account.
762 */
763 nr_objs = (slab_size - sizeof(struct slab)) /
764 (buffer_size + sizeof(kmem_bufctl_t));
765
766 /*
767 * This calculated number will be either the right
768 * amount, or one greater than what we want.
769 */
770 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
771 > slab_size)
772 nr_objs--;
718 773
719 if (i > SLAB_LIMIT) 774 if (nr_objs > SLAB_LIMIT)
720 i = SLAB_LIMIT; 775 nr_objs = SLAB_LIMIT;
721 776
722 *num = i; 777 mgmt_size = slab_mgmt_size(nr_objs, align);
723 wastage -= i * size; 778 }
724 wastage -= ALIGN(base + i * extra, align); 779 *num = nr_objs;
725 *left_over = wastage; 780 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
726} 781}
727 782
728#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
729 784
730static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
731{ 786{
732 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 787 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
733 function, cachep->name, msg); 788 function, cachep->name, msg);
@@ -774,9 +829,9 @@ static struct array_cache *alloc_arraycache(int node, int entries,
774} 829}
775 830
776#ifdef CONFIG_NUMA 831#ifdef CONFIG_NUMA
777static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); 832static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
778 833
779static inline struct array_cache **alloc_alien_cache(int node, int limit) 834static struct array_cache **alloc_alien_cache(int node, int limit)
780{ 835{
781 struct array_cache **ac_ptr; 836 struct array_cache **ac_ptr;
782 int memsize = sizeof(void *) * MAX_NUMNODES; 837 int memsize = sizeof(void *) * MAX_NUMNODES;
@@ -803,7 +858,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
803 return ac_ptr; 858 return ac_ptr;
804} 859}
805 860
806static inline void free_alien_cache(struct array_cache **ac_ptr) 861static void free_alien_cache(struct array_cache **ac_ptr)
807{ 862{
808 int i; 863 int i;
809 864
@@ -816,8 +871,8 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
816 kfree(ac_ptr); 871 kfree(ac_ptr);
817} 872}
818 873
819static inline void __drain_alien_cache(kmem_cache_t *cachep, 874static void __drain_alien_cache(struct kmem_cache *cachep,
820 struct array_cache *ac, int node) 875 struct array_cache *ac, int node)
821{ 876{
822 struct kmem_list3 *rl3 = cachep->nodelists[node]; 877 struct kmem_list3 *rl3 = cachep->nodelists[node];
823 878
@@ -829,14 +884,14 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep,
829 } 884 }
830} 885}
831 886
832static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 887static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
833{ 888{
834 int i = 0; 889 int i = 0;
835 struct array_cache *ac; 890 struct array_cache *ac;
836 unsigned long flags; 891 unsigned long flags;
837 892
838 for_each_online_node(i) { 893 for_each_online_node(i) {
839 ac = l3->alien[i]; 894 ac = alien[i];
840 if (ac) { 895 if (ac) {
841 spin_lock_irqsave(&ac->lock, flags); 896 spin_lock_irqsave(&ac->lock, flags);
842 __drain_alien_cache(cachep, ac, i); 897 __drain_alien_cache(cachep, ac, i);
@@ -845,16 +900,25 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
845 } 900 }
846} 901}
847#else 902#else
848#define alloc_alien_cache(node, limit) do { } while (0) 903
849#define free_alien_cache(ac_ptr) do { } while (0) 904#define drain_alien_cache(cachep, alien) do { } while (0)
850#define drain_alien_cache(cachep, l3) do { } while (0) 905
906static inline struct array_cache **alloc_alien_cache(int node, int limit)
907{
908 return (struct array_cache **) 0x01020304ul;
909}
910
911static inline void free_alien_cache(struct array_cache **ac_ptr)
912{
913}
914
851#endif 915#endif
852 916
853static int __devinit cpuup_callback(struct notifier_block *nfb, 917static int __devinit cpuup_callback(struct notifier_block *nfb,
854 unsigned long action, void *hcpu) 918 unsigned long action, void *hcpu)
855{ 919{
856 long cpu = (long)hcpu; 920 long cpu = (long)hcpu;
857 kmem_cache_t *cachep; 921 struct kmem_cache *cachep;
858 struct kmem_list3 *l3 = NULL; 922 struct kmem_list3 *l3 = NULL;
859 int node = cpu_to_node(cpu); 923 int node = cpu_to_node(cpu);
860 int memsize = sizeof(struct kmem_list3); 924 int memsize = sizeof(struct kmem_list3);
@@ -881,6 +945,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
881 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 945 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
882 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 946 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
883 947
948 /*
949 * The l3s don't come and go as CPUs come and
950 * go. cache_chain_mutex is sufficient
951 * protection here.
952 */
884 cachep->nodelists[node] = l3; 953 cachep->nodelists[node] = l3;
885 } 954 }
886 955
@@ -895,26 +964,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
895 & array cache's */ 964 & array cache's */
896 list_for_each_entry(cachep, &cache_chain, next) { 965 list_for_each_entry(cachep, &cache_chain, next) {
897 struct array_cache *nc; 966 struct array_cache *nc;
967 struct array_cache *shared;
968 struct array_cache **alien;
898 969
899 nc = alloc_arraycache(node, cachep->limit, 970 nc = alloc_arraycache(node, cachep->limit,
900 cachep->batchcount); 971 cachep->batchcount);
901 if (!nc) 972 if (!nc)
902 goto bad; 973 goto bad;
974 shared = alloc_arraycache(node,
975 cachep->shared * cachep->batchcount,
976 0xbaadf00d);
977 if (!shared)
978 goto bad;
979
980 alien = alloc_alien_cache(node, cachep->limit);
981 if (!alien)
982 goto bad;
903 cachep->array[cpu] = nc; 983 cachep->array[cpu] = nc;
904 984
905 l3 = cachep->nodelists[node]; 985 l3 = cachep->nodelists[node];
906 BUG_ON(!l3); 986 BUG_ON(!l3);
907 if (!l3->shared) {
908 if (!(nc = alloc_arraycache(node,
909 cachep->shared *
910 cachep->batchcount,
911 0xbaadf00d)))
912 goto bad;
913 987
914 /* we are serialised from CPU_DEAD or 988 spin_lock_irq(&l3->list_lock);
915 CPU_UP_CANCELLED by the cpucontrol lock */ 989 if (!l3->shared) {
916 l3->shared = nc; 990 /*
991 * We are serialised from CPU_DEAD or
992 * CPU_UP_CANCELLED by the cpucontrol lock
993 */
994 l3->shared = shared;
995 shared = NULL;
996 }
997#ifdef CONFIG_NUMA
998 if (!l3->alien) {
999 l3->alien = alien;
1000 alien = NULL;
917 } 1001 }
1002#endif
1003 spin_unlock_irq(&l3->list_lock);
1004
1005 kfree(shared);
1006 free_alien_cache(alien);
918 } 1007 }
919 mutex_unlock(&cache_chain_mutex); 1008 mutex_unlock(&cache_chain_mutex);
920 break; 1009 break;
@@ -923,25 +1012,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
923 break; 1012 break;
924#ifdef CONFIG_HOTPLUG_CPU 1013#ifdef CONFIG_HOTPLUG_CPU
925 case CPU_DEAD: 1014 case CPU_DEAD:
1015 /*
1016 * Even if all the cpus of a node are down, we don't free the
1017 * kmem_list3 of any cache. This is to avoid a race between
1018 * cpu_down, and a kmalloc allocation from another cpu for
1019 * memory from the node of the cpu going down. The list3
1020 * structure is usually allocated from kmem_cache_create() and
1021 * gets destroyed at kmem_cache_destroy().
1022 */
926 /* fall thru */ 1023 /* fall thru */
927 case CPU_UP_CANCELED: 1024 case CPU_UP_CANCELED:
928 mutex_lock(&cache_chain_mutex); 1025 mutex_lock(&cache_chain_mutex);
929 1026
930 list_for_each_entry(cachep, &cache_chain, next) { 1027 list_for_each_entry(cachep, &cache_chain, next) {
931 struct array_cache *nc; 1028 struct array_cache *nc;
1029 struct array_cache *shared;
1030 struct array_cache **alien;
932 cpumask_t mask; 1031 cpumask_t mask;
933 1032
934 mask = node_to_cpumask(node); 1033 mask = node_to_cpumask(node);
935 spin_lock_irq(&cachep->spinlock);
936 /* cpu is dead; no one can alloc from it. */ 1034 /* cpu is dead; no one can alloc from it. */
937 nc = cachep->array[cpu]; 1035 nc = cachep->array[cpu];
938 cachep->array[cpu] = NULL; 1036 cachep->array[cpu] = NULL;
939 l3 = cachep->nodelists[node]; 1037 l3 = cachep->nodelists[node];
940 1038
941 if (!l3) 1039 if (!l3)
942 goto unlock_cache; 1040 goto free_array_cache;
943 1041
944 spin_lock(&l3->list_lock); 1042 spin_lock_irq(&l3->list_lock);
945 1043
946 /* Free limit for this kmem_list3 */ 1044 /* Free limit for this kmem_list3 */
947 l3->free_limit -= cachep->batchcount; 1045 l3->free_limit -= cachep->batchcount;
@@ -949,34 +1047,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
949 free_block(cachep, nc->entry, nc->avail, node); 1047 free_block(cachep, nc->entry, nc->avail, node);
950 1048
951 if (!cpus_empty(mask)) { 1049 if (!cpus_empty(mask)) {
952 spin_unlock(&l3->list_lock); 1050 spin_unlock_irq(&l3->list_lock);
953 goto unlock_cache; 1051 goto free_array_cache;
954 } 1052 }
955 1053
956 if (l3->shared) { 1054 shared = l3->shared;
1055 if (shared) {
957 free_block(cachep, l3->shared->entry, 1056 free_block(cachep, l3->shared->entry,
958 l3->shared->avail, node); 1057 l3->shared->avail, node);
959 kfree(l3->shared);
960 l3->shared = NULL; 1058 l3->shared = NULL;
961 } 1059 }
962 if (l3->alien) {
963 drain_alien_cache(cachep, l3);
964 free_alien_cache(l3->alien);
965 l3->alien = NULL;
966 }
967 1060
968 /* free slabs belonging to this node */ 1061 alien = l3->alien;
969 if (__node_shrink(cachep, node)) { 1062 l3->alien = NULL;
970 cachep->nodelists[node] = NULL; 1063
971 spin_unlock(&l3->list_lock); 1064 spin_unlock_irq(&l3->list_lock);
972 kfree(l3); 1065
973 } else { 1066 kfree(shared);
974 spin_unlock(&l3->list_lock); 1067 if (alien) {
1068 drain_alien_cache(cachep, alien);
1069 free_alien_cache(alien);
975 } 1070 }
976 unlock_cache: 1071free_array_cache:
977 spin_unlock_irq(&cachep->spinlock);
978 kfree(nc); 1072 kfree(nc);
979 } 1073 }
1074 /*
1075 * In the previous loop, all the objects were freed to
1076 * the respective cache's slabs; now we can go ahead and
1077 * shrink each nodelist to its limit.
1078 */
1079 list_for_each_entry(cachep, &cache_chain, next) {
1080 l3 = cachep->nodelists[node];
1081 if (!l3)
1082 continue;
1083 spin_lock_irq(&l3->list_lock);
1084 /* free slabs belonging to this node */
1085 __node_shrink(cachep, node);
1086 spin_unlock_irq(&l3->list_lock);
1087 }
980 mutex_unlock(&cache_chain_mutex); 1088 mutex_unlock(&cache_chain_mutex);
981 break; 1089 break;
982#endif 1090#endif
@@ -992,7 +1100,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
992/* 1100/*
993 * swap the static kmem_list3 with kmalloced memory 1101 * swap the static kmem_list3 with kmalloced memory
994 */ 1102 */
995static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) 1103static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
996{ 1104{
997 struct kmem_list3 *ptr; 1105 struct kmem_list3 *ptr;
998 1106
@@ -1032,14 +1140,14 @@ void __init kmem_cache_init(void)
1032 1140
1033 /* Bootstrap is tricky, because several objects are allocated 1141 /* Bootstrap is tricky, because several objects are allocated
1034 * from caches that do not exist yet: 1142 * from caches that do not exist yet:
1035 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 1143 * 1) initialize the cache_cache cache: it contains the struct kmem_cache
1036 * structures of all caches, except cache_cache itself: cache_cache 1144 * structures of all caches, except cache_cache itself: cache_cache
1037 * is statically allocated. 1145 * is statically allocated.
1038 * Initially an __init data area is used for the head array and the 1146 * Initially an __init data area is used for the head array and the
1039 * kmem_list3 structures, it's replaced with a kmalloc allocated 1147 * kmem_list3 structures, it's replaced with a kmalloc allocated
1040 * array at the end of the bootstrap. 1148 * array at the end of the bootstrap.
1041 * 2) Create the first kmalloc cache. 1149 * 2) Create the first kmalloc cache.
1042 * The kmem_cache_t for the new cache is allocated normally. 1150 * The struct kmem_cache for the new cache is allocated normally.
1043 * An __init data area is used for the head array. 1151 * An __init data area is used for the head array.
1044 * 3) Create the remaining kmalloc caches, with minimally sized 1152 * 3) Create the remaining kmalloc caches, with minimally sized
1045 * head arrays. 1153 * head arrays.
@@ -1057,15 +1165,14 @@ void __init kmem_cache_init(void)
1057 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1165 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1058 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1166 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1059 1167
1060 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1168 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
1061 1169
1062 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1170 cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
1063 &left_over, &cache_cache.num); 1171 &left_over, &cache_cache.num);
1064 if (!cache_cache.num) 1172 if (!cache_cache.num)
1065 BUG(); 1173 BUG();
1066 1174
1067 cache_cache.colour = left_over / cache_cache.colour_off; 1175 cache_cache.colour = left_over / cache_cache.colour_off;
1068 cache_cache.colour_next = 0;
1069 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1176 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1070 sizeof(struct slab), cache_line_size()); 1177 sizeof(struct slab), cache_line_size());
1071 1178
@@ -1132,8 +1239,8 @@ void __init kmem_cache_init(void)
1132 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1239 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1133 1240
1134 local_irq_disable(); 1241 local_irq_disable();
1135 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1242 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1136 memcpy(ptr, ac_data(&cache_cache), 1243 memcpy(ptr, cpu_cache_get(&cache_cache),
1137 sizeof(struct arraycache_init)); 1244 sizeof(struct arraycache_init));
1138 cache_cache.array[smp_processor_id()] = ptr; 1245 cache_cache.array[smp_processor_id()] = ptr;
1139 local_irq_enable(); 1246 local_irq_enable();
@@ -1141,9 +1248,9 @@ void __init kmem_cache_init(void)
1141 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1248 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1142 1249
1143 local_irq_disable(); 1250 local_irq_disable();
1144 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1251 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1145 != &initarray_generic.cache); 1252 != &initarray_generic.cache);
1146 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1253 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1147 sizeof(struct arraycache_init)); 1254 sizeof(struct arraycache_init));
1148 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1255 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1149 ptr; 1256 ptr;
@@ -1170,7 +1277,7 @@ void __init kmem_cache_init(void)
1170 1277
1171 /* 6) resize the head arrays to their final sizes */ 1278 /* 6) resize the head arrays to their final sizes */
1172 { 1279 {
1173 kmem_cache_t *cachep; 1280 struct kmem_cache *cachep;
1174 mutex_lock(&cache_chain_mutex); 1281 mutex_lock(&cache_chain_mutex);
1175 list_for_each_entry(cachep, &cache_chain, next) 1282 list_for_each_entry(cachep, &cache_chain, next)
1176 enable_cpucache(cachep); 1283 enable_cpucache(cachep);
@@ -1181,7 +1288,7 @@ void __init kmem_cache_init(void)
1181 g_cpucache_up = FULL; 1288 g_cpucache_up = FULL;
1182 1289
1183 /* Register a cpu startup notifier callback 1290 /* Register a cpu startup notifier callback
1184 * that initializes ac_data for all new cpus 1291 * that initializes cpu_cache_get for all new cpus
1185 */ 1292 */
1186 register_cpu_notifier(&cpucache_notifier); 1293 register_cpu_notifier(&cpucache_notifier);
1187 1294
@@ -1213,7 +1320,7 @@ __initcall(cpucache_init);
1213 * did not request dmaable memory, we might get it, but that 1320 * did not request dmaable memory, we might get it, but that
1214 * would be relatively rare and ignorable. 1321 * would be relatively rare and ignorable.
1215 */ 1322 */
1216static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) 1323static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1217{ 1324{
1218 struct page *page; 1325 struct page *page;
1219 void *addr; 1326 void *addr;
@@ -1239,7 +1346,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1239/* 1346/*
1240 * Interface to system's page release. 1347 * Interface to system's page release.
1241 */ 1348 */
1242static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1349static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1243{ 1350{
1244 unsigned long i = (1 << cachep->gfporder); 1351 unsigned long i = (1 << cachep->gfporder);
1245 struct page *page = virt_to_page(addr); 1352 struct page *page = virt_to_page(addr);
@@ -1261,7 +1368,7 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1261static void kmem_rcu_free(struct rcu_head *head) 1368static void kmem_rcu_free(struct rcu_head *head)
1262{ 1369{
1263 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1370 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1264 kmem_cache_t *cachep = slab_rcu->cachep; 1371 struct kmem_cache *cachep = slab_rcu->cachep;
1265 1372
1266 kmem_freepages(cachep, slab_rcu->addr); 1373 kmem_freepages(cachep, slab_rcu->addr);
1267 if (OFF_SLAB(cachep)) 1374 if (OFF_SLAB(cachep))
@@ -1271,12 +1378,12 @@ static void kmem_rcu_free(struct rcu_head *head)
1271#if DEBUG 1378#if DEBUG
1272 1379
1273#ifdef CONFIG_DEBUG_PAGEALLOC 1380#ifdef CONFIG_DEBUG_PAGEALLOC
1274static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1381static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1275 unsigned long caller) 1382 unsigned long caller)
1276{ 1383{
1277 int size = obj_reallen(cachep); 1384 int size = obj_size(cachep);
1278 1385
1279 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; 1386 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1280 1387
1281 if (size < 5 * sizeof(unsigned long)) 1388 if (size < 5 * sizeof(unsigned long))
1282 return; 1389 return;
@@ -1304,10 +1411,10 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1304} 1411}
1305#endif 1412#endif
1306 1413
1307static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1414static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1308{ 1415{
1309 int size = obj_reallen(cachep); 1416 int size = obj_size(cachep);
1310 addr = &((char *)addr)[obj_dbghead(cachep)]; 1417 addr = &((char *)addr)[obj_offset(cachep)];
1311 1418
1312 memset(addr, val, size); 1419 memset(addr, val, size);
1313 *(unsigned char *)(addr + size - 1) = POISON_END; 1420 *(unsigned char *)(addr + size - 1) = POISON_END;
@@ -1326,7 +1433,7 @@ static void dump_line(char *data, int offset, int limit)
1326 1433
1327#if DEBUG 1434#if DEBUG
1328 1435
1329static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) 1436static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1330{ 1437{
1331 int i, size; 1438 int i, size;
1332 char *realobj; 1439 char *realobj;
@@ -1344,8 +1451,8 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1344 (unsigned long)*dbg_userword(cachep, objp)); 1451 (unsigned long)*dbg_userword(cachep, objp));
1345 printk("\n"); 1452 printk("\n");
1346 } 1453 }
1347 realobj = (char *)objp + obj_dbghead(cachep); 1454 realobj = (char *)objp + obj_offset(cachep);
1348 size = obj_reallen(cachep); 1455 size = obj_size(cachep);
1349 for (i = 0; i < size && lines; i += 16, lines--) { 1456 for (i = 0; i < size && lines; i += 16, lines--) {
1350 int limit; 1457 int limit;
1351 limit = 16; 1458 limit = 16;
@@ -1355,14 +1462,14 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1355 } 1462 }
1356} 1463}
1357 1464
1358static void check_poison_obj(kmem_cache_t *cachep, void *objp) 1465static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1359{ 1466{
1360 char *realobj; 1467 char *realobj;
1361 int size, i; 1468 int size, i;
1362 int lines = 0; 1469 int lines = 0;
1363 1470
1364 realobj = (char *)objp + obj_dbghead(cachep); 1471 realobj = (char *)objp + obj_offset(cachep);
1365 size = obj_reallen(cachep); 1472 size = obj_size(cachep);
1366 1473
1367 for (i = 0; i < size; i++) { 1474 for (i = 0; i < size; i++) {
1368 char exp = POISON_FREE; 1475 char exp = POISON_FREE;
@@ -1395,20 +1502,20 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1395 /* Print some data about the neighboring objects, if they 1502 /* Print some data about the neighboring objects, if they
1396 * exist: 1503 * exist:
1397 */ 1504 */
1398 struct slab *slabp = page_get_slab(virt_to_page(objp)); 1505 struct slab *slabp = virt_to_slab(objp);
1399 int objnr; 1506 int objnr;
1400 1507
1401 objnr = (objp - slabp->s_mem) / cachep->objsize; 1508 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
1402 if (objnr) { 1509 if (objnr) {
1403 objp = slabp->s_mem + (objnr - 1) * cachep->objsize; 1510 objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
1404 realobj = (char *)objp + obj_dbghead(cachep); 1511 realobj = (char *)objp + obj_offset(cachep);
1405 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1512 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1406 realobj, size); 1513 realobj, size);
1407 print_objinfo(cachep, objp, 2); 1514 print_objinfo(cachep, objp, 2);
1408 } 1515 }
1409 if (objnr + 1 < cachep->num) { 1516 if (objnr + 1 < cachep->num) {
1410 objp = slabp->s_mem + (objnr + 1) * cachep->objsize; 1517 objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
1411 realobj = (char *)objp + obj_dbghead(cachep); 1518 realobj = (char *)objp + obj_offset(cachep);
1412 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1519 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1413 realobj, size); 1520 realobj, size);
1414 print_objinfo(cachep, objp, 2); 1521 print_objinfo(cachep, objp, 2);
@@ -1417,25 +1524,23 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1417} 1524}
1418#endif 1525#endif
1419 1526
1420/* Destroy all the objs in a slab, and release the mem back to the system. 1527#if DEBUG
1421 * Before calling the slab must have been unlinked from the cache. 1528/**
1422 * The cache-lock is not held/needed. 1529 * slab_destroy_objs - call the registered destructor for each object in
1530 * a slab that is to be destroyed.
1423 */ 1531 */
1424static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) 1532static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1425{ 1533{
1426 void *addr = slabp->s_mem - slabp->colouroff;
1427
1428#if DEBUG
1429 int i; 1534 int i;
1430 for (i = 0; i < cachep->num; i++) { 1535 for (i = 0; i < cachep->num; i++) {
1431 void *objp = slabp->s_mem + cachep->objsize * i; 1536 void *objp = slabp->s_mem + cachep->buffer_size * i;
1432 1537
1433 if (cachep->flags & SLAB_POISON) { 1538 if (cachep->flags & SLAB_POISON) {
1434#ifdef CONFIG_DEBUG_PAGEALLOC 1539#ifdef CONFIG_DEBUG_PAGEALLOC
1435 if ((cachep->objsize % PAGE_SIZE) == 0 1540 if ((cachep->buffer_size % PAGE_SIZE) == 0
1436 && OFF_SLAB(cachep)) 1541 && OFF_SLAB(cachep))
1437 kernel_map_pages(virt_to_page(objp), 1542 kernel_map_pages(virt_to_page(objp),
1438 cachep->objsize / PAGE_SIZE, 1543 cachep->buffer_size / PAGE_SIZE,
1439 1); 1544 1);
1440 else 1545 else
1441 check_poison_obj(cachep, objp); 1546 check_poison_obj(cachep, objp);
@@ -1452,18 +1557,32 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1452 "was overwritten"); 1557 "was overwritten");
1453 } 1558 }
1454 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1559 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1455 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); 1560 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1456 } 1561 }
1562}
1457#else 1563#else
1564static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1565{
1458 if (cachep->dtor) { 1566 if (cachep->dtor) {
1459 int i; 1567 int i;
1460 for (i = 0; i < cachep->num; i++) { 1568 for (i = 0; i < cachep->num; i++) {
1461 void *objp = slabp->s_mem + cachep->objsize * i; 1569 void *objp = slabp->s_mem + cachep->buffer_size * i;
1462 (cachep->dtor) (objp, cachep, 0); 1570 (cachep->dtor) (objp, cachep, 0);
1463 } 1571 }
1464 } 1572 }
1573}
1465#endif 1574#endif
1466 1575
1576/**
1577 * Destroy all the objs in a slab, and release the mem back to the system.
1578 * Before calling the slab must have been unlinked from the cache.
1579 * The cache-lock is not held/needed.
1580 */
1581static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1582{
1583 void *addr = slabp->s_mem - slabp->colouroff;
1584
1585 slab_destroy_objs(cachep, slabp);
1467 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1586 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1468 struct slab_rcu *slab_rcu; 1587 struct slab_rcu *slab_rcu;
1469 1588
@@ -1478,9 +1597,9 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1478 } 1597 }
1479} 1598}
1480 1599
1481/* For setting up all the kmem_list3s for cache whose objsize is same 1600/* For setting up all the kmem_list3s for cache whose buffer_size is same
1482 as size of kmem_list3. */ 1601 as size of kmem_list3. */
1483static inline void set_up_list3s(kmem_cache_t *cachep, int index) 1602static void set_up_list3s(struct kmem_cache *cachep, int index)
1484{ 1603{
1485 int node; 1604 int node;
1486 1605
@@ -1493,15 +1612,20 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1493} 1612}
1494 1613
1495/** 1614/**
1496 * calculate_slab_order - calculate size (page order) of slabs and the number 1615 * calculate_slab_order - calculate size (page order) of slabs
1497 * of objects per slab. 1616 * @cachep: pointer to the cache that is being created
1617 * @size: size of objects to be created in this cache.
1618 * @align: required alignment for the objects.
1619 * @flags: slab allocation flags
1620 *
1621 * Also calculates the number of objects per slab.
1498 * 1622 *
1499 * This could be made much more intelligent. For now, try to avoid using 1623 * This could be made much more intelligent. For now, try to avoid using
1500 * high order pages for slabs. When the gfp() functions are more friendly 1624 * high order pages for slabs. When the gfp() functions are more friendly
1501 * towards high-order requests, this should be changed. 1625 * towards high-order requests, this should be changed.
1502 */ 1626 */
1503static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, 1627static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1504 size_t align, gfp_t flags) 1628 size_t size, size_t align, unsigned long flags)
1505{ 1629{
1506 size_t left_over = 0; 1630 size_t left_over = 0;
1507 1631
@@ -1572,13 +1696,13 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1572 * cacheline. This can be beneficial if you're counting cycles as closely 1696 * cacheline. This can be beneficial if you're counting cycles as closely
1573 * as davem. 1697 * as davem.
1574 */ 1698 */
1575kmem_cache_t * 1699struct kmem_cache *
1576kmem_cache_create (const char *name, size_t size, size_t align, 1700kmem_cache_create (const char *name, size_t size, size_t align,
1577 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), 1701 unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
1578 void (*dtor)(void*, kmem_cache_t *, unsigned long)) 1702 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1579{ 1703{
1580 size_t left_over, slab_size, ralign; 1704 size_t left_over, slab_size, ralign;
1581 kmem_cache_t *cachep = NULL; 1705 struct kmem_cache *cachep = NULL;
1582 struct list_head *p; 1706 struct list_head *p;
1583 1707
1584 /* 1708 /*
@@ -1596,7 +1720,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1596 mutex_lock(&cache_chain_mutex); 1720 mutex_lock(&cache_chain_mutex);
1597 1721
1598 list_for_each(p, &cache_chain) { 1722 list_for_each(p, &cache_chain) {
1599 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); 1723 struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1600 mm_segment_t old_fs = get_fs(); 1724 mm_segment_t old_fs = get_fs();
1601 char tmp; 1725 char tmp;
1602 int res; 1726 int res;
@@ -1611,7 +1735,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1611 set_fs(old_fs); 1735 set_fs(old_fs);
1612 if (res) { 1736 if (res) {
1613 printk("SLAB: cache with size %d has lost its name\n", 1737 printk("SLAB: cache with size %d has lost its name\n",
1614 pc->objsize); 1738 pc->buffer_size);
1615 continue; 1739 continue;
1616 } 1740 }
1617 1741
@@ -1696,20 +1820,20 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1696 align = ralign; 1820 align = ralign;
1697 1821
1698 /* Get cache's description obj. */ 1822 /* Get cache's description obj. */
1699 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 1823 cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1700 if (!cachep) 1824 if (!cachep)
1701 goto oops; 1825 goto oops;
1702 memset(cachep, 0, sizeof(kmem_cache_t)); 1826 memset(cachep, 0, sizeof(struct kmem_cache));
1703 1827
1704#if DEBUG 1828#if DEBUG
1705 cachep->reallen = size; 1829 cachep->obj_size = size;
1706 1830
1707 if (flags & SLAB_RED_ZONE) { 1831 if (flags & SLAB_RED_ZONE) {
1708 /* redzoning only works with word aligned caches */ 1832 /* redzoning only works with word aligned caches */
1709 align = BYTES_PER_WORD; 1833 align = BYTES_PER_WORD;
1710 1834
1711 /* add space for red zone words */ 1835 /* add space for red zone words */
1712 cachep->dbghead += BYTES_PER_WORD; 1836 cachep->obj_offset += BYTES_PER_WORD;
1713 size += 2 * BYTES_PER_WORD; 1837 size += 2 * BYTES_PER_WORD;
1714 } 1838 }
1715 if (flags & SLAB_STORE_USER) { 1839 if (flags & SLAB_STORE_USER) {
@@ -1722,8 +1846,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1722 } 1846 }
1723#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1847#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1724 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 1848 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1725 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1849 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
1726 cachep->dbghead += PAGE_SIZE - size; 1850 cachep->obj_offset += PAGE_SIZE - size;
1727 size = PAGE_SIZE; 1851 size = PAGE_SIZE;
1728 } 1852 }
1729#endif 1853#endif
@@ -1786,7 +1910,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1786 if (flags & SLAB_CACHE_DMA) 1910 if (flags & SLAB_CACHE_DMA)
1787 cachep->gfpflags |= GFP_DMA; 1911 cachep->gfpflags |= GFP_DMA;
1788 spin_lock_init(&cachep->spinlock); 1912 spin_lock_init(&cachep->spinlock);
1789 cachep->objsize = size; 1913 cachep->buffer_size = size;
1790 1914
1791 if (flags & CFLGS_OFF_SLAB) 1915 if (flags & CFLGS_OFF_SLAB)
1792 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 1916 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -1843,11 +1967,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1843 jiffies + REAPTIMEOUT_LIST3 + 1967 jiffies + REAPTIMEOUT_LIST3 +
1844 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1968 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1845 1969
1846 BUG_ON(!ac_data(cachep)); 1970 BUG_ON(!cpu_cache_get(cachep));
1847 ac_data(cachep)->avail = 0; 1971 cpu_cache_get(cachep)->avail = 0;
1848 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1972 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1849 ac_data(cachep)->batchcount = 1; 1973 cpu_cache_get(cachep)->batchcount = 1;
1850 ac_data(cachep)->touched = 0; 1974 cpu_cache_get(cachep)->touched = 0;
1851 cachep->batchcount = 1; 1975 cachep->batchcount = 1;
1852 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1976 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1853 } 1977 }
@@ -1875,7 +1999,7 @@ static void check_irq_on(void)
1875 BUG_ON(irqs_disabled()); 1999 BUG_ON(irqs_disabled());
1876} 2000}
1877 2001
1878static void check_spinlock_acquired(kmem_cache_t *cachep) 2002static void check_spinlock_acquired(struct kmem_cache *cachep)
1879{ 2003{
1880#ifdef CONFIG_SMP 2004#ifdef CONFIG_SMP
1881 check_irq_off(); 2005 check_irq_off();
@@ -1883,7 +2007,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep)
1883#endif 2007#endif
1884} 2008}
1885 2009
1886static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) 2010static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
1887{ 2011{
1888#ifdef CONFIG_SMP 2012#ifdef CONFIG_SMP
1889 check_irq_off(); 2013 check_irq_off();
@@ -1916,45 +2040,43 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1916 preempt_enable(); 2040 preempt_enable();
1917} 2041}
1918 2042
1919static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 2043static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
1920 int force, int node); 2044 int force, int node);
1921 2045
1922static void do_drain(void *arg) 2046static void do_drain(void *arg)
1923{ 2047{
1924 kmem_cache_t *cachep = (kmem_cache_t *) arg; 2048 struct kmem_cache *cachep = (struct kmem_cache *) arg;
1925 struct array_cache *ac; 2049 struct array_cache *ac;
1926 int node = numa_node_id(); 2050 int node = numa_node_id();
1927 2051
1928 check_irq_off(); 2052 check_irq_off();
1929 ac = ac_data(cachep); 2053 ac = cpu_cache_get(cachep);
1930 spin_lock(&cachep->nodelists[node]->list_lock); 2054 spin_lock(&cachep->nodelists[node]->list_lock);
1931 free_block(cachep, ac->entry, ac->avail, node); 2055 free_block(cachep, ac->entry, ac->avail, node);
1932 spin_unlock(&cachep->nodelists[node]->list_lock); 2056 spin_unlock(&cachep->nodelists[node]->list_lock);
1933 ac->avail = 0; 2057 ac->avail = 0;
1934} 2058}
1935 2059
1936static void drain_cpu_caches(kmem_cache_t *cachep) 2060static void drain_cpu_caches(struct kmem_cache *cachep)
1937{ 2061{
1938 struct kmem_list3 *l3; 2062 struct kmem_list3 *l3;
1939 int node; 2063 int node;
1940 2064
1941 smp_call_function_all_cpus(do_drain, cachep); 2065 smp_call_function_all_cpus(do_drain, cachep);
1942 check_irq_on(); 2066 check_irq_on();
1943 spin_lock_irq(&cachep->spinlock);
1944 for_each_online_node(node) { 2067 for_each_online_node(node) {
1945 l3 = cachep->nodelists[node]; 2068 l3 = cachep->nodelists[node];
1946 if (l3) { 2069 if (l3) {
1947 spin_lock(&l3->list_lock); 2070 spin_lock_irq(&l3->list_lock);
1948 drain_array_locked(cachep, l3->shared, 1, node); 2071 drain_array_locked(cachep, l3->shared, 1, node);
1949 spin_unlock(&l3->list_lock); 2072 spin_unlock_irq(&l3->list_lock);
1950 if (l3->alien) 2073 if (l3->alien)
1951 drain_alien_cache(cachep, l3); 2074 drain_alien_cache(cachep, l3->alien);
1952 } 2075 }
1953 } 2076 }
1954 spin_unlock_irq(&cachep->spinlock);
1955} 2077}
1956 2078
1957static int __node_shrink(kmem_cache_t *cachep, int node) 2079static int __node_shrink(struct kmem_cache *cachep, int node)
1958{ 2080{
1959 struct slab *slabp; 2081 struct slab *slabp;
1960 struct kmem_list3 *l3 = cachep->nodelists[node]; 2082 struct kmem_list3 *l3 = cachep->nodelists[node];
@@ -1983,7 +2105,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1983 return ret; 2105 return ret;
1984} 2106}
1985 2107
1986static int __cache_shrink(kmem_cache_t *cachep) 2108static int __cache_shrink(struct kmem_cache *cachep)
1987{ 2109{
1988 int ret = 0, i = 0; 2110 int ret = 0, i = 0;
1989 struct kmem_list3 *l3; 2111 struct kmem_list3 *l3;
@@ -2009,7 +2131,7 @@ static int __cache_shrink(kmem_cache_t *cachep)
2009 * Releases as many slabs as possible for a cache. 2131 * Releases as many slabs as possible for a cache.
2010 * To help debugging, a zero exit status indicates all slabs were released. 2132 * To help debugging, a zero exit status indicates all slabs were released.
2011 */ 2133 */
2012int kmem_cache_shrink(kmem_cache_t *cachep) 2134int kmem_cache_shrink(struct kmem_cache *cachep)
2013{ 2135{
2014 if (!cachep || in_interrupt()) 2136 if (!cachep || in_interrupt())
2015 BUG(); 2137 BUG();
@@ -2022,7 +2144,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2022 * kmem_cache_destroy - delete a cache 2144 * kmem_cache_destroy - delete a cache
2023 * @cachep: the cache to destroy 2145 * @cachep: the cache to destroy
2024 * 2146 *
2025 * Remove a kmem_cache_t object from the slab cache. 2147 * Remove a struct kmem_cache object from the slab cache.
2026 * Returns 0 on success. 2148 * Returns 0 on success.
2027 * 2149 *
2028 * It is expected this function will be called by a module when it is 2150 * It is expected this function will be called by a module when it is
@@ -2035,7 +2157,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2035 * The caller must guarantee that noone will allocate memory from the cache 2157 * The caller must guarantee that noone will allocate memory from the cache
2036 * during the kmem_cache_destroy(). 2158 * during the kmem_cache_destroy().
2037 */ 2159 */
2038int kmem_cache_destroy(kmem_cache_t *cachep) 2160int kmem_cache_destroy(struct kmem_cache *cachep)
2039{ 2161{
2040 int i; 2162 int i;
2041 struct kmem_list3 *l3; 2163 struct kmem_list3 *l3;
@@ -2086,7 +2208,7 @@ int kmem_cache_destroy(kmem_cache_t *cachep)
2086EXPORT_SYMBOL(kmem_cache_destroy); 2208EXPORT_SYMBOL(kmem_cache_destroy);
2087 2209
2088/* Get the memory for a slab management obj. */ 2210/* Get the memory for a slab management obj. */
2089static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2211static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2090 int colour_off, gfp_t local_flags) 2212 int colour_off, gfp_t local_flags)
2091{ 2213{
2092 struct slab *slabp; 2214 struct slab *slabp;
@@ -2112,13 +2234,13 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2112 return (kmem_bufctl_t *) (slabp + 1); 2234 return (kmem_bufctl_t *) (slabp + 1);
2113} 2235}
2114 2236
2115static void cache_init_objs(kmem_cache_t *cachep, 2237static void cache_init_objs(struct kmem_cache *cachep,
2116 struct slab *slabp, unsigned long ctor_flags) 2238 struct slab *slabp, unsigned long ctor_flags)
2117{ 2239{
2118 int i; 2240 int i;
2119 2241
2120 for (i = 0; i < cachep->num; i++) { 2242 for (i = 0; i < cachep->num; i++) {
2121 void *objp = slabp->s_mem + cachep->objsize * i; 2243 void *objp = slabp->s_mem + cachep->buffer_size * i;
2122#if DEBUG 2244#if DEBUG
2123 /* need to poison the objs? */ 2245 /* need to poison the objs? */
2124 if (cachep->flags & SLAB_POISON) 2246 if (cachep->flags & SLAB_POISON)
@@ -2136,7 +2258,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2136 * Otherwise, deadlock. They must also be threaded. 2258 * Otherwise, deadlock. They must also be threaded.
2137 */ 2259 */
2138 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2260 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2139 cachep->ctor(objp + obj_dbghead(cachep), cachep, 2261 cachep->ctor(objp + obj_offset(cachep), cachep,
2140 ctor_flags); 2262 ctor_flags);
2141 2263
2142 if (cachep->flags & SLAB_RED_ZONE) { 2264 if (cachep->flags & SLAB_RED_ZONE) {
@@ -2147,10 +2269,10 @@ static void cache_init_objs(kmem_cache_t *cachep,
2147 slab_error(cachep, "constructor overwrote the" 2269 slab_error(cachep, "constructor overwrote the"
2148 " start of an object"); 2270 " start of an object");
2149 } 2271 }
2150 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2272 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2151 && cachep->flags & SLAB_POISON) 2273 && cachep->flags & SLAB_POISON)
2152 kernel_map_pages(virt_to_page(objp), 2274 kernel_map_pages(virt_to_page(objp),
2153 cachep->objsize / PAGE_SIZE, 0); 2275 cachep->buffer_size / PAGE_SIZE, 0);
2154#else 2276#else
2155 if (cachep->ctor) 2277 if (cachep->ctor)
2156 cachep->ctor(objp, cachep, ctor_flags); 2278 cachep->ctor(objp, cachep, ctor_flags);
@@ -2161,7 +2283,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2161 slabp->free = 0; 2283 slabp->free = 0;
2162} 2284}
2163 2285
2164static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) 2286static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2165{ 2287{
2166 if (flags & SLAB_DMA) { 2288 if (flags & SLAB_DMA) {
2167 if (!(cachep->gfpflags & GFP_DMA)) 2289 if (!(cachep->gfpflags & GFP_DMA))
@@ -2172,7 +2294,43 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
2172 } 2294 }
2173} 2295}
2174 2296
2175static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2297static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
2298{
2299 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
2300 kmem_bufctl_t next;
2301
2302 slabp->inuse++;
2303 next = slab_bufctl(slabp)[slabp->free];
2304#if DEBUG
2305 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2306 WARN_ON(slabp->nodeid != nodeid);
2307#endif
2308 slabp->free = next;
2309
2310 return objp;
2311}
2312
2313static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
2314 int nodeid)
2315{
2316 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
2317
2318#if DEBUG
2319 /* Verify that the slab belongs to the intended node */
2320 WARN_ON(slabp->nodeid != nodeid);
2321
2322 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2323 printk(KERN_ERR "slab: double free detected in cache "
2324 "'%s', objp %p\n", cachep->name, objp);
2325 BUG();
2326 }
2327#endif
2328 slab_bufctl(slabp)[objnr] = slabp->free;
2329 slabp->free = objnr;
2330 slabp->inuse--;
2331}
2332
2333static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
2176{ 2334{
2177 int i; 2335 int i;
2178 struct page *page; 2336 struct page *page;
@@ -2191,7 +2349,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2191 * Grow (by 1) the number of slabs within a cache. This is called by 2349 * Grow (by 1) the number of slabs within a cache. This is called by
2192 * kmem_cache_alloc() when there are no active objs left in a cache. 2350 * kmem_cache_alloc() when there are no active objs left in a cache.
2193 */ 2351 */
2194static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2352static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2195{ 2353{
2196 struct slab *slabp; 2354 struct slab *slabp;
2197 void *objp; 2355 void *objp;
@@ -2217,20 +2375,20 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2217 */ 2375 */
2218 ctor_flags |= SLAB_CTOR_ATOMIC; 2376 ctor_flags |= SLAB_CTOR_ATOMIC;
2219 2377
2220 /* About to mess with non-constant members - lock. */ 2378 /* Take the l3 list lock to change the colour_next on this node */
2221 check_irq_off(); 2379 check_irq_off();
2222 spin_lock(&cachep->spinlock); 2380 l3 = cachep->nodelists[nodeid];
2381 spin_lock(&l3->list_lock);
2223 2382
2224 /* Get colour for the slab, and cal the next value. */ 2383 /* Get colour for the slab, and cal the next value. */
2225 offset = cachep->colour_next; 2384 offset = l3->colour_next;
2226 cachep->colour_next++; 2385 l3->colour_next++;
2227 if (cachep->colour_next >= cachep->colour) 2386 if (l3->colour_next >= cachep->colour)
2228 cachep->colour_next = 0; 2387 l3->colour_next = 0;
2229 offset *= cachep->colour_off; 2388 spin_unlock(&l3->list_lock);
2230 2389
2231 spin_unlock(&cachep->spinlock); 2390 offset *= cachep->colour_off;
2232 2391
2233 check_irq_off();
2234 if (local_flags & __GFP_WAIT) 2392 if (local_flags & __GFP_WAIT)
2235 local_irq_enable(); 2393 local_irq_enable();
2236 2394
@@ -2260,7 +2418,6 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2260 if (local_flags & __GFP_WAIT) 2418 if (local_flags & __GFP_WAIT)
2261 local_irq_disable(); 2419 local_irq_disable();
2262 check_irq_off(); 2420 check_irq_off();
2263 l3 = cachep->nodelists[nodeid];
2264 spin_lock(&l3->list_lock); 2421 spin_lock(&l3->list_lock);
2265 2422
2266 /* Make slab active. */ 2423 /* Make slab active. */
@@ -2302,14 +2459,14 @@ static void kfree_debugcheck(const void *objp)
2302 } 2459 }
2303} 2460}
2304 2461
2305static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2462static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2306 void *caller) 2463 void *caller)
2307{ 2464{
2308 struct page *page; 2465 struct page *page;
2309 unsigned int objnr; 2466 unsigned int objnr;
2310 struct slab *slabp; 2467 struct slab *slabp;
2311 2468
2312 objp -= obj_dbghead(cachep); 2469 objp -= obj_offset(cachep);
2313 kfree_debugcheck(objp); 2470 kfree_debugcheck(objp);
2314 page = virt_to_page(objp); 2471 page = virt_to_page(objp);
2315 2472
@@ -2341,31 +2498,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2341 if (cachep->flags & SLAB_STORE_USER) 2498 if (cachep->flags & SLAB_STORE_USER)
2342 *dbg_userword(cachep, objp) = caller; 2499 *dbg_userword(cachep, objp) = caller;
2343 2500
2344 objnr = (objp - slabp->s_mem) / cachep->objsize; 2501 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2345 2502
2346 BUG_ON(objnr >= cachep->num); 2503 BUG_ON(objnr >= cachep->num);
2347 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); 2504 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
2348 2505
2349 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2506 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2350 /* Need to call the slab's constructor so the 2507 /* Need to call the slab's constructor so the
2351 * caller can perform a verify of its state (debugging). 2508 * caller can perform a verify of its state (debugging).
2352 * Called without the cache-lock held. 2509 * Called without the cache-lock held.
2353 */ 2510 */
2354 cachep->ctor(objp + obj_dbghead(cachep), 2511 cachep->ctor(objp + obj_offset(cachep),
2355 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2512 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2356 } 2513 }
2357 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2514 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2358 /* we want to cache poison the object, 2515 /* we want to cache poison the object,
2359 * call the destruction callback 2516 * call the destruction callback
2360 */ 2517 */
2361 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); 2518 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2362 } 2519 }
2363 if (cachep->flags & SLAB_POISON) { 2520 if (cachep->flags & SLAB_POISON) {
2364#ifdef CONFIG_DEBUG_PAGEALLOC 2521#ifdef CONFIG_DEBUG_PAGEALLOC
2365 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2522 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2366 store_stackinfo(cachep, objp, (unsigned long)caller); 2523 store_stackinfo(cachep, objp, (unsigned long)caller);
2367 kernel_map_pages(virt_to_page(objp), 2524 kernel_map_pages(virt_to_page(objp),
2368 cachep->objsize / PAGE_SIZE, 0); 2525 cachep->buffer_size / PAGE_SIZE, 0);
2369 } else { 2526 } else {
2370 poison_obj(cachep, objp, POISON_FREE); 2527 poison_obj(cachep, objp, POISON_FREE);
2371 } 2528 }
@@ -2376,7 +2533,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2376 return objp; 2533 return objp;
2377} 2534}
2378 2535
2379static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2536static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2380{ 2537{
2381 kmem_bufctl_t i; 2538 kmem_bufctl_t i;
2382 int entries = 0; 2539 int entries = 0;
@@ -2409,14 +2566,14 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2409#define check_slabp(x,y) do { } while(0) 2566#define check_slabp(x,y) do { } while(0)
2410#endif 2567#endif
2411 2568
2412static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) 2569static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2413{ 2570{
2414 int batchcount; 2571 int batchcount;
2415 struct kmem_list3 *l3; 2572 struct kmem_list3 *l3;
2416 struct array_cache *ac; 2573 struct array_cache *ac;
2417 2574
2418 check_irq_off(); 2575 check_irq_off();
2419 ac = ac_data(cachep); 2576 ac = cpu_cache_get(cachep);
2420 retry: 2577 retry:
2421 batchcount = ac->batchcount; 2578 batchcount = ac->batchcount;
2422 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2579 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2461,22 +2618,12 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2461 check_slabp(cachep, slabp); 2618 check_slabp(cachep, slabp);
2462 check_spinlock_acquired(cachep); 2619 check_spinlock_acquired(cachep);
2463 while (slabp->inuse < cachep->num && batchcount--) { 2620 while (slabp->inuse < cachep->num && batchcount--) {
2464 kmem_bufctl_t next;
2465 STATS_INC_ALLOCED(cachep); 2621 STATS_INC_ALLOCED(cachep);
2466 STATS_INC_ACTIVE(cachep); 2622 STATS_INC_ACTIVE(cachep);
2467 STATS_SET_HIGH(cachep); 2623 STATS_SET_HIGH(cachep);
2468 2624
2469 /* get obj pointer */ 2625 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2470 ac->entry[ac->avail++] = slabp->s_mem + 2626 numa_node_id());
2471 slabp->free * cachep->objsize;
2472
2473 slabp->inuse++;
2474 next = slab_bufctl(slabp)[slabp->free];
2475#if DEBUG
2476 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2477 WARN_ON(numa_node_id() != slabp->nodeid);
2478#endif
2479 slabp->free = next;
2480 } 2627 }
2481 check_slabp(cachep, slabp); 2628 check_slabp(cachep, slabp);
2482 2629
@@ -2498,7 +2645,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2498 x = cache_grow(cachep, flags, numa_node_id()); 2645 x = cache_grow(cachep, flags, numa_node_id());
2499 2646
2500 // cache_grow can reenable interrupts, then ac could change. 2647 // cache_grow can reenable interrupts, then ac could change.
2501 ac = ac_data(cachep); 2648 ac = cpu_cache_get(cachep);
2502 if (!x && ac->avail == 0) // no objects in sight? abort 2649 if (!x && ac->avail == 0) // no objects in sight? abort
2503 return NULL; 2650 return NULL;
2504 2651
@@ -2510,7 +2657,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2510} 2657}
2511 2658
2512static inline void 2659static inline void
2513cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) 2660cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2514{ 2661{
2515 might_sleep_if(flags & __GFP_WAIT); 2662 might_sleep_if(flags & __GFP_WAIT);
2516#if DEBUG 2663#if DEBUG
@@ -2519,16 +2666,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2519} 2666}
2520 2667
2521#if DEBUG 2668#if DEBUG
2522static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, 2669static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
2523 void *objp, void *caller) 2670 void *objp, void *caller)
2524{ 2671{
2525 if (!objp) 2672 if (!objp)
2526 return objp; 2673 return objp;
2527 if (cachep->flags & SLAB_POISON) { 2674 if (cachep->flags & SLAB_POISON) {
2528#ifdef CONFIG_DEBUG_PAGEALLOC 2675#ifdef CONFIG_DEBUG_PAGEALLOC
2529 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2676 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2530 kernel_map_pages(virt_to_page(objp), 2677 kernel_map_pages(virt_to_page(objp),
2531 cachep->objsize / PAGE_SIZE, 1); 2678 cachep->buffer_size / PAGE_SIZE, 1);
2532 else 2679 else
2533 check_poison_obj(cachep, objp); 2680 check_poison_obj(cachep, objp);
2534#else 2681#else
@@ -2553,7 +2700,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2553 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2700 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2554 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2701 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2555 } 2702 }
2556 objp += obj_dbghead(cachep); 2703 objp += obj_offset(cachep);
2557 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2704 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2558 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2705 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2559 2706
@@ -2568,7 +2715,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2568#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2715#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2569#endif 2716#endif
2570 2717
2571static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2718static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2572{ 2719{
2573 void *objp; 2720 void *objp;
2574 struct array_cache *ac; 2721 struct array_cache *ac;
@@ -2583,7 +2730,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2583#endif 2730#endif
2584 2731
2585 check_irq_off(); 2732 check_irq_off();
2586 ac = ac_data(cachep); 2733 ac = cpu_cache_get(cachep);
2587 if (likely(ac->avail)) { 2734 if (likely(ac->avail)) {
2588 STATS_INC_ALLOCHIT(cachep); 2735 STATS_INC_ALLOCHIT(cachep);
2589 ac->touched = 1; 2736 ac->touched = 1;
@@ -2595,7 +2742,8 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2595 return objp; 2742 return objp;
2596} 2743}
2597 2744
2598static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2745static __always_inline void *
2746__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2599{ 2747{
2600 unsigned long save_flags; 2748 unsigned long save_flags;
2601 void *objp; 2749 void *objp;
@@ -2606,7 +2754,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2606 objp = ____cache_alloc(cachep, flags); 2754 objp = ____cache_alloc(cachep, flags);
2607 local_irq_restore(save_flags); 2755 local_irq_restore(save_flags);
2608 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2756 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2609 __builtin_return_address(0)); 2757 caller);
2610 prefetchw(objp); 2758 prefetchw(objp);
2611 return objp; 2759 return objp;
2612} 2760}
@@ -2615,19 +2763,19 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2615/* 2763/*
2616 * A interface to enable slab creation on nodeid 2764 * A interface to enable slab creation on nodeid
2617 */ 2765 */
2618static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2766static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2619{ 2767{
2620 struct list_head *entry; 2768 struct list_head *entry;
2621 struct slab *slabp; 2769 struct slab *slabp;
2622 struct kmem_list3 *l3; 2770 struct kmem_list3 *l3;
2623 void *obj; 2771 void *obj;
2624 kmem_bufctl_t next;
2625 int x; 2772 int x;
2626 2773
2627 l3 = cachep->nodelists[nodeid]; 2774 l3 = cachep->nodelists[nodeid];
2628 BUG_ON(!l3); 2775 BUG_ON(!l3);
2629 2776
2630 retry: 2777 retry:
2778 check_irq_off();
2631 spin_lock(&l3->list_lock); 2779 spin_lock(&l3->list_lock);
2632 entry = l3->slabs_partial.next; 2780 entry = l3->slabs_partial.next;
2633 if (entry == &l3->slabs_partial) { 2781 if (entry == &l3->slabs_partial) {
@@ -2647,14 +2795,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2647 2795
2648 BUG_ON(slabp->inuse == cachep->num); 2796 BUG_ON(slabp->inuse == cachep->num);
2649 2797
2650 /* get obj pointer */ 2798 obj = slab_get_obj(cachep, slabp, nodeid);
2651 obj = slabp->s_mem + slabp->free * cachep->objsize;
2652 slabp->inuse++;
2653 next = slab_bufctl(slabp)[slabp->free];
2654#if DEBUG
2655 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2656#endif
2657 slabp->free = next;
2658 check_slabp(cachep, slabp); 2799 check_slabp(cachep, slabp);
2659 l3->free_objects--; 2800 l3->free_objects--;
2660 /* move slabp to correct slabp list: */ 2801 /* move slabp to correct slabp list: */
@@ -2685,7 +2826,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2685/* 2826/*
2686 * Caller needs to acquire correct kmem_list's list_lock 2827 * Caller needs to acquire correct kmem_list's list_lock
2687 */ 2828 */
2688static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, 2829static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
2689 int node) 2830 int node)
2690{ 2831{
2691 int i; 2832 int i;
@@ -2694,29 +2835,14 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2694 for (i = 0; i < nr_objects; i++) { 2835 for (i = 0; i < nr_objects; i++) {
2695 void *objp = objpp[i]; 2836 void *objp = objpp[i];
2696 struct slab *slabp; 2837 struct slab *slabp;
2697 unsigned int objnr;
2698 2838
2699 slabp = page_get_slab(virt_to_page(objp)); 2839 slabp = virt_to_slab(objp);
2700 l3 = cachep->nodelists[node]; 2840 l3 = cachep->nodelists[node];
2701 list_del(&slabp->list); 2841 list_del(&slabp->list);
2702 objnr = (objp - slabp->s_mem) / cachep->objsize;
2703 check_spinlock_acquired_node(cachep, node); 2842 check_spinlock_acquired_node(cachep, node);
2704 check_slabp(cachep, slabp); 2843 check_slabp(cachep, slabp);
2705 2844 slab_put_obj(cachep, slabp, objp, node);
2706#if DEBUG
2707 /* Verify that the slab belongs to the intended node */
2708 WARN_ON(slabp->nodeid != node);
2709
2710 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2711 printk(KERN_ERR "slab: double free detected in cache "
2712 "'%s', objp %p\n", cachep->name, objp);
2713 BUG();
2714 }
2715#endif
2716 slab_bufctl(slabp)[objnr] = slabp->free;
2717 slabp->free = objnr;
2718 STATS_DEC_ACTIVE(cachep); 2845 STATS_DEC_ACTIVE(cachep);
2719 slabp->inuse--;
2720 l3->free_objects++; 2846 l3->free_objects++;
2721 check_slabp(cachep, slabp); 2847 check_slabp(cachep, slabp);
2722 2848
@@ -2738,7 +2864,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2738 } 2864 }
2739} 2865}
2740 2866
2741static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2867static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2742{ 2868{
2743 int batchcount; 2869 int batchcount;
2744 struct kmem_list3 *l3; 2870 struct kmem_list3 *l3;
@@ -2797,9 +2923,9 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2797 * 2923 *
2798 * Called with disabled ints. 2924 * Called with disabled ints.
2799 */ 2925 */
2800static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2926static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2801{ 2927{
2802 struct array_cache *ac = ac_data(cachep); 2928 struct array_cache *ac = cpu_cache_get(cachep);
2803 2929
2804 check_irq_off(); 2930 check_irq_off();
2805 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2931 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
@@ -2810,7 +2936,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2810#ifdef CONFIG_NUMA 2936#ifdef CONFIG_NUMA
2811 { 2937 {
2812 struct slab *slabp; 2938 struct slab *slabp;
2813 slabp = page_get_slab(virt_to_page(objp)); 2939 slabp = virt_to_slab(objp);
2814 if (unlikely(slabp->nodeid != numa_node_id())) { 2940 if (unlikely(slabp->nodeid != numa_node_id())) {
2815 struct array_cache *alien = NULL; 2941 struct array_cache *alien = NULL;
2816 int nodeid = slabp->nodeid; 2942 int nodeid = slabp->nodeid;
@@ -2856,9 +2982,9 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2856 * Allocate an object from this cache. The flags are only relevant 2982 * Allocate an object from this cache. The flags are only relevant
2857 * if the cache has no available objects. 2983 * if the cache has no available objects.
2858 */ 2984 */
2859void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2985void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2860{ 2986{
2861 return __cache_alloc(cachep, flags); 2987 return __cache_alloc(cachep, flags, __builtin_return_address(0));
2862} 2988}
2863EXPORT_SYMBOL(kmem_cache_alloc); 2989EXPORT_SYMBOL(kmem_cache_alloc);
2864 2990
@@ -2876,12 +3002,12 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2876 * 3002 *
2877 * Currently only used for dentry validation. 3003 * Currently only used for dentry validation.
2878 */ 3004 */
2879int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 3005int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
2880{ 3006{
2881 unsigned long addr = (unsigned long)ptr; 3007 unsigned long addr = (unsigned long)ptr;
2882 unsigned long min_addr = PAGE_OFFSET; 3008 unsigned long min_addr = PAGE_OFFSET;
2883 unsigned long align_mask = BYTES_PER_WORD - 1; 3009 unsigned long align_mask = BYTES_PER_WORD - 1;
2884 unsigned long size = cachep->objsize; 3010 unsigned long size = cachep->buffer_size;
2885 struct page *page; 3011 struct page *page;
2886 3012
2887 if (unlikely(addr < min_addr)) 3013 if (unlikely(addr < min_addr))
@@ -2917,32 +3043,23 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2917 * New and improved: it will now make sure that the object gets 3043 * New and improved: it will now make sure that the object gets
2918 * put on the correct node list so that there is no false sharing. 3044 * put on the correct node list so that there is no false sharing.
2919 */ 3045 */
2920void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 3046void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2921{ 3047{
2922 unsigned long save_flags; 3048 unsigned long save_flags;
2923 void *ptr; 3049 void *ptr;
2924 3050
2925 if (nodeid == -1)
2926 return __cache_alloc(cachep, flags);
2927
2928 if (unlikely(!cachep->nodelists[nodeid])) {
2929 /* Fall back to __cache_alloc if we run into trouble */
2930 printk(KERN_WARNING
2931 "slab: not allocating in inactive node %d for cache %s\n",
2932 nodeid, cachep->name);
2933 return __cache_alloc(cachep, flags);
2934 }
2935
2936 cache_alloc_debugcheck_before(cachep, flags); 3051 cache_alloc_debugcheck_before(cachep, flags);
2937 local_irq_save(save_flags); 3052 local_irq_save(save_flags);
2938 if (nodeid == numa_node_id()) 3053
3054 if (nodeid == -1 || nodeid == numa_node_id() ||
3055 !cachep->nodelists[nodeid])
2939 ptr = ____cache_alloc(cachep, flags); 3056 ptr = ____cache_alloc(cachep, flags);
2940 else 3057 else
2941 ptr = __cache_alloc_node(cachep, flags, nodeid); 3058 ptr = __cache_alloc_node(cachep, flags, nodeid);
2942 local_irq_restore(save_flags); 3059 local_irq_restore(save_flags);
2943 ptr = 3060
2944 cache_alloc_debugcheck_after(cachep, flags, ptr, 3061 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
2945 __builtin_return_address(0)); 3062 __builtin_return_address(0));
2946 3063
2947 return ptr; 3064 return ptr;
2948} 3065}
@@ -2950,7 +3067,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
2950 3067
2951void *kmalloc_node(size_t size, gfp_t flags, int node) 3068void *kmalloc_node(size_t size, gfp_t flags, int node)
2952{ 3069{
2953 kmem_cache_t *cachep; 3070 struct kmem_cache *cachep;
2954 3071
2955 cachep = kmem_find_general_cachep(size, flags); 3072 cachep = kmem_find_general_cachep(size, flags);
2956 if (unlikely(cachep == NULL)) 3073 if (unlikely(cachep == NULL))
@@ -2981,9 +3098,10 @@ EXPORT_SYMBOL(kmalloc_node);
2981 * platforms. For example, on i386, it means that the memory must come 3098 * platforms. For example, on i386, it means that the memory must come
2982 * from the first 16MB. 3099 * from the first 16MB.
2983 */ 3100 */
2984void *__kmalloc(size_t size, gfp_t flags) 3101static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3102 void *caller)
2985{ 3103{
2986 kmem_cache_t *cachep; 3104 struct kmem_cache *cachep;
2987 3105
2988 /* If you want to save a few bytes .text space: replace 3106 /* If you want to save a few bytes .text space: replace
2989 * __ with kmem_. 3107 * __ with kmem_.
@@ -2993,10 +3111,27 @@ void *__kmalloc(size_t size, gfp_t flags)
2993 cachep = __find_general_cachep(size, flags); 3111 cachep = __find_general_cachep(size, flags);
2994 if (unlikely(cachep == NULL)) 3112 if (unlikely(cachep == NULL))
2995 return NULL; 3113 return NULL;
2996 return __cache_alloc(cachep, flags); 3114 return __cache_alloc(cachep, flags, caller);
3115}
3116
3117#ifndef CONFIG_DEBUG_SLAB
3118
3119void *__kmalloc(size_t size, gfp_t flags)
3120{
3121 return __do_kmalloc(size, flags, NULL);
2997} 3122}
2998EXPORT_SYMBOL(__kmalloc); 3123EXPORT_SYMBOL(__kmalloc);
2999 3124
3125#else
3126
3127void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3128{
3129 return __do_kmalloc(size, flags, caller);
3130}
3131EXPORT_SYMBOL(__kmalloc_track_caller);
3132
3133#endif
3134
3000#ifdef CONFIG_SMP 3135#ifdef CONFIG_SMP
3001/** 3136/**
3002 * __alloc_percpu - allocate one copy of the object for every present 3137 * __alloc_percpu - allocate one copy of the object for every present
@@ -3054,7 +3189,7 @@ EXPORT_SYMBOL(__alloc_percpu);
3054 * Free an object which was previously allocated from this 3189 * Free an object which was previously allocated from this
3055 * cache. 3190 * cache.
3056 */ 3191 */
3057void kmem_cache_free(kmem_cache_t *cachep, void *objp) 3192void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3058{ 3193{
3059 unsigned long flags; 3194 unsigned long flags;
3060 3195
@@ -3075,15 +3210,15 @@ EXPORT_SYMBOL(kmem_cache_free);
3075 */ 3210 */
3076void kfree(const void *objp) 3211void kfree(const void *objp)
3077{ 3212{
3078 kmem_cache_t *c; 3213 struct kmem_cache *c;
3079 unsigned long flags; 3214 unsigned long flags;
3080 3215
3081 if (unlikely(!objp)) 3216 if (unlikely(!objp))
3082 return; 3217 return;
3083 local_irq_save(flags); 3218 local_irq_save(flags);
3084 kfree_debugcheck(objp); 3219 kfree_debugcheck(objp);
3085 c = page_get_cache(virt_to_page(objp)); 3220 c = virt_to_cache(objp);
3086 mutex_debug_check_no_locks_freed(objp, obj_reallen(c)); 3221 mutex_debug_check_no_locks_freed(objp, obj_size(c));
3087 __cache_free(c, (void *)objp); 3222 __cache_free(c, (void *)objp);
3088 local_irq_restore(flags); 3223 local_irq_restore(flags);
3089} 3224}
@@ -3112,13 +3247,13 @@ void free_percpu(const void *objp)
3112EXPORT_SYMBOL(free_percpu); 3247EXPORT_SYMBOL(free_percpu);
3113#endif 3248#endif
3114 3249
3115unsigned int kmem_cache_size(kmem_cache_t *cachep) 3250unsigned int kmem_cache_size(struct kmem_cache *cachep)
3116{ 3251{
3117 return obj_reallen(cachep); 3252 return obj_size(cachep);
3118} 3253}
3119EXPORT_SYMBOL(kmem_cache_size); 3254EXPORT_SYMBOL(kmem_cache_size);
3120 3255
3121const char *kmem_cache_name(kmem_cache_t *cachep) 3256const char *kmem_cache_name(struct kmem_cache *cachep)
3122{ 3257{
3123 return cachep->name; 3258 return cachep->name;
3124} 3259}
@@ -3127,7 +3262,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3127/* 3262/*
3128 * This initializes kmem_list3 for all nodes. 3263 * This initializes kmem_list3 for all nodes.
3129 */ 3264 */
3130static int alloc_kmemlist(kmem_cache_t *cachep) 3265static int alloc_kmemlist(struct kmem_cache *cachep)
3131{ 3266{
3132 int node; 3267 int node;
3133 struct kmem_list3 *l3; 3268 struct kmem_list3 *l3;
@@ -3183,7 +3318,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3183} 3318}
3184 3319
3185struct ccupdate_struct { 3320struct ccupdate_struct {
3186 kmem_cache_t *cachep; 3321 struct kmem_cache *cachep;
3187 struct array_cache *new[NR_CPUS]; 3322 struct array_cache *new[NR_CPUS];
3188}; 3323};
3189 3324
@@ -3193,13 +3328,13 @@ static void do_ccupdate_local(void *info)
3193 struct array_cache *old; 3328 struct array_cache *old;
3194 3329
3195 check_irq_off(); 3330 check_irq_off();
3196 old = ac_data(new->cachep); 3331 old = cpu_cache_get(new->cachep);
3197 3332
3198 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3333 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3199 new->new[smp_processor_id()] = old; 3334 new->new[smp_processor_id()] = old;
3200} 3335}
3201 3336
3202static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3337static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
3203 int shared) 3338 int shared)
3204{ 3339{
3205 struct ccupdate_struct new; 3340 struct ccupdate_struct new;
@@ -3220,11 +3355,11 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3220 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3355 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
3221 3356
3222 check_irq_on(); 3357 check_irq_on();
3223 spin_lock_irq(&cachep->spinlock); 3358 spin_lock(&cachep->spinlock);
3224 cachep->batchcount = batchcount; 3359 cachep->batchcount = batchcount;
3225 cachep->limit = limit; 3360 cachep->limit = limit;
3226 cachep->shared = shared; 3361 cachep->shared = shared;
3227 spin_unlock_irq(&cachep->spinlock); 3362 spin_unlock(&cachep->spinlock);
3228 3363
3229 for_each_online_cpu(i) { 3364 for_each_online_cpu(i) {
3230 struct array_cache *ccold = new.new[i]; 3365 struct array_cache *ccold = new.new[i];
@@ -3245,7 +3380,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3245 return 0; 3380 return 0;
3246} 3381}
3247 3382
3248static void enable_cpucache(kmem_cache_t *cachep) 3383static void enable_cpucache(struct kmem_cache *cachep)
3249{ 3384{
3250 int err; 3385 int err;
3251 int limit, shared; 3386 int limit, shared;
@@ -3258,13 +3393,13 @@ static void enable_cpucache(kmem_cache_t *cachep)
3258 * The numbers are guessed, we should auto-tune as described by 3393 * The numbers are guessed, we should auto-tune as described by
3259 * Bonwick. 3394 * Bonwick.
3260 */ 3395 */
3261 if (cachep->objsize > 131072) 3396 if (cachep->buffer_size > 131072)
3262 limit = 1; 3397 limit = 1;
3263 else if (cachep->objsize > PAGE_SIZE) 3398 else if (cachep->buffer_size > PAGE_SIZE)
3264 limit = 8; 3399 limit = 8;
3265 else if (cachep->objsize > 1024) 3400 else if (cachep->buffer_size > 1024)
3266 limit = 24; 3401 limit = 24;
3267 else if (cachep->objsize > 256) 3402 else if (cachep->buffer_size > 256)
3268 limit = 54; 3403 limit = 54;
3269 else 3404 else
3270 limit = 120; 3405 limit = 120;
@@ -3279,7 +3414,7 @@ static void enable_cpucache(kmem_cache_t *cachep)
3279 */ 3414 */
3280 shared = 0; 3415 shared = 0;
3281#ifdef CONFIG_SMP 3416#ifdef CONFIG_SMP
3282 if (cachep->objsize <= PAGE_SIZE) 3417 if (cachep->buffer_size <= PAGE_SIZE)
3283 shared = 8; 3418 shared = 8;
3284#endif 3419#endif
3285 3420
@@ -3297,7 +3432,7 @@ static void enable_cpucache(kmem_cache_t *cachep)
3297 cachep->name, -err); 3432 cachep->name, -err);
3298} 3433}
3299 3434
3300static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 3435static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
3301 int force, int node) 3436 int force, int node)
3302{ 3437{
3303 int tofree; 3438 int tofree;
@@ -3342,12 +3477,12 @@ static void cache_reap(void *unused)
3342 } 3477 }
3343 3478
3344 list_for_each(walk, &cache_chain) { 3479 list_for_each(walk, &cache_chain) {
3345 kmem_cache_t *searchp; 3480 struct kmem_cache *searchp;
3346 struct list_head *p; 3481 struct list_head *p;
3347 int tofree; 3482 int tofree;
3348 struct slab *slabp; 3483 struct slab *slabp;
3349 3484
3350 searchp = list_entry(walk, kmem_cache_t, next); 3485 searchp = list_entry(walk, struct kmem_cache, next);
3351 3486
3352 if (searchp->flags & SLAB_NO_REAP) 3487 if (searchp->flags & SLAB_NO_REAP)
3353 goto next; 3488 goto next;
@@ -3356,10 +3491,10 @@ static void cache_reap(void *unused)
3356 3491
3357 l3 = searchp->nodelists[numa_node_id()]; 3492 l3 = searchp->nodelists[numa_node_id()];
3358 if (l3->alien) 3493 if (l3->alien)
3359 drain_alien_cache(searchp, l3); 3494 drain_alien_cache(searchp, l3->alien);
3360 spin_lock_irq(&l3->list_lock); 3495 spin_lock_irq(&l3->list_lock);
3361 3496
3362 drain_array_locked(searchp, ac_data(searchp), 0, 3497 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
3363 numa_node_id()); 3498 numa_node_id());
3364 3499
3365 if (time_after(l3->next_reap, jiffies)) 3500 if (time_after(l3->next_reap, jiffies))
@@ -3450,15 +3585,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
3450 if (p == &cache_chain) 3585 if (p == &cache_chain)
3451 return NULL; 3586 return NULL;
3452 } 3587 }
3453 return list_entry(p, kmem_cache_t, next); 3588 return list_entry(p, struct kmem_cache, next);
3454} 3589}
3455 3590
3456static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3591static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3457{ 3592{
3458 kmem_cache_t *cachep = p; 3593 struct kmem_cache *cachep = p;
3459 ++*pos; 3594 ++*pos;
3460 return cachep->next.next == &cache_chain ? NULL 3595 return cachep->next.next == &cache_chain ? NULL
3461 : list_entry(cachep->next.next, kmem_cache_t, next); 3596 : list_entry(cachep->next.next, struct kmem_cache, next);
3462} 3597}
3463 3598
3464static void s_stop(struct seq_file *m, void *p) 3599static void s_stop(struct seq_file *m, void *p)
@@ -3468,7 +3603,7 @@ static void s_stop(struct seq_file *m, void *p)
3468 3603
3469static int s_show(struct seq_file *m, void *p) 3604static int s_show(struct seq_file *m, void *p)
3470{ 3605{
3471 kmem_cache_t *cachep = p; 3606 struct kmem_cache *cachep = p;
3472 struct list_head *q; 3607 struct list_head *q;
3473 struct slab *slabp; 3608 struct slab *slabp;
3474 unsigned long active_objs; 3609 unsigned long active_objs;
@@ -3480,8 +3615,7 @@ static int s_show(struct seq_file *m, void *p)
3480 int node; 3615 int node;
3481 struct kmem_list3 *l3; 3616 struct kmem_list3 *l3;
3482 3617
3483 check_irq_on(); 3618 spin_lock(&cachep->spinlock);
3484 spin_lock_irq(&cachep->spinlock);
3485 active_objs = 0; 3619 active_objs = 0;
3486 num_slabs = 0; 3620 num_slabs = 0;
3487 for_each_online_node(node) { 3621 for_each_online_node(node) {
@@ -3489,7 +3623,8 @@ static int s_show(struct seq_file *m, void *p)
3489 if (!l3) 3623 if (!l3)
3490 continue; 3624 continue;
3491 3625
3492 spin_lock(&l3->list_lock); 3626 check_irq_on();
3627 spin_lock_irq(&l3->list_lock);
3493 3628
3494 list_for_each(q, &l3->slabs_full) { 3629 list_for_each(q, &l3->slabs_full) {
3495 slabp = list_entry(q, struct slab, list); 3630 slabp = list_entry(q, struct slab, list);
@@ -3514,9 +3649,10 @@ static int s_show(struct seq_file *m, void *p)
3514 num_slabs++; 3649 num_slabs++;
3515 } 3650 }
3516 free_objects += l3->free_objects; 3651 free_objects += l3->free_objects;
3517 shared_avail += l3->shared->avail; 3652 if (l3->shared)
3653 shared_avail += l3->shared->avail;
3518 3654
3519 spin_unlock(&l3->list_lock); 3655 spin_unlock_irq(&l3->list_lock);
3520 } 3656 }
3521 num_slabs += active_slabs; 3657 num_slabs += active_slabs;
3522 num_objs = num_slabs * cachep->num; 3658 num_objs = num_slabs * cachep->num;
@@ -3528,7 +3664,7 @@ static int s_show(struct seq_file *m, void *p)
3528 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3664 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3529 3665
3530 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3666 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3531 name, active_objs, num_objs, cachep->objsize, 3667 name, active_objs, num_objs, cachep->buffer_size,
3532 cachep->num, (1 << cachep->gfporder)); 3668 cachep->num, (1 << cachep->gfporder));
3533 seq_printf(m, " : tunables %4u %4u %4u", 3669 seq_printf(m, " : tunables %4u %4u %4u",
3534 cachep->limit, cachep->batchcount, cachep->shared); 3670 cachep->limit, cachep->batchcount, cachep->shared);
@@ -3560,7 +3696,7 @@ static int s_show(struct seq_file *m, void *p)
3560 } 3696 }
3561#endif 3697#endif
3562 seq_putc(m, '\n'); 3698 seq_putc(m, '\n');
3563 spin_unlock_irq(&cachep->spinlock); 3699 spin_unlock(&cachep->spinlock);
3564 return 0; 3700 return 0;
3565} 3701}
3566 3702
@@ -3618,7 +3754,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3618 mutex_lock(&cache_chain_mutex); 3754 mutex_lock(&cache_chain_mutex);
3619 res = -EINVAL; 3755 res = -EINVAL;
3620 list_for_each(p, &cache_chain) { 3756 list_for_each(p, &cache_chain) {
3621 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3757 struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
3758 next);
3622 3759
3623 if (!strcmp(cachep->name, kbuf)) { 3760 if (!strcmp(cachep->name, kbuf)) {
3624 if (limit < 1 || 3761 if (limit < 1 ||
@@ -3656,5 +3793,5 @@ unsigned int ksize(const void *objp)
3656 if (unlikely(objp == NULL)) 3793 if (unlikely(objp == NULL))
3657 return 0; 3794 return 0;
3658 3795
3659 return obj_reallen(page_get_cache(virt_to_page(objp))); 3796 return obj_size(virt_to_cache(objp));
3660} 3797}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7b09ac503fec..db8a3d3e1636 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,6 +27,7 @@ static struct address_space_operations swap_aops = {
27 .writepage = swap_writepage, 27 .writepage = swap_writepage,
28 .sync_page = block_sync_page, 28 .sync_page = block_sync_page,
29 .set_page_dirty = __set_page_dirty_nobuffers, 29 .set_page_dirty = __set_page_dirty_nobuffers,
30 .migratepage = migrate_page,
30}; 31};
31 32
32static struct backing_dev_info swap_backing_dev_info = { 33static struct backing_dev_info swap_backing_dev_info = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1e69c30d203..1f9cf0d073b8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -554,6 +554,15 @@ static int unuse_mm(struct mm_struct *mm,
554 return 0; 554 return 0;
555} 555}
556 556
557#ifdef CONFIG_MIGRATION
558int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
559{
560 swp_entry_t entry = { .val = page_private(page) };
561
562 return unuse_vma(vma, entry, page);
563}
564#endif
565
557/* 566/*
558 * Scan swap_map from current position to next entry still in use. 567 * Scan swap_map from current position to next entry still in use.
559 * Recycle to start on reaching the end, returning 0 when empty. 568 * Recycle to start on reaching the end, returning 0 when empty.
@@ -646,6 +655,7 @@ static int try_to_unuse(unsigned int type)
646 */ 655 */
647 swap_map = &si->swap_map[i]; 656 swap_map = &si->swap_map[i];
648 entry = swp_entry(type, i); 657 entry = swp_entry(type, i);
658again:
649 page = read_swap_cache_async(entry, NULL, 0); 659 page = read_swap_cache_async(entry, NULL, 0);
650 if (!page) { 660 if (!page) {
651 /* 661 /*
@@ -680,6 +690,12 @@ static int try_to_unuse(unsigned int type)
680 wait_on_page_locked(page); 690 wait_on_page_locked(page);
681 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
682 lock_page(page); 692 lock_page(page);
693 if (!PageSwapCache(page)) {
694 /* Page migration has occurred */
695 unlock_page(page);
696 page_cache_release(page);
697 goto again;
698 }
683 wait_on_page_writeback(page); 699 wait_on_page_writeback(page);
684 700
685 /* 701 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34b61a70c7..5a610804cd06 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -477,7 +477,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
477 * processes. Try to unmap it here. 477 * processes. Try to unmap it here.
478 */ 478 */
479 if (page_mapped(page) && mapping) { 479 if (page_mapped(page) && mapping) {
480 switch (try_to_unmap(page)) { 480 /*
481 * No unmapping if we do not swap
482 */
483 if (!sc->may_swap)
484 goto keep_locked;
485
486 switch (try_to_unmap(page, 0)) {
481 case SWAP_FAIL: 487 case SWAP_FAIL:
482 goto activate_locked; 488 goto activate_locked;
483 case SWAP_AGAIN: 489 case SWAP_AGAIN:
@@ -492,7 +498,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
492 goto keep_locked; 498 goto keep_locked;
493 if (!may_enter_fs) 499 if (!may_enter_fs)
494 goto keep_locked; 500 goto keep_locked;
495 if (laptop_mode && !sc->may_writepage) 501 if (!sc->may_writepage)
496 goto keep_locked; 502 goto keep_locked;
497 503
498 /* Page is dirty, try to write it out here */ 504 /* Page is dirty, try to write it out here */
@@ -609,6 +615,15 @@ int putback_lru_pages(struct list_head *l)
609} 615}
610 616
611/* 617/*
618 * Non migratable page
619 */
620int fail_migrate_page(struct page *newpage, struct page *page)
621{
622 return -EIO;
623}
624EXPORT_SYMBOL(fail_migrate_page);
625
626/*
612 * swapout a single page 627 * swapout a single page
613 * page is locked upon entry, unlocked on exit 628 * page is locked upon entry, unlocked on exit
614 */ 629 */
@@ -617,7 +632,7 @@ static int swap_page(struct page *page)
617 struct address_space *mapping = page_mapping(page); 632 struct address_space *mapping = page_mapping(page);
618 633
619 if (page_mapped(page) && mapping) 634 if (page_mapped(page) && mapping)
620 if (try_to_unmap(page) != SWAP_SUCCESS) 635 if (try_to_unmap(page, 0) != SWAP_SUCCESS)
621 goto unlock_retry; 636 goto unlock_retry;
622 637
623 if (PageDirty(page)) { 638 if (PageDirty(page)) {
@@ -653,6 +668,167 @@ unlock_retry:
653retry: 668retry:
654 return -EAGAIN; 669 return -EAGAIN;
655} 670}
671EXPORT_SYMBOL(swap_page);
672
673/*
674 * Page migration was first developed in the context of the memory hotplug
675 * project. The main authors of the migration code are:
676 *
677 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
678 * Hirokazu Takahashi <taka@valinux.co.jp>
679 * Dave Hansen <haveblue@us.ibm.com>
680 * Christoph Lameter <clameter@sgi.com>
681 */
682
683/*
684 * Remove references for a page and establish the new page with the correct
685 * basic settings to be able to stop accesses to the page.
686 */
687int migrate_page_remove_references(struct page *newpage,
688 struct page *page, int nr_refs)
689{
690 struct address_space *mapping = page_mapping(page);
691 struct page **radix_pointer;
692
693 /*
694 * Avoid doing any of the following work if the page count
695 * indicates that the page is in use or truncate has removed
696 * the page.
697 */
698 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
699 return 1;
700
701 /*
702 * Establish swap ptes for anonymous pages or destroy pte
703 * maps for files.
704 *
705 * In order to reestablish file backed mappings the fault handlers
706 * will take the radix tree_lock which may then be used to stop
707 * processes from accessing this page until the new page is ready.
708 *
709 * A process accessing via a swap pte (an anonymous page) will take a
710 * page_lock on the old page which will block the process until the
711 * migration attempt is complete. At that time the PageSwapCache bit
712 * will be examined. If the page was migrated then the PageSwapCache
713 * bit will be clear and the operation to retrieve the page will be
714 * retried which will find the new page in the radix tree. Then a new
715 * direct mapping may be generated based on the radix tree contents.
716 *
717 * If the page was not migrated then the PageSwapCache bit
718 * is still set and the operation may continue.
719 */
720 try_to_unmap(page, 1);
721
722 /*
723 * Give up if we were unable to remove all mappings.
724 */
725 if (page_mapcount(page))
726 return 1;
727
728 write_lock_irq(&mapping->tree_lock);
729
730 radix_pointer = (struct page **)radix_tree_lookup_slot(
731 &mapping->page_tree,
732 page_index(page));
733
734 if (!page_mapping(page) || page_count(page) != nr_refs ||
735 *radix_pointer != page) {
736 write_unlock_irq(&mapping->tree_lock);
737 return 1;
738 }
739
740 /*
741 * Now we know that no one else is looking at the page.
742 *
743 * Certain minimal information about a page must be available
744 * in order for other subsystems to properly handle the page if they
745 * find it through the radix tree update before we are finished
746 * copying the page.
747 */
748 get_page(newpage);
749 newpage->index = page->index;
750 newpage->mapping = page->mapping;
751 if (PageSwapCache(page)) {
752 SetPageSwapCache(newpage);
753 set_page_private(newpage, page_private(page));
754 }
755
756 *radix_pointer = newpage;
757 __put_page(page);
758 write_unlock_irq(&mapping->tree_lock);
759
760 return 0;
761}
762EXPORT_SYMBOL(migrate_page_remove_references);
763
764/*
765 * Copy the page to its new location
766 */
767void migrate_page_copy(struct page *newpage, struct page *page)
768{
769 copy_highpage(newpage, page);
770
771 if (PageError(page))
772 SetPageError(newpage);
773 if (PageReferenced(page))
774 SetPageReferenced(newpage);
775 if (PageUptodate(page))
776 SetPageUptodate(newpage);
777 if (PageActive(page))
778 SetPageActive(newpage);
779 if (PageChecked(page))
780 SetPageChecked(newpage);
781 if (PageMappedToDisk(page))
782 SetPageMappedToDisk(newpage);
783
784 if (PageDirty(page)) {
785 clear_page_dirty_for_io(page);
786 set_page_dirty(newpage);
787 }
788
789 ClearPageSwapCache(page);
790 ClearPageActive(page);
791 ClearPagePrivate(page);
792 set_page_private(page, 0);
793 page->mapping = NULL;
794
795 /*
796 * If any waiters have accumulated on the new page then
797 * wake them up.
798 */
799 if (PageWriteback(newpage))
800 end_page_writeback(newpage);
801}
802EXPORT_SYMBOL(migrate_page_copy);
803
804/*
805 * Common logic to directly migrate a single page suitable for
806 * pages that do not use PagePrivate.
807 *
808 * Pages are locked upon entry and exit.
809 */
810int migrate_page(struct page *newpage, struct page *page)
811{
812 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
813
814 if (migrate_page_remove_references(newpage, page, 2))
815 return -EAGAIN;
816
817 migrate_page_copy(newpage, page);
818
819 /*
820 * Remove auxiliary swap entries and replace
821 * them with real ptes.
822 *
823 * Note that a real pte entry will allow processes that are not
824 * waiting on the page lock to use the new page via the page tables
825 * before the new page is unlocked.
826 */
827 remove_from_swap(newpage);
828 return 0;
829}
830EXPORT_SYMBOL(migrate_page);
831
656/* 832/*
657 * migrate_pages 833 * migrate_pages
658 * 834 *
@@ -666,11 +842,6 @@ retry:
666 * are movable anymore because to has become empty 842 * are movable anymore because to has become empty
667 * or no retryable pages exist anymore. 843 * or no retryable pages exist anymore.
668 * 844 *
669 * SIMPLIFIED VERSION: This implementation of migrate_pages
670 * is only swapping out pages and never touches the second
671 * list. The direct migration patchset
672 * extends this function to avoid the use of swap.
673 *
674 * Return: Number of pages not migrated when "to" ran empty. 845 * Return: Number of pages not migrated when "to" ran empty.
675 */ 846 */
676int migrate_pages(struct list_head *from, struct list_head *to, 847int migrate_pages(struct list_head *from, struct list_head *to,
@@ -691,6 +862,9 @@ redo:
691 retry = 0; 862 retry = 0;
692 863
693 list_for_each_entry_safe(page, page2, from, lru) { 864 list_for_each_entry_safe(page, page2, from, lru) {
865 struct page *newpage = NULL;
866 struct address_space *mapping;
867
694 cond_resched(); 868 cond_resched();
695 869
696 rc = 0; 870 rc = 0;
@@ -698,6 +872,9 @@ redo:
698 /* page was freed from under us. So we are done. */ 872 /* page was freed from under us. So we are done. */
699 goto next; 873 goto next;
700 874
875 if (to && list_empty(to))
876 break;
877
701 /* 878 /*
702 * Skip locked pages during the first two passes to give the 879 * Skip locked pages during the first two passes to give the
703 * functions holding the lock time to release the page. Later we 880 * functions holding the lock time to release the page. Later we
@@ -734,12 +911,69 @@ redo:
734 } 911 }
735 } 912 }
736 913
914 if (!to) {
915 rc = swap_page(page);
916 goto next;
917 }
918
919 newpage = lru_to_page(to);
920 lock_page(newpage);
921
737 /* 922 /*
738 * Page is properly locked and writeback is complete. 923 * Pages are properly locked and writeback is complete.
739 * Try to migrate the page. 924 * Try to migrate the page.
740 */ 925 */
741 rc = swap_page(page); 926 mapping = page_mapping(page);
742 goto next; 927 if (!mapping)
928 goto unlock_both;
929
930 if (mapping->a_ops->migratepage) {
931 rc = mapping->a_ops->migratepage(newpage, page);
932 goto unlock_both;
933 }
934
935 /*
936 * Trigger writeout if page is dirty
937 */
938 if (PageDirty(page)) {
939 switch (pageout(page, mapping)) {
940 case PAGE_KEEP:
941 case PAGE_ACTIVATE:
942 goto unlock_both;
943
944 case PAGE_SUCCESS:
945 unlock_page(newpage);
946 goto next;
947
948 case PAGE_CLEAN:
949 ; /* try to migrate the page below */
950 }
951 }
952 /*
953 * If we have no buffer or can release the buffer
954 * then do a simple migration.
955 */
956 if (!page_has_buffers(page) ||
957 try_to_release_page(page, GFP_KERNEL)) {
958 rc = migrate_page(newpage, page);
959 goto unlock_both;
960 }
961
962 /*
963 * On early passes with mapped pages simply
964 * retry. There may be a lock held for some
965 * buffers that may go away. Later
966 * swap them out.
967 */
968 if (pass > 4) {
969 unlock_page(newpage);
970 newpage = NULL;
971 rc = swap_page(page);
972 goto next;
973 }
974
975unlock_both:
976 unlock_page(newpage);
743 977
744unlock_page: 978unlock_page:
745 unlock_page(page); 979 unlock_page(page);
@@ -752,7 +986,10 @@ next:
752 list_move(&page->lru, failed); 986 list_move(&page->lru, failed);
753 nr_failed++; 987 nr_failed++;
754 } else { 988 } else {
755 /* Success */ 989 if (newpage) {
990 /* Successful migration. Return page to LRU */
991 move_to_lru(newpage);
992 }
756 list_move(&page->lru, moved); 993 list_move(&page->lru, moved);
757 } 994 }
758 } 995 }
@@ -1170,7 +1407,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1170 int i; 1407 int i;
1171 1408
1172 sc.gfp_mask = gfp_mask; 1409 sc.gfp_mask = gfp_mask;
1173 sc.may_writepage = 0; 1410 sc.may_writepage = !laptop_mode;
1174 sc.may_swap = 1; 1411 sc.may_swap = 1;
1175 1412
1176 inc_page_state(allocstall); 1413 inc_page_state(allocstall);
@@ -1273,7 +1510,7 @@ loop_again:
1273 total_scanned = 0; 1510 total_scanned = 0;
1274 total_reclaimed = 0; 1511 total_reclaimed = 0;
1275 sc.gfp_mask = GFP_KERNEL; 1512 sc.gfp_mask = GFP_KERNEL;
1276 sc.may_writepage = 0; 1513 sc.may_writepage = !laptop_mode;
1277 sc.may_swap = 1; 1514 sc.may_swap = 1;
1278 sc.nr_mapped = read_page_state(nr_mapped); 1515 sc.nr_mapped = read_page_state(nr_mapped);
1279 1516
@@ -1586,40 +1823,61 @@ module_init(kswapd_init)
1586 */ 1823 */
1587int zone_reclaim_mode __read_mostly; 1824int zone_reclaim_mode __read_mostly;
1588 1825
1826#define RECLAIM_OFF 0
1827#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1828#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1829#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1830#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1831
1589/* 1832/*
1590 * Minimum time between zone reclaim scans 1833 * Minimum time between zone reclaim scans
1591 */ 1834 */
1592#define ZONE_RECLAIM_INTERVAL HZ/2 1835int zone_reclaim_interval __read_mostly = 30*HZ;
1836
1837/*
1838 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1839 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1840 * a zone.
1841 */
1842#define ZONE_RECLAIM_PRIORITY 4
1843
1593/* 1844/*
1594 * Try to free up some pages from this zone through reclaim. 1845 * Try to free up some pages from this zone through reclaim.
1595 */ 1846 */
1596int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1847int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1597{ 1848{
1598 int nr_pages = 1 << order; 1849 int nr_pages;
1599 struct task_struct *p = current; 1850 struct task_struct *p = current;
1600 struct reclaim_state reclaim_state; 1851 struct reclaim_state reclaim_state;
1601 struct scan_control sc = { 1852 struct scan_control sc;
1602 .gfp_mask = gfp_mask, 1853 cpumask_t mask;
1603 .may_writepage = 0, 1854 int node_id;
1604 .may_swap = 0, 1855
1605 .nr_mapped = read_page_state(nr_mapped), 1856 if (time_before(jiffies,
1606 .nr_scanned = 0, 1857 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1607 .nr_reclaimed = 0, 1858 return 0;
1608 .priority = 0
1609 };
1610 1859
1611 if (!(gfp_mask & __GFP_WAIT) || 1860 if (!(gfp_mask & __GFP_WAIT) ||
1612 zone->zone_pgdat->node_id != numa_node_id() ||
1613 zone->all_unreclaimable || 1861 zone->all_unreclaimable ||
1614 atomic_read(&zone->reclaim_in_progress) > 0) 1862 atomic_read(&zone->reclaim_in_progress) > 0)
1615 return 0; 1863 return 0;
1616 1864
1617 if (time_before(jiffies, 1865 node_id = zone->zone_pgdat->node_id;
1618 zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) 1866 mask = node_to_cpumask(node_id);
1619 return 0; 1867 if (!cpus_empty(mask) && node_id != numa_node_id())
1868 return 0;
1869
1870 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1871 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1872 sc.nr_scanned = 0;
1873 sc.nr_reclaimed = 0;
1874 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1875 sc.nr_mapped = read_page_state(nr_mapped);
1876 sc.gfp_mask = gfp_mask;
1620 1877
1621 disable_swap_token(); 1878 disable_swap_token();
1622 1879
1880 nr_pages = 1 << order;
1623 if (nr_pages > SWAP_CLUSTER_MAX) 1881 if (nr_pages > SWAP_CLUSTER_MAX)
1624 sc.swap_cluster_max = nr_pages; 1882 sc.swap_cluster_max = nr_pages;
1625 else 1883 else
@@ -1629,14 +1887,37 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1629 p->flags |= PF_MEMALLOC; 1887 p->flags |= PF_MEMALLOC;
1630 reclaim_state.reclaimed_slab = 0; 1888 reclaim_state.reclaimed_slab = 0;
1631 p->reclaim_state = &reclaim_state; 1889 p->reclaim_state = &reclaim_state;
1632 shrink_zone(zone, &sc); 1890
1891 /*
1892 * Free memory by calling shrink zone with increasing priorities
1893 * until we have enough memory freed.
1894 */
1895 do {
1896 sc.priority--;
1897 shrink_zone(zone, &sc);
1898
1899 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
1900
1901 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1902 /*
1903 * shrink_slab does not currently allow us to determine
1904 * how many pages were freed in the zone. So we just
1905 * shake the slab and then go offnode for a single allocation.
1906 *
1907 * shrink_slab will free memory on all zones and may take
1908 * a long time.
1909 */
1910 shrink_slab(sc.nr_scanned, gfp_mask, order);
1911 sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */
1912 }
1913
1633 p->reclaim_state = NULL; 1914 p->reclaim_state = NULL;
1634 current->flags &= ~PF_MEMALLOC; 1915 current->flags &= ~PF_MEMALLOC;
1635 1916
1636 if (sc.nr_reclaimed == 0) 1917 if (sc.nr_reclaimed == 0)
1637 zone->last_unsuccessful_zone_reclaim = jiffies; 1918 zone->last_unsuccessful_zone_reclaim = jiffies;
1638 1919
1639 return sc.nr_reclaimed > nr_pages; 1920 return sc.nr_reclaimed >= nr_pages;
1640} 1921}
1641#endif 1922#endif
1642 1923