author	Steven Whitehouse <steve@men-an-tol.chygwyn.com>	2006-02-23 04:49:43 -0500
committer	Steven Whitehouse <swhiteho@redhat.com>	2006-02-23 04:49:43 -0500
commit	d35462b4bb847b68321c55e95c926aa485aecce2 (patch)
tree	b08e18bf6e672633402871ee763102fdb5e63229 /mm
parent	91ffd7db71e7451f89941a8f428b4daa2a7c1e38 (diff)
parent	9e956c2dac9bec602ed1ba29181b45ba6d2b6448 (diff)
Merge branch 'master'
Diffstat (limited to 'mm')
-rw-r--r--   mm/hugetlb.c      14
-rw-r--r--   mm/madvise.c      21
-rw-r--r--   mm/memory.c       17
-rw-r--r--   mm/mempolicy.c   189
-rw-r--r--   mm/nommu.c         2
-rw-r--r--   mm/oom_kill.c    124
-rw-r--r--   mm/page_alloc.c   62
-rw-r--r--   mm/rmap.c         51
-rw-r--r--   mm/shmem.c        89
-rw-r--r--   mm/slab.c        823
-rw-r--r--   mm/slob.c          2
-rw-r--r--   mm/swap.c         32
-rw-r--r--   mm/swap_state.c    1
-rw-r--r--   mm/swapfile.c     16
-rw-r--r--   mm/vmscan.c      441
15 files changed, 1345 insertions(+), 539 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b21d78c941b5..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
85 BUG_ON(page_count(page)); 85 BUG_ON(page_count(page));
86 86
87 INIT_LIST_HEAD(&page->lru); 87 INIT_LIST_HEAD(&page->lru);
88 page[1].mapping = NULL; 88 page[1].lru.next = NULL; /* reset dtor */
89 89
90 spin_lock(&hugetlb_lock); 90 spin_lock(&hugetlb_lock);
91 enqueue_huge_page(page); 91 enqueue_huge_page(page);
@@ -105,9 +105,9 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
105 } 105 }
106 spin_unlock(&hugetlb_lock); 106 spin_unlock(&hugetlb_lock);
107 set_page_count(page, 1); 107 set_page_count(page, 1);
108 page[1].mapping = (void *)free_huge_page; 108 page[1].lru.next = (void *)free_huge_page; /* set dtor */
109 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) 109 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
110 clear_highpage(&page[i]); 110 clear_user_highpage(&page[i], addr);
111 return page; 111 return page;
112} 112}
113 113
@@ -391,12 +391,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
391 391
392 if (!new_page) { 392 if (!new_page) {
393 page_cache_release(old_page); 393 page_cache_release(old_page);
394 394 return VM_FAULT_OOM;
395 /* Logically this is OOM, not a SIGBUS, but an OOM
396 * could cause the kernel to go killing other
397 * processes which won't help the hugepage situation
398 * at all (?) */
399 return VM_FAULT_SIGBUS;
400 } 395 }
401 396
402 spin_unlock(&mm->page_table_lock); 397 spin_unlock(&mm->page_table_lock);
@@ -444,6 +439,7 @@ retry:
444 page = alloc_huge_page(vma, address); 439 page = alloc_huge_page(vma, address);
445 if (!page) { 440 if (!page) {
446 hugetlb_put_quota(mapping); 441 hugetlb_put_quota(mapping);
442 ret = VM_FAULT_OOM;
447 goto out; 443 goto out;
448 } 444 }
449 445
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
22 struct mm_struct * mm = vma->vm_mm; 22 struct mm_struct * mm = vma->vm_mm;
23 int error = 0; 23 int error = 0;
24 pgoff_t pgoff; 24 pgoff_t pgoff;
25 int new_flags = vma->vm_flags & ~VM_READHINTMASK; 25 int new_flags = vma->vm_flags;
26 26
27 switch (behavior) { 27 switch (behavior) {
28 case MADV_NORMAL:
29 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
30 break;
28 case MADV_SEQUENTIAL: 31 case MADV_SEQUENTIAL:
29 new_flags |= VM_SEQ_READ; 32 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
30 break; 33 break;
31 case MADV_RANDOM: 34 case MADV_RANDOM:
32 new_flags |= VM_RAND_READ; 35 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
33 break; 36 break;
34 default: 37 case MADV_DONTFORK:
38 new_flags |= VM_DONTCOPY;
39 break;
40 case MADV_DOFORK:
41 new_flags &= ~VM_DONTCOPY;
35 break; 42 break;
36 } 43 }
37 44
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
177 long error; 184 long error;
178 185
179 switch (behavior) { 186 switch (behavior) {
187 case MADV_DOFORK:
188 if (vma->vm_flags & VM_IO) {
189 error = -EINVAL;
190 break;
191 }
192 case MADV_DONTFORK:
180 case MADV_NORMAL: 193 case MADV_NORMAL:
181 case MADV_SEQUENTIAL: 194 case MADV_SEQUENTIAL:
182 case MADV_RANDOM: 195 case MADV_RANDOM:
diff --git a/mm/memory.c b/mm/memory.c
index 7a11ddd5060f..9abc6008544b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
82EXPORT_SYMBOL(high_memory); 82EXPORT_SYMBOL(high_memory);
83EXPORT_SYMBOL(vmalloc_earlyreserve); 83EXPORT_SYMBOL(vmalloc_earlyreserve);
84 84
85int randomize_va_space __read_mostly = 1;
86
87static int __init disable_randmaps(char *s)
88{
89 randomize_va_space = 0;
90 return 0;
91}
92__setup("norandmaps", disable_randmaps);
93
94
85/* 95/*
86 * If a p?d_bad entry is found while walking page tables, report 96 * If a p?d_bad entry is found while walking page tables, report
87 * the error, before resetting entry to p?d_none. Usually (but 97 * the error, before resetting entry to p?d_none. Usually (but
@@ -1871,6 +1881,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1871 goto out; 1881 goto out;
1872 1882
1873 entry = pte_to_swp_entry(orig_pte); 1883 entry = pte_to_swp_entry(orig_pte);
1884again:
1874 page = lookup_swap_cache(entry); 1885 page = lookup_swap_cache(entry);
1875 if (!page) { 1886 if (!page) {
1876 swapin_readahead(entry, address, vma); 1887 swapin_readahead(entry, address, vma);
@@ -1894,6 +1905,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1894 1905
1895 mark_page_accessed(page); 1906 mark_page_accessed(page);
1896 lock_page(page); 1907 lock_page(page);
1908 if (!PageSwapCache(page)) {
1909 /* Page migration has occured */
1910 unlock_page(page);
1911 page_cache_release(page);
1912 goto again;
1913 }
1897 1914
1898 /* 1915 /*
1899 * Back out if somebody else already faulted in this pte. 1916 * Back out if somebody else already faulted in this pte.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73790188b0eb..880831bd3003 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 97
98/* The number of pages to migrate per call to migrate_pages() */
99#define MIGRATE_CHUNK_SIZE 256
100
98static kmem_cache_t *policy_cache; 101static kmem_cache_t *policy_cache;
99static kmem_cache_t *sn_cache; 102static kmem_cache_t *sn_cache;
100 103
@@ -129,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
129 } 132 }
130 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; 133 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
131} 134}
135
132/* Generate a custom zonelist for the BIND policy. */ 136/* Generate a custom zonelist for the BIND policy. */
133static struct zonelist *bind_zonelist(nodemask_t *nodes) 137static struct zonelist *bind_zonelist(nodemask_t *nodes)
134{ 138{
135 struct zonelist *zl; 139 struct zonelist *zl;
136 int num, max, nd; 140 int num, max, nd, k;
137 141
138 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 142 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
139 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); 143 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
140 if (!zl) 144 if (!zl)
141 return NULL; 145 return NULL;
142 num = 0; 146 num = 0;
143 for_each_node_mask(nd, *nodes) 147 /* First put in the highest zones from all nodes, then all the next
144 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; 148 lower zones etc. Avoid empty zones because the memory allocator
149 doesn't like them. If you implement node hot removal you
150 have to fix that. */
151 for (k = policy_zone; k >= 0; k--) {
152 for_each_node_mask(nd, *nodes) {
153 struct zone *z = &NODE_DATA(nd)->node_zones[k];
154 if (z->present_pages > 0)
155 zl->zones[num++] = z;
156 }
157 }
145 zl->zones[num] = NULL; 158 zl->zones[num] = NULL;
146 return zl; 159 return zl;
147} 160}
@@ -543,24 +556,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
543 } 556 }
544} 557}
545 558
546static int swap_pages(struct list_head *pagelist) 559/*
560 * Migrate the list 'pagelist' of pages to a certain destination.
561 *
562 * Specify destination with either non-NULL vma or dest_node >= 0
563 * Return the number of pages not migrated or error code
564 */
565static int migrate_pages_to(struct list_head *pagelist,
566 struct vm_area_struct *vma, int dest)
547{ 567{
568 LIST_HEAD(newlist);
548 LIST_HEAD(moved); 569 LIST_HEAD(moved);
549 LIST_HEAD(failed); 570 LIST_HEAD(failed);
550 int n; 571 int err = 0;
572 int nr_pages;
573 struct page *page;
574 struct list_head *p;
551 575
552 n = migrate_pages(pagelist, NULL, &moved, &failed); 576redo:
553 putback_lru_pages(&failed); 577 nr_pages = 0;
554 putback_lru_pages(&moved); 578 list_for_each(p, pagelist) {
579 if (vma)
580 page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
581 else
582 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
555 583
556 return n; 584 if (!page) {
585 err = -ENOMEM;
586 goto out;
587 }
588 list_add(&page->lru, &newlist);
589 nr_pages++;
590 if (nr_pages > MIGRATE_CHUNK_SIZE)
591 break;
592 }
593 err = migrate_pages(pagelist, &newlist, &moved, &failed);
594
595 putback_lru_pages(&moved); /* Call release pages instead ?? */
596
597 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
598 goto redo;
599out:
600 /* Return leftover allocated pages */
601 while (!list_empty(&newlist)) {
602 page = list_entry(newlist.next, struct page, lru);
603 list_del(&page->lru);
604 __free_page(page);
605 }
606 list_splice(&failed, pagelist);
607 if (err < 0)
608 return err;
609
610 /* Calculate number of leftover pages */
611 nr_pages = 0;
612 list_for_each(p, pagelist)
613 nr_pages++;
614 return nr_pages;
557} 615}
558 616
559/* 617/*
560 * For now migrate_pages simply swaps out the pages from nodes that are in 618 * Migrate pages from one node to a target node.
561 * the source set but not in the target set. In the future, we would 619 * Returns error or the number of pages not migrated.
562 * want a function that moves pages between the two nodesets in such 620 */
563 * a way as to preserve the physical layout as much as possible. 621int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
622{
623 nodemask_t nmask;
624 LIST_HEAD(pagelist);
625 int err = 0;
626
627 nodes_clear(nmask);
628 node_set(source, nmask);
629
630 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
631 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
632
633 if (!list_empty(&pagelist)) {
634 err = migrate_pages_to(&pagelist, NULL, dest);
635 if (!list_empty(&pagelist))
636 putback_lru_pages(&pagelist);
637 }
638 return err;
639}
640
641/*
642 * Move pages between the two nodesets so as to preserve the physical
643 * layout as much as possible.
564 * 644 *
565 * Returns the number of page that could not be moved. 645 * Returns the number of page that could not be moved.
566 */ 646 */
@@ -568,22 +648,76 @@ int do_migrate_pages(struct mm_struct *mm,
568 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 648 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
569{ 649{
570 LIST_HEAD(pagelist); 650 LIST_HEAD(pagelist);
571 int count = 0; 651 int busy = 0;
572 nodemask_t nodes; 652 int err = 0;
653 nodemask_t tmp;
573 654
574 nodes_andnot(nodes, *from_nodes, *to_nodes); 655 down_read(&mm->mmap_sem);
575 656
576 down_read(&mm->mmap_sem); 657/*
577 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, 658 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
578 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 659 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
660 * bit in 'tmp', and return that <source, dest> pair for migration.
661 * The pair of nodemasks 'to' and 'from' define the map.
662 *
663 * If no pair of bits is found that way, fallback to picking some
664 * pair of 'source' and 'dest' bits that are not the same. If the
665 * 'source' and 'dest' bits are the same, this represents a node
666 * that will be migrating to itself, so no pages need move.
667 *
668 * If no bits are left in 'tmp', or if all remaining bits left
669 * in 'tmp' correspond to the same bit in 'to', return false
670 * (nothing left to migrate).
671 *
672 * This lets us pick a pair of nodes to migrate between, such that
673 * if possible the dest node is not already occupied by some other
674 * source node, minimizing the risk of overloading the memory on a
675 * node that would happen if we migrated incoming memory to a node
676 * before migrating outgoing memory source that same node.
677 *
678 * A single scan of tmp is sufficient. As we go, we remember the
679 * most recent <s, d> pair that moved (s != d). If we find a pair
680 * that not only moved, but what's better, moved to an empty slot
681 * (d is not set in tmp), then we break out then, with that pair.
682 * Otherwise when we finish scannng from_tmp, we at least have the
683 * most recent <s, d> pair that moved. If we get all the way through
684 * the scan of tmp without finding any node that moved, much less
685 * moved to an empty node, then there is nothing left worth migrating.
686 */
579 687
580 if (!list_empty(&pagelist)) { 688 tmp = *from_nodes;
581 count = swap_pages(&pagelist); 689 while (!nodes_empty(tmp)) {
582 putback_lru_pages(&pagelist); 690 int s,d;
691 int source = -1;
692 int dest = 0;
693
694 for_each_node_mask(s, tmp) {
695 d = node_remap(s, *from_nodes, *to_nodes);
696 if (s == d)
697 continue;
698
699 source = s; /* Node moved. Memorize */
700 dest = d;
701
702 /* dest not in remaining from nodes? */
703 if (!node_isset(dest, tmp))
704 break;
705 }
706 if (source == -1)
707 break;
708
709 node_clear(source, tmp);
710 err = migrate_to_node(mm, source, dest, flags);
711 if (err > 0)
712 busy += err;
713 if (err < 0)
714 break;
583 } 715 }
584 716
585 up_read(&mm->mmap_sem); 717 up_read(&mm->mmap_sem);
586 return count; 718 if (err < 0)
719 return err;
720 return busy;
587} 721}
588 722
589long do_mbind(unsigned long start, unsigned long len, 723long do_mbind(unsigned long start, unsigned long len,
@@ -643,8 +777,9 @@ long do_mbind(unsigned long start, unsigned long len,
643 int nr_failed = 0; 777 int nr_failed = 0;
644 778
645 err = mbind_range(vma, start, end, new); 779 err = mbind_range(vma, start, end, new);
780
646 if (!list_empty(&pagelist)) 781 if (!list_empty(&pagelist))
647 nr_failed = swap_pages(&pagelist); 782 nr_failed = migrate_pages_to(&pagelist, vma, -1);
648 783
649 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 784 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
650 err = -EIO; 785 err = -EIO;
@@ -673,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
673 nodes_clear(*nodes); 808 nodes_clear(*nodes);
674 if (maxnode == 0 || !nmask) 809 if (maxnode == 0 || !nmask)
675 return 0; 810 return 0;
811 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
812 return -EINVAL;
676 813
677 nlongs = BITS_TO_LONGS(maxnode); 814 nlongs = BITS_TO_LONGS(maxnode);
678 if ((maxnode % BITS_PER_LONG) == 0) 815 if ((maxnode % BITS_PER_LONG) == 0)
@@ -1034,6 +1171,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1034 return interleave_nodes(pol); 1171 return interleave_nodes(pol);
1035} 1172}
1036 1173
1174#ifdef CONFIG_HUGETLBFS
1037/* Return a zonelist suitable for a huge page allocation. */ 1175/* Return a zonelist suitable for a huge page allocation. */
1038struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) 1176struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1039{ 1177{
@@ -1047,6 +1185,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1047 } 1185 }
1048 return zonelist_policy(GFP_HIGHUSER, pol); 1186 return zonelist_policy(GFP_HIGHUSER, pol);
1049} 1187}
1188#endif
1050 1189
1051/* Allocate a page in interleaved policy. 1190/* Allocate a page in interleaved policy.
1052 Own path because it needs to do special accounting. */ 1191 Own path because it needs to do special accounting. */
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..99d21020ec9d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc);
57EXPORT_SYMBOL(vfree); 57EXPORT_SYMBOL(vfree);
58EXPORT_SYMBOL(vmalloc_to_page); 58EXPORT_SYMBOL(vmalloc_to_page);
59EXPORT_SYMBOL(vmalloc_32); 59EXPORT_SYMBOL(vmalloc_32);
60EXPORT_SYMBOL(vmap);
61EXPORT_SYMBOL(vunmap);
60 62
61/* 63/*
62 * Handle all mappings that got truncated by a "truncate()" 64 * Handle all mappings that got truncated by a "truncate()"
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 14bd4ec79597..8123fad5a485 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 58
59 /* 59 /*
60 * Processes which fork a lot of child processes are likely 60 * Processes which fork a lot of child processes are likely
61 * a good choice. We add the vmsize of the children if they 61 * a good choice. We add half the vmsize of the children if they
62 * have an own mm. This prevents forking servers to flood the 62 * have an own mm. This prevents forking servers to flood the
63 * machine with an endless amount of children 63 * machine with an endless amount of children. In case a single
64 * child is eating the vast majority of memory, adding only half
65 * to the parents will make the child our kill candidate of choice.
64 */ 66 */
65 list_for_each(tsk, &p->children) { 67 list_for_each(tsk, &p->children) {
66 struct task_struct *chld; 68 struct task_struct *chld;
67 chld = list_entry(tsk, struct task_struct, sibling); 69 chld = list_entry(tsk, struct task_struct, sibling);
68 if (chld->mm != p->mm && chld->mm) 70 if (chld->mm != p->mm && chld->mm)
69 points += chld->mm->total_vm; 71 points += chld->mm->total_vm/2 + 1;
70 } 72 }
71 73
72 /* 74 /*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
131} 133}
132 134
133/* 135/*
136 * Types of limitations to the nodes from which allocations may occur
137 */
138#define CONSTRAINT_NONE 1
139#define CONSTRAINT_MEMORY_POLICY 2
140#define CONSTRAINT_CPUSET 3
141
142/*
143 * Determine the type of allocation constraint.
144 */
145static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
146{
147#ifdef CONFIG_NUMA
148 struct zone **z;
149 nodemask_t nodes = node_online_map;
150
151 for (z = zonelist->zones; *z; z++)
152 if (cpuset_zone_allowed(*z, gfp_mask))
153 node_clear((*z)->zone_pgdat->node_id,
154 nodes);
155 else
156 return CONSTRAINT_CPUSET;
157
158 if (!nodes_empty(nodes))
159 return CONSTRAINT_MEMORY_POLICY;
160#endif
161
162 return CONSTRAINT_NONE;
163}
164
165/*
134 * Simple selection loop. We chose the process with the highest 166 * Simple selection loop. We chose the process with the highest
135 * number of 'points'. We expect the caller will lock the tasklist. 167 * number of 'points'. We expect the caller will lock the tasklist.
136 * 168 *
137 * (not docbooked, we don't want this one cluttering up the manual) 169 * (not docbooked, we don't want this one cluttering up the manual)
138 */ 170 */
139static struct task_struct * select_bad_process(void) 171static struct task_struct *select_bad_process(unsigned long *ppoints)
140{ 172{
141 unsigned long maxpoints = 0;
142 struct task_struct *g, *p; 173 struct task_struct *g, *p;
143 struct task_struct *chosen = NULL; 174 struct task_struct *chosen = NULL;
144 struct timespec uptime; 175 struct timespec uptime;
176 *ppoints = 0;
145 177
146 do_posix_clock_monotonic_gettime(&uptime); 178 do_posix_clock_monotonic_gettime(&uptime);
147 do_each_thread(g, p) { 179 do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
169 return p; 201 return p;
170 202
171 points = badness(p, uptime.tv_sec); 203 points = badness(p, uptime.tv_sec);
172 if (points > maxpoints || !chosen) { 204 if (points > *ppoints || !chosen) {
173 chosen = p; 205 chosen = p;
174 maxpoints = points; 206 *ppoints = points;
175 } 207 }
176 } while_each_thread(g, p); 208 } while_each_thread(g, p);
177 return chosen; 209 return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
182 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that 214 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
183 * we select a process with CAP_SYS_RAW_IO set). 215 * we select a process with CAP_SYS_RAW_IO set).
184 */ 216 */
185static void __oom_kill_task(task_t *p) 217static void __oom_kill_task(task_t *p, const char *message)
186{ 218{
187 if (p->pid == 1) { 219 if (p->pid == 1) {
188 WARN_ON(1); 220 WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
198 return; 230 return;
199 } 231 }
200 task_unlock(p); 232 task_unlock(p);
201 printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", 233 printk(KERN_ERR "%s: Killed process %d (%s).\n",
202 p->pid, p->comm); 234 message, p->pid, p->comm);
203 235
204 /* 236 /*
205 * We give our sacrificial lamb high priority and access to 237 * We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
212 force_sig(SIGKILL, p); 244 force_sig(SIGKILL, p);
213} 245}
214 246
215static struct mm_struct *oom_kill_task(task_t *p) 247static struct mm_struct *oom_kill_task(task_t *p, const char *message)
216{ 248{
217 struct mm_struct *mm = get_task_mm(p); 249 struct mm_struct *mm = get_task_mm(p);
218 task_t * g, * q; 250 task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
224 return NULL; 256 return NULL;
225 } 257 }
226 258
227 __oom_kill_task(p); 259 __oom_kill_task(p, message);
228 /* 260 /*
229 * kill all processes that share the ->mm (i.e. all threads), 261 * kill all processes that share the ->mm (i.e. all threads),
230 * but are in a different thread group 262 * but are in a different thread group
231 */ 263 */
232 do_each_thread(g, q) 264 do_each_thread(g, q)
233 if (q->mm == mm && q->tgid != p->tgid) 265 if (q->mm == mm && q->tgid != p->tgid)
234 __oom_kill_task(q); 266 __oom_kill_task(q, message);
235 while_each_thread(g, q); 267 while_each_thread(g, q);
236 268
237 return mm; 269 return mm;
238} 270}
239 271
240static struct mm_struct *oom_kill_process(struct task_struct *p) 272static struct mm_struct *oom_kill_process(struct task_struct *p,
273 unsigned long points, const char *message)
241{ 274{
242 struct mm_struct *mm; 275 struct mm_struct *mm;
243 struct task_struct *c; 276 struct task_struct *c;
244 struct list_head *tsk; 277 struct list_head *tsk;
245 278
279 printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
280 "children.\n", p->pid, p->comm, points);
246 /* Try to kill a child first */ 281 /* Try to kill a child first */
247 list_for_each(tsk, &p->children) { 282 list_for_each(tsk, &p->children) {
248 c = list_entry(tsk, struct task_struct, sibling); 283 c = list_entry(tsk, struct task_struct, sibling);
249 if (c->mm == p->mm) 284 if (c->mm == p->mm)
250 continue; 285 continue;
251 mm = oom_kill_task(c); 286 mm = oom_kill_task(c, message);
252 if (mm) 287 if (mm)
253 return mm; 288 return mm;
254 } 289 }
255 return oom_kill_task(p); 290 return oom_kill_task(p, message);
256} 291}
257 292
258/** 293/**
@@ -263,38 +298,63 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
263 * OR try to be smart about which process to kill. Note that we 298 * OR try to be smart about which process to kill. Note that we
264 * don't have to be perfect here, we just have to be good. 299 * don't have to be perfect here, we just have to be good.
265 */ 300 */
266void out_of_memory(gfp_t gfp_mask, int order) 301void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
267{ 302{
268 struct mm_struct *mm = NULL; 303 struct mm_struct *mm = NULL;
269 task_t * p; 304 task_t *p;
305 unsigned long points;
270 306
271 if (printk_ratelimit()) { 307 if (printk_ratelimit()) {
272 printk("oom-killer: gfp_mask=0x%x, order=%d\n", 308 printk("oom-killer: gfp_mask=0x%x, order=%d\n",
273 gfp_mask, order); 309 gfp_mask, order);
310 dump_stack();
274 show_mem(); 311 show_mem();
275 } 312 }
276 313
277 cpuset_lock(); 314 cpuset_lock();
278 read_lock(&tasklist_lock); 315 read_lock(&tasklist_lock);
316
317 /*
318 * Check if there were limitations on the allocation (only relevant for
319 * NUMA) that may require different handling.
320 */
321 switch (constrained_alloc(zonelist, gfp_mask)) {
322 case CONSTRAINT_MEMORY_POLICY:
323 mm = oom_kill_process(current, points,
324 "No available memory (MPOL_BIND)");
325 break;
326
327 case CONSTRAINT_CPUSET:
328 mm = oom_kill_process(current, points,
329 "No available memory in cpuset");
330 break;
331
332 case CONSTRAINT_NONE:
279retry: 333retry:
280 p = select_bad_process(); 334 /*
335 * Rambo mode: Shoot down a process and hope it solves whatever
336 * issues we may have.
337 */
338 p = select_bad_process(&points);
281 339
282 if (PTR_ERR(p) == -1UL) 340 if (PTR_ERR(p) == -1UL)
283 goto out; 341 goto out;
284 342
285 /* Found nothing?!?! Either we hang forever, or we panic. */ 343 /* Found nothing?!?! Either we hang forever, or we panic. */
286 if (!p) { 344 if (!p) {
287 read_unlock(&tasklist_lock); 345 read_unlock(&tasklist_lock);
288 cpuset_unlock(); 346 cpuset_unlock();
289 panic("Out of memory and no killable processes...\n"); 347 panic("Out of memory and no killable processes...\n");
290 } 348 }
291 349
292 mm = oom_kill_process(p); 350 mm = oom_kill_process(p, points, "Out of memory");
293 if (!mm) 351 if (!mm)
294 goto retry; 352 goto retry;
353
354 break;
355 }
295 356
296 out: 357out:
297 read_unlock(&tasklist_lock);
298 cpuset_unlock(); 358 cpuset_unlock();
299 if (mm) 359 if (mm)
300 mmput(mm); 360 mmput(mm);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df54e2fc8ee0..791690d7d3fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
56int percpu_pagelist_fraction; 56int percpu_pagelist_fraction;
57 57
58static void fastcall free_hot_cold_page(struct page *page, int cold); 58static void fastcall free_hot_cold_page(struct page *page, int cold);
59static void __free_pages_ok(struct page *page, unsigned int order);
59 60
60/* 61/*
61 * results with 256, 32 in the lowmem_reserve sysctl: 62 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
169 * All pages have PG_compound set. All pages have their ->private pointing at 170 * All pages have PG_compound set. All pages have their ->private pointing at
170 * the head page (even the head page has this). 171 * the head page (even the head page has this).
171 * 172 *
172 * The first tail page's ->mapping, if non-zero, holds the address of the 173 * The first tail page's ->lru.next holds the address of the compound page's
173 * compound page's put_page() function. 174 * put_page() function. Its ->lru.prev holds the order of allocation.
174 * 175 * This usage means that zero-order pages may not be compound.
175 * The order of the allocation is stored in the first tail page's ->index
176 * This is only for debug at present. This usage means that zero-order pages
177 * may not be compound.
178 */ 176 */
177
178static void free_compound_page(struct page *page)
179{
180 __free_pages_ok(page, (unsigned long)page[1].lru.prev);
181}
182
179static void prep_compound_page(struct page *page, unsigned long order) 183static void prep_compound_page(struct page *page, unsigned long order)
180{ 184{
181 int i; 185 int i;
182 int nr_pages = 1 << order; 186 int nr_pages = 1 << order;
183 187
184 page[1].mapping = NULL; 188 page[1].lru.next = (void *)free_compound_page; /* set dtor */
185 page[1].index = order; 189 page[1].lru.prev = (void *)order;
186 for (i = 0; i < nr_pages; i++) { 190 for (i = 0; i < nr_pages; i++) {
187 struct page *p = page + i; 191 struct page *p = page + i;
188 192
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
196 int i; 200 int i;
197 int nr_pages = 1 << order; 201 int nr_pages = 1 << order;
198 202
199 if (unlikely(page[1].index != order)) 203 if (unlikely((unsigned long)page[1].lru.prev != order))
200 bad_page(page); 204 bad_page(page);
201 205
202 for (i = 0; i < nr_pages; i++) { 206 for (i = 0; i < nr_pages; i++) {
@@ -1011,7 +1015,7 @@ rebalance:
1011 if (page) 1015 if (page)
1012 goto got_pg; 1016 goto got_pg;
1013 1017
1014 out_of_memory(gfp_mask, order); 1018 out_of_memory(zonelist, gfp_mask, order);
1015 goto restart; 1019 goto restart;
1016 } 1020 }
1017 1021
@@ -1213,18 +1217,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1213{ 1217{
1214 int cpu = 0; 1218 int cpu = 0;
1215 1219
1216 memset(ret, 0, sizeof(*ret)); 1220 memset(ret, 0, nr * sizeof(unsigned long));
1217 cpus_and(*cpumask, *cpumask, cpu_online_map); 1221 cpus_and(*cpumask, *cpumask, cpu_online_map);
1218 1222
1219 cpu = first_cpu(*cpumask); 1223 cpu = first_cpu(*cpumask);
1220 while (cpu < NR_CPUS) { 1224 while (cpu < NR_CPUS) {
1221 unsigned long *in, *out, off; 1225 unsigned long *in, *out, off;
1222 1226
1227 if (!cpu_isset(cpu, *cpumask))
1228 continue;
1229
1223 in = (unsigned long *)&per_cpu(page_states, cpu); 1230 in = (unsigned long *)&per_cpu(page_states, cpu);
1224 1231
1225 cpu = next_cpu(cpu, *cpumask); 1232 cpu = next_cpu(cpu, *cpumask);
1226 1233
1227 if (cpu < NR_CPUS) 1234 if (likely(cpu < NR_CPUS))
1228 prefetch(&per_cpu(page_states, cpu)); 1235 prefetch(&per_cpu(page_states, cpu));
1229 1236
1230 out = (unsigned long *)ret; 1237 out = (unsigned long *)ret;
@@ -1534,29 +1541,29 @@ static int __initdata node_load[MAX_NUMNODES];
1534 */ 1541 */
1535static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1542static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1536{ 1543{
1537 int i, n, val; 1544 int n, val;
1538 int min_val = INT_MAX; 1545 int min_val = INT_MAX;
1539 int best_node = -1; 1546 int best_node = -1;
1540 1547
1541 for_each_online_node(i) { 1548 /* Use the local node if we haven't already */
1542 cpumask_t tmp; 1549 if (!node_isset(node, *used_node_mask)) {
1550 node_set(node, *used_node_mask);
1551 return node;
1552 }
1543 1553
1544 /* Start from local node */ 1554 for_each_online_node(n) {
1545 n = (node+i) % num_online_nodes(); 1555 cpumask_t tmp;
1546 1556
1547 /* Don't want a node to appear more than once */ 1557 /* Don't want a node to appear more than once */
1548 if (node_isset(n, *used_node_mask)) 1558 if (node_isset(n, *used_node_mask))
1549 continue; 1559 continue;
1550 1560
1551 /* Use the local node if we haven't already */
1552 if (!node_isset(node, *used_node_mask)) {
1553 best_node = node;
1554 break;
1555 }
1556
1557 /* Use the distance array to find the distance */ 1561 /* Use the distance array to find the distance */
1558 val = node_distance(node, n); 1562 val = node_distance(node, n);
1559 1563
1564 /* Penalize nodes under us ("prefer the next node") */
1565 val += (n < node);
1566
1560 /* Give preference to headless and unused nodes */ 1567 /* Give preference to headless and unused nodes */
1561 tmp = node_to_cpumask(n); 1568 tmp = node_to_cpumask(n);
1562 if (!cpus_empty(tmp)) 1569 if (!cpus_empty(tmp))
@@ -1799,7 +1806,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1799 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1806 memmap_init_zone((size), (nid), (zone), (start_pfn))
1800#endif 1807#endif
1801 1808
1802static int __meminit zone_batchsize(struct zone *zone) 1809static int __cpuinit zone_batchsize(struct zone *zone)
1803{ 1810{
1804 int batch; 1811 int batch;
1805 1812
@@ -1886,14 +1893,13 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1886 * not check if the processor is online before following the pageset pointer. 1893 * not check if the processor is online before following the pageset pointer.
1887 * Other parts of the kernel may not check if the zone is available. 1894 * Other parts of the kernel may not check if the zone is available.
1888 */ 1895 */
1889static struct per_cpu_pageset 1896static struct per_cpu_pageset boot_pageset[NR_CPUS];
1890 boot_pageset[NR_CPUS];
1891 1897
1892/* 1898/*
1893 * Dynamically allocate memory for the 1899 * Dynamically allocate memory for the
1894 * per cpu pageset array in struct zone. 1900 * per cpu pageset array in struct zone.
1895 */ 1901 */
1896static int __meminit process_zones(int cpu) 1902static int __cpuinit process_zones(int cpu)
1897{ 1903{
1898 struct zone *zone, *dzone; 1904 struct zone *zone, *dzone;
1899 1905
@@ -1934,7 +1940,7 @@ static inline void free_zone_pagesets(int cpu)
1934 } 1940 }
1935} 1941}
1936 1942
1937static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, 1943static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1938 unsigned long action, 1944 unsigned long action,
1939 void *hcpu) 1945 void *hcpu)
1940{ 1946{
diff --git a/mm/rmap.c b/mm/rmap.c
index d85a99d28c03..df2c41c2a9a2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -52,6 +52,7 @@
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/rmap.h> 53#include <linux/rmap.h>
54#include <linux/rcupdate.h> 54#include <linux/rcupdate.h>
55#include <linux/module.h>
55 56
56#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
57 58
@@ -205,6 +206,36 @@ out:
205 return anon_vma; 206 return anon_vma;
206} 207}
207 208
209#ifdef CONFIG_MIGRATION
210/*
211 * Remove an anonymous page from swap replacing the swap pte's
212 * through real pte's pointing to valid pages and then releasing
213 * the page from the swap cache.
214 *
215 * Must hold page lock on page.
216 */
217void remove_from_swap(struct page *page)
218{
219 struct anon_vma *anon_vma;
220 struct vm_area_struct *vma;
221
222 if (!PageAnon(page) || !PageSwapCache(page))
223 return;
224
225 anon_vma = page_lock_anon_vma(page);
226 if (!anon_vma)
227 return;
228
229 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
230 remove_vma_swap(vma, page);
231
232 spin_unlock(&anon_vma->lock);
233
234 delete_from_swap_cache(page);
235}
236EXPORT_SYMBOL(remove_from_swap);
237#endif
238
208/* 239/*
209 * At what user virtual address is page expected in vma? 240 * At what user virtual address is page expected in vma?
210 */ 241 */
@@ -541,7 +572,8 @@ void page_remove_rmap(struct page *page)
541 * Subfunctions of try_to_unmap: try_to_unmap_one called 572 * Subfunctions of try_to_unmap: try_to_unmap_one called
542 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 573 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
543 */ 574 */
544static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) 575static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
576 int ignore_refs)
545{ 577{
546 struct mm_struct *mm = vma->vm_mm; 578 struct mm_struct *mm = vma->vm_mm;
547 unsigned long address; 579 unsigned long address;
@@ -564,7 +596,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
564 * skipped over this mm) then we should reactivate it. 596 * skipped over this mm) then we should reactivate it.
565 */ 597 */
566 if ((vma->vm_flags & VM_LOCKED) || 598 if ((vma->vm_flags & VM_LOCKED) ||
567 ptep_clear_flush_young(vma, address, pte)) { 599 (ptep_clear_flush_young(vma, address, pte)
600 && !ignore_refs)) {
568 ret = SWAP_FAIL; 601 ret = SWAP_FAIL;
569 goto out_unmap; 602 goto out_unmap;
570 } 603 }
@@ -698,7 +731,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
698 pte_unmap_unlock(pte - 1, ptl); 731 pte_unmap_unlock(pte - 1, ptl);
699} 732}
700 733
701static int try_to_unmap_anon(struct page *page) 734static int try_to_unmap_anon(struct page *page, int ignore_refs)
702{ 735{
703 struct anon_vma *anon_vma; 736 struct anon_vma *anon_vma;
704 struct vm_area_struct *vma; 737 struct vm_area_struct *vma;
@@ -709,7 +742,7 @@ static int try_to_unmap_anon(struct page *page)
709 return ret; 742 return ret;
710 743
711 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 744 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
712 ret = try_to_unmap_one(page, vma); 745 ret = try_to_unmap_one(page, vma, ignore_refs);
713 if (ret == SWAP_FAIL || !page_mapped(page)) 746 if (ret == SWAP_FAIL || !page_mapped(page))
714 break; 747 break;
715 } 748 }
@@ -726,7 +759,7 @@ static int try_to_unmap_anon(struct page *page)
726 * 759 *
727 * This function is only called from try_to_unmap for object-based pages. 760 * This function is only called from try_to_unmap for object-based pages.
728 */ 761 */
729static int try_to_unmap_file(struct page *page) 762static int try_to_unmap_file(struct page *page, int ignore_refs)
730{ 763{
731 struct address_space *mapping = page->mapping; 764 struct address_space *mapping = page->mapping;
732 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 765 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -740,7 +773,7 @@ static int try_to_unmap_file(struct page *page)
740 773
741 spin_lock(&mapping->i_mmap_lock); 774 spin_lock(&mapping->i_mmap_lock);
742 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 775 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
743 ret = try_to_unmap_one(page, vma); 776 ret = try_to_unmap_one(page, vma, ignore_refs);
744 if (ret == SWAP_FAIL || !page_mapped(page)) 777 if (ret == SWAP_FAIL || !page_mapped(page))
745 goto out; 778 goto out;
746 } 779 }
@@ -825,16 +858,16 @@ out:
825 * SWAP_AGAIN - we missed a mapping, try again later 858 * SWAP_AGAIN - we missed a mapping, try again later
826 * SWAP_FAIL - the page is unswappable 859 * SWAP_FAIL - the page is unswappable
827 */ 860 */
828int try_to_unmap(struct page *page) 861int try_to_unmap(struct page *page, int ignore_refs)
829{ 862{
830 int ret; 863 int ret;
831 864
832 BUG_ON(!PageLocked(page)); 865 BUG_ON(!PageLocked(page));
833 866
834 if (PageAnon(page)) 867 if (PageAnon(page))
835 ret = try_to_unmap_anon(page); 868 ret = try_to_unmap_anon(page, ignore_refs);
836 else 869 else
837 ret = try_to_unmap_file(page); 870 ret = try_to_unmap_file(page, ignore_refs);
838 871
839 if (!page_mapped(page)) 872 if (!page_mapped(page))
840 ret = SWAP_SUCCESS; 873 ret = SWAP_SUCCESS;
diff --git a/mm/shmem.c b/mm/shmem.c
index ce501bce1c2e..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/mempolicy.h> 46#include <linux/mempolicy.h>
47#include <linux/namei.h> 47#include <linux/namei.h>
48#include <linux/ctype.h>
48#include <asm/uaccess.h> 49#include <asm/uaccess.h>
49#include <asm/div64.h> 50#include <asm/div64.h>
50#include <asm/pgtable.h> 51#include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
874} 875}
875 876
876#ifdef CONFIG_NUMA 877#ifdef CONFIG_NUMA
878static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
879{
880 char *nodelist = strchr(value, ':');
881 int err = 1;
882
883 if (nodelist) {
884 /* NUL-terminate policy string */
885 *nodelist++ = '\0';
886 if (nodelist_parse(nodelist, *policy_nodes))
887 goto out;
888 }
889 if (!strcmp(value, "default")) {
890 *policy = MPOL_DEFAULT;
891 /* Don't allow a nodelist */
892 if (!nodelist)
893 err = 0;
894 } else if (!strcmp(value, "prefer")) {
895 *policy = MPOL_PREFERRED;
896 /* Insist on a nodelist of one node only */
897 if (nodelist) {
898 char *rest = nodelist;
899 while (isdigit(*rest))
900 rest++;
901 if (!*rest)
902 err = 0;
903 }
904 } else if (!strcmp(value, "bind")) {
905 *policy = MPOL_BIND;
906 /* Insist on a nodelist */
907 if (nodelist)
908 err = 0;
909 } else if (!strcmp(value, "interleave")) {
910 *policy = MPOL_INTERLEAVE;
911 /* Default to nodes online if no nodelist */
912 if (!nodelist)
913 *policy_nodes = node_online_map;
914 err = 0;
915 }
916out:
917 /* Restore string for error message */
918 if (nodelist)
919 *--nodelist = ':';
920 return err;
921}
922
877static struct page *shmem_swapin_async(struct shared_policy *p, 923static struct page *shmem_swapin_async(struct shared_policy *p,
878 swp_entry_t entry, unsigned long idx) 924 swp_entry_t entry, unsigned long idx)
879{ 925{
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
926 return page; 972 return page;
927} 973}
928#else 974#else
975static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
976{
977 return 1;
978}
979
929static inline struct page * 980static inline struct page *
930shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) 981shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
931{ 982{
@@ -1028,6 +1079,14 @@ repeat:
1028 page_cache_release(swappage); 1079 page_cache_release(swappage);
1029 goto repeat; 1080 goto repeat;
1030 } 1081 }
1082 if (!PageSwapCache(swappage)) {
1083 /* Page migration has occured */
1084 shmem_swp_unmap(entry);
1085 spin_unlock(&info->lock);
1086 unlock_page(swappage);
1087 page_cache_release(swappage);
1088 goto repeat;
1089 }
1031 if (PageWriteback(swappage)) { 1090 if (PageWriteback(swappage)) {
1032 shmem_swp_unmap(entry); 1091 shmem_swp_unmap(entry);
1033 spin_unlock(&info->lock); 1092 spin_unlock(&info->lock);
@@ -1851,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1851{ 1910{
1852 char *this_char, *value, *rest; 1911 char *this_char, *value, *rest;
1853 1912
1854 while ((this_char = strsep(&options, ",")) != NULL) { 1913 while (options != NULL) {
1914 this_char = options;
1915 for (;;) {
1916 /*
1917 * NUL-terminate this option: unfortunately,
1918 * mount options form a comma-separated list,
1919 * but mpol's nodelist may also contain commas.
1920 */
1921 options = strchr(options, ',');
1922 if (options == NULL)
1923 break;
1924 options++;
1925 if (!isdigit(*options)) {
1926 options[-1] = '\0';
1927 break;
1928 }
1929 }
1855 if (!*this_char) 1930 if (!*this_char)
1856 continue; 1931 continue;
1857 if ((value = strchr(this_char,'=')) != NULL) { 1932 if ((value = strchr(this_char,'=')) != NULL) {
@@ -1902,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1902 if (*rest) 1977 if (*rest)
1903 goto bad_val; 1978 goto bad_val;
1904 } else if (!strcmp(this_char,"mpol")) { 1979 } else if (!strcmp(this_char,"mpol")) {
1905 if (!strcmp(value,"default")) 1980 if (shmem_parse_mpol(value,policy,policy_nodes))
1906 *policy = MPOL_DEFAULT;
1907 else if (!strcmp(value,"preferred"))
1908 *policy = MPOL_PREFERRED;
1909 else if (!strcmp(value,"bind"))
1910 *policy = MPOL_BIND;
1911 else if (!strcmp(value,"interleave"))
1912 *policy = MPOL_INTERLEAVE;
1913 else
1914 goto bad_val; 1981 goto bad_val;
1915 } else if (!strcmp(this_char,"mpol_nodelist")) {
1916 nodelist_parse(value, *policy_nodes);
1917 } else { 1982 } else {
1918 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1983 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1919 this_char); 1984 this_char);
diff --git a/mm/slab.c b/mm/slab.c
index 6f8495e2185b..add05d808a4a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -55,7 +55,7 @@
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
57 * constructors and destructors are called without any locking. 57 * constructors and destructors are called without any locking.
58 * Several members in kmem_cache_t and struct slab never change, they 58 * Several members in struct kmem_cache and struct slab never change, they
59 * are accessed without any locking. 59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe. 61 * and local interrupts are disabled so slab code is preempt-safe.
@@ -244,7 +244,7 @@ struct slab {
244 */ 244 */
245struct slab_rcu { 245struct slab_rcu {
246 struct rcu_head head; 246 struct rcu_head head;
247 kmem_cache_t *cachep; 247 struct kmem_cache *cachep;
248 void *addr; 248 void *addr;
249}; 249};
250 250
@@ -294,6 +294,7 @@ struct kmem_list3 {
294 unsigned long next_reap; 294 unsigned long next_reap;
295 int free_touched; 295 int free_touched;
296 unsigned int free_limit; 296 unsigned int free_limit;
297 unsigned int colour_next; /* Per-node cache coloring */
297 spinlock_t list_lock; 298 spinlock_t list_lock;
298 struct array_cache *shared; /* shared per node */ 299 struct array_cache *shared; /* shared per node */
299 struct array_cache **alien; /* on other nodes */ 300 struct array_cache **alien; /* on other nodes */
@@ -316,6 +317,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
316 */ 317 */
317static __always_inline int index_of(const size_t size) 318static __always_inline int index_of(const size_t size)
318{ 319{
320 extern void __bad_size(void);
321
319 if (__builtin_constant_p(size)) { 322 if (__builtin_constant_p(size)) {
320 int i = 0; 323 int i = 0;
321 324
@@ -326,25 +329,23 @@ static __always_inline int index_of(const size_t size)
326 i++; 329 i++;
327#include "linux/kmalloc_sizes.h" 330#include "linux/kmalloc_sizes.h"
328#undef CACHE 331#undef CACHE
329 { 332 __bad_size();
330 extern void __bad_size(void);
331 __bad_size();
332 }
333 } else 333 } else
334 BUG(); 334 __bad_size();
335 return 0; 335 return 0;
336} 336}
337 337
338#define INDEX_AC index_of(sizeof(struct arraycache_init)) 338#define INDEX_AC index_of(sizeof(struct arraycache_init))
339#define INDEX_L3 index_of(sizeof(struct kmem_list3)) 339#define INDEX_L3 index_of(sizeof(struct kmem_list3))
340 340
341static inline void kmem_list3_init(struct kmem_list3 *parent) 341static void kmem_list3_init(struct kmem_list3 *parent)
342{ 342{
343 INIT_LIST_HEAD(&parent->slabs_full); 343 INIT_LIST_HEAD(&parent->slabs_full);
344 INIT_LIST_HEAD(&parent->slabs_partial); 344 INIT_LIST_HEAD(&parent->slabs_partial);
345 INIT_LIST_HEAD(&parent->slabs_free); 345 INIT_LIST_HEAD(&parent->slabs_free);
346 parent->shared = NULL; 346 parent->shared = NULL;
347 parent->alien = NULL; 347 parent->alien = NULL;
348 parent->colour_next = 0;
348 spin_lock_init(&parent->list_lock); 349 spin_lock_init(&parent->list_lock);
349 parent->free_objects = 0; 350 parent->free_objects = 0;
350 parent->free_touched = 0; 351 parent->free_touched = 0;
@@ -364,7 +365,7 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
364 } while (0) 365 } while (0)
365 366
366/* 367/*
367 * kmem_cache_t 368 * struct kmem_cache
368 * 369 *
369 * manages a cache. 370 * manages a cache.
370 */ 371 */
@@ -375,7 +376,7 @@ struct kmem_cache {
375 unsigned int batchcount; 376 unsigned int batchcount;
376 unsigned int limit; 377 unsigned int limit;
377 unsigned int shared; 378 unsigned int shared;
378 unsigned int objsize; 379 unsigned int buffer_size;
379/* 2) touched by every alloc & free from the backend */ 380/* 2) touched by every alloc & free from the backend */
380 struct kmem_list3 *nodelists[MAX_NUMNODES]; 381 struct kmem_list3 *nodelists[MAX_NUMNODES];
381 unsigned int flags; /* constant flags */ 382 unsigned int flags; /* constant flags */
@@ -391,16 +392,15 @@ struct kmem_cache {
391 392
392 size_t colour; /* cache colouring range */ 393 size_t colour; /* cache colouring range */
393 unsigned int colour_off; /* colour offset */ 394 unsigned int colour_off; /* colour offset */
394 unsigned int colour_next; /* cache colouring */ 395 struct kmem_cache *slabp_cache;
395 kmem_cache_t *slabp_cache;
396 unsigned int slab_size; 396 unsigned int slab_size;
397 unsigned int dflags; /* dynamic flags */ 397 unsigned int dflags; /* dynamic flags */
398 398
399 /* constructor func */ 399 /* constructor func */
400 void (*ctor) (void *, kmem_cache_t *, unsigned long); 400 void (*ctor) (void *, struct kmem_cache *, unsigned long);
401 401
402 /* de-constructor func */ 402 /* de-constructor func */
403 void (*dtor) (void *, kmem_cache_t *, unsigned long); 403 void (*dtor) (void *, struct kmem_cache *, unsigned long);
404 404
405/* 4) cache creation/removal */ 405/* 4) cache creation/removal */
406 const char *name; 406 const char *name;
@@ -423,8 +423,14 @@ struct kmem_cache {
423 atomic_t freemiss; 423 atomic_t freemiss;
424#endif 424#endif
425#if DEBUG 425#if DEBUG
426 int dbghead; 426 /*
427 int reallen; 427 * If debugging is enabled, then the allocator can add additional
428 * fields and/or padding to every object. buffer_size contains the total
429 * object size including these internal fields, the following two
430 * variables contain the offset to the user object and its size.
431 */
432 int obj_offset;
433 int obj_size;
428#endif 434#endif
429}; 435};
430 436
@@ -495,50 +501,50 @@ struct kmem_cache {
495 501
496/* memory layout of objects: 502/* memory layout of objects:
497 * 0 : objp 503 * 0 : objp
498 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that 504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
499 * the end of an object is aligned with the end of the real 505 * the end of an object is aligned with the end of the real
500 * allocation. Catches writes behind the end of the allocation. 506 * allocation. Catches writes behind the end of the allocation.
501 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1: 507 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
502 * redzone word. 508 * redzone word.
503 * cachep->dbghead: The real object. 509 * cachep->obj_offset: The real object.
504 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
505 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
506 */ 512 */
507static int obj_dbghead(kmem_cache_t *cachep) 513static int obj_offset(struct kmem_cache *cachep)
508{ 514{
509 return cachep->dbghead; 515 return cachep->obj_offset;
510} 516}
511 517
512static int obj_reallen(kmem_cache_t *cachep) 518static int obj_size(struct kmem_cache *cachep)
513{ 519{
514 return cachep->reallen; 520 return cachep->obj_size;
515} 521}
516 522
517static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp) 523static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
518{ 524{
519 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 525 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
520 return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD); 526 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
521} 527}
522 528
523static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) 529static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
524{ 530{
525 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 531 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
526 if (cachep->flags & SLAB_STORE_USER) 532 if (cachep->flags & SLAB_STORE_USER)
527 return (unsigned long *)(objp + cachep->objsize - 533 return (unsigned long *)(objp + cachep->buffer_size -
528 2 * BYTES_PER_WORD); 534 2 * BYTES_PER_WORD);
529 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); 535 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
530} 536}
531 537
532static void **dbg_userword(kmem_cache_t *cachep, void *objp) 538static void **dbg_userword(struct kmem_cache *cachep, void *objp)
533{ 539{
534 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 540 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
535 return (void **)(objp + cachep->objsize - BYTES_PER_WORD); 541 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
536} 542}
537 543
538#else 544#else
539 545
540#define obj_dbghead(x) 0 546#define obj_offset(x) 0
541#define obj_reallen(cachep) (cachep->objsize) 547#define obj_size(cachep) (cachep->buffer_size)
542#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 548#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
543#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 549#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
544#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 550#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -591,6 +597,18 @@ static inline struct slab *page_get_slab(struct page *page)
591 return (struct slab *)page->lru.prev; 597 return (struct slab *)page->lru.prev;
592} 598}
593 599
600static inline struct kmem_cache *virt_to_cache(const void *obj)
601{
602 struct page *page = virt_to_page(obj);
603 return page_get_cache(page);
604}
605
606static inline struct slab *virt_to_slab(const void *obj)
607{
608 struct page *page = virt_to_page(obj);
609 return page_get_slab(page);
610}
611
594/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 612/* These are the default caches for kmalloc. Custom caches can have other sizes. */
595struct cache_sizes malloc_sizes[] = { 613struct cache_sizes malloc_sizes[] = {
596#define CACHE(x) { .cs_size = (x) }, 614#define CACHE(x) { .cs_size = (x) },
@@ -619,16 +637,16 @@ static struct arraycache_init initarray_generic =
619 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 637 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
620 638
621/* internal cache of cache description objs */ 639/* internal cache of cache description objs */
622static kmem_cache_t cache_cache = { 640static struct kmem_cache cache_cache = {
623 .batchcount = 1, 641 .batchcount = 1,
624 .limit = BOOT_CPUCACHE_ENTRIES, 642 .limit = BOOT_CPUCACHE_ENTRIES,
625 .shared = 1, 643 .shared = 1,
626 .objsize = sizeof(kmem_cache_t), 644 .buffer_size = sizeof(struct kmem_cache),
627 .flags = SLAB_NO_REAP, 645 .flags = SLAB_NO_REAP,
628 .spinlock = SPIN_LOCK_UNLOCKED, 646 .spinlock = SPIN_LOCK_UNLOCKED,
629 .name = "kmem_cache", 647 .name = "kmem_cache",
630#if DEBUG 648#if DEBUG
631 .reallen = sizeof(kmem_cache_t), 649 .obj_size = sizeof(struct kmem_cache),
632#endif 650#endif
633}; 651};
634 652
@@ -657,17 +675,17 @@ static enum {
657 675
658static DEFINE_PER_CPU(struct work_struct, reap_work); 676static DEFINE_PER_CPU(struct work_struct, reap_work);
659 677
660static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); 678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
661static void enable_cpucache(kmem_cache_t *cachep); 679static void enable_cpucache(struct kmem_cache *cachep);
662static void cache_reap(void *unused); 680static void cache_reap(void *unused);
663static int __node_shrink(kmem_cache_t *cachep, int node); 681static int __node_shrink(struct kmem_cache *cachep, int node);
664 682
665static inline struct array_cache *ac_data(kmem_cache_t *cachep) 683static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
666{ 684{
667 return cachep->array[smp_processor_id()]; 685 return cachep->array[smp_processor_id()];
668} 686}
669 687
670static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) 688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
671{ 689{
672 struct cache_sizes *csizep = malloc_sizes; 690 struct cache_sizes *csizep = malloc_sizes;
673 691
@@ -691,43 +709,80 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
691 return csizep->cs_cachep; 709 return csizep->cs_cachep;
692} 710}
693 711
694kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 712struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
695{ 713{
696 return __find_general_cachep(size, gfpflags); 714 return __find_general_cachep(size, gfpflags);
697} 715}
698EXPORT_SYMBOL(kmem_find_general_cachep); 716EXPORT_SYMBOL(kmem_find_general_cachep);
699 717
700/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 718static size_t slab_mgmt_size(size_t nr_objs, size_t align)
701static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
702 int flags, size_t *left_over, unsigned int *num)
703{ 719{
704 int i; 720 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
705 size_t wastage = PAGE_SIZE << gfporder; 721}
706 size_t extra = 0;
707 size_t base = 0;
708 722
709 if (!(flags & CFLGS_OFF_SLAB)) { 723/* Calculate the number of objects and left-over bytes for a given
710 base = sizeof(struct slab); 724 buffer size. */
711 extra = sizeof(kmem_bufctl_t); 725static void cache_estimate(unsigned long gfporder, size_t buffer_size,
712 } 726 size_t align, int flags, size_t *left_over,
713 i = 0; 727 unsigned int *num)
714 while (i * size + ALIGN(base + i * extra, align) <= wastage) 728{
715 i++; 729 int nr_objs;
716 if (i > 0) 730 size_t mgmt_size;
717 i--; 731 size_t slab_size = PAGE_SIZE << gfporder;
732
733 /*
734 * The slab management structure can be either off the slab or
735 * on it. For the latter case, the memory allocated for a
736 * slab is used for:
737 *
738 * - The struct slab
739 * - One kmem_bufctl_t for each object
740 * - Padding to respect alignment of @align
741 * - @buffer_size bytes for each object
742 *
743 * If the slab management structure is off the slab, then the
744 * alignment will already be calculated into the size. Because
745 * the slabs are all pages aligned, the objects will be at the
746 * correct alignment when allocated.
747 */
748 if (flags & CFLGS_OFF_SLAB) {
749 mgmt_size = 0;
750 nr_objs = slab_size / buffer_size;
751
752 if (nr_objs > SLAB_LIMIT)
753 nr_objs = SLAB_LIMIT;
754 } else {
755 /*
756 * Ignore padding for the initial guess. The padding
757 * is at most @align-1 bytes, and @buffer_size is at
758 * least @align. In the worst case, this result will
759 * be one greater than the number of objects that fit
760 * into the memory allocation when taking the padding
761 * into account.
762 */
763 nr_objs = (slab_size - sizeof(struct slab)) /
764 (buffer_size + sizeof(kmem_bufctl_t));
765
766 /*
767 * This calculated number will be either the right
768 * amount, or one greater than what we want.
769 */
770 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
771 > slab_size)
772 nr_objs--;
718 773
719 if (i > SLAB_LIMIT) 774 if (nr_objs > SLAB_LIMIT)
720 i = SLAB_LIMIT; 775 nr_objs = SLAB_LIMIT;
721 776
722 *num = i; 777 mgmt_size = slab_mgmt_size(nr_objs, align);
723 wastage -= i * size; 778 }
724 wastage -= ALIGN(base + i * extra, align); 779 *num = nr_objs;
725 *left_over = wastage; 780 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
726} 781}
727 782
728#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
729 784
730static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
731{ 786{
732 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 787 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
733 function, cachep->name, msg); 788 function, cachep->name, msg);
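The comment block in the rewritten cache_estimate() above spells out the on-slab layout: one struct slab, one kmem_bufctl_t per object, alignment padding, then the objects themselves. Below is a minimal standalone sketch of that arithmetic for the on-slab case only; the struct fields, the ALIGN_UP macro and the example numbers are assumptions for illustration rather than the kernel's definitions, and the SLAB_LIMIT clamp and the off-slab branch are omitted.

#include <stdio.h>
#include <stddef.h>

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((size_t)(a) - 1))

/* stand-ins for struct slab and kmem_bufctl_t, sized for a 64-bit build */
struct fake_slab { void *list[2]; unsigned long colouroff; void *s_mem; unsigned int inuse; unsigned int free; };
typedef unsigned int fake_bufctl_t;

static void estimate(size_t slab_size, size_t buffer_size, size_t align,
                     unsigned int *num, size_t *left_over)
{
        /* first guess ignores the alignment padding of the management area */
        size_t nr = (slab_size - sizeof(struct fake_slab)) /
                    (buffer_size + sizeof(fake_bufctl_t));
        size_t mgmt = ALIGN_UP(sizeof(struct fake_slab) + nr * sizeof(fake_bufctl_t), align);

        /* the guess can be one too high once the padding is added back in */
        if (mgmt + nr * buffer_size > slab_size) {
                nr--;
                mgmt = ALIGN_UP(sizeof(struct fake_slab) + nr * sizeof(fake_bufctl_t), align);
        }
        *num = nr;
        *left_over = slab_size - nr * buffer_size - mgmt;
}

int main(void)
{
        unsigned int num;
        size_t left;

        estimate(4096, 256, 32, &num, &left);
        printf("objects per 4K slab: %u, left over: %zu bytes\n", num, left);
        return 0;
}

Built as ordinary userspace C this prints 15 objects and 128 left-over bytes for a 256-byte object on a 4 KiB slab, the kind of pair the real function hands back through *num and *left_over.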
@@ -774,9 +829,9 @@ static struct array_cache *alloc_arraycache(int node, int entries,
774} 829}
775 830
776#ifdef CONFIG_NUMA 831#ifdef CONFIG_NUMA
777static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); 832static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
778 833
779static inline struct array_cache **alloc_alien_cache(int node, int limit) 834static struct array_cache **alloc_alien_cache(int node, int limit)
780{ 835{
781 struct array_cache **ac_ptr; 836 struct array_cache **ac_ptr;
782 int memsize = sizeof(void *) * MAX_NUMNODES; 837 int memsize = sizeof(void *) * MAX_NUMNODES;
@@ -803,7 +858,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
803 return ac_ptr; 858 return ac_ptr;
804} 859}
805 860
806static inline void free_alien_cache(struct array_cache **ac_ptr) 861static void free_alien_cache(struct array_cache **ac_ptr)
807{ 862{
808 int i; 863 int i;
809 864
@@ -816,8 +871,8 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
816 kfree(ac_ptr); 871 kfree(ac_ptr);
817} 872}
818 873
819static inline void __drain_alien_cache(kmem_cache_t *cachep, 874static void __drain_alien_cache(struct kmem_cache *cachep,
820 struct array_cache *ac, int node) 875 struct array_cache *ac, int node)
821{ 876{
822 struct kmem_list3 *rl3 = cachep->nodelists[node]; 877 struct kmem_list3 *rl3 = cachep->nodelists[node];
823 878
@@ -829,14 +884,14 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep,
829 } 884 }
830} 885}
831 886
832static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 887static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
833{ 888{
834 int i = 0; 889 int i = 0;
835 struct array_cache *ac; 890 struct array_cache *ac;
836 unsigned long flags; 891 unsigned long flags;
837 892
838 for_each_online_node(i) { 893 for_each_online_node(i) {
839 ac = l3->alien[i]; 894 ac = alien[i];
840 if (ac) { 895 if (ac) {
841 spin_lock_irqsave(&ac->lock, flags); 896 spin_lock_irqsave(&ac->lock, flags);
842 __drain_alien_cache(cachep, ac, i); 897 __drain_alien_cache(cachep, ac, i);
@@ -845,16 +900,25 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
845 } 900 }
846} 901}
847#else 902#else
848#define alloc_alien_cache(node, limit) do { } while (0) 903
849#define free_alien_cache(ac_ptr) do { } while (0) 904#define drain_alien_cache(cachep, alien) do { } while (0)
850#define drain_alien_cache(cachep, l3) do { } while (0) 905
906static inline struct array_cache **alloc_alien_cache(int node, int limit)
907{
908 return (struct array_cache **) 0x01020304ul;
909}
910
911static inline void free_alien_cache(struct array_cache **ac_ptr)
912{
913}
914
851#endif 915#endif
852 916
853static int __devinit cpuup_callback(struct notifier_block *nfb, 917static int __devinit cpuup_callback(struct notifier_block *nfb,
854 unsigned long action, void *hcpu) 918 unsigned long action, void *hcpu)
855{ 919{
856 long cpu = (long)hcpu; 920 long cpu = (long)hcpu;
857 kmem_cache_t *cachep; 921 struct kmem_cache *cachep;
858 struct kmem_list3 *l3 = NULL; 922 struct kmem_list3 *l3 = NULL;
859 int node = cpu_to_node(cpu); 923 int node = cpu_to_node(cpu);
860 int memsize = sizeof(struct kmem_list3); 924 int memsize = sizeof(struct kmem_list3);
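On !CONFIG_NUMA builds the new alloc_alien_cache() stub above returns 0x01020304 rather than NULL, so the later "if (!alien) goto bad" check does not mistake the missing alien caches for an allocation failure, while any accidental dereference would fault on an obviously bogus address. A tiny sketch of that sentinel idea, with made-up names:

#include <stdio.h>

#define NO_ALIEN_SENTINEL ((void *)0x01020304ul)

static void *alloc_alien_stub(void)
{
        return NO_ALIEN_SENTINEL;   /* never dereferenced on non-NUMA builds */
}

int main(void)
{
        void *alien = alloc_alien_stub();

        if (!alien)
                puts("would be treated as an allocation failure");
        else
                puts("non-NULL sentinel: caller proceeds, nothing to free");
        return 0;
}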
@@ -881,6 +945,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
881 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 945 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
882 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 946 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
883 947
948 /*
949 * The l3s don't come and go as CPUs come and
950 * go. cache_chain_mutex is sufficient
951 * protection here.
952 */
884 cachep->nodelists[node] = l3; 953 cachep->nodelists[node] = l3;
885 } 954 }
886 955
@@ -895,26 +964,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
895 & array cache's */ 964 & array cache's */
896 list_for_each_entry(cachep, &cache_chain, next) { 965 list_for_each_entry(cachep, &cache_chain, next) {
897 struct array_cache *nc; 966 struct array_cache *nc;
967 struct array_cache *shared;
968 struct array_cache **alien;
898 969
899 nc = alloc_arraycache(node, cachep->limit, 970 nc = alloc_arraycache(node, cachep->limit,
900 cachep->batchcount); 971 cachep->batchcount);
901 if (!nc) 972 if (!nc)
902 goto bad; 973 goto bad;
974 shared = alloc_arraycache(node,
975 cachep->shared * cachep->batchcount,
976 0xbaadf00d);
977 if (!shared)
978 goto bad;
979
980 alien = alloc_alien_cache(node, cachep->limit);
981 if (!alien)
982 goto bad;
903 cachep->array[cpu] = nc; 983 cachep->array[cpu] = nc;
904 984
905 l3 = cachep->nodelists[node]; 985 l3 = cachep->nodelists[node];
906 BUG_ON(!l3); 986 BUG_ON(!l3);
907 if (!l3->shared) {
908 if (!(nc = alloc_arraycache(node,
909 cachep->shared *
910 cachep->batchcount,
911 0xbaadf00d)))
912 goto bad;
913 987
914 /* we are serialised from CPU_DEAD or 988 spin_lock_irq(&l3->list_lock);
915 CPU_UP_CANCELLED by the cpucontrol lock */ 989 if (!l3->shared) {
916 l3->shared = nc; 990 /*
991 * We are serialised from CPU_DEAD or
992 * CPU_UP_CANCELLED by the cpucontrol lock
993 */
994 l3->shared = shared;
995 shared = NULL;
917 } 996 }
997#ifdef CONFIG_NUMA
998 if (!l3->alien) {
999 l3->alien = alien;
1000 alien = NULL;
1001 }
1002#endif
1003 spin_unlock_irq(&l3->list_lock);
1004
1005 kfree(shared);
1006 free_alien_cache(alien);
918 } 1007 }
919 mutex_unlock(&cache_chain_mutex); 1008 mutex_unlock(&cache_chain_mutex);
920 break; 1009 break;
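The reworked CPU_UP_PREPARE path above allocates the per-cpu, shared and alien arrays first, publishes them under l3->list_lock only if the slots are still empty, and then frees whatever was not installed after dropping the lock, so the allocator is never called while the list lock is held. A minimal userspace sketch of that allocate-then-publish pattern, using pthreads and invented names:

#include <pthread.h>
#include <stdlib.h>

struct node_state {
        pthread_mutex_t lock;
        void *shared;                        /* shared per-node array, or NULL */
};

static int install_shared(struct node_state *ns, size_t size)
{
        void *candidate = malloc(size);      /* allocate before taking the lock */
        if (!candidate)
                return -1;

        pthread_mutex_lock(&ns->lock);
        if (!ns->shared) {                   /* slot still empty: publish ours */
                ns->shared = candidate;
                candidate = NULL;
        }
        pthread_mutex_unlock(&ns->lock);

        free(candidate);                     /* no-op if we published it */
        return 0;
}

int main(void)
{
        struct node_state ns = { PTHREAD_MUTEX_INITIALIZER, NULL };

        install_shared(&ns, 128);            /* first caller publishes */
        install_shared(&ns, 128);            /* second caller frees its candidate */
        free(ns.shared);
        return 0;
}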
@@ -923,25 +1012,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
923 break; 1012 break;
924#ifdef CONFIG_HOTPLUG_CPU 1013#ifdef CONFIG_HOTPLUG_CPU
925 case CPU_DEAD: 1014 case CPU_DEAD:
1015 /*
1016 * Even if all the cpus of a node are down, we don't free the
1017 * kmem_list3 of any cache. This to avoid a race between
1018 * cpu_down, and a kmalloc allocation from another cpu for
1019 * memory from the node of the cpu going down. The list3
1020 * structure is usually allocated from kmem_cache_create() and
1021 * gets destroyed at kmem_cache_destroy().
1022 */
926 /* fall thru */ 1023 /* fall thru */
927 case CPU_UP_CANCELED: 1024 case CPU_UP_CANCELED:
928 mutex_lock(&cache_chain_mutex); 1025 mutex_lock(&cache_chain_mutex);
929 1026
930 list_for_each_entry(cachep, &cache_chain, next) { 1027 list_for_each_entry(cachep, &cache_chain, next) {
931 struct array_cache *nc; 1028 struct array_cache *nc;
1029 struct array_cache *shared;
1030 struct array_cache **alien;
932 cpumask_t mask; 1031 cpumask_t mask;
933 1032
934 mask = node_to_cpumask(node); 1033 mask = node_to_cpumask(node);
935 spin_lock_irq(&cachep->spinlock);
936 /* cpu is dead; no one can alloc from it. */ 1034 /* cpu is dead; no one can alloc from it. */
937 nc = cachep->array[cpu]; 1035 nc = cachep->array[cpu];
938 cachep->array[cpu] = NULL; 1036 cachep->array[cpu] = NULL;
939 l3 = cachep->nodelists[node]; 1037 l3 = cachep->nodelists[node];
940 1038
941 if (!l3) 1039 if (!l3)
942 goto unlock_cache; 1040 goto free_array_cache;
943 1041
944 spin_lock(&l3->list_lock); 1042 spin_lock_irq(&l3->list_lock);
945 1043
946 /* Free limit for this kmem_list3 */ 1044 /* Free limit for this kmem_list3 */
947 l3->free_limit -= cachep->batchcount; 1045 l3->free_limit -= cachep->batchcount;
@@ -949,34 +1047,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
949 free_block(cachep, nc->entry, nc->avail, node); 1047 free_block(cachep, nc->entry, nc->avail, node);
950 1048
951 if (!cpus_empty(mask)) { 1049 if (!cpus_empty(mask)) {
952 spin_unlock(&l3->list_lock); 1050 spin_unlock_irq(&l3->list_lock);
953 goto unlock_cache; 1051 goto free_array_cache;
954 } 1052 }
955 1053
956 if (l3->shared) { 1054 shared = l3->shared;
1055 if (shared) {
957 free_block(cachep, l3->shared->entry, 1056 free_block(cachep, l3->shared->entry,
958 l3->shared->avail, node); 1057 l3->shared->avail, node);
959 kfree(l3->shared);
960 l3->shared = NULL; 1058 l3->shared = NULL;
961 } 1059 }
962 if (l3->alien) {
963 drain_alien_cache(cachep, l3);
964 free_alien_cache(l3->alien);
965 l3->alien = NULL;
966 }
967 1060
968 /* free slabs belonging to this node */ 1061 alien = l3->alien;
969 if (__node_shrink(cachep, node)) { 1062 l3->alien = NULL;
970 cachep->nodelists[node] = NULL; 1063
971 spin_unlock(&l3->list_lock); 1064 spin_unlock_irq(&l3->list_lock);
972 kfree(l3); 1065
973 } else { 1066 kfree(shared);
974 spin_unlock(&l3->list_lock); 1067 if (alien) {
1068 drain_alien_cache(cachep, alien);
1069 free_alien_cache(alien);
975 } 1070 }
976 unlock_cache: 1071free_array_cache:
977 spin_unlock_irq(&cachep->spinlock);
978 kfree(nc); 1072 kfree(nc);
979 } 1073 }
1074 /*
1075 * In the previous loop, all the objects were freed to
1076 * the respective cache's slabs, now we can go ahead and
1077 * shrink each nodelist to its limit.
1078 */
1079 list_for_each_entry(cachep, &cache_chain, next) {
1080 l3 = cachep->nodelists[node];
1081 if (!l3)
1082 continue;
1083 spin_lock_irq(&l3->list_lock);
1084 /* free slabs belonging to this node */
1085 __node_shrink(cachep, node);
1086 spin_unlock_irq(&l3->list_lock);
1087 }
980 mutex_unlock(&cache_chain_mutex); 1088 mutex_unlock(&cache_chain_mutex);
981 break; 1089 break;
982#endif 1090#endif
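The CPU_DEAD path now does the inverse: it detaches l3->shared and l3->alien while holding the list lock, drops the lock, and only then frees and drains them, followed by a second pass over the caches to shrink each node list. A small userspace stand-in for the detach-under-lock, free-after-unlock step (pthreads, illustrative names only):

#include <pthread.h>
#include <stdlib.h>

struct node_state {
        pthread_mutex_t lock;
        void *shared;
        void *alien;
};

static void teardown_node(struct node_state *ns)
{
        void *shared, *alien;

        pthread_mutex_lock(&ns->lock);
        shared = ns->shared;          /* detach under the lock ...            */
        alien  = ns->alien;
        ns->shared = NULL;
        ns->alien  = NULL;
        pthread_mutex_unlock(&ns->lock);

        free(shared);                 /* ... free only after dropping it      */
        free(alien);
}

int main(void)
{
        struct node_state ns = { PTHREAD_MUTEX_INITIALIZER,
                                 malloc(64), malloc(64) };
        teardown_node(&ns);
        return 0;
}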
@@ -992,7 +1100,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
992/* 1100/*
993 * swap the static kmem_list3 with kmalloced memory 1101 * swap the static kmem_list3 with kmalloced memory
994 */ 1102 */
995static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) 1103static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
996{ 1104{
997 struct kmem_list3 *ptr; 1105 struct kmem_list3 *ptr;
998 1106
@@ -1032,14 +1140,14 @@ void __init kmem_cache_init(void)
1032 1140
1033 /* Bootstrap is tricky, because several objects are allocated 1141 /* Bootstrap is tricky, because several objects are allocated
1034 * from caches that do not exist yet: 1142 * from caches that do not exist yet:
1035 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 1143 * 1) initialize the cache_cache cache: it contains the struct kmem_cache
1036 * structures of all caches, except cache_cache itself: cache_cache 1144 * structures of all caches, except cache_cache itself: cache_cache
1037 * is statically allocated. 1145 * is statically allocated.
1038 * Initially an __init data area is used for the head array and the 1146 * Initially an __init data area is used for the head array and the
1039 * kmem_list3 structures, it's replaced with a kmalloc allocated 1147 * kmem_list3 structures, it's replaced with a kmalloc allocated
1040 * array at the end of the bootstrap. 1148 * array at the end of the bootstrap.
1041 * 2) Create the first kmalloc cache. 1149 * 2) Create the first kmalloc cache.
1042 * The kmem_cache_t for the new cache is allocated normally. 1150 * The struct kmem_cache for the new cache is allocated normally.
1043 * An __init data area is used for the head array. 1151 * An __init data area is used for the head array.
1044 * 3) Create the remaining kmalloc caches, with minimally sized 1152 * 3) Create the remaining kmalloc caches, with minimally sized
1045 * head arrays. 1153 * head arrays.
@@ -1057,15 +1165,14 @@ void __init kmem_cache_init(void)
1057 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1165 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1058 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1166 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1059 1167
1060 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1168 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
1061 1169
1062 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1170 cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
1063 &left_over, &cache_cache.num); 1171 &left_over, &cache_cache.num);
1064 if (!cache_cache.num) 1172 if (!cache_cache.num)
1065 BUG(); 1173 BUG();
1066 1174
1067 cache_cache.colour = left_over / cache_cache.colour_off; 1175 cache_cache.colour = left_over / cache_cache.colour_off;
1068 cache_cache.colour_next = 0;
1069 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1176 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1070 sizeof(struct slab), cache_line_size()); 1177 sizeof(struct slab), cache_line_size());
1071 1178
@@ -1132,8 +1239,8 @@ void __init kmem_cache_init(void)
1132 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1239 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1133 1240
1134 local_irq_disable(); 1241 local_irq_disable();
1135 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1242 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1136 memcpy(ptr, ac_data(&cache_cache), 1243 memcpy(ptr, cpu_cache_get(&cache_cache),
1137 sizeof(struct arraycache_init)); 1244 sizeof(struct arraycache_init));
1138 cache_cache.array[smp_processor_id()] = ptr; 1245 cache_cache.array[smp_processor_id()] = ptr;
1139 local_irq_enable(); 1246 local_irq_enable();
@@ -1141,9 +1248,9 @@ void __init kmem_cache_init(void)
1141 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1248 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1142 1249
1143 local_irq_disable(); 1250 local_irq_disable();
1144 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1251 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1145 != &initarray_generic.cache); 1252 != &initarray_generic.cache);
1146 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1253 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1147 sizeof(struct arraycache_init)); 1254 sizeof(struct arraycache_init));
1148 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1255 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1149 ptr; 1256 ptr;
@@ -1170,7 +1277,7 @@ void __init kmem_cache_init(void)
1170 1277
1171 /* 6) resize the head arrays to their final sizes */ 1278 /* 6) resize the head arrays to their final sizes */
1172 { 1279 {
1173 kmem_cache_t *cachep; 1280 struct kmem_cache *cachep;
1174 mutex_lock(&cache_chain_mutex); 1281 mutex_lock(&cache_chain_mutex);
1175 list_for_each_entry(cachep, &cache_chain, next) 1282 list_for_each_entry(cachep, &cache_chain, next)
1176 enable_cpucache(cachep); 1283 enable_cpucache(cachep);
@@ -1181,7 +1288,7 @@ void __init kmem_cache_init(void)
1181 g_cpucache_up = FULL; 1288 g_cpucache_up = FULL;
1182 1289
1183 /* Register a cpu startup notifier callback 1290 /* Register a cpu startup notifier callback
1184 * that initializes ac_data for all new cpus 1291 * that initializes cpu_cache_get for all new cpus
1185 */ 1292 */
1186 register_cpu_notifier(&cpucache_notifier); 1293 register_cpu_notifier(&cpucache_notifier);
1187 1294
@@ -1213,7 +1320,7 @@ __initcall(cpucache_init);
1213 * did not request dmaable memory, we might get it, but that 1320 * did not request dmaable memory, we might get it, but that
1214 * would be relatively rare and ignorable. 1321 * would be relatively rare and ignorable.
1215 */ 1322 */
1216static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) 1323static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1217{ 1324{
1218 struct page *page; 1325 struct page *page;
1219 void *addr; 1326 void *addr;
@@ -1239,7 +1346,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1239/* 1346/*
1240 * Interface to system's page release. 1347 * Interface to system's page release.
1241 */ 1348 */
1242static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1349static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1243{ 1350{
1244 unsigned long i = (1 << cachep->gfporder); 1351 unsigned long i = (1 << cachep->gfporder);
1245 struct page *page = virt_to_page(addr); 1352 struct page *page = virt_to_page(addr);
@@ -1261,7 +1368,7 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1261static void kmem_rcu_free(struct rcu_head *head) 1368static void kmem_rcu_free(struct rcu_head *head)
1262{ 1369{
1263 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1370 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1264 kmem_cache_t *cachep = slab_rcu->cachep; 1371 struct kmem_cache *cachep = slab_rcu->cachep;
1265 1372
1266 kmem_freepages(cachep, slab_rcu->addr); 1373 kmem_freepages(cachep, slab_rcu->addr);
1267 if (OFF_SLAB(cachep)) 1374 if (OFF_SLAB(cachep))
@@ -1271,12 +1378,12 @@ static void kmem_rcu_free(struct rcu_head *head)
1271#if DEBUG 1378#if DEBUG
1272 1379
1273#ifdef CONFIG_DEBUG_PAGEALLOC 1380#ifdef CONFIG_DEBUG_PAGEALLOC
1274static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1381static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1275 unsigned long caller) 1382 unsigned long caller)
1276{ 1383{
1277 int size = obj_reallen(cachep); 1384 int size = obj_size(cachep);
1278 1385
1279 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; 1386 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1280 1387
1281 if (size < 5 * sizeof(unsigned long)) 1388 if (size < 5 * sizeof(unsigned long))
1282 return; 1389 return;
@@ -1304,10 +1411,10 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1304} 1411}
1305#endif 1412#endif
1306 1413
1307static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1414static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1308{ 1415{
1309 int size = obj_reallen(cachep); 1416 int size = obj_size(cachep);
1310 addr = &((char *)addr)[obj_dbghead(cachep)]; 1417 addr = &((char *)addr)[obj_offset(cachep)];
1311 1418
1312 memset(addr, val, size); 1419 memset(addr, val, size);
1313 *(unsigned char *)(addr + size - 1) = POISON_END; 1420 *(unsigned char *)(addr + size - 1) = POISON_END;
@@ -1326,7 +1433,7 @@ static void dump_line(char *data, int offset, int limit)
1326 1433
1327#if DEBUG 1434#if DEBUG
1328 1435
1329static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) 1436static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1330{ 1437{
1331 int i, size; 1438 int i, size;
1332 char *realobj; 1439 char *realobj;
@@ -1344,8 +1451,8 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1344 (unsigned long)*dbg_userword(cachep, objp)); 1451 (unsigned long)*dbg_userword(cachep, objp));
1345 printk("\n"); 1452 printk("\n");
1346 } 1453 }
1347 realobj = (char *)objp + obj_dbghead(cachep); 1454 realobj = (char *)objp + obj_offset(cachep);
1348 size = obj_reallen(cachep); 1455 size = obj_size(cachep);
1349 for (i = 0; i < size && lines; i += 16, lines--) { 1456 for (i = 0; i < size && lines; i += 16, lines--) {
1350 int limit; 1457 int limit;
1351 limit = 16; 1458 limit = 16;
@@ -1355,14 +1462,14 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1355 } 1462 }
1356} 1463}
1357 1464
1358static void check_poison_obj(kmem_cache_t *cachep, void *objp) 1465static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1359{ 1466{
1360 char *realobj; 1467 char *realobj;
1361 int size, i; 1468 int size, i;
1362 int lines = 0; 1469 int lines = 0;
1363 1470
1364 realobj = (char *)objp + obj_dbghead(cachep); 1471 realobj = (char *)objp + obj_offset(cachep);
1365 size = obj_reallen(cachep); 1472 size = obj_size(cachep);
1366 1473
1367 for (i = 0; i < size; i++) { 1474 for (i = 0; i < size; i++) {
1368 char exp = POISON_FREE; 1475 char exp = POISON_FREE;
@@ -1395,20 +1502,20 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1395 /* Print some data about the neighboring objects, if they 1502 /* Print some data about the neighboring objects, if they
1396 * exist: 1503 * exist:
1397 */ 1504 */
1398 struct slab *slabp = page_get_slab(virt_to_page(objp)); 1505 struct slab *slabp = virt_to_slab(objp);
1399 int objnr; 1506 int objnr;
1400 1507
1401 objnr = (objp - slabp->s_mem) / cachep->objsize; 1508 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
1402 if (objnr) { 1509 if (objnr) {
1403 objp = slabp->s_mem + (objnr - 1) * cachep->objsize; 1510 objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
1404 realobj = (char *)objp + obj_dbghead(cachep); 1511 realobj = (char *)objp + obj_offset(cachep);
1405 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1512 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1406 realobj, size); 1513 realobj, size);
1407 print_objinfo(cachep, objp, 2); 1514 print_objinfo(cachep, objp, 2);
1408 } 1515 }
1409 if (objnr + 1 < cachep->num) { 1516 if (objnr + 1 < cachep->num) {
1410 objp = slabp->s_mem + (objnr + 1) * cachep->objsize; 1517 objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
1411 realobj = (char *)objp + obj_dbghead(cachep); 1518 realobj = (char *)objp + obj_offset(cachep);
1412 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1519 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1413 realobj, size); 1520 realobj, size);
1414 print_objinfo(cachep, objp, 2); 1521 print_objinfo(cachep, objp, 2);
@@ -1417,25 +1524,23 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1417} 1524}
1418#endif 1525#endif
1419 1526
1420/* Destroy all the objs in a slab, and release the mem back to the system. 1527#if DEBUG
1421 * Before calling the slab must have been unlinked from the cache. 1528/**
1422 * The cache-lock is not held/needed. 1529 * slab_destroy_objs - call the registered destructor for each object in
1530 * a slab that is to be destroyed.
1423 */ 1531 */
1424static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) 1532static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1425{ 1533{
1426 void *addr = slabp->s_mem - slabp->colouroff;
1427
1428#if DEBUG
1429 int i; 1534 int i;
1430 for (i = 0; i < cachep->num; i++) { 1535 for (i = 0; i < cachep->num; i++) {
1431 void *objp = slabp->s_mem + cachep->objsize * i; 1536 void *objp = slabp->s_mem + cachep->buffer_size * i;
1432 1537
1433 if (cachep->flags & SLAB_POISON) { 1538 if (cachep->flags & SLAB_POISON) {
1434#ifdef CONFIG_DEBUG_PAGEALLOC 1539#ifdef CONFIG_DEBUG_PAGEALLOC
1435 if ((cachep->objsize % PAGE_SIZE) == 0 1540 if ((cachep->buffer_size % PAGE_SIZE) == 0
1436 && OFF_SLAB(cachep)) 1541 && OFF_SLAB(cachep))
1437 kernel_map_pages(virt_to_page(objp), 1542 kernel_map_pages(virt_to_page(objp),
1438 cachep->objsize / PAGE_SIZE, 1543 cachep->buffer_size / PAGE_SIZE,
1439 1); 1544 1);
1440 else 1545 else
1441 check_poison_obj(cachep, objp); 1546 check_poison_obj(cachep, objp);
@@ -1452,18 +1557,32 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1452 "was overwritten"); 1557 "was overwritten");
1453 } 1558 }
1454 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1559 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1455 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); 1560 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1456 } 1561 }
1562}
1457#else 1563#else
1564static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1565{
1458 if (cachep->dtor) { 1566 if (cachep->dtor) {
1459 int i; 1567 int i;
1460 for (i = 0; i < cachep->num; i++) { 1568 for (i = 0; i < cachep->num; i++) {
1461 void *objp = slabp->s_mem + cachep->objsize * i; 1569 void *objp = slabp->s_mem + cachep->buffer_size * i;
1462 (cachep->dtor) (objp, cachep, 0); 1570 (cachep->dtor) (objp, cachep, 0);
1463 } 1571 }
1464 } 1572 }
1573}
1465#endif 1574#endif
1466 1575
1576/**
1577 * Destroy all the objs in a slab, and release the mem back to the system.
1578 * Before calling the slab must have been unlinked from the cache.
1579 * The cache-lock is not held/needed.
1580 */
1581static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1582{
1583 void *addr = slabp->s_mem - slabp->colouroff;
1584
1585 slab_destroy_objs(cachep, slabp);
1467 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1586 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1468 struct slab_rcu *slab_rcu; 1587 struct slab_rcu *slab_rcu;
1469 1588
@@ -1478,9 +1597,9 @@ static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1478 } 1597 }
1479} 1598}
1480 1599
1481/* For setting up all the kmem_list3s for cache whose objsize is same 1600/* For setting up all the kmem_list3s for cache whose buffer_size is same
1482 as size of kmem_list3. */ 1601 as size of kmem_list3. */
1483static inline void set_up_list3s(kmem_cache_t *cachep, int index) 1602static void set_up_list3s(struct kmem_cache *cachep, int index)
1484{ 1603{
1485 int node; 1604 int node;
1486 1605
@@ -1493,15 +1612,20 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1493} 1612}
1494 1613
1495/** 1614/**
1496 * calculate_slab_order - calculate size (page order) of slabs and the number 1615 * calculate_slab_order - calculate size (page order) of slabs
1497 * of objects per slab. 1616 * @cachep: pointer to the cache that is being created
1617 * @size: size of objects to be created in this cache.
1618 * @align: required alignment for the objects.
1619 * @flags: slab allocation flags
1620 *
1621 * Also calculates the number of objects per slab.
1498 * 1622 *
1499 * This could be made much more intelligent. For now, try to avoid using 1623 * This could be made much more intelligent. For now, try to avoid using
1500 * high order pages for slabs. When the gfp() functions are more friendly 1624 * high order pages for slabs. When the gfp() functions are more friendly
1501 * towards high-order requests, this should be changed. 1625 * towards high-order requests, this should be changed.
1502 */ 1626 */
1503static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, 1627static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1504 size_t align, gfp_t flags) 1628 size_t size, size_t align, unsigned long flags)
1505{ 1629{
1506 size_t left_over = 0; 1630 size_t left_over = 0;
1507 1631
@@ -1572,13 +1696,13 @@ static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1572 * cacheline. This can be beneficial if you're counting cycles as closely 1696 * cacheline. This can be beneficial if you're counting cycles as closely
1573 * as davem. 1697 * as davem.
1574 */ 1698 */
1575kmem_cache_t * 1699struct kmem_cache *
1576kmem_cache_create (const char *name, size_t size, size_t align, 1700kmem_cache_create (const char *name, size_t size, size_t align,
1577 unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), 1701 unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
1578 void (*dtor)(void*, kmem_cache_t *, unsigned long)) 1702 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1579{ 1703{
1580 size_t left_over, slab_size, ralign; 1704 size_t left_over, slab_size, ralign;
1581 kmem_cache_t *cachep = NULL; 1705 struct kmem_cache *cachep = NULL;
1582 struct list_head *p; 1706 struct list_head *p;
1583 1707
1584 /* 1708 /*
@@ -1593,10 +1717,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1593 BUG(); 1717 BUG();
1594 } 1718 }
1595 1719
1720 /*
1721 * Prevent CPUs from coming and going.
1722 * lock_cpu_hotplug() nests outside cache_chain_mutex
1723 */
1724 lock_cpu_hotplug();
1725
1596 mutex_lock(&cache_chain_mutex); 1726 mutex_lock(&cache_chain_mutex);
1597 1727
1598 list_for_each(p, &cache_chain) { 1728 list_for_each(p, &cache_chain) {
1599 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); 1729 struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1600 mm_segment_t old_fs = get_fs(); 1730 mm_segment_t old_fs = get_fs();
1601 char tmp; 1731 char tmp;
1602 int res; 1732 int res;
@@ -1611,7 +1741,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1611 set_fs(old_fs); 1741 set_fs(old_fs);
1612 if (res) { 1742 if (res) {
1613 printk("SLAB: cache with size %d has lost its name\n", 1743 printk("SLAB: cache with size %d has lost its name\n",
1614 pc->objsize); 1744 pc->buffer_size);
1615 continue; 1745 continue;
1616 } 1746 }
1617 1747
@@ -1696,20 +1826,20 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1696 align = ralign; 1826 align = ralign;
1697 1827
1698 /* Get cache's description obj. */ 1828 /* Get cache's description obj. */
1699 cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 1829 cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1700 if (!cachep) 1830 if (!cachep)
1701 goto oops; 1831 goto oops;
1702 memset(cachep, 0, sizeof(kmem_cache_t)); 1832 memset(cachep, 0, sizeof(struct kmem_cache));
1703 1833
1704#if DEBUG 1834#if DEBUG
1705 cachep->reallen = size; 1835 cachep->obj_size = size;
1706 1836
1707 if (flags & SLAB_RED_ZONE) { 1837 if (flags & SLAB_RED_ZONE) {
1708 /* redzoning only works with word aligned caches */ 1838 /* redzoning only works with word aligned caches */
1709 align = BYTES_PER_WORD; 1839 align = BYTES_PER_WORD;
1710 1840
1711 /* add space for red zone words */ 1841 /* add space for red zone words */
1712 cachep->dbghead += BYTES_PER_WORD; 1842 cachep->obj_offset += BYTES_PER_WORD;
1713 size += 2 * BYTES_PER_WORD; 1843 size += 2 * BYTES_PER_WORD;
1714 } 1844 }
1715 if (flags & SLAB_STORE_USER) { 1845 if (flags & SLAB_STORE_USER) {
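With SLAB_RED_ZONE the code above grows the managed buffer by one word on each side and bumps obj_offset so the caller-visible object starts after the leading guard word. The sketch below lays out such a buffer in plain userspace C; the 40-byte object, the guard pattern and the word size are illustrative assumptions, not the kernel's values.

#include <stdio.h>
#include <string.h>

#define WORD       sizeof(unsigned long)
#define GUARD_PAT  0x5A2CF071UL          /* arbitrary recognisable pattern */

int main(void)
{
        size_t obj_size = 40;                    /* what the caller asked for  */
        size_t obj_offset = WORD;                /* skip the leading red zone  */
        size_t buffer_size = obj_size + 2 * WORD;

        unsigned char buf[64] = { 0 };
        unsigned long red = GUARD_PAT;

        memcpy(buf, &red, WORD);                           /* front guard word */
        memcpy(buf + obj_offset + obj_size, &red, WORD);   /* rear guard word  */

        printf("buffer %zu bytes, object at offset %zu, %zu bytes long\n",
               buffer_size, obj_offset, obj_size);
        return 0;
}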
@@ -1722,8 +1852,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1722 } 1852 }
1723#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1853#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1724 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 1854 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1725 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1855 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
1726 cachep->dbghead += PAGE_SIZE - size; 1856 cachep->obj_offset += PAGE_SIZE - size;
1727 size = PAGE_SIZE; 1857 size = PAGE_SIZE;
1728 } 1858 }
1729#endif 1859#endif
@@ -1786,7 +1916,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1786 if (flags & SLAB_CACHE_DMA) 1916 if (flags & SLAB_CACHE_DMA)
1787 cachep->gfpflags |= GFP_DMA; 1917 cachep->gfpflags |= GFP_DMA;
1788 spin_lock_init(&cachep->spinlock); 1918 spin_lock_init(&cachep->spinlock);
1789 cachep->objsize = size; 1919 cachep->buffer_size = size;
1790 1920
1791 if (flags & CFLGS_OFF_SLAB) 1921 if (flags & CFLGS_OFF_SLAB)
1792 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 1922 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -1794,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1794 cachep->dtor = dtor; 1924 cachep->dtor = dtor;
1795 cachep->name = name; 1925 cachep->name = name;
1796 1926
1797 /* Don't let CPUs to come and go */
1798 lock_cpu_hotplug();
1799 1927
1800 if (g_cpucache_up == FULL) { 1928 if (g_cpucache_up == FULL) {
1801 enable_cpucache(cachep); 1929 enable_cpucache(cachep);
@@ -1843,23 +1971,23 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1843 jiffies + REAPTIMEOUT_LIST3 + 1971 jiffies + REAPTIMEOUT_LIST3 +
1844 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1972 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1845 1973
1846 BUG_ON(!ac_data(cachep)); 1974 BUG_ON(!cpu_cache_get(cachep));
1847 ac_data(cachep)->avail = 0; 1975 cpu_cache_get(cachep)->avail = 0;
1848 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1976 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1849 ac_data(cachep)->batchcount = 1; 1977 cpu_cache_get(cachep)->batchcount = 1;
1850 ac_data(cachep)->touched = 0; 1978 cpu_cache_get(cachep)->touched = 0;
1851 cachep->batchcount = 1; 1979 cachep->batchcount = 1;
1852 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1980 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1853 } 1981 }
1854 1982
1855 /* cache setup completed, link it into the list */ 1983 /* cache setup completed, link it into the list */
1856 list_add(&cachep->next, &cache_chain); 1984 list_add(&cachep->next, &cache_chain);
1857 unlock_cpu_hotplug();
1858 oops: 1985 oops:
1859 if (!cachep && (flags & SLAB_PANIC)) 1986 if (!cachep && (flags & SLAB_PANIC))
1860 panic("kmem_cache_create(): failed to create slab `%s'\n", 1987 panic("kmem_cache_create(): failed to create slab `%s'\n",
1861 name); 1988 name);
1862 mutex_unlock(&cache_chain_mutex); 1989 mutex_unlock(&cache_chain_mutex);
1990 unlock_cpu_hotplug();
1863 return cachep; 1991 return cachep;
1864} 1992}
1865EXPORT_SYMBOL(kmem_cache_create); 1993EXPORT_SYMBOL(kmem_cache_create);
@@ -1875,7 +2003,7 @@ static void check_irq_on(void)
1875 BUG_ON(irqs_disabled()); 2003 BUG_ON(irqs_disabled());
1876} 2004}
1877 2005
1878static void check_spinlock_acquired(kmem_cache_t *cachep) 2006static void check_spinlock_acquired(struct kmem_cache *cachep)
1879{ 2007{
1880#ifdef CONFIG_SMP 2008#ifdef CONFIG_SMP
1881 check_irq_off(); 2009 check_irq_off();
@@ -1883,7 +2011,7 @@ static void check_spinlock_acquired(kmem_cache_t *cachep)
1883#endif 2011#endif
1884} 2012}
1885 2013
1886static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) 2014static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
1887{ 2015{
1888#ifdef CONFIG_SMP 2016#ifdef CONFIG_SMP
1889 check_irq_off(); 2017 check_irq_off();
@@ -1916,45 +2044,43 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1916 preempt_enable(); 2044 preempt_enable();
1917} 2045}
1918 2046
1919static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 2047static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
1920 int force, int node); 2048 int force, int node);
1921 2049
1922static void do_drain(void *arg) 2050static void do_drain(void *arg)
1923{ 2051{
1924 kmem_cache_t *cachep = (kmem_cache_t *) arg; 2052 struct kmem_cache *cachep = (struct kmem_cache *) arg;
1925 struct array_cache *ac; 2053 struct array_cache *ac;
1926 int node = numa_node_id(); 2054 int node = numa_node_id();
1927 2055
1928 check_irq_off(); 2056 check_irq_off();
1929 ac = ac_data(cachep); 2057 ac = cpu_cache_get(cachep);
1930 spin_lock(&cachep->nodelists[node]->list_lock); 2058 spin_lock(&cachep->nodelists[node]->list_lock);
1931 free_block(cachep, ac->entry, ac->avail, node); 2059 free_block(cachep, ac->entry, ac->avail, node);
1932 spin_unlock(&cachep->nodelists[node]->list_lock); 2060 spin_unlock(&cachep->nodelists[node]->list_lock);
1933 ac->avail = 0; 2061 ac->avail = 0;
1934} 2062}
1935 2063
1936static void drain_cpu_caches(kmem_cache_t *cachep) 2064static void drain_cpu_caches(struct kmem_cache *cachep)
1937{ 2065{
1938 struct kmem_list3 *l3; 2066 struct kmem_list3 *l3;
1939 int node; 2067 int node;
1940 2068
1941 smp_call_function_all_cpus(do_drain, cachep); 2069 smp_call_function_all_cpus(do_drain, cachep);
1942 check_irq_on(); 2070 check_irq_on();
1943 spin_lock_irq(&cachep->spinlock);
1944 for_each_online_node(node) { 2071 for_each_online_node(node) {
1945 l3 = cachep->nodelists[node]; 2072 l3 = cachep->nodelists[node];
1946 if (l3) { 2073 if (l3) {
1947 spin_lock(&l3->list_lock); 2074 spin_lock_irq(&l3->list_lock);
1948 drain_array_locked(cachep, l3->shared, 1, node); 2075 drain_array_locked(cachep, l3->shared, 1, node);
1949 spin_unlock(&l3->list_lock); 2076 spin_unlock_irq(&l3->list_lock);
1950 if (l3->alien) 2077 if (l3->alien)
1951 drain_alien_cache(cachep, l3); 2078 drain_alien_cache(cachep, l3->alien);
1952 } 2079 }
1953 } 2080 }
1954 spin_unlock_irq(&cachep->spinlock);
1955} 2081}
1956 2082
1957static int __node_shrink(kmem_cache_t *cachep, int node) 2083static int __node_shrink(struct kmem_cache *cachep, int node)
1958{ 2084{
1959 struct slab *slabp; 2085 struct slab *slabp;
1960 struct kmem_list3 *l3 = cachep->nodelists[node]; 2086 struct kmem_list3 *l3 = cachep->nodelists[node];
@@ -1983,7 +2109,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1983 return ret; 2109 return ret;
1984} 2110}
1985 2111
1986static int __cache_shrink(kmem_cache_t *cachep) 2112static int __cache_shrink(struct kmem_cache *cachep)
1987{ 2113{
1988 int ret = 0, i = 0; 2114 int ret = 0, i = 0;
1989 struct kmem_list3 *l3; 2115 struct kmem_list3 *l3;
@@ -2009,7 +2135,7 @@ static int __cache_shrink(kmem_cache_t *cachep)
2009 * Releases as many slabs as possible for a cache. 2135 * Releases as many slabs as possible for a cache.
2010 * To help debugging, a zero exit status indicates all slabs were released. 2136 * To help debugging, a zero exit status indicates all slabs were released.
2011 */ 2137 */
2012int kmem_cache_shrink(kmem_cache_t *cachep) 2138int kmem_cache_shrink(struct kmem_cache *cachep)
2013{ 2139{
2014 if (!cachep || in_interrupt()) 2140 if (!cachep || in_interrupt())
2015 BUG(); 2141 BUG();
@@ -2022,7 +2148,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2022 * kmem_cache_destroy - delete a cache 2148 * kmem_cache_destroy - delete a cache
2023 * @cachep: the cache to destroy 2149 * @cachep: the cache to destroy
2024 * 2150 *
2025 * Remove a kmem_cache_t object from the slab cache. 2151 * Remove a struct kmem_cache object from the slab cache.
2026 * Returns 0 on success. 2152 * Returns 0 on success.
2027 * 2153 *
2028 * It is expected this function will be called by a module when it is 2154 * It is expected this function will be called by a module when it is
@@ -2035,7 +2161,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2035 * The caller must guarantee that noone will allocate memory from the cache 2161 * The caller must guarantee that noone will allocate memory from the cache
2036 * during the kmem_cache_destroy(). 2162 * during the kmem_cache_destroy().
2037 */ 2163 */
2038int kmem_cache_destroy(kmem_cache_t *cachep) 2164int kmem_cache_destroy(struct kmem_cache *cachep)
2039{ 2165{
2040 int i; 2166 int i;
2041 struct kmem_list3 *l3; 2167 struct kmem_list3 *l3;
@@ -2086,7 +2212,7 @@ int kmem_cache_destroy(kmem_cache_t *cachep)
2086EXPORT_SYMBOL(kmem_cache_destroy); 2212EXPORT_SYMBOL(kmem_cache_destroy);
2087 2213
2088/* Get the memory for a slab management obj. */ 2214/* Get the memory for a slab management obj. */
2089static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2215static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2090 int colour_off, gfp_t local_flags) 2216 int colour_off, gfp_t local_flags)
2091{ 2217{
2092 struct slab *slabp; 2218 struct slab *slabp;
@@ -2112,13 +2238,13 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2112 return (kmem_bufctl_t *) (slabp + 1); 2238 return (kmem_bufctl_t *) (slabp + 1);
2113} 2239}
2114 2240
2115static void cache_init_objs(kmem_cache_t *cachep, 2241static void cache_init_objs(struct kmem_cache *cachep,
2116 struct slab *slabp, unsigned long ctor_flags) 2242 struct slab *slabp, unsigned long ctor_flags)
2117{ 2243{
2118 int i; 2244 int i;
2119 2245
2120 for (i = 0; i < cachep->num; i++) { 2246 for (i = 0; i < cachep->num; i++) {
2121 void *objp = slabp->s_mem + cachep->objsize * i; 2247 void *objp = slabp->s_mem + cachep->buffer_size * i;
2122#if DEBUG 2248#if DEBUG
2123 /* need to poison the objs? */ 2249 /* need to poison the objs? */
2124 if (cachep->flags & SLAB_POISON) 2250 if (cachep->flags & SLAB_POISON)
@@ -2136,7 +2262,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2136 * Otherwise, deadlock. They must also be threaded. 2262 * Otherwise, deadlock. They must also be threaded.
2137 */ 2263 */
2138 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2264 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2139 cachep->ctor(objp + obj_dbghead(cachep), cachep, 2265 cachep->ctor(objp + obj_offset(cachep), cachep,
2140 ctor_flags); 2266 ctor_flags);
2141 2267
2142 if (cachep->flags & SLAB_RED_ZONE) { 2268 if (cachep->flags & SLAB_RED_ZONE) {
@@ -2147,10 +2273,10 @@ static void cache_init_objs(kmem_cache_t *cachep,
2147 slab_error(cachep, "constructor overwrote the" 2273 slab_error(cachep, "constructor overwrote the"
2148 " start of an object"); 2274 " start of an object");
2149 } 2275 }
2150 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2276 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2151 && cachep->flags & SLAB_POISON) 2277 && cachep->flags & SLAB_POISON)
2152 kernel_map_pages(virt_to_page(objp), 2278 kernel_map_pages(virt_to_page(objp),
2153 cachep->objsize / PAGE_SIZE, 0); 2279 cachep->buffer_size / PAGE_SIZE, 0);
2154#else 2280#else
2155 if (cachep->ctor) 2281 if (cachep->ctor)
2156 cachep->ctor(objp, cachep, ctor_flags); 2282 cachep->ctor(objp, cachep, ctor_flags);
@@ -2161,7 +2287,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
2161 slabp->free = 0; 2287 slabp->free = 0;
2162} 2288}
2163 2289
2164static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags) 2290static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2165{ 2291{
2166 if (flags & SLAB_DMA) { 2292 if (flags & SLAB_DMA) {
2167 if (!(cachep->gfpflags & GFP_DMA)) 2293 if (!(cachep->gfpflags & GFP_DMA))
@@ -2172,7 +2298,43 @@ static void kmem_flagcheck(kmem_cache_t *cachep, gfp_t flags)
2172 } 2298 }
2173} 2299}
2174 2300
2175static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) 2301static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
2302{
2303 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
2304 kmem_bufctl_t next;
2305
2306 slabp->inuse++;
2307 next = slab_bufctl(slabp)[slabp->free];
2308#if DEBUG
2309 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2310 WARN_ON(slabp->nodeid != nodeid);
2311#endif
2312 slabp->free = next;
2313
2314 return objp;
2315}
2316
2317static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
2318 int nodeid)
2319{
2320 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
2321
2322#if DEBUG
2323 /* Verify that the slab belongs to the intended node */
2324 WARN_ON(slabp->nodeid != nodeid);
2325
2326 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2327 printk(KERN_ERR "slab: double free detected in cache "
2328 "'%s', objp %p\n", cachep->name, objp);
2329 BUG();
2330 }
2331#endif
2332 slab_bufctl(slabp)[objnr] = slabp->free;
2333 slabp->free = objnr;
2334 slabp->inuse--;
2335}
2336
2337static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
2176{ 2338{
2177 int i; 2339 int i;
2178 struct page *page; 2340 struct page *page;
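slab_get_obj() and slab_put_obj() above factor out the slab's bufctl free list: a parallel array of indices chains the free object slots together and slabp->free points at the head. The standalone sketch below reproduces that data structure with made-up names and sizes; it illustrates the technique and is not kernel code.

#include <stdio.h>

#define NR_OBJS     8
#define BUFCTL_END  0xFFFFu

struct mini_slab {
        unsigned short bufctl[NR_OBJS];   /* bufctl[i] = index of next free slot */
        unsigned short free;              /* first free object, or BUFCTL_END    */
        unsigned int inuse;
};

static void mini_slab_init(struct mini_slab *s)
{
        for (unsigned short i = 0; i < NR_OBJS; i++)
                s->bufctl[i] = i + 1;
        s->bufctl[NR_OBJS - 1] = BUFCTL_END;
        s->free = 0;
        s->inuse = 0;
}

static int mini_get_obj(struct mini_slab *s)
{
        if (s->free == BUFCTL_END)
                return -1;                        /* slab is full */
        unsigned short obj = s->free;
        s->free = s->bufctl[obj];                 /* advance to next free slot */
        s->inuse++;
        return obj;
}

static void mini_put_obj(struct mini_slab *s, unsigned short obj)
{
        s->bufctl[obj] = s->free;                 /* push obj back on the list */
        s->free = obj;
        s->inuse--;
}

int main(void)
{
        struct mini_slab s;
        mini_slab_init(&s);

        int a = mini_get_obj(&s), b = mini_get_obj(&s);
        mini_put_obj(&s, (unsigned short)a);
        printf("got %d and %d, %u still in use\n", a, b, s.inuse);
        return 0;
}

Allocation is a pop from the head of the index chain and free is a push, so both are O(1) without storing per-object pointers inside the objects themselves.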
@@ -2191,7 +2353,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2191 * Grow (by 1) the number of slabs within a cache. This is called by 2353 * Grow (by 1) the number of slabs within a cache. This is called by
2192 * kmem_cache_alloc() when there are no active objs left in a cache. 2354 * kmem_cache_alloc() when there are no active objs left in a cache.
2193 */ 2355 */
2194static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2356static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2195{ 2357{
2196 struct slab *slabp; 2358 struct slab *slabp;
2197 void *objp; 2359 void *objp;
@@ -2217,20 +2379,20 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2217 */ 2379 */
2218 ctor_flags |= SLAB_CTOR_ATOMIC; 2380 ctor_flags |= SLAB_CTOR_ATOMIC;
2219 2381
2220 /* About to mess with non-constant members - lock. */ 2382 /* Take the l3 list lock to change the colour_next on this node */
2221 check_irq_off(); 2383 check_irq_off();
2222 spin_lock(&cachep->spinlock); 2384 l3 = cachep->nodelists[nodeid];
2385 spin_lock(&l3->list_lock);
2223 2386
2224 /* Get colour for the slab, and cal the next value. */ 2387 /* Get colour for the slab, and cal the next value. */
2225 offset = cachep->colour_next; 2388 offset = l3->colour_next;
2226 cachep->colour_next++; 2389 l3->colour_next++;
2227 if (cachep->colour_next >= cachep->colour) 2390 if (l3->colour_next >= cachep->colour)
2228 cachep->colour_next = 0; 2391 l3->colour_next = 0;
2229 offset *= cachep->colour_off; 2392 spin_unlock(&l3->list_lock);
2230 2393
2231 spin_unlock(&cachep->spinlock); 2394 offset *= cachep->colour_off;
2232 2395
2233 check_irq_off();
2234 if (local_flags & __GFP_WAIT) 2396 if (local_flags & __GFP_WAIT)
2235 local_irq_enable(); 2397 local_irq_enable();
2236 2398
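The hunk above moves colour_next from the cache into the per-node kmem_list3 and advances it under the node's list lock; the offset handed to the new slab is still colour_next * colour_off, wrapping at cachep->colour. A toy illustration of how that staggers the start of successive slabs (the counts and the 64-byte line size are assumptions):

#include <stdio.h>

int main(void)
{
        unsigned int colour = 4;        /* distinct offsets that fit in the
                                         * slab's left-over space            */
        unsigned int colour_off = 64;   /* assumed L1 cache line size        */
        unsigned int colour_next = 0;   /* per-node cursor, as in the l3 now */

        for (int slab = 0; slab < 6; slab++) {
                unsigned int offset = colour_next * colour_off;

                colour_next++;
                if (colour_next >= colour)
                        colour_next = 0;

                printf("slab %d: first object at byte offset %u\n", slab, offset);
        }
        return 0;
}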
@@ -2260,7 +2422,6 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2260 if (local_flags & __GFP_WAIT) 2422 if (local_flags & __GFP_WAIT)
2261 local_irq_disable(); 2423 local_irq_disable();
2262 check_irq_off(); 2424 check_irq_off();
2263 l3 = cachep->nodelists[nodeid];
2264 spin_lock(&l3->list_lock); 2425 spin_lock(&l3->list_lock);
2265 2426
2266 /* Make slab active. */ 2427 /* Make slab active. */
@@ -2302,14 +2463,14 @@ static void kfree_debugcheck(const void *objp)
2302 } 2463 }
2303} 2464}
2304 2465
2305static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2466static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2306 void *caller) 2467 void *caller)
2307{ 2468{
2308 struct page *page; 2469 struct page *page;
2309 unsigned int objnr; 2470 unsigned int objnr;
2310 struct slab *slabp; 2471 struct slab *slabp;
2311 2472
2312 objp -= obj_dbghead(cachep); 2473 objp -= obj_offset(cachep);
2313 kfree_debugcheck(objp); 2474 kfree_debugcheck(objp);
2314 page = virt_to_page(objp); 2475 page = virt_to_page(objp);
2315 2476
@@ -2341,31 +2502,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2341 if (cachep->flags & SLAB_STORE_USER) 2502 if (cachep->flags & SLAB_STORE_USER)
2342 *dbg_userword(cachep, objp) = caller; 2503 *dbg_userword(cachep, objp) = caller;
2343 2504
2344 objnr = (objp - slabp->s_mem) / cachep->objsize; 2505 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2345 2506
2346 BUG_ON(objnr >= cachep->num); 2507 BUG_ON(objnr >= cachep->num);
2347 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); 2508 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
2348 2509
2349 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2510 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2350 /* Need to call the slab's constructor so the 2511 /* Need to call the slab's constructor so the
2351 * caller can perform a verify of its state (debugging). 2512 * caller can perform a verify of its state (debugging).
2352 * Called without the cache-lock held. 2513 * Called without the cache-lock held.
2353 */ 2514 */
2354 cachep->ctor(objp + obj_dbghead(cachep), 2515 cachep->ctor(objp + obj_offset(cachep),
2355 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2516 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2356 } 2517 }
2357 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2518 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2358 /* we want to cache poison the object, 2519 /* we want to cache poison the object,
2359 * call the destruction callback 2520 * call the destruction callback
2360 */ 2521 */
2361 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); 2522 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2362 } 2523 }
2363 if (cachep->flags & SLAB_POISON) { 2524 if (cachep->flags & SLAB_POISON) {
2364#ifdef CONFIG_DEBUG_PAGEALLOC 2525#ifdef CONFIG_DEBUG_PAGEALLOC
2365 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2526 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2366 store_stackinfo(cachep, objp, (unsigned long)caller); 2527 store_stackinfo(cachep, objp, (unsigned long)caller);
2367 kernel_map_pages(virt_to_page(objp), 2528 kernel_map_pages(virt_to_page(objp),
2368 cachep->objsize / PAGE_SIZE, 0); 2529 cachep->buffer_size / PAGE_SIZE, 0);
2369 } else { 2530 } else {
2370 poison_obj(cachep, objp, POISON_FREE); 2531 poison_obj(cachep, objp, POISON_FREE);
2371 } 2532 }
@@ -2376,7 +2537,7 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2376 return objp; 2537 return objp;
2377} 2538}
2378 2539
2379static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) 2540static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2380{ 2541{
2381 kmem_bufctl_t i; 2542 kmem_bufctl_t i;
2382 int entries = 0; 2543 int entries = 0;
@@ -2409,14 +2570,14 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2409#define check_slabp(x,y) do { } while(0) 2570#define check_slabp(x,y) do { } while(0)
2410#endif 2571#endif
2411 2572
2412static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) 2573static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2413{ 2574{
2414 int batchcount; 2575 int batchcount;
2415 struct kmem_list3 *l3; 2576 struct kmem_list3 *l3;
2416 struct array_cache *ac; 2577 struct array_cache *ac;
2417 2578
2418 check_irq_off(); 2579 check_irq_off();
2419 ac = ac_data(cachep); 2580 ac = cpu_cache_get(cachep);
2420 retry: 2581 retry:
2421 batchcount = ac->batchcount; 2582 batchcount = ac->batchcount;
2422 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2583 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2461,22 +2622,12 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2461 check_slabp(cachep, slabp); 2622 check_slabp(cachep, slabp);
2462 check_spinlock_acquired(cachep); 2623 check_spinlock_acquired(cachep);
2463 while (slabp->inuse < cachep->num && batchcount--) { 2624 while (slabp->inuse < cachep->num && batchcount--) {
2464 kmem_bufctl_t next;
2465 STATS_INC_ALLOCED(cachep); 2625 STATS_INC_ALLOCED(cachep);
2466 STATS_INC_ACTIVE(cachep); 2626 STATS_INC_ACTIVE(cachep);
2467 STATS_SET_HIGH(cachep); 2627 STATS_SET_HIGH(cachep);
2468 2628
2469 /* get obj pointer */ 2629 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2470 ac->entry[ac->avail++] = slabp->s_mem + 2630 numa_node_id());
2471 slabp->free * cachep->objsize;
2472
2473 slabp->inuse++;
2474 next = slab_bufctl(slabp)[slabp->free];
2475#if DEBUG
2476 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2477 WARN_ON(numa_node_id() != slabp->nodeid);
2478#endif
2479 slabp->free = next;
2480 } 2631 }
2481 check_slabp(cachep, slabp); 2632 check_slabp(cachep, slabp);
2482 2633
@@ -2498,7 +2649,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2498 x = cache_grow(cachep, flags, numa_node_id()); 2649 x = cache_grow(cachep, flags, numa_node_id());
2499 2650
2500 // cache_grow can reenable interrupts, then ac could change. 2651 // cache_grow can reenable interrupts, then ac could change.
2501 ac = ac_data(cachep); 2652 ac = cpu_cache_get(cachep);
2502 if (!x && ac->avail == 0) // no objects in sight? abort 2653 if (!x && ac->avail == 0) // no objects in sight? abort
2503 return NULL; 2654 return NULL;
2504 2655
@@ -2510,7 +2661,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2510} 2661}
2511 2662
2512static inline void 2663static inline void
2513cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) 2664cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2514{ 2665{
2515 might_sleep_if(flags & __GFP_WAIT); 2666 might_sleep_if(flags & __GFP_WAIT);
2516#if DEBUG 2667#if DEBUG
@@ -2519,16 +2670,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2519} 2670}
2520 2671
2521#if DEBUG 2672#if DEBUG
2522static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, 2673static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
2523 void *objp, void *caller) 2674 void *objp, void *caller)
2524{ 2675{
2525 if (!objp) 2676 if (!objp)
2526 return objp; 2677 return objp;
2527 if (cachep->flags & SLAB_POISON) { 2678 if (cachep->flags & SLAB_POISON) {
2528#ifdef CONFIG_DEBUG_PAGEALLOC 2679#ifdef CONFIG_DEBUG_PAGEALLOC
2529 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2680 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2530 kernel_map_pages(virt_to_page(objp), 2681 kernel_map_pages(virt_to_page(objp),
2531 cachep->objsize / PAGE_SIZE, 1); 2682 cachep->buffer_size / PAGE_SIZE, 1);
2532 else 2683 else
2533 check_poison_obj(cachep, objp); 2684 check_poison_obj(cachep, objp);
2534#else 2685#else
@@ -2553,7 +2704,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2553 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2704 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2554 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2705 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2555 } 2706 }
2556 objp += obj_dbghead(cachep); 2707 objp += obj_offset(cachep);
2557 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2708 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2558 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2709 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2559 2710
@@ -2568,7 +2719,7 @@ static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2568#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 2719#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
2569#endif 2720#endif
2570 2721
2571static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2722static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2572{ 2723{
2573 void *objp; 2724 void *objp;
2574 struct array_cache *ac; 2725 struct array_cache *ac;
@@ -2583,7 +2734,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2583#endif 2734#endif
2584 2735
2585 check_irq_off(); 2736 check_irq_off();
2586 ac = ac_data(cachep); 2737 ac = cpu_cache_get(cachep);
2587 if (likely(ac->avail)) { 2738 if (likely(ac->avail)) {
2588 STATS_INC_ALLOCHIT(cachep); 2739 STATS_INC_ALLOCHIT(cachep);
2589 ac->touched = 1; 2740 ac->touched = 1;
@@ -2595,7 +2746,8 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2595 return objp; 2746 return objp;
2596} 2747}
2597 2748
2598static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2749static __always_inline void *
2750__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2599{ 2751{
2600 unsigned long save_flags; 2752 unsigned long save_flags;
2601 void *objp; 2753 void *objp;
@@ -2606,7 +2758,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2606 objp = ____cache_alloc(cachep, flags); 2758 objp = ____cache_alloc(cachep, flags);
2607 local_irq_restore(save_flags); 2759 local_irq_restore(save_flags);
2608 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2760 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2609 __builtin_return_address(0)); 2761 caller);
2610 prefetchw(objp); 2762 prefetchw(objp);
2611 return objp; 2763 return objp;
2612} 2764}
@@ -2615,19 +2767,19 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2615/* 2767/*
2616 * An interface to enable slab creation on nodeid 2768 * An interface to enable slab creation on nodeid
2617 */ 2769 */
2618static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2770static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2619{ 2771{
2620 struct list_head *entry; 2772 struct list_head *entry;
2621 struct slab *slabp; 2773 struct slab *slabp;
2622 struct kmem_list3 *l3; 2774 struct kmem_list3 *l3;
2623 void *obj; 2775 void *obj;
2624 kmem_bufctl_t next;
2625 int x; 2776 int x;
2626 2777
2627 l3 = cachep->nodelists[nodeid]; 2778 l3 = cachep->nodelists[nodeid];
2628 BUG_ON(!l3); 2779 BUG_ON(!l3);
2629 2780
2630 retry: 2781 retry:
2782 check_irq_off();
2631 spin_lock(&l3->list_lock); 2783 spin_lock(&l3->list_lock);
2632 entry = l3->slabs_partial.next; 2784 entry = l3->slabs_partial.next;
2633 if (entry == &l3->slabs_partial) { 2785 if (entry == &l3->slabs_partial) {
@@ -2647,14 +2799,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2647 2799
2648 BUG_ON(slabp->inuse == cachep->num); 2800 BUG_ON(slabp->inuse == cachep->num);
2649 2801
2650 /* get obj pointer */ 2802 obj = slab_get_obj(cachep, slabp, nodeid);
2651 obj = slabp->s_mem + slabp->free * cachep->objsize;
2652 slabp->inuse++;
2653 next = slab_bufctl(slabp)[slabp->free];
2654#if DEBUG
2655 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2656#endif
2657 slabp->free = next;
2658 check_slabp(cachep, slabp); 2803 check_slabp(cachep, slabp);
2659 l3->free_objects--; 2804 l3->free_objects--;
2660 /* move slabp to correct slabp list: */ 2805 /* move slabp to correct slabp list: */
@@ -2685,7 +2830,7 @@ static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2685/* 2830/*
2686 * Caller needs to acquire correct kmem_list's list_lock 2831 * Caller needs to acquire correct kmem_list's list_lock
2687 */ 2832 */
2688static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, 2833static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
2689 int node) 2834 int node)
2690{ 2835{
2691 int i; 2836 int i;
@@ -2694,29 +2839,14 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2694 for (i = 0; i < nr_objects; i++) { 2839 for (i = 0; i < nr_objects; i++) {
2695 void *objp = objpp[i]; 2840 void *objp = objpp[i];
2696 struct slab *slabp; 2841 struct slab *slabp;
2697 unsigned int objnr;
2698 2842
2699 slabp = page_get_slab(virt_to_page(objp)); 2843 slabp = virt_to_slab(objp);
2700 l3 = cachep->nodelists[node]; 2844 l3 = cachep->nodelists[node];
2701 list_del(&slabp->list); 2845 list_del(&slabp->list);
2702 objnr = (objp - slabp->s_mem) / cachep->objsize;
2703 check_spinlock_acquired_node(cachep, node); 2846 check_spinlock_acquired_node(cachep, node);
2704 check_slabp(cachep, slabp); 2847 check_slabp(cachep, slabp);
2705 2848 slab_put_obj(cachep, slabp, objp, node);
2706#if DEBUG
2707 /* Verify that the slab belongs to the intended node */
2708 WARN_ON(slabp->nodeid != node);
2709
2710 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2711 printk(KERN_ERR "slab: double free detected in cache "
2712 "'%s', objp %p\n", cachep->name, objp);
2713 BUG();
2714 }
2715#endif
2716 slab_bufctl(slabp)[objnr] = slabp->free;
2717 slabp->free = objnr;
2718 STATS_DEC_ACTIVE(cachep); 2849 STATS_DEC_ACTIVE(cachep);
2719 slabp->inuse--;
2720 l3->free_objects++; 2850 l3->free_objects++;
2721 check_slabp(cachep, slabp); 2851 check_slabp(cachep, slabp);
2722 2852
@@ -2738,7 +2868,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2738 } 2868 }
2739} 2869}
2740 2870
2741static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2871static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2742{ 2872{
2743 int batchcount; 2873 int batchcount;
2744 struct kmem_list3 *l3; 2874 struct kmem_list3 *l3;
@@ -2797,9 +2927,9 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2797 * 2927 *
2798 * Called with disabled ints. 2928 * Called with disabled ints.
2799 */ 2929 */
2800static inline void __cache_free(kmem_cache_t *cachep, void *objp) 2930static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2801{ 2931{
2802 struct array_cache *ac = ac_data(cachep); 2932 struct array_cache *ac = cpu_cache_get(cachep);
2803 2933
2804 check_irq_off(); 2934 check_irq_off();
2805 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2935 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
@@ -2810,7 +2940,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2810#ifdef CONFIG_NUMA 2940#ifdef CONFIG_NUMA
2811 { 2941 {
2812 struct slab *slabp; 2942 struct slab *slabp;
2813 slabp = page_get_slab(virt_to_page(objp)); 2943 slabp = virt_to_slab(objp);
2814 if (unlikely(slabp->nodeid != numa_node_id())) { 2944 if (unlikely(slabp->nodeid != numa_node_id())) {
2815 struct array_cache *alien = NULL; 2945 struct array_cache *alien = NULL;
2816 int nodeid = slabp->nodeid; 2946 int nodeid = slabp->nodeid;
@@ -2856,9 +2986,9 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2856 * Allocate an object from this cache. The flags are only relevant 2986 * Allocate an object from this cache. The flags are only relevant
2857 * if the cache has no available objects. 2987 * if the cache has no available objects.
2858 */ 2988 */
2859void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2989void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2860{ 2990{
2861 return __cache_alloc(cachep, flags); 2991 return __cache_alloc(cachep, flags, __builtin_return_address(0));
2862} 2992}
2863EXPORT_SYMBOL(kmem_cache_alloc); 2993EXPORT_SYMBOL(kmem_cache_alloc);
2864 2994
@@ -2876,12 +3006,12 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2876 * 3006 *
2877 * Currently only used for dentry validation. 3007 * Currently only used for dentry validation.
2878 */ 3008 */
2879int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 3009int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
2880{ 3010{
2881 unsigned long addr = (unsigned long)ptr; 3011 unsigned long addr = (unsigned long)ptr;
2882 unsigned long min_addr = PAGE_OFFSET; 3012 unsigned long min_addr = PAGE_OFFSET;
2883 unsigned long align_mask = BYTES_PER_WORD - 1; 3013 unsigned long align_mask = BYTES_PER_WORD - 1;
2884 unsigned long size = cachep->objsize; 3014 unsigned long size = cachep->buffer_size;
2885 struct page *page; 3015 struct page *page;
2886 3016
2887 if (unlikely(addr < min_addr)) 3017 if (unlikely(addr < min_addr))
@@ -2917,32 +3047,23 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2917 * New and improved: it will now make sure that the object gets 3047 * New and improved: it will now make sure that the object gets
2918 * put on the correct node list so that there is no false sharing. 3048 * put on the correct node list so that there is no false sharing.
2919 */ 3049 */
2920void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 3050void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2921{ 3051{
2922 unsigned long save_flags; 3052 unsigned long save_flags;
2923 void *ptr; 3053 void *ptr;
2924 3054
2925 if (nodeid == -1)
2926 return __cache_alloc(cachep, flags);
2927
2928 if (unlikely(!cachep->nodelists[nodeid])) {
2929 /* Fall back to __cache_alloc if we run into trouble */
2930 printk(KERN_WARNING
2931 "slab: not allocating in inactive node %d for cache %s\n",
2932 nodeid, cachep->name);
2933 return __cache_alloc(cachep, flags);
2934 }
2935
2936 cache_alloc_debugcheck_before(cachep, flags); 3055 cache_alloc_debugcheck_before(cachep, flags);
2937 local_irq_save(save_flags); 3056 local_irq_save(save_flags);
2938 if (nodeid == numa_node_id()) 3057
3058 if (nodeid == -1 || nodeid == numa_node_id() ||
3059 !cachep->nodelists[nodeid])
2939 ptr = ____cache_alloc(cachep, flags); 3060 ptr = ____cache_alloc(cachep, flags);
2940 else 3061 else
2941 ptr = __cache_alloc_node(cachep, flags, nodeid); 3062 ptr = __cache_alloc_node(cachep, flags, nodeid);
2942 local_irq_restore(save_flags); 3063 local_irq_restore(save_flags);
2943 ptr = 3064
2944 cache_alloc_debugcheck_after(cachep, flags, ptr, 3065 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
2945 __builtin_return_address(0)); 3066 __builtin_return_address(0));
2946 3067
2947 return ptr; 3068 return ptr;
2948} 3069}
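
Editor's note: the reworked kmem_cache_alloc_node() above folds nodeid == -1 and not-yet-initialised nodes into the ordinary local allocation path instead of printing a warning and bailing out. A minimal caller-side sketch of the API as it stands after this hunk; foo_item, foo_cachep and the helpers are hypothetical and not part of the patch:

#include <linux/slab.h>
#include <linux/errno.h>

struct foo_item {				/* hypothetical per-node object */
	int nid;
	char payload[120];
};

static struct kmem_cache *foo_cachep;		/* hypothetical cache */

static int foo_setup(void)
{
	foo_cachep = kmem_cache_create("foo_item", sizeof(struct foo_item),
				       0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static struct foo_item *foo_alloc_on(int nid)
{
	/*
	 * nid == -1 simply means "any node"; an offline or uninitialised
	 * node now falls back to the local path instead of warning.
	 */
	return kmem_cache_alloc_node(foo_cachep, GFP_KERNEL, nid);
}

static void foo_free(struct foo_item *p)
{
	kmem_cache_free(foo_cachep, p);
}
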
@@ -2950,7 +3071,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
2950 3071
2951void *kmalloc_node(size_t size, gfp_t flags, int node) 3072void *kmalloc_node(size_t size, gfp_t flags, int node)
2952{ 3073{
2953 kmem_cache_t *cachep; 3074 struct kmem_cache *cachep;
2954 3075
2955 cachep = kmem_find_general_cachep(size, flags); 3076 cachep = kmem_find_general_cachep(size, flags);
2956 if (unlikely(cachep == NULL)) 3077 if (unlikely(cachep == NULL))
@@ -2981,9 +3102,10 @@ EXPORT_SYMBOL(kmalloc_node);
2981 * platforms. For example, on i386, it means that the memory must come 3102 * platforms. For example, on i386, it means that the memory must come
2982 * from the first 16MB. 3103 * from the first 16MB.
2983 */ 3104 */
2984void *__kmalloc(size_t size, gfp_t flags) 3105static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3106 void *caller)
2985{ 3107{
2986 kmem_cache_t *cachep; 3108 struct kmem_cache *cachep;
2987 3109
2988 /* If you want to save a few bytes .text space: replace 3110 /* If you want to save a few bytes .text space: replace
2989 * __ with kmem_. 3111 * __ with kmem_.
@@ -2993,10 +3115,27 @@ void *__kmalloc(size_t size, gfp_t flags)
2993 cachep = __find_general_cachep(size, flags); 3115 cachep = __find_general_cachep(size, flags);
2994 if (unlikely(cachep == NULL)) 3116 if (unlikely(cachep == NULL))
2995 return NULL; 3117 return NULL;
2996 return __cache_alloc(cachep, flags); 3118 return __cache_alloc(cachep, flags, caller);
3119}
3120
3121#ifndef CONFIG_DEBUG_SLAB
3122
3123void *__kmalloc(size_t size, gfp_t flags)
3124{
3125 return __do_kmalloc(size, flags, NULL);
2997} 3126}
2998EXPORT_SYMBOL(__kmalloc); 3127EXPORT_SYMBOL(__kmalloc);
2999 3128
3129#else
3130
3131void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3132{
3133 return __do_kmalloc(size, flags, caller);
3134}
3135EXPORT_SYMBOL(__kmalloc_track_caller);
3136
3137#endif
3138
3000#ifdef CONFIG_SMP 3139#ifdef CONFIG_SMP
3001/** 3140/**
3002 * __alloc_percpu - allocate one copy of the object for every present 3141 * __alloc_percpu - allocate one copy of the object for every present
@@ -3054,7 +3193,7 @@ EXPORT_SYMBOL(__alloc_percpu);
3054 * Free an object which was previously allocated from this 3193 * Free an object which was previously allocated from this
3055 * cache. 3194 * cache.
3056 */ 3195 */
3057void kmem_cache_free(kmem_cache_t *cachep, void *objp) 3196void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3058{ 3197{
3059 unsigned long flags; 3198 unsigned long flags;
3060 3199
@@ -3075,15 +3214,15 @@ EXPORT_SYMBOL(kmem_cache_free);
3075 */ 3214 */
3076void kfree(const void *objp) 3215void kfree(const void *objp)
3077{ 3216{
3078 kmem_cache_t *c; 3217 struct kmem_cache *c;
3079 unsigned long flags; 3218 unsigned long flags;
3080 3219
3081 if (unlikely(!objp)) 3220 if (unlikely(!objp))
3082 return; 3221 return;
3083 local_irq_save(flags); 3222 local_irq_save(flags);
3084 kfree_debugcheck(objp); 3223 kfree_debugcheck(objp);
3085 c = page_get_cache(virt_to_page(objp)); 3224 c = virt_to_cache(objp);
3086 mutex_debug_check_no_locks_freed(objp, obj_reallen(c)); 3225 mutex_debug_check_no_locks_freed(objp, obj_size(c));
3087 __cache_free(c, (void *)objp); 3226 __cache_free(c, (void *)objp);
3088 local_irq_restore(flags); 3227 local_irq_restore(flags);
3089} 3228}
@@ -3112,13 +3251,13 @@ void free_percpu(const void *objp)
3112EXPORT_SYMBOL(free_percpu); 3251EXPORT_SYMBOL(free_percpu);
3113#endif 3252#endif
3114 3253
3115unsigned int kmem_cache_size(kmem_cache_t *cachep) 3254unsigned int kmem_cache_size(struct kmem_cache *cachep)
3116{ 3255{
3117 return obj_reallen(cachep); 3256 return obj_size(cachep);
3118} 3257}
3119EXPORT_SYMBOL(kmem_cache_size); 3258EXPORT_SYMBOL(kmem_cache_size);
3120 3259
3121const char *kmem_cache_name(kmem_cache_t *cachep) 3260const char *kmem_cache_name(struct kmem_cache *cachep)
3122{ 3261{
3123 return cachep->name; 3262 return cachep->name;
3124} 3263}
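
Editor's note: kmem_cache_size() now reports obj_size(), the size the caller asked for, rather than the padded buffer_size used internally. A tiny hedged sketch of the kind of diagnostic these two accessors serve; report_cache() is invented:

#include <linux/kernel.h>
#include <linux/slab.h>

static void report_cache(struct kmem_cache *cachep)
{
	/* obj_size(), so debug red zones and padding are not counted */
	printk(KERN_DEBUG "cache %s: %u bytes per object\n",
	       kmem_cache_name(cachep), kmem_cache_size(cachep));
}
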
@@ -3127,7 +3266,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
3127/* 3266/*
3128 * This initializes kmem_list3 for all nodes. 3267 * This initializes kmem_list3 for all nodes.
3129 */ 3268 */
3130static int alloc_kmemlist(kmem_cache_t *cachep) 3269static int alloc_kmemlist(struct kmem_cache *cachep)
3131{ 3270{
3132 int node; 3271 int node;
3133 struct kmem_list3 *l3; 3272 struct kmem_list3 *l3;
@@ -3183,7 +3322,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3183} 3322}
3184 3323
3185struct ccupdate_struct { 3324struct ccupdate_struct {
3186 kmem_cache_t *cachep; 3325 struct kmem_cache *cachep;
3187 struct array_cache *new[NR_CPUS]; 3326 struct array_cache *new[NR_CPUS];
3188}; 3327};
3189 3328
@@ -3193,13 +3332,13 @@ static void do_ccupdate_local(void *info)
3193 struct array_cache *old; 3332 struct array_cache *old;
3194 3333
3195 check_irq_off(); 3334 check_irq_off();
3196 old = ac_data(new->cachep); 3335 old = cpu_cache_get(new->cachep);
3197 3336
3198 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3337 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3199 new->new[smp_processor_id()] = old; 3338 new->new[smp_processor_id()] = old;
3200} 3339}
3201 3340
3202static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3341static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
3203 int shared) 3342 int shared)
3204{ 3343{
3205 struct ccupdate_struct new; 3344 struct ccupdate_struct new;
@@ -3220,11 +3359,11 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3220 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3359 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
3221 3360
3222 check_irq_on(); 3361 check_irq_on();
3223 spin_lock_irq(&cachep->spinlock); 3362 spin_lock(&cachep->spinlock);
3224 cachep->batchcount = batchcount; 3363 cachep->batchcount = batchcount;
3225 cachep->limit = limit; 3364 cachep->limit = limit;
3226 cachep->shared = shared; 3365 cachep->shared = shared;
3227 spin_unlock_irq(&cachep->spinlock); 3366 spin_unlock(&cachep->spinlock);
3228 3367
3229 for_each_online_cpu(i) { 3368 for_each_online_cpu(i) {
3230 struct array_cache *ccold = new.new[i]; 3369 struct array_cache *ccold = new.new[i];
@@ -3245,7 +3384,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3245 return 0; 3384 return 0;
3246} 3385}
3247 3386
3248static void enable_cpucache(kmem_cache_t *cachep) 3387static void enable_cpucache(struct kmem_cache *cachep)
3249{ 3388{
3250 int err; 3389 int err;
3251 int limit, shared; 3390 int limit, shared;
@@ -3258,13 +3397,13 @@ static void enable_cpucache(kmem_cache_t *cachep)
3258 * The numbers are guessed, we should auto-tune as described by 3397 * The numbers are guessed, we should auto-tune as described by
3259 * Bonwick. 3398 * Bonwick.
3260 */ 3399 */
3261 if (cachep->objsize > 131072) 3400 if (cachep->buffer_size > 131072)
3262 limit = 1; 3401 limit = 1;
3263 else if (cachep->objsize > PAGE_SIZE) 3402 else if (cachep->buffer_size > PAGE_SIZE)
3264 limit = 8; 3403 limit = 8;
3265 else if (cachep->objsize > 1024) 3404 else if (cachep->buffer_size > 1024)
3266 limit = 24; 3405 limit = 24;
3267 else if (cachep->objsize > 256) 3406 else if (cachep->buffer_size > 256)
3268 limit = 54; 3407 limit = 54;
3269 else 3408 else
3270 limit = 120; 3409 limit = 120;
@@ -3279,7 +3418,7 @@ static void enable_cpucache(kmem_cache_t *cachep)
3279 */ 3418 */
3280 shared = 0; 3419 shared = 0;
3281#ifdef CONFIG_SMP 3420#ifdef CONFIG_SMP
3282 if (cachep->objsize <= PAGE_SIZE) 3421 if (cachep->buffer_size <= PAGE_SIZE)
3283 shared = 8; 3422 shared = 8;
3284#endif 3423#endif
3285 3424
@@ -3297,7 +3436,7 @@ static void enable_cpucache(kmem_cache_t *cachep)
3297 cachep->name, -err); 3436 cachep->name, -err);
3298} 3437}
3299 3438
3300static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, 3439static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
3301 int force, int node) 3440 int force, int node)
3302{ 3441{
3303 int tofree; 3442 int tofree;
@@ -3342,12 +3481,12 @@ static void cache_reap(void *unused)
3342 } 3481 }
3343 3482
3344 list_for_each(walk, &cache_chain) { 3483 list_for_each(walk, &cache_chain) {
3345 kmem_cache_t *searchp; 3484 struct kmem_cache *searchp;
3346 struct list_head *p; 3485 struct list_head *p;
3347 int tofree; 3486 int tofree;
3348 struct slab *slabp; 3487 struct slab *slabp;
3349 3488
3350 searchp = list_entry(walk, kmem_cache_t, next); 3489 searchp = list_entry(walk, struct kmem_cache, next);
3351 3490
3352 if (searchp->flags & SLAB_NO_REAP) 3491 if (searchp->flags & SLAB_NO_REAP)
3353 goto next; 3492 goto next;
@@ -3356,10 +3495,10 @@ static void cache_reap(void *unused)
3356 3495
3357 l3 = searchp->nodelists[numa_node_id()]; 3496 l3 = searchp->nodelists[numa_node_id()];
3358 if (l3->alien) 3497 if (l3->alien)
3359 drain_alien_cache(searchp, l3); 3498 drain_alien_cache(searchp, l3->alien);
3360 spin_lock_irq(&l3->list_lock); 3499 spin_lock_irq(&l3->list_lock);
3361 3500
3362 drain_array_locked(searchp, ac_data(searchp), 0, 3501 drain_array_locked(searchp, cpu_cache_get(searchp), 0,
3363 numa_node_id()); 3502 numa_node_id());
3364 3503
3365 if (time_after(l3->next_reap, jiffies)) 3504 if (time_after(l3->next_reap, jiffies))
@@ -3450,15 +3589,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
3450 if (p == &cache_chain) 3589 if (p == &cache_chain)
3451 return NULL; 3590 return NULL;
3452 } 3591 }
3453 return list_entry(p, kmem_cache_t, next); 3592 return list_entry(p, struct kmem_cache, next);
3454} 3593}
3455 3594
3456static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3595static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3457{ 3596{
3458 kmem_cache_t *cachep = p; 3597 struct kmem_cache *cachep = p;
3459 ++*pos; 3598 ++*pos;
3460 return cachep->next.next == &cache_chain ? NULL 3599 return cachep->next.next == &cache_chain ? NULL
3461 : list_entry(cachep->next.next, kmem_cache_t, next); 3600 : list_entry(cachep->next.next, struct kmem_cache, next);
3462} 3601}
3463 3602
3464static void s_stop(struct seq_file *m, void *p) 3603static void s_stop(struct seq_file *m, void *p)
@@ -3468,7 +3607,7 @@ static void s_stop(struct seq_file *m, void *p)
3468 3607
3469static int s_show(struct seq_file *m, void *p) 3608static int s_show(struct seq_file *m, void *p)
3470{ 3609{
3471 kmem_cache_t *cachep = p; 3610 struct kmem_cache *cachep = p;
3472 struct list_head *q; 3611 struct list_head *q;
3473 struct slab *slabp; 3612 struct slab *slabp;
3474 unsigned long active_objs; 3613 unsigned long active_objs;
@@ -3480,8 +3619,7 @@ static int s_show(struct seq_file *m, void *p)
3480 int node; 3619 int node;
3481 struct kmem_list3 *l3; 3620 struct kmem_list3 *l3;
3482 3621
3483 check_irq_on(); 3622 spin_lock(&cachep->spinlock);
3484 spin_lock_irq(&cachep->spinlock);
3485 active_objs = 0; 3623 active_objs = 0;
3486 num_slabs = 0; 3624 num_slabs = 0;
3487 for_each_online_node(node) { 3625 for_each_online_node(node) {
@@ -3489,7 +3627,8 @@ static int s_show(struct seq_file *m, void *p)
3489 if (!l3) 3627 if (!l3)
3490 continue; 3628 continue;
3491 3629
3492 spin_lock(&l3->list_lock); 3630 check_irq_on();
3631 spin_lock_irq(&l3->list_lock);
3493 3632
3494 list_for_each(q, &l3->slabs_full) { 3633 list_for_each(q, &l3->slabs_full) {
3495 slabp = list_entry(q, struct slab, list); 3634 slabp = list_entry(q, struct slab, list);
@@ -3514,9 +3653,10 @@ static int s_show(struct seq_file *m, void *p)
3514 num_slabs++; 3653 num_slabs++;
3515 } 3654 }
3516 free_objects += l3->free_objects; 3655 free_objects += l3->free_objects;
3517 shared_avail += l3->shared->avail; 3656 if (l3->shared)
3657 shared_avail += l3->shared->avail;
3518 3658
3519 spin_unlock(&l3->list_lock); 3659 spin_unlock_irq(&l3->list_lock);
3520 } 3660 }
3521 num_slabs += active_slabs; 3661 num_slabs += active_slabs;
3522 num_objs = num_slabs * cachep->num; 3662 num_objs = num_slabs * cachep->num;
@@ -3528,7 +3668,7 @@ static int s_show(struct seq_file *m, void *p)
3528 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3668 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3529 3669
3530 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3670 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3531 name, active_objs, num_objs, cachep->objsize, 3671 name, active_objs, num_objs, cachep->buffer_size,
3532 cachep->num, (1 << cachep->gfporder)); 3672 cachep->num, (1 << cachep->gfporder));
3533 seq_printf(m, " : tunables %4u %4u %4u", 3673 seq_printf(m, " : tunables %4u %4u %4u",
3534 cachep->limit, cachep->batchcount, cachep->shared); 3674 cachep->limit, cachep->batchcount, cachep->shared);
@@ -3560,7 +3700,7 @@ static int s_show(struct seq_file *m, void *p)
3560 } 3700 }
3561#endif 3701#endif
3562 seq_putc(m, '\n'); 3702 seq_putc(m, '\n');
3563 spin_unlock_irq(&cachep->spinlock); 3703 spin_unlock(&cachep->spinlock);
3564 return 0; 3704 return 0;
3565} 3705}
3566 3706
@@ -3618,7 +3758,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3618 mutex_lock(&cache_chain_mutex); 3758 mutex_lock(&cache_chain_mutex);
3619 res = -EINVAL; 3759 res = -EINVAL;
3620 list_for_each(p, &cache_chain) { 3760 list_for_each(p, &cache_chain) {
3621 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3761 struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
3762 next);
3622 3763
3623 if (!strcmp(cachep->name, kbuf)) { 3764 if (!strcmp(cachep->name, kbuf)) {
3624 if (limit < 1 || 3765 if (limit < 1 ||
@@ -3656,5 +3797,5 @@ unsigned int ksize(const void *objp)
3656 if (unlikely(objp == NULL)) 3797 if (unlikely(objp == NULL))
3657 return 0; 3798 return 0;
3658 3799
3659 return obj_reallen(page_get_cache(virt_to_page(objp))); 3800 return obj_size(virt_to_cache(objp));
3660} 3801}
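
Editor's note: the __do_kmalloc()/__kmalloc_track_caller() split above threads an explicit caller pointer into __cache_alloc() so that, under CONFIG_DEBUG_SLAB, an allocation can be charged to the real call site rather than to an intermediate wrapper. A hedged sketch of a wrapper using that hook; my_memdup() is invented and __kmalloc_track_caller()'s prototype is assumed to be visible to it:

#include <linux/slab.h>
#include <linux/string.h>

/*
 * Hypothetical helper: without the caller argument every allocation made
 * here would show up as coming from my_memdup() itself; passing
 * __builtin_return_address(0) charges it to whoever called my_memdup().
 */
void *my_memdup(const void *src, size_t len, gfp_t gfp)
{
	void *p;

#ifdef CONFIG_DEBUG_SLAB
	p = __kmalloc_track_caller(len, gfp, __builtin_return_address(0));
#else
	p = __kmalloc(len, gfp);
#endif
	if (p)
		memcpy(p, src, len);
	return p;
}
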
diff --git a/mm/slob.c b/mm/slob.c
index 1c240c4b71d9..a1f42bdc0245 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(slab_reclaim_pages);
336 336
337#ifdef CONFIG_SMP 337#ifdef CONFIG_SMP
338 338
339void *__alloc_percpu(size_t size, size_t align) 339void *__alloc_percpu(size_t size)
340{ 340{
341 int i; 341 int i;
342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
diff --git a/mm/swap.c b/mm/swap.c
index bc2442a7b0ee..cce3dda59c59 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,19 +34,22 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37void put_page(struct page *page) 37static void put_compound_page(struct page *page)
38{ 38{
39 if (unlikely(PageCompound(page))) { 39 page = (struct page *)page_private(page);
40 page = (struct page *)page_private(page); 40 if (put_page_testzero(page)) {
41 if (put_page_testzero(page)) { 41 void (*dtor)(struct page *page);
42 void (*dtor)(struct page *page);
43 42
44 dtor = (void (*)(struct page *))page[1].mapping; 43 dtor = (void (*)(struct page *))page[1].lru.next;
45 (*dtor)(page); 44 (*dtor)(page);
46 }
47 return;
48 } 45 }
49 if (put_page_testzero(page)) 46}
47
48void put_page(struct page *page)
49{
50 if (unlikely(PageCompound(page)))
51 put_compound_page(page);
52 else if (put_page_testzero(page))
50 __page_cache_release(page); 53 __page_cache_release(page);
51} 54}
52EXPORT_SYMBOL(put_page); 55EXPORT_SYMBOL(put_page);
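
Editor's note: put_compound_page() above finds the head page via page_private(), and only when the last reference drops does it call the destructor stashed in page[1].lru.next of the head. A hedged sketch of the allocating side of that convention; the pool, the destructor and the order-2 size are all invented for illustration:

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/* hypothetical private pool of order-2 compound pages */
static LIST_HEAD(my_pool);
static DEFINE_SPINLOCK(my_pool_lock);

/* called by put_compound_page() once the last reference is gone */
static void my_compound_dtor(struct page *page)
{
	spin_lock(&my_pool_lock);
	list_add(&page->lru, &my_pool);
	spin_unlock(&my_pool_lock);
}

static struct page *my_compound_alloc(void)
{
	struct page *page = NULL;

	spin_lock(&my_pool_lock);
	if (!list_empty(&my_pool)) {
		page = list_entry(my_pool.next, struct page, lru);
		list_del(&page->lru);
	}
	spin_unlock(&my_pool_lock);

	if (!page)
		page = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
	if (!page)
		return NULL;

	/* pages recycled through the destructor come back with count 0 */
	set_page_count(page, 1);
	/* the slot put_compound_page() reads the destructor from */
	page[1].lru.next = (void *)my_compound_dtor;
	return page;
}

Dropping the last reference with put_page() then ends up in my_compound_dtor() through the slot above; returning pooled pages to the buddy allocator at teardown is omitted from this sketch.
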
@@ -244,6 +247,15 @@ void release_pages(struct page **pages, int nr, int cold)
244 struct page *page = pages[i]; 247 struct page *page = pages[i];
245 struct zone *pagezone; 248 struct zone *pagezone;
246 249
250 if (unlikely(PageCompound(page))) {
251 if (zone) {
252 spin_unlock_irq(&zone->lru_lock);
253 zone = NULL;
254 }
255 put_compound_page(page);
256 continue;
257 }
258
247 if (!put_page_testzero(page)) 259 if (!put_page_testzero(page))
248 continue; 260 continue;
249 261
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7b09ac503fec..db8a3d3e1636 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,6 +27,7 @@ static struct address_space_operations swap_aops = {
27 .writepage = swap_writepage, 27 .writepage = swap_writepage,
28 .sync_page = block_sync_page, 28 .sync_page = block_sync_page,
29 .set_page_dirty = __set_page_dirty_nobuffers, 29 .set_page_dirty = __set_page_dirty_nobuffers,
30 .migratepage = migrate_page,
30}; 31};
31 32
32static struct backing_dev_info swap_backing_dev_info = { 33static struct backing_dev_info swap_backing_dev_info = {
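
Editor's note: swap_aops gaining a migratepage callback is what lets the direct-migration path hand a swap-cache page straight to migrate_page() instead of writing it out. A hedged sketch of a filesystem whose pages carry no PagePrivate data doing the same; myfs_aops is invented, real aops carry many more callbacks, and migrate_page()'s prototype is assumed to be exported (e.g. through <linux/swap.h>):

#include <linux/fs.h>
#include <linux/swap.h>

static struct address_space_operations myfs_aops = {
	/*
	 * Pages with no private data can use the generic helper defined in
	 * the mm/vmscan.c hunks below; migrate_pages() will call it rather
	 * than falling back to writeout or swap.
	 */
	.migratepage	= migrate_page,
};
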
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1e69c30d203..1f9cf0d073b8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -554,6 +554,15 @@ static int unuse_mm(struct mm_struct *mm,
554 return 0; 554 return 0;
555} 555}
556 556
557#ifdef CONFIG_MIGRATION
558int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
559{
560 swp_entry_t entry = { .val = page_private(page) };
561
562 return unuse_vma(vma, entry, page);
563}
564#endif
565
557/* 566/*
558 * Scan swap_map from current position to next entry still in use. 567 * Scan swap_map from current position to next entry still in use.
559 * Recycle to start on reaching the end, returning 0 when empty. 568 * Recycle to start on reaching the end, returning 0 when empty.
@@ -646,6 +655,7 @@ static int try_to_unuse(unsigned int type)
646 */ 655 */
647 swap_map = &si->swap_map[i]; 656 swap_map = &si->swap_map[i];
648 entry = swp_entry(type, i); 657 entry = swp_entry(type, i);
658again:
649 page = read_swap_cache_async(entry, NULL, 0); 659 page = read_swap_cache_async(entry, NULL, 0);
650 if (!page) { 660 if (!page) {
651 /* 661 /*
@@ -680,6 +690,12 @@ static int try_to_unuse(unsigned int type)
680 wait_on_page_locked(page); 690 wait_on_page_locked(page);
681 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
682 lock_page(page); 692 lock_page(page);
693 if (!PageSwapCache(page)) {
 694 /* Page migration has occurred */
695 unlock_page(page);
696 page_cache_release(page);
697 goto again;
698 }
683 wait_on_page_writeback(page); 699 wait_on_page_writeback(page);
684 700
685 /* 701 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34b61a70c7..1838c15ca4fd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
443 BUG_ON(PageActive(page)); 443 BUG_ON(PageActive(page));
444 444
445 sc->nr_scanned++; 445 sc->nr_scanned++;
446
447 if (!sc->may_swap && page_mapped(page))
448 goto keep_locked;
449
446 /* Double the slab pressure for mapped and swapcache pages */ 450 /* Double the slab pressure for mapped and swapcache pages */
447 if (page_mapped(page) || PageSwapCache(page)) 451 if (page_mapped(page) || PageSwapCache(page))
448 sc->nr_scanned++; 452 sc->nr_scanned++;
@@ -477,7 +481,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
477 * processes. Try to unmap it here. 481 * processes. Try to unmap it here.
478 */ 482 */
479 if (page_mapped(page) && mapping) { 483 if (page_mapped(page) && mapping) {
480 switch (try_to_unmap(page)) { 484 /*
485 * No unmapping if we do not swap
486 */
487 if (!sc->may_swap)
488 goto keep_locked;
489
490 switch (try_to_unmap(page, 0)) {
481 case SWAP_FAIL: 491 case SWAP_FAIL:
482 goto activate_locked; 492 goto activate_locked;
483 case SWAP_AGAIN: 493 case SWAP_AGAIN:
@@ -492,7 +502,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
492 goto keep_locked; 502 goto keep_locked;
493 if (!may_enter_fs) 503 if (!may_enter_fs)
494 goto keep_locked; 504 goto keep_locked;
495 if (laptop_mode && !sc->may_writepage) 505 if (!sc->may_writepage)
496 goto keep_locked; 506 goto keep_locked;
497 507
498 /* Page is dirty, try to write it out here */ 508 /* Page is dirty, try to write it out here */
@@ -609,6 +619,15 @@ int putback_lru_pages(struct list_head *l)
609} 619}
610 620
611/* 621/*
622 * Non migratable page
623 */
624int fail_migrate_page(struct page *newpage, struct page *page)
625{
626 return -EIO;
627}
628EXPORT_SYMBOL(fail_migrate_page);
629
630/*
612 * swapout a single page 631 * swapout a single page
613 * page is locked upon entry, unlocked on exit 632 * page is locked upon entry, unlocked on exit
614 */ 633 */
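
Editor's note: fail_migrate_page() added just above is the opposite convenience: a stock migratepage callback that refuses with -EIO, for mappings whose pages must not be relocated. A hedged counterpart to the previous sketch; pinned_aops is again invented:

#include <linux/fs.h>

static struct address_space_operations pinned_aops = {
	/* e.g. pages tied to device DMA buffers: report -EIO, do not move */
	.migratepage	= fail_migrate_page,
};
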
@@ -617,7 +636,7 @@ static int swap_page(struct page *page)
617 struct address_space *mapping = page_mapping(page); 636 struct address_space *mapping = page_mapping(page);
618 637
619 if (page_mapped(page) && mapping) 638 if (page_mapped(page) && mapping)
620 if (try_to_unmap(page) != SWAP_SUCCESS) 639 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
621 goto unlock_retry; 640 goto unlock_retry;
622 641
623 if (PageDirty(page)) { 642 if (PageDirty(page)) {
@@ -653,6 +672,167 @@ unlock_retry:
653retry: 672retry:
654 return -EAGAIN; 673 return -EAGAIN;
655} 674}
675EXPORT_SYMBOL(swap_page);
676
677/*
678 * Page migration was first developed in the context of the memory hotplug
679 * project. The main authors of the migration code are:
680 *
681 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
682 * Hirokazu Takahashi <taka@valinux.co.jp>
683 * Dave Hansen <haveblue@us.ibm.com>
684 * Christoph Lameter <clameter@sgi.com>
685 */
686
687/*
688 * Remove references for a page and establish the new page with the correct
689 * basic settings to be able to stop accesses to the page.
690 */
691int migrate_page_remove_references(struct page *newpage,
692 struct page *page, int nr_refs)
693{
694 struct address_space *mapping = page_mapping(page);
695 struct page **radix_pointer;
696
697 /*
698 * Avoid doing any of the following work if the page count
699 * indicates that the page is in use or truncate has removed
700 * the page.
701 */
702 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
703 return 1;
704
705 /*
706 * Establish swap ptes for anonymous pages or destroy pte
707 * maps for files.
708 *
709 * In order to reestablish file backed mappings the fault handlers
710 * will take the radix tree_lock which may then be used to stop
 711 * processes from accessing this page until the new page is ready.
712 *
713 * A process accessing via a swap pte (an anonymous page) will take a
714 * page_lock on the old page which will block the process until the
715 * migration attempt is complete. At that time the PageSwapCache bit
716 * will be examined. If the page was migrated then the PageSwapCache
717 * bit will be clear and the operation to retrieve the page will be
718 * retried which will find the new page in the radix tree. Then a new
719 * direct mapping may be generated based on the radix tree contents.
720 *
721 * If the page was not migrated then the PageSwapCache bit
722 * is still set and the operation may continue.
723 */
724 try_to_unmap(page, 1);
725
726 /*
727 * Give up if we were unable to remove all mappings.
728 */
729 if (page_mapcount(page))
730 return 1;
731
732 write_lock_irq(&mapping->tree_lock);
733
734 radix_pointer = (struct page **)radix_tree_lookup_slot(
735 &mapping->page_tree,
736 page_index(page));
737
738 if (!page_mapping(page) || page_count(page) != nr_refs ||
739 *radix_pointer != page) {
740 write_unlock_irq(&mapping->tree_lock);
741 return 1;
742 }
743
744 /*
745 * Now we know that no one else is looking at the page.
746 *
747 * Certain minimal information about a page must be available
748 * in order for other subsystems to properly handle the page if they
749 * find it through the radix tree update before we are finished
750 * copying the page.
751 */
752 get_page(newpage);
753 newpage->index = page->index;
754 newpage->mapping = page->mapping;
755 if (PageSwapCache(page)) {
756 SetPageSwapCache(newpage);
757 set_page_private(newpage, page_private(page));
758 }
759
760 *radix_pointer = newpage;
761 __put_page(page);
762 write_unlock_irq(&mapping->tree_lock);
763
764 return 0;
765}
766EXPORT_SYMBOL(migrate_page_remove_references);
767
768/*
769 * Copy the page to its new location
770 */
771void migrate_page_copy(struct page *newpage, struct page *page)
772{
773 copy_highpage(newpage, page);
774
775 if (PageError(page))
776 SetPageError(newpage);
777 if (PageReferenced(page))
778 SetPageReferenced(newpage);
779 if (PageUptodate(page))
780 SetPageUptodate(newpage);
781 if (PageActive(page))
782 SetPageActive(newpage);
783 if (PageChecked(page))
784 SetPageChecked(newpage);
785 if (PageMappedToDisk(page))
786 SetPageMappedToDisk(newpage);
787
788 if (PageDirty(page)) {
789 clear_page_dirty_for_io(page);
790 set_page_dirty(newpage);
791 }
792
793 ClearPageSwapCache(page);
794 ClearPageActive(page);
795 ClearPagePrivate(page);
796 set_page_private(page, 0);
797 page->mapping = NULL;
798
799 /*
800 * If any waiters have accumulated on the new page then
801 * wake them up.
802 */
803 if (PageWriteback(newpage))
804 end_page_writeback(newpage);
805}
806EXPORT_SYMBOL(migrate_page_copy);
807
808/*
809 * Common logic to directly migrate a single page suitable for
810 * pages that do not use PagePrivate.
811 *
812 * Pages are locked upon entry and exit.
813 */
814int migrate_page(struct page *newpage, struct page *page)
815{
816 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
817
818 if (migrate_page_remove_references(newpage, page, 2))
819 return -EAGAIN;
820
821 migrate_page_copy(newpage, page);
822
823 /*
824 * Remove auxiliary swap entries and replace
825 * them with real ptes.
826 *
827 * Note that a real pte entry will allow processes that are not
828 * waiting on the page lock to use the new page via the page tables
829 * before the new page is unlocked.
830 */
831 remove_from_swap(newpage);
832 return 0;
833}
834EXPORT_SYMBOL(migrate_page);
835
656/* 836/*
657 * migrate_pages 837 * migrate_pages
658 * 838 *
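
Editor's note: migrate_page_remove_references() and migrate_page_copy() are exported separately so that an address space with private state can slot its own fixups between detaching the old page and publishing the new one, the same way migrate_page() above composes them. A hedged sketch of such a callback; myfs_migratepage() and its metadata step are invented, and the helper prototypes are assumed to be visible (e.g. via <linux/swap.h>):

#include <linux/mm.h>
#include <linux/swap.h>

static int myfs_migratepage(struct page *newpage, struct page *page)
{
	BUG_ON(PageWriteback(page));	/* the caller waited for writeback */

	/* 2 = the migration caller's reference plus the page cache's */
	if (migrate_page_remove_references(newpage, page, 2))
		return -EAGAIN;

	/* hypothetical: repoint per-inode bookkeeping at newpage here */

	migrate_page_copy(newpage, page);

	/* as migrate_page() does: turn temporary swap ptes into real ptes */
	remove_from_swap(newpage);
	return 0;
}
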
@@ -663,14 +843,9 @@ retry:
663 * pages are swapped out. 843 * pages are swapped out.
664 * 844 *
665 * The function returns after 10 attempts or if no pages 845 * The function returns after 10 attempts or if no pages
666 * are movable anymore because t has become empty 846 * are movable anymore because to has become empty
667 * or no retryable pages exist anymore. 847 * or no retryable pages exist anymore.
668 * 848 *
669 * SIMPLIFIED VERSION: This implementation of migrate_pages
670 * is only swapping out pages and never touches the second
671 * list. The direct migration patchset
672 * extends this function to avoid the use of swap.
673 *
674 * Return: Number of pages not migrated when "to" ran empty. 849 * Return: Number of pages not migrated when "to" ran empty.
675 */ 850 */
676int migrate_pages(struct list_head *from, struct list_head *to, 851int migrate_pages(struct list_head *from, struct list_head *to,
@@ -691,6 +866,9 @@ redo:
691 retry = 0; 866 retry = 0;
692 867
693 list_for_each_entry_safe(page, page2, from, lru) { 868 list_for_each_entry_safe(page, page2, from, lru) {
869 struct page *newpage = NULL;
870 struct address_space *mapping;
871
694 cond_resched(); 872 cond_resched();
695 873
696 rc = 0; 874 rc = 0;
@@ -698,6 +876,9 @@ redo:
698 /* page was freed from under us. So we are done. */ 876 /* page was freed from under us. So we are done. */
699 goto next; 877 goto next;
700 878
879 if (to && list_empty(to))
880 break;
881
701 /* 882 /*
702 * Skip locked pages during the first two passes to give the 883 * Skip locked pages during the first two passes to give the
703 * functions holding the lock time to release the page. Later we 884 * functions holding the lock time to release the page. Later we
@@ -734,12 +915,84 @@ redo:
734 } 915 }
735 } 916 }
736 917
918 if (!to) {
919 rc = swap_page(page);
920 goto next;
921 }
922
923 newpage = lru_to_page(to);
924 lock_page(newpage);
925
737 /* 926 /*
738 * Page is properly locked and writeback is complete. 927 * Pages are properly locked and writeback is complete.
739 * Try to migrate the page. 928 * Try to migrate the page.
740 */ 929 */
741 rc = swap_page(page); 930 mapping = page_mapping(page);
742 goto next; 931 if (!mapping)
932 goto unlock_both;
933
934 if (mapping->a_ops->migratepage) {
935 /*
936 * Most pages have a mapping and most filesystems
937 * should provide a migration function. Anonymous
938 * pages are part of swap space which also has its
939 * own migration function. This is the most common
940 * path for page migration.
941 */
942 rc = mapping->a_ops->migratepage(newpage, page);
943 goto unlock_both;
944 }
945
946 /*
947 * Default handling if a filesystem does not provide
948 * a migration function. We can only migrate clean
949 * pages so try to write out any dirty pages first.
950 */
951 if (PageDirty(page)) {
952 switch (pageout(page, mapping)) {
953 case PAGE_KEEP:
954 case PAGE_ACTIVATE:
955 goto unlock_both;
956
957 case PAGE_SUCCESS:
958 unlock_page(newpage);
959 goto next;
960
961 case PAGE_CLEAN:
962 ; /* try to migrate the page below */
963 }
964 }
965
966 /*
967 * Buffers are managed in a filesystem specific way.
968 * We must have no buffers or drop them.
969 */
970 if (!page_has_buffers(page) ||
971 try_to_release_page(page, GFP_KERNEL)) {
972 rc = migrate_page(newpage, page);
973 goto unlock_both;
974 }
975
976 /*
977 * On early passes with mapped pages simply
978 * retry. There may be a lock held for some
979 * buffers that may go away. Later
980 * swap them out.
981 */
982 if (pass > 4) {
983 /*
984 * Persistently unable to drop buffers..... As a
985 * measure of last resort we fall back to
986 * swap_page().
987 */
988 unlock_page(newpage);
989 newpage = NULL;
990 rc = swap_page(page);
991 goto next;
992 }
993
994unlock_both:
995 unlock_page(newpage);
743 996
744unlock_page: 997unlock_page:
745 unlock_page(page); 998 unlock_page(page);
@@ -752,7 +1005,10 @@ next:
752 list_move(&page->lru, failed); 1005 list_move(&page->lru, failed);
753 nr_failed++; 1006 nr_failed++;
754 } else { 1007 } else {
755 /* Success */ 1008 if (newpage) {
1009 /* Successful migration. Return page to LRU */
1010 move_to_lru(newpage);
1011 }
756 list_move(&page->lru, moved); 1012 list_move(&page->lru, moved);
757 } 1013 }
758 } 1014 }
@@ -939,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
939 struct page *page; 1195 struct page *page;
940 struct pagevec pvec; 1196 struct pagevec pvec;
941 int reclaim_mapped = 0; 1197 int reclaim_mapped = 0;
942 long mapped_ratio; 1198
943 long distress; 1199 if (unlikely(sc->may_swap)) {
944 long swap_tendency; 1200 long mapped_ratio;
1201 long distress;
1202 long swap_tendency;
1203
1204 /*
1205 * `distress' is a measure of how much trouble we're having
1206 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
1207 */
1208 distress = 100 >> zone->prev_priority;
1209
1210 /*
1211 * The point of this algorithm is to decide when to start
1212 * reclaiming mapped memory instead of just pagecache. Work out
1213 * how much memory
1214 * is mapped.
1215 */
1216 mapped_ratio = (sc->nr_mapped * 100) / total_memory;
1217
1218 /*
1219 * Now decide how much we really want to unmap some pages. The
1220 * mapped ratio is downgraded - just because there's a lot of
1221 * mapped memory doesn't necessarily mean that page reclaim
1222 * isn't succeeding.
1223 *
1224 * The distress ratio is important - we don't want to start
1225 * going oom.
1226 *
1227 * A 100% value of vm_swappiness overrides this algorithm
1228 * altogether.
1229 */
1230 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
1231
1232 /*
1233 * Now use this metric to decide whether to start moving mapped
1234 * memory onto the inactive list.
1235 */
1236 if (swap_tendency >= 100)
1237 reclaim_mapped = 1;
1238 }
945 1239
946 lru_add_drain(); 1240 lru_add_drain();
947 spin_lock_irq(&zone->lru_lock); 1241 spin_lock_irq(&zone->lru_lock);
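
Editor's note, to make the swap_tendency heuristic in the hunk above concrete (the numbers are purely illustrative): with the default vm_swappiness of 60, a zone at prev_priority 6 contributes distress = 100 >> 6 = 1, and if 30% of memory is mapped then mapped_ratio = 30, giving swap_tendency = 30/2 + 1 + 60 = 76, below 100, so mapped pages are left alone. Under real pressure at prev_priority 0 the same ratio gives 15 + 100 + 60 = 175 and reclaim_mapped is set. Setting vm_swappiness to 100 crosses the threshold on its own, which is what the "overrides this algorithm altogether" comment means.
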
@@ -951,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
951 zone->nr_active -= pgmoved; 1245 zone->nr_active -= pgmoved;
952 spin_unlock_irq(&zone->lru_lock); 1246 spin_unlock_irq(&zone->lru_lock);
953 1247
954 /*
955 * `distress' is a measure of how much trouble we're having reclaiming
956 * pages. 0 -> no problems. 100 -> great trouble.
957 */
958 distress = 100 >> zone->prev_priority;
959
960 /*
961 * The point of this algorithm is to decide when to start reclaiming
962 * mapped memory instead of just pagecache. Work out how much memory
963 * is mapped.
964 */
965 mapped_ratio = (sc->nr_mapped * 100) / total_memory;
966
967 /*
968 * Now decide how much we really want to unmap some pages. The mapped
969 * ratio is downgraded - just because there's a lot of mapped memory
970 * doesn't necessarily mean that page reclaim isn't succeeding.
971 *
972 * The distress ratio is important - we don't want to start going oom.
973 *
974 * A 100% value of vm_swappiness overrides this algorithm altogether.
975 */
976 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
977
978 /*
979 * Now use this metric to decide whether to start moving mapped memory
980 * onto the inactive list.
981 */
982 if (swap_tendency >= 100)
983 reclaim_mapped = 1;
984
985 while (!list_empty(&l_hold)) { 1248 while (!list_empty(&l_hold)) {
986 cond_resched(); 1249 cond_resched();
987 page = lru_to_page(&l_hold); 1250 page = lru_to_page(&l_hold);
@@ -1170,7 +1433,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1170 int i; 1433 int i;
1171 1434
1172 sc.gfp_mask = gfp_mask; 1435 sc.gfp_mask = gfp_mask;
1173 sc.may_writepage = 0; 1436 sc.may_writepage = !laptop_mode;
1174 sc.may_swap = 1; 1437 sc.may_swap = 1;
1175 1438
1176 inc_page_state(allocstall); 1439 inc_page_state(allocstall);
@@ -1273,7 +1536,7 @@ loop_again:
1273 total_scanned = 0; 1536 total_scanned = 0;
1274 total_reclaimed = 0; 1537 total_reclaimed = 0;
1275 sc.gfp_mask = GFP_KERNEL; 1538 sc.gfp_mask = GFP_KERNEL;
1276 sc.may_writepage = 0; 1539 sc.may_writepage = !laptop_mode;
1277 sc.may_swap = 1; 1540 sc.may_swap = 1;
1278 sc.nr_mapped = read_page_state(nr_mapped); 1541 sc.nr_mapped = read_page_state(nr_mapped);
1279 1542
@@ -1358,9 +1621,7 @@ scan:
1358 sc.nr_reclaimed = 0; 1621 sc.nr_reclaimed = 0;
1359 sc.priority = priority; 1622 sc.priority = priority;
1360 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; 1623 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1361 atomic_inc(&zone->reclaim_in_progress);
1362 shrink_zone(zone, &sc); 1624 shrink_zone(zone, &sc);
1363 atomic_dec(&zone->reclaim_in_progress);
1364 reclaim_state->reclaimed_slab = 0; 1625 reclaim_state->reclaimed_slab = 0;
1365 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1626 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1366 lru_pages); 1627 lru_pages);
@@ -1586,40 +1847,61 @@ module_init(kswapd_init)
1586 */ 1847 */
1587int zone_reclaim_mode __read_mostly; 1848int zone_reclaim_mode __read_mostly;
1588 1849
1850#define RECLAIM_OFF 0
1851#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
1852#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1853#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1854#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */
1855
1589/* 1856/*
1590 * Mininum time between zone reclaim scans 1857 * Mininum time between zone reclaim scans
1591 */ 1858 */
1592#define ZONE_RECLAIM_INTERVAL HZ/2 1859int zone_reclaim_interval __read_mostly = 30*HZ;
1860
1861/*
1862 * Priority for ZONE_RECLAIM. This determines the fraction of pages
1863 * of a node considered for each zone_reclaim. 4 scans 1/16th of
1864 * a zone.
1865 */
1866#define ZONE_RECLAIM_PRIORITY 4
1867
1593/* 1868/*
1594 * Try to free up some pages from this zone through reclaim. 1869 * Try to free up some pages from this zone through reclaim.
1595 */ 1870 */
1596int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1871int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1597{ 1872{
1598 int nr_pages = 1 << order; 1873 int nr_pages;
1599 struct task_struct *p = current; 1874 struct task_struct *p = current;
1600 struct reclaim_state reclaim_state; 1875 struct reclaim_state reclaim_state;
1601 struct scan_control sc = { 1876 struct scan_control sc;
1602 .gfp_mask = gfp_mask, 1877 cpumask_t mask;
1603 .may_writepage = 0, 1878 int node_id;
1604 .may_swap = 0, 1879
1605 .nr_mapped = read_page_state(nr_mapped), 1880 if (time_before(jiffies,
1606 .nr_scanned = 0, 1881 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1607 .nr_reclaimed = 0, 1882 return 0;
1608 .priority = 0
1609 };
1610 1883
1611 if (!(gfp_mask & __GFP_WAIT) || 1884 if (!(gfp_mask & __GFP_WAIT) ||
1612 zone->zone_pgdat->node_id != numa_node_id() ||
1613 zone->all_unreclaimable || 1885 zone->all_unreclaimable ||
1614 atomic_read(&zone->reclaim_in_progress) > 0) 1886 atomic_read(&zone->reclaim_in_progress) > 0)
1615 return 0; 1887 return 0;
1616 1888
1617 if (time_before(jiffies, 1889 node_id = zone->zone_pgdat->node_id;
1618 zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) 1890 mask = node_to_cpumask(node_id);
1619 return 0; 1891 if (!cpus_empty(mask) && node_id != numa_node_id())
1892 return 0;
1893
1894 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1895 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1896 sc.nr_scanned = 0;
1897 sc.nr_reclaimed = 0;
1898 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1899 sc.nr_mapped = read_page_state(nr_mapped);
1900 sc.gfp_mask = gfp_mask;
1620 1901
1621 disable_swap_token(); 1902 disable_swap_token();
1622 1903
1904 nr_pages = 1 << order;
1623 if (nr_pages > SWAP_CLUSTER_MAX) 1905 if (nr_pages > SWAP_CLUSTER_MAX)
1624 sc.swap_cluster_max = nr_pages; 1906 sc.swap_cluster_max = nr_pages;
1625 else 1907 else
@@ -1629,14 +1911,37 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1629 p->flags |= PF_MEMALLOC; 1911 p->flags |= PF_MEMALLOC;
1630 reclaim_state.reclaimed_slab = 0; 1912 reclaim_state.reclaimed_slab = 0;
1631 p->reclaim_state = &reclaim_state; 1913 p->reclaim_state = &reclaim_state;
1632 shrink_zone(zone, &sc); 1914
1915 /*
1916 * Free memory by calling shrink zone with increasing priorities
1917 * until we have enough memory freed.
1918 */
1919 do {
1920 sc.priority--;
1921 shrink_zone(zone, &sc);
1922
1923 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
1924
1925 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1926 /*
1927 * shrink_slab does not currently allow us to determine
1928 * how many pages were freed in the zone. So we just
1929 * shake the slab and then go offnode for a single allocation.
1930 *
1931 * shrink_slab will free memory on all zones and may take
1932 * a long time.
1933 */
1934 shrink_slab(sc.nr_scanned, gfp_mask, order);
1935 sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */
1936 }
1937
1633 p->reclaim_state = NULL; 1938 p->reclaim_state = NULL;
1634 current->flags &= ~PF_MEMALLOC; 1939 current->flags &= ~PF_MEMALLOC;
1635 1940
1636 if (sc.nr_reclaimed == 0) 1941 if (sc.nr_reclaimed == 0)
1637 zone->last_unsuccessful_zone_reclaim = jiffies; 1942 zone->last_unsuccessful_zone_reclaim = jiffies;
1638 1943
1639 return sc.nr_reclaimed > nr_pages; 1944 return sc.nr_reclaimed >= nr_pages;
1640} 1945}
1641#endif 1946#endif
1642 1947
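
Editor's note, an illustrative zone_reclaim_mode setting for the rewritten zone_reclaim() above: RECLAIM_ZONE | RECLAIM_WRITE (value 3) enables local zone reclaim with sc.may_writepage = 1 and sc.may_swap = 0, so the scan may write back dirty pagecache but skips mapped pages, and with RECLAIM_SLAB clear a shortfall is not made up by the shrink_slab() fallback. Adding RECLAIM_SWAP (1<<2) would allow unmapping and swapping during the local scan, and RECLAIM_SLAB (1<<3) would enable the global slab shrink shown in the hunk above.
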