Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          6
-rw-r--r--  mm/Makefile         2
-rw-r--r--  mm/filemap.c        2
-rw-r--r--  mm/hugetlb.c      286
-rw-r--r--  mm/internal.h      34
-rw-r--r--  mm/memory.c        21
-rw-r--r--  mm/mempolicy.c    117
-rw-r--r--  mm/mempool.c        4
-rw-r--r--  mm/migrate.c      655
-rw-r--r--  mm/mmap.c          10
-rw-r--r--  mm/mprotect.c      12
-rw-r--r--  mm/nommu.c          4
-rw-r--r--  mm/page_alloc.c   113
-rw-r--r--  mm/readahead.c     32
-rw-r--r--  mm/rmap.c          14
-rw-r--r--  mm/shmem.c          7
-rw-r--r--  mm/slab.c         890
-rw-r--r--  mm/swap.c          64
-rw-r--r--  mm/swap_state.c     1
-rw-r--r--  mm/swapfile.c       2
-rw-r--r--  mm/vmscan.c       882
21 files changed, 1773 insertions, 1385 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae64..bd80460360 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
137# support for page migration 137# support for page migration
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration"
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 141 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP 142 depends on SWAP
143 help
 144	  Allows the physical location of a process's pages to be changed
 145	  while their virtual addresses stay the same. This is useful, for
 146	  example, on NUMA systems to move pages closer to the processors
 147	  that access them.
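
From user space, the behaviour this option enables is driven through the mempolicy system calls; mbind(2) with MPOL_MF_MOVE asks the kernel to migrate pages that already sit on the wrong node. A minimal sketch, assuming libnuma's <numaif.h> wrapper (link with -lnuma); this is illustration only, not part of the patch:

    /* Bind an anonymous buffer to node 0 and ask the kernel to migrate
     * any of its pages that already live elsewhere. */
    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            unsigned long nodemask = 1UL << 0;      /* node 0 */
            size_t len = 16 * 4096;
            void *buf = NULL;

            if (posix_memalign(&buf, 4096, len))    /* mbind() wants a page-aligned range */
                    return 1;
            if (mbind(buf, len, MPOL_BIND, &nodemask,
                      8 * sizeof(nodemask), MPOL_MF_MOVE) != 0)
                    perror("mbind");
            free(buf);
            return 0;
    }
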
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dc..f10c753dce 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 22obj-$(CONFIG_SLAB) += slab.o
23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
24obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
25obj-$(CONFIG_MIGRATION) += migrate.o
26
diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d4769..e8f58f7dd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,8 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include "filemap.h" 32#include "filemap.h"
33#include "internal.h"
34
33/* 35/*
34 * FIXME: remove all knowledge of the buffer layer from the core VM 36 * FIXME: remove all knowledge of the buffer layer from the core VM
35 */ 37 */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d..ebad6bbb35 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 15#include <linux/cpuset.h>
16#include <linux/mutex.h>
16 17
17#include <asm/page.h> 18#include <asm/page.h>
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include "internal.h"
21 23
22const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
23static unsigned long nr_huge_pages, free_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
24unsigned long max_huge_pages; 26unsigned long max_huge_pages;
25static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
26static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
27static unsigned int free_huge_pages_node[MAX_NUMNODES]; 29static unsigned int free_huge_pages_node[MAX_NUMNODES];
28
29/* 30/*
30 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 31 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
31 */ 32 */
32static DEFINE_SPINLOCK(hugetlb_lock); 33static DEFINE_SPINLOCK(hugetlb_lock);
33 34
35static void clear_huge_page(struct page *page, unsigned long addr)
36{
37 int i;
38
39 might_sleep();
40 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
41 cond_resched();
42 clear_user_highpage(page + i, addr);
43 }
44}
45
46static void copy_huge_page(struct page *dst, struct page *src,
47 unsigned long addr)
48{
49 int i;
50
51 might_sleep();
52 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
53 cond_resched();
54 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
55 }
56}
57
34static void enqueue_huge_page(struct page *page) 58static void enqueue_huge_page(struct page *page)
35{ 59{
36 int nid = page_to_nid(page); 60 int nid = page_to_nid(page);
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
64 return page; 88 return page;
65} 89}
66 90
67static struct page *alloc_fresh_huge_page(void) 91static void free_huge_page(struct page *page)
92{
93 BUG_ON(page_count(page));
94
95 INIT_LIST_HEAD(&page->lru);
96
97 spin_lock(&hugetlb_lock);
98 enqueue_huge_page(page);
99 spin_unlock(&hugetlb_lock);
100}
101
102static int alloc_fresh_huge_page(void)
68{ 103{
69 static int nid = 0; 104 static int nid = 0;
70 struct page *page; 105 struct page *page;
71 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, 106 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
72 HUGETLB_PAGE_ORDER); 107 HUGETLB_PAGE_ORDER);
73 nid = (nid + 1) % num_online_nodes(); 108 nid = next_node(nid, node_online_map);
109 if (nid == MAX_NUMNODES)
110 nid = first_node(node_online_map);
74 if (page) { 111 if (page) {
112 page[1].lru.next = (void *)free_huge_page; /* dtor */
75 spin_lock(&hugetlb_lock); 113 spin_lock(&hugetlb_lock);
76 nr_huge_pages++; 114 nr_huge_pages++;
77 nr_huge_pages_node[page_to_nid(page)]++; 115 nr_huge_pages_node[page_to_nid(page)]++;
78 spin_unlock(&hugetlb_lock); 116 spin_unlock(&hugetlb_lock);
117 put_page(page); /* free it into the hugepage allocator */
118 return 1;
79 } 119 }
80 return page; 120 return 0;
81} 121}
82 122
83void free_huge_page(struct page *page) 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr)
84{ 125{
85 BUG_ON(page_count(page)); 126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
86 130
87 INIT_LIST_HEAD(&page->lru); 131 spin_lock(&hugetlb_lock);
88 page[1].lru.next = NULL; /* reset dtor */ 132
133 if (vma->vm_flags & VM_MAYSHARE) {
134
135 /* idx = radix tree index, i.e. offset into file in
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159
160 page = dequeue_huge_page(vma, addr);
161 if (!page)
162 goto fail;
163
164 spin_unlock(&hugetlb_lock);
165 set_page_refcounted(page);
166 return page;
167
168 fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock);
171 return NULL;
172}
173
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
89 189
90 spin_lock(&hugetlb_lock); 190 spin_lock(&hugetlb_lock);
91 enqueue_huge_page(page); 191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
92 spin_unlock(&hugetlb_lock); 211 spin_unlock(&hugetlb_lock);
212
213 return ret;
93} 214}
94 215
95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
 220 * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
96{ 225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
97 struct page *page; 230 struct page *page;
98 int i;
99 231
100 spin_lock(&hugetlb_lock); 232 spin_lock(&hugetlb_lock);
101 page = dequeue_huge_page(vma, addr); 233 read_lock_irq(&inode->i_mapping->tree_lock);
102 if (!page) { 234
103 spin_unlock(&hugetlb_lock); 235 if (info->prereserved_hpages <= atmost)
104 return NULL; 236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
105 } 247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
106 spin_unlock(&hugetlb_lock); 255 spin_unlock(&hugetlb_lock);
107 set_page_count(page, 1);
108 page[1].lru.next = (void *)free_huge_page; /* set dtor */
109 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
110 clear_user_highpage(&page[i], addr);
111 return page;
112} 256}
113 257
114static int __init hugetlb_init(void) 258static int __init hugetlb_init(void)
115{ 259{
116 unsigned long i; 260 unsigned long i;
117 struct page *page;
118 261
119 if (HPAGE_SHIFT == 0) 262 if (HPAGE_SHIFT == 0)
120 return 0; 263 return 0;
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
123 INIT_LIST_HEAD(&hugepage_freelists[i]); 266 INIT_LIST_HEAD(&hugepage_freelists[i]);
124 267
125 for (i = 0; i < max_huge_pages; ++i) { 268 for (i = 0; i < max_huge_pages; ++i) {
126 page = alloc_fresh_huge_page(); 269 if (!alloc_fresh_huge_page())
127 if (!page)
128 break; 270 break;
129 spin_lock(&hugetlb_lock);
130 enqueue_huge_page(page);
131 spin_unlock(&hugetlb_lock);
132 } 271 }
133 max_huge_pages = free_huge_pages = nr_huge_pages = i; 272 max_huge_pages = free_huge_pages = nr_huge_pages = i;
134 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 273 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
154 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 293 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
155 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 294 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
156 1 << PG_private | 1<< PG_writeback); 295 1 << PG_private | 1<< PG_writeback);
157 set_page_count(&page[i], 0);
158 } 296 }
159 set_page_count(page, 1); 297 page[1].lru.next = NULL;
298 set_page_refcounted(page);
160 __free_pages(page, HUGETLB_PAGE_ORDER); 299 __free_pages(page, HUGETLB_PAGE_ORDER);
161} 300}
162 301
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
188static unsigned long set_max_huge_pages(unsigned long count) 327static unsigned long set_max_huge_pages(unsigned long count)
189{ 328{
190 while (count > nr_huge_pages) { 329 while (count > nr_huge_pages) {
191 struct page *page = alloc_fresh_huge_page(); 330 if (!alloc_fresh_huge_page())
192 if (!page)
193 return nr_huge_pages; 331 return nr_huge_pages;
194 spin_lock(&hugetlb_lock);
195 enqueue_huge_page(page);
196 spin_unlock(&hugetlb_lock);
197 } 332 }
198 if (count >= nr_huge_pages) 333 if (count >= nr_huge_pages)
199 return nr_huge_pages; 334 return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
225 return sprintf(buf, 360 return sprintf(buf,
226 "HugePages_Total: %5lu\n" 361 "HugePages_Total: %5lu\n"
227 "HugePages_Free: %5lu\n" 362 "HugePages_Free: %5lu\n"
363 "HugePages_Rsvd: %5lu\n"
228 "Hugepagesize: %5lu kB\n", 364 "Hugepagesize: %5lu kB\n",
229 nr_huge_pages, 365 nr_huge_pages,
230 free_huge_pages, 366 free_huge_pages,
367 reserved_huge_pages,
231 HPAGE_SIZE/1024); 368 HPAGE_SIZE/1024);
232} 369}
233 370
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
240 nid, free_huge_pages_node[nid]); 377 nid, free_huge_pages_node[nid]);
241} 378}
242 379
243int is_hugepage_mem_enough(size_t size)
244{
245 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
246}
247
248/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 380/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
249unsigned long hugetlb_total_pages(void) 381unsigned long hugetlb_total_pages(void)
250{ 382{
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
374 unsigned long address, pte_t *ptep, pte_t pte) 506 unsigned long address, pte_t *ptep, pte_t pte)
375{ 507{
376 struct page *old_page, *new_page; 508 struct page *old_page, *new_page;
377 int i, avoidcopy; 509 int avoidcopy;
378 510
379 old_page = pte_page(pte); 511 old_page = pte_page(pte);
380 512
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
395 } 527 }
396 528
397 spin_unlock(&mm->page_table_lock); 529 spin_unlock(&mm->page_table_lock);
398 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) 530 copy_huge_page(new_page, old_page, address);
399 copy_user_highpage(new_page + i, old_page + i,
400 address + i*PAGE_SIZE);
401 spin_lock(&mm->page_table_lock); 531 spin_lock(&mm->page_table_lock);
402 532
403 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 533 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
442 ret = VM_FAULT_OOM; 572 ret = VM_FAULT_OOM;
443 goto out; 573 goto out;
444 } 574 }
575 clear_huge_page(page, address);
445 576
446 if (vma->vm_flags & VM_SHARED) { 577 if (vma->vm_flags & VM_SHARED) {
447 int err; 578 int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 pte_t *ptep; 627 pte_t *ptep;
497 pte_t entry; 628 pte_t entry;
498 int ret; 629 int ret;
630 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
499 631
500 ptep = huge_pte_alloc(mm, address); 632 ptep = huge_pte_alloc(mm, address);
501 if (!ptep) 633 if (!ptep)
502 return VM_FAULT_OOM; 634 return VM_FAULT_OOM;
503 635
636 /*
637 * Serialize hugepage allocation and instantiation, so that we don't
638 * get spurious allocation failures if two CPUs race to instantiate
639 * the same page in the page cache.
640 */
641 mutex_lock(&hugetlb_instantiation_mutex);
504 entry = *ptep; 642 entry = *ptep;
505 if (pte_none(entry)) 643 if (pte_none(entry)) {
506 return hugetlb_no_page(mm, vma, address, ptep, write_access); 644 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
645 mutex_unlock(&hugetlb_instantiation_mutex);
646 return ret;
647 }
507 648
508 ret = VM_FAULT_MINOR; 649 ret = VM_FAULT_MINOR;
509 650
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
513 if (write_access && !pte_write(entry)) 654 if (write_access && !pte_write(entry))
514 ret = hugetlb_cow(mm, vma, address, ptep, entry); 655 ret = hugetlb_cow(mm, vma, address, ptep, entry);
515 spin_unlock(&mm->page_table_lock); 656 spin_unlock(&mm->page_table_lock);
657 mutex_unlock(&hugetlb_instantiation_mutex);
516 658
517 return ret; 659 return ret;
518} 660}
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
521 struct page **pages, struct vm_area_struct **vmas, 663 struct page **pages, struct vm_area_struct **vmas,
522 unsigned long *position, int *length, int i) 664 unsigned long *position, int *length, int i)
523{ 665{
524 unsigned long vpfn, vaddr = *position; 666 unsigned long pfn_offset;
667 unsigned long vaddr = *position;
525 int remainder = *length; 668 int remainder = *length;
526 669
527 vpfn = vaddr/PAGE_SIZE;
528 spin_lock(&mm->page_table_lock); 670 spin_lock(&mm->page_table_lock);
529 while (vaddr < vma->vm_end && remainder) { 671 while (vaddr < vma->vm_end && remainder) {
530 pte_t *pte; 672 pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
552 break; 694 break;
553 } 695 }
554 696
555 if (pages) { 697 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
556 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 698 page = pte_page(*pte);
557 get_page(page); 699same_page:
558 pages[i] = page; 700 get_page(page);
559 } 701 if (pages)
702 pages[i] = page + pfn_offset;
560 703
561 if (vmas) 704 if (vmas)
562 vmas[i] = vma; 705 vmas[i] = vma;
563 706
564 vaddr += PAGE_SIZE; 707 vaddr += PAGE_SIZE;
565 ++vpfn; 708 ++pfn_offset;
566 --remainder; 709 --remainder;
567 ++i; 710 ++i;
711 if (vaddr < vma->vm_end && remainder &&
712 pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
713 /*
714 * We use pfn_offset to avoid touching the pageframes
715 * of this compound page.
716 */
717 goto same_page;
718 }
568 } 719 }
569 spin_unlock(&mm->page_table_lock); 720 spin_unlock(&mm->page_table_lock);
570 *length = remainder; 721 *length = remainder;
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
572 723
573 return i; 724 return i;
574} 725}
726
727void hugetlb_change_protection(struct vm_area_struct *vma,
728 unsigned long address, unsigned long end, pgprot_t newprot)
729{
730 struct mm_struct *mm = vma->vm_mm;
731 unsigned long start = address;
732 pte_t *ptep;
733 pte_t pte;
734
735 BUG_ON(address >= end);
736 flush_cache_range(vma, address, end);
737
738 spin_lock(&mm->page_table_lock);
739 for (; address < end; address += HPAGE_SIZE) {
740 ptep = huge_pte_offset(mm, address);
741 if (!ptep)
742 continue;
743 if (!pte_none(*ptep)) {
744 pte = huge_ptep_get_and_clear(mm, address, ptep);
745 pte = pte_mkhuge(pte_modify(pte, newprot));
746 set_huge_pte_at(mm, address, ptep, pte);
747 lazy_mmu_prot_update(pte);
748 }
749 }
750 spin_unlock(&mm->page_table_lock);
751
752 flush_tlb_range(vma, start, end);
753}
754
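
Taken together, the hugetlb.c hunks above split the free hugepage pool into a truly free part and a reserved part that only faults on prereserved file offsets may consume. The following is a toy user-space model of that bookkeeping, not kernel code; the names and the pool size are invented:

    #include <stdio.h>

    static unsigned long free_hpages = 8;
    static unsigned long reserved_hpages;

    static int extend_reservation(unsigned long want)   /* cf. hugetlb_extend_reservation() */
    {
            if (reserved_hpages + want > free_hpages)
                    return -1;                           /* -ENOMEM in the kernel */
            reserved_hpages += want;
            return 0;
    }

    static int alloc_hpage(int use_reserve)              /* cf. alloc_huge_page() */
    {
            if (use_reserve) {
                    reserved_hpages--;                    /* fault on a prereserved index */
            } else if (free_hpages <= reserved_hpages) {
                    return -1;                            /* would eat someone's reservation */
            }
            free_hpages--;
            return 0;
    }

    int main(void)
    {
            extend_reservation(6);        /* e.g. mmap() of a six-page hugetlbfs file */
            printf("free=%lu reserved=%lu\n", free_hpages, reserved_hpages);
            alloc_hpage(1);               /* first fault draws from the reserve */
            alloc_hpage(0);               /* unreserved fault: 7 free > 5 reserved, allowed */
            printf("free=%lu reserved=%lu\n", free_hpages, reserved_hpages);
            return 0;
    }
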
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4..d20e3cc4ae 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,23 +8,33 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11#ifndef __MM_INTERNAL_H
12#define __MM_INTERNAL_H
11 13
12static inline void set_page_refs(struct page *page, int order) 14#include <linux/mm.h>
15
16static inline void set_page_count(struct page *page, int v)
17{
18 atomic_set(&page->_count, v);
19}
20
21/*
22 * Turn a non-refcounted page (->_count == 0) into refcounted with
23 * a count of one.
24 */
25static inline void set_page_refcounted(struct page *page)
13{ 26{
14#ifdef CONFIG_MMU 27 BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
28 BUG_ON(atomic_read(&page->_count));
15 set_page_count(page, 1); 29 set_page_count(page, 1);
16#else 30}
17 int i;
18 31
19 /* 32static inline void __put_page(struct page *page)
20 * We need to reference all the pages for this order, otherwise if 33{
21 * anyone accesses one of the pages with (get/put) it will be freed. 34 atomic_dec(&page->_count);
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27} 35}
28 36
29extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order); 38 unsigned int order);
39
40#endif
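
The two helpers formalize a convention relied on throughout this patch: a page sitting in an allocator or on the hugepage free lists keeps _count at zero, and a single set_page_refcounted() hands out the first reference. A toy user-space model of that invariant (all names invented, C11 atomics standing in for the kernel's atomic_t):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct toy_page { atomic_int count; };

    /* cf. set_page_refcounted(): only legal on a page nobody references */
    static void toy_set_page_refcounted(struct toy_page *p)
    {
            assert(atomic_load(&p->count) == 0);
            atomic_store(&p->count, 1);
    }

    /* cf. __put_page(): drop a reference without freeing */
    static void toy_put_page(struct toy_page *p)
    {
            atomic_fetch_sub(&p->count, 1);
    }

    int main(void)
    {
            struct toy_page page = { 0 };

            toy_set_page_refcounted(&page);   /* allocator hands the page out */
            toy_put_page(&page);              /* last user drops it: back to 0 */
            printf("count=%d\n", atomic_load(&page.count));
            return 0;
    }
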
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db..80c3fb370f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
277 anon_vma_unlink(vma); 277 anon_vma_unlink(vma);
278 unlink_file_vma(vma); 278 unlink_file_vma(vma);
279 279
280 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 280 if (is_vm_hugetlb_page(vma)) {
281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
282 floor, next? next->vm_start: ceiling); 282 floor, next? next->vm_start: ceiling);
283 } else { 283 } else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
285 * Optimization: gather nearby vmas into one call down 285 * Optimization: gather nearby vmas into one call down
286 */ 286 */
287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
288 && !is_hugepage_only_range(vma->vm_mm, next->vm_start, 288 && !is_vm_hugetlb_page(next)) {
289 HPAGE_SIZE)) {
290 vma = next; 289 vma = next;
291 next = vma->vm_next; 290 next = vma->vm_next;
292 anon_vma_unlink(vma); 291 anon_vma_unlink(vma);
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
388{ 387{
389 unsigned long pfn = pte_pfn(pte); 388 unsigned long pfn = pte_pfn(pte);
390 389
391 if (vma->vm_flags & VM_PFNMAP) { 390 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
392 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 391 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
393 if (pfn == vma->vm_pgoff + off) 392 if (pfn == vma->vm_pgoff + off)
394 return NULL; 393 return NULL;
@@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
396 return NULL; 395 return NULL;
397 } 396 }
398 397
399 /* 398#ifdef CONFIG_DEBUG_VM
400 * Add some anal sanity checks for now. Eventually,
401 * we should just do "return pfn_to_page(pfn)", but
402 * in the meantime we check that we get a valid pfn,
403 * and that the resulting page looks ok.
404 *
405 * Remove this test eventually!
406 */
407 if (unlikely(!pfn_valid(pfn))) { 399 if (unlikely(!pfn_valid(pfn))) {
408 print_bad_pte(vma, pte, addr); 400 print_bad_pte(vma, pte, addr);
409 return NULL; 401 return NULL;
410 } 402 }
403#endif
411 404
412 /* 405 /*
413 * NOTE! We still have PageReserved() pages in the page 406 * NOTE! We still have PageReserved() pages in the page
@@ -1221,9 +1214,7 @@ out:
1221 * The page has to be a nice clean _individual_ kernel allocation. 1214 * The page has to be a nice clean _individual_ kernel allocation.
1222 * If you allocate a compound page, you need to have marked it as 1215 * If you allocate a compound page, you need to have marked it as
1223 * such (__GFP_COMP), or manually just split the page up yourself 1216 * such (__GFP_COMP), or manually just split the page up yourself
1224 * (which is mainly an issue of doing "set_page_count(page, 1)" for 1217 * (see split_page()).
1225 * each sub-page, and then freeing them one by one when you free
1226 * them rather than freeing it as a compound page).
1227 * 1218 *
1228 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1219 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1229 * took an arbitrary page protection parameter. This doesn't allow 1220 * took an arbitrary page protection parameter. This doesn't allow
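
The shortened comment above refers to split_page(), which this same patch introduces in mm/page_alloc.c further down. A hypothetical in-kernel user, sketched here as a trivial module and assuming the split_page() prototype is visible via <linux/mm.h> (whether the symbol is exported to modules is not shown by these hunks):

    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/mm.h>
    #include <linux/gfp.h>

    static int __init split_demo_init(void)
    {
            struct page *block = alloc_pages(GFP_KERNEL, 2); /* four contiguous pages */
            int i;

            if (!block)
                    return -ENOMEM;

            split_page(block, 2);           /* each sub-page now carries its own count of 1 */

            for (i = 0; i < 4; i++)
                    __free_page(block + i); /* legal only because of the split */
            return 0;
    }

    static void __exit split_demo_exit(void)
    {
    }

    module_init(split_demo_init);
    module_exit(split_demo_exit);
    MODULE_LICENSE("GPL");
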
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f..e93cc740c2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
86#include <linux/swap.h> 86#include <linux/swap.h>
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h>
89 90
90#include <asm/tlbflush.h> 91#include <asm/tlbflush.h>
91#include <asm/uaccess.h> 92#include <asm/uaccess.h>
@@ -95,11 +96,8 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 96#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 97#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 98
98/* The number of pages to migrate per call to migrate_pages() */ 99static struct kmem_cache *policy_cache;
99#define MIGRATE_CHUNK_SIZE 256 100static struct kmem_cache *sn_cache;
100
101static kmem_cache_t *policy_cache;
102static kmem_cache_t *sn_cache;
103 101
104#define PDprintk(fmt...) 102#define PDprintk(fmt...)
105 103
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 struct vm_area_struct *first, *vma, *prev; 329 struct vm_area_struct *first, *vma, *prev;
332 330
333 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 331 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
334 /* Must have swap device for migration */
335 if (nr_swap_pages <= 0)
336 return ERR_PTR(-ENODEV);
337 332
338 /* 333 err = migrate_prep();
339 * Clear the LRU lists so pages can be isolated. 334 if (err)
340 * Note that pages may be moved off the LRU after we have 335 return ERR_PTR(err);
341 * drained them. Those pages will fail to migrate like other
342 * pages that may be busy.
343 */
344 lru_add_drain_all();
345 } 336 }
346 337
347 first = find_vma(mm, start); 338 first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
550 return err; 541 return err;
551} 542}
552 543
544#ifdef CONFIG_MIGRATION
553/* 545/*
554 * page migration 546 * page migration
555 */ 547 */
556
557static void migrate_page_add(struct page *page, struct list_head *pagelist, 548static void migrate_page_add(struct page *page, struct list_head *pagelist,
558 unsigned long flags) 549 unsigned long flags)
559{ 550{
560 /* 551 /*
561 * Avoid migrating a page that is shared with others. 552 * Avoid migrating a page that is shared with others.
562 */ 553 */
563 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 554 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
564 if (isolate_lru_page(page)) 555 isolate_lru_page(page, pagelist);
565 list_add_tail(&page->lru, pagelist);
566 }
567}
568
569/*
570 * Migrate the list 'pagelist' of pages to a certain destination.
571 *
572 * Specify destination with either non-NULL vma or dest_node >= 0
573 * Return the number of pages not migrated or error code
574 */
575static int migrate_pages_to(struct list_head *pagelist,
576 struct vm_area_struct *vma, int dest)
577{
578 LIST_HEAD(newlist);
579 LIST_HEAD(moved);
580 LIST_HEAD(failed);
581 int err = 0;
582 unsigned long offset = 0;
583 int nr_pages;
584 struct page *page;
585 struct list_head *p;
586
587redo:
588 nr_pages = 0;
589 list_for_each(p, pagelist) {
590 if (vma) {
591 /*
592 * The address passed to alloc_page_vma is used to
593 * generate the proper interleave behavior. We fake
594 * the address here by an increasing offset in order
595 * to get the proper distribution of pages.
596 *
597 * No decision has been made as to which page
598 * a certain old page is moved to so we cannot
599 * specify the correct address.
600 */
601 page = alloc_page_vma(GFP_HIGHUSER, vma,
602 offset + vma->vm_start);
603 offset += PAGE_SIZE;
604 }
605 else
606 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
607
608 if (!page) {
609 err = -ENOMEM;
610 goto out;
611 }
612 list_add_tail(&page->lru, &newlist);
613 nr_pages++;
614 if (nr_pages > MIGRATE_CHUNK_SIZE)
615 break;
616 }
617 err = migrate_pages(pagelist, &newlist, &moved, &failed);
618
619 putback_lru_pages(&moved); /* Call release pages instead ?? */
620
621 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
622 goto redo;
623out:
624 /* Return leftover allocated pages */
625 while (!list_empty(&newlist)) {
626 page = list_entry(newlist.next, struct page, lru);
627 list_del(&page->lru);
628 __free_page(page);
629 }
630 list_splice(&failed, pagelist);
631 if (err < 0)
632 return err;
633
634 /* Calculate number of leftover pages */
635 nr_pages = 0;
636 list_for_each(p, pagelist)
637 nr_pages++;
638 return nr_pages;
639} 556}
640 557
641/* 558/*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
742 if (err < 0) 659 if (err < 0)
743 return err; 660 return err;
744 return busy; 661 return busy;
662
745} 663}
746 664
665#else
666
667static void migrate_page_add(struct page *page, struct list_head *pagelist,
668 unsigned long flags)
669{
670}
671
672int do_migrate_pages(struct mm_struct *mm,
673 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
674{
675 return -ENOSYS;
676}
677#endif
678
747long do_mbind(unsigned long start, unsigned long len, 679long do_mbind(unsigned long start, unsigned long len,
748 unsigned long mode, nodemask_t *nmask, unsigned long flags) 680 unsigned long mode, nodemask_t *nmask, unsigned long flags)
749{ 681{
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
808 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 740 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
809 err = -EIO; 741 err = -EIO;
810 } 742 }
743
811 if (!list_empty(&pagelist)) 744 if (!list_empty(&pagelist))
812 putback_lru_pages(&pagelist); 745 putback_lru_pages(&pagelist);
813 746
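
do_migrate_pages() above is the kernel backend of the migrate_pages(2) system call; with CONFIG_MIGRATION disabled the new stub simply reports -ENOSYS. A hedged user-space sketch of the call, assuming libnuma's <numaif.h> wrapper and made-up node numbers:

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            unsigned long from = 1UL << 0;   /* move pages away from node 0 ... */
            unsigned long to   = 1UL << 1;   /* ... and onto node 1 */
            int pid = (argc > 1) ? atoi(argv[1]) : 0;  /* 0 means the calling process */
            long left = migrate_pages(pid, 8 * sizeof(unsigned long), &from, &to);

            if (left < 0)
                    perror("migrate_pages");
            else
                    printf("%ld pages could not be moved\n", left);
            return 0;
    }
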
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480..f71893ed35 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free);
278 */ 278 */
279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
280{ 280{
281 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 281 struct kmem_cache *mem = pool_data;
282 return kmem_cache_alloc(mem, gfp_mask); 282 return kmem_cache_alloc(mem, gfp_mask);
283} 283}
284EXPORT_SYMBOL(mempool_alloc_slab); 284EXPORT_SYMBOL(mempool_alloc_slab);
285 285
286void mempool_free_slab(void *element, void *pool_data) 286void mempool_free_slab(void *element, void *pool_data)
287{ 287{
288 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 288 struct kmem_cache *mem = pool_data;
289 kmem_cache_free(mem, element); 289 kmem_cache_free(mem, element);
290} 290}
291EXPORT_SYMBOL(mempool_free_slab); 291EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 0000000000..09f6e4aa87
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
1/*
2 * Memory Migration functionality - linux/mm/migration.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com>
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/buffer_head.h> /* for try_to_release_page(),
20 buffer_heads_over_limit */
21#include <linux/mm_inline.h>
22#include <linux/pagevec.h>
23#include <linux/rmap.h>
24#include <linux/topology.h>
25#include <linux/cpu.h>
26#include <linux/cpuset.h>
27#include <linux/swapops.h>
28
29#include "internal.h"
30
31#include "internal.h"
32
33/* The maximum number of pages to take off the LRU for migration */
34#define MIGRATE_CHUNK_SIZE 256
35
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37
38/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page)) {
55 ret = 0;
56 get_page(page);
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called after we have compiled the list of pages
71 * to be migrated using isolate_lru_page() but before we begin a series of calls
72 * to migrate_pages().
73 */
74int migrate_prep(void)
75{
76 /* Must have swap device for migration */
77 if (nr_swap_pages <= 0)
78 return -ENODEV;
79
80 /*
81 * Clear the LRU lists so pages can be isolated.
82 * Note that pages may be moved off the LRU after we have
83 * drained them. Those pages will fail to migrate like other
84 * pages that may be busy.
85 */
86 lru_add_drain_all();
87
88 return 0;
89}
90
91static inline void move_to_lru(struct page *page)
92{
93 list_del(&page->lru);
94 if (PageActive(page)) {
95 /*
96 * lru_cache_add_active checks that
97 * the PG_active bit is off.
98 */
99 ClearPageActive(page);
100 lru_cache_add_active(page);
101 } else {
102 lru_cache_add(page);
103 }
104 put_page(page);
105}
106
107/*
108 * Add isolated pages on the list back to the LRU.
109 *
110 * returns the number of pages put back.
111 */
112int putback_lru_pages(struct list_head *l)
113{
114 struct page *page;
115 struct page *page2;
116 int count = 0;
117
118 list_for_each_entry_safe(page, page2, l, lru) {
119 move_to_lru(page);
120 count++;
121 }
122 return count;
123}
124
125/*
126 * Non migratable page
127 */
128int fail_migrate_page(struct page *newpage, struct page *page)
129{
130 return -EIO;
131}
132EXPORT_SYMBOL(fail_migrate_page);
133
134/*
135 * swapout a single page
136 * page is locked upon entry, unlocked on exit
137 */
138static int swap_page(struct page *page)
139{
140 struct address_space *mapping = page_mapping(page);
141
142 if (page_mapped(page) && mapping)
143 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
144 goto unlock_retry;
145
146 if (PageDirty(page)) {
147 /* Page is dirty, try to write it out here */
148 switch(pageout(page, mapping)) {
149 case PAGE_KEEP:
150 case PAGE_ACTIVATE:
151 goto unlock_retry;
152
153 case PAGE_SUCCESS:
154 goto retry;
155
156 case PAGE_CLEAN:
157 ; /* try to free the page below */
158 }
159 }
160
161 if (PagePrivate(page)) {
162 if (!try_to_release_page(page, GFP_KERNEL) ||
163 (!mapping && page_count(page) == 1))
164 goto unlock_retry;
165 }
166
167 if (remove_mapping(mapping, page)) {
168 /* Success */
169 unlock_page(page);
170 return 0;
171 }
172
173unlock_retry:
174 unlock_page(page);
175
176retry:
177 return -EAGAIN;
178}
179EXPORT_SYMBOL(swap_page);
180
181/*
182 * Remove references for a page and establish the new page with the correct
183 * basic settings to be able to stop accesses to the page.
184 */
185int migrate_page_remove_references(struct page *newpage,
186 struct page *page, int nr_refs)
187{
188 struct address_space *mapping = page_mapping(page);
189 struct page **radix_pointer;
190
191 /*
192 * Avoid doing any of the following work if the page count
193 * indicates that the page is in use or truncate has removed
194 * the page.
195 */
196 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
197 return -EAGAIN;
198
199 /*
200 * Establish swap ptes for anonymous pages or destroy pte
201 * maps for files.
202 *
203 * In order to reestablish file backed mappings the fault handlers
204 * will take the radix tree_lock which may then be used to stop
 205 * processes from accessing this page until the new page is ready.
206 *
207 * A process accessing via a swap pte (an anonymous page) will take a
208 * page_lock on the old page which will block the process until the
209 * migration attempt is complete. At that time the PageSwapCache bit
210 * will be examined. If the page was migrated then the PageSwapCache
211 * bit will be clear and the operation to retrieve the page will be
212 * retried which will find the new page in the radix tree. Then a new
213 * direct mapping may be generated based on the radix tree contents.
214 *
215 * If the page was not migrated then the PageSwapCache bit
216 * is still set and the operation may continue.
217 */
218 if (try_to_unmap(page, 1) == SWAP_FAIL)
219 /* A vma has VM_LOCKED set -> permanent failure */
220 return -EPERM;
221
222 /*
223 * Give up if we were unable to remove all mappings.
224 */
225 if (page_mapcount(page))
226 return -EAGAIN;
227
228 write_lock_irq(&mapping->tree_lock);
229
230 radix_pointer = (struct page **)radix_tree_lookup_slot(
231 &mapping->page_tree,
232 page_index(page));
233
234 if (!page_mapping(page) || page_count(page) != nr_refs ||
235 *radix_pointer != page) {
236 write_unlock_irq(&mapping->tree_lock);
237 return 1;
238 }
239
240 /*
241 * Now we know that no one else is looking at the page.
242 *
243 * Certain minimal information about a page must be available
244 * in order for other subsystems to properly handle the page if they
245 * find it through the radix tree update before we are finished
246 * copying the page.
247 */
248 get_page(newpage);
249 newpage->index = page->index;
250 newpage->mapping = page->mapping;
251 if (PageSwapCache(page)) {
252 SetPageSwapCache(newpage);
253 set_page_private(newpage, page_private(page));
254 }
255
256 *radix_pointer = newpage;
257 __put_page(page);
258 write_unlock_irq(&mapping->tree_lock);
259
260 return 0;
261}
262EXPORT_SYMBOL(migrate_page_remove_references);
263
264/*
265 * Copy the page to its new location
266 */
267void migrate_page_copy(struct page *newpage, struct page *page)
268{
269 copy_highpage(newpage, page);
270
271 if (PageError(page))
272 SetPageError(newpage);
273 if (PageReferenced(page))
274 SetPageReferenced(newpage);
275 if (PageUptodate(page))
276 SetPageUptodate(newpage);
277 if (PageActive(page))
278 SetPageActive(newpage);
279 if (PageChecked(page))
280 SetPageChecked(newpage);
281 if (PageMappedToDisk(page))
282 SetPageMappedToDisk(newpage);
283
284 if (PageDirty(page)) {
285 clear_page_dirty_for_io(page);
286 set_page_dirty(newpage);
287 }
288
289 ClearPageSwapCache(page);
290 ClearPageActive(page);
291 ClearPagePrivate(page);
292 set_page_private(page, 0);
293 page->mapping = NULL;
294
295 /*
296 * If any waiters have accumulated on the new page then
297 * wake them up.
298 */
299 if (PageWriteback(newpage))
300 end_page_writeback(newpage);
301}
302EXPORT_SYMBOL(migrate_page_copy);
303
304/*
305 * Common logic to directly migrate a single page suitable for
306 * pages that do not use PagePrivate.
307 *
308 * Pages are locked upon entry and exit.
309 */
310int migrate_page(struct page *newpage, struct page *page)
311{
312 int rc;
313
314 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
315
316 rc = migrate_page_remove_references(newpage, page, 2);
317
318 if (rc)
319 return rc;
320
321 migrate_page_copy(newpage, page);
322
323 /*
324 * Remove auxiliary swap entries and replace
325 * them with real ptes.
326 *
327 * Note that a real pte entry will allow processes that are not
328 * waiting on the page lock to use the new page via the page tables
329 * before the new page is unlocked.
330 */
331 remove_from_swap(newpage);
332 return 0;
333}
334EXPORT_SYMBOL(migrate_page);
335
336/*
337 * migrate_pages
338 *
339 * Two lists are passed to this function. The first list
340 * contains the pages isolated from the LRU to be migrated.
341 * The second list contains new pages that the pages isolated
342 * can be moved to. If the second list is NULL then all
343 * pages are swapped out.
344 *
 345 * The function returns after 10 attempts, or earlier if no pages
 346 * are movable any more because the 'to' list has become empty
 347 * or no retryable pages remain.
348 *
349 * Return: Number of pages not migrated when "to" ran empty.
350 */
351int migrate_pages(struct list_head *from, struct list_head *to,
352 struct list_head *moved, struct list_head *failed)
353{
354 int retry;
355 int nr_failed = 0;
356 int pass = 0;
357 struct page *page;
358 struct page *page2;
359 int swapwrite = current->flags & PF_SWAPWRITE;
360 int rc;
361
362 if (!swapwrite)
363 current->flags |= PF_SWAPWRITE;
364
365redo:
366 retry = 0;
367
368 list_for_each_entry_safe(page, page2, from, lru) {
369 struct page *newpage = NULL;
370 struct address_space *mapping;
371
372 cond_resched();
373
374 rc = 0;
375 if (page_count(page) == 1)
376 /* page was freed from under us. So we are done. */
377 goto next;
378
379 if (to && list_empty(to))
380 break;
381
382 /*
383 * Skip locked pages during the first two passes to give the
384 * functions holding the lock time to release the page. Later we
385 * use lock_page() to have a higher chance of acquiring the
386 * lock.
387 */
388 rc = -EAGAIN;
389 if (pass > 2)
390 lock_page(page);
391 else
392 if (TestSetPageLocked(page))
393 goto next;
394
395 /*
396 * Only wait on writeback if we have already done a pass where
 397 * we may have triggered writeouts for lots of pages.
398 */
399 if (pass > 0) {
400 wait_on_page_writeback(page);
401 } else {
402 if (PageWriteback(page))
403 goto unlock_page;
404 }
405
406 /*
407 * Anonymous pages must have swap cache references otherwise
408 * the information contained in the page maps cannot be
409 * preserved.
410 */
411 if (PageAnon(page) && !PageSwapCache(page)) {
412 if (!add_to_swap(page, GFP_KERNEL)) {
413 rc = -ENOMEM;
414 goto unlock_page;
415 }
416 }
417
418 if (!to) {
419 rc = swap_page(page);
420 goto next;
421 }
422
423 newpage = lru_to_page(to);
424 lock_page(newpage);
425
426 /*
427 * Pages are properly locked and writeback is complete.
428 * Try to migrate the page.
429 */
430 mapping = page_mapping(page);
431 if (!mapping)
432 goto unlock_both;
433
434 if (mapping->a_ops->migratepage) {
435 /*
436 * Most pages have a mapping and most filesystems
437 * should provide a migration function. Anonymous
438 * pages are part of swap space which also has its
439 * own migration function. This is the most common
440 * path for page migration.
441 */
442 rc = mapping->a_ops->migratepage(newpage, page);
443 goto unlock_both;
444 }
445
446 /*
447 * Default handling if a filesystem does not provide
448 * a migration function. We can only migrate clean
449 * pages so try to write out any dirty pages first.
450 */
451 if (PageDirty(page)) {
452 switch (pageout(page, mapping)) {
453 case PAGE_KEEP:
454 case PAGE_ACTIVATE:
455 goto unlock_both;
456
457 case PAGE_SUCCESS:
458 unlock_page(newpage);
459 goto next;
460
461 case PAGE_CLEAN:
462 ; /* try to migrate the page below */
463 }
464 }
465
466 /*
467 * Buffers are managed in a filesystem specific way.
468 * We must have no buffers or drop them.
469 */
470 if (!page_has_buffers(page) ||
471 try_to_release_page(page, GFP_KERNEL)) {
472 rc = migrate_page(newpage, page);
473 goto unlock_both;
474 }
475
476 /*
477 * On early passes with mapped pages simply
478 * retry. There may be a lock held for some
479 * buffers that may go away. Later
480 * swap them out.
481 */
482 if (pass > 4) {
483 /*
484 * Persistently unable to drop buffers..... As a
485 * measure of last resort we fall back to
486 * swap_page().
487 */
488 unlock_page(newpage);
489 newpage = NULL;
490 rc = swap_page(page);
491 goto next;
492 }
493
494unlock_both:
495 unlock_page(newpage);
496
497unlock_page:
498 unlock_page(page);
499
500next:
501 if (rc == -EAGAIN) {
502 retry++;
503 } else if (rc) {
504 /* Permanent failure */
505 list_move(&page->lru, failed);
506 nr_failed++;
507 } else {
508 if (newpage) {
509 /* Successful migration. Return page to LRU */
510 move_to_lru(newpage);
511 }
512 list_move(&page->lru, moved);
513 }
514 }
515 if (retry && pass++ < 10)
516 goto redo;
517
518 if (!swapwrite)
519 current->flags &= ~PF_SWAPWRITE;
520
521 return nr_failed + retry;
522}
523
524/*
525 * Migration function for pages with buffers. This function can only be used
526 * if the underlying filesystem guarantees that no other references to "page"
527 * exist.
528 */
529int buffer_migrate_page(struct page *newpage, struct page *page)
530{
531 struct address_space *mapping = page->mapping;
532 struct buffer_head *bh, *head;
533 int rc;
534
535 if (!mapping)
536 return -EAGAIN;
537
538 if (!page_has_buffers(page))
539 return migrate_page(newpage, page);
540
541 head = page_buffers(page);
542
543 rc = migrate_page_remove_references(newpage, page, 3);
544
545 if (rc)
546 return rc;
547
548 bh = head;
549 do {
550 get_bh(bh);
551 lock_buffer(bh);
552 bh = bh->b_this_page;
553
554 } while (bh != head);
555
556 ClearPagePrivate(page);
557 set_page_private(newpage, page_private(page));
558 set_page_private(page, 0);
559 put_page(page);
560 get_page(newpage);
561
562 bh = head;
563 do {
564 set_bh_page(bh, newpage, bh_offset(bh));
565 bh = bh->b_this_page;
566
567 } while (bh != head);
568
569 SetPagePrivate(newpage);
570
571 migrate_page_copy(newpage, page);
572
573 bh = head;
574 do {
575 unlock_buffer(bh);
576 put_bh(bh);
577 bh = bh->b_this_page;
578
579 } while (bh != head);
580
581 return 0;
582}
583EXPORT_SYMBOL(buffer_migrate_page);
584
585/*
586 * Migrate the list 'pagelist' of pages to a certain destination.
587 *
588 * Specify destination with either non-NULL vma or dest_node >= 0
589 * Return the number of pages not migrated or error code
590 */
591int migrate_pages_to(struct list_head *pagelist,
592 struct vm_area_struct *vma, int dest)
593{
594 LIST_HEAD(newlist);
595 LIST_HEAD(moved);
596 LIST_HEAD(failed);
597 int err = 0;
598 unsigned long offset = 0;
599 int nr_pages;
600 struct page *page;
601 struct list_head *p;
602
603redo:
604 nr_pages = 0;
605 list_for_each(p, pagelist) {
606 if (vma) {
607 /*
608 * The address passed to alloc_page_vma is used to
609 * generate the proper interleave behavior. We fake
610 * the address here by an increasing offset in order
611 * to get the proper distribution of pages.
612 *
613 * No decision has been made as to which page
614 * a certain old page is moved to so we cannot
615 * specify the correct address.
616 */
617 page = alloc_page_vma(GFP_HIGHUSER, vma,
618 offset + vma->vm_start);
619 offset += PAGE_SIZE;
620 }
621 else
622 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
623
624 if (!page) {
625 err = -ENOMEM;
626 goto out;
627 }
628 list_add_tail(&page->lru, &newlist);
629 nr_pages++;
630 if (nr_pages > MIGRATE_CHUNK_SIZE)
631 break;
632 }
633 err = migrate_pages(pagelist, &newlist, &moved, &failed);
634
635 putback_lru_pages(&moved); /* Call release pages instead ?? */
636
637 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
638 goto redo;
639out:
640 /* Return leftover allocated pages */
641 while (!list_empty(&newlist)) {
642 page = list_entry(newlist.next, struct page, lru);
643 list_del(&page->lru);
644 __free_page(page);
645 }
646 list_splice(&failed, pagelist);
647 if (err < 0)
648 return err;
649
650 /* Calculate number of leftover pages */
651 nr_pages = 0;
652 list_for_each(p, pagelist)
653 nr_pages++;
654 return nr_pages;
655}
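
The helpers exported above (migrate_page(), buffer_migrate_page(), fail_migrate_page()) are meant to be plugged into the address_space_operations hook that migrate_pages() consults. A sketch of how a filesystem might wire this up; the filesystem name is invented and the header carrying the prototypes is assumed to be <linux/migrate.h>:

    #include <linux/fs.h>
    #include <linux/migrate.h>

    /* ...readpage/writepage and the rest of the aops omitted... */
    static struct address_space_operations examplefs_aops = {
            /* Pages backed by buffer_heads can use the generic buffer walker;
             * a filesystem with no private page state would pick migrate_page(),
             * and one whose pages must never move would pick fail_migrate_page(). */
            .migratepage    = buffer_migrate_page,
    };
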
diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e..0eb9894db6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
612 * If the vma has a ->close operation then the driver probably needs to release 612 * If the vma has a ->close operation then the driver probably needs to release
613 * per-vma resources, so we don't attempt to merge those. 613 * per-vma resources, so we don't attempt to merge those.
614 */ 614 */
615#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) 615#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
616 616
617static inline int is_mergeable_vma(struct vm_area_struct *vma, 617static inline int is_mergeable_vma(struct vm_area_struct *vma,
618 struct file *file, unsigned long vm_flags) 618 struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
845 const unsigned long stack_flags 845 const unsigned long stack_flags
846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
847 847
848#ifdef CONFIG_HUGETLB
849 if (flags & VM_HUGETLB) {
850 if (!(flags & VM_DONTCOPY))
851 mm->shared_vm += pages;
852 return;
853 }
854#endif /* CONFIG_HUGETLB */
855
856 if (file) { 848 if (file) {
857 mm->shared_vm += pages; 849 mm->shared_vm += pages;
858 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 850 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1..4c14d4289b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
124 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
125 */ 125 */
126 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
128 charged = nrpages; 128 charged = nrpages;
129 if (security_vm_enough_memory(charged)) 129 if (security_vm_enough_memory(charged))
130 return -ENOMEM; 130 return -ENOMEM;
@@ -166,7 +166,10 @@ success:
166 */ 166 */
167 vma->vm_flags = newflags; 167 vma->vm_flags = newflags;
168 vma->vm_page_prot = newprot; 168 vma->vm_page_prot = newprot;
169 change_protection(vma, start, end, newprot); 169 if (is_vm_hugetlb_page(vma))
170 hugetlb_change_protection(vma, start, end, newprot);
171 else
172 change_protection(vma, start, end, newprot);
170 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 173 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
171 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 174 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
172 return 0; 175 return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
240 243
241 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 244 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
242 245
243 if (is_vm_hugetlb_page(vma)) {
244 error = -EACCES;
245 goto out;
246 }
247
248 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 246 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
249 247
250 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 248 /* newflags >> 4 shift VM_MAY% in place of VM_% */
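
The practical effect of the mprotect.c change is that mprotect(2) on a hugetlbfs mapping, which used to fail with EACCES, now reaches the new hugetlb_change_protection() path. A user-space sketch; the mount point and the 2 MB huge page size are assumptions:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define HPAGE (2UL * 1024 * 1024)   /* adjust to the system's huge page size */

    int main(void)
    {
            int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
            char *p;

            if (fd < 0)
                    return 1;
            p = mmap(NULL, HPAGE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;                                   /* fault the huge page in */
            if (mprotect(p, HPAGE, PROT_READ) != 0)     /* now handled by hugetlb_change_protection() */
                    perror("mprotect");
            munmap(p, HPAGE);
            close(fd);
            return 0;
    }
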
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f..db45efac17 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
159 /* 159 /*
160 * kmalloc doesn't like __GFP_HIGHMEM for some reason 160 * kmalloc doesn't like __GFP_HIGHMEM for some reason
161 */ 161 */
162 return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); 162 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
163} 163}
164 164
165struct page * vmalloc_to_page(void *addr) 165struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
623 * - note that this may not return a page-aligned address if the object 623 * - note that this may not return a page-aligned address if the object
624 * we're allocating is smaller than a page 624 * we're allocating is smaller than a page
625 */ 625 */
626 base = kmalloc(len, GFP_KERNEL); 626 base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
627 if (!base) 627 if (!base)
628 goto enomem; 628 goto enomem;
629 629
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d..b7f14a4799 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly;
55long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction; 56int percpu_pagelist_fraction;
57 57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
59static void __free_pages_ok(struct page *page, unsigned int order); 58static void __free_pages_ok(struct page *page, unsigned int order);
60 59
61/* 60/*
@@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
190 for (i = 0; i < nr_pages; i++) { 189 for (i = 0; i < nr_pages; i++) {
191 struct page *p = page + i; 190 struct page *p = page + i;
192 191
193 SetPageCompound(p); 192 __SetPageCompound(p);
194 set_page_private(p, (unsigned long)page); 193 set_page_private(p, (unsigned long)page);
195 } 194 }
196} 195}
@@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order)
209 if (unlikely(!PageCompound(p) | 208 if (unlikely(!PageCompound(p) |
210 (page_private(p) != (unsigned long)page))) 209 (page_private(p) != (unsigned long)page)))
211 bad_page(page); 210 bad_page(page);
212 ClearPageCompound(p); 211 __ClearPageCompound(p);
213 } 212 }
214} 213}
215 214
215static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
216{
217 int i;
218
219 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
220 /*
221 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
222 * and __GFP_HIGHMEM from hard or soft interrupt context.
223 */
224 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
225 for (i = 0; i < (1 << order); i++)
226 clear_highpage(page + i);
227}
228
216/* 229/*
217 * function for dealing with page's order in buddy system. 230 * function for dealing with page's order in buddy system.
218 * zone->lock is already acquired when we use these. 231 * zone->lock is already acquired when we use these.
@@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
423 mutex_debug_check_no_locks_freed(page_address(page), 436 mutex_debug_check_no_locks_freed(page_address(page),
424 PAGE_SIZE<<order); 437 PAGE_SIZE<<order);
425 438
426#ifndef CONFIG_MMU
427 for (i = 1 ; i < (1 << order) ; ++i)
428 __put_page(page + i);
429#endif
430
431 for (i = 0 ; i < (1 << order) ; ++i) 439 for (i = 0 ; i < (1 << order) ; ++i)
432 reserved += free_pages_check(page + i); 440 reserved += free_pages_check(page + i);
433 if (reserved) 441 if (reserved)
@@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
448 if (order == 0) { 456 if (order == 0) {
449 __ClearPageReserved(page); 457 __ClearPageReserved(page);
450 set_page_count(page, 0); 458 set_page_count(page, 0);
451 459 set_page_refcounted(page);
452 free_hot_cold_page(page, 0); 460 __free_page(page);
453 } else { 461 } else {
454 LIST_HEAD(list);
455 int loop; 462 int loop;
456 463
464 prefetchw(page);
457 for (loop = 0; loop < BITS_PER_LONG; loop++) { 465 for (loop = 0; loop < BITS_PER_LONG; loop++) {
458 struct page *p = &page[loop]; 466 struct page *p = &page[loop];
459 467
460 if (loop + 16 < BITS_PER_LONG) 468 if (loop + 1 < BITS_PER_LONG)
461 prefetchw(p + 16); 469 prefetchw(p + 1);
462 __ClearPageReserved(p); 470 __ClearPageReserved(p);
463 set_page_count(p, 0); 471 set_page_count(p, 0);
464 } 472 }
465 473
466 arch_free_page(page, order); 474 set_page_refcounted(page);
467 475 __free_pages(page, order);
468 mod_page_state(pgfree, 1 << order);
469
470 list_add(&page->lru, &list);
471 kernel_map_pages(page, 1 << order, 0);
472 free_pages_bulk(page_zone(page), 1, &list, order);
473 } 476 }
474} 477}
475 478
@@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page,
507/* 510/*
508 * This page is about to be returned from the page allocator 511 * This page is about to be returned from the page allocator
509 */ 512 */
510static int prep_new_page(struct page *page, int order) 513static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
511{ 514{
512 if (unlikely(page_mapcount(page) | 515 if (unlikely(page_mapcount(page) |
513 (page->mapping != NULL) | 516 (page->mapping != NULL) |
@@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order)
536 1 << PG_referenced | 1 << PG_arch_1 | 539 1 << PG_referenced | 1 << PG_arch_1 |
537 1 << PG_checked | 1 << PG_mappedtodisk); 540 1 << PG_checked | 1 << PG_mappedtodisk);
538 set_page_private(page, 0); 541 set_page_private(page, 0);
539 set_page_refs(page, order); 542 set_page_refcounted(page);
540 kernel_map_pages(page, 1 << order, 1); 543 kernel_map_pages(page, 1 << order, 1);
544
545 if (gfp_flags & __GFP_ZERO)
546 prep_zero_page(page, order, gfp_flags);
547
548 if (order && (gfp_flags & __GFP_COMP))
549 prep_compound_page(page, order);
550
541 return 0; 551 return 0;
542} 552}
543 553
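With zeroing and compound setup folded into prep_new_page(), a caller obtains both directly from the allocator. A minimal, illustrative caller (sketch only, not part of this diff):

        struct page *page;

        /* order-2 block, zeroed and prepared as a compound page */
        page = alloc_pages(GFP_KERNEL | __GFP_ZERO | __GFP_COMP, 2);
        if (page)
                __free_pages(page, 2);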
@@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
593/* 603/*
594 * Called from the slab reaper to drain pagesets on a particular node that 604 * Called from the slab reaper to drain pagesets on a particular node that
595 * belong to the currently executing processor. 605 * belong to the currently executing processor.
606 * Note that this function must be called with the thread pinned to
607 * a single processor.
596 */ 608 */
597void drain_node_pages(int nodeid) 609void drain_node_pages(int nodeid)
598{ 610{
599 int i, z; 611 int i, z;
600 unsigned long flags; 612 unsigned long flags;
601 613
602 local_irq_save(flags);
603 for (z = 0; z < MAX_NR_ZONES; z++) { 614 for (z = 0; z < MAX_NR_ZONES; z++) {
604 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 615 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
605 struct per_cpu_pageset *pset; 616 struct per_cpu_pageset *pset;
@@ -609,11 +620,14 @@ void drain_node_pages(int nodeid)
609 struct per_cpu_pages *pcp; 620 struct per_cpu_pages *pcp;
610 621
611 pcp = &pset->pcp[i]; 622 pcp = &pset->pcp[i];
612 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 623 if (pcp->count) {
613 pcp->count = 0; 624 local_irq_save(flags);
625 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
626 pcp->count = 0;
627 local_irq_restore(flags);
628 }
614 } 629 }
615 } 630 }
616 local_irq_restore(flags);
617} 631}
618#endif 632#endif
619 633
@@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page)
743 free_hot_cold_page(page, 1); 757 free_hot_cold_page(page, 1);
744} 758}
745 759
746static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 760/*
761 * split_page takes a non-compound higher-order page, and splits it into
762 * n (1<<order) sub-pages: page[0..n]
763 * Each sub-page must be freed individually.
764 *
765 * Note: this is probably too low level an operation for use in drivers.
766 * Please consult with lkml before using this in your driver.
767 */
768void split_page(struct page *page, unsigned int order)
747{ 769{
748 int i; 770 int i;
749 771
750 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 772 BUG_ON(PageCompound(page));
751 for(i = 0; i < (1 << order); i++) 773 BUG_ON(!page_count(page));
752 clear_highpage(page + i); 774 for (i = 1; i < (1 << order); i++)
775 set_page_refcounted(page + i);
753} 776}
754 777
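split_page() converts one non-compound order-n allocation into 1 << n independently refcounted order-0 pages, each of which must then be freed on its own. An illustrative use, subject to the lkml caveat in the comment above (sketch only, not part of this diff):

        struct page *page = alloc_pages(GFP_KERNEL, 2);
        int i;

        if (page) {
                split_page(page, 2);            /* pages 0..3 now stand alone */
                for (i = 0; i < 4; i++)
                        __free_page(page + i);
        }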
755/* 778/*
@@ -795,14 +818,8 @@ again:
795 put_cpu(); 818 put_cpu();
796 819
797 BUG_ON(bad_range(zone, page)); 820 BUG_ON(bad_range(zone, page));
798 if (prep_new_page(page, order)) 821 if (prep_new_page(page, order, gfp_flags))
799 goto again; 822 goto again;
800
801 if (gfp_flags & __GFP_ZERO)
802 prep_zero_page(page, order, gfp_flags);
803
804 if (order && (gfp_flags & __GFP_COMP))
805 prep_compound_page(page, order);
806 return page; 823 return page;
807 824
808failed: 825failed:
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1214 1231
1215static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1232static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1216{ 1233{
1217 int cpu = 0; 1234 unsigned cpu;
1218 1235
1219 memset(ret, 0, nr * sizeof(unsigned long)); 1236 memset(ret, 0, nr * sizeof(unsigned long));
1220 cpus_and(*cpumask, *cpumask, cpu_online_map); 1237 cpus_and(*cpumask, *cpumask, cpu_online_map);
1221 1238
1222 cpu = first_cpu(*cpumask); 1239 for_each_cpu_mask(cpu, *cpumask) {
1223 while (cpu < NR_CPUS) { 1240 unsigned long *in;
1224 unsigned long *in, *out, off; 1241 unsigned long *out;
1225 1242 unsigned off;
1226 if (!cpu_isset(cpu, *cpumask)) 1243 unsigned next_cpu;
1227 continue;
1228 1244
1229 in = (unsigned long *)&per_cpu(page_states, cpu); 1245 in = (unsigned long *)&per_cpu(page_states, cpu);
1230 1246
1231 cpu = next_cpu(cpu, *cpumask); 1247 next_cpu = next_cpu(cpu, *cpumask);
1232 1248 if (likely(next_cpu < NR_CPUS))
1233 if (likely(cpu < NR_CPUS)) 1249 prefetch(&per_cpu(page_states, next_cpu));
1234 prefetch(&per_cpu(page_states, cpu));
1235 1250
1236 out = (unsigned long *)ret; 1251 out = (unsigned long *)ret;
1237 for (off = 0; off < nr; off++) 1252 for (off = 0; off < nr; off++)
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1764 continue; 1779 continue;
1765 page = pfn_to_page(pfn); 1780 page = pfn_to_page(pfn);
1766 set_page_links(page, zone, nid, pfn); 1781 set_page_links(page, zone, nid, pfn);
1767 set_page_count(page, 1); 1782 init_page_count(page);
1768 reset_page_mapcount(page); 1783 reset_page_mapcount(page);
1769 SetPageReserved(page); 1784 SetPageReserved(page);
1770 INIT_LIST_HEAD(&page->lru); 1785 INIT_LIST_HEAD(&page->lru);
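A few hunks up, __get_page_state() drops the hand-rolled first_cpu()/next_cpu() walk in favour of for_each_cpu_mask(), prefetching the next CPU's page_states while the current one is summed. The same per-cpu accumulation pattern reduced to a sketch (nr_dirty is only an illustrative field choice):

        unsigned long total = 0;
        unsigned int cpu;
        cpumask_t mask = cpu_online_map;

        for_each_cpu_mask(cpu, mask)
                total += per_cpu(page_states, cpu).nr_dirty;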
diff --git a/mm/readahead.c b/mm/readahead.c
index 8d6eeaaa62..301b36c4a0 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; 52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
53} 53}
54 54
55static inline void reset_ahead_window(struct file_ra_state *ra)
56{
57 /*
58 * ... but preserve ahead_start + ahead_size value,
59 * see 'recheck:' label in page_cache_readahead().
60 * Note: We never use ->ahead_size as rvalue without
61 * checking ->ahead_start != 0 first.
62 */
63 ra->ahead_size += ra->ahead_start;
64 ra->ahead_start = 0;
65}
66
55static inline void ra_off(struct file_ra_state *ra) 67static inline void ra_off(struct file_ra_state *ra)
56{ 68{
57 ra->start = 0; 69 ra->start = 0;
58 ra->flags = 0; 70 ra->flags = 0;
59 ra->size = 0; 71 ra->size = 0;
60 ra->ahead_start = 0; 72 reset_ahead_window(ra);
61 ra->ahead_size = 0;
62 return; 73 return;
63} 74}
64 75
@@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
72{ 83{
73 unsigned long newsize = roundup_pow_of_two(size); 84 unsigned long newsize = roundup_pow_of_two(size);
74 85
75 if (newsize <= max / 64) 86 if (newsize <= max / 32)
76 newsize = newsize * newsize; 87 newsize = newsize * 4;
77 else if (newsize <= max / 4) 88 else if (newsize <= max / 4)
78 newsize = max / 4; 89 newsize = newsize * 2;
79 else 90 else
80 newsize = max; 91 newsize = max;
81 return newsize; 92 return newsize;
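The retuned heuristic grows very small requests by 4x and mid-sized ones by 2x, instead of squaring the former and jumping the latter straight to max / 4. Worked through with an assumed max of 128 pages:

        /*
         * request  4 pages -> roundup  4,  4 <= 128/32 -> window  16 pages
         * request 16 pages -> roundup 16, 16 <= 128/4  -> window  32 pages
         * request 64 pages -> roundup 64, else branch  -> window 128 pages
         */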
@@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
426 * congestion. The ahead window will any way be closed 437 * congestion. The ahead window will any way be closed
427 * in case we failed due to excessive page cache hits. 438 * in case we failed due to excessive page cache hits.
428 */ 439 */
429 ra->ahead_start = 0; 440 reset_ahead_window(ra);
430 ra->ahead_size = 0;
431 } 441 }
432 442
433 return ret; 443 return ret;
@@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
520 * If we get here we are doing sequential IO and this was not the first 530 * If we get here we are doing sequential IO and this was not the first
521 * occurence (ie we have an existing window) 531 * occurence (ie we have an existing window)
522 */ 532 */
523
524 if (ra->ahead_start == 0) { /* no ahead window yet */ 533 if (ra->ahead_start == 0) { /* no ahead window yet */
525 if (!make_ahead_window(mapping, filp, ra, 0)) 534 if (!make_ahead_window(mapping, filp, ra, 0))
526 goto out; 535 goto recheck;
527 } 536 }
537
528 /* 538 /*
529 * Already have an ahead window, check if we crossed into it. 539 * Already have an ahead window, check if we crossed into it.
530 * If so, shift windows and issue a new ahead window. 540 * If so, shift windows and issue a new ahead window.
@@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
536 ra->start = ra->ahead_start; 546 ra->start = ra->ahead_start;
537 ra->size = ra->ahead_size; 547 ra->size = ra->ahead_size;
538 make_ahead_window(mapping, filp, ra, 0); 548 make_ahead_window(mapping, filp, ra, 0);
549recheck:
550 /* prev_page shouldn't overrun the ahead window */
551 ra->prev_page = min(ra->prev_page,
552 ra->ahead_start + ra->ahead_size - 1);
539 } 553 }
540 554
541out: 555out:
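reset_ahead_window() keeps the sum ahead_start + ahead_size intact by folding the start into the size, so the clamp at the new recheck: label still has a meaningful upper bound even after the window has been torn down. A numeric sketch of that invariant (values are illustrative only):

        struct file_ra_state ra = { .ahead_start = 100, .ahead_size = 32 };

        /* window covers pages 100..131; clamp bound = 100 + 32 - 1 = 131 */
        reset_ahead_window(&ra);        /* ahead_start = 0, ahead_size = 132 */
        /* bound is still 0 + 132 - 1 = 131, so prev_page cannot overrun it */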
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b10..1963e26931 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58 58
59//#define RMAP_DEBUG /* can be enabled only for debugging */ 59struct kmem_cache *anon_vma_cachep;
60
61kmem_cache_t *anon_vma_cachep;
62 60
63static inline void validate_anon_vma(struct vm_area_struct *find_vma) 61static inline void validate_anon_vma(struct vm_area_struct *find_vma)
64{ 62{
65#ifdef RMAP_DEBUG 63#ifdef CONFIG_DEBUG_VM
66 struct anon_vma *anon_vma = find_vma->anon_vma; 64 struct anon_vma *anon_vma = find_vma->anon_vma;
67 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
68 unsigned int mapcount = 0; 66 unsigned int mapcount = 0;
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
166 anon_vma_free(anon_vma); 164 anon_vma_free(anon_vma);
167} 165}
168 166
169static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 167static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
168 unsigned long flags)
170{ 169{
171 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
172 SLAB_CTOR_CONSTRUCTOR) { 171 SLAB_CTOR_CONSTRUCTOR) {
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page)
550void page_remove_rmap(struct page *page) 549void page_remove_rmap(struct page *page)
551{ 550{
552 if (atomic_add_negative(-1, &page->_mapcount)) { 551 if (atomic_add_negative(-1, &page->_mapcount)) {
553 if (page_mapcount(page) < 0) { 552#ifdef CONFIG_DEBUG_VM
553 if (unlikely(page_mapcount(page) < 0)) {
554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
555 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 555 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
556 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 556 printk (KERN_EMERG " page->count = %x\n", page_count(page));
557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
558 } 558 }
559 559#endif
560 BUG_ON(page_mapcount(page) < 0); 560 BUG_ON(page_mapcount(page) < 0);
561 /* 561 /*
562 * It would be tidy to reset the PageAnon mapping here, 562 * It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff..37eaf42ed2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
875} 875}
876 876
877#ifdef CONFIG_NUMA 877#ifdef CONFIG_NUMA
878static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 878static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
879{ 879{
880 char *nodelist = strchr(value, ':'); 880 char *nodelist = strchr(value, ':');
881 int err = 1; 881 int err = 1;
@@ -2119,7 +2119,7 @@ failed:
2119 return err; 2119 return err;
2120} 2120}
2121 2121
2122static kmem_cache_t *shmem_inode_cachep; 2122static struct kmem_cache *shmem_inode_cachep;
2123 2123
2124static struct inode *shmem_alloc_inode(struct super_block *sb) 2124static struct inode *shmem_alloc_inode(struct super_block *sb)
2125{ 2125{
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2140} 2140}
2141 2141
2142static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) 2142static void init_once(void *foo, struct kmem_cache *cachep,
2143 unsigned long flags)
2143{ 2144{
2144 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2145 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2145 2146
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab..1c8f5ee230 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
@@ -170,12 +170,12 @@
170#if DEBUG 170#if DEBUG
171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
173 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 173 SLAB_CACHE_DMA | \
174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176 SLAB_DESTROY_BY_RCU) 176 SLAB_DESTROY_BY_RCU)
177#else 177#else
178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU) 181 SLAB_DESTROY_BY_RCU)
@@ -266,16 +266,17 @@ struct array_cache {
266 unsigned int batchcount; 266 unsigned int batchcount;
267 unsigned int touched; 267 unsigned int touched;
268 spinlock_t lock; 268 spinlock_t lock;
269 void *entry[0]; /* 269 void *entry[0]; /*
270 * Must have this definition in here for the proper 270 * Must have this definition in here for the proper
271 * alignment of array_cache. Also simplifies accessing 271 * alignment of array_cache. Also simplifies accessing
272 * the entries. 272 * the entries.
273 * [0] is for gcc 2.95. It should really be []. 273 * [0] is for gcc 2.95. It should really be [].
274 */ 274 */
275}; 275};
276 276
277/* bootstrap: The caches do not work without cpuarrays anymore, 277/*
278 * but the cpuarrays are allocated from the generic caches... 278 * bootstrap: The caches do not work without cpuarrays anymore, but the
279 * cpuarrays are allocated from the generic caches...
279 */ 280 */
280#define BOOT_CPUCACHE_ENTRIES 1 281#define BOOT_CPUCACHE_ENTRIES 1
281struct arraycache_init { 282struct arraycache_init {
@@ -291,13 +292,13 @@ struct kmem_list3 {
291 struct list_head slabs_full; 292 struct list_head slabs_full;
292 struct list_head slabs_free; 293 struct list_head slabs_free;
293 unsigned long free_objects; 294 unsigned long free_objects;
294 unsigned long next_reap;
295 int free_touched;
296 unsigned int free_limit; 295 unsigned int free_limit;
297 unsigned int colour_next; /* Per-node cache coloring */ 296 unsigned int colour_next; /* Per-node cache coloring */
298 spinlock_t list_lock; 297 spinlock_t list_lock;
299 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
300 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
300 unsigned long next_reap; /* updated without locking */
301 int free_touched; /* updated without locking */
301}; 302};
302 303
303/* 304/*
@@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
310#define SIZE_L3 (1 + MAX_NUMNODES) 311#define SIZE_L3 (1 + MAX_NUMNODES)
311 312
312/* 313/*
313 * This function must be completely optimized away if 314 * This function must be completely optimized away if a constant is passed to
314 * a constant is passed to it. Mostly the same as 315 * it. Mostly the same as what is in linux/slab.h except it returns an index.
315 * what is in linux/slab.h except it returns an
316 * index.
317 */ 316 */
318static __always_inline int index_of(const size_t size) 317static __always_inline int index_of(const size_t size)
319{ 318{
@@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
351 parent->free_touched = 0; 350 parent->free_touched = 0;
352} 351}
353 352
354#define MAKE_LIST(cachep, listp, slab, nodeid) \ 353#define MAKE_LIST(cachep, listp, slab, nodeid) \
355 do { \ 354 do { \
356 INIT_LIST_HEAD(listp); \ 355 INIT_LIST_HEAD(listp); \
357 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 356 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
358 } while (0) 357 } while (0)
359 358
360#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 359#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
361 do { \ 360 do { \
362 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 361 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
363 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 362 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
364 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 363 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
@@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent)
373struct kmem_cache { 372struct kmem_cache {
374/* 1) per-cpu data, touched during every alloc/free */ 373/* 1) per-cpu data, touched during every alloc/free */
375 struct array_cache *array[NR_CPUS]; 374 struct array_cache *array[NR_CPUS];
375/* 2) Cache tunables. Protected by cache_chain_mutex */
376 unsigned int batchcount; 376 unsigned int batchcount;
377 unsigned int limit; 377 unsigned int limit;
378 unsigned int shared; 378 unsigned int shared;
379
379 unsigned int buffer_size; 380 unsigned int buffer_size;
380/* 2) touched by every alloc & free from the backend */ 381/* 3) touched by every alloc & free from the backend */
381 struct kmem_list3 *nodelists[MAX_NUMNODES]; 382 struct kmem_list3 *nodelists[MAX_NUMNODES];
382 unsigned int flags; /* constant flags */
383 unsigned int num; /* # of objs per slab */
384 spinlock_t spinlock;
385 383
386/* 3) cache_grow/shrink */ 384 unsigned int flags; /* constant flags */
385 unsigned int num; /* # of objs per slab */
386
387/* 4) cache_grow/shrink */
387 /* order of pgs per slab (2^n) */ 388 /* order of pgs per slab (2^n) */
388 unsigned int gfporder; 389 unsigned int gfporder;
389 390
390 /* force GFP flags, e.g. GFP_DMA */ 391 /* force GFP flags, e.g. GFP_DMA */
391 gfp_t gfpflags; 392 gfp_t gfpflags;
392 393
393 size_t colour; /* cache colouring range */ 394 size_t colour; /* cache colouring range */
394 unsigned int colour_off; /* colour offset */ 395 unsigned int colour_off; /* colour offset */
395 struct kmem_cache *slabp_cache; 396 struct kmem_cache *slabp_cache;
396 unsigned int slab_size; 397 unsigned int slab_size;
397 unsigned int dflags; /* dynamic flags */ 398 unsigned int dflags; /* dynamic flags */
398 399
399 /* constructor func */ 400 /* constructor func */
400 void (*ctor) (void *, struct kmem_cache *, unsigned long); 401 void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -402,11 +403,11 @@ struct kmem_cache {
402 /* de-constructor func */ 403 /* de-constructor func */
403 void (*dtor) (void *, struct kmem_cache *, unsigned long); 404 void (*dtor) (void *, struct kmem_cache *, unsigned long);
404 405
405/* 4) cache creation/removal */ 406/* 5) cache creation/removal */
406 const char *name; 407 const char *name;
407 struct list_head next; 408 struct list_head next;
408 409
409/* 5) statistics */ 410/* 6) statistics */
410#if STATS 411#if STATS
411 unsigned long num_active; 412 unsigned long num_active;
412 unsigned long num_allocations; 413 unsigned long num_allocations;
@@ -438,8 +439,9 @@ struct kmem_cache {
438#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 439#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
439 440
440#define BATCHREFILL_LIMIT 16 441#define BATCHREFILL_LIMIT 16
441/* Optimization question: fewer reaps means less 442/*
442 * probability for unnessary cpucache drain/refill cycles. 443 * Optimization question: fewer reaps means less probability for unnessary
444 * cpucache drain/refill cycles.
443 * 445 *
444 * OTOH the cpuarrays can contain lots of objects, 446 * OTOH the cpuarrays can contain lots of objects,
445 * which could lock up otherwise freeable slabs. 447 * which could lock up otherwise freeable slabs.
@@ -453,17 +455,19 @@ struct kmem_cache {
453#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 455#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
454#define STATS_INC_GROWN(x) ((x)->grown++) 456#define STATS_INC_GROWN(x) ((x)->grown++)
455#define STATS_INC_REAPED(x) ((x)->reaped++) 457#define STATS_INC_REAPED(x) ((x)->reaped++)
456#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 458#define STATS_SET_HIGH(x) \
457 (x)->high_mark = (x)->num_active; \ 459 do { \
458 } while (0) 460 if ((x)->num_active > (x)->high_mark) \
461 (x)->high_mark = (x)->num_active; \
462 } while (0)
459#define STATS_INC_ERR(x) ((x)->errors++) 463#define STATS_INC_ERR(x) ((x)->errors++)
460#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 464#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
461#define STATS_INC_NODEFREES(x) ((x)->node_frees++) 465#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
462#define STATS_SET_FREEABLE(x, i) \ 466#define STATS_SET_FREEABLE(x, i) \
463 do { if ((x)->max_freeable < i) \ 467 do { \
464 (x)->max_freeable = i; \ 468 if ((x)->max_freeable < i) \
465 } while (0) 469 (x)->max_freeable = i; \
466 470 } while (0)
467#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 471#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
468#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 472#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
469#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 473#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
@@ -478,9 +482,7 @@ struct kmem_cache {
478#define STATS_INC_ERR(x) do { } while (0) 482#define STATS_INC_ERR(x) do { } while (0)
479#define STATS_INC_NODEALLOCS(x) do { } while (0) 483#define STATS_INC_NODEALLOCS(x) do { } while (0)
480#define STATS_INC_NODEFREES(x) do { } while (0) 484#define STATS_INC_NODEFREES(x) do { } while (0)
481#define STATS_SET_FREEABLE(x, i) \ 485#define STATS_SET_FREEABLE(x, i) do { } while (0)
482 do { } while (0)
483
484#define STATS_INC_ALLOCHIT(x) do { } while (0) 486#define STATS_INC_ALLOCHIT(x) do { } while (0)
485#define STATS_INC_ALLOCMISS(x) do { } while (0) 487#define STATS_INC_ALLOCMISS(x) do { } while (0)
486#define STATS_INC_FREEHIT(x) do { } while (0) 488#define STATS_INC_FREEHIT(x) do { } while (0)
@@ -488,7 +490,8 @@ struct kmem_cache {
488#endif 490#endif
489 491
490#if DEBUG 492#if DEBUG
491/* Magic nums for obj red zoning. 493/*
494 * Magic nums for obj red zoning.
492 * Placed in the first word before and the first word after an obj. 495 * Placed in the first word before and the first word after an obj.
493 */ 496 */
494#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 497#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
@@ -499,7 +502,8 @@ struct kmem_cache {
499#define POISON_FREE 0x6b /* for use-after-free poisoning */ 502#define POISON_FREE 0x6b /* for use-after-free poisoning */
500#define POISON_END 0xa5 /* end-byte of poisoning */ 503#define POISON_END 0xa5 /* end-byte of poisoning */
501 504
502/* memory layout of objects: 505/*
506 * memory layout of objects:
503 * 0 : objp 507 * 0 : objp
504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
505 * the end of an object is aligned with the end of the real 509 * the end of an object is aligned with the end of the real
@@ -508,7 +512,8 @@ struct kmem_cache {
508 * redzone word. 512 * redzone word.
509 * cachep->obj_offset: The real object. 513 * cachep->obj_offset: The real object.
510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
516 * [BYTES_PER_WORD long]
512 */ 517 */
513static int obj_offset(struct kmem_cache *cachep) 518static int obj_offset(struct kmem_cache *cachep)
514{ 519{
@@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
552#endif 557#endif
553 558
554/* 559/*
555 * Maximum size of an obj (in 2^order pages) 560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
556 * and absolute limit for the gfp order. 561 * order.
557 */ 562 */
558#if defined(CONFIG_LARGE_ALLOCS) 563#if defined(CONFIG_LARGE_ALLOCS)
559#define MAX_OBJ_ORDER 13 /* up to 32Mb */ 564#define MAX_OBJ_ORDER 13 /* up to 32Mb */
@@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
573#define BREAK_GFP_ORDER_LO 0 578#define BREAK_GFP_ORDER_LO 0
574static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 579static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
575 580
576/* Functions for storing/retrieving the cachep and or slab from the 581/*
577 * global 'mem_map'. These are used to find the slab an obj belongs to. 582 * Functions for storing/retrieving the cachep and or slab from the page
578 * With kfree(), these are used to find the cache which an obj belongs to. 583 * allocator. These are used to find the slab an obj belongs to. With kfree(),
584 * these are used to find the cache which an obj belongs to.
579 */ 585 */
580static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 586static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
581{ 587{
@@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
584 590
585static inline struct kmem_cache *page_get_cache(struct page *page) 591static inline struct kmem_cache *page_get_cache(struct page *page)
586{ 592{
593 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page);
587 return (struct kmem_cache *)page->lru.next; 595 return (struct kmem_cache *)page->lru.next;
588} 596}
589 597
@@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
594 602
595static inline struct slab *page_get_slab(struct page *page) 603static inline struct slab *page_get_slab(struct page *page)
596{ 604{
605 if (unlikely(PageCompound(page)))
606 page = (struct page *)page_private(page);
597 return (struct slab *)page->lru.prev; 607 return (struct slab *)page->lru.prev;
598} 608}
599 609
@@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj)
609 return page_get_slab(page); 619 return page_get_slab(page);
610} 620}
611 621
612/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 622static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
623 unsigned int idx)
624{
625 return slab->s_mem + cache->buffer_size * idx;
626}
627
628static inline unsigned int obj_to_index(struct kmem_cache *cache,
629 struct slab *slab, void *obj)
630{
631 return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
632}
633
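index_to_obj() and obj_to_index() replace the open-coded pointer arithmetic repeated in the debug paths further down (check_poison_obj(), slab_destroy_objs()). Their round-trip property, written out with assumed values:

        /* with slabp->s_mem as the base and cachep->buffer_size == 256: */
        void *objp = index_to_obj(cachep, slabp, 3);          /* s_mem + 3 * 256 */
        unsigned int idx = obj_to_index(cachep, slabp, objp); /* back to 3 */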
634/*
635 * These are the default caches for kmalloc. Custom caches can have other sizes.
636 */
613struct cache_sizes malloc_sizes[] = { 637struct cache_sizes malloc_sizes[] = {
614#define CACHE(x) { .cs_size = (x) }, 638#define CACHE(x) { .cs_size = (x) },
615#include <linux/kmalloc_sizes.h> 639#include <linux/kmalloc_sizes.h>
@@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = {
642 .limit = BOOT_CPUCACHE_ENTRIES, 666 .limit = BOOT_CPUCACHE_ENTRIES,
643 .shared = 1, 667 .shared = 1,
644 .buffer_size = sizeof(struct kmem_cache), 668 .buffer_size = sizeof(struct kmem_cache),
645 .flags = SLAB_NO_REAP,
646 .spinlock = SPIN_LOCK_UNLOCKED,
647 .name = "kmem_cache", 669 .name = "kmem_cache",
648#if DEBUG 670#if DEBUG
649 .obj_size = sizeof(struct kmem_cache), 671 .obj_size = sizeof(struct kmem_cache),
@@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
655static struct list_head cache_chain; 677static struct list_head cache_chain;
656 678
657/* 679/*
658 * vm_enough_memory() looks at this to determine how many 680 * vm_enough_memory() looks at this to determine how many slab-allocated pages
659 * slab-allocated pages are possibly freeable under pressure 681 * are possibly freeable under pressure
660 * 682 *
661 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 683 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
662 */ 684 */
@@ -675,7 +697,8 @@ static enum {
675 697
676static DEFINE_PER_CPU(struct work_struct, reap_work); 698static DEFINE_PER_CPU(struct work_struct, reap_work);
677 699
678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); 700static void free_block(struct kmem_cache *cachep, void **objpp, int len,
701 int node);
679static void enable_cpucache(struct kmem_cache *cachep); 702static void enable_cpucache(struct kmem_cache *cachep);
680static void cache_reap(void *unused); 703static void cache_reap(void *unused);
681static int __node_shrink(struct kmem_cache *cachep, int node); 704static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
685 return cachep->array[smp_processor_id()]; 708 return cachep->array[smp_processor_id()];
686} 709}
687 710
688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) 711static inline struct kmem_cache *__find_general_cachep(size_t size,
712 gfp_t gfpflags)
689{ 713{
690 struct cache_sizes *csizep = malloc_sizes; 714 struct cache_sizes *csizep = malloc_sizes;
691 715
@@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
720 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 744 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
721} 745}
722 746
723/* Calculate the number of objects and left-over bytes for a given 747/*
724 buffer size. */ 748 * Calculate the number of objects and left-over bytes for a given buffer size.
749 */
725static void cache_estimate(unsigned long gfporder, size_t buffer_size, 750static void cache_estimate(unsigned long gfporder, size_t buffer_size,
726 size_t align, int flags, size_t *left_over, 751 size_t align, int flags, size_t *left_over,
727 unsigned int *num) 752 unsigned int *num)
@@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
782 807
783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 808#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
784 809
785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) 810static void __slab_error(const char *function, struct kmem_cache *cachep,
811 char *msg)
786{ 812{
787 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 813 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
788 function, cachep->name, msg); 814 function, cachep->name, msg);
@@ -804,7 +830,7 @@ static void init_reap_node(int cpu)
804 830
805 node = next_node(cpu_to_node(cpu), node_online_map); 831 node = next_node(cpu_to_node(cpu), node_online_map);
806 if (node == MAX_NUMNODES) 832 if (node == MAX_NUMNODES)
807 node = 0; 833 node = first_node(node_online_map);
808 834
809 __get_cpu_var(reap_node) = node; 835 __get_cpu_var(reap_node) = node;
810} 836}
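Falling back to first_node(node_online_map) rather than a hard-coded node 0 matters on machines where node 0 is offline. An illustrative walk of the fixed wrap-around, assuming only nodes 1 and 2 are online:

        int node = next_node(2, node_online_map);       /* nothing after 2: MAX_NUMNODES */

        if (node == MAX_NUMNODES)
                node = first_node(node_online_map);     /* node 1, not the offline node 0 */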
@@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
906 932
907 if (!ac_ptr) 933 if (!ac_ptr)
908 return; 934 return;
909
910 for_each_node(i) 935 for_each_node(i)
911 kfree(ac_ptr[i]); 936 kfree(ac_ptr[i]);
912
913 kfree(ac_ptr); 937 kfree(ac_ptr);
914} 938}
915 939
@@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
943 } 967 }
944} 968}
945 969
946static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) 970static void drain_alien_cache(struct kmem_cache *cachep,
971 struct array_cache **alien)
947{ 972{
948 int i = 0; 973 int i = 0;
949 struct array_cache *ac; 974 struct array_cache *ac;
@@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
986 switch (action) { 1011 switch (action) {
987 case CPU_UP_PREPARE: 1012 case CPU_UP_PREPARE:
988 mutex_lock(&cache_chain_mutex); 1013 mutex_lock(&cache_chain_mutex);
989 /* we need to do this right in the beginning since 1014 /*
1015 * We need to do this right in the beginning since
990 * alloc_arraycache's are going to use this list. 1016 * alloc_arraycache's are going to use this list.
991 * kmalloc_node allows us to add the slab to the right 1017 * kmalloc_node allows us to add the slab to the right
992 * kmem_list3 and not this cpu's kmem_list3 1018 * kmem_list3 and not this cpu's kmem_list3
993 */ 1019 */
994 1020
995 list_for_each_entry(cachep, &cache_chain, next) { 1021 list_for_each_entry(cachep, &cache_chain, next) {
996 /* setup the size64 kmemlist for cpu before we can 1022 /*
1023 * Set up the size64 kmemlist for cpu before we can
997 * begin anything. Make sure some other cpu on this 1024 * begin anything. Make sure some other cpu on this
998 * node has not already allocated this 1025 * node has not already allocated this
999 */ 1026 */
1000 if (!cachep->nodelists[node]) { 1027 if (!cachep->nodelists[node]) {
1001 if (!(l3 = kmalloc_node(memsize, 1028 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1002 GFP_KERNEL, node))) 1029 if (!l3)
1003 goto bad; 1030 goto bad;
1004 kmem_list3_init(l3); 1031 kmem_list3_init(l3);
1005 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1032 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1015 1042
1016 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1043 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1017 cachep->nodelists[node]->free_limit = 1044 cachep->nodelists[node]->free_limit =
1018 (1 + nr_cpus_node(node)) * 1045 (1 + nr_cpus_node(node)) *
1019 cachep->batchcount + cachep->num; 1046 cachep->batchcount + cachep->num;
1020 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1047 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1021 } 1048 }
1022 1049
1023 /* Now we can go ahead with allocating the shared array's 1050 /*
1024 & array cache's */ 1051 * Now we can go ahead with allocating the shared arrays and
1052 * array caches
1053 */
1025 list_for_each_entry(cachep, &cache_chain, next) { 1054 list_for_each_entry(cachep, &cache_chain, next) {
1026 struct array_cache *nc; 1055 struct array_cache *nc;
1027 struct array_cache *shared; 1056 struct array_cache *shared;
@@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1041 if (!alien) 1070 if (!alien)
1042 goto bad; 1071 goto bad;
1043 cachep->array[cpu] = nc; 1072 cachep->array[cpu] = nc;
1044
1045 l3 = cachep->nodelists[node]; 1073 l3 = cachep->nodelists[node];
1046 BUG_ON(!l3); 1074 BUG_ON(!l3);
1047 1075
@@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1061 } 1089 }
1062#endif 1090#endif
1063 spin_unlock_irq(&l3->list_lock); 1091 spin_unlock_irq(&l3->list_lock);
1064
1065 kfree(shared); 1092 kfree(shared);
1066 free_alien_cache(alien); 1093 free_alien_cache(alien);
1067 } 1094 }
@@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1083 /* fall thru */ 1110 /* fall thru */
1084 case CPU_UP_CANCELED: 1111 case CPU_UP_CANCELED:
1085 mutex_lock(&cache_chain_mutex); 1112 mutex_lock(&cache_chain_mutex);
1086
1087 list_for_each_entry(cachep, &cache_chain, next) { 1113 list_for_each_entry(cachep, &cache_chain, next) {
1088 struct array_cache *nc; 1114 struct array_cache *nc;
1089 struct array_cache *shared; 1115 struct array_cache *shared;
@@ -1150,7 +1176,7 @@ free_array_cache:
1150#endif 1176#endif
1151 } 1177 }
1152 return NOTIFY_OK; 1178 return NOTIFY_OK;
1153 bad: 1179bad:
1154 mutex_unlock(&cache_chain_mutex); 1180 mutex_unlock(&cache_chain_mutex);
1155 return NOTIFY_BAD; 1181 return NOTIFY_BAD;
1156} 1182}
@@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1160/* 1186/*
1161 * swap the static kmem_list3 with kmalloced memory 1187 * swap the static kmem_list3 with kmalloced memory
1162 */ 1188 */
1163static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) 1189static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1190 int nodeid)
1164{ 1191{
1165 struct kmem_list3 *ptr; 1192 struct kmem_list3 *ptr;
1166 1193
@@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
1175 local_irq_enable(); 1202 local_irq_enable();
1176} 1203}
1177 1204
1178/* Initialisation. 1205/*
1179 * Called after the gfp() functions have been enabled, and before smp_init(). 1206 * Initialisation. Called after the page allocator have been initialised and
1207 * before smp_init().
1180 */ 1208 */
1181void __init kmem_cache_init(void) 1209void __init kmem_cache_init(void)
1182{ 1210{
@@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void)
1201 1229
1202 /* Bootstrap is tricky, because several objects are allocated 1230 /* Bootstrap is tricky, because several objects are allocated
1203 * from caches that do not exist yet: 1231 * from caches that do not exist yet:
1204 * 1) initialize the cache_cache cache: it contains the struct kmem_cache 1232 * 1) initialize the cache_cache cache: it contains the struct
1205 * structures of all caches, except cache_cache itself: cache_cache 1233 * kmem_cache structures of all caches, except cache_cache itself:
1206 * is statically allocated. 1234 * cache_cache is statically allocated.
1207 * Initially an __init data area is used for the head array and the 1235 * Initially an __init data area is used for the head array and the
1208 * kmem_list3 structures, it's replaced with a kmalloc allocated 1236 * kmem_list3 structures, it's replaced with a kmalloc allocated
1209 * array at the end of the bootstrap. 1237 * array at the end of the bootstrap.
@@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void)
1226 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1254 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1227 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1255 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1228 1256
1229 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); 1257 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1258 cache_line_size());
1230 1259
1231 for (order = 0; order < MAX_ORDER; order++) { 1260 for (order = 0; order < MAX_ORDER; order++) {
1232 cache_estimate(order, cache_cache.buffer_size, 1261 cache_estimate(order, cache_cache.buffer_size,
@@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void)
1245 sizes = malloc_sizes; 1274 sizes = malloc_sizes;
1246 names = cache_names; 1275 names = cache_names;
1247 1276
1248 /* Initialize the caches that provide memory for the array cache 1277 /*
1249 * and the kmem_list3 structures first. 1278 * Initialize the caches that provide memory for the array cache and the
1250 * Without this, further allocations will bug 1279 * kmem_list3 structures first. Without this, further allocations will
1280 * bug.
1251 */ 1281 */
1252 1282
1253 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1283 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1254 sizes[INDEX_AC].cs_size, 1284 sizes[INDEX_AC].cs_size,
1255 ARCH_KMALLOC_MINALIGN, 1285 ARCH_KMALLOC_MINALIGN,
1256 (ARCH_KMALLOC_FLAGS | 1286 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1257 SLAB_PANIC), NULL, NULL); 1287 NULL, NULL);
1258 1288
1259 if (INDEX_AC != INDEX_L3) 1289 if (INDEX_AC != INDEX_L3) {
1260 sizes[INDEX_L3].cs_cachep = 1290 sizes[INDEX_L3].cs_cachep =
1261 kmem_cache_create(names[INDEX_L3].name, 1291 kmem_cache_create(names[INDEX_L3].name,
1262 sizes[INDEX_L3].cs_size, 1292 sizes[INDEX_L3].cs_size,
1263 ARCH_KMALLOC_MINALIGN, 1293 ARCH_KMALLOC_MINALIGN,
1264 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, 1294 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1265 NULL); 1295 NULL, NULL);
1296 }
1266 1297
1267 while (sizes->cs_size != ULONG_MAX) { 1298 while (sizes->cs_size != ULONG_MAX) {
1268 /* 1299 /*
@@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void)
1272 * Note for systems short on memory removing the alignment will 1303 * Note for systems short on memory removing the alignment will
1273 * allow tighter packing of the smaller caches. 1304 * allow tighter packing of the smaller caches.
1274 */ 1305 */
1275 if (!sizes->cs_cachep) 1306 if (!sizes->cs_cachep) {
1276 sizes->cs_cachep = kmem_cache_create(names->name, 1307 sizes->cs_cachep = kmem_cache_create(names->name,
1277 sizes->cs_size, 1308 sizes->cs_size,
1278 ARCH_KMALLOC_MINALIGN, 1309 ARCH_KMALLOC_MINALIGN,
1279 (ARCH_KMALLOC_FLAGS 1310 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1280 | SLAB_PANIC), 1311 NULL, NULL);
1281 NULL, NULL); 1312 }
1282 1313
1283 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1314 /* Inc off-slab bufctl limit until the ceiling is hit. */
1284 if (!(OFF_SLAB(sizes->cs_cachep))) { 1315 if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void)
1287 } 1318 }
1288 1319
1289 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1320 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1290 sizes->cs_size, 1321 sizes->cs_size,
1291 ARCH_KMALLOC_MINALIGN, 1322 ARCH_KMALLOC_MINALIGN,
1292 (ARCH_KMALLOC_FLAGS | 1323 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1293 SLAB_CACHE_DMA | 1324 SLAB_PANIC,
1294 SLAB_PANIC), NULL, 1325 NULL, NULL);
1295 NULL);
1296
1297 sizes++; 1326 sizes++;
1298 names++; 1327 names++;
1299 } 1328 }
@@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void)
1345 struct kmem_cache *cachep; 1374 struct kmem_cache *cachep;
1346 mutex_lock(&cache_chain_mutex); 1375 mutex_lock(&cache_chain_mutex);
1347 list_for_each_entry(cachep, &cache_chain, next) 1376 list_for_each_entry(cachep, &cache_chain, next)
1348 enable_cpucache(cachep); 1377 enable_cpucache(cachep);
1349 mutex_unlock(&cache_chain_mutex); 1378 mutex_unlock(&cache_chain_mutex);
1350 } 1379 }
1351 1380
1352 /* Done! */ 1381 /* Done! */
1353 g_cpucache_up = FULL; 1382 g_cpucache_up = FULL;
1354 1383
1355 /* Register a cpu startup notifier callback 1384 /*
1356 * that initializes cpu_cache_get for all new cpus 1385 * Register a cpu startup notifier callback that initializes
1386 * cpu_cache_get for all new cpus
1357 */ 1387 */
1358 register_cpu_notifier(&cpucache_notifier); 1388 register_cpu_notifier(&cpucache_notifier);
1359 1389
1360 /* The reap timers are started later, with a module init call: 1390 /*
1361 * That part of the kernel is not yet operational. 1391 * The reap timers are started later, with a module init call: That part
1392 * of the kernel is not yet operational.
1362 */ 1393 */
1363} 1394}
1364 1395
@@ -1366,16 +1397,13 @@ static int __init cpucache_init(void)
1366{ 1397{
1367 int cpu; 1398 int cpu;
1368 1399
1369 /* 1400 /*
1370 * Register the timers that return unneeded 1401 * Register the timers that return unneeded pages to the page allocator
1371 * pages to gfp.
1372 */ 1402 */
1373 for_each_online_cpu(cpu) 1403 for_each_online_cpu(cpu)
1374 start_cpu_timer(cpu); 1404 start_cpu_timer(cpu);
1375
1376 return 0; 1405 return 0;
1377} 1406}
1378
1379__initcall(cpucache_init); 1407__initcall(cpucache_init);
1380 1408
1381/* 1409/*
@@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1402 atomic_add(i, &slab_reclaim_pages); 1430 atomic_add(i, &slab_reclaim_pages);
1403 add_page_state(nr_slab, i); 1431 add_page_state(nr_slab, i);
1404 while (i--) { 1432 while (i--) {
1405 SetPageSlab(page); 1433 __SetPageSlab(page);
1406 page++; 1434 page++;
1407 } 1435 }
1408 return addr; 1436 return addr;
@@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1418 const unsigned long nr_freed = i; 1446 const unsigned long nr_freed = i;
1419 1447
1420 while (i--) { 1448 while (i--) {
1421 if (!TestClearPageSlab(page)) 1449 BUG_ON(!PageSlab(page));
1422 BUG(); 1450 __ClearPageSlab(page);
1423 page++; 1451 page++;
1424 } 1452 }
1425 sub_page_state(nr_slab, nr_freed); 1453 sub_page_state(nr_slab, nr_freed);
@@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit)
1489{ 1517{
1490 int i; 1518 int i;
1491 printk(KERN_ERR "%03x:", offset); 1519 printk(KERN_ERR "%03x:", offset);
1492 for (i = 0; i < limit; i++) { 1520 for (i = 0; i < limit; i++)
1493 printk(" %02x", (unsigned char)data[offset + i]); 1521 printk(" %02x", (unsigned char)data[offset + i]);
1494 }
1495 printk("\n"); 1522 printk("\n");
1496} 1523}
1497#endif 1524#endif
@@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1505 1532
1506 if (cachep->flags & SLAB_RED_ZONE) { 1533 if (cachep->flags & SLAB_RED_ZONE) {
1507 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1534 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1508 *dbg_redzone1(cachep, objp), 1535 *dbg_redzone1(cachep, objp),
1509 *dbg_redzone2(cachep, objp)); 1536 *dbg_redzone2(cachep, objp));
1510 } 1537 }
1511 1538
1512 if (cachep->flags & SLAB_STORE_USER) { 1539 if (cachep->flags & SLAB_STORE_USER) {
1513 printk(KERN_ERR "Last user: [<%p>]", 1540 printk(KERN_ERR "Last user: [<%p>]",
1514 *dbg_userword(cachep, objp)); 1541 *dbg_userword(cachep, objp));
1515 print_symbol("(%s)", 1542 print_symbol("(%s)",
1516 (unsigned long)*dbg_userword(cachep, objp)); 1543 (unsigned long)*dbg_userword(cachep, objp));
1517 printk("\n"); 1544 printk("\n");
1518 } 1545 }
1519 realobj = (char *)objp + obj_offset(cachep); 1546 realobj = (char *)objp + obj_offset(cachep);
@@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1546 /* Print header */ 1573 /* Print header */
1547 if (lines == 0) { 1574 if (lines == 0) {
1548 printk(KERN_ERR 1575 printk(KERN_ERR
1549 "Slab corruption: start=%p, len=%d\n", 1576 "Slab corruption: start=%p, len=%d\n",
1550 realobj, size); 1577 realobj, size);
1551 print_objinfo(cachep, objp, 0); 1578 print_objinfo(cachep, objp, 0);
1552 } 1579 }
1553 /* Hexdump the affected line */ 1580 /* Hexdump the affected line */
@@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1568 * exist: 1595 * exist:
1569 */ 1596 */
1570 struct slab *slabp = virt_to_slab(objp); 1597 struct slab *slabp = virt_to_slab(objp);
1571 int objnr; 1598 unsigned int objnr;
1572 1599
1573 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 1600 objnr = obj_to_index(cachep, slabp, objp);
1574 if (objnr) { 1601 if (objnr) {
1575 objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; 1602 objp = index_to_obj(cachep, slabp, objnr - 1);
1576 realobj = (char *)objp + obj_offset(cachep); 1603 realobj = (char *)objp + obj_offset(cachep);
1577 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1604 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1578 realobj, size); 1605 realobj, size);
1579 print_objinfo(cachep, objp, 2); 1606 print_objinfo(cachep, objp, 2);
1580 } 1607 }
1581 if (objnr + 1 < cachep->num) { 1608 if (objnr + 1 < cachep->num) {
1582 objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; 1609 objp = index_to_obj(cachep, slabp, objnr + 1);
1583 realobj = (char *)objp + obj_offset(cachep); 1610 realobj = (char *)objp + obj_offset(cachep);
1584 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1611 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1585 realobj, size); 1612 realobj, size);
@@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1591 1618
1592#if DEBUG 1619#if DEBUG
1593/** 1620/**
1594 * slab_destroy_objs - call the registered destructor for each object in 1621 * slab_destroy_objs - destroy a slab and its objects
1595 * a slab that is to be destroyed. 1622 * @cachep: cache pointer being destroyed
1623 * @slabp: slab pointer being destroyed
1624 *
1625 * Call the registered destructor for each object in a slab that is being
1626 * destroyed.
1596 */ 1627 */
1597static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1628static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1598{ 1629{
1599 int i; 1630 int i;
1600 for (i = 0; i < cachep->num; i++) { 1631 for (i = 0; i < cachep->num; i++) {
1601 void *objp = slabp->s_mem + cachep->buffer_size * i; 1632 void *objp = index_to_obj(cachep, slabp, i);
1602 1633
1603 if (cachep->flags & SLAB_POISON) { 1634 if (cachep->flags & SLAB_POISON) {
1604#ifdef CONFIG_DEBUG_PAGEALLOC 1635#ifdef CONFIG_DEBUG_PAGEALLOC
1605 if ((cachep->buffer_size % PAGE_SIZE) == 0 1636 if (cachep->buffer_size % PAGE_SIZE == 0 &&
1606 && OFF_SLAB(cachep)) 1637 OFF_SLAB(cachep))
1607 kernel_map_pages(virt_to_page(objp), 1638 kernel_map_pages(virt_to_page(objp),
1608 cachep->buffer_size / PAGE_SIZE, 1639 cachep->buffer_size / PAGE_SIZE, 1);
1609 1);
1610 else 1640 else
1611 check_poison_obj(cachep, objp); 1641 check_poison_obj(cachep, objp);
1612#else 1642#else
@@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1631 if (cachep->dtor) { 1661 if (cachep->dtor) {
1632 int i; 1662 int i;
1633 for (i = 0; i < cachep->num; i++) { 1663 for (i = 0; i < cachep->num; i++) {
1634 void *objp = slabp->s_mem + cachep->buffer_size * i; 1664 void *objp = index_to_obj(cachep, slabp, i);
1635 (cachep->dtor) (objp, cachep, 0); 1665 (cachep->dtor) (objp, cachep, 0);
1636 } 1666 }
1637 } 1667 }
@@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1639#endif 1669#endif
1640 1670
1641/** 1671/**
1672 * slab_destroy - destroy and release all objects in a slab
1673 * @cachep: cache pointer being destroyed
1674 * @slabp: slab pointer being destroyed
1675 *
1642 * Destroy all the objs in a slab, and release the mem back to the system. 1676 * Destroy all the objs in a slab, and release the mem back to the system.
1643 * Before calling the slab must have been unlinked from the cache. 1677 * Before calling the slab must have been unlinked from the cache. The
1644 * The cache-lock is not held/needed. 1678 * cache-lock is not held/needed.
1645 */ 1679 */
1646static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1680static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1647{ 1681{
@@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1662 } 1696 }
1663} 1697}
1664 1698
1665/* For setting up all the kmem_list3s for cache whose buffer_size is same 1699/*
1666 as size of kmem_list3. */ 1700 * For setting up all the kmem_list3s for cache whose buffer_size is same as
1701 * size of kmem_list3.
1702 */
1667static void set_up_list3s(struct kmem_cache *cachep, int index) 1703static void set_up_list3s(struct kmem_cache *cachep, int index)
1668{ 1704{
1669 int node; 1705 int node;
@@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1689 * high order pages for slabs. When the gfp() functions are more friendly 1725 * high order pages for slabs. When the gfp() functions are more friendly
1690 * towards high-order requests, this should be changed. 1726 * towards high-order requests, this should be changed.
1691 */ 1727 */
1692static inline size_t calculate_slab_order(struct kmem_cache *cachep, 1728static size_t calculate_slab_order(struct kmem_cache *cachep,
1693 size_t size, size_t align, unsigned long flags) 1729 size_t size, size_t align, unsigned long flags)
1694{ 1730{
1695 size_t left_over = 0; 1731 size_t left_over = 0;
1696 int gfporder; 1732 int gfporder;
1697 1733
1698 for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { 1734 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1699 unsigned int num; 1735 unsigned int num;
1700 size_t remainder; 1736 size_t remainder;
1701 1737
@@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1730 /* 1766 /*
1731 * Acceptable internal fragmentation? 1767 * Acceptable internal fragmentation?
1732 */ 1768 */
1733 if ((left_over * 8) <= (PAGE_SIZE << gfporder)) 1769 if (left_over * 8 <= (PAGE_SIZE << gfporder))
1734 break; 1770 break;
1735 } 1771 }
1736 return left_over; 1772 return left_over;
1737} 1773}
1738 1774
1775static void setup_cpu_cache(struct kmem_cache *cachep)
1776{
1777 if (g_cpucache_up == FULL) {
1778 enable_cpucache(cachep);
1779 return;
1780 }
1781 if (g_cpucache_up == NONE) {
1782 /*
1783 * Note: the first kmem_cache_create must create the cache
1784 * that's used by kmalloc(24), otherwise the creation of
1785 * further caches will BUG().
1786 */
1787 cachep->array[smp_processor_id()] = &initarray_generic.cache;
1788
1789 /*
1790 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1791 * the first cache, then we need to set up all its list3s,
1792 * otherwise the creation of further caches will BUG().
1793 */
1794 set_up_list3s(cachep, SIZE_AC);
1795 if (INDEX_AC == INDEX_L3)
1796 g_cpucache_up = PARTIAL_L3;
1797 else
1798 g_cpucache_up = PARTIAL_AC;
1799 } else {
1800 cachep->array[smp_processor_id()] =
1801 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1802
1803 if (g_cpucache_up == PARTIAL_AC) {
1804 set_up_list3s(cachep, SIZE_L3);
1805 g_cpucache_up = PARTIAL_L3;
1806 } else {
1807 int node;
1808 for_each_online_node(node) {
1809 cachep->nodelists[node] =
1810 kmalloc_node(sizeof(struct kmem_list3),
1811 GFP_KERNEL, node);
1812 BUG_ON(!cachep->nodelists[node]);
1813 kmem_list3_init(cachep->nodelists[node]);
1814 }
1815 }
1816 }
1817 cachep->nodelists[numa_node_id()]->next_reap =
1818 jiffies + REAPTIMEOUT_LIST3 +
1819 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1820
1821 cpu_cache_get(cachep)->avail = 0;
1822 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1823 cpu_cache_get(cachep)->batchcount = 1;
1824 cpu_cache_get(cachep)->touched = 0;
1825 cachep->batchcount = 1;
1826 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1827}
1828
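setup_cpu_cache() pulls the bootstrap special-casing out of kmem_cache_create(). Read as a small state machine over g_cpucache_up (an assumed summary of the code above, not an authoritative description):

        /*
         * NONE        first cache: static initarray_generic + static list3s
         * PARTIAL_AC  kmalloc() works for the array cache; list3s still static
         * PARTIAL_L3  kmalloc_node() works, so per-node kmem_list3 is allocated
         * FULL        enable_cpucache() tunes limits the normal way
         */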
1739/** 1829/**
1740 * kmem_cache_create - Create a cache. 1830 * kmem_cache_create - Create a cache.
1741 * @name: A string which is used in /proc/slabinfo to identify this cache. 1831 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1751 * and the @dtor is run before the pages are handed back. 1841 * and the @dtor is run before the pages are handed back.
1752 * 1842 *
1753 * @name must be valid until the cache is destroyed. This implies that 1843 * @name must be valid until the cache is destroyed. This implies that
1754 * the module calling this has to destroy the cache before getting 1844 * the module calling this has to destroy the cache before getting unloaded.
1755 * unloaded. 1845 *
1756 *
1757 * The flags are 1846 * The flags are
1758 * 1847 *
1759 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1848 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1762 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1851 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1763 * for buffer overruns. 1852 * for buffer overruns.
1764 * 1853 *
1765 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1766 * memory pressure.
1767 *
1768 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1854 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1769 * cacheline. This can be beneficial if you're counting cycles as closely 1855 * cacheline. This can be beneficial if you're counting cycles as closely
1770 * as davem. 1856 * as davem.
1771 */ 1857 */
1772struct kmem_cache * 1858struct kmem_cache *
1773kmem_cache_create (const char *name, size_t size, size_t align, 1859kmem_cache_create (const char *name, size_t size, size_t align,
1774 unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), 1860 unsigned long flags,
1861 void (*ctor)(void*, struct kmem_cache *, unsigned long),
1775 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1862 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1776{ 1863{
1777 size_t left_over, slab_size, ralign; 1864 size_t left_over, slab_size, ralign;
@@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1781 /* 1868 /*
1782 * Sanity checks... these are all serious usage bugs. 1869 * Sanity checks... these are all serious usage bugs.
1783 */ 1870 */
1784 if ((!name) || 1871 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
1785 in_interrupt() ||
1786 (size < BYTES_PER_WORD) ||
1787 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 1872 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1788 printk(KERN_ERR "%s: Early error in slab %s\n", 1873 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
1789 __FUNCTION__, name); 1874 name);
1790 BUG(); 1875 BUG();
1791 } 1876 }
1792 1877
@@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1840 * above the next power of two: caches with object sizes just above a 1925 * above the next power of two: caches with object sizes just above a
1841 * power of two have a significant amount of internal fragmentation. 1926 * power of two have a significant amount of internal fragmentation.
1842 */ 1927 */
1843 if ((size < 4096 1928 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
1844 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1845 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 1929 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1846 if (!(flags & SLAB_DESTROY_BY_RCU)) 1930 if (!(flags & SLAB_DESTROY_BY_RCU))
1847 flags |= SLAB_POISON; 1931 flags |= SLAB_POISON;
@@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1853 BUG_ON(dtor); 1937 BUG_ON(dtor);
1854 1938
1855 /* 1939 /*
1856 * Always checks flags, a caller might be expecting debug 1940 * Always checks flags, a caller might be expecting debug support which
1857 * support which isn't available. 1941 * isn't available.
1858 */ 1942 */
1859 if (flags & ~CREATE_MASK) 1943 if (flags & ~CREATE_MASK)
1860 BUG(); 1944 BUG();
1861 1945
1862 /* Check that size is in terms of words. This is needed to avoid 1946 /*
1947 * Check that size is in terms of words. This is needed to avoid
1863 * unaligned accesses for some archs when redzoning is used, and makes 1948 * unaligned accesses for some archs when redzoning is used, and makes
1864 * sure any on-slab bufctl's are also correctly aligned. 1949 * sure any on-slab bufctl's are also correctly aligned.
1865 */ 1950 */
@@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1868 size &= ~(BYTES_PER_WORD - 1); 1953 size &= ~(BYTES_PER_WORD - 1);
1869 } 1954 }
1870 1955
1871 /* calculate out the final buffer alignment: */ 1956 /* calculate the final buffer alignment: */
1957
1872 /* 1) arch recommendation: can be overridden for debug */ 1958 /* 1) arch recommendation: can be overridden for debug */
1873 if (flags & SLAB_HWCACHE_ALIGN) { 1959 if (flags & SLAB_HWCACHE_ALIGN) {
1874 /* Default alignment: as specified by the arch code. 1960 /*
1875 * Except if an object is really small, then squeeze multiple 1961 * Default alignment: as specified by the arch code. Except if
1876 * objects into one cacheline. 1962 * an object is really small, then squeeze multiple objects into
1963 * one cacheline.
1877 */ 1964 */
1878 ralign = cache_line_size(); 1965 ralign = cache_line_size();
1879 while (size <= ralign / 2) 1966 while (size <= ralign / 2)
@@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1893 if (ralign > BYTES_PER_WORD) 1980 if (ralign > BYTES_PER_WORD)
1894 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 1981 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1895 } 1982 }
1896 /* 4) Store it. Note that the debug code below can reduce 1983 /*
1984 * 4) Store it. Note that the debug code below can reduce
1897 * the alignment to BYTES_PER_WORD. 1985 * the alignment to BYTES_PER_WORD.
1898 */ 1986 */
1899 align = ralign; 1987 align = ralign;
@@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1978 cachep->gfpflags = 0; 2066 cachep->gfpflags = 0;
1979 if (flags & SLAB_CACHE_DMA) 2067 if (flags & SLAB_CACHE_DMA)
1980 cachep->gfpflags |= GFP_DMA; 2068 cachep->gfpflags |= GFP_DMA;
1981 spin_lock_init(&cachep->spinlock);
1982 cachep->buffer_size = size; 2069 cachep->buffer_size = size;
1983 2070
1984 if (flags & CFLGS_OFF_SLAB) 2071 if (flags & CFLGS_OFF_SLAB)
@@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1988 cachep->name = name; 2075 cachep->name = name;
1989 2076
1990 2077
1991 if (g_cpucache_up == FULL) { 2078 setup_cpu_cache(cachep);
1992 enable_cpucache(cachep);
1993 } else {
1994 if (g_cpucache_up == NONE) {
1995 /* Note: the first kmem_cache_create must create
1996 * the cache that's used by kmalloc(24), otherwise
1997 * the creation of further caches will BUG().
1998 */
1999 cachep->array[smp_processor_id()] =
2000 &initarray_generic.cache;
2001
2002 /* If the cache that's used by
2003 * kmalloc(sizeof(kmem_list3)) is the first cache,
2004 * then we need to set up all its list3s, otherwise
2005 * the creation of further caches will BUG().
2006 */
2007 set_up_list3s(cachep, SIZE_AC);
2008 if (INDEX_AC == INDEX_L3)
2009 g_cpucache_up = PARTIAL_L3;
2010 else
2011 g_cpucache_up = PARTIAL_AC;
2012 } else {
2013 cachep->array[smp_processor_id()] =
2014 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2015
2016 if (g_cpucache_up == PARTIAL_AC) {
2017 set_up_list3s(cachep, SIZE_L3);
2018 g_cpucache_up = PARTIAL_L3;
2019 } else {
2020 int node;
2021 for_each_online_node(node) {
2022
2023 cachep->nodelists[node] =
2024 kmalloc_node(sizeof
2025 (struct kmem_list3),
2026 GFP_KERNEL, node);
2027 BUG_ON(!cachep->nodelists[node]);
2028 kmem_list3_init(cachep->
2029 nodelists[node]);
2030 }
2031 }
2032 }
2033 cachep->nodelists[numa_node_id()]->next_reap =
2034 jiffies + REAPTIMEOUT_LIST3 +
2035 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2036
2037 BUG_ON(!cpu_cache_get(cachep));
2038 cpu_cache_get(cachep)->avail = 0;
2039 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2040 cpu_cache_get(cachep)->batchcount = 1;
2041 cpu_cache_get(cachep)->touched = 0;
2042 cachep->batchcount = 1;
2043 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2044 }
2045 2079
2046 /* cache setup completed, link it into the list */ 2080 /* cache setup completed, link it into the list */
2047 list_add(&cachep->next, &cache_chain); 2081 list_add(&cachep->next, &cache_chain);
2048 oops: 2082oops:
2049 if (!cachep && (flags & SLAB_PANIC)) 2083 if (!cachep && (flags & SLAB_PANIC))
2050 panic("kmem_cache_create(): failed to create slab `%s'\n", 2084 panic("kmem_cache_create(): failed to create slab `%s'\n",
2051 name); 2085 name);
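
For readers less familiar with this interface, here is a minimal sketch of how the API documented in this hunk is typically consumed. The structure, cache name and constructor below are illustrative only and are not part of this patch; the signature matches the one shown above, where a constructor without a destructor is allowed but a destructor without a constructor trips the sanity BUG().

#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>

/* hypothetical object managed by a dedicated cache */
struct foo {
	int state;
};

static struct kmem_cache *foo_cachep;

/* runs for every object when a fresh slab is populated */
static void foo_ctor(void *obj, struct kmem_cache *cachep, unsigned long flags)
{
	struct foo *f = obj;

	if (flags & SLAB_CTOR_CONSTRUCTOR)
		f->state = 0;
}

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN,
				       foo_ctor, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

Objects then come from kmem_cache_alloc(foo_cachep, GFP_KERNEL) and go back via kmem_cache_free(); the owning module must call kmem_cache_destroy(foo_cachep) before unloading, which is the @name lifetime rule restated in the comment above.
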
@@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2089#define check_spinlock_acquired_node(x, y) do { } while(0) 2123#define check_spinlock_acquired_node(x, y) do { } while(0)
2090#endif 2124#endif
2091 2125
2092/* 2126static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2093 * Waits for all CPUs to execute func(). 2127 struct array_cache *ac,
2094 */ 2128 int force, int node);
2095static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
2096{
2097 check_irq_on();
2098 preempt_disable();
2099
2100 local_irq_disable();
2101 func(arg);
2102 local_irq_enable();
2103
2104 if (smp_call_function(func, arg, 1, 1))
2105 BUG();
2106
2107 preempt_enable();
2108}
2109
2110static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
2111 int force, int node);
2112 2129
2113static void do_drain(void *arg) 2130static void do_drain(void *arg)
2114{ 2131{
2115 struct kmem_cache *cachep = (struct kmem_cache *) arg; 2132 struct kmem_cache *cachep = arg;
2116 struct array_cache *ac; 2133 struct array_cache *ac;
2117 int node = numa_node_id(); 2134 int node = numa_node_id();
2118 2135
@@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2129 struct kmem_list3 *l3; 2146 struct kmem_list3 *l3;
2130 int node; 2147 int node;
2131 2148
2132 smp_call_function_all_cpus(do_drain, cachep); 2149 on_each_cpu(do_drain, cachep, 1, 1);
2133 check_irq_on(); 2150 check_irq_on();
2134 for_each_online_node(node) { 2151 for_each_online_node(node) {
2135 l3 = cachep->nodelists[node]; 2152 l3 = cachep->nodelists[node];
2136 if (l3) { 2153 if (l3) {
2137 spin_lock_irq(&l3->list_lock); 2154 drain_array(cachep, l3, l3->shared, 1, node);
2138 drain_array_locked(cachep, l3->shared, 1, node);
2139 spin_unlock_irq(&l3->list_lock);
2140 if (l3->alien) 2155 if (l3->alien)
2141 drain_alien_cache(cachep, l3->alien); 2156 drain_alien_cache(cachep, l3->alien);
2142 } 2157 }
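
The two hunks above drop slab's private smp_call_function_all_cpus() in favour of the stock on_each_cpu() helper, which provides the same behaviour: the callback runs on every online CPU, including the caller, with interrupts off around the call, and the trailing (1, 1) arguments request the retry-and-wait semantics the removed helper implemented by hand. A tiny self-contained sketch with made-up names:

#include <linux/smp.h>
#include <asm/atomic.h>

static atomic_t visits = ATOMIC_INIT(0);

static void count_me(void *unused)
{
	atomic_inc(&visits);		/* executed once per online CPU */
}

static void run_everywhere(void)
{
	/* same (retry = 1, wait = 1) arguments the slab code passes */
	on_each_cpu(count_me, NULL, 1, 1);
}
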
@@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2260 2275
2261 /* NUMA: free the list3 structures */ 2276 /* NUMA: free the list3 structures */
2262 for_each_online_node(i) { 2277 for_each_online_node(i) {
2263 if ((l3 = cachep->nodelists[i])) { 2278 l3 = cachep->nodelists[i];
2279 if (l3) {
2264 kfree(l3->shared); 2280 kfree(l3->shared);
2265 free_alien_cache(l3->alien); 2281 free_alien_cache(l3->alien);
2266 kfree(l3); 2282 kfree(l3);
2267 } 2283 }
2268 } 2284 }
2269 kmem_cache_free(&cache_cache, cachep); 2285 kmem_cache_free(&cache_cache, cachep);
2270
2271 unlock_cpu_hotplug(); 2286 unlock_cpu_hotplug();
2272
2273 return 0; 2287 return 0;
2274} 2288}
2275EXPORT_SYMBOL(kmem_cache_destroy); 2289EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2292 slabp->inuse = 0; 2306 slabp->inuse = 0;
2293 slabp->colouroff = colour_off; 2307 slabp->colouroff = colour_off;
2294 slabp->s_mem = objp + colour_off; 2308 slabp->s_mem = objp + colour_off;
2295
2296 return slabp; 2309 return slabp;
2297} 2310}
2298 2311
@@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2307 int i; 2320 int i;
2308 2321
2309 for (i = 0; i < cachep->num; i++) { 2322 for (i = 0; i < cachep->num; i++) {
2310 void *objp = slabp->s_mem + cachep->buffer_size * i; 2323 void *objp = index_to_obj(cachep, slabp, i);
2311#if DEBUG 2324#if DEBUG
2312 /* need to poison the objs? */ 2325 /* need to poison the objs? */
2313 if (cachep->flags & SLAB_POISON) 2326 if (cachep->flags & SLAB_POISON)
@@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
2320 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2333 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2321 } 2334 }
2322 /* 2335 /*
2323 * Constructors are not allowed to allocate memory from 2336 * Constructors are not allowed to allocate memory from the same
2324 * the same cache which they are a constructor for. 2337 * cache which they are a constructor for. Otherwise, deadlock.
2325 * Otherwise, deadlock. They must also be threaded. 2338 * They must also be threaded.
2326 */ 2339 */
2327 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2340 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2328 cachep->ctor(objp + obj_offset(cachep), cachep, 2341 cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
2336 slab_error(cachep, "constructor overwrote the" 2349 slab_error(cachep, "constructor overwrote the"
2337 " start of an object"); 2350 " start of an object");
2338 } 2351 }
2339 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2352 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2340 && cachep->flags & SLAB_POISON) 2353 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2341 kernel_map_pages(virt_to_page(objp), 2354 kernel_map_pages(virt_to_page(objp),
2342 cachep->buffer_size / PAGE_SIZE, 0); 2355 cachep->buffer_size / PAGE_SIZE, 0);
2343#else 2356#else
@@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep,
2352 2365
2353static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2366static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2354{ 2367{
2355 if (flags & SLAB_DMA) { 2368 if (flags & SLAB_DMA)
2356 if (!(cachep->gfpflags & GFP_DMA)) 2369 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2357 BUG(); 2370 else
2358 } else { 2371 BUG_ON(cachep->gfpflags & GFP_DMA);
2359 if (cachep->gfpflags & GFP_DMA)
2360 BUG();
2361 }
2362} 2372}
2363 2373
2364static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) 2374static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2375 int nodeid)
2365{ 2376{
2366 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); 2377 void *objp = index_to_obj(cachep, slabp, slabp->free);
2367 kmem_bufctl_t next; 2378 kmem_bufctl_t next;
2368 2379
2369 slabp->inuse++; 2380 slabp->inuse++;
@@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
2377 return objp; 2388 return objp;
2378} 2389}
2379 2390
2380static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, 2391static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2381 int nodeid) 2392 void *objp, int nodeid)
2382{ 2393{
2383 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; 2394 unsigned int objnr = obj_to_index(cachep, slabp, objp);
2384 2395
2385#if DEBUG 2396#if DEBUG
2386 /* Verify that the slab belongs to the intended node */ 2397 /* Verify that the slab belongs to the intended node */
@@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2388 2399
2389 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2400 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2390 printk(KERN_ERR "slab: double free detected in cache " 2401 printk(KERN_ERR "slab: double free detected in cache "
2391 "'%s', objp %p\n", cachep->name, objp); 2402 "'%s', objp %p\n", cachep->name, objp);
2392 BUG(); 2403 BUG();
2393 } 2404 }
2394#endif 2405#endif
@@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2397 slabp->inuse--; 2408 slabp->inuse--;
2398} 2409}
2399 2410
2400static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) 2411static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
2412 void *objp)
2401{ 2413{
2402 int i; 2414 int i;
2403 struct page *page; 2415 struct page *page;
2404 2416
2405 /* Nasty!!!!!! I hope this is OK. */ 2417 /* Nasty!!!!!! I hope this is OK. */
2406 i = 1 << cachep->gfporder;
2407 page = virt_to_page(objp); 2418 page = virt_to_page(objp);
2419
2420 i = 1;
2421 if (likely(!PageCompound(page)))
2422 i <<= cachep->gfporder;
2408 do { 2423 do {
2409 page_set_cache(page, cachep); 2424 page_set_cache(page, cachep);
2410 page_set_slab(page, slabp); 2425 page_set_slab(page, slabp);
@@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2425 unsigned long ctor_flags; 2440 unsigned long ctor_flags;
2426 struct kmem_list3 *l3; 2441 struct kmem_list3 *l3;
2427 2442
2428 /* Be lazy and only check for valid flags here, 2443 /*
2429 * keeping it out of the critical path in kmem_cache_alloc(). 2444 * Be lazy and only check for valid flags here, keeping it out of the
2445 * critical path in kmem_cache_alloc().
2430 */ 2446 */
2431 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) 2447 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2432 BUG(); 2448 BUG();
@@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2467 */ 2483 */
2468 kmem_flagcheck(cachep, flags); 2484 kmem_flagcheck(cachep, flags);
2469 2485
2470 /* Get mem for the objs. 2486 /*
2471 * Attempt to allocate a physical page from 'nodeid', 2487 * Get mem for the objs. Attempt to allocate a physical page from
2488 * 'nodeid'.
2472 */ 2489 */
2473 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2490 objp = kmem_getpages(cachep, flags, nodeid);
2491 if (!objp)
2474 goto failed; 2492 goto failed;
2475 2493
2476 /* Get slab management. */ 2494 /* Get slab management. */
2477 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2495 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
2496 if (!slabp)
2478 goto opps1; 2497 goto opps1;
2479 2498
2480 slabp->nodeid = nodeid; 2499 slabp->nodeid = nodeid;
@@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2493 l3->free_objects += cachep->num; 2512 l3->free_objects += cachep->num;
2494 spin_unlock(&l3->list_lock); 2513 spin_unlock(&l3->list_lock);
2495 return 1; 2514 return 1;
2496 opps1: 2515opps1:
2497 kmem_freepages(cachep, objp); 2516 kmem_freepages(cachep, objp);
2498 failed: 2517failed:
2499 if (local_flags & __GFP_WAIT) 2518 if (local_flags & __GFP_WAIT)
2500 local_irq_disable(); 2519 local_irq_disable();
2501 return 0; 2520 return 0;
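
Much of the churn in this hunk and the one before it is the same Documentation/CodingStyle conversion applied throughout the file: assignments are lifted out of if conditions. A generic before/after sketch, where get_buffer() is a placeholder rather than a slab function:

static void *get_buffer(void)
{
	return NULL;			/* placeholder body so the sketch stands alone */
}

/* before: assignment buried in the condition */
static void *old_style(void)
{
	void *p;

	if (!(p = get_buffer()))
		return NULL;
	return p;
}

/* after: the form this patch converts to */
static void *new_style(void)
{
	void *p;

	p = get_buffer();
	if (!p)
		return NULL;
	return p;
}
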
@@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2538 page = virt_to_page(objp); 2557 page = virt_to_page(objp);
2539 2558
2540 if (page_get_cache(page) != cachep) { 2559 if (page_get_cache(page) != cachep) {
2541 printk(KERN_ERR 2560 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2542 "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2561 "cache %p, got %p\n",
2543 page_get_cache(page), cachep); 2562 page_get_cache(page), cachep);
2544 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2563 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2545 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), 2564 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2549 slabp = page_get_slab(page); 2568 slabp = page_get_slab(page);
2550 2569
2551 if (cachep->flags & SLAB_RED_ZONE) { 2570 if (cachep->flags & SLAB_RED_ZONE) {
2552 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE 2571 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
2553 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2572 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2554 slab_error(cachep, 2573 slab_error(cachep, "double free, or memory outside"
2555 "double free, or memory outside" 2574 " object was overwritten");
2556 " object was overwritten"); 2575 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2557 printk(KERN_ERR 2576 "redzone 2:0x%lx.\n",
2558 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2559 objp, *dbg_redzone1(cachep, objp), 2577 objp, *dbg_redzone1(cachep, objp),
2560 *dbg_redzone2(cachep, objp)); 2578 *dbg_redzone2(cachep, objp));
2561 } 2579 }
@@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2565 if (cachep->flags & SLAB_STORE_USER) 2583 if (cachep->flags & SLAB_STORE_USER)
2566 *dbg_userword(cachep, objp) = caller; 2584 *dbg_userword(cachep, objp) = caller;
2567 2585
2568 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 2586 objnr = obj_to_index(cachep, slabp, objp);
2569 2587
2570 BUG_ON(objnr >= cachep->num); 2588 BUG_ON(objnr >= cachep->num);
2571 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); 2589 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2572 2590
2573 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2591 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2574 /* Need to call the slab's constructor so the 2592 /*
2575 * caller can perform a verify of its state (debugging). 2593 * Need to call the slab's constructor so the caller can
2576 * Called without the cache-lock held. 2594 * perform a verify of its state (debugging). Called without
2595 * the cache-lock held.
2577 */ 2596 */
2578 cachep->ctor(objp + obj_offset(cachep), 2597 cachep->ctor(objp + obj_offset(cachep),
2579 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2598 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2586 } 2605 }
2587 if (cachep->flags & SLAB_POISON) { 2606 if (cachep->flags & SLAB_POISON) {
2588#ifdef CONFIG_DEBUG_PAGEALLOC 2607#ifdef CONFIG_DEBUG_PAGEALLOC
2589 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2608 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2590 store_stackinfo(cachep, objp, (unsigned long)caller); 2609 store_stackinfo(cachep, objp, (unsigned long)caller);
2591 kernel_map_pages(virt_to_page(objp), 2610 kernel_map_pages(virt_to_page(objp),
2592 cachep->buffer_size / PAGE_SIZE, 0); 2611 cachep->buffer_size / PAGE_SIZE, 0);
@@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2612 goto bad; 2631 goto bad;
2613 } 2632 }
2614 if (entries != cachep->num - slabp->inuse) { 2633 if (entries != cachep->num - slabp->inuse) {
2615 bad: 2634bad:
2616 printk(KERN_ERR 2635 printk(KERN_ERR "slab: Internal list corruption detected in "
2617 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2636 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2618 cachep->name, cachep->num, slabp, slabp->inuse); 2637 cachep->name, cachep->num, slabp, slabp->inuse);
2619 for (i = 0; 2638 for (i = 0;
2620 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2639 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2621 i++) { 2640 i++) {
2622 if ((i % 16) == 0) 2641 if (i % 16 == 0)
2623 printk("\n%03x:", i); 2642 printk("\n%03x:", i);
2624 printk(" %02x", ((unsigned char *)slabp)[i]); 2643 printk(" %02x", ((unsigned char *)slabp)[i]);
2625 } 2644 }
@@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2641 2660
2642 check_irq_off(); 2661 check_irq_off();
2643 ac = cpu_cache_get(cachep); 2662 ac = cpu_cache_get(cachep);
2644 retry: 2663retry:
2645 batchcount = ac->batchcount; 2664 batchcount = ac->batchcount;
2646 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2665 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2647 /* if there was little recent activity on this 2666 /*
2648 * cache, then perform only a partial refill. 2667 * If there was little recent activity on this cache, then
2649 * Otherwise we could generate refill bouncing. 2668 * perform only a partial refill. Otherwise we could generate
2669 * refill bouncing.
2650 */ 2670 */
2651 batchcount = BATCHREFILL_LIMIT; 2671 batchcount = BATCHREFILL_LIMIT;
2652 } 2672 }
@@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2702 list_add(&slabp->list, &l3->slabs_partial); 2722 list_add(&slabp->list, &l3->slabs_partial);
2703 } 2723 }
2704 2724
2705 must_grow: 2725must_grow:
2706 l3->free_objects -= ac->avail; 2726 l3->free_objects -= ac->avail;
2707 alloc_done: 2727alloc_done:
2708 spin_unlock(&l3->list_lock); 2728 spin_unlock(&l3->list_lock);
2709 2729
2710 if (unlikely(!ac->avail)) { 2730 if (unlikely(!ac->avail)) {
2711 int x; 2731 int x;
2712 x = cache_grow(cachep, flags, numa_node_id()); 2732 x = cache_grow(cachep, flags, numa_node_id());
2713 2733
2714 // cache_grow can reenable interrupts, then ac could change. 2734 /* cache_grow can reenable interrupts, then ac could change. */
2715 ac = cpu_cache_get(cachep); 2735 ac = cpu_cache_get(cachep);
2716 if (!x && ac->avail == 0) // no objects in sight? abort 2736 if (!x && ac->avail == 0) /* no objects in sight? abort */
2717 return NULL; 2737 return NULL;
2718 2738
2719 if (!ac->avail) // objects refilled by interrupt? 2739 if (!ac->avail) /* objects refilled by interrupt? */
2720 goto retry; 2740 goto retry;
2721 } 2741 }
2722 ac->touched = 1; 2742 ac->touched = 1;
2723 return ac->entry[--ac->avail]; 2743 return ac->entry[--ac->avail];
2724} 2744}
2725 2745
2726static inline void 2746static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2727cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) 2747 gfp_t flags)
2728{ 2748{
2729 might_sleep_if(flags & __GFP_WAIT); 2749 might_sleep_if(flags & __GFP_WAIT);
2730#if DEBUG 2750#if DEBUG
@@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2733} 2753}
2734 2754
2735#if DEBUG 2755#if DEBUG
2736static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, 2756static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2737 void *objp, void *caller) 2757 gfp_t flags, void *objp, void *caller)
2738{ 2758{
2739 if (!objp) 2759 if (!objp)
2740 return objp; 2760 return objp;
@@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
2754 *dbg_userword(cachep, objp) = caller; 2774 *dbg_userword(cachep, objp) = caller;
2755 2775
2756 if (cachep->flags & SLAB_RED_ZONE) { 2776 if (cachep->flags & SLAB_RED_ZONE) {
2757 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE 2777 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2758 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2778 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2759 slab_error(cachep, 2779 slab_error(cachep, "double free, or memory outside"
2760 "double free, or memory outside" 2780 " object was overwritten");
2761 " object was overwritten");
2762 printk(KERN_ERR 2781 printk(KERN_ERR
2763 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2782 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
2764 objp, *dbg_redzone1(cachep, objp), 2783 objp, *dbg_redzone1(cachep, objp),
2765 *dbg_redzone2(cachep, objp)); 2784 *dbg_redzone2(cachep, objp));
2766 } 2785 }
2767 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2786 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2768 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2787 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2809 return objp; 2828 return objp;
2810} 2829}
2811 2830
2812static __always_inline void * 2831static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2813__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 2832 gfp_t flags, void *caller)
2814{ 2833{
2815 unsigned long save_flags; 2834 unsigned long save_flags;
2816 void *objp; 2835 void *objp;
@@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2830/* 2849/*
2831 * A interface to enable slab creation on nodeid 2850 * A interface to enable slab creation on nodeid
2832 */ 2851 */
2833static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2852static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2853 int nodeid)
2834{ 2854{
2835 struct list_head *entry; 2855 struct list_head *entry;
2836 struct slab *slabp; 2856 struct slab *slabp;
@@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2841 l3 = cachep->nodelists[nodeid]; 2861 l3 = cachep->nodelists[nodeid];
2842 BUG_ON(!l3); 2862 BUG_ON(!l3);
2843 2863
2844 retry: 2864retry:
2845 check_irq_off(); 2865 check_irq_off();
2846 spin_lock(&l3->list_lock); 2866 spin_lock(&l3->list_lock);
2847 entry = l3->slabs_partial.next; 2867 entry = l3->slabs_partial.next;
@@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2868 /* move slabp to correct slabp list: */ 2888 /* move slabp to correct slabp list: */
2869 list_del(&slabp->list); 2889 list_del(&slabp->list);
2870 2890
2871 if (slabp->free == BUFCTL_END) { 2891 if (slabp->free == BUFCTL_END)
2872 list_add(&slabp->list, &l3->slabs_full); 2892 list_add(&slabp->list, &l3->slabs_full);
2873 } else { 2893 else
2874 list_add(&slabp->list, &l3->slabs_partial); 2894 list_add(&slabp->list, &l3->slabs_partial);
2875 }
2876 2895
2877 spin_unlock(&l3->list_lock); 2896 spin_unlock(&l3->list_lock);
2878 goto done; 2897 goto done;
2879 2898
2880 must_grow: 2899must_grow:
2881 spin_unlock(&l3->list_lock); 2900 spin_unlock(&l3->list_lock);
2882 x = cache_grow(cachep, flags, nodeid); 2901 x = cache_grow(cachep, flags, nodeid);
2883 2902
@@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2885 return NULL; 2904 return NULL;
2886 2905
2887 goto retry; 2906 goto retry;
2888 done: 2907done:
2889 return obj; 2908 return obj;
2890} 2909}
2891#endif 2910#endif
@@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2958 } 2977 }
2959 2978
2960 free_block(cachep, ac->entry, batchcount, node); 2979 free_block(cachep, ac->entry, batchcount, node);
2961 free_done: 2980free_done:
2962#if STATS 2981#if STATS
2963 { 2982 {
2964 int i = 0; 2983 int i = 0;
@@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2979#endif 2998#endif
2980 spin_unlock(&l3->list_lock); 2999 spin_unlock(&l3->list_lock);
2981 ac->avail -= batchcount; 3000 ac->avail -= batchcount;
2982 memmove(ac->entry, &(ac->entry[batchcount]), 3001 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
2983 sizeof(void *) * ac->avail);
2984} 3002}
2985 3003
2986/* 3004/*
2987 * __cache_free 3005 * Release an obj back to its cache. If the obj has a constructed state, it must
2988 * Release an obj back to its cache. If the obj has a constructed 3006 * be in this state _before_ it is released. Called with disabled ints.
2989 * state, it must be in this state _before_ it is released.
2990 *
2991 * Called with disabled ints.
2992 */ 3007 */
2993static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3008static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2994{ 3009{
@@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3007 if (unlikely(slabp->nodeid != numa_node_id())) { 3022 if (unlikely(slabp->nodeid != numa_node_id())) {
3008 struct array_cache *alien = NULL; 3023 struct array_cache *alien = NULL;
3009 int nodeid = slabp->nodeid; 3024 int nodeid = slabp->nodeid;
3010 struct kmem_list3 *l3 = 3025 struct kmem_list3 *l3;
3011 cachep->nodelists[numa_node_id()];
3012 3026
3027 l3 = cachep->nodelists[numa_node_id()];
3013 STATS_INC_NODEFREES(cachep); 3028 STATS_INC_NODEFREES(cachep);
3014 if (l3->alien && l3->alien[nodeid]) { 3029 if (l3->alien && l3->alien[nodeid]) {
3015 alien = l3->alien[nodeid]; 3030 alien = l3->alien[nodeid];
@@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3093 if (unlikely(page_get_cache(page) != cachep)) 3108 if (unlikely(page_get_cache(page) != cachep))
3094 goto out; 3109 goto out;
3095 return 1; 3110 return 1;
3096 out: 3111out:
3097 return 0; 3112 return 0;
3098} 3113}
3099 3114
@@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3119 local_irq_save(save_flags); 3134 local_irq_save(save_flags);
3120 3135
3121 if (nodeid == -1 || nodeid == numa_node_id() || 3136 if (nodeid == -1 || nodeid == numa_node_id() ||
3122 !cachep->nodelists[nodeid]) 3137 !cachep->nodelists[nodeid])
3123 ptr = ____cache_alloc(cachep, flags); 3138 ptr = ____cache_alloc(cachep, flags);
3124 else 3139 else
3125 ptr = __cache_alloc_node(cachep, flags, nodeid); 3140 ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node);
3148 * kmalloc - allocate memory 3163 * kmalloc - allocate memory
3149 * @size: how many bytes of memory are required. 3164 * @size: how many bytes of memory are required.
3150 * @flags: the type of memory to allocate. 3165 * @flags: the type of memory to allocate.
3166 * @caller: function caller for debug tracking of the caller
3151 * 3167 *
3152 * kmalloc is the normal method of allocating memory 3168 * kmalloc is the normal method of allocating memory
3153 * in the kernel. 3169 * in the kernel.
@@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size)
3236 /* Catch derefs w/o wrappers */ 3252 /* Catch derefs w/o wrappers */
3237 return (void *)(~(unsigned long)pdata); 3253 return (void *)(~(unsigned long)pdata);
3238 3254
3239 unwind_oom: 3255unwind_oom:
3240 while (--i >= 0) { 3256 while (--i >= 0) {
3241 if (!cpu_possible(i)) 3257 if (!cpu_possible(i))
3242 continue; 3258 continue;
@@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3339 struct array_cache *nc = NULL, *new; 3355 struct array_cache *nc = NULL, *new;
3340 struct array_cache **new_alien = NULL; 3356 struct array_cache **new_alien = NULL;
3341#ifdef CONFIG_NUMA 3357#ifdef CONFIG_NUMA
3342 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3358 new_alien = alloc_alien_cache(node, cachep->limit);
3359 if (!new_alien)
3343 goto fail; 3360 goto fail;
3344#endif 3361#endif
3345 if (!(new = alloc_arraycache(node, (cachep->shared * 3362 new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
3346 cachep->batchcount), 3363 0xbaadf00d);
3347 0xbaadf00d))) 3364 if (!new)
3348 goto fail; 3365 goto fail;
3349 if ((l3 = cachep->nodelists[node])) { 3366 l3 = cachep->nodelists[node];
3350 3367 if (l3) {
3351 spin_lock_irq(&l3->list_lock); 3368 spin_lock_irq(&l3->list_lock);
3352 3369
3353 if ((nc = cachep->nodelists[node]->shared)) 3370 nc = cachep->nodelists[node]->shared;
3371 if (nc)
3354 free_block(cachep, nc->entry, nc->avail, node); 3372 free_block(cachep, nc->entry, nc->avail, node);
3355 3373
3356 l3->shared = new; 3374 l3->shared = new;
@@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3359 new_alien = NULL; 3377 new_alien = NULL;
3360 } 3378 }
3361 l3->free_limit = (1 + nr_cpus_node(node)) * 3379 l3->free_limit = (1 + nr_cpus_node(node)) *
3362 cachep->batchcount + cachep->num; 3380 cachep->batchcount + cachep->num;
3363 spin_unlock_irq(&l3->list_lock); 3381 spin_unlock_irq(&l3->list_lock);
3364 kfree(nc); 3382 kfree(nc);
3365 free_alien_cache(new_alien); 3383 free_alien_cache(new_alien);
3366 continue; 3384 continue;
3367 } 3385 }
3368 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3386 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3369 GFP_KERNEL, node))) 3387 if (!l3)
3370 goto fail; 3388 goto fail;
3371 3389
3372 kmem_list3_init(l3); 3390 kmem_list3_init(l3);
3373 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3391 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3374 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3392 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3375 l3->shared = new; 3393 l3->shared = new;
3376 l3->alien = new_alien; 3394 l3->alien = new_alien;
3377 l3->free_limit = (1 + nr_cpus_node(node)) * 3395 l3->free_limit = (1 + nr_cpus_node(node)) *
3378 cachep->batchcount + cachep->num; 3396 cachep->batchcount + cachep->num;
3379 cachep->nodelists[node] = l3; 3397 cachep->nodelists[node] = l3;
3380 } 3398 }
3381 return err; 3399 return err;
3382 fail: 3400fail:
3383 err = -ENOMEM; 3401 err = -ENOMEM;
3384 return err; 3402 return err;
3385} 3403}
@@ -3391,7 +3409,7 @@ struct ccupdate_struct {
3391 3409
3392static void do_ccupdate_local(void *info) 3410static void do_ccupdate_local(void *info)
3393{ 3411{
3394 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3412 struct ccupdate_struct *new = info;
3395 struct array_cache *old; 3413 struct array_cache *old;
3396 3414
3397 check_irq_off(); 3415 check_irq_off();
@@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info)
3401 new->new[smp_processor_id()] = old; 3419 new->new[smp_processor_id()] = old;
3402} 3420}
3403 3421
3404static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, 3422/* Always called with the cache_chain_mutex held */
3405 int shared) 3423static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3424 int batchcount, int shared)
3406{ 3425{
3407 struct ccupdate_struct new; 3426 struct ccupdate_struct new;
3408 int i, err; 3427 int i, err;
3409 3428
3410 memset(&new.new, 0, sizeof(new.new)); 3429 memset(&new.new, 0, sizeof(new.new));
3411 for_each_online_cpu(i) { 3430 for_each_online_cpu(i) {
3412 new.new[i] = 3431 new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3413 alloc_arraycache(cpu_to_node(i), limit, batchcount); 3432 batchcount);
3414 if (!new.new[i]) { 3433 if (!new.new[i]) {
3415 for (i--; i >= 0; i--) 3434 for (i--; i >= 0; i--)
3416 kfree(new.new[i]); 3435 kfree(new.new[i]);
@@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3419 } 3438 }
3420 new.cachep = cachep; 3439 new.cachep = cachep;
3421 3440
3422 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3441 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3423 3442
3424 check_irq_on(); 3443 check_irq_on();
3425 spin_lock(&cachep->spinlock);
3426 cachep->batchcount = batchcount; 3444 cachep->batchcount = batchcount;
3427 cachep->limit = limit; 3445 cachep->limit = limit;
3428 cachep->shared = shared; 3446 cachep->shared = shared;
3429 spin_unlock(&cachep->spinlock);
3430 3447
3431 for_each_online_cpu(i) { 3448 for_each_online_cpu(i) {
3432 struct array_cache *ccold = new.new[i]; 3449 struct array_cache *ccold = new.new[i];
@@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3447 return 0; 3464 return 0;
3448} 3465}
3449 3466
3467/* Called with cache_chain_mutex held always */
3450static void enable_cpucache(struct kmem_cache *cachep) 3468static void enable_cpucache(struct kmem_cache *cachep)
3451{ 3469{
3452 int err; 3470 int err;
3453 int limit, shared; 3471 int limit, shared;
3454 3472
3455 /* The head array serves three purposes: 3473 /*
3474 * The head array serves three purposes:
3456 * - create a LIFO ordering, i.e. return objects that are cache-warm 3475 * - create a LIFO ordering, i.e. return objects that are cache-warm
3457 * - reduce the number of spinlock operations. 3476 * - reduce the number of spinlock operations.
3458 * - reduce the number of linked list operations on the slab and 3477 * - reduce the number of linked list operations on the slab and
3459 * bufctl chains: array operations are cheaper. 3478 * bufctl chains: array operations are cheaper.
3460 * The numbers are guessed, we should auto-tune as described by 3479 * The numbers are guessed, we should auto-tune as described by
3461 * Bonwick. 3480 * Bonwick.
@@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
3471 else 3490 else
3472 limit = 120; 3491 limit = 120;
3473 3492
3474 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3493 /*
3494 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3475 * allocation behaviour: Most allocs on one cpu, most free operations 3495 * allocation behaviour: Most allocs on one cpu, most free operations
3476 * on another cpu. For these cases, an efficient object passing between 3496 * on another cpu. For these cases, an efficient object passing between
3477 * cpus is necessary. This is provided by a shared array. The array 3497 * cpus is necessary. This is provided by a shared array. The array
@@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
3486#endif 3506#endif
3487 3507
3488#if DEBUG 3508#if DEBUG
3489 /* With debugging enabled, large batchcount lead to excessively 3509 /*
3490 * long periods with disabled local interrupts. Limit the 3510 * With debugging enabled, large batchcount lead to excessively long
3491 * batchcount 3511 * periods with disabled local interrupts. Limit the batchcount
3492 */ 3512 */
3493 if (limit > 32) 3513 if (limit > 32)
3494 limit = 32; 3514 limit = 32;
@@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
3499 cachep->name, -err); 3519 cachep->name, -err);
3500} 3520}
3501 3521
3502static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, 3522/*
3503 int force, int node) 3523 * Drain an array if it contains any elements taking the l3 lock only if
3524 * necessary. Note that the l3 listlock also protects the array_cache
3525 * if drain_array() is used on the shared array.
3526 */
3527void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3528 struct array_cache *ac, int force, int node)
3504{ 3529{
3505 int tofree; 3530 int tofree;
3506 3531
3507 check_spinlock_acquired_node(cachep, node); 3532 if (!ac || !ac->avail)
3533 return;
3508 if (ac->touched && !force) { 3534 if (ac->touched && !force) {
3509 ac->touched = 0; 3535 ac->touched = 0;
3510 } else if (ac->avail) { 3536 } else {
3511 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3537 spin_lock_irq(&l3->list_lock);
3512 if (tofree > ac->avail) { 3538 if (ac->avail) {
3513 tofree = (ac->avail + 1) / 2; 3539 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3540 if (tofree > ac->avail)
3541 tofree = (ac->avail + 1) / 2;
3542 free_block(cachep, ac->entry, tofree, node);
3543 ac->avail -= tofree;
3544 memmove(ac->entry, &(ac->entry[tofree]),
3545 sizeof(void *) * ac->avail);
3514 } 3546 }
3515 free_block(cachep, ac->entry, tofree, node); 3547 spin_unlock_irq(&l3->list_lock);
3516 ac->avail -= tofree;
3517 memmove(ac->entry, &(ac->entry[tofree]),
3518 sizeof(void *) * ac->avail);
3519 } 3548 }
3520} 3549}
3521 3550
@@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
3528 * - clear the per-cpu caches for this CPU. 3557 * - clear the per-cpu caches for this CPU.
3529 * - return freeable pages to the main free memory pool. 3558 * - return freeable pages to the main free memory pool.
3530 * 3559 *
3531 * If we cannot acquire the cache chain mutex then just give up - we'll 3560 * If we cannot acquire the cache chain mutex then just give up - we'll try
3532 * try again on the next iteration. 3561 * again on the next iteration.
3533 */ 3562 */
3534static void cache_reap(void *unused) 3563static void cache_reap(void *unused)
3535{ 3564{
3536 struct list_head *walk; 3565 struct list_head *walk;
3537 struct kmem_list3 *l3; 3566 struct kmem_list3 *l3;
3567 int node = numa_node_id();
3538 3568
3539 if (!mutex_trylock(&cache_chain_mutex)) { 3569 if (!mutex_trylock(&cache_chain_mutex)) {
3540 /* Give up. Setup the next iteration. */ 3570 /* Give up. Setup the next iteration. */
@@ -3550,65 +3580,72 @@ static void cache_reap(void *unused)
3550 struct slab *slabp; 3580 struct slab *slabp;
3551 3581
3552 searchp = list_entry(walk, struct kmem_cache, next); 3582 searchp = list_entry(walk, struct kmem_cache, next);
3553
3554 if (searchp->flags & SLAB_NO_REAP)
3555 goto next;
3556
3557 check_irq_on(); 3583 check_irq_on();
3558 3584
3559 l3 = searchp->nodelists[numa_node_id()]; 3585 /*
3586 * We only take the l3 lock if absolutely necessary and we
3587 * have established with reasonable certainty that
3588 * we can do some work if the lock was obtained.
3589 */
3590 l3 = searchp->nodelists[node];
3591
3560 reap_alien(searchp, l3); 3592 reap_alien(searchp, l3);
3561 spin_lock_irq(&l3->list_lock);
3562 3593
3563 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3594 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
3564 numa_node_id());
3565 3595
3596 /*
3597 * These are racy checks but it does not matter
3598 * if we skip one check or scan twice.
3599 */
3566 if (time_after(l3->next_reap, jiffies)) 3600 if (time_after(l3->next_reap, jiffies))
3567 goto next_unlock; 3601 goto next;
3568 3602
3569 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3603 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3570 3604
3571 if (l3->shared) 3605 drain_array(searchp, l3, l3->shared, 0, node);
3572 drain_array_locked(searchp, l3->shared, 0,
3573 numa_node_id());
3574 3606
3575 if (l3->free_touched) { 3607 if (l3->free_touched) {
3576 l3->free_touched = 0; 3608 l3->free_touched = 0;
3577 goto next_unlock; 3609 goto next;
3578 } 3610 }
3579 3611
3580 tofree = 3612 tofree = (l3->free_limit + 5 * searchp->num - 1) /
3581 (l3->free_limit + 5 * searchp->num - 3613 (5 * searchp->num);
3582 1) / (5 * searchp->num);
3583 do { 3614 do {
3615 /*
3616 * Do not lock if there are no free blocks.
3617 */
3618 if (list_empty(&l3->slabs_free))
3619 break;
3620
3621 spin_lock_irq(&l3->list_lock);
3584 p = l3->slabs_free.next; 3622 p = l3->slabs_free.next;
3585 if (p == &(l3->slabs_free)) 3623 if (p == &(l3->slabs_free)) {
3624 spin_unlock_irq(&l3->list_lock);
3586 break; 3625 break;
3626 }
3587 3627
3588 slabp = list_entry(p, struct slab, list); 3628 slabp = list_entry(p, struct slab, list);
3589 BUG_ON(slabp->inuse); 3629 BUG_ON(slabp->inuse);
3590 list_del(&slabp->list); 3630 list_del(&slabp->list);
3591 STATS_INC_REAPED(searchp); 3631 STATS_INC_REAPED(searchp);
3592 3632
3593 /* Safe to drop the lock. The slab is no longer 3633 /*
3594 * linked to the cache. 3634 * Safe to drop the lock. The slab is no longer linked
3595 * searchp cannot disappear, we hold 3635 * to the cache. searchp cannot disappear, we hold
3596 * cache_chain_lock 3636 * cache_chain_lock
3597 */ 3637 */
3598 l3->free_objects -= searchp->num; 3638 l3->free_objects -= searchp->num;
3599 spin_unlock_irq(&l3->list_lock); 3639 spin_unlock_irq(&l3->list_lock);
3600 slab_destroy(searchp, slabp); 3640 slab_destroy(searchp, slabp);
3601 spin_lock_irq(&l3->list_lock);
3602 } while (--tofree > 0); 3641 } while (--tofree > 0);
3603 next_unlock: 3642next:
3604 spin_unlock_irq(&l3->list_lock);
3605 next:
3606 cond_resched(); 3643 cond_resched();
3607 } 3644 }
3608 check_irq_on(); 3645 check_irq_on();
3609 mutex_unlock(&cache_chain_mutex); 3646 mutex_unlock(&cache_chain_mutex);
3610 next_reap_node(); 3647 next_reap_node();
3611 /* Setup the next iteration */ 3648 /* Set up the next iteration */
3612 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3649 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3613} 3650}
3614 3651
@@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3658{ 3695{
3659 struct kmem_cache *cachep = p; 3696 struct kmem_cache *cachep = p;
3660 ++*pos; 3697 ++*pos;
3661 return cachep->next.next == &cache_chain ? NULL 3698 return cachep->next.next == &cache_chain ?
3662 : list_entry(cachep->next.next, struct kmem_cache, next); 3699 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3663} 3700}
3664 3701
3665static void s_stop(struct seq_file *m, void *p) 3702static void s_stop(struct seq_file *m, void *p)
@@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p)
3681 int node; 3718 int node;
3682 struct kmem_list3 *l3; 3719 struct kmem_list3 *l3;
3683 3720
3684 spin_lock(&cachep->spinlock);
3685 active_objs = 0; 3721 active_objs = 0;
3686 num_slabs = 0; 3722 num_slabs = 0;
3687 for_each_online_node(node) { 3723 for_each_online_node(node) {
@@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p)
3748 unsigned long node_frees = cachep->node_frees; 3784 unsigned long node_frees = cachep->node_frees;
3749 3785
3750 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3786 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3751 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); 3787 %4lu %4lu %4lu %4lu", allocs, high, grown,
3788 reaped, errors, max_freeable, node_allocs,
3789 node_frees);
3752 } 3790 }
3753 /* cpu stats */ 3791 /* cpu stats */
3754 { 3792 {
@@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p)
3762 } 3800 }
3763#endif 3801#endif
3764 seq_putc(m, '\n'); 3802 seq_putc(m, '\n');
3765 spin_unlock(&cachep->spinlock);
3766 return 0; 3803 return 0;
3767} 3804}
3768 3805
@@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3820 mutex_lock(&cache_chain_mutex); 3857 mutex_lock(&cache_chain_mutex);
3821 res = -EINVAL; 3858 res = -EINVAL;
3822 list_for_each(p, &cache_chain) { 3859 list_for_each(p, &cache_chain) {
3823 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, 3860 struct kmem_cache *cachep;
3824 next);
3825 3861
3862 cachep = list_entry(p, struct kmem_cache, next);
3826 if (!strcmp(cachep->name, kbuf)) { 3863 if (!strcmp(cachep->name, kbuf)) {
3827 if (limit < 1 || 3864 if (limit < 1 || batchcount < 1 ||
3828 batchcount < 1 || 3865 batchcount > limit || shared < 0) {
3829 batchcount > limit || shared < 0) {
3830 res = 0; 3866 res = 0;
3831 } else { 3867 } else {
3832 res = do_tune_cpucache(cachep, limit, 3868 res = do_tune_cpucache(cachep, limit,
diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bd..91b7e2026f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
209 */ 209 */
210void fastcall __page_cache_release(struct page *page) 210void fastcall __page_cache_release(struct page *page)
211{ 211{
212 unsigned long flags; 212 if (PageLRU(page)) {
213 struct zone *zone = page_zone(page); 213 unsigned long flags;
214 struct zone *zone = page_zone(page);
214 215
215 spin_lock_irqsave(&zone->lru_lock, flags); 216 spin_lock_irqsave(&zone->lru_lock, flags);
216 if (TestClearPageLRU(page)) 217 BUG_ON(!PageLRU(page));
218 __ClearPageLRU(page);
217 del_page_from_lru(zone, page); 219 del_page_from_lru(zone, page);
218 if (page_count(page) != 0) 220 spin_unlock_irqrestore(&zone->lru_lock, flags);
219 page = NULL; 221 }
220 spin_unlock_irqrestore(&zone->lru_lock, flags); 222 free_hot_page(page);
221 if (page)
222 free_hot_page(page);
223} 223}
224
225EXPORT_SYMBOL(__page_cache_release); 224EXPORT_SYMBOL(__page_cache_release);
226 225
227/* 226/*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
245 pagevec_init(&pages_to_free, cold); 244 pagevec_init(&pages_to_free, cold);
246 for (i = 0; i < nr; i++) { 245 for (i = 0; i < nr; i++) {
247 struct page *page = pages[i]; 246 struct page *page = pages[i];
248 struct zone *pagezone;
249 247
250 if (unlikely(PageCompound(page))) { 248 if (unlikely(PageCompound(page))) {
251 if (zone) { 249 if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
259 if (!put_page_testzero(page)) 257 if (!put_page_testzero(page))
260 continue; 258 continue;
261 259
262 pagezone = page_zone(page); 260 if (PageLRU(page)) {
263 if (pagezone != zone) { 261 struct zone *pagezone = page_zone(page);
264 if (zone) 262 if (pagezone != zone) {
265 spin_unlock_irq(&zone->lru_lock); 263 if (zone)
266 zone = pagezone; 264 spin_unlock_irq(&zone->lru_lock);
267 spin_lock_irq(&zone->lru_lock); 265 zone = pagezone;
268 } 266 spin_lock_irq(&zone->lru_lock);
269 if (TestClearPageLRU(page)) 267 }
268 BUG_ON(!PageLRU(page));
269 __ClearPageLRU(page);
270 del_page_from_lru(zone, page); 270 del_page_from_lru(zone, page);
271 if (page_count(page) == 0) { 271 }
272 if (!pagevec_add(&pages_to_free, page)) { 272
273 if (!pagevec_add(&pages_to_free, page)) {
274 if (zone) {
273 spin_unlock_irq(&zone->lru_lock); 275 spin_unlock_irq(&zone->lru_lock);
274 __pagevec_free(&pages_to_free); 276 zone = NULL;
275 pagevec_reinit(&pages_to_free);
276 zone = NULL; /* No lock is held */
277 } 277 }
278 } 278 __pagevec_free(&pages_to_free);
279 pagevec_reinit(&pages_to_free);
280 }
279 } 281 }
280 if (zone) 282 if (zone)
281 spin_unlock_irq(&zone->lru_lock); 283 spin_unlock_irq(&zone->lru_lock);
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
343 zone = pagezone; 345 zone = pagezone;
344 spin_lock_irq(&zone->lru_lock); 346 spin_lock_irq(&zone->lru_lock);
345 } 347 }
346 if (TestSetPageLRU(page)) 348 BUG_ON(PageLRU(page));
347 BUG(); 349 SetPageLRU(page);
348 add_page_to_inactive_list(zone, page); 350 add_page_to_inactive_list(zone, page);
349 } 351 }
350 if (zone) 352 if (zone)
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
370 zone = pagezone; 372 zone = pagezone;
371 spin_lock_irq(&zone->lru_lock); 373 spin_lock_irq(&zone->lru_lock);
372 } 374 }
373 if (TestSetPageLRU(page)) 375 BUG_ON(PageLRU(page));
374 BUG(); 376 SetPageLRU(page);
375 if (TestSetPageActive(page)) 377 BUG_ON(PageActive(page));
376 BUG(); 378 SetPageActive(page);
377 add_page_to_active_list(zone, page); 379 add_page_to_active_list(zone, page);
378 } 380 }
379 if (zone) 381 if (zone)
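
The common thread in the mm/swap.c hunks above: PG_lru is only ever set or cleared while zone->lru_lock is held, so the atomic TestSetPageLRU/TestClearPageLRU operations can become a BUG_ON() sanity check plus the cheaper non-atomic bit operations. A sketch of the resulting idiom, essentially mirroring the new __page_cache_release():

#include <linux/mm.h>
#include <linux/mm_inline.h>

static void take_page_off_lru(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;

	spin_lock_irqsave(&zone->lru_lock, flags);
	BUG_ON(!PageLRU(page));		/* caller must know the page is on an LRU */
	__ClearPageLRU(page);		/* non-atomic is safe under lru_lock */
	del_page_from_lru(zone, page);
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}
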
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e16..d7af296833 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/migrate.h>
18 19
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20 21
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073..365ed6ff18 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
116 last_in_cluster = offset + SWAPFILE_CLUSTER; 116 last_in_cluster = offset + SWAPFILE_CLUSTER;
117 else if (offset == last_in_cluster) { 117 else if (offset == last_in_cluster) {
118 spin_lock(&swap_lock); 118 spin_lock(&swap_lock);
119 si->cluster_next = offset-SWAPFILE_CLUSTER-1; 119 si->cluster_next = offset-SWAPFILE_CLUSTER+1;
120 goto cluster; 120 goto cluster;
121 } 121 }
122 if (unlikely(--latency_ration < 0)) { 122 if (unlikely(--latency_ration < 0)) {
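
A worked example of the one-character swapfile fix, assuming the usual reading of this scan: last_in_cluster is bumped to "used slot + SWAPFILE_CLUSTER" whenever an allocated slot is seen, so reaching offset == last_in_cluster means the SWAPFILE_CLUSTER slots ending at offset are all free.

	SWAPFILE_CLUSTER = 256, last used slot = 500
	=> last_in_cluster = 500 + 256 = 756, free run = slots 501..756
	old: cluster_next = 756 - 256 - 1 = 499	(below the run; slot 500 is in use)
	new: cluster_next = 756 - 256 + 1 = 501	(first slot of the free run)
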
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02..fd572bbdc9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/notifier.h> 34#include <linux/notifier.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/delay.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/div64.h> 39#include <asm/div64.h>
39 40
40#include <linux/swapops.h> 41#include <linux/swapops.h>
41 42
42/* possible outcome of pageout() */ 43#include "internal.h"
43typedef enum {
44 /* failed to write page out, page is locked */
45 PAGE_KEEP,
46 /* move page to the active list, page is locked */
47 PAGE_ACTIVATE,
48 /* page has been sent to the disk successfully, page is unlocked */
49 PAGE_SUCCESS,
50 /* page is clean and locked */
51 PAGE_CLEAN,
52} pageout_t;
53 44
54struct scan_control { 45struct scan_control {
55 /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
56 unsigned long nr_to_scan;
57
58 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned; 47 unsigned long nr_scanned;
60 48
61 /* Incremented by the number of pages reclaimed */
62 unsigned long nr_reclaimed;
63
64 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
65 50
66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
67 unsigned int priority;
68
69 /* This context's GFP mask */ 51 /* This context's GFP mask */
70 gfp_t gfp_mask; 52 gfp_t gfp_mask;
71 53
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
183 * 165 *
184 * Returns the number of slab objects which we shrunk. 166 * Returns the number of slab objects which we shrunk.
185 */ 167 */
186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) 168unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
169 unsigned long lru_pages)
187{ 170{
188 struct shrinker *shrinker; 171 struct shrinker *shrinker;
189 int ret = 0; 172 unsigned long ret = 0;
190 173
191 if (scanned == 0) 174 if (scanned == 0)
192 scanned = SWAP_CLUSTER_MAX; 175 scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
306} 289}
307 290
308/* 291/*
309 * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). 292 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage().
310 */ 294 */
311static pageout_t pageout(struct page *page, struct address_space *mapping) 295pageout_t pageout(struct page *page, struct address_space *mapping)
312{ 296{
313 /* 297 /*
314 * If the page is dirty, only perform writeback if that write 298 * If the page is dirty, only perform writeback if that write
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
376 return PAGE_CLEAN; 360 return PAGE_CLEAN;
377} 361}
378 362
379static int remove_mapping(struct address_space *mapping, struct page *page) 363int remove_mapping(struct address_space *mapping, struct page *page)
380{ 364{
381 if (!mapping) 365 if (!mapping)
382 return 0; /* truncate got there first */ 366 return 0; /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
414} 398}
415 399
416/* 400/*
417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 401 * shrink_page_list() returns the number of reclaimed pages
418 */ 402 */
419static int shrink_list(struct list_head *page_list, struct scan_control *sc) 403static unsigned long shrink_page_list(struct list_head *page_list,
404 struct scan_control *sc)
420{ 405{
421 LIST_HEAD(ret_pages); 406 LIST_HEAD(ret_pages);
422 struct pagevec freed_pvec; 407 struct pagevec freed_pvec;
423 int pgactivate = 0; 408 int pgactivate = 0;
424 int reclaimed = 0; 409 unsigned long nr_reclaimed = 0;
425 410
426 cond_resched(); 411 cond_resched();
427 412
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
464 * Anonymous process memory has backing store? 449 * Anonymous process memory has backing store?
465 * Try to allocate it some swap space here. 450 * Try to allocate it some swap space here.
466 */ 451 */
467 if (PageAnon(page) && !PageSwapCache(page)) { 452 if (PageAnon(page) && !PageSwapCache(page))
468 if (!sc->may_swap)
469 goto keep_locked;
470 if (!add_to_swap(page, GFP_ATOMIC)) 453 if (!add_to_swap(page, GFP_ATOMIC))
471 goto activate_locked; 454 goto activate_locked;
472 }
473#endif /* CONFIG_SWAP */ 455#endif /* CONFIG_SWAP */
474 456
475 mapping = page_mapping(page); 457 mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
481 * processes. Try to unmap it here. 463 * processes. Try to unmap it here.
482 */ 464 */
483 if (page_mapped(page) && mapping) { 465 if (page_mapped(page) && mapping) {
484 /*
485 * No unmapping if we do not swap
486 */
487 if (!sc->may_swap)
488 goto keep_locked;
489
490 switch (try_to_unmap(page, 0)) { 466 switch (try_to_unmap(page, 0)) {
491 case SWAP_FAIL: 467 case SWAP_FAIL:
492 goto activate_locked; 468 goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
561 537
562free_it: 538free_it:
563 unlock_page(page); 539 unlock_page(page);
564 reclaimed++; 540 nr_reclaimed++;
565 if (!pagevec_add(&freed_pvec, page)) 541 if (!pagevec_add(&freed_pvec, page))
566 __pagevec_release_nonlru(&freed_pvec); 542 __pagevec_release_nonlru(&freed_pvec);
567 continue; 543 continue;
@@ -579,483 +555,8 @@ keep:
579 if (pagevec_count(&freed_pvec)) 555 if (pagevec_count(&freed_pvec))
580 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
581 mod_page_state(pgactivate, pgactivate); 557 mod_page_state(pgactivate, pgactivate);
582 sc->nr_reclaimed += reclaimed; 558 return nr_reclaimed;
583 return reclaimed;
584}
585
586#ifdef CONFIG_MIGRATION
587static inline void move_to_lru(struct page *page)
588{
589 list_del(&page->lru);
590 if (PageActive(page)) {
591 /*
592 * lru_cache_add_active checks that
593 * the PG_active bit is off.
594 */
595 ClearPageActive(page);
596 lru_cache_add_active(page);
597 } else {
598 lru_cache_add(page);
599 }
600 put_page(page);
601}
602
603/*
604 * Add isolated pages on the list back to the LRU.
605 *
606 * returns the number of pages put back.
607 */
608int putback_lru_pages(struct list_head *l)
609{
610 struct page *page;
611 struct page *page2;
612 int count = 0;
613
614 list_for_each_entry_safe(page, page2, l, lru) {
615 move_to_lru(page);
616 count++;
617 }
618 return count;
619}
620
621/*
622 * Non migratable page
623 */
624int fail_migrate_page(struct page *newpage, struct page *page)
625{
626 return -EIO;
627}
628EXPORT_SYMBOL(fail_migrate_page);
629
630/*
631 * swapout a single page
632 * page is locked upon entry, unlocked on exit
633 */
634static int swap_page(struct page *page)
635{
636 struct address_space *mapping = page_mapping(page);
637
638 if (page_mapped(page) && mapping)
639 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
640 goto unlock_retry;
641
642 if (PageDirty(page)) {
643 /* Page is dirty, try to write it out here */
644 switch(pageout(page, mapping)) {
645 case PAGE_KEEP:
646 case PAGE_ACTIVATE:
647 goto unlock_retry;
648
649 case PAGE_SUCCESS:
650 goto retry;
651
652 case PAGE_CLEAN:
653 ; /* try to free the page below */
654 }
655 }
656
657 if (PagePrivate(page)) {
658 if (!try_to_release_page(page, GFP_KERNEL) ||
659 (!mapping && page_count(page) == 1))
660 goto unlock_retry;
661 }
662
663 if (remove_mapping(mapping, page)) {
664 /* Success */
665 unlock_page(page);
666 return 0;
667 }
668
669unlock_retry:
670 unlock_page(page);
671
672retry:
673 return -EAGAIN;
674}
675EXPORT_SYMBOL(swap_page);
676
677/*
678 * Page migration was first developed in the context of the memory hotplug
679 * project. The main authors of the migration code are:
680 *
681 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
682 * Hirokazu Takahashi <taka@valinux.co.jp>
683 * Dave Hansen <haveblue@us.ibm.com>
684 * Christoph Lameter <clameter@sgi.com>
685 */
686
687/*
688 * Remove references for a page and establish the new page with the correct
689 * basic settings to be able to stop accesses to the page.
690 */
691int migrate_page_remove_references(struct page *newpage,
692 struct page *page, int nr_refs)
693{
694 struct address_space *mapping = page_mapping(page);
695 struct page **radix_pointer;
696
697 /*
698 * Avoid doing any of the following work if the page count
699 * indicates that the page is in use or truncate has removed
700 * the page.
701 */
702 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
703 return -EAGAIN;
704
705 /*
706 * Establish swap ptes for anonymous pages or destroy pte
707 * maps for files.
708 *
709 * In order to reestablish file backed mappings the fault handlers
710 * will take the radix tree_lock which may then be used to stop
711 * processes from accessing this page until the new page is ready.
712 *
713 * A process accessing via a swap pte (an anonymous page) will take a
714 * page_lock on the old page which will block the process until the
715 * migration attempt is complete. At that time the PageSwapCache bit
716 * will be examined. If the page was migrated then the PageSwapCache
717 * bit will be clear and the operation to retrieve the page will be
718 * retried which will find the new page in the radix tree. Then a new
719 * direct mapping may be generated based on the radix tree contents.
720 *
721 * If the page was not migrated then the PageSwapCache bit
722 * is still set and the operation may continue.
723 */
724 if (try_to_unmap(page, 1) == SWAP_FAIL)
725 /* A vma has VM_LOCKED set -> Permanent failure */
726 return -EPERM;
727
728 /*
729 * Give up if we were unable to remove all mappings.
730 */
731 if (page_mapcount(page))
732 return -EAGAIN;
733
734 write_lock_irq(&mapping->tree_lock);
735
736 radix_pointer = (struct page **)radix_tree_lookup_slot(
737 &mapping->page_tree,
738 page_index(page));
739
740 if (!page_mapping(page) || page_count(page) != nr_refs ||
741 *radix_pointer != page) {
742 write_unlock_irq(&mapping->tree_lock);
743 return -EAGAIN;
744 }
745
746 /*
747 * Now we know that no one else is looking at the page.
748 *
749 * Certain minimal information about a page must be available
750 * in order for other subsystems to properly handle the page if they
751 * find it through the radix tree update before we are finished
752 * copying the page.
753 */
754 get_page(newpage);
755 newpage->index = page->index;
756 newpage->mapping = page->mapping;
757 if (PageSwapCache(page)) {
758 SetPageSwapCache(newpage);
759 set_page_private(newpage, page_private(page));
760 }
761
762 *radix_pointer = newpage;
763 __put_page(page);
764 write_unlock_irq(&mapping->tree_lock);
765
766 return 0;
767}
768EXPORT_SYMBOL(migrate_page_remove_references);
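
The function above only replaces the radix-tree slot after the reference count proves that nobody else can still reach the old page; otherwise it backs off with -EAGAIN. A tiny userspace model of that "check the expected count, then swap the slot" step, using a plain array in place of the radix tree (illustrative only, names invented):

    #include <stdio.h>

    struct fake_page { int count; int index; };

    /* replace slot[index] with newpage only if oldpage holds exactly the
     * expected number of references; otherwise report -EAGAIN (-11) */
    static int replace_slot(struct fake_page **slot, int index,
                            struct fake_page *oldpage, struct fake_page *newpage,
                            int expected_refs)
    {
        if (slot[index] != oldpage || oldpage->count != expected_refs)
            return -11;                /* someone still uses it: try again later */

        newpage->count++;              /* the slot now pins the new page */
        newpage->index = oldpage->index;
        slot[index] = newpage;
        oldpage->count--;              /* drop the slot's reference to the old page */
        return 0;
    }

    int main(void)
    {
        struct fake_page oldpage = { .count = 2, .index = 7 };  /* slot + caller */
        struct fake_page newpage = { .count = 1, .index = 0 };  /* caller only */
        struct fake_page *tree[16] = { [7] = &oldpage };

        printf("rc=%d, slot now holds %s\n",
               replace_slot(tree, 7, &oldpage, &newpage, 2),
               tree[7] == &newpage ? "newpage" : "oldpage");
        return 0;
    }
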
769
770/*
771 * Copy the page to its new location
772 */
773void migrate_page_copy(struct page *newpage, struct page *page)
774{
775 copy_highpage(newpage, page);
776
777 if (PageError(page))
778 SetPageError(newpage);
779 if (PageReferenced(page))
780 SetPageReferenced(newpage);
781 if (PageUptodate(page))
782 SetPageUptodate(newpage);
783 if (PageActive(page))
784 SetPageActive(newpage);
785 if (PageChecked(page))
786 SetPageChecked(newpage);
787 if (PageMappedToDisk(page))
788 SetPageMappedToDisk(newpage);
789
790 if (PageDirty(page)) {
791 clear_page_dirty_for_io(page);
792 set_page_dirty(newpage);
793 }
794
795 ClearPageSwapCache(page);
796 ClearPageActive(page);
797 ClearPagePrivate(page);
798 set_page_private(page, 0);
799 page->mapping = NULL;
800
801 /*
802 * If any waiters have accumulated on the new page then
803 * wake them up.
804 */
805 if (PageWriteback(newpage))
806 end_page_writeback(newpage);
807}
808EXPORT_SYMBOL(migrate_page_copy);
809
810/*
811 * Common logic to directly migrate a single page suitable for
812 * pages that do not use PagePrivate.
813 *
814 * Pages are locked upon entry and exit.
815 */
816int migrate_page(struct page *newpage, struct page *page)
817{
818 int rc;
819
820 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
821
822 rc = migrate_page_remove_references(newpage, page, 2);
823
824 if (rc)
825 return rc;
826
827 migrate_page_copy(newpage, page);
828
829 /*
830 * Remove auxiliary swap entries and replace
831 * them with real ptes.
832 *
833 * Note that a real pte entry will allow processes that are not
834 * waiting on the page lock to use the new page via the page tables
835 * before the new page is unlocked.
836 */
837 remove_from_swap(newpage);
838 return 0;
839} 559}
840EXPORT_SYMBOL(migrate_page);
841
842/*
843 * migrate_pages
844 *
845 * Two lists are passed to this function. The first list
846 * contains the pages isolated from the LRU to be migrated.
847 * The second list contains new pages that the pages isolated
848 * can be moved to. If the second list is NULL then all
849 * pages are swapped out.
850 *
851 * The function returns after 10 attempts or if no pages
852 * are movable anymore because the "to" list has become empty
853 * or no retryable pages exist anymore.
854 *
855 * Return: Number of pages not migrated when "to" ran empty.
856 */
857int migrate_pages(struct list_head *from, struct list_head *to,
858 struct list_head *moved, struct list_head *failed)
859{
860 int retry;
861 int nr_failed = 0;
862 int pass = 0;
863 struct page *page;
864 struct page *page2;
865 int swapwrite = current->flags & PF_SWAPWRITE;
866 int rc;
867
868 if (!swapwrite)
869 current->flags |= PF_SWAPWRITE;
870
871redo:
872 retry = 0;
873
874 list_for_each_entry_safe(page, page2, from, lru) {
875 struct page *newpage = NULL;
876 struct address_space *mapping;
877
878 cond_resched();
879
880 rc = 0;
881 if (page_count(page) == 1)
882 /* page was freed from under us. So we are done. */
883 goto next;
884
885 if (to && list_empty(to))
886 break;
887
888 /*
889 * Skip locked pages during the first two passes to give the
890 * functions holding the lock time to release the page. Later we
891 * use lock_page() to have a higher chance of acquiring the
892 * lock.
893 */
894 rc = -EAGAIN;
895 if (pass > 2)
896 lock_page(page);
897 else
898 if (TestSetPageLocked(page))
899 goto next;
900
901 /*
902 * Only wait on writeback if we have already done a pass where
903 * we may have triggered writeouts for lots of pages.
904 */
905 if (pass > 0) {
906 wait_on_page_writeback(page);
907 } else {
908 if (PageWriteback(page))
909 goto unlock_page;
910 }
911
912 /*
913 * Anonymous pages must have swap cache references otherwise
914 * the information contained in the page maps cannot be
915 * preserved.
916 */
917 if (PageAnon(page) && !PageSwapCache(page)) {
918 if (!add_to_swap(page, GFP_KERNEL)) {
919 rc = -ENOMEM;
920 goto unlock_page;
921 }
922 }
923
924 if (!to) {
925 rc = swap_page(page);
926 goto next;
927 }
928
929 newpage = lru_to_page(to);
930 lock_page(newpage);
931
932 /*
933 * Pages are properly locked and writeback is complete.
934 * Try to migrate the page.
935 */
936 mapping = page_mapping(page);
937 if (!mapping)
938 goto unlock_both;
939
940 if (mapping->a_ops->migratepage) {
941 /*
942 * Most pages have a mapping and most filesystems
943 * should provide a migration function. Anonymous
944 * pages are part of swap space which also has its
945 * own migration function. This is the most common
946 * path for page migration.
947 */
948 rc = mapping->a_ops->migratepage(newpage, page);
949 goto unlock_both;
950 }
951
952 /*
953 * Default handling if a filesystem does not provide
954 * a migration function. We can only migrate clean
955 * pages so try to write out any dirty pages first.
956 */
957 if (PageDirty(page)) {
958 switch (pageout(page, mapping)) {
959 case PAGE_KEEP:
960 case PAGE_ACTIVATE:
961 goto unlock_both;
962
963 case PAGE_SUCCESS:
964 unlock_page(newpage);
965 goto next;
966
967 case PAGE_CLEAN:
968 ; /* try to migrate the page below */
969 }
970 }
971
972 /*
973 * Buffers are managed in a filesystem specific way.
974 * We must have no buffers or drop them.
975 */
976 if (!page_has_buffers(page) ||
977 try_to_release_page(page, GFP_KERNEL)) {
978 rc = migrate_page(newpage, page);
979 goto unlock_both;
980 }
981
982 /*
983 * On early passes with mapped pages simply
984 * retry. There may be a lock held for some
985 * buffers that may go away. Later
986 * swap them out.
987 */
988 if (pass > 4) {
989 /*
990 * Persistently unable to drop buffers..... As a
991 * measure of last resort we fall back to
992 * swap_page().
993 */
994 unlock_page(newpage);
995 newpage = NULL;
996 rc = swap_page(page);
997 goto next;
998 }
999
1000unlock_both:
1001 unlock_page(newpage);
1002
1003unlock_page:
1004 unlock_page(page);
1005
1006next:
1007 if (rc == -EAGAIN) {
1008 retry++;
1009 } else if (rc) {
1010 /* Permanent failure */
1011 list_move(&page->lru, failed);
1012 nr_failed++;
1013 } else {
1014 if (newpage) {
1015 /* Successful migration. Return page to LRU */
1016 move_to_lru(newpage);
1017 }
1018 list_move(&page->lru, moved);
1019 }
1020 }
1021 if (retry && pass++ < 10)
1022 goto redo;
1023
1024 if (!swapwrite)
1025 current->flags &= ~PF_SWAPWRITE;
1026
1027 return nr_failed + retry;
1028}
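
migrate_pages() above retries transient failures (-EAGAIN) for up to ten passes and grows more patient as the passes go by: it blocks on the page lock only after pass 2, waits on writeback only after pass 0, and falls back to swap_page() for stubborn buffers after pass 4. A self-contained userspace model of that pass structure, with invented page states:

    #include <stdio.h>

    #define NPAGES 4
    #define EAGAIN 11
    #define EIO    5

    /* pretend page i succeeds once the pass number reaches readiness[i];
     * a negative readiness marks a permanently unmigratable page */
    static int try_one(int readiness, int pass)
    {
        if (readiness < 0)
            return -EIO;                        /* permanent failure */
        return pass >= readiness ? 0 : -EAGAIN; /* transient failure until ready */
    }

    int main(void)
    {
        int readiness[NPAGES] = { 0, 3, 6, -1 };
        int done[NPAGES] = { 0 };
        int pass = 0, retry, nr_failed = 0;

        do {
            retry = 0;
            for (int i = 0; i < NPAGES; i++) {
                if (done[i])
                    continue;
                int rc = try_one(readiness[i], pass);
                if (rc == -EAGAIN)
                    retry++;                     /* try again on the next pass */
                else if (rc) {
                    done[i] = 1;                 /* permanent failure: give up */
                    nr_failed++;
                } else
                    done[i] = 1;                 /* migrated */
            }
        } while (retry && pass++ < 10);

        printf("failed=%d still-retrying=%d after pass %d\n",
               nr_failed, retry, pass);
        return 0;
    }
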
1029
1030/*
1031 * Isolate one page from the LRU lists and put it on the
1032 * indicated list with elevated refcount.
1033 *
1034 * Result:
1035 * 0 = page not on LRU list
1036 * 1 = page removed from LRU list and added to the specified list.
1037 */
1038int isolate_lru_page(struct page *page)
1039{
1040 int ret = 0;
1041
1042 if (PageLRU(page)) {
1043 struct zone *zone = page_zone(page);
1044 spin_lock_irq(&zone->lru_lock);
1045 if (TestClearPageLRU(page)) {
1046 ret = 1;
1047 get_page(page);
1048 if (PageActive(page))
1049 del_page_from_active_list(zone, page);
1050 else
1051 del_page_from_inactive_list(zone, page);
1052 }
1053 spin_unlock_irq(&zone->lru_lock);
1054 }
1055
1056 return ret;
1057}
1058#endif
1059 560
1060/* 561/*
1061 * zone->lru_lock is heavily contended. Some of the functions that 562 * zone->lru_lock is heavily contended. Some of the functions that
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
1074 * 575 *
1075 * returns how many pages were moved onto *@dst. 576 * returns how many pages were moved onto *@dst.
1076 */ 577 */
1077static int isolate_lru_pages(int nr_to_scan, struct list_head *src, 578static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1078 struct list_head *dst, int *scanned) 579 struct list_head *src, struct list_head *dst,
580 unsigned long *scanned)
1079{ 581{
1080 int nr_taken = 0; 582 unsigned long nr_taken = 0;
1081 struct page *page; 583 struct page *page;
1082 int scan = 0; 584 unsigned long scan;
1083 585
1084 while (scan++ < nr_to_scan && !list_empty(src)) { 586 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
587 struct list_head *target;
1085 page = lru_to_page(src); 588 page = lru_to_page(src);
1086 prefetchw_prev_lru_page(page, src, flags); 589 prefetchw_prev_lru_page(page, src, flags);
1087 590
1088 if (!TestClearPageLRU(page)) 591 BUG_ON(!PageLRU(page));
1089 BUG(); 592
1090 list_del(&page->lru); 593 list_del(&page->lru);
1091 if (get_page_testone(page)) { 594 target = src;
595 if (likely(get_page_unless_zero(page))) {
1092 /* 596 /*
1093 * It is being freed elsewhere 597 * Be careful not to clear PageLRU until after we're
598 * sure the page is not being freed elsewhere -- the
599 * page release code relies on it.
1094 */ 600 */
1095 __put_page(page); 601 ClearPageLRU(page);
1096 SetPageLRU(page); 602 target = dst;
1097 list_add(&page->lru, src);
1098 continue;
1099 } else {
1100 list_add(&page->lru, dst);
1101 nr_taken++; 603 nr_taken++;
1102 } 604 } /* else it is being freed elsewhere */
605
606 list_add(&page->lru, target);
1103 } 607 }
1104 608
1105 *scanned = scan; 609 *scanned = scan;
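
The rewritten isolate_lru_pages() only clears PageLRU after get_page_unless_zero() has succeeded, because the page-release path relies on the LRU bit while dropping the last reference. A small C11 sketch of the "take a reference only if the count is not already zero" primitive; this is a userspace model, not the kernel's implementation:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* increment *count unless it is already zero; says whether we got a ref */
    static bool get_ref_unless_zero(atomic_int *count)
    {
        int old = atomic_load(count);
        while (old != 0) {
            if (atomic_compare_exchange_weak(count, &old, old + 1))
                return true;        /* we now hold a reference */
            /* old was reloaded by the failed CAS; retry */
        }
        return false;               /* the page is being freed elsewhere */
    }

    int main(void)
    {
        atomic_int live = 2, dying = 0;

        printf("live:  %d (count now %d)\n",
               get_ref_unless_zero(&live), atomic_load(&live));
        printf("dying: %d (count now %d)\n",
               get_ref_unless_zero(&dying), atomic_load(&dying));
        return 0;
    }
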
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
1107} 611}
1108 612
1109/* 613/*
1110 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed 614 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
615 * of reclaimed pages
1111 */ 616 */
1112static void shrink_cache(struct zone *zone, struct scan_control *sc) 617static unsigned long shrink_inactive_list(unsigned long max_scan,
618 struct zone *zone, struct scan_control *sc)
1113{ 619{
1114 LIST_HEAD(page_list); 620 LIST_HEAD(page_list);
1115 struct pagevec pvec; 621 struct pagevec pvec;
1116 int max_scan = sc->nr_to_scan; 622 unsigned long nr_scanned = 0;
623 unsigned long nr_reclaimed = 0;
1117 624
1118 pagevec_init(&pvec, 1); 625 pagevec_init(&pvec, 1);
1119 626
1120 lru_add_drain(); 627 lru_add_drain();
1121 spin_lock_irq(&zone->lru_lock); 628 spin_lock_irq(&zone->lru_lock);
1122 while (max_scan > 0) { 629 do {
1123 struct page *page; 630 struct page *page;
1124 int nr_taken; 631 unsigned long nr_taken;
1125 int nr_scan; 632 unsigned long nr_scan;
1126 int nr_freed; 633 unsigned long nr_freed;
1127 634
1128 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 635 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
1129 &zone->inactive_list, 636 &zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1132 zone->pages_scanned += nr_scan; 639 zone->pages_scanned += nr_scan;
1133 spin_unlock_irq(&zone->lru_lock); 640 spin_unlock_irq(&zone->lru_lock);
1134 641
1135 if (nr_taken == 0) 642 nr_scanned += nr_scan;
1136 goto done; 643 nr_freed = shrink_page_list(&page_list, sc);
1137 644 nr_reclaimed += nr_freed;
1138 max_scan -= nr_scan;
1139 nr_freed = shrink_list(&page_list, sc);
1140
1141 local_irq_disable(); 645 local_irq_disable();
1142 if (current_is_kswapd()) { 646 if (current_is_kswapd()) {
1143 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 647 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1146 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 650 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
1147 __mod_page_state_zone(zone, pgsteal, nr_freed); 651 __mod_page_state_zone(zone, pgsteal, nr_freed);
1148 652
653 if (nr_taken == 0)
654 goto done;
655
1149 spin_lock(&zone->lru_lock); 656 spin_lock(&zone->lru_lock);
1150 /* 657 /*
1151 * Put back any unfreeable pages. 658 * Put back any unfreeable pages.
1152 */ 659 */
1153 while (!list_empty(&page_list)) { 660 while (!list_empty(&page_list)) {
1154 page = lru_to_page(&page_list); 661 page = lru_to_page(&page_list);
1155 if (TestSetPageLRU(page)) 662 BUG_ON(PageLRU(page));
1156 BUG(); 663 SetPageLRU(page);
1157 list_del(&page->lru); 664 list_del(&page->lru);
1158 if (PageActive(page)) 665 if (PageActive(page))
1159 add_page_to_active_list(zone, page); 666 add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1165 spin_lock_irq(&zone->lru_lock); 672 spin_lock_irq(&zone->lru_lock);
1166 } 673 }
1167 } 674 }
1168 } 675 } while (nr_scanned < max_scan);
1169 spin_unlock_irq(&zone->lru_lock); 676 spin_unlock(&zone->lru_lock);
1170done: 677done:
678 local_irq_enable();
1171 pagevec_release(&pvec); 679 pagevec_release(&pvec);
680 return nr_reclaimed;
1172} 681}
1173 682
1174/* 683/*
@@ -1188,13 +697,12 @@ done:
1188 * The downside is that we have to touch page->_count against each page. 697 * The downside is that we have to touch page->_count against each page.
1189 * But we had to alter page->flags anyway. 698 * But we had to alter page->flags anyway.
1190 */ 699 */
1191static void 700static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1192refill_inactive_zone(struct zone *zone, struct scan_control *sc) 701 struct scan_control *sc)
1193{ 702{
1194 int pgmoved; 703 unsigned long pgmoved;
1195 int pgdeactivate = 0; 704 int pgdeactivate = 0;
1196 int pgscanned; 705 unsigned long pgscanned;
1197 int nr_pages = sc->nr_to_scan;
1198 LIST_HEAD(l_hold); /* The pages which were snipped off */ 706 LIST_HEAD(l_hold); /* The pages which were snipped off */
1199 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 707 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
1200 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 708 LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1202 struct pagevec pvec; 710 struct pagevec pvec;
1203 int reclaim_mapped = 0; 711 int reclaim_mapped = 0;
1204 712
1205 if (unlikely(sc->may_swap)) { 713 if (sc->may_swap) {
1206 long mapped_ratio; 714 long mapped_ratio;
1207 long distress; 715 long distress;
1208 long swap_tendency; 716 long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1272 while (!list_empty(&l_inactive)) { 780 while (!list_empty(&l_inactive)) {
1273 page = lru_to_page(&l_inactive); 781 page = lru_to_page(&l_inactive);
1274 prefetchw_prev_lru_page(page, &l_inactive, flags); 782 prefetchw_prev_lru_page(page, &l_inactive, flags);
1275 if (TestSetPageLRU(page)) 783 BUG_ON(PageLRU(page));
1276 BUG(); 784 SetPageLRU(page);
1277 if (!TestClearPageActive(page)) 785 BUG_ON(!PageActive(page));
1278 BUG(); 786 ClearPageActive(page);
787
1279 list_move(&page->lru, &zone->inactive_list); 788 list_move(&page->lru, &zone->inactive_list);
1280 pgmoved++; 789 pgmoved++;
1281 if (!pagevec_add(&pvec, page)) { 790 if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1301 while (!list_empty(&l_active)) { 810 while (!list_empty(&l_active)) {
1302 page = lru_to_page(&l_active); 811 page = lru_to_page(&l_active);
1303 prefetchw_prev_lru_page(page, &l_active, flags); 812 prefetchw_prev_lru_page(page, &l_active, flags);
1304 if (TestSetPageLRU(page)) 813 BUG_ON(PageLRU(page));
1305 BUG(); 814 SetPageLRU(page);
1306 BUG_ON(!PageActive(page)); 815 BUG_ON(!PageActive(page));
1307 list_move(&page->lru, &zone->active_list); 816 list_move(&page->lru, &zone->active_list);
1308 pgmoved++; 817 pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1327/* 836/*
1328 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 837 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1329 */ 838 */
1330static void 839static unsigned long shrink_zone(int priority, struct zone *zone,
1331shrink_zone(struct zone *zone, struct scan_control *sc) 840 struct scan_control *sc)
1332{ 841{
1333 unsigned long nr_active; 842 unsigned long nr_active;
1334 unsigned long nr_inactive; 843 unsigned long nr_inactive;
844 unsigned long nr_to_scan;
845 unsigned long nr_reclaimed = 0;
1335 846
1336 atomic_inc(&zone->reclaim_in_progress); 847 atomic_inc(&zone->reclaim_in_progress);
1337 848
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1339 * Add one to `nr_to_scan' just to make sure that the kernel will 850 * Add one to `nr_to_scan' just to make sure that the kernel will
1340 * slowly sift through the active list. 851 * slowly sift through the active list.
1341 */ 852 */
1342 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; 853 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1343 nr_active = zone->nr_scan_active; 854 nr_active = zone->nr_scan_active;
1344 if (nr_active >= sc->swap_cluster_max) 855 if (nr_active >= sc->swap_cluster_max)
1345 zone->nr_scan_active = 0; 856 zone->nr_scan_active = 0;
1346 else 857 else
1347 nr_active = 0; 858 nr_active = 0;
1348 859
1349 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; 860 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1350 nr_inactive = zone->nr_scan_inactive; 861 nr_inactive = zone->nr_scan_inactive;
1351 if (nr_inactive >= sc->swap_cluster_max) 862 if (nr_inactive >= sc->swap_cluster_max)
1352 zone->nr_scan_inactive = 0; 863 zone->nr_scan_inactive = 0;
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1355 866
1356 while (nr_active || nr_inactive) { 867 while (nr_active || nr_inactive) {
1357 if (nr_active) { 868 if (nr_active) {
1358 sc->nr_to_scan = min(nr_active, 869 nr_to_scan = min(nr_active,
1359 (unsigned long)sc->swap_cluster_max); 870 (unsigned long)sc->swap_cluster_max);
1360 nr_active -= sc->nr_to_scan; 871 nr_active -= nr_to_scan;
1361 refill_inactive_zone(zone, sc); 872 shrink_active_list(nr_to_scan, zone, sc);
1362 } 873 }
1363 874
1364 if (nr_inactive) { 875 if (nr_inactive) {
1365 sc->nr_to_scan = min(nr_inactive, 876 nr_to_scan = min(nr_inactive,
1366 (unsigned long)sc->swap_cluster_max); 877 (unsigned long)sc->swap_cluster_max);
1367 nr_inactive -= sc->nr_to_scan; 878 nr_inactive -= nr_to_scan;
1368 shrink_cache(zone, sc); 879 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
880 sc);
1369 } 881 }
1370 } 882 }
1371 883
1372 throttle_vm_writeout(); 884 throttle_vm_writeout();
1373 885
1374 atomic_dec(&zone->reclaim_in_progress); 886 atomic_dec(&zone->reclaim_in_progress);
887 return nr_reclaimed;
1375} 888}
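
shrink_zone() above adds (zone->nr_active >> priority) + 1 to a per-zone scan counter on every call, defers the scan while that counter is below swap_cluster_max, and otherwise works it off in swap_cluster_max-sized batches. A worked userspace example of that arithmetic with toy numbers, showing how the scan rate ramps up as the priority drops:

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_active = 100000;     /* pages on the zone's active list */
        unsigned long swap_cluster_max = 32;
        unsigned long nr_scan_active = 0;

        for (int priority = 12; priority >= 0; priority--) {
            nr_scan_active += (nr_active >> priority) + 1;

            unsigned long todo = nr_scan_active;
            if (todo < swap_cluster_max)
                continue;                     /* too little: defer to a later call */
            nr_scan_active = 0;

            unsigned long scanned = 0;
            while (todo) {                    /* batch the work */
                unsigned long chunk = todo < swap_cluster_max ? todo
                                                              : swap_cluster_max;
                todo -= chunk;
                scanned += chunk;             /* shrink_active_list(chunk, ...) */
            }
            printf("priority %2d: scanned %lu active pages this call\n",
                   priority, scanned);
        }
        return 0;
    }
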
1376 889
1377/* 890/*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1390 * If a zone is deemed to be full of pinned pages then just give it a light 903 * If a zone is deemed to be full of pinned pages then just give it a light
1391 * scan then give up on it. 904 * scan then give up on it.
1392 */ 905 */
1393static void 906static unsigned long shrink_zones(int priority, struct zone **zones,
1394shrink_caches(struct zone **zones, struct scan_control *sc) 907 struct scan_control *sc)
1395{ 908{
909 unsigned long nr_reclaimed = 0;
1396 int i; 910 int i;
1397 911
1398 for (i = 0; zones[i] != NULL; i++) { 912 for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1404 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 918 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1405 continue; 919 continue;
1406 920
1407 zone->temp_priority = sc->priority; 921 zone->temp_priority = priority;
1408 if (zone->prev_priority > sc->priority) 922 if (zone->prev_priority > priority)
1409 zone->prev_priority = sc->priority; 923 zone->prev_priority = priority;
1410 924
1411 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 925 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1412 continue; /* Let kswapd poll it */ 926 continue; /* Let kswapd poll it */
1413 927
1414 shrink_zone(zone, sc); 928 nr_reclaimed += shrink_zone(priority, zone, sc);
1415 } 929 }
930 return nr_reclaimed;
1416} 931}
1417 932
1418/* 933/*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1428 * holds filesystem locks which prevent writeout this might not work, and the 943 * holds filesystem locks which prevent writeout this might not work, and the
1429 * allocation attempt will fail. 944 * allocation attempt will fail.
1430 */ 945 */
1431int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 946unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1432{ 947{
1433 int priority; 948 int priority;
1434 int ret = 0; 949 int ret = 0;
1435 int total_scanned = 0, total_reclaimed = 0; 950 unsigned long total_scanned = 0;
951 unsigned long nr_reclaimed = 0;
1436 struct reclaim_state *reclaim_state = current->reclaim_state; 952 struct reclaim_state *reclaim_state = current->reclaim_state;
1437 struct scan_control sc;
1438 unsigned long lru_pages = 0; 953 unsigned long lru_pages = 0;
1439 int i; 954 int i;
1440 955 struct scan_control sc = {
1441 sc.gfp_mask = gfp_mask; 956 .gfp_mask = gfp_mask,
1442 sc.may_writepage = !laptop_mode; 957 .may_writepage = !laptop_mode,
1443 sc.may_swap = 1; 958 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1,
960 };
1444 961
1445 inc_page_state(allocstall); 962 inc_page_state(allocstall);
1446 963
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1457 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 974 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1458 sc.nr_mapped = read_page_state(nr_mapped); 975 sc.nr_mapped = read_page_state(nr_mapped);
1459 sc.nr_scanned = 0; 976 sc.nr_scanned = 0;
1460 sc.nr_reclaimed = 0;
1461 sc.priority = priority;
1462 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1463 if (!priority) 977 if (!priority)
1464 disable_swap_token(); 978 disable_swap_token();
1465 shrink_caches(zones, &sc); 979 nr_reclaimed += shrink_zones(priority, zones, &sc);
1466 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 980 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
1467 if (reclaim_state) { 981 if (reclaim_state) {
1468 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 982 nr_reclaimed += reclaim_state->reclaimed_slab;
1469 reclaim_state->reclaimed_slab = 0; 983 reclaim_state->reclaimed_slab = 0;
1470 } 984 }
1471 total_scanned += sc.nr_scanned; 985 total_scanned += sc.nr_scanned;
1472 total_reclaimed += sc.nr_reclaimed; 986 if (nr_reclaimed >= sc.swap_cluster_max) {
1473 if (total_reclaimed >= sc.swap_cluster_max) {
1474 ret = 1; 987 ret = 1;
1475 goto out; 988 goto out;
1476 } 989 }
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1482 * that's undesirable in laptop mode, where we *want* lumpy 995 * that's undesirable in laptop mode, where we *want* lumpy
1483 * writeout. So in laptop mode, write out the whole world. 996 * writeout. So in laptop mode, write out the whole world.
1484 */ 997 */
1485 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 998 if (total_scanned > sc.swap_cluster_max +
999 sc.swap_cluster_max / 2) {
1486 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1000 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1487 sc.may_writepage = 1; 1001 sc.may_writepage = 1;
1488 } 1002 }
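
try_to_free_pages() now keeps the reclaim total in a local, walks priorities from DEF_PRIORITY down to 0, stops as soon as swap_cluster_max pages have been freed, and arms background writeout once total_scanned exceeds one and a half times that target. A compact standalone model of that control flow; the scan and reclaim amounts per priority are made up:

    #include <stdio.h>

    #define DEF_PRIORITY     12
    #define SWAP_CLUSTER_MAX 32UL

    int main(void)
    {
        unsigned long nr_reclaimed = 0, total_scanned = 0;
        int may_writepage = 0;                          /* as in laptop mode */

        for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
            unsigned long scanned = 4096 >> priority;   /* pretend scan result */
            unsigned long freed   = scanned / 8;        /* pretend reclaim result */

            total_scanned += scanned;
            nr_reclaimed  += freed;

            if (nr_reclaimed >= SWAP_CLUSTER_MAX) {
                printf("done at priority %d: reclaimed %lu\n",
                       priority, nr_reclaimed);
                break;
            }
            /* lots of scanning, little progress: start background writeout */
            if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX / 2)
                may_writepage = 1;
        }
        printf("scanned %lu, may_writepage=%d\n", total_scanned, may_writepage);
        return 0;
    }
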
@@ -1528,22 +1042,26 @@ out:
1528 * the page allocator fallback scheme to ensure that aging of pages is balanced 1042 * the page allocator fallback scheme to ensure that aging of pages is balanced
1529 * across the zones. 1043 * across the zones.
1530 */ 1044 */
1531static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) 1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1046 int order)
1532{ 1047{
1533 int to_free = nr_pages; 1048 unsigned long to_free = nr_pages;
1534 int all_zones_ok; 1049 int all_zones_ok;
1535 int priority; 1050 int priority;
1536 int i; 1051 int i;
1537 int total_scanned, total_reclaimed; 1052 unsigned long total_scanned;
1053 unsigned long nr_reclaimed;
1538 struct reclaim_state *reclaim_state = current->reclaim_state; 1054 struct reclaim_state *reclaim_state = current->reclaim_state;
1539 struct scan_control sc; 1055 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
1059 };
1540 1060
1541loop_again: 1061loop_again:
1542 total_scanned = 0; 1062 total_scanned = 0;
1543 total_reclaimed = 0; 1063 nr_reclaimed = 0;
1544 sc.gfp_mask = GFP_KERNEL; 1064 sc.may_writepage = !laptop_mode,
1545 sc.may_writepage = !laptop_mode;
1546 sc.may_swap = 1;
1547 sc.nr_mapped = read_page_state(nr_mapped); 1065 sc.nr_mapped = read_page_state(nr_mapped);
1548 1066
1549 inc_page_state(pageoutrun); 1067 inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
1624 if (zone->prev_priority > priority) 1142 if (zone->prev_priority > priority)
1625 zone->prev_priority = priority; 1143 zone->prev_priority = priority;
1626 sc.nr_scanned = 0; 1144 sc.nr_scanned = 0;
1627 sc.nr_reclaimed = 0; 1145 nr_reclaimed += shrink_zone(priority, zone, &sc);
1628 sc.priority = priority;
1629 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1630 shrink_zone(zone, &sc);
1631 reclaim_state->reclaimed_slab = 0; 1146 reclaim_state->reclaimed_slab = 0;
1632 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1147 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1633 lru_pages); 1148 lru_pages);
1634 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1149 nr_reclaimed += reclaim_state->reclaimed_slab;
1635 total_reclaimed += sc.nr_reclaimed;
1636 total_scanned += sc.nr_scanned; 1150 total_scanned += sc.nr_scanned;
1637 if (zone->all_unreclaimable) 1151 if (zone->all_unreclaimable)
1638 continue; 1152 continue;
@@ -1645,10 +1159,10 @@ scan:
1645 * even in laptop mode 1159 * even in laptop mode
1646 */ 1160 */
1647 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1161 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1648 total_scanned > total_reclaimed+total_reclaimed/2) 1162 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1649 sc.may_writepage = 1; 1163 sc.may_writepage = 1;
1650 } 1164 }
1651 if (nr_pages && to_free > total_reclaimed) 1165 if (nr_pages && to_free > nr_reclaimed)
1652 continue; /* swsusp: need to do more work */ 1166 continue; /* swsusp: need to do more work */
1653 if (all_zones_ok) 1167 if (all_zones_ok)
1654 break; /* kswapd: all done */ 1168 break; /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
1665 * matches the direct reclaim path behaviour in terms of impact 1179 * matches the direct reclaim path behaviour in terms of impact
1666 * on zone->*_priority. 1180 * on zone->*_priority.
1667 */ 1181 */
1668 if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) 1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
1669 break; 1183 break;
1670 } 1184 }
1671out: 1185out:
@@ -1679,7 +1193,7 @@ out:
1679 goto loop_again; 1193 goto loop_again;
1680 } 1194 }
1681 1195
1682 return total_reclaimed; 1196 return nr_reclaimed;
1683} 1197}
1684 1198
1685/* 1199/*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
1779 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
1780 * pages. 1294 * pages.
1781 */ 1295 */
1782int shrink_all_memory(int nr_pages) 1296unsigned long shrink_all_memory(unsigned long nr_pages)
1783{ 1297{
1784 pg_data_t *pgdat; 1298 pg_data_t *pgdat;
1785 int nr_to_free = nr_pages; 1299 unsigned long nr_to_free = nr_pages;
1786 int ret = 0; 1300 unsigned long ret = 0;
1301 unsigned retry = 2;
1787 struct reclaim_state reclaim_state = { 1302 struct reclaim_state reclaim_state = {
1788 .reclaimed_slab = 0, 1303 .reclaimed_slab = 0,
1789 }; 1304 };
1790 1305
1791 current->reclaim_state = &reclaim_state; 1306 current->reclaim_state = &reclaim_state;
1307repeat:
1792 for_each_pgdat(pgdat) { 1308 for_each_pgdat(pgdat) {
1793 int freed; 1309 unsigned long freed;
1310
1794 freed = balance_pgdat(pgdat, nr_to_free, 0); 1311 freed = balance_pgdat(pgdat, nr_to_free, 0);
1795 ret += freed; 1312 ret += freed;
1796 nr_to_free -= freed; 1313 nr_to_free -= freed;
1797 if (nr_to_free <= 0) 1314 if ((long)nr_to_free <= 0)
1798 break; 1315 break;
1799 } 1316 }
1317 if (retry-- && ret < nr_pages) {
1318 blk_congestion_wait(WRITE, HZ/5);
1319 goto repeat;
1320 }
1800 current->reclaim_state = NULL; 1321 current->reclaim_state = NULL;
1801 return ret; 1322 return ret;
1802} 1323}
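
nr_to_free is now an unsigned long, so after `nr_to_free -= freed` it wraps past zero whenever a pass frees more than was asked for; that is why the loop tests `(long)nr_to_free <= 0` instead of comparing the unsigned value. A two-line demonstration of the difference:

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_to_free = 10;
        unsigned long freed = 15;           /* a pass freed more than requested */

        nr_to_free -= freed;                /* wraps to a huge unsigned value */
        printf("unsigned test: %d\n", nr_to_free <= 0);        /* 0: would loop on */
        printf("signed   test: %d\n", (long)nr_to_free <= 0);  /* 1: stops correctly */
        return 0;
    }
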
@@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages)
1808 away, we get changed to run anywhere: as the first one comes back, 1329 away, we get changed to run anywhere: as the first one comes back,
1809 restore their cpu bindings. */ 1330 restore their cpu bindings. */
1810static int __devinit cpu_callback(struct notifier_block *nfb, 1331static int __devinit cpu_callback(struct notifier_block *nfb,
1811 unsigned long action, 1332 unsigned long action, void *hcpu)
1812 void *hcpu)
1813{ 1333{
1814 pg_data_t *pgdat; 1334 pg_data_t *pgdat;
1815 cpumask_t mask; 1335 cpumask_t mask;
@@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1829static int __init kswapd_init(void) 1349static int __init kswapd_init(void)
1830{ 1350{
1831 pg_data_t *pgdat; 1351 pg_data_t *pgdat;
1352
1832 swap_setup(); 1353 swap_setup();
1833 for_each_pgdat(pgdat) 1354 for_each_pgdat(pgdat) {
1834 pgdat->kswapd 1355 pid_t pid;
1835 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0);
1359 pgdat->kswapd = find_task_by_pid(pid);
1360 }
1836 total_memory = nr_free_pagecache_pages(); 1361 total_memory = nr_free_pagecache_pages();
1837 hotcpu_notifier(cpu_callback, 0); 1362 hotcpu_notifier(cpu_callback, 0);
1838 return 0; 1363 return 0;
@@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
1874/* 1399/*
1875 * Try to free up some pages from this zone through reclaim. 1400 * Try to free up some pages from this zone through reclaim.
1876 */ 1401 */
1877int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1402static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1878{ 1403{
1879 int nr_pages; 1404 /* Minimum pages needed in order to stay on node */
1405 const unsigned long nr_pages = 1 << order;
1880 struct task_struct *p = current; 1406 struct task_struct *p = current;
1881 struct reclaim_state reclaim_state; 1407 struct reclaim_state reclaim_state;
1882 struct scan_control sc; 1408 int priority;
1883 cpumask_t mask; 1409 unsigned long nr_reclaimed = 0;
1884 int node_id; 1410 struct scan_control sc = {
1885 1411 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1886 if (time_before(jiffies, 1412 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1887 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1413 .nr_mapped = read_page_state(nr_mapped),
1888 return 0; 1414 .swap_cluster_max = max_t(unsigned long, nr_pages,
1889 1415 SWAP_CLUSTER_MAX),
1890 if (!(gfp_mask & __GFP_WAIT) || 1416 .gfp_mask = gfp_mask,
1891 zone->all_unreclaimable || 1417 };
1892 atomic_read(&zone->reclaim_in_progress) > 0 ||
1893 (p->flags & PF_MEMALLOC))
1894 return 0;
1895
1896 node_id = zone->zone_pgdat->node_id;
1897 mask = node_to_cpumask(node_id);
1898 if (!cpus_empty(mask) && node_id != numa_node_id())
1899 return 0;
1900
1901 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1902 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1903 sc.nr_scanned = 0;
1904 sc.nr_reclaimed = 0;
1905 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1906 sc.nr_mapped = read_page_state(nr_mapped);
1907 sc.gfp_mask = gfp_mask;
1908 1418
1909 disable_swap_token(); 1419 disable_swap_token();
1910
1911 nr_pages = 1 << order;
1912 if (nr_pages > SWAP_CLUSTER_MAX)
1913 sc.swap_cluster_max = nr_pages;
1914 else
1915 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1916
1917 cond_resched(); 1420 cond_resched();
1918 /* 1421 /*
1919 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1422 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1928 * Free memory by calling shrink zone with increasing priorities 1431 * Free memory by calling shrink zone with increasing priorities
1929 * until we have enough memory freed. 1432 * until we have enough memory freed.
1930 */ 1433 */
1434 priority = ZONE_RECLAIM_PRIORITY;
1931 do { 1435 do {
1932 sc.priority--; 1436 nr_reclaimed += shrink_zone(priority, zone, &sc);
1933 shrink_zone(zone, &sc); 1437 priority--;
1438 } while (priority >= 0 && nr_reclaimed < nr_pages);
1934 1439
1935 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); 1440 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1936
1937 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1938 /* 1441 /*
1939 * shrink_slab does not currently allow us to determine 1442 * shrink_slab() does not currently allow us to determine how
1940 * how many pages were freed in the zone. So we just 1443 * many pages were freed in this zone. So we just shake the slab
1941 * shake the slab and then go offnode for a single allocation. 1444 * a bit and then go off node for this particular allocation
1445 * despite possibly having freed enough memory to allocate in
1446 * this zone. If we freed local memory then the next
1447 * allocations will be local again.
1942 * 1448 *
1943 * shrink_slab will free memory on all zones and may take 1449 * shrink_slab will free memory on all zones and may take
1944 * a long time. 1450 * a long time.
@@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1949 p->reclaim_state = NULL; 1455 p->reclaim_state = NULL;
1950 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1456 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1951 1457
1952 if (sc.nr_reclaimed == 0) 1458 if (nr_reclaimed == 0) {
1459 /*
1460 * We were unable to reclaim enough pages to stay on node. We
1461 * now allow off node accesses for a certain time period before
1462 * trying again to reclaim pages from the local zone.
1463 */
1953 zone->last_unsuccessful_zone_reclaim = jiffies; 1464 zone->last_unsuccessful_zone_reclaim = jiffies;
1465 }
1954 1466
1955 return sc.nr_reclaimed >= nr_pages; 1467 return nr_reclaimed >= nr_pages;
1956} 1468}
1957#endif
1958 1469
1470int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1471{
1472 cpumask_t mask;
1473 int node_id;
1474
1475 /*
1476 * Do not reclaim if there was a recent unsuccessful attempt at zone
1477 * reclaim. In that case we let allocations go off node for the
1478 * zone_reclaim_interval. Otherwise we would scan for each off-node
1479 * page allocation.
1480 */
1481 if (time_before(jiffies,
1482 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1483 return 0;
1484
1485 /*
1486 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1487 * not have reclaimable pages and if we should not delay the allocation
1488 * then do not scan.
1489 */
1490 if (!(gfp_mask & __GFP_WAIT) ||
1491 zone->all_unreclaimable ||
1492 atomic_read(&zone->reclaim_in_progress) > 0 ||
1493 (current->flags & PF_MEMALLOC))
1494 return 0;
1495
1496 /*
1497 * Only run zone reclaim on the local zone or on zones that do not
1498 * have associated processors. This will favor the local processor
1499 * over remote processors and spread off node memory allocations
1500 * as wide as possible.
1501 */
1502 node_id = zone->zone_pgdat->node_id;
1503 mask = node_to_cpumask(node_id);
1504 if (!cpus_empty(mask) && node_id != numa_node_id())
1505 return 0;
1506 return __zone_reclaim(zone, gfp_mask, order);
1507}
1508#endif
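
The new zone_reclaim() wrapper performs only the cheap checks (recent-failure backoff, GFP flags, a reclaim already in flight, PF_MEMALLOC, node locality) and leaves the actual scanning to __zone_reclaim(). A userspace sketch of that gate-then-work split, with a wall-clock stand-in for the jiffies backoff; the names and return values below are invented for illustration:

    #include <stdio.h>
    #include <time.h>

    #define RECLAIM_INTERVAL 30         /* seconds: model of zone_reclaim_interval */

    static time_t last_failure;         /* model of last_unsuccessful_zone_reclaim */
    static int reclaim_in_progress;

    static int do_reclaim_work(void)    /* stands in for __zone_reclaim() */
    {
        return 0;                       /* pretend nothing could be reclaimed */
    }

    static int try_zone_reclaim(int can_wait, int is_local_node)
    {
        if (time(NULL) < last_failure + RECLAIM_INTERVAL)
            return 0;                   /* failed recently: go off node instead */
        if (!can_wait || reclaim_in_progress || !is_local_node)
            return 0;                   /* cheap bail-outs before any scanning */

        reclaim_in_progress = 1;
        int got_enough = do_reclaim_work();
        reclaim_in_progress = 0;

        if (!got_enough)
            last_failure = time(NULL);  /* arm the backoff */
        return got_enough;
    }

    int main(void)
    {
        printf("first attempt:  %d\n", try_zone_reclaim(1, 1)); /* scans, fails */
        printf("second attempt: %d\n", try_zone_reclaim(1, 1)); /* backoff hits */
        return 0;
    }
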