diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 286 | ||||
-rw-r--r-- | mm/internal.h | 34 | ||||
-rw-r--r-- | mm/memory.c | 21 | ||||
-rw-r--r-- | mm/mempolicy.c | 117 | ||||
-rw-r--r-- | mm/mempool.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 655 | ||||
-rw-r--r-- | mm/mmap.c | 10 | ||||
-rw-r--r-- | mm/mprotect.c | 12 | ||||
-rw-r--r-- | mm/nommu.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 113 | ||||
-rw-r--r-- | mm/readahead.c | 32 | ||||
-rw-r--r-- | mm/rmap.c | 14 | ||||
-rw-r--r-- | mm/shmem.c | 7 | ||||
-rw-r--r-- | mm/slab.c | 890 | ||||
-rw-r--r-- | mm/swap.c | 64 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 882 |
21 files changed, 1773 insertions, 1385 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80ae64..bd80460360 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS | |||
137 | # support for page migration | 137 | # support for page migration |
138 | # | 138 | # |
139 | config MIGRATION | 139 | config MIGRATION |
140 | bool "Page migration" | ||
140 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM | 141 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM |
141 | depends on SWAP | 142 | depends on SWAP |
143 | help | ||
144 | Allows the migration of the physical location of pages of processes | ||
145 | while the virtual addresses are not changed. This is useful for | ||
146 | example on NUMA systems to put pages nearer to the processors accessing | ||
147 | the page. | ||
diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa1dc..f10c753dce 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o | |||
22 | obj-$(CONFIG_SLAB) += slab.o | 22 | obj-$(CONFIG_SLAB) += slab.o |
23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
25 | obj-$(CONFIG_MIGRATION) += migrate.o | ||
26 | |||
diff --git a/mm/filemap.c b/mm/filemap.c index 44da3d4769..e8f58f7dd7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -30,6 +30,8 @@ | |||
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | #include "filemap.h" | 32 | #include "filemap.h" |
33 | #include "internal.h" | ||
34 | |||
33 | /* | 35 | /* |
34 | * FIXME: remove all knowledge of the buffer layer from the core VM | 36 | * FIXME: remove all knowledge of the buffer layer from the core VM |
35 | */ | 37 | */ |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 508707704d..ebad6bbb35 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -13,24 +13,48 @@ | |||
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
15 | #include <linux/cpuset.h> | 15 | #include <linux/cpuset.h> |
16 | #include <linux/mutex.h> | ||
16 | 17 | ||
17 | #include <asm/page.h> | 18 | #include <asm/page.h> |
18 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
19 | 20 | ||
20 | #include <linux/hugetlb.h> | 21 | #include <linux/hugetlb.h> |
22 | #include "internal.h" | ||
21 | 23 | ||
22 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
23 | static unsigned long nr_huge_pages, free_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; |
24 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
25 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
26 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
27 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
28 | |||
29 | /* | 30 | /* |
30 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 31 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
31 | */ | 32 | */ |
32 | static DEFINE_SPINLOCK(hugetlb_lock); | 33 | static DEFINE_SPINLOCK(hugetlb_lock); |
33 | 34 | ||
35 | static void clear_huge_page(struct page *page, unsigned long addr) | ||
36 | { | ||
37 | int i; | ||
38 | |||
39 | might_sleep(); | ||
40 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | ||
41 | cond_resched(); | ||
42 | clear_user_highpage(page + i, addr); | ||
43 | } | ||
44 | } | ||
45 | |||
46 | static void copy_huge_page(struct page *dst, struct page *src, | ||
47 | unsigned long addr) | ||
48 | { | ||
49 | int i; | ||
50 | |||
51 | might_sleep(); | ||
52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | ||
53 | cond_resched(); | ||
54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); | ||
55 | } | ||
56 | } | ||
57 | |||
34 | static void enqueue_huge_page(struct page *page) | 58 | static void enqueue_huge_page(struct page *page) |
35 | { | 59 | { |
36 | int nid = page_to_nid(page); | 60 | int nid = page_to_nid(page); |
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
64 | return page; | 88 | return page; |
65 | } | 89 | } |
66 | 90 | ||
67 | static struct page *alloc_fresh_huge_page(void) | 91 | static void free_huge_page(struct page *page) |
92 | { | ||
93 | BUG_ON(page_count(page)); | ||
94 | |||
95 | INIT_LIST_HEAD(&page->lru); | ||
96 | |||
97 | spin_lock(&hugetlb_lock); | ||
98 | enqueue_huge_page(page); | ||
99 | spin_unlock(&hugetlb_lock); | ||
100 | } | ||
101 | |||
102 | static int alloc_fresh_huge_page(void) | ||
68 | { | 103 | { |
69 | static int nid = 0; | 104 | static int nid = 0; |
70 | struct page *page; | 105 | struct page *page; |
71 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, | 106 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, |
72 | HUGETLB_PAGE_ORDER); | 107 | HUGETLB_PAGE_ORDER); |
73 | nid = (nid + 1) % num_online_nodes(); | 108 | nid = next_node(nid, node_online_map); |
109 | if (nid == MAX_NUMNODES) | ||
110 | nid = first_node(node_online_map); | ||
74 | if (page) { | 111 | if (page) { |
112 | page[1].lru.next = (void *)free_huge_page; /* dtor */ | ||
75 | spin_lock(&hugetlb_lock); | 113 | spin_lock(&hugetlb_lock); |
76 | nr_huge_pages++; | 114 | nr_huge_pages++; |
77 | nr_huge_pages_node[page_to_nid(page)]++; | 115 | nr_huge_pages_node[page_to_nid(page)]++; |
78 | spin_unlock(&hugetlb_lock); | 116 | spin_unlock(&hugetlb_lock); |
117 | put_page(page); /* free it into the hugepage allocator */ | ||
118 | return 1; | ||
79 | } | 119 | } |
80 | return page; | 120 | return 0; |
81 | } | 121 | } |
82 | 122 | ||
83 | void free_huge_page(struct page *page) | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
124 | unsigned long addr) | ||
84 | { | 125 | { |
85 | BUG_ON(page_count(page)); | 126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; |
127 | struct page *page; | ||
128 | int use_reserve = 0; | ||
129 | unsigned long idx; | ||
86 | 130 | ||
87 | INIT_LIST_HEAD(&page->lru); | 131 | spin_lock(&hugetlb_lock); |
88 | page[1].lru.next = NULL; /* reset dtor */ | 132 | |
133 | if (vma->vm_flags & VM_MAYSHARE) { | ||
134 | |||
135 | /* idx = radix tree index, i.e. offset into file in | ||
136 | * HPAGE_SIZE units */ | ||
137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
139 | |||
140 | /* The hugetlbfs specific inode info stores the number | ||
141 | * of "guaranteed available" (huge) pages. That is, | ||
142 | * the first 'prereserved_hpages' pages of the inode | ||
143 | * are either already instantiated, or have been | ||
144 | * pre-reserved (by hugetlb_reserve_for_inode()). Here | ||
145 | * we're in the process of instantiating the page, so | ||
146 | * we use this to determine whether to draw from the | ||
147 | * pre-reserved pool or the truly free pool. */ | ||
148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
149 | use_reserve = 1; | ||
150 | } | ||
151 | |||
152 | if (!use_reserve) { | ||
153 | if (free_huge_pages <= reserved_huge_pages) | ||
154 | goto fail; | ||
155 | } else { | ||
156 | BUG_ON(reserved_huge_pages == 0); | ||
157 | reserved_huge_pages--; | ||
158 | } | ||
159 | |||
160 | page = dequeue_huge_page(vma, addr); | ||
161 | if (!page) | ||
162 | goto fail; | ||
163 | |||
164 | spin_unlock(&hugetlb_lock); | ||
165 | set_page_refcounted(page); | ||
166 | return page; | ||
167 | |||
168 | fail: | ||
169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
170 | spin_unlock(&hugetlb_lock); | ||
171 | return NULL; | ||
172 | } | ||
173 | |||
174 | /* hugetlb_extend_reservation() | ||
175 | * | ||
176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
177 | * available to instantiate the first 'atleast' pages of the given | ||
178 | * inode. If the inode doesn't already have this many pages reserved | ||
179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
181 | * than getting the SIGBUS later). | ||
182 | */ | ||
183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
184 | unsigned long atleast) | ||
185 | { | ||
186 | struct inode *inode = &info->vfs_inode; | ||
187 | unsigned long change_in_reserve = 0; | ||
188 | int ret = 0; | ||
89 | 189 | ||
90 | spin_lock(&hugetlb_lock); | 190 | spin_lock(&hugetlb_lock); |
91 | enqueue_huge_page(page); | 191 | read_lock_irq(&inode->i_mapping->tree_lock); |
192 | |||
193 | if (info->prereserved_hpages >= atleast) | ||
194 | goto out; | ||
195 | |||
196 | /* Because we always call this on shared mappings, none of the | ||
197 | * pages beyond info->prereserved_hpages can have been | ||
198 | * instantiated, so we need to reserve all of them now. */ | ||
199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
200 | |||
201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
202 | ret = -ENOMEM; | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | reserved_huge_pages += change_in_reserve; | ||
207 | info->prereserved_hpages = atleast; | ||
208 | |||
209 | out: | ||
210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
92 | spin_unlock(&hugetlb_lock); | 211 | spin_unlock(&hugetlb_lock); |
212 | |||
213 | return ret; | ||
93 | } | 214 | } |
94 | 215 | ||
95 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) | 216 | /* hugetlb_truncate_reservation() |
217 | * | ||
218 | * This returns pages reserved for the given inode to the general free | ||
219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
220 | * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release | ||
221 | * them. | ||
222 | */ | ||
223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
224 | unsigned long atmost) | ||
96 | { | 225 | { |
226 | struct inode *inode = &info->vfs_inode; | ||
227 | struct address_space *mapping = inode->i_mapping; | ||
228 | unsigned long idx; | ||
229 | unsigned long change_in_reserve = 0; | ||
97 | struct page *page; | 230 | struct page *page; |
98 | int i; | ||
99 | 231 | ||
100 | spin_lock(&hugetlb_lock); | 232 | spin_lock(&hugetlb_lock); |
101 | page = dequeue_huge_page(vma, addr); | 233 | read_lock_irq(&inode->i_mapping->tree_lock); |
102 | if (!page) { | 234 | |
103 | spin_unlock(&hugetlb_lock); | 235 | if (info->prereserved_hpages <= atmost) |
104 | return NULL; | 236 | goto out; |
237 | |||
238 | /* Count pages which were reserved, but not instantiated, and | ||
239 | * which we can now release. */ | ||
240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
242 | if (!page) | ||
243 | /* Pages which are already instantiated can't | ||
244 | * be unreserved (and in fact have already | ||
245 | * been removed from the reserved pool) */ | ||
246 | change_in_reserve++; | ||
105 | } | 247 | } |
248 | |||
249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
250 | reserved_huge_pages -= change_in_reserve; | ||
251 | info->prereserved_hpages = atmost; | ||
252 | |||
253 | out: | ||
254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
106 | spin_unlock(&hugetlb_lock); | 255 | spin_unlock(&hugetlb_lock); |
107 | set_page_count(page, 1); | ||
108 | page[1].lru.next = (void *)free_huge_page; /* set dtor */ | ||
109 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) | ||
110 | clear_user_highpage(&page[i], addr); | ||
111 | return page; | ||
112 | } | 256 | } |
113 | 257 | ||
114 | static int __init hugetlb_init(void) | 258 | static int __init hugetlb_init(void) |
115 | { | 259 | { |
116 | unsigned long i; | 260 | unsigned long i; |
117 | struct page *page; | ||
118 | 261 | ||
119 | if (HPAGE_SHIFT == 0) | 262 | if (HPAGE_SHIFT == 0) |
120 | return 0; | 263 | return 0; |
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void) | |||
123 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 266 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
124 | 267 | ||
125 | for (i = 0; i < max_huge_pages; ++i) { | 268 | for (i = 0; i < max_huge_pages; ++i) { |
126 | page = alloc_fresh_huge_page(); | 269 | if (!alloc_fresh_huge_page()) |
127 | if (!page) | ||
128 | break; | 270 | break; |
129 | spin_lock(&hugetlb_lock); | ||
130 | enqueue_huge_page(page); | ||
131 | spin_unlock(&hugetlb_lock); | ||
132 | } | 271 | } |
133 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 272 | max_huge_pages = free_huge_pages = nr_huge_pages = i; |
134 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | 273 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); |
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page) | |||
154 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 293 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
155 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 294 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
156 | 1 << PG_private | 1<< PG_writeback); | 295 | 1 << PG_private | 1<< PG_writeback); |
157 | set_page_count(&page[i], 0); | ||
158 | } | 296 | } |
159 | set_page_count(page, 1); | 297 | page[1].lru.next = NULL; |
298 | set_page_refcounted(page); | ||
160 | __free_pages(page, HUGETLB_PAGE_ORDER); | 299 | __free_pages(page, HUGETLB_PAGE_ORDER); |
161 | } | 300 | } |
162 | 301 | ||
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count) | |||
188 | static unsigned long set_max_huge_pages(unsigned long count) | 327 | static unsigned long set_max_huge_pages(unsigned long count) |
189 | { | 328 | { |
190 | while (count > nr_huge_pages) { | 329 | while (count > nr_huge_pages) { |
191 | struct page *page = alloc_fresh_huge_page(); | 330 | if (!alloc_fresh_huge_page()) |
192 | if (!page) | ||
193 | return nr_huge_pages; | 331 | return nr_huge_pages; |
194 | spin_lock(&hugetlb_lock); | ||
195 | enqueue_huge_page(page); | ||
196 | spin_unlock(&hugetlb_lock); | ||
197 | } | 332 | } |
198 | if (count >= nr_huge_pages) | 333 | if (count >= nr_huge_pages) |
199 | return nr_huge_pages; | 334 | return nr_huge_pages; |
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf) | |||
225 | return sprintf(buf, | 360 | return sprintf(buf, |
226 | "HugePages_Total: %5lu\n" | 361 | "HugePages_Total: %5lu\n" |
227 | "HugePages_Free: %5lu\n" | 362 | "HugePages_Free: %5lu\n" |
363 | "HugePages_Rsvd: %5lu\n" | ||
228 | "Hugepagesize: %5lu kB\n", | 364 | "Hugepagesize: %5lu kB\n", |
229 | nr_huge_pages, | 365 | nr_huge_pages, |
230 | free_huge_pages, | 366 | free_huge_pages, |
367 | reserved_huge_pages, | ||
231 | HPAGE_SIZE/1024); | 368 | HPAGE_SIZE/1024); |
232 | } | 369 | } |
233 | 370 | ||
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) | |||
240 | nid, free_huge_pages_node[nid]); | 377 | nid, free_huge_pages_node[nid]); |
241 | } | 378 | } |
242 | 379 | ||
243 | int is_hugepage_mem_enough(size_t size) | ||
244 | { | ||
245 | return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; | ||
246 | } | ||
247 | |||
248 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ | 380 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ |
249 | unsigned long hugetlb_total_pages(void) | 381 | unsigned long hugetlb_total_pages(void) |
250 | { | 382 | { |
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
374 | unsigned long address, pte_t *ptep, pte_t pte) | 506 | unsigned long address, pte_t *ptep, pte_t pte) |
375 | { | 507 | { |
376 | struct page *old_page, *new_page; | 508 | struct page *old_page, *new_page; |
377 | int i, avoidcopy; | 509 | int avoidcopy; |
378 | 510 | ||
379 | old_page = pte_page(pte); | 511 | old_page = pte_page(pte); |
380 | 512 | ||
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
395 | } | 527 | } |
396 | 528 | ||
397 | spin_unlock(&mm->page_table_lock); | 529 | spin_unlock(&mm->page_table_lock); |
398 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) | 530 | copy_huge_page(new_page, old_page, address); |
399 | copy_user_highpage(new_page + i, old_page + i, | ||
400 | address + i*PAGE_SIZE); | ||
401 | spin_lock(&mm->page_table_lock); | 531 | spin_lock(&mm->page_table_lock); |
402 | 532 | ||
403 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 533 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
@@ -442,6 +572,7 @@ retry: | |||
442 | ret = VM_FAULT_OOM; | 572 | ret = VM_FAULT_OOM; |
443 | goto out; | 573 | goto out; |
444 | } | 574 | } |
575 | clear_huge_page(page, address); | ||
445 | 576 | ||
446 | if (vma->vm_flags & VM_SHARED) { | 577 | if (vma->vm_flags & VM_SHARED) { |
447 | int err; | 578 | int err; |
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
496 | pte_t *ptep; | 627 | pte_t *ptep; |
497 | pte_t entry; | 628 | pte_t entry; |
498 | int ret; | 629 | int ret; |
630 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | ||
499 | 631 | ||
500 | ptep = huge_pte_alloc(mm, address); | 632 | ptep = huge_pte_alloc(mm, address); |
501 | if (!ptep) | 633 | if (!ptep) |
502 | return VM_FAULT_OOM; | 634 | return VM_FAULT_OOM; |
503 | 635 | ||
636 | /* | ||
637 | * Serialize hugepage allocation and instantiation, so that we don't | ||
638 | * get spurious allocation failures if two CPUs race to instantiate | ||
639 | * the same page in the page cache. | ||
640 | */ | ||
641 | mutex_lock(&hugetlb_instantiation_mutex); | ||
504 | entry = *ptep; | 642 | entry = *ptep; |
505 | if (pte_none(entry)) | 643 | if (pte_none(entry)) { |
506 | return hugetlb_no_page(mm, vma, address, ptep, write_access); | 644 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
645 | mutex_unlock(&hugetlb_instantiation_mutex); | ||
646 | return ret; | ||
647 | } | ||
507 | 648 | ||
508 | ret = VM_FAULT_MINOR; | 649 | ret = VM_FAULT_MINOR; |
509 | 650 | ||
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
513 | if (write_access && !pte_write(entry)) | 654 | if (write_access && !pte_write(entry)) |
514 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | 655 | ret = hugetlb_cow(mm, vma, address, ptep, entry); |
515 | spin_unlock(&mm->page_table_lock); | 656 | spin_unlock(&mm->page_table_lock); |
657 | mutex_unlock(&hugetlb_instantiation_mutex); | ||
516 | 658 | ||
517 | return ret; | 659 | return ret; |
518 | } | 660 | } |
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
521 | struct page **pages, struct vm_area_struct **vmas, | 663 | struct page **pages, struct vm_area_struct **vmas, |
522 | unsigned long *position, int *length, int i) | 664 | unsigned long *position, int *length, int i) |
523 | { | 665 | { |
524 | unsigned long vpfn, vaddr = *position; | 666 | unsigned long pfn_offset; |
667 | unsigned long vaddr = *position; | ||
525 | int remainder = *length; | 668 | int remainder = *length; |
526 | 669 | ||
527 | vpfn = vaddr/PAGE_SIZE; | ||
528 | spin_lock(&mm->page_table_lock); | 670 | spin_lock(&mm->page_table_lock); |
529 | while (vaddr < vma->vm_end && remainder) { | 671 | while (vaddr < vma->vm_end && remainder) { |
530 | pte_t *pte; | 672 | pte_t *pte; |
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
552 | break; | 694 | break; |
553 | } | 695 | } |
554 | 696 | ||
555 | if (pages) { | 697 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; |
556 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | 698 | page = pte_page(*pte); |
557 | get_page(page); | 699 | same_page: |
558 | pages[i] = page; | 700 | get_page(page); |
559 | } | 701 | if (pages) |
702 | pages[i] = page + pfn_offset; | ||
560 | 703 | ||
561 | if (vmas) | 704 | if (vmas) |
562 | vmas[i] = vma; | 705 | vmas[i] = vma; |
563 | 706 | ||
564 | vaddr += PAGE_SIZE; | 707 | vaddr += PAGE_SIZE; |
565 | ++vpfn; | 708 | ++pfn_offset; |
566 | --remainder; | 709 | --remainder; |
567 | ++i; | 710 | ++i; |
711 | if (vaddr < vma->vm_end && remainder && | ||
712 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | ||
713 | /* | ||
714 | * We use pfn_offset to avoid touching the pageframes | ||
715 | * of this compound page. | ||
716 | */ | ||
717 | goto same_page; | ||
718 | } | ||
568 | } | 719 | } |
569 | spin_unlock(&mm->page_table_lock); | 720 | spin_unlock(&mm->page_table_lock); |
570 | *length = remainder; | 721 | *length = remainder; |
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
572 | 723 | ||
573 | return i; | 724 | return i; |
574 | } | 725 | } |
726 | |||
727 | void hugetlb_change_protection(struct vm_area_struct *vma, | ||
728 | unsigned long address, unsigned long end, pgprot_t newprot) | ||
729 | { | ||
730 | struct mm_struct *mm = vma->vm_mm; | ||
731 | unsigned long start = address; | ||
732 | pte_t *ptep; | ||
733 | pte_t pte; | ||
734 | |||
735 | BUG_ON(address >= end); | ||
736 | flush_cache_range(vma, address, end); | ||
737 | |||
738 | spin_lock(&mm->page_table_lock); | ||
739 | for (; address < end; address += HPAGE_SIZE) { | ||
740 | ptep = huge_pte_offset(mm, address); | ||
741 | if (!ptep) | ||
742 | continue; | ||
743 | if (!pte_none(*ptep)) { | ||
744 | pte = huge_ptep_get_and_clear(mm, address, ptep); | ||
745 | pte = pte_mkhuge(pte_modify(pte, newprot)); | ||
746 | set_huge_pte_at(mm, address, ptep, pte); | ||
747 | lazy_mmu_prot_update(pte); | ||
748 | } | ||
749 | } | ||
750 | spin_unlock(&mm->page_table_lock); | ||
751 | |||
752 | flush_tlb_range(vma, start, end); | ||
753 | } | ||
754 | |||
diff --git a/mm/internal.h b/mm/internal.h index 17256bb2f4..d20e3cc4ae 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -8,23 +8,33 @@ | |||
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | #ifndef __MM_INTERNAL_H | ||
12 | #define __MM_INTERNAL_H | ||
11 | 13 | ||
12 | static inline void set_page_refs(struct page *page, int order) | 14 | #include <linux/mm.h> |
15 | |||
16 | static inline void set_page_count(struct page *page, int v) | ||
17 | { | ||
18 | atomic_set(&page->_count, v); | ||
19 | } | ||
20 | |||
21 | /* | ||
22 | * Turn a non-refcounted page (->_count == 0) into refcounted with | ||
23 | * a count of one. | ||
24 | */ | ||
25 | static inline void set_page_refcounted(struct page *page) | ||
13 | { | 26 | { |
14 | #ifdef CONFIG_MMU | 27 | BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); |
28 | BUG_ON(atomic_read(&page->_count)); | ||
15 | set_page_count(page, 1); | 29 | set_page_count(page, 1); |
16 | #else | 30 | } |
17 | int i; | ||
18 | 31 | ||
19 | /* | 32 | static inline void __put_page(struct page *page) |
20 | * We need to reference all the pages for this order, otherwise if | 33 | { |
21 | * anyone accesses one of the pages with (get/put) it will be freed. | 34 | atomic_dec(&page->_count); |
22 | * - eg: access_process_vm() | ||
23 | */ | ||
24 | for (i = 0; i < (1 << order); i++) | ||
25 | set_page_count(page + i, 1); | ||
26 | #endif /* CONFIG_MMU */ | ||
27 | } | 35 | } |
28 | 36 | ||
29 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
30 | unsigned int order); | 38 | unsigned int order); |
39 | |||
40 | #endif | ||
diff --git a/mm/memory.c b/mm/memory.c index 85e80a57db..80c3fb370f 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
277 | anon_vma_unlink(vma); | 277 | anon_vma_unlink(vma); |
278 | unlink_file_vma(vma); | 278 | unlink_file_vma(vma); |
279 | 279 | ||
280 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { | 280 | if (is_vm_hugetlb_page(vma)) { |
281 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 281 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
282 | floor, next? next->vm_start: ceiling); | 282 | floor, next? next->vm_start: ceiling); |
283 | } else { | 283 | } else { |
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
285 | * Optimization: gather nearby vmas into one call down | 285 | * Optimization: gather nearby vmas into one call down |
286 | */ | 286 | */ |
287 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE | 287 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE |
288 | && !is_hugepage_only_range(vma->vm_mm, next->vm_start, | 288 | && !is_vm_hugetlb_page(next)) { |
289 | HPAGE_SIZE)) { | ||
290 | vma = next; | 289 | vma = next; |
291 | next = vma->vm_next; | 290 | next = vma->vm_next; |
292 | anon_vma_unlink(vma); | 291 | anon_vma_unlink(vma); |
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
388 | { | 387 | { |
389 | unsigned long pfn = pte_pfn(pte); | 388 | unsigned long pfn = pte_pfn(pte); |
390 | 389 | ||
391 | if (vma->vm_flags & VM_PFNMAP) { | 390 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
392 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; | 391 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; |
393 | if (pfn == vma->vm_pgoff + off) | 392 | if (pfn == vma->vm_pgoff + off) |
394 | return NULL; | 393 | return NULL; |
@@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
396 | return NULL; | 395 | return NULL; |
397 | } | 396 | } |
398 | 397 | ||
399 | /* | 398 | #ifdef CONFIG_DEBUG_VM |
400 | * Add some anal sanity checks for now. Eventually, | ||
401 | * we should just do "return pfn_to_page(pfn)", but | ||
402 | * in the meantime we check that we get a valid pfn, | ||
403 | * and that the resulting page looks ok. | ||
404 | * | ||
405 | * Remove this test eventually! | ||
406 | */ | ||
407 | if (unlikely(!pfn_valid(pfn))) { | 399 | if (unlikely(!pfn_valid(pfn))) { |
408 | print_bad_pte(vma, pte, addr); | 400 | print_bad_pte(vma, pte, addr); |
409 | return NULL; | 401 | return NULL; |
410 | } | 402 | } |
403 | #endif | ||
411 | 404 | ||
412 | /* | 405 | /* |
413 | * NOTE! We still have PageReserved() pages in the page | 406 | * NOTE! We still have PageReserved() pages in the page |
@@ -1221,9 +1214,7 @@ out: | |||
1221 | * The page has to be a nice clean _individual_ kernel allocation. | 1214 | * The page has to be a nice clean _individual_ kernel allocation. |
1222 | * If you allocate a compound page, you need to have marked it as | 1215 | * If you allocate a compound page, you need to have marked it as |
1223 | * such (__GFP_COMP), or manually just split the page up yourself | 1216 | * such (__GFP_COMP), or manually just split the page up yourself |
1224 | * (which is mainly an issue of doing "set_page_count(page, 1)" for | 1217 | * (see split_page()). |
1225 | * each sub-page, and then freeing them one by one when you free | ||
1226 | * them rather than freeing it as a compound page). | ||
1227 | * | 1218 | * |
1228 | * NOTE! Traditionally this was done with "remap_pfn_range()" which | 1219 | * NOTE! Traditionally this was done with "remap_pfn_range()" which |
1229 | * took an arbitrary page protection parameter. This doesn't allow | 1220 | * took an arbitrary page protection parameter. This doesn't allow |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b21869a39f..e93cc740c2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -86,6 +86,7 @@ | |||
86 | #include <linux/swap.h> | 86 | #include <linux/swap.h> |
87 | #include <linux/seq_file.h> | 87 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 88 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | ||
89 | 90 | ||
90 | #include <asm/tlbflush.h> | 91 | #include <asm/tlbflush.h> |
91 | #include <asm/uaccess.h> | 92 | #include <asm/uaccess.h> |
@@ -95,11 +96,8 @@ | |||
95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 96 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | 97 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ |
97 | 98 | ||
98 | /* The number of pages to migrate per call to migrate_pages() */ | 99 | static struct kmem_cache *policy_cache; |
99 | #define MIGRATE_CHUNK_SIZE 256 | 100 | static struct kmem_cache *sn_cache; |
100 | |||
101 | static kmem_cache_t *policy_cache; | ||
102 | static kmem_cache_t *sn_cache; | ||
103 | 101 | ||
104 | #define PDprintk(fmt...) | 102 | #define PDprintk(fmt...) |
105 | 103 | ||
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
331 | struct vm_area_struct *first, *vma, *prev; | 329 | struct vm_area_struct *first, *vma, *prev; |
332 | 330 | ||
333 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 331 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
334 | /* Must have swap device for migration */ | ||
335 | if (nr_swap_pages <= 0) | ||
336 | return ERR_PTR(-ENODEV); | ||
337 | 332 | ||
338 | /* | 333 | err = migrate_prep(); |
339 | * Clear the LRU lists so pages can be isolated. | 334 | if (err) |
340 | * Note that pages may be moved off the LRU after we have | 335 | return ERR_PTR(err); |
341 | * drained them. Those pages will fail to migrate like other | ||
342 | * pages that may be busy. | ||
343 | */ | ||
344 | lru_add_drain_all(); | ||
345 | } | 336 | } |
346 | 337 | ||
347 | first = find_vma(mm, start); | 338 | first = find_vma(mm, start); |
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
550 | return err; | 541 | return err; |
551 | } | 542 | } |
552 | 543 | ||
544 | #ifdef CONFIG_MIGRATION | ||
553 | /* | 545 | /* |
554 | * page migration | 546 | * page migration |
555 | */ | 547 | */ |
556 | |||
557 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 548 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
558 | unsigned long flags) | 549 | unsigned long flags) |
559 | { | 550 | { |
560 | /* | 551 | /* |
561 | * Avoid migrating a page that is shared with others. | 552 | * Avoid migrating a page that is shared with others. |
562 | */ | 553 | */ |
563 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 554 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) |
564 | if (isolate_lru_page(page)) | 555 | isolate_lru_page(page, pagelist); |
565 | list_add_tail(&page->lru, pagelist); | ||
566 | } | ||
567 | } | ||
568 | |||
569 | /* | ||
570 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
571 | * | ||
572 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
573 | * Return the number of pages not migrated or error code | ||
574 | */ | ||
575 | static int migrate_pages_to(struct list_head *pagelist, | ||
576 | struct vm_area_struct *vma, int dest) | ||
577 | { | ||
578 | LIST_HEAD(newlist); | ||
579 | LIST_HEAD(moved); | ||
580 | LIST_HEAD(failed); | ||
581 | int err = 0; | ||
582 | unsigned long offset = 0; | ||
583 | int nr_pages; | ||
584 | struct page *page; | ||
585 | struct list_head *p; | ||
586 | |||
587 | redo: | ||
588 | nr_pages = 0; | ||
589 | list_for_each(p, pagelist) { | ||
590 | if (vma) { | ||
591 | /* | ||
592 | * The address passed to alloc_page_vma is used to | ||
593 | * generate the proper interleave behavior. We fake | ||
594 | * the address here by an increasing offset in order | ||
595 | * to get the proper distribution of pages. | ||
596 | * | ||
597 | * No decision has been made as to which page | ||
598 | * a certain old page is moved to so we cannot | ||
599 | * specify the correct address. | ||
600 | */ | ||
601 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
602 | offset + vma->vm_start); | ||
603 | offset += PAGE_SIZE; | ||
604 | } | ||
605 | else | ||
606 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
607 | |||
608 | if (!page) { | ||
609 | err = -ENOMEM; | ||
610 | goto out; | ||
611 | } | ||
612 | list_add_tail(&page->lru, &newlist); | ||
613 | nr_pages++; | ||
614 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
615 | break; | ||
616 | } | ||
617 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
618 | |||
619 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
620 | |||
621 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
622 | goto redo; | ||
623 | out: | ||
624 | /* Return leftover allocated pages */ | ||
625 | while (!list_empty(&newlist)) { | ||
626 | page = list_entry(newlist.next, struct page, lru); | ||
627 | list_del(&page->lru); | ||
628 | __free_page(page); | ||
629 | } | ||
630 | list_splice(&failed, pagelist); | ||
631 | if (err < 0) | ||
632 | return err; | ||
633 | |||
634 | /* Calculate number of leftover pages */ | ||
635 | nr_pages = 0; | ||
636 | list_for_each(p, pagelist) | ||
637 | nr_pages++; | ||
638 | return nr_pages; | ||
639 | } | 556 | } |
640 | 557 | ||
641 | /* | 558 | /* |
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, | |||
742 | if (err < 0) | 659 | if (err < 0) |
743 | return err; | 660 | return err; |
744 | return busy; | 661 | return busy; |
662 | |||
745 | } | 663 | } |
746 | 664 | ||
665 | #else | ||
666 | |||
667 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | ||
668 | unsigned long flags) | ||
669 | { | ||
670 | } | ||
671 | |||
672 | int do_migrate_pages(struct mm_struct *mm, | ||
673 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
674 | { | ||
675 | return -ENOSYS; | ||
676 | } | ||
677 | #endif | ||
678 | |||
747 | long do_mbind(unsigned long start, unsigned long len, | 679 | long do_mbind(unsigned long start, unsigned long len, |
748 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 680 | unsigned long mode, nodemask_t *nmask, unsigned long flags) |
749 | { | 681 | { |
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, | |||
808 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 740 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
809 | err = -EIO; | 741 | err = -EIO; |
810 | } | 742 | } |
743 | |||
811 | if (!list_empty(&pagelist)) | 744 | if (!list_empty(&pagelist)) |
812 | putback_lru_pages(&pagelist); | 745 | putback_lru_pages(&pagelist); |
813 | 746 | ||
diff --git a/mm/mempool.c b/mm/mempool.c index 1a99b80480..f71893ed35 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free); | |||
278 | */ | 278 | */ |
279 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) | 279 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) |
280 | { | 280 | { |
281 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | 281 | struct kmem_cache *mem = pool_data; |
282 | return kmem_cache_alloc(mem, gfp_mask); | 282 | return kmem_cache_alloc(mem, gfp_mask); |
283 | } | 283 | } |
284 | EXPORT_SYMBOL(mempool_alloc_slab); | 284 | EXPORT_SYMBOL(mempool_alloc_slab); |
285 | 285 | ||
286 | void mempool_free_slab(void *element, void *pool_data) | 286 | void mempool_free_slab(void *element, void *pool_data) |
287 | { | 287 | { |
288 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | 288 | struct kmem_cache *mem = pool_data; |
289 | kmem_cache_free(mem, element); | 289 | kmem_cache_free(mem, element); |
290 | } | 290 | } |
291 | EXPORT_SYMBOL(mempool_free_slab); | 291 | EXPORT_SYMBOL(mempool_free_slab); |
diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 0000000000..09f6e4aa87 --- /dev/null +++ b/mm/migrate.c | |||
@@ -0,0 +1,655 @@ | |||
1 | /* | ||
2 | * Memory Migration functionality - linux/mm/migrate.c | ||
3 | * | ||
4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter | ||
5 | * | ||
6 | * Page migration was first developed in the context of the memory hotplug | ||
7 | * project. The main authors of the migration code are: | ||
8 | * | ||
9 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | ||
10 | * Hirokazu Takahashi <taka@valinux.co.jp> | ||
11 | * Dave Hansen <haveblue@us.ibm.com> | ||
12 | * Christoph Lameter <clameter@sgi.com> | ||
13 | */ | ||
14 | |||
15 | #include <linux/migrate.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/swap.h> | ||
18 | #include <linux/pagemap.h> | ||
19 | #include <linux/buffer_head.h> /* for try_to_release_page(), | ||
20 | buffer_heads_over_limit */ | ||
21 | #include <linux/mm_inline.h> | ||
22 | #include <linux/pagevec.h> | ||
23 | #include <linux/rmap.h> | ||
24 | #include <linux/topology.h> | ||
25 | #include <linux/cpu.h> | ||
26 | #include <linux/cpuset.h> | ||
27 | #include <linux/swapops.h> | ||
28 | |||
29 | #include "internal.h" | ||
30 | |||
31 | #include "internal.h" | ||
32 | |||
33 | /* The maximum number of pages to take off the LRU for migration */ | ||
34 | #define MIGRATE_CHUNK_SIZE 256 | ||
35 | |||
36 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | ||
37 | |||
38 | /* | ||
39 | * Isolate one page from the LRU lists. If successful put it onto | ||
40 | * the indicated list with elevated page count. | ||
41 | * | ||
42 | * Result: | ||
43 | * -EBUSY: page not on LRU list | ||
44 | * 0: page removed from LRU list and added to the specified list. | ||
45 | */ | ||
46 | int isolate_lru_page(struct page *page, struct list_head *pagelist) | ||
47 | { | ||
48 | int ret = -EBUSY; | ||
49 | |||
50 | if (PageLRU(page)) { | ||
51 | struct zone *zone = page_zone(page); | ||
52 | |||
53 | spin_lock_irq(&zone->lru_lock); | ||
54 | if (PageLRU(page)) { | ||
55 | ret = 0; | ||
56 | get_page(page); | ||
57 | ClearPageLRU(page); | ||
58 | if (PageActive(page)) | ||
59 | del_page_from_active_list(zone, page); | ||
60 | else | ||
61 | del_page_from_inactive_list(zone, page); | ||
62 | list_add_tail(&page->lru, pagelist); | ||
63 | } | ||
64 | spin_unlock_irq(&zone->lru_lock); | ||
65 | } | ||
66 | return ret; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * migrate_prep() needs to be called before we start compiling the list of | ||
71 | * pages to be migrated using isolate_lru_page(): it drains the LRU lists so | ||
72 | * that the pages can be isolated for the later calls to migrate_pages(). | ||
73 | */ | ||
74 | int migrate_prep(void) | ||
75 | { | ||
76 | /* Must have swap device for migration */ | ||
77 | if (nr_swap_pages <= 0) | ||
78 | return -ENODEV; | ||
79 | |||
80 | /* | ||
81 | * Clear the LRU lists so pages can be isolated. | ||
82 | * Note that pages may be moved off the LRU after we have | ||
83 | * drained them. Those pages will fail to migrate like other | ||
84 | * pages that may be busy. | ||
85 | */ | ||
86 | lru_add_drain_all(); | ||
87 | |||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static inline void move_to_lru(struct page *page) | ||
92 | { | ||
93 | list_del(&page->lru); | ||
94 | if (PageActive(page)) { | ||
95 | /* | ||
96 | * lru_cache_add_active checks that | ||
97 | * the PG_active bit is off. | ||
98 | */ | ||
99 | ClearPageActive(page); | ||
100 | lru_cache_add_active(page); | ||
101 | } else { | ||
102 | lru_cache_add(page); | ||
103 | } | ||
104 | put_page(page); | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Add isolated pages on the list back to the LRU. | ||
109 | * | ||
110 | * returns the number of pages put back. | ||
111 | */ | ||
112 | int putback_lru_pages(struct list_head *l) | ||
113 | { | ||
114 | struct page *page; | ||
115 | struct page *page2; | ||
116 | int count = 0; | ||
117 | |||
118 | list_for_each_entry_safe(page, page2, l, lru) { | ||
119 | move_to_lru(page); | ||
120 | count++; | ||
121 | } | ||
122 | return count; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * Non migratable page | ||
127 | */ | ||
128 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
129 | { | ||
130 | return -EIO; | ||
131 | } | ||
132 | EXPORT_SYMBOL(fail_migrate_page); | ||
133 | |||
134 | /* | ||
135 | * swapout a single page | ||
136 | * page is locked upon entry, unlocked on exit | ||
137 | */ | ||
138 | static int swap_page(struct page *page) | ||
139 | { | ||
140 | struct address_space *mapping = page_mapping(page); | ||
141 | |||
142 | if (page_mapped(page) && mapping) | ||
143 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | ||
144 | goto unlock_retry; | ||
145 | |||
146 | if (PageDirty(page)) { | ||
147 | /* Page is dirty, try to write it out here */ | ||
148 | switch(pageout(page, mapping)) { | ||
149 | case PAGE_KEEP: | ||
150 | case PAGE_ACTIVATE: | ||
151 | goto unlock_retry; | ||
152 | |||
153 | case PAGE_SUCCESS: | ||
154 | goto retry; | ||
155 | |||
156 | case PAGE_CLEAN: | ||
157 | ; /* try to free the page below */ | ||
158 | } | ||
159 | } | ||
160 | |||
161 | if (PagePrivate(page)) { | ||
162 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
163 | (!mapping && page_count(page) == 1)) | ||
164 | goto unlock_retry; | ||
165 | } | ||
166 | |||
167 | if (remove_mapping(mapping, page)) { | ||
168 | /* Success */ | ||
169 | unlock_page(page); | ||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | unlock_retry: | ||
174 | unlock_page(page); | ||
175 | |||
176 | retry: | ||
177 | return -EAGAIN; | ||
178 | } | ||
179 | EXPORT_SYMBOL(swap_page); | ||
180 | |||
181 | /* | ||
182 | * Remove references for a page and establish the new page with the correct | ||
183 | * basic settings to be able to stop accesses to the page. | ||
184 | */ | ||
185 | int migrate_page_remove_references(struct page *newpage, | ||
186 | struct page *page, int nr_refs) | ||
187 | { | ||
188 | struct address_space *mapping = page_mapping(page); | ||
189 | struct page **radix_pointer; | ||
190 | |||
191 | /* | ||
192 | * Avoid doing any of the following work if the page count | ||
193 | * indicates that the page is in use or truncate has removed | ||
194 | * the page. | ||
195 | */ | ||
196 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
197 | return -EAGAIN; | ||
198 | |||
199 | /* | ||
200 | * Establish swap ptes for anonymous pages or destroy pte | ||
201 | * maps for files. | ||
202 | * | ||
203 | * In order to reestablish file backed mappings the fault handlers | ||
204 | * will take the radix tree_lock which may then be used to stop | ||
205 | * processes from accessing this page until the new page is ready. | ||
206 | * | ||
207 | * A process accessing via a swap pte (an anonymous page) will take a | ||
208 | * page_lock on the old page which will block the process until the | ||
209 | * migration attempt is complete. At that time the PageSwapCache bit | ||
210 | * will be examined. If the page was migrated then the PageSwapCache | ||
211 | * bit will be clear and the operation to retrieve the page will be | ||
212 | * retried which will find the new page in the radix tree. Then a new | ||
213 | * direct mapping may be generated based on the radix tree contents. | ||
214 | * | ||
215 | * If the page was not migrated then the PageSwapCache bit | ||
216 | * is still set and the operation may continue. | ||
217 | */ | ||
218 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
219 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
220 | return -EPERM; | ||
221 | |||
222 | /* | ||
223 | * Give up if we were unable to remove all mappings. | ||
224 | */ | ||
225 | if (page_mapcount(page)) | ||
226 | return -EAGAIN; | ||
227 | |||
228 | write_lock_irq(&mapping->tree_lock); | ||
229 | |||
230 | radix_pointer = (struct page **)radix_tree_lookup_slot( | ||
231 | &mapping->page_tree, | ||
232 | page_index(page)); | ||
233 | |||
234 | if (!page_mapping(page) || page_count(page) != nr_refs || | ||
235 | *radix_pointer != page) { | ||
236 | write_unlock_irq(&mapping->tree_lock); | ||
237 | return 1; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Now we know that no one else is looking at the page. | ||
242 | * | ||
243 | * Certain minimal information about a page must be available | ||
244 | * in order for other subsystems to properly handle the page if they | ||
245 | * find it through the radix tree update before we are finished | ||
246 | * copying the page. | ||
247 | */ | ||
248 | get_page(newpage); | ||
249 | newpage->index = page->index; | ||
250 | newpage->mapping = page->mapping; | ||
251 | if (PageSwapCache(page)) { | ||
252 | SetPageSwapCache(newpage); | ||
253 | set_page_private(newpage, page_private(page)); | ||
254 | } | ||
255 | |||
256 | *radix_pointer = newpage; | ||
257 | __put_page(page); | ||
258 | write_unlock_irq(&mapping->tree_lock); | ||
259 | |||
260 | return 0; | ||
261 | } | ||
262 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
263 | |||
264 | /* | ||
265 | * Copy the page to its new location | ||
266 | */ | ||
267 | void migrate_page_copy(struct page *newpage, struct page *page) | ||
268 | { | ||
269 | copy_highpage(newpage, page); | ||
270 | |||
271 | if (PageError(page)) | ||
272 | SetPageError(newpage); | ||
273 | if (PageReferenced(page)) | ||
274 | SetPageReferenced(newpage); | ||
275 | if (PageUptodate(page)) | ||
276 | SetPageUptodate(newpage); | ||
277 | if (PageActive(page)) | ||
278 | SetPageActive(newpage); | ||
279 | if (PageChecked(page)) | ||
280 | SetPageChecked(newpage); | ||
281 | if (PageMappedToDisk(page)) | ||
282 | SetPageMappedToDisk(newpage); | ||
283 | |||
284 | if (PageDirty(page)) { | ||
285 | clear_page_dirty_for_io(page); | ||
286 | set_page_dirty(newpage); | ||
287 | } | ||
288 | |||
289 | ClearPageSwapCache(page); | ||
290 | ClearPageActive(page); | ||
291 | ClearPagePrivate(page); | ||
292 | set_page_private(page, 0); | ||
293 | page->mapping = NULL; | ||
294 | |||
295 | /* | ||
296 | * If any waiters have accumulated on the new page then | ||
297 | * wake them up. | ||
298 | */ | ||
299 | if (PageWriteback(newpage)) | ||
300 | end_page_writeback(newpage); | ||
301 | } | ||
302 | EXPORT_SYMBOL(migrate_page_copy); | ||
303 | |||
304 | /* | ||
305 | * Common logic to directly migrate a single page suitable for | ||
306 | * pages that do not use PagePrivate. | ||
307 | * | ||
308 | * Pages are locked upon entry and exit. | ||
309 | */ | ||
310 | int migrate_page(struct page *newpage, struct page *page) | ||
311 | { | ||
312 | int rc; | ||
313 | |||
314 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | ||
315 | |||
316 | rc = migrate_page_remove_references(newpage, page, 2); | ||
317 | |||
318 | if (rc) | ||
319 | return rc; | ||
320 | |||
321 | migrate_page_copy(newpage, page); | ||
322 | |||
323 | /* | ||
324 | * Remove auxiliary swap entries and replace | ||
325 | * them with real ptes. | ||
326 | * | ||
327 | * Note that a real pte entry will allow processes that are not | ||
328 | * waiting on the page lock to use the new page via the page tables | ||
329 | * before the new page is unlocked. | ||
330 | */ | ||
331 | remove_from_swap(newpage); | ||
332 | return 0; | ||
333 | } | ||
334 | EXPORT_SYMBOL(migrate_page); | ||
335 | |||
336 | /* | ||
337 | * migrate_pages | ||
338 | * | ||
339 | * Two lists are passed to this function. The first list | ||
340 | * contains the pages isolated from the LRU to be migrated. | ||
341 | * The second list contains new pages that the pages isolated | ||
342 | * can be moved to. If the second list is NULL then all | ||
343 | * pages are swapped out. | ||
344 | * | ||
345 | * The function returns after 10 attempts or if no pages | ||
346 | * are movable anymore because "to" has become empty | ||
347 | * or no retryable pages exist anymore. | ||
348 | * | ||
349 | * Return: Number of pages not migrated when "to" ran empty. | ||
350 | */ | ||
351 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
352 | struct list_head *moved, struct list_head *failed) | ||
353 | { | ||
354 | int retry; | ||
355 | int nr_failed = 0; | ||
356 | int pass = 0; | ||
357 | struct page *page; | ||
358 | struct page *page2; | ||
359 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
360 | int rc; | ||
361 | |||
362 | if (!swapwrite) | ||
363 | current->flags |= PF_SWAPWRITE; | ||
364 | |||
365 | redo: | ||
366 | retry = 0; | ||
367 | |||
368 | list_for_each_entry_safe(page, page2, from, lru) { | ||
369 | struct page *newpage = NULL; | ||
370 | struct address_space *mapping; | ||
371 | |||
372 | cond_resched(); | ||
373 | |||
374 | rc = 0; | ||
375 | if (page_count(page) == 1) | ||
376 | /* page was freed from under us. So we are done. */ | ||
377 | goto next; | ||
378 | |||
379 | if (to && list_empty(to)) | ||
380 | break; | ||
381 | |||
382 | /* | ||
383 | * Skip locked pages during the first three passes to give the | ||
384 | * functions holding the lock time to release the page. Later we | ||
385 | * use lock_page() to have a higher chance of acquiring the | ||
386 | * lock. | ||
387 | */ | ||
388 | rc = -EAGAIN; | ||
389 | if (pass > 2) | ||
390 | lock_page(page); | ||
391 | else | ||
392 | if (TestSetPageLocked(page)) | ||
393 | goto next; | ||
394 | |||
395 | /* | ||
396 | * Only wait on writeback if we have already done a pass where | ||
397 | * we may have triggered writeouts for lots of pages. | ||
398 | */ | ||
399 | if (pass > 0) { | ||
400 | wait_on_page_writeback(page); | ||
401 | } else { | ||
402 | if (PageWriteback(page)) | ||
403 | goto unlock_page; | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * Anonymous pages must have swap cache references otherwise | ||
408 | * the information contained in the page maps cannot be | ||
409 | * preserved. | ||
410 | */ | ||
411 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
412 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
413 | rc = -ENOMEM; | ||
414 | goto unlock_page; | ||
415 | } | ||
416 | } | ||
417 | |||
418 | if (!to) { | ||
419 | rc = swap_page(page); | ||
420 | goto next; | ||
421 | } | ||
422 | |||
423 | newpage = lru_to_page(to); | ||
424 | lock_page(newpage); | ||
425 | |||
426 | /* | ||
427 | * Pages are properly locked and writeback is complete. | ||
428 | * Try to migrate the page. | ||
429 | */ | ||
430 | mapping = page_mapping(page); | ||
431 | if (!mapping) | ||
432 | goto unlock_both; | ||
433 | |||
434 | if (mapping->a_ops->migratepage) { | ||
435 | /* | ||
436 | * Most pages have a mapping and most filesystems | ||
437 | * should provide a migration function. Anonymous | ||
438 | * pages are part of swap space which also has its | ||
439 | * own migration function. This is the most common | ||
440 | * path for page migration. | ||
441 | */ | ||
442 | rc = mapping->a_ops->migratepage(newpage, page); | ||
443 | goto unlock_both; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Default handling if a filesystem does not provide | ||
448 | * a migration function. We can only migrate clean | ||
449 | * pages so try to write out any dirty pages first. | ||
450 | */ | ||
451 | if (PageDirty(page)) { | ||
452 | switch (pageout(page, mapping)) { | ||
453 | case PAGE_KEEP: | ||
454 | case PAGE_ACTIVATE: | ||
455 | goto unlock_both; | ||
456 | |||
457 | case PAGE_SUCCESS: | ||
458 | unlock_page(newpage); | ||
459 | goto next; | ||
460 | |||
461 | case PAGE_CLEAN: | ||
462 | ; /* try to migrate the page below */ | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * Buffers are managed in a filesystem specific way. | ||
468 | * We must have no buffers or drop them. | ||
469 | */ | ||
470 | if (!page_has_buffers(page) || | ||
471 | try_to_release_page(page, GFP_KERNEL)) { | ||
472 | rc = migrate_page(newpage, page); | ||
473 | goto unlock_both; | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * On early passes with mapped pages simply | ||
478 | * retry. There may be a lock held for some | ||
479 | * buffers that may go away. Later | ||
480 | * swap them out. | ||
481 | */ | ||
482 | if (pass > 4) { | ||
483 | /* | ||
484 | * Persistently unable to drop buffers..... As a | ||
485 | * measure of last resort we fall back to | ||
486 | * swap_page(). | ||
487 | */ | ||
488 | unlock_page(newpage); | ||
489 | newpage = NULL; | ||
490 | rc = swap_page(page); | ||
491 | goto next; | ||
492 | } | ||
493 | |||
494 | unlock_both: | ||
495 | unlock_page(newpage); | ||
496 | |||
497 | unlock_page: | ||
498 | unlock_page(page); | ||
499 | |||
500 | next: | ||
501 | if (rc == -EAGAIN) { | ||
502 | retry++; | ||
503 | } else if (rc) { | ||
504 | /* Permanent failure */ | ||
505 | list_move(&page->lru, failed); | ||
506 | nr_failed++; | ||
507 | } else { | ||
508 | if (newpage) { | ||
509 | /* Successful migration. Return page to LRU */ | ||
510 | move_to_lru(newpage); | ||
511 | } | ||
512 | list_move(&page->lru, moved); | ||
513 | } | ||
514 | } | ||
515 | if (retry && pass++ < 10) | ||
516 | goto redo; | ||
517 | |||
518 | if (!swapwrite) | ||
519 | current->flags &= ~PF_SWAPWRITE; | ||
520 | |||
521 | return nr_failed + retry; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Migration function for pages with buffers. This function can only be used | ||
526 | * if the underlying filesystem guarantees that no other references to "page" | ||
527 | * exist. | ||
528 | */ | ||
529 | int buffer_migrate_page(struct page *newpage, struct page *page) | ||
530 | { | ||
531 | struct address_space *mapping = page->mapping; | ||
532 | struct buffer_head *bh, *head; | ||
533 | int rc; | ||
534 | |||
535 | if (!mapping) | ||
536 | return -EAGAIN; | ||
537 | |||
538 | if (!page_has_buffers(page)) | ||
539 | return migrate_page(newpage, page); | ||
540 | |||
541 | head = page_buffers(page); | ||
542 | |||
543 | rc = migrate_page_remove_references(newpage, page, 3); | ||
544 | |||
545 | if (rc) | ||
546 | return rc; | ||
547 | |||
548 | bh = head; | ||
549 | do { | ||
550 | get_bh(bh); | ||
551 | lock_buffer(bh); | ||
552 | bh = bh->b_this_page; | ||
553 | |||
554 | } while (bh != head); | ||
555 | |||
556 | ClearPagePrivate(page); | ||
557 | set_page_private(newpage, page_private(page)); | ||
558 | set_page_private(page, 0); | ||
559 | put_page(page); | ||
560 | get_page(newpage); | ||
561 | |||
562 | bh = head; | ||
563 | do { | ||
564 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
565 | bh = bh->b_this_page; | ||
566 | |||
567 | } while (bh != head); | ||
568 | |||
569 | SetPagePrivate(newpage); | ||
570 | |||
571 | migrate_page_copy(newpage, page); | ||
572 | |||
573 | bh = head; | ||
574 | do { | ||
575 | unlock_buffer(bh); | ||
576 | put_bh(bh); | ||
577 | bh = bh->b_this_page; | ||
578 | |||
579 | } while (bh != head); | ||
580 | |||
581 | return 0; | ||
582 | } | ||
583 | EXPORT_SYMBOL(buffer_migrate_page); | ||
584 | |||
585 | /* | ||
586 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
587 | * | ||
588 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
589 | * Return the number of pages not migrated or error code | ||
590 | */ | ||
591 | int migrate_pages_to(struct list_head *pagelist, | ||
592 | struct vm_area_struct *vma, int dest) | ||
593 | { | ||
594 | LIST_HEAD(newlist); | ||
595 | LIST_HEAD(moved); | ||
596 | LIST_HEAD(failed); | ||
597 | int err = 0; | ||
598 | unsigned long offset = 0; | ||
599 | int nr_pages; | ||
600 | struct page *page; | ||
601 | struct list_head *p; | ||
602 | |||
603 | redo: | ||
604 | nr_pages = 0; | ||
605 | list_for_each(p, pagelist) { | ||
606 | if (vma) { | ||
607 | /* | ||
608 | * The address passed to alloc_page_vma is used to | ||
609 | * generate the proper interleave behavior. We fake | ||
610 | * the address here by an increasing offset in order | ||
611 | * to get the proper distribution of pages. | ||
612 | * | ||
613 | * No decision has been made as to which page | ||
614 | * a certain old page is moved to so we cannot | ||
615 | * specify the correct address. | ||
616 | */ | ||
617 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
618 | offset + vma->vm_start); | ||
619 | offset += PAGE_SIZE; | ||
620 | } | ||
621 | else | ||
622 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
623 | |||
624 | if (!page) { | ||
625 | err = -ENOMEM; | ||
626 | goto out; | ||
627 | } | ||
628 | list_add_tail(&page->lru, &newlist); | ||
629 | nr_pages++; | ||
630 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
631 | break; | ||
632 | } | ||
633 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
634 | |||
635 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
636 | |||
637 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
638 | goto redo; | ||
639 | out: | ||
640 | /* Return leftover allocated pages */ | ||
641 | while (!list_empty(&newlist)) { | ||
642 | page = list_entry(newlist.next, struct page, lru); | ||
643 | list_del(&page->lru); | ||
644 | __free_page(page); | ||
645 | } | ||
646 | list_splice(&failed, pagelist); | ||
647 | if (err < 0) | ||
648 | return err; | ||
649 | |||
650 | /* Calculate number of leftover pages */ | ||
651 | nr_pages = 0; | ||
652 | list_for_each(p, pagelist) | ||
653 | nr_pages++; | ||
654 | return nr_pages; | ||
655 | } | ||
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
612 | * If the vma has a ->close operation then the driver probably needs to release | 612 | * If the vma has a ->close operation then the driver probably needs to release |
613 | * per-vma resources, so we don't attempt to merge those. | 613 | * per-vma resources, so we don't attempt to merge those. |
614 | */ | 614 | */ |
615 | #define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | 615 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) |
616 | 616 | ||
617 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 617 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
618 | struct file *file, unsigned long vm_flags) | 618 | struct file *file, unsigned long vm_flags) |
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
845 | const unsigned long stack_flags | 845 | const unsigned long stack_flags |
846 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 846 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
847 | 847 | ||
848 | #ifdef CONFIG_HUGETLB | ||
849 | if (flags & VM_HUGETLB) { | ||
850 | if (!(flags & VM_DONTCOPY)) | ||
851 | mm->shared_vm += pages; | ||
852 | return; | ||
853 | } | ||
854 | #endif /* CONFIG_HUGETLB */ | ||
855 | |||
856 | if (file) { | 848 | if (file) { |
857 | mm->shared_vm += pages; | 849 | mm->shared_vm += pages; |
858 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 850 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1..4c14d4289b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
124 | * a MAP_NORESERVE private mapping to writable will now reserve. | 124 | * a MAP_NORESERVE private mapping to writable will now reserve. |
125 | */ | 125 | */ |
126 | if (newflags & VM_WRITE) { | 126 | if (newflags & VM_WRITE) { |
127 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { | 127 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { |
128 | charged = nrpages; | 128 | charged = nrpages; |
129 | if (security_vm_enough_memory(charged)) | 129 | if (security_vm_enough_memory(charged)) |
130 | return -ENOMEM; | 130 | return -ENOMEM; |
@@ -166,7 +166,10 @@ success: | |||
166 | */ | 166 | */ |
167 | vma->vm_flags = newflags; | 167 | vma->vm_flags = newflags; |
168 | vma->vm_page_prot = newprot; | 168 | vma->vm_page_prot = newprot; |
169 | change_protection(vma, start, end, newprot); | 169 | if (is_vm_hugetlb_page(vma)) |
170 | hugetlb_change_protection(vma, start, end, newprot); | ||
171 | else | ||
172 | change_protection(vma, start, end, newprot); | ||
170 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 173 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
171 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 174 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
172 | return 0; | 175 | return 0; |
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) | |||
240 | 243 | ||
241 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 244 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
242 | 245 | ||
243 | if (is_vm_hugetlb_page(vma)) { | ||
244 | error = -EACCES; | ||
245 | goto out; | ||
246 | } | ||
247 | |||
248 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | 246 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); |
249 | 247 | ||
250 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ | 248 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ |
diff --git a/mm/nommu.c b/mm/nommu.c index 4951f4786f..db45efac17 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
159 | /* | 159 | /* |
160 | * kmalloc doesn't like __GFP_HIGHMEM for some reason | 160 | * kmalloc doesn't like __GFP_HIGHMEM for some reason |
161 | */ | 161 | */ |
162 | return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); | 162 | return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); |
163 | } | 163 | } |
164 | 164 | ||
165 | struct page * vmalloc_to_page(void *addr) | 165 | struct page * vmalloc_to_page(void *addr) |
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
623 | * - note that this may not return a page-aligned address if the object | 623 | * - note that this may not return a page-aligned address if the object |
624 | * we're allocating is smaller than a page | 624 | * we're allocating is smaller than a page |
625 | */ | 625 | */ |
626 | base = kmalloc(len, GFP_KERNEL); | 626 | base = kmalloc(len, GFP_KERNEL|__GFP_COMP); |
627 | if (!base) | 627 | if (!base) |
628 | goto enomem; | 628 | goto enomem; |
629 | 629 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd4895d..b7f14a4799 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly; | |||
55 | long nr_swap_pages; | 55 | long nr_swap_pages; |
56 | int percpu_pagelist_fraction; | 56 | int percpu_pagelist_fraction; |
57 | 57 | ||
58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
59 | static void __free_pages_ok(struct page *page, unsigned int order); | 58 | static void __free_pages_ok(struct page *page, unsigned int order); |
60 | 59 | ||
61 | /* | 60 | /* |
@@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
190 | for (i = 0; i < nr_pages; i++) { | 189 | for (i = 0; i < nr_pages; i++) { |
191 | struct page *p = page + i; | 190 | struct page *p = page + i; |
192 | 191 | ||
193 | SetPageCompound(p); | 192 | __SetPageCompound(p); |
194 | set_page_private(p, (unsigned long)page); | 193 | set_page_private(p, (unsigned long)page); |
195 | } | 194 | } |
196 | } | 195 | } |
@@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
209 | if (unlikely(!PageCompound(p) | | 208 | if (unlikely(!PageCompound(p) | |
210 | (page_private(p) != (unsigned long)page))) | 209 | (page_private(p) != (unsigned long)page))) |
211 | bad_page(page); | 210 | bad_page(page); |
212 | ClearPageCompound(p); | 211 | __ClearPageCompound(p); |
213 | } | 212 | } |
214 | } | 213 | } |
215 | 214 | ||
215 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | ||
216 | { | ||
217 | int i; | ||
218 | |||
219 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | ||
220 | /* | ||
221 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | ||
222 | * and __GFP_HIGHMEM from hard or soft interrupt context. | ||
223 | */ | ||
224 | BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | ||
225 | for (i = 0; i < (1 << order); i++) | ||
226 | clear_highpage(page + i); | ||
227 | } | ||
228 | |||
216 | /* | 229 | /* |
217 | * function for dealing with page's order in buddy system. | 230 | * function for dealing with page's order in buddy system. |
218 | * zone->lock is already acquired when we use these. | 231 | * zone->lock is already acquired when we use these. |
@@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
423 | mutex_debug_check_no_locks_freed(page_address(page), | 436 | mutex_debug_check_no_locks_freed(page_address(page), |
424 | PAGE_SIZE<<order); | 437 | PAGE_SIZE<<order); |
425 | 438 | ||
426 | #ifndef CONFIG_MMU | ||
427 | for (i = 1 ; i < (1 << order) ; ++i) | ||
428 | __put_page(page + i); | ||
429 | #endif | ||
430 | |||
431 | for (i = 0 ; i < (1 << order) ; ++i) | 439 | for (i = 0 ; i < (1 << order) ; ++i) |
432 | reserved += free_pages_check(page + i); | 440 | reserved += free_pages_check(page + i); |
433 | if (reserved) | 441 | if (reserved) |
@@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
448 | if (order == 0) { | 456 | if (order == 0) { |
449 | __ClearPageReserved(page); | 457 | __ClearPageReserved(page); |
450 | set_page_count(page, 0); | 458 | set_page_count(page, 0); |
451 | 459 | set_page_refcounted(page); | |
452 | free_hot_cold_page(page, 0); | 460 | __free_page(page); |
453 | } else { | 461 | } else { |
454 | LIST_HEAD(list); | ||
455 | int loop; | 462 | int loop; |
456 | 463 | ||
464 | prefetchw(page); | ||
457 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | 465 | for (loop = 0; loop < BITS_PER_LONG; loop++) { |
458 | struct page *p = &page[loop]; | 466 | struct page *p = &page[loop]; |
459 | 467 | ||
460 | if (loop + 16 < BITS_PER_LONG) | 468 | if (loop + 1 < BITS_PER_LONG) |
461 | prefetchw(p + 16); | 469 | prefetchw(p + 1); |
462 | __ClearPageReserved(p); | 470 | __ClearPageReserved(p); |
463 | set_page_count(p, 0); | 471 | set_page_count(p, 0); |
464 | } | 472 | } |
465 | 473 | ||
466 | arch_free_page(page, order); | 474 | set_page_refcounted(page); |
467 | 475 | __free_pages(page, order); | |
468 | mod_page_state(pgfree, 1 << order); | ||
469 | |||
470 | list_add(&page->lru, &list); | ||
471 | kernel_map_pages(page, 1 << order, 0); | ||
472 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
473 | } | 476 | } |
474 | } | 477 | } |
475 | 478 | ||
@@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
507 | /* | 510 | /* |
508 | * This page is about to be returned from the page allocator | 511 | * This page is about to be returned from the page allocator |
509 | */ | 512 | */ |
510 | static int prep_new_page(struct page *page, int order) | 513 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
511 | { | 514 | { |
512 | if (unlikely(page_mapcount(page) | | 515 | if (unlikely(page_mapcount(page) | |
513 | (page->mapping != NULL) | | 516 | (page->mapping != NULL) | |
@@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order) | |||
536 | 1 << PG_referenced | 1 << PG_arch_1 | | 539 | 1 << PG_referenced | 1 << PG_arch_1 | |
537 | 1 << PG_checked | 1 << PG_mappedtodisk); | 540 | 1 << PG_checked | 1 << PG_mappedtodisk); |
538 | set_page_private(page, 0); | 541 | set_page_private(page, 0); |
539 | set_page_refs(page, order); | 542 | set_page_refcounted(page); |
540 | kernel_map_pages(page, 1 << order, 1); | 543 | kernel_map_pages(page, 1 << order, 1); |
544 | |||
545 | if (gfp_flags & __GFP_ZERO) | ||
546 | prep_zero_page(page, order, gfp_flags); | ||
547 | |||
548 | if (order && (gfp_flags & __GFP_COMP)) | ||
549 | prep_compound_page(page, order); | ||
550 | |||
541 | return 0; | 551 | return 0; |
542 | } | 552 | } |
543 | 553 | ||
@@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
593 | /* | 603 | /* |
594 | * Called from the slab reaper to drain pagesets on a particular node that | 604 | * Called from the slab reaper to drain pagesets on a particular node that |
595 | * belong to the currently executing processor. | 605 | * belong to the currently executing processor. |
606 | * Note that this function must be called with the thread pinned to | ||
607 | * a single processor. | ||
596 | */ | 608 | */ |
597 | void drain_node_pages(int nodeid) | 609 | void drain_node_pages(int nodeid) |
598 | { | 610 | { |
599 | int i, z; | 611 | int i, z; |
600 | unsigned long flags; | 612 | unsigned long flags; |
601 | 613 | ||
602 | local_irq_save(flags); | ||
603 | for (z = 0; z < MAX_NR_ZONES; z++) { | 614 | for (z = 0; z < MAX_NR_ZONES; z++) { |
604 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 615 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; |
605 | struct per_cpu_pageset *pset; | 616 | struct per_cpu_pageset *pset; |
@@ -609,11 +620,14 @@ void drain_node_pages(int nodeid) | |||
609 | struct per_cpu_pages *pcp; | 620 | struct per_cpu_pages *pcp; |
610 | 621 | ||
611 | pcp = &pset->pcp[i]; | 622 | pcp = &pset->pcp[i]; |
612 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 623 | if (pcp->count) { |
613 | pcp->count = 0; | 624 | local_irq_save(flags); |
625 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | ||
626 | pcp->count = 0; | ||
627 | local_irq_restore(flags); | ||
628 | } | ||
614 | } | 629 | } |
615 | } | 630 | } |
616 | local_irq_restore(flags); | ||
617 | } | 631 | } |
618 | #endif | 632 | #endif |
619 | 633 | ||
@@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page) | |||
743 | free_hot_cold_page(page, 1); | 757 | free_hot_cold_page(page, 1); |
744 | } | 758 | } |
745 | 759 | ||
746 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 760 | /* |
761 | * split_page takes a non-compound higher-order page, and splits it into | ||
762 | * n (1<<order) sub-pages: page[0..n-1] | ||
763 | * Each sub-page must be freed individually. | ||
764 | * | ||
765 | * Note: this is probably too low level an operation for use in drivers. | ||
766 | * Please consult with lkml before using this in your driver. | ||
767 | */ | ||
768 | void split_page(struct page *page, unsigned int order) | ||
747 | { | 769 | { |
748 | int i; | 770 | int i; |
749 | 771 | ||
750 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 772 | BUG_ON(PageCompound(page)); |
751 | for(i = 0; i < (1 << order); i++) | 773 | BUG_ON(!page_count(page)); |
752 | clear_highpage(page + i); | 774 | for (i = 1; i < (1 << order); i++) |
775 | set_page_refcounted(page + i); | ||
753 | } | 776 | } |
754 | 777 | ||
755 | /* | 778 | /* |
@@ -795,14 +818,8 @@ again: | |||
795 | put_cpu(); | 818 | put_cpu(); |
796 | 819 | ||
797 | BUG_ON(bad_range(zone, page)); | 820 | BUG_ON(bad_range(zone, page)); |
798 | if (prep_new_page(page, order)) | 821 | if (prep_new_page(page, order, gfp_flags)) |
799 | goto again; | 822 | goto again; |
800 | |||
801 | if (gfp_flags & __GFP_ZERO) | ||
802 | prep_zero_page(page, order, gfp_flags); | ||
803 | |||
804 | if (order && (gfp_flags & __GFP_COMP)) | ||
805 | prep_compound_page(page, order); | ||
806 | return page; | 823 | return page; |
807 | 824 | ||
808 | failed: | 825 | failed: |
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | |||
1214 | 1231 | ||
1215 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1232 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
1216 | { | 1233 | { |
1217 | int cpu = 0; | 1234 | unsigned cpu; |
1218 | 1235 | ||
1219 | memset(ret, 0, nr * sizeof(unsigned long)); | 1236 | memset(ret, 0, nr * sizeof(unsigned long)); |
1220 | cpus_and(*cpumask, *cpumask, cpu_online_map); | 1237 | cpus_and(*cpumask, *cpumask, cpu_online_map); |
1221 | 1238 | ||
1222 | cpu = first_cpu(*cpumask); | 1239 | for_each_cpu_mask(cpu, *cpumask) { |
1223 | while (cpu < NR_CPUS) { | 1240 | unsigned long *in; |
1224 | unsigned long *in, *out, off; | 1241 | unsigned long *out; |
1225 | 1242 | unsigned off; | |
1226 | if (!cpu_isset(cpu, *cpumask)) | 1243 | unsigned next_cpu; |
1227 | continue; | ||
1228 | 1244 | ||
1229 | in = (unsigned long *)&per_cpu(page_states, cpu); | 1245 | in = (unsigned long *)&per_cpu(page_states, cpu); |
1230 | 1246 | ||
1231 | cpu = next_cpu(cpu, *cpumask); | 1247 | next_cpu = next_cpu(cpu, *cpumask); |
1232 | 1248 | if (likely(next_cpu < NR_CPUS)) | |
1233 | if (likely(cpu < NR_CPUS)) | 1249 | prefetch(&per_cpu(page_states, next_cpu)); |
1234 | prefetch(&per_cpu(page_states, cpu)); | ||
1235 | 1250 | ||
1236 | out = (unsigned long *)ret; | 1251 | out = (unsigned long *)ret; |
1237 | for (off = 0; off < nr; off++) | 1252 | for (off = 0; off < nr; off++) |
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1764 | continue; | 1779 | continue; |
1765 | page = pfn_to_page(pfn); | 1780 | page = pfn_to_page(pfn); |
1766 | set_page_links(page, zone, nid, pfn); | 1781 | set_page_links(page, zone, nid, pfn); |
1767 | set_page_count(page, 1); | 1782 | init_page_count(page); |
1768 | reset_page_mapcount(page); | 1783 | reset_page_mapcount(page); |
1769 | SetPageReserved(page); | 1784 | SetPageReserved(page); |
1770 | INIT_LIST_HEAD(&page->lru); | 1785 | INIT_LIST_HEAD(&page->lru); |
diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaaa62..301b36c4a0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) | |||
52 | return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 52 | return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
53 | } | 53 | } |
54 | 54 | ||
55 | static inline void reset_ahead_window(struct file_ra_state *ra) | ||
56 | { | ||
57 | /* | ||
58 | * ... but preserve ahead_start + ahead_size value, | ||
59 | * see 'recheck:' label in page_cache_readahead(). | ||
60 | * Note: We never use ->ahead_size as rvalue without | ||
61 | * checking ->ahead_start != 0 first. | ||
62 | */ | ||
63 | ra->ahead_size += ra->ahead_start; | ||
64 | ra->ahead_start = 0; | ||
65 | } | ||
66 | |||
55 | static inline void ra_off(struct file_ra_state *ra) | 67 | static inline void ra_off(struct file_ra_state *ra) |
56 | { | 68 | { |
57 | ra->start = 0; | 69 | ra->start = 0; |
58 | ra->flags = 0; | 70 | ra->flags = 0; |
59 | ra->size = 0; | 71 | ra->size = 0; |
60 | ra->ahead_start = 0; | 72 | reset_ahead_window(ra); |
61 | ra->ahead_size = 0; | ||
62 | return; | 73 | return; |
63 | } | 74 | } |
64 | 75 | ||
@@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |||
72 | { | 83 | { |
73 | unsigned long newsize = roundup_pow_of_two(size); | 84 | unsigned long newsize = roundup_pow_of_two(size); |
74 | 85 | ||
75 | if (newsize <= max / 64) | 86 | if (newsize <= max / 32) |
76 | newsize = newsize * newsize; | 87 | newsize = newsize * 4; |
77 | else if (newsize <= max / 4) | 88 | else if (newsize <= max / 4) |
78 | newsize = max / 4; | 89 | newsize = newsize * 2; |
79 | else | 90 | else |
80 | newsize = max; | 91 | newsize = max; |
81 | return newsize; | 92 | return newsize; |
@@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, | |||
426 | * congestion. The ahead window will anyway be closed | 437 | * congestion. The ahead window will anyway be closed |
427 | * in case we failed due to excessive page cache hits. | 438 | * in case we failed due to excessive page cache hits. |
428 | */ | 439 | */ |
429 | ra->ahead_start = 0; | 440 | reset_ahead_window(ra); |
430 | ra->ahead_size = 0; | ||
431 | } | 441 | } |
432 | 442 | ||
433 | return ret; | 443 | return ret; |
@@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |||
520 | * If we get here we are doing sequential IO and this was not the first | 530 | * If we get here we are doing sequential IO and this was not the first |
521 | * occurrence (ie we have an existing window) | 531 | * occurrence (ie we have an existing window) |
522 | */ | 532 | */ |
523 | |||
524 | if (ra->ahead_start == 0) { /* no ahead window yet */ | 533 | if (ra->ahead_start == 0) { /* no ahead window yet */ |
525 | if (!make_ahead_window(mapping, filp, ra, 0)) | 534 | if (!make_ahead_window(mapping, filp, ra, 0)) |
526 | goto out; | 535 | goto recheck; |
527 | } | 536 | } |
537 | |||
528 | /* | 538 | /* |
529 | * Already have an ahead window, check if we crossed into it. | 539 | * Already have an ahead window, check if we crossed into it. |
530 | * If so, shift windows and issue a new ahead window. | 540 | * If so, shift windows and issue a new ahead window. |
@@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |||
536 | ra->start = ra->ahead_start; | 546 | ra->start = ra->ahead_start; |
537 | ra->size = ra->ahead_size; | 547 | ra->size = ra->ahead_size; |
538 | make_ahead_window(mapping, filp, ra, 0); | 548 | make_ahead_window(mapping, filp, ra, 0); |
549 | recheck: | ||
550 | /* prev_page shouldn't overrun the ahead window */ | ||
551 | ra->prev_page = min(ra->prev_page, | ||
552 | ra->ahead_start + ra->ahead_size - 1); | ||
539 | } | 553 | } |
540 | 554 | ||
541 | out: | 555 | out: |
@@ -56,13 +56,11 @@ | |||
56 | 56 | ||
57 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
58 | 58 | ||
59 | //#define RMAP_DEBUG /* can be enabled only for debugging */ | 59 | struct kmem_cache *anon_vma_cachep; |
60 | |||
61 | kmem_cache_t *anon_vma_cachep; | ||
62 | 60 | ||
63 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) | 61 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) |
64 | { | 62 | { |
65 | #ifdef RMAP_DEBUG | 63 | #ifdef CONFIG_DEBUG_VM |
66 | struct anon_vma *anon_vma = find_vma->anon_vma; | 64 | struct anon_vma *anon_vma = find_vma->anon_vma; |
67 | struct vm_area_struct *vma; | 65 | struct vm_area_struct *vma; |
68 | unsigned int mapcount = 0; | 66 | unsigned int mapcount = 0; |
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
166 | anon_vma_free(anon_vma); | 164 | anon_vma_free(anon_vma); |
167 | } | 165 | } |
168 | 166 | ||
169 | static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) | 167 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, |
168 | unsigned long flags) | ||
170 | { | 169 | { |
171 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 170 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == |
172 | SLAB_CTOR_CONSTRUCTOR) { | 171 | SLAB_CTOR_CONSTRUCTOR) { |
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page) | |||
550 | void page_remove_rmap(struct page *page) | 549 | void page_remove_rmap(struct page *page) |
551 | { | 550 | { |
552 | if (atomic_add_negative(-1, &page->_mapcount)) { | 551 | if (atomic_add_negative(-1, &page->_mapcount)) { |
553 | if (page_mapcount(page) < 0) { | 552 | #ifdef CONFIG_DEBUG_VM |
553 | if (unlikely(page_mapcount(page) < 0)) { | ||
554 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 554 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
555 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 555 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
556 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 556 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
557 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 557 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
558 | } | 558 | } |
559 | 559 | #endif | |
560 | BUG_ON(page_mapcount(page) < 0); | 560 | BUG_ON(page_mapcount(page) < 0); |
561 | /* | 561 | /* |
562 | * It would be tidy to reset the PageAnon mapping here, | 562 | * It would be tidy to reset the PageAnon mapping here, |
diff --git a/mm/shmem.c b/mm/shmem.c index 7c455fbaff..37eaf42ed2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -875,7 +875,7 @@ redirty: | |||
875 | } | 875 | } |
876 | 876 | ||
877 | #ifdef CONFIG_NUMA | 877 | #ifdef CONFIG_NUMA |
878 | static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 878 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) |
879 | { | 879 | { |
880 | char *nodelist = strchr(value, ':'); | 880 | char *nodelist = strchr(value, ':'); |
881 | int err = 1; | 881 | int err = 1; |
@@ -2119,7 +2119,7 @@ failed: | |||
2119 | return err; | 2119 | return err; |
2120 | } | 2120 | } |
2121 | 2121 | ||
2122 | static kmem_cache_t *shmem_inode_cachep; | 2122 | static struct kmem_cache *shmem_inode_cachep; |
2123 | 2123 | ||
2124 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2124 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2125 | { | 2125 | { |
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2139 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2139 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2140 | } | 2140 | } |
2141 | 2141 | ||
2142 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | 2142 | static void init_once(void *foo, struct kmem_cache *cachep, |
2143 | unsigned long flags) | ||
2143 | { | 2144 | { |
2144 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2145 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2145 | 2146 | ||
@@ -50,7 +50,7 @@ | |||
50 | * The head array is strictly LIFO and should improve the cache hit rates. | 50 | * The head array is strictly LIFO and should improve the cache hit rates. |
51 | * On SMP, it additionally reduces the spinlock operations. | 51 | * On SMP, it additionally reduces the spinlock operations. |
52 | * | 52 | * |
53 | * The c_cpuarray may not be read with enabled local interrupts - | 53 | * The c_cpuarray may not be read with enabled local interrupts - |
54 | * it's changed with a smp_call_function(). | 54 | * it's changed with a smp_call_function(). |
55 | * | 55 | * |
56 | * SMP synchronization: | 56 | * SMP synchronization: |
@@ -170,12 +170,12 @@ | |||
170 | #if DEBUG | 170 | #if DEBUG |
171 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ | 171 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ |
172 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ | 172 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ |
173 | SLAB_NO_REAP | SLAB_CACHE_DMA | \ | 173 | SLAB_CACHE_DMA | \ |
174 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ | 174 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ |
175 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 175 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
176 | SLAB_DESTROY_BY_RCU) | 176 | SLAB_DESTROY_BY_RCU) |
177 | #else | 177 | #else |
178 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ | 178 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
179 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ | 179 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ |
180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
181 | SLAB_DESTROY_BY_RCU) | 181 | SLAB_DESTROY_BY_RCU) |
@@ -266,16 +266,17 @@ struct array_cache { | |||
266 | unsigned int batchcount; | 266 | unsigned int batchcount; |
267 | unsigned int touched; | 267 | unsigned int touched; |
268 | spinlock_t lock; | 268 | spinlock_t lock; |
269 | void *entry[0]; /* | 269 | void *entry[0]; /* |
270 | * Must have this definition in here for the proper | 270 | * Must have this definition in here for the proper |
271 | * alignment of array_cache. Also simplifies accessing | 271 | * alignment of array_cache. Also simplifies accessing |
272 | * the entries. | 272 | * the entries. |
273 | * [0] is for gcc 2.95. It should really be []. | 273 | * [0] is for gcc 2.95. It should really be []. |
274 | */ | 274 | */ |
275 | }; | 275 | }; |
276 | 276 | ||
277 | /* bootstrap: The caches do not work without cpuarrays anymore, | 277 | /* |
278 | * but the cpuarrays are allocated from the generic caches... | 278 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
279 | * cpuarrays are allocated from the generic caches... | ||
279 | */ | 280 | */ |
280 | #define BOOT_CPUCACHE_ENTRIES 1 | 281 | #define BOOT_CPUCACHE_ENTRIES 1 |
281 | struct arraycache_init { | 282 | struct arraycache_init { |
@@ -291,13 +292,13 @@ struct kmem_list3 { | |||
291 | struct list_head slabs_full; | 292 | struct list_head slabs_full; |
292 | struct list_head slabs_free; | 293 | struct list_head slabs_free; |
293 | unsigned long free_objects; | 294 | unsigned long free_objects; |
294 | unsigned long next_reap; | ||
295 | int free_touched; | ||
296 | unsigned int free_limit; | 295 | unsigned int free_limit; |
297 | unsigned int colour_next; /* Per-node cache coloring */ | 296 | unsigned int colour_next; /* Per-node cache coloring */ |
298 | spinlock_t list_lock; | 297 | spinlock_t list_lock; |
299 | struct array_cache *shared; /* shared per node */ | 298 | struct array_cache *shared; /* shared per node */ |
300 | struct array_cache **alien; /* on other nodes */ | 299 | struct array_cache **alien; /* on other nodes */ |
300 | unsigned long next_reap; /* updated without locking */ | ||
301 | int free_touched; /* updated without locking */ | ||
301 | }; | 302 | }; |
302 | 303 | ||
303 | /* | 304 | /* |
@@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | |||
310 | #define SIZE_L3 (1 + MAX_NUMNODES) | 311 | #define SIZE_L3 (1 + MAX_NUMNODES) |
311 | 312 | ||
312 | /* | 313 | /* |
313 | * This function must be completely optimized away if | 314 | * This function must be completely optimized away if a constant is passed to |
314 | * a constant is passed to it. Mostly the same as | 315 | * it. Mostly the same as what is in linux/slab.h except it returns an index. |
315 | * what is in linux/slab.h except it returns an | ||
316 | * index. | ||
317 | */ | 316 | */ |
318 | static __always_inline int index_of(const size_t size) | 317 | static __always_inline int index_of(const size_t size) |
319 | { | 318 | { |
@@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
351 | parent->free_touched = 0; | 350 | parent->free_touched = 0; |
352 | } | 351 | } |
353 | 352 | ||
354 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ | 353 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ |
355 | do { \ | 354 | do { \ |
356 | INIT_LIST_HEAD(listp); \ | 355 | INIT_LIST_HEAD(listp); \ |
357 | list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ | 356 | list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ |
358 | } while (0) | 357 | } while (0) |
359 | 358 | ||
360 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ | 359 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ |
361 | do { \ | 360 | do { \ |
362 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ | 361 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ |
363 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ | 362 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ |
364 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 363 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
@@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
373 | struct kmem_cache { | 372 | struct kmem_cache { |
374 | /* 1) per-cpu data, touched during every alloc/free */ | 373 | /* 1) per-cpu data, touched during every alloc/free */ |
375 | struct array_cache *array[NR_CPUS]; | 374 | struct array_cache *array[NR_CPUS]; |
375 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
376 | unsigned int batchcount; | 376 | unsigned int batchcount; |
377 | unsigned int limit; | 377 | unsigned int limit; |
378 | unsigned int shared; | 378 | unsigned int shared; |
379 | |||
379 | unsigned int buffer_size; | 380 | unsigned int buffer_size; |
380 | /* 2) touched by every alloc & free from the backend */ | 381 | /* 3) touched by every alloc & free from the backend */ |
381 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 382 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
382 | unsigned int flags; /* constant flags */ | ||
383 | unsigned int num; /* # of objs per slab */ | ||
384 | spinlock_t spinlock; | ||
385 | 383 | ||
386 | /* 3) cache_grow/shrink */ | 384 | unsigned int flags; /* constant flags */ |
385 | unsigned int num; /* # of objs per slab */ | ||
386 | |||
387 | /* 4) cache_grow/shrink */ | ||
387 | /* order of pgs per slab (2^n) */ | 388 | /* order of pgs per slab (2^n) */ |
388 | unsigned int gfporder; | 389 | unsigned int gfporder; |
389 | 390 | ||
390 | /* force GFP flags, e.g. GFP_DMA */ | 391 | /* force GFP flags, e.g. GFP_DMA */ |
391 | gfp_t gfpflags; | 392 | gfp_t gfpflags; |
392 | 393 | ||
393 | size_t colour; /* cache colouring range */ | 394 | size_t colour; /* cache colouring range */ |
394 | unsigned int colour_off; /* colour offset */ | 395 | unsigned int colour_off; /* colour offset */ |
395 | struct kmem_cache *slabp_cache; | 396 | struct kmem_cache *slabp_cache; |
396 | unsigned int slab_size; | 397 | unsigned int slab_size; |
397 | unsigned int dflags; /* dynamic flags */ | 398 | unsigned int dflags; /* dynamic flags */ |
398 | 399 | ||
399 | /* constructor func */ | 400 | /* constructor func */ |
400 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 401 | void (*ctor) (void *, struct kmem_cache *, unsigned long); |
@@ -402,11 +403,11 @@ struct kmem_cache { | |||
402 | /* de-constructor func */ | 403 | /* de-constructor func */ |
403 | void (*dtor) (void *, struct kmem_cache *, unsigned long); | 404 | void (*dtor) (void *, struct kmem_cache *, unsigned long); |
404 | 405 | ||
405 | /* 4) cache creation/removal */ | 406 | /* 5) cache creation/removal */ |
406 | const char *name; | 407 | const char *name; |
407 | struct list_head next; | 408 | struct list_head next; |
408 | 409 | ||
409 | /* 5) statistics */ | 410 | /* 6) statistics */ |
410 | #if STATS | 411 | #if STATS |
411 | unsigned long num_active; | 412 | unsigned long num_active; |
412 | unsigned long num_allocations; | 413 | unsigned long num_allocations; |
@@ -438,8 +439,9 @@ struct kmem_cache { | |||
438 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 439 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
439 | 440 | ||
440 | #define BATCHREFILL_LIMIT 16 | 441 | #define BATCHREFILL_LIMIT 16 |
441 | /* Optimization question: fewer reaps means less | 442 | /* |
442 | * probability for unnecessary cpucache drain/refill cycles. | 443 | * Optimization question: fewer reaps means less probability for unnecessary |
444 | * cpucache drain/refill cycles. | ||
443 | * | 445 | * |
444 | * OTOH the cpuarrays can contain lots of objects, | 446 | * OTOH the cpuarrays can contain lots of objects, |
445 | * which could lock up otherwise freeable slabs. | 447 | * which could lock up otherwise freeable slabs. |
@@ -453,17 +455,19 @@ struct kmem_cache { | |||
453 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 455 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) |
454 | #define STATS_INC_GROWN(x) ((x)->grown++) | 456 | #define STATS_INC_GROWN(x) ((x)->grown++) |
455 | #define STATS_INC_REAPED(x) ((x)->reaped++) | 457 | #define STATS_INC_REAPED(x) ((x)->reaped++) |
456 | #define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ | 458 | #define STATS_SET_HIGH(x) \ |
457 | (x)->high_mark = (x)->num_active; \ | 459 | do { \ |
458 | } while (0) | 460 | if ((x)->num_active > (x)->high_mark) \ |
461 | (x)->high_mark = (x)->num_active; \ | ||
462 | } while (0) | ||
459 | #define STATS_INC_ERR(x) ((x)->errors++) | 463 | #define STATS_INC_ERR(x) ((x)->errors++) |
460 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) | 464 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) |
461 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) | 465 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) |
462 | #define STATS_SET_FREEABLE(x, i) \ | 466 | #define STATS_SET_FREEABLE(x, i) \ |
463 | do { if ((x)->max_freeable < i) \ | 467 | do { \ |
464 | (x)->max_freeable = i; \ | 468 | if ((x)->max_freeable < i) \ |
465 | } while (0) | 469 | (x)->max_freeable = i; \ |
466 | 470 | } while (0) | |
467 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) | 471 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) |
468 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) | 472 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) |
469 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) | 473 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) |
@@ -478,9 +482,7 @@ struct kmem_cache { | |||
478 | #define STATS_INC_ERR(x) do { } while (0) | 482 | #define STATS_INC_ERR(x) do { } while (0) |
479 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 483 | #define STATS_INC_NODEALLOCS(x) do { } while (0) |
480 | #define STATS_INC_NODEFREES(x) do { } while (0) | 484 | #define STATS_INC_NODEFREES(x) do { } while (0) |
481 | #define STATS_SET_FREEABLE(x, i) \ | 485 | #define STATS_SET_FREEABLE(x, i) do { } while (0) |
482 | do { } while (0) | ||
483 | |||
484 | #define STATS_INC_ALLOCHIT(x) do { } while (0) | 486 | #define STATS_INC_ALLOCHIT(x) do { } while (0) |
485 | #define STATS_INC_ALLOCMISS(x) do { } while (0) | 487 | #define STATS_INC_ALLOCMISS(x) do { } while (0) |
486 | #define STATS_INC_FREEHIT(x) do { } while (0) | 488 | #define STATS_INC_FREEHIT(x) do { } while (0) |
@@ -488,7 +490,8 @@ struct kmem_cache { | |||
488 | #endif | 490 | #endif |
489 | 491 | ||
490 | #if DEBUG | 492 | #if DEBUG |
491 | /* Magic nums for obj red zoning. | 493 | /* |
494 | * Magic nums for obj red zoning. | ||
492 | * Placed in the first word before and the first word after an obj. | 495 | * Placed in the first word before and the first word after an obj. |
493 | */ | 496 | */ |
494 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ | 497 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ |
@@ -499,7 +502,8 @@ struct kmem_cache { | |||
499 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ | 502 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ |
500 | #define POISON_END 0xa5 /* end-byte of poisoning */ | 503 | #define POISON_END 0xa5 /* end-byte of poisoning */ |
501 | 504 | ||
502 | /* memory layout of objects: | 505 | /* |
506 | * memory layout of objects: | ||
503 | * 0 : objp | 507 | * 0 : objp |
504 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that | 508 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that |
505 | * the end of an object is aligned with the end of the real | 509 | * the end of an object is aligned with the end of the real |
@@ -508,7 +512,8 @@ struct kmem_cache { | |||
508 | * redzone word. | 512 | * redzone word. |
509 | * cachep->obj_offset: The real object. | 513 | * cachep->obj_offset: The real object. |
510 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 514 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
511 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] | 515 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address |
516 | * [BYTES_PER_WORD long] | ||
512 | */ | 517 | */ |
513 | static int obj_offset(struct kmem_cache *cachep) | 518 | static int obj_offset(struct kmem_cache *cachep) |
514 | { | 519 | { |
@@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
552 | #endif | 557 | #endif |
553 | 558 | ||
554 | /* | 559 | /* |
555 | * Maximum size of an obj (in 2^order pages) | 560 | * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp |
556 | * and absolute limit for the gfp order. | 561 | * order. |
557 | */ | 562 | */ |
558 | #if defined(CONFIG_LARGE_ALLOCS) | 563 | #if defined(CONFIG_LARGE_ALLOCS) |
559 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ | 564 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ |
@@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
573 | #define BREAK_GFP_ORDER_LO 0 | 578 | #define BREAK_GFP_ORDER_LO 0 |
574 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; | 579 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; |
575 | 580 | ||
576 | /* Functions for storing/retrieving the cachep and or slab from the | 581 | /* |
577 | * global 'mem_map'. These are used to find the slab an obj belongs to. | 582 | * Functions for storing/retrieving the cachep and/or slab from the page |
578 | * With kfree(), these are used to find the cache which an obj belongs to. | 583 | * allocator. These are used to find the slab an obj belongs to. With kfree(), |
584 | * these are used to find the cache which an obj belongs to. | ||
579 | */ | 585 | */ |
580 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | 586 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) |
581 | { | 587 | { |
@@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | |||
584 | 590 | ||
585 | static inline struct kmem_cache *page_get_cache(struct page *page) | 591 | static inline struct kmem_cache *page_get_cache(struct page *page) |
586 | { | 592 | { |
593 | if (unlikely(PageCompound(page))) | ||
594 | page = (struct page *)page_private(page); | ||
587 | return (struct kmem_cache *)page->lru.next; | 595 | return (struct kmem_cache *)page->lru.next; |
588 | } | 596 | } |
589 | 597 | ||
@@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) | |||
594 | 602 | ||
595 | static inline struct slab *page_get_slab(struct page *page) | 603 | static inline struct slab *page_get_slab(struct page *page) |
596 | { | 604 | { |
605 | if (unlikely(PageCompound(page))) | ||
606 | page = (struct page *)page_private(page); | ||
597 | return (struct slab *)page->lru.prev; | 607 | return (struct slab *)page->lru.prev; |
598 | } | 608 | } |
599 | 609 | ||
@@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj) | |||
609 | return page_get_slab(page); | 619 | return page_get_slab(page); |
610 | } | 620 | } |
611 | 621 | ||
612 | /* These are the default caches for kmalloc. Custom caches can have other sizes. */ | 622 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, |
623 | unsigned int idx) | ||
624 | { | ||
625 | return slab->s_mem + cache->buffer_size * idx; | ||
626 | } | ||
627 | |||
628 | static inline unsigned int obj_to_index(struct kmem_cache *cache, | ||
629 | struct slab *slab, void *obj) | ||
630 | { | ||
631 | return (unsigned)(obj - slab->s_mem) / cache->buffer_size; | ||
632 | } | ||
633 | |||
634 | /* | ||
635 | * These are the default caches for kmalloc. Custom caches can have other sizes. | ||
636 | */ | ||
613 | struct cache_sizes malloc_sizes[] = { | 637 | struct cache_sizes malloc_sizes[] = { |
614 | #define CACHE(x) { .cs_size = (x) }, | 638 | #define CACHE(x) { .cs_size = (x) }, |
615 | #include <linux/kmalloc_sizes.h> | 639 | #include <linux/kmalloc_sizes.h> |
@@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = { | |||
642 | .limit = BOOT_CPUCACHE_ENTRIES, | 666 | .limit = BOOT_CPUCACHE_ENTRIES, |
643 | .shared = 1, | 667 | .shared = 1, |
644 | .buffer_size = sizeof(struct kmem_cache), | 668 | .buffer_size = sizeof(struct kmem_cache), |
645 | .flags = SLAB_NO_REAP, | ||
646 | .spinlock = SPIN_LOCK_UNLOCKED, | ||
647 | .name = "kmem_cache", | 669 | .name = "kmem_cache", |
648 | #if DEBUG | 670 | #if DEBUG |
649 | .obj_size = sizeof(struct kmem_cache), | 671 | .obj_size = sizeof(struct kmem_cache), |
@@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex); | |||
655 | static struct list_head cache_chain; | 677 | static struct list_head cache_chain; |
656 | 678 | ||
657 | /* | 679 | /* |
658 | * vm_enough_memory() looks at this to determine how many | 680 | * vm_enough_memory() looks at this to determine how many slab-allocated pages |
659 | * slab-allocated pages are possibly freeable under pressure | 681 | * are possibly freeable under pressure |
660 | * | 682 | * |
661 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab | 683 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab |
662 | */ | 684 | */ |
@@ -675,7 +697,8 @@ static enum { | |||
675 | 697 | ||
676 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 698 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
677 | 699 | ||
678 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); | 700 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
701 | int node); | ||
679 | static void enable_cpucache(struct kmem_cache *cachep); | 702 | static void enable_cpucache(struct kmem_cache *cachep); |
680 | static void cache_reap(void *unused); | 703 | static void cache_reap(void *unused); |
681 | static int __node_shrink(struct kmem_cache *cachep, int node); | 704 | static int __node_shrink(struct kmem_cache *cachep, int node); |
@@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | |||
685 | return cachep->array[smp_processor_id()]; | 708 | return cachep->array[smp_processor_id()]; |
686 | } | 709 | } |
687 | 710 | ||
688 | static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) | 711 | static inline struct kmem_cache *__find_general_cachep(size_t size, |
712 | gfp_t gfpflags) | ||
689 | { | 713 | { |
690 | struct cache_sizes *csizep = malloc_sizes; | 714 | struct cache_sizes *csizep = malloc_sizes; |
691 | 715 | ||
@@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) | |||
720 | return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); | 744 | return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); |
721 | } | 745 | } |
722 | 746 | ||
723 | /* Calculate the number of objects and left-over bytes for a given | 747 | /* |
724 | buffer size. */ | 748 | * Calculate the number of objects and left-over bytes for a given buffer size. |
749 | */ | ||
725 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, | 750 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, |
726 | size_t align, int flags, size_t *left_over, | 751 | size_t align, int flags, size_t *left_over, |
727 | unsigned int *num) | 752 | unsigned int *num) |
@@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
782 | 807 | ||
783 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) | 808 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) |
784 | 809 | ||
785 | static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) | 810 | static void __slab_error(const char *function, struct kmem_cache *cachep, |
811 | char *msg) | ||
786 | { | 812 | { |
787 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 813 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
788 | function, cachep->name, msg); | 814 | function, cachep->name, msg); |
@@ -804,7 +830,7 @@ static void init_reap_node(int cpu) | |||
804 | 830 | ||
805 | node = next_node(cpu_to_node(cpu), node_online_map); | 831 | node = next_node(cpu_to_node(cpu), node_online_map); |
806 | if (node == MAX_NUMNODES) | 832 | if (node == MAX_NUMNODES) |
807 | node = 0; | 833 | node = first_node(node_online_map); |
808 | 834 | ||
809 | __get_cpu_var(reap_node) = node; | 835 | __get_cpu_var(reap_node) = node; |
810 | } | 836 | } |
@@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) | |||
906 | 932 | ||
907 | if (!ac_ptr) | 933 | if (!ac_ptr) |
908 | return; | 934 | return; |
909 | |||
910 | for_each_node(i) | 935 | for_each_node(i) |
911 | kfree(ac_ptr[i]); | 936 | kfree(ac_ptr[i]); |
912 | |||
913 | kfree(ac_ptr); | 937 | kfree(ac_ptr); |
914 | } | 938 | } |
915 | 939 | ||
@@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | |||
943 | } | 967 | } |
944 | } | 968 | } |
945 | 969 | ||
946 | static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) | 970 | static void drain_alien_cache(struct kmem_cache *cachep, |
971 | struct array_cache **alien) | ||
947 | { | 972 | { |
948 | int i = 0; | 973 | int i = 0; |
949 | struct array_cache *ac; | 974 | struct array_cache *ac; |
@@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
986 | switch (action) { | 1011 | switch (action) { |
987 | case CPU_UP_PREPARE: | 1012 | case CPU_UP_PREPARE: |
988 | mutex_lock(&cache_chain_mutex); | 1013 | mutex_lock(&cache_chain_mutex); |
989 | /* we need to do this right in the beginning since | 1014 | /* |
1015 | * We need to do this right in the beginning since | ||
990 | * alloc_arraycache's are going to use this list. | 1016 | * alloc_arraycache's are going to use this list. |
991 | * kmalloc_node allows us to add the slab to the right | 1017 | * kmalloc_node allows us to add the slab to the right |
992 | * kmem_list3 and not this cpu's kmem_list3 | 1018 | * kmem_list3 and not this cpu's kmem_list3 |
993 | */ | 1019 | */ |
994 | 1020 | ||
995 | list_for_each_entry(cachep, &cache_chain, next) { | 1021 | list_for_each_entry(cachep, &cache_chain, next) { |
996 | /* setup the size64 kmemlist for cpu before we can | 1022 | /* |
1023 | * Set up the size64 kmemlist for cpu before we can | ||
997 | * begin anything. Make sure some other cpu on this | 1024 | * begin anything. Make sure some other cpu on this |
998 | * node has not already allocated this | 1025 | * node has not already allocated this |
999 | */ | 1026 | */ |
1000 | if (!cachep->nodelists[node]) { | 1027 | if (!cachep->nodelists[node]) { |
1001 | if (!(l3 = kmalloc_node(memsize, | 1028 | l3 = kmalloc_node(memsize, GFP_KERNEL, node); |
1002 | GFP_KERNEL, node))) | 1029 | if (!l3) |
1003 | goto bad; | 1030 | goto bad; |
1004 | kmem_list3_init(l3); | 1031 | kmem_list3_init(l3); |
1005 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 1032 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
@@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1015 | 1042 | ||
1016 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | 1043 | spin_lock_irq(&cachep->nodelists[node]->list_lock); |
1017 | cachep->nodelists[node]->free_limit = | 1044 | cachep->nodelists[node]->free_limit = |
1018 | (1 + nr_cpus_node(node)) * | 1045 | (1 + nr_cpus_node(node)) * |
1019 | cachep->batchcount + cachep->num; | 1046 | cachep->batchcount + cachep->num; |
1020 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | 1047 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); |
1021 | } | 1048 | } |
1022 | 1049 | ||
1023 | /* Now we can go ahead with allocating the shared array's | 1050 | /* |
1024 | & array cache's */ | 1051 | * Now we can go ahead with allocating the shared arrays and |
1052 | * array caches | ||
1053 | */ | ||
1025 | list_for_each_entry(cachep, &cache_chain, next) { | 1054 | list_for_each_entry(cachep, &cache_chain, next) { |
1026 | struct array_cache *nc; | 1055 | struct array_cache *nc; |
1027 | struct array_cache *shared; | 1056 | struct array_cache *shared; |
@@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1041 | if (!alien) | 1070 | if (!alien) |
1042 | goto bad; | 1071 | goto bad; |
1043 | cachep->array[cpu] = nc; | 1072 | cachep->array[cpu] = nc; |
1044 | |||
1045 | l3 = cachep->nodelists[node]; | 1073 | l3 = cachep->nodelists[node]; |
1046 | BUG_ON(!l3); | 1074 | BUG_ON(!l3); |
1047 | 1075 | ||
@@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1061 | } | 1089 | } |
1062 | #endif | 1090 | #endif |
1063 | spin_unlock_irq(&l3->list_lock); | 1091 | spin_unlock_irq(&l3->list_lock); |
1064 | |||
1065 | kfree(shared); | 1092 | kfree(shared); |
1066 | free_alien_cache(alien); | 1093 | free_alien_cache(alien); |
1067 | } | 1094 | } |
@@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1083 | /* fall thru */ | 1110 | /* fall thru */ |
1084 | case CPU_UP_CANCELED: | 1111 | case CPU_UP_CANCELED: |
1085 | mutex_lock(&cache_chain_mutex); | 1112 | mutex_lock(&cache_chain_mutex); |
1086 | |||
1087 | list_for_each_entry(cachep, &cache_chain, next) { | 1113 | list_for_each_entry(cachep, &cache_chain, next) { |
1088 | struct array_cache *nc; | 1114 | struct array_cache *nc; |
1089 | struct array_cache *shared; | 1115 | struct array_cache *shared; |
@@ -1150,7 +1176,7 @@ free_array_cache: | |||
1150 | #endif | 1176 | #endif |
1151 | } | 1177 | } |
1152 | return NOTIFY_OK; | 1178 | return NOTIFY_OK; |
1153 | bad: | 1179 | bad: |
1154 | mutex_unlock(&cache_chain_mutex); | 1180 | mutex_unlock(&cache_chain_mutex); |
1155 | return NOTIFY_BAD; | 1181 | return NOTIFY_BAD; |
1156 | } | 1182 | } |
@@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | |||
1160 | /* | 1186 | /* |
1161 | * swap the static kmem_list3 with kmalloced memory | 1187 | * swap the static kmem_list3 with kmalloced memory |
1162 | */ | 1188 | */ |
1163 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) | 1189 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, |
1190 | int nodeid) | ||
1164 | { | 1191 | { |
1165 | struct kmem_list3 *ptr; | 1192 | struct kmem_list3 *ptr; |
1166 | 1193 | ||
@@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no | |||
1175 | local_irq_enable(); | 1202 | local_irq_enable(); |
1176 | } | 1203 | } |
1177 | 1204 | ||
1178 | /* Initialisation. | 1205 | /* |
1179 | * Called after the gfp() functions have been enabled, and before smp_init(). | 1206 | * Initialisation. Called after the page allocator has been initialised and |
1207 | * before smp_init(). | ||
1180 | */ | 1208 | */ |
1181 | void __init kmem_cache_init(void) | 1209 | void __init kmem_cache_init(void) |
1182 | { | 1210 | { |
@@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void) | |||
1201 | 1229 | ||
1202 | /* Bootstrap is tricky, because several objects are allocated | 1230 | /* Bootstrap is tricky, because several objects are allocated |
1203 | * from caches that do not exist yet: | 1231 | * from caches that do not exist yet: |
1204 | * 1) initialize the cache_cache cache: it contains the struct kmem_cache | 1232 | * 1) initialize the cache_cache cache: it contains the struct |
1205 | * structures of all caches, except cache_cache itself: cache_cache | 1233 | * kmem_cache structures of all caches, except cache_cache itself: |
1206 | * is statically allocated. | 1234 | * cache_cache is statically allocated. |
1207 | * Initially an __init data area is used for the head array and the | 1235 | * Initially an __init data area is used for the head array and the |
1208 | * kmem_list3 structures, it's replaced with a kmalloc allocated | 1236 | * kmem_list3 structures, it's replaced with a kmalloc allocated |
1209 | * array at the end of the bootstrap. | 1237 | * array at the end of the bootstrap. |
@@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void) | |||
1226 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1254 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; |
1227 | cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; | 1255 | cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; |
1228 | 1256 | ||
1229 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); | 1257 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, |
1258 | cache_line_size()); | ||
1230 | 1259 | ||
1231 | for (order = 0; order < MAX_ORDER; order++) { | 1260 | for (order = 0; order < MAX_ORDER; order++) { |
1232 | cache_estimate(order, cache_cache.buffer_size, | 1261 | cache_estimate(order, cache_cache.buffer_size, |
@@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void) | |||
1245 | sizes = malloc_sizes; | 1274 | sizes = malloc_sizes; |
1246 | names = cache_names; | 1275 | names = cache_names; |
1247 | 1276 | ||
1248 | /* Initialize the caches that provide memory for the array cache | 1277 | /* |
1249 | * and the kmem_list3 structures first. | 1278 | * Initialize the caches that provide memory for the array cache and the |
1250 | * Without this, further allocations will bug | 1279 | * kmem_list3 structures first. Without this, further allocations will |
1280 | * bug. | ||
1251 | */ | 1281 | */ |
1252 | 1282 | ||
1253 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1283 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, |
1254 | sizes[INDEX_AC].cs_size, | 1284 | sizes[INDEX_AC].cs_size, |
1255 | ARCH_KMALLOC_MINALIGN, | 1285 | ARCH_KMALLOC_MINALIGN, |
1256 | (ARCH_KMALLOC_FLAGS | | 1286 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1257 | SLAB_PANIC), NULL, NULL); | 1287 | NULL, NULL); |
1258 | 1288 | ||
1259 | if (INDEX_AC != INDEX_L3) | 1289 | if (INDEX_AC != INDEX_L3) { |
1260 | sizes[INDEX_L3].cs_cachep = | 1290 | sizes[INDEX_L3].cs_cachep = |
1261 | kmem_cache_create(names[INDEX_L3].name, | 1291 | kmem_cache_create(names[INDEX_L3].name, |
1262 | sizes[INDEX_L3].cs_size, | 1292 | sizes[INDEX_L3].cs_size, |
1263 | ARCH_KMALLOC_MINALIGN, | 1293 | ARCH_KMALLOC_MINALIGN, |
1264 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, | 1294 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1265 | NULL); | 1295 | NULL, NULL); |
1296 | } | ||
1266 | 1297 | ||
1267 | while (sizes->cs_size != ULONG_MAX) { | 1298 | while (sizes->cs_size != ULONG_MAX) { |
1268 | /* | 1299 | /* |
@@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void) | |||
1272 | * Note for systems short on memory removing the alignment will | 1303 | * Note for systems short on memory removing the alignment will |
1273 | * allow tighter packing of the smaller caches. | 1304 | * allow tighter packing of the smaller caches. |
1274 | */ | 1305 | */ |
1275 | if (!sizes->cs_cachep) | 1306 | if (!sizes->cs_cachep) { |
1276 | sizes->cs_cachep = kmem_cache_create(names->name, | 1307 | sizes->cs_cachep = kmem_cache_create(names->name, |
1277 | sizes->cs_size, | 1308 | sizes->cs_size, |
1278 | ARCH_KMALLOC_MINALIGN, | 1309 | ARCH_KMALLOC_MINALIGN, |
1279 | (ARCH_KMALLOC_FLAGS | 1310 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1280 | | SLAB_PANIC), | 1311 | NULL, NULL); |
1281 | NULL, NULL); | 1312 | } |
1282 | 1313 | ||
1283 | /* Inc off-slab bufctl limit until the ceiling is hit. */ | 1314 | /* Inc off-slab bufctl limit until the ceiling is hit. */ |
1284 | if (!(OFF_SLAB(sizes->cs_cachep))) { | 1315 | if (!(OFF_SLAB(sizes->cs_cachep))) { |
@@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void) | |||
1287 | } | 1318 | } |
1288 | 1319 | ||
1289 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | 1320 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, |
1290 | sizes->cs_size, | 1321 | sizes->cs_size, |
1291 | ARCH_KMALLOC_MINALIGN, | 1322 | ARCH_KMALLOC_MINALIGN, |
1292 | (ARCH_KMALLOC_FLAGS | | 1323 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| |
1293 | SLAB_CACHE_DMA | | 1324 | SLAB_PANIC, |
1294 | SLAB_PANIC), NULL, | 1325 | NULL, NULL); |
1295 | NULL); | ||
1296 | |||
1297 | sizes++; | 1326 | sizes++; |
1298 | names++; | 1327 | names++; |
1299 | } | 1328 | } |
@@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void) | |||
1345 | struct kmem_cache *cachep; | 1374 | struct kmem_cache *cachep; |
1346 | mutex_lock(&cache_chain_mutex); | 1375 | mutex_lock(&cache_chain_mutex); |
1347 | list_for_each_entry(cachep, &cache_chain, next) | 1376 | list_for_each_entry(cachep, &cache_chain, next) |
1348 | enable_cpucache(cachep); | 1377 | enable_cpucache(cachep); |
1349 | mutex_unlock(&cache_chain_mutex); | 1378 | mutex_unlock(&cache_chain_mutex); |
1350 | } | 1379 | } |
1351 | 1380 | ||
1352 | /* Done! */ | 1381 | /* Done! */ |
1353 | g_cpucache_up = FULL; | 1382 | g_cpucache_up = FULL; |
1354 | 1383 | ||
1355 | /* Register a cpu startup notifier callback | 1384 | /* |
1356 | * that initializes cpu_cache_get for all new cpus | 1385 | * Register a cpu startup notifier callback that initializes |
1386 | * cpu_cache_get for all new cpus | ||
1357 | */ | 1387 | */ |
1358 | register_cpu_notifier(&cpucache_notifier); | 1388 | register_cpu_notifier(&cpucache_notifier); |
1359 | 1389 | ||
1360 | /* The reap timers are started later, with a module init call: | 1390 | /* |
1361 | * That part of the kernel is not yet operational. | 1391 | * The reap timers are started later, with a module init call: That part |
1392 | * of the kernel is not yet operational. | ||
1362 | */ | 1393 | */ |
1363 | } | 1394 | } |
1364 | 1395 | ||
@@ -1366,16 +1397,13 @@ static int __init cpucache_init(void) | |||
1366 | { | 1397 | { |
1367 | int cpu; | 1398 | int cpu; |
1368 | 1399 | ||
1369 | /* | 1400 | /* |
1370 | * Register the timers that return unneeded | 1401 | * Register the timers that return unneeded pages to the page allocator |
1371 | * pages to gfp. | ||
1372 | */ | 1402 | */ |
1373 | for_each_online_cpu(cpu) | 1403 | for_each_online_cpu(cpu) |
1374 | start_cpu_timer(cpu); | 1404 | start_cpu_timer(cpu); |
1375 | |||
1376 | return 0; | 1405 | return 0; |
1377 | } | 1406 | } |
1378 | |||
1379 | __initcall(cpucache_init); | 1407 | __initcall(cpucache_init); |
1380 | 1408 | ||
1381 | /* | 1409 | /* |
@@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1402 | atomic_add(i, &slab_reclaim_pages); | 1430 | atomic_add(i, &slab_reclaim_pages); |
1403 | add_page_state(nr_slab, i); | 1431 | add_page_state(nr_slab, i); |
1404 | while (i--) { | 1432 | while (i--) { |
1405 | SetPageSlab(page); | 1433 | __SetPageSlab(page); |
1406 | page++; | 1434 | page++; |
1407 | } | 1435 | } |
1408 | return addr; | 1436 | return addr; |
@@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1418 | const unsigned long nr_freed = i; | 1446 | const unsigned long nr_freed = i; |
1419 | 1447 | ||
1420 | while (i--) { | 1448 | while (i--) { |
1421 | if (!TestClearPageSlab(page)) | 1449 | BUG_ON(!PageSlab(page)); |
1422 | BUG(); | 1450 | __ClearPageSlab(page); |
1423 | page++; | 1451 | page++; |
1424 | } | 1452 | } |
1425 | sub_page_state(nr_slab, nr_freed); | 1453 | sub_page_state(nr_slab, nr_freed); |
@@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit) | |||
1489 | { | 1517 | { |
1490 | int i; | 1518 | int i; |
1491 | printk(KERN_ERR "%03x:", offset); | 1519 | printk(KERN_ERR "%03x:", offset); |
1492 | for (i = 0; i < limit; i++) { | 1520 | for (i = 0; i < limit; i++) |
1493 | printk(" %02x", (unsigned char)data[offset + i]); | 1521 | printk(" %02x", (unsigned char)data[offset + i]); |
1494 | } | ||
1495 | printk("\n"); | 1522 | printk("\n"); |
1496 | } | 1523 | } |
1497 | #endif | 1524 | #endif |
@@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1505 | 1532 | ||
1506 | if (cachep->flags & SLAB_RED_ZONE) { | 1533 | if (cachep->flags & SLAB_RED_ZONE) { |
1507 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1534 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", |
1508 | *dbg_redzone1(cachep, objp), | 1535 | *dbg_redzone1(cachep, objp), |
1509 | *dbg_redzone2(cachep, objp)); | 1536 | *dbg_redzone2(cachep, objp)); |
1510 | } | 1537 | } |
1511 | 1538 | ||
1512 | if (cachep->flags & SLAB_STORE_USER) { | 1539 | if (cachep->flags & SLAB_STORE_USER) { |
1513 | printk(KERN_ERR "Last user: [<%p>]", | 1540 | printk(KERN_ERR "Last user: [<%p>]", |
1514 | *dbg_userword(cachep, objp)); | 1541 | *dbg_userword(cachep, objp)); |
1515 | print_symbol("(%s)", | 1542 | print_symbol("(%s)", |
1516 | (unsigned long)*dbg_userword(cachep, objp)); | 1543 | (unsigned long)*dbg_userword(cachep, objp)); |
1517 | printk("\n"); | 1544 | printk("\n"); |
1518 | } | 1545 | } |
1519 | realobj = (char *)objp + obj_offset(cachep); | 1546 | realobj = (char *)objp + obj_offset(cachep); |
@@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1546 | /* Print header */ | 1573 | /* Print header */ |
1547 | if (lines == 0) { | 1574 | if (lines == 0) { |
1548 | printk(KERN_ERR | 1575 | printk(KERN_ERR |
1549 | "Slab corruption: start=%p, len=%d\n", | 1576 | "Slab corruption: start=%p, len=%d\n", |
1550 | realobj, size); | 1577 | realobj, size); |
1551 | print_objinfo(cachep, objp, 0); | 1578 | print_objinfo(cachep, objp, 0); |
1552 | } | 1579 | } |
1553 | /* Hexdump the affected line */ | 1580 | /* Hexdump the affected line */ |
@@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1568 | * exist: | 1595 | * exist: |
1569 | */ | 1596 | */ |
1570 | struct slab *slabp = virt_to_slab(objp); | 1597 | struct slab *slabp = virt_to_slab(objp); |
1571 | int objnr; | 1598 | unsigned int objnr; |
1572 | 1599 | ||
1573 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 1600 | objnr = obj_to_index(cachep, slabp, objp); |
1574 | if (objnr) { | 1601 | if (objnr) { |
1575 | objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; | 1602 | objp = index_to_obj(cachep, slabp, objnr - 1); |
1576 | realobj = (char *)objp + obj_offset(cachep); | 1603 | realobj = (char *)objp + obj_offset(cachep); |
1577 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | 1604 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", |
1578 | realobj, size); | 1605 | realobj, size); |
1579 | print_objinfo(cachep, objp, 2); | 1606 | print_objinfo(cachep, objp, 2); |
1580 | } | 1607 | } |
1581 | if (objnr + 1 < cachep->num) { | 1608 | if (objnr + 1 < cachep->num) { |
1582 | objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; | 1609 | objp = index_to_obj(cachep, slabp, objnr + 1); |
1583 | realobj = (char *)objp + obj_offset(cachep); | 1610 | realobj = (char *)objp + obj_offset(cachep); |
1584 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | 1611 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", |
1585 | realobj, size); | 1612 | realobj, size); |
@@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1591 | 1618 | ||
1592 | #if DEBUG | 1619 | #if DEBUG |
1593 | /** | 1620 | /** |
1594 | * slab_destroy_objs - call the registered destructor for each object in | 1621 | * slab_destroy_objs - destroy a slab and its objects |
1595 | * a slab that is to be destroyed. | 1622 | * @cachep: cache pointer being destroyed |
1623 | * @slabp: slab pointer being destroyed | ||
1624 | * | ||
1625 | * Call the registered destructor for each object in a slab that is being | ||
1626 | * destroyed. | ||
1596 | */ | 1627 | */ |
1597 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | 1628 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) |
1598 | { | 1629 | { |
1599 | int i; | 1630 | int i; |
1600 | for (i = 0; i < cachep->num; i++) { | 1631 | for (i = 0; i < cachep->num; i++) { |
1601 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 1632 | void *objp = index_to_obj(cachep, slabp, i); |
1602 | 1633 | ||
1603 | if (cachep->flags & SLAB_POISON) { | 1634 | if (cachep->flags & SLAB_POISON) { |
1604 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1635 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1605 | if ((cachep->buffer_size % PAGE_SIZE) == 0 | 1636 | if (cachep->buffer_size % PAGE_SIZE == 0 && |
1606 | && OFF_SLAB(cachep)) | 1637 | OFF_SLAB(cachep)) |
1607 | kernel_map_pages(virt_to_page(objp), | 1638 | kernel_map_pages(virt_to_page(objp), |
1608 | cachep->buffer_size / PAGE_SIZE, | 1639 | cachep->buffer_size / PAGE_SIZE, 1); |
1609 | 1); | ||
1610 | else | 1640 | else |
1611 | check_poison_obj(cachep, objp); | 1641 | check_poison_obj(cachep, objp); |
1612 | #else | 1642 | #else |
@@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
1631 | if (cachep->dtor) { | 1661 | if (cachep->dtor) { |
1632 | int i; | 1662 | int i; |
1633 | for (i = 0; i < cachep->num; i++) { | 1663 | for (i = 0; i < cachep->num; i++) { |
1634 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 1664 | void *objp = index_to_obj(cachep, slabp, i); |
1635 | (cachep->dtor) (objp, cachep, 0); | 1665 | (cachep->dtor) (objp, cachep, 0); |
1636 | } | 1666 | } |
1637 | } | 1667 | } |
@@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
1639 | #endif | 1669 | #endif |
1640 | 1670 | ||
1641 | /** | 1671 | /** |
1672 | * slab_destroy - destroy and release all objects in a slab | ||
1673 | * @cachep: cache pointer being destroyed | ||
1674 | * @slabp: slab pointer being destroyed | ||
1675 | * | ||
1642 | * Destroy all the objs in a slab, and release the mem back to the system. | 1676 | * Destroy all the objs in a slab, and release the mem back to the system. |
1643 | * Before calling the slab must have been unlinked from the cache. | 1677 | * Before calling the slab must have been unlinked from the cache. The |
1644 | * The cache-lock is not held/needed. | 1678 | * cache-lock is not held/needed. |
1645 | */ | 1679 | */ |
1646 | static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | 1680 | static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) |
1647 | { | 1681 | { |
@@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | |||
1662 | } | 1696 | } |
1663 | } | 1697 | } |
1664 | 1698 | ||
1665 | /* For setting up all the kmem_list3s for cache whose buffer_size is same | 1699 | /* |
1666 | as size of kmem_list3. */ | 1700 | * For setting up all the kmem_list3s for cache whose buffer_size is same as |
1701 | * size of kmem_list3. | ||
1702 | */ | ||
1667 | static void set_up_list3s(struct kmem_cache *cachep, int index) | 1703 | static void set_up_list3s(struct kmem_cache *cachep, int index) |
1668 | { | 1704 | { |
1669 | int node; | 1705 | int node; |
@@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) | |||
1689 | * high order pages for slabs. When the gfp() functions are more friendly | 1725 | * high order pages for slabs. When the gfp() functions are more friendly |
1690 | * towards high-order requests, this should be changed. | 1726 | * towards high-order requests, this should be changed. |
1691 | */ | 1727 | */ |
1692 | static inline size_t calculate_slab_order(struct kmem_cache *cachep, | 1728 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
1693 | size_t size, size_t align, unsigned long flags) | 1729 | size_t size, size_t align, unsigned long flags) |
1694 | { | 1730 | { |
1695 | size_t left_over = 0; | 1731 | size_t left_over = 0; |
1696 | int gfporder; | 1732 | int gfporder; |
1697 | 1733 | ||
1698 | for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { | 1734 | for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { |
1699 | unsigned int num; | 1735 | unsigned int num; |
1700 | size_t remainder; | 1736 | size_t remainder; |
1701 | 1737 | ||
@@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1730 | /* | 1766 | /* |
1731 | * Acceptable internal fragmentation? | 1767 | * Acceptable internal fragmentation? |
1732 | */ | 1768 | */ |
1733 | if ((left_over * 8) <= (PAGE_SIZE << gfporder)) | 1769 | if (left_over * 8 <= (PAGE_SIZE << gfporder)) |
1734 | break; | 1770 | break; |
1735 | } | 1771 | } |
1736 | return left_over; | 1772 | return left_over; |
1737 | } | 1773 | } |
1738 | 1774 | ||
1775 | static void setup_cpu_cache(struct kmem_cache *cachep) | ||
1776 | { | ||
1777 | if (g_cpucache_up == FULL) { | ||
1778 | enable_cpucache(cachep); | ||
1779 | return; | ||
1780 | } | ||
1781 | if (g_cpucache_up == NONE) { | ||
1782 | /* | ||
1783 | * Note: the first kmem_cache_create must create the cache | ||
1784 | * that's used by kmalloc(24), otherwise the creation of | ||
1785 | * further caches will BUG(). | ||
1786 | */ | ||
1787 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
1788 | |||
1789 | /* | ||
1790 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is | ||
1791 | * the first cache, then we need to set up all its list3s, | ||
1792 | * otherwise the creation of further caches will BUG(). | ||
1793 | */ | ||
1794 | set_up_list3s(cachep, SIZE_AC); | ||
1795 | if (INDEX_AC == INDEX_L3) | ||
1796 | g_cpucache_up = PARTIAL_L3; | ||
1797 | else | ||
1798 | g_cpucache_up = PARTIAL_AC; | ||
1799 | } else { | ||
1800 | cachep->array[smp_processor_id()] = | ||
1801 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
1802 | |||
1803 | if (g_cpucache_up == PARTIAL_AC) { | ||
1804 | set_up_list3s(cachep, SIZE_L3); | ||
1805 | g_cpucache_up = PARTIAL_L3; | ||
1806 | } else { | ||
1807 | int node; | ||
1808 | for_each_online_node(node) { | ||
1809 | cachep->nodelists[node] = | ||
1810 | kmalloc_node(sizeof(struct kmem_list3), | ||
1811 | GFP_KERNEL, node); | ||
1812 | BUG_ON(!cachep->nodelists[node]); | ||
1813 | kmem_list3_init(cachep->nodelists[node]); | ||
1814 | } | ||
1815 | } | ||
1816 | } | ||
1817 | cachep->nodelists[numa_node_id()]->next_reap = | ||
1818 | jiffies + REAPTIMEOUT_LIST3 + | ||
1819 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
1820 | |||
1821 | cpu_cache_get(cachep)->avail = 0; | ||
1822 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
1823 | cpu_cache_get(cachep)->batchcount = 1; | ||
1824 | cpu_cache_get(cachep)->touched = 0; | ||
1825 | cachep->batchcount = 1; | ||
1826 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
1827 | } | ||
1828 | |||
1739 | /** | 1829 | /** |
1740 | * kmem_cache_create - Create a cache. | 1830 | * kmem_cache_create - Create a cache. |
1741 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 1831 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
@@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1751 | * and the @dtor is run before the pages are handed back. | 1841 | * and the @dtor is run before the pages are handed back. |
1752 | * | 1842 | * |
1753 | * @name must be valid until the cache is destroyed. This implies that | 1843 | * @name must be valid until the cache is destroyed. This implies that |
1754 | * the module calling this has to destroy the cache before getting | 1844 | * the module calling this has to destroy the cache before getting unloaded. |
1755 | * unloaded. | 1845 | * |
1756 | * | ||
1757 | * The flags are | 1846 | * The flags are |
1758 | * | 1847 | * |
1759 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | 1848 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) |
@@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1762 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | 1851 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check |
1763 | * for buffer overruns. | 1852 | * for buffer overruns. |
1764 | * | 1853 | * |
1765 | * %SLAB_NO_REAP - Don't automatically reap this cache when we're under | ||
1766 | * memory pressure. | ||
1767 | * | ||
1768 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | 1854 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware |
1769 | * cacheline. This can be beneficial if you're counting cycles as closely | 1855 | * cacheline. This can be beneficial if you're counting cycles as closely |
1770 | * as davem. | 1856 | * as davem. |
1771 | */ | 1857 | */ |
1772 | struct kmem_cache * | 1858 | struct kmem_cache * |
1773 | kmem_cache_create (const char *name, size_t size, size_t align, | 1859 | kmem_cache_create (const char *name, size_t size, size_t align, |
1774 | unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), | 1860 | unsigned long flags, |
1861 | void (*ctor)(void*, struct kmem_cache *, unsigned long), | ||
1775 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | 1862 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) |
1776 | { | 1863 | { |
1777 | size_t left_over, slab_size, ralign; | 1864 | size_t left_over, slab_size, ralign; |
@@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1781 | /* | 1868 | /* |
1782 | * Sanity checks... these are all serious usage bugs. | 1869 | * Sanity checks... these are all serious usage bugs. |
1783 | */ | 1870 | */ |
1784 | if ((!name) || | 1871 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || |
1785 | in_interrupt() || | ||
1786 | (size < BYTES_PER_WORD) || | ||
1787 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { | 1872 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { |
1788 | printk(KERN_ERR "%s: Early error in slab %s\n", | 1873 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, |
1789 | __FUNCTION__, name); | 1874 | name); |
1790 | BUG(); | 1875 | BUG(); |
1791 | } | 1876 | } |
1792 | 1877 | ||
@@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1840 | * above the next power of two: caches with object sizes just above a | 1925 | * above the next power of two: caches with object sizes just above a |
1841 | * power of two have a significant amount of internal fragmentation. | 1926 | * power of two have a significant amount of internal fragmentation. |
1842 | */ | 1927 | */ |
1843 | if ((size < 4096 | 1928 | if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) |
1844 | || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) | ||
1845 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 1929 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
1846 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 1930 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
1847 | flags |= SLAB_POISON; | 1931 | flags |= SLAB_POISON; |
@@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1853 | BUG_ON(dtor); | 1937 | BUG_ON(dtor); |
1854 | 1938 | ||
1855 | /* | 1939 | /* |
1856 | * Always checks flags, a caller might be expecting debug | 1940 | * Always checks flags, a caller might be expecting debug support which |
1857 | * support which isn't available. | 1941 | * isn't available. |
1858 | */ | 1942 | */ |
1859 | if (flags & ~CREATE_MASK) | 1943 | if (flags & ~CREATE_MASK) |
1860 | BUG(); | 1944 | BUG(); |
1861 | 1945 | ||
1862 | /* Check that size is in terms of words. This is needed to avoid | 1946 | /* |
1947 | * Check that size is in terms of words. This is needed to avoid | ||
1863 | * unaligned accesses for some archs when redzoning is used, and makes | 1948 | * unaligned accesses for some archs when redzoning is used, and makes |
1864 | * sure any on-slab bufctl's are also correctly aligned. | 1949 | * sure any on-slab bufctl's are also correctly aligned. |
1865 | */ | 1950 | */ |
@@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1868 | size &= ~(BYTES_PER_WORD - 1); | 1953 | size &= ~(BYTES_PER_WORD - 1); |
1869 | } | 1954 | } |
1870 | 1955 | ||
1871 | /* calculate out the final buffer alignment: */ | 1956 | /* calculate the final buffer alignment: */ |
1957 | |||
1872 | /* 1) arch recommendation: can be overridden for debug */ | 1958 | /* 1) arch recommendation: can be overridden for debug */ |
1873 | if (flags & SLAB_HWCACHE_ALIGN) { | 1959 | if (flags & SLAB_HWCACHE_ALIGN) { |
1874 | /* Default alignment: as specified by the arch code. | 1960 | /* |
1875 | * Except if an object is really small, then squeeze multiple | 1961 | * Default alignment: as specified by the arch code. Except if |
1876 | * objects into one cacheline. | 1962 | * an object is really small, then squeeze multiple objects into |
1963 | * one cacheline. | ||
1877 | */ | 1964 | */ |
1878 | ralign = cache_line_size(); | 1965 | ralign = cache_line_size(); |
1879 | while (size <= ralign / 2) | 1966 | while (size <= ralign / 2) |
@@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1893 | if (ralign > BYTES_PER_WORD) | 1980 | if (ralign > BYTES_PER_WORD) |
1894 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 1981 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
1895 | } | 1982 | } |
1896 | /* 4) Store it. Note that the debug code below can reduce | 1983 | /* |
1984 | * 4) Store it. Note that the debug code below can reduce | ||
1897 | * the alignment to BYTES_PER_WORD. | 1985 | * the alignment to BYTES_PER_WORD. |
1898 | */ | 1986 | */ |
1899 | align = ralign; | 1987 | align = ralign; |
@@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1978 | cachep->gfpflags = 0; | 2066 | cachep->gfpflags = 0; |
1979 | if (flags & SLAB_CACHE_DMA) | 2067 | if (flags & SLAB_CACHE_DMA) |
1980 | cachep->gfpflags |= GFP_DMA; | 2068 | cachep->gfpflags |= GFP_DMA; |
1981 | spin_lock_init(&cachep->spinlock); | ||
1982 | cachep->buffer_size = size; | 2069 | cachep->buffer_size = size; |
1983 | 2070 | ||
1984 | if (flags & CFLGS_OFF_SLAB) | 2071 | if (flags & CFLGS_OFF_SLAB) |
@@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1988 | cachep->name = name; | 2075 | cachep->name = name; |
1989 | 2076 | ||
1990 | 2077 | ||
1991 | if (g_cpucache_up == FULL) { | 2078 | setup_cpu_cache(cachep); |
1992 | enable_cpucache(cachep); | ||
1993 | } else { | ||
1994 | if (g_cpucache_up == NONE) { | ||
1995 | /* Note: the first kmem_cache_create must create | ||
1996 | * the cache that's used by kmalloc(24), otherwise | ||
1997 | * the creation of further caches will BUG(). | ||
1998 | */ | ||
1999 | cachep->array[smp_processor_id()] = | ||
2000 | &initarray_generic.cache; | ||
2001 | |||
2002 | /* If the cache that's used by | ||
2003 | * kmalloc(sizeof(kmem_list3)) is the first cache, | ||
2004 | * then we need to set up all its list3s, otherwise | ||
2005 | * the creation of further caches will BUG(). | ||
2006 | */ | ||
2007 | set_up_list3s(cachep, SIZE_AC); | ||
2008 | if (INDEX_AC == INDEX_L3) | ||
2009 | g_cpucache_up = PARTIAL_L3; | ||
2010 | else | ||
2011 | g_cpucache_up = PARTIAL_AC; | ||
2012 | } else { | ||
2013 | cachep->array[smp_processor_id()] = | ||
2014 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
2015 | |||
2016 | if (g_cpucache_up == PARTIAL_AC) { | ||
2017 | set_up_list3s(cachep, SIZE_L3); | ||
2018 | g_cpucache_up = PARTIAL_L3; | ||
2019 | } else { | ||
2020 | int node; | ||
2021 | for_each_online_node(node) { | ||
2022 | |||
2023 | cachep->nodelists[node] = | ||
2024 | kmalloc_node(sizeof | ||
2025 | (struct kmem_list3), | ||
2026 | GFP_KERNEL, node); | ||
2027 | BUG_ON(!cachep->nodelists[node]); | ||
2028 | kmem_list3_init(cachep-> | ||
2029 | nodelists[node]); | ||
2030 | } | ||
2031 | } | ||
2032 | } | ||
2033 | cachep->nodelists[numa_node_id()]->next_reap = | ||
2034 | jiffies + REAPTIMEOUT_LIST3 + | ||
2035 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
2036 | |||
2037 | BUG_ON(!cpu_cache_get(cachep)); | ||
2038 | cpu_cache_get(cachep)->avail = 0; | ||
2039 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
2040 | cpu_cache_get(cachep)->batchcount = 1; | ||
2041 | cpu_cache_get(cachep)->touched = 0; | ||
2042 | cachep->batchcount = 1; | ||
2043 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
2044 | } | ||
2045 | 2079 | ||
2046 | /* cache setup completed, link it into the list */ | 2080 | /* cache setup completed, link it into the list */ |
2047 | list_add(&cachep->next, &cache_chain); | 2081 | list_add(&cachep->next, &cache_chain); |
2048 | oops: | 2082 | oops: |
2049 | if (!cachep && (flags & SLAB_PANIC)) | 2083 | if (!cachep && (flags & SLAB_PANIC)) |
2050 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2084 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2051 | name); | 2085 | name); |
@@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) | |||
2089 | #define check_spinlock_acquired_node(x, y) do { } while(0) | 2123 | #define check_spinlock_acquired_node(x, y) do { } while(0) |
2090 | #endif | 2124 | #endif |
2091 | 2125 | ||
2092 | /* | 2126 | static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, |
2093 | * Waits for all CPUs to execute func(). | 2127 | struct array_cache *ac, |
2094 | */ | 2128 | int force, int node); |
2095 | static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) | ||
2096 | { | ||
2097 | check_irq_on(); | ||
2098 | preempt_disable(); | ||
2099 | |||
2100 | local_irq_disable(); | ||
2101 | func(arg); | ||
2102 | local_irq_enable(); | ||
2103 | |||
2104 | if (smp_call_function(func, arg, 1, 1)) | ||
2105 | BUG(); | ||
2106 | |||
2107 | preempt_enable(); | ||
2108 | } | ||
2109 | |||
2110 | static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, | ||
2111 | int force, int node); | ||
2112 | 2129 | ||
2113 | static void do_drain(void *arg) | 2130 | static void do_drain(void *arg) |
2114 | { | 2131 | { |
2115 | struct kmem_cache *cachep = (struct kmem_cache *) arg; | 2132 | struct kmem_cache *cachep = arg; |
2116 | struct array_cache *ac; | 2133 | struct array_cache *ac; |
2117 | int node = numa_node_id(); | 2134 | int node = numa_node_id(); |
2118 | 2135 | ||
@@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
2129 | struct kmem_list3 *l3; | 2146 | struct kmem_list3 *l3; |
2130 | int node; | 2147 | int node; |
2131 | 2148 | ||
2132 | smp_call_function_all_cpus(do_drain, cachep); | 2149 | on_each_cpu(do_drain, cachep, 1, 1); |
2133 | check_irq_on(); | 2150 | check_irq_on(); |
2134 | for_each_online_node(node) { | 2151 | for_each_online_node(node) { |
2135 | l3 = cachep->nodelists[node]; | 2152 | l3 = cachep->nodelists[node]; |
2136 | if (l3) { | 2153 | if (l3) { |
2137 | spin_lock_irq(&l3->list_lock); | 2154 | drain_array(cachep, l3, l3->shared, 1, node); |
2138 | drain_array_locked(cachep, l3->shared, 1, node); | ||
2139 | spin_unlock_irq(&l3->list_lock); | ||
2140 | if (l3->alien) | 2155 | if (l3->alien) |
2141 | drain_alien_cache(cachep, l3->alien); | 2156 | drain_alien_cache(cachep, l3->alien); |
2142 | } | 2157 | } |
@@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
2260 | 2275 | ||
2261 | /* NUMA: free the list3 structures */ | 2276 | /* NUMA: free the list3 structures */ |
2262 | for_each_online_node(i) { | 2277 | for_each_online_node(i) { |
2263 | if ((l3 = cachep->nodelists[i])) { | 2278 | l3 = cachep->nodelists[i]; |
2279 | if (l3) { | ||
2264 | kfree(l3->shared); | 2280 | kfree(l3->shared); |
2265 | free_alien_cache(l3->alien); | 2281 | free_alien_cache(l3->alien); |
2266 | kfree(l3); | 2282 | kfree(l3); |
2267 | } | 2283 | } |
2268 | } | 2284 | } |
2269 | kmem_cache_free(&cache_cache, cachep); | 2285 | kmem_cache_free(&cache_cache, cachep); |
2270 | |||
2271 | unlock_cpu_hotplug(); | 2286 | unlock_cpu_hotplug(); |
2272 | |||
2273 | return 0; | 2287 | return 0; |
2274 | } | 2288 | } |
2275 | EXPORT_SYMBOL(kmem_cache_destroy); | 2289 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2292 | slabp->inuse = 0; | 2306 | slabp->inuse = 0; |
2293 | slabp->colouroff = colour_off; | 2307 | slabp->colouroff = colour_off; |
2294 | slabp->s_mem = objp + colour_off; | 2308 | slabp->s_mem = objp + colour_off; |
2295 | |||
2296 | return slabp; | 2309 | return slabp; |
2297 | } | 2310 | } |
2298 | 2311 | ||
@@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2307 | int i; | 2320 | int i; |
2308 | 2321 | ||
2309 | for (i = 0; i < cachep->num; i++) { | 2322 | for (i = 0; i < cachep->num; i++) { |
2310 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 2323 | void *objp = index_to_obj(cachep, slabp, i); |
2311 | #if DEBUG | 2324 | #if DEBUG |
2312 | /* need to poison the objs? */ | 2325 | /* need to poison the objs? */ |
2313 | if (cachep->flags & SLAB_POISON) | 2326 | if (cachep->flags & SLAB_POISON) |
@@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2320 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2333 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2321 | } | 2334 | } |
2322 | /* | 2335 | /* |
2323 | * Constructors are not allowed to allocate memory from | 2336 | * Constructors are not allowed to allocate memory from the same |
2324 | * the same cache which they are a constructor for. | 2337 | * cache which they are a constructor for. Otherwise, deadlock. |
2325 | * Otherwise, deadlock. They must also be threaded. | 2338 | * They must also be threaded. |
2326 | */ | 2339 | */ |
2327 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2340 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2328 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2341 | cachep->ctor(objp + obj_offset(cachep), cachep, |
@@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2336 | slab_error(cachep, "constructor overwrote the" | 2349 | slab_error(cachep, "constructor overwrote the" |
2337 | " start of an object"); | 2350 | " start of an object"); |
2338 | } | 2351 | } |
2339 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) | 2352 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && |
2340 | && cachep->flags & SLAB_POISON) | 2353 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
2341 | kernel_map_pages(virt_to_page(objp), | 2354 | kernel_map_pages(virt_to_page(objp), |
2342 | cachep->buffer_size / PAGE_SIZE, 0); | 2355 | cachep->buffer_size / PAGE_SIZE, 0); |
2343 | #else | 2356 | #else |
@@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2352 | 2365 | ||
2353 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2366 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
2354 | { | 2367 | { |
2355 | if (flags & SLAB_DMA) { | 2368 | if (flags & SLAB_DMA) |
2356 | if (!(cachep->gfpflags & GFP_DMA)) | 2369 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); |
2357 | BUG(); | 2370 | else |
2358 | } else { | 2371 | BUG_ON(cachep->gfpflags & GFP_DMA); |
2359 | if (cachep->gfpflags & GFP_DMA) | ||
2360 | BUG(); | ||
2361 | } | ||
2362 | } | 2372 | } |
2363 | 2373 | ||
2364 | static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) | 2374 | static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, |
2375 | int nodeid) | ||
2365 | { | 2376 | { |
2366 | void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); | 2377 | void *objp = index_to_obj(cachep, slabp, slabp->free); |
2367 | kmem_bufctl_t next; | 2378 | kmem_bufctl_t next; |
2368 | 2379 | ||
2369 | slabp->inuse++; | 2380 | slabp->inuse++; |
@@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod | |||
2377 | return objp; | 2388 | return objp; |
2378 | } | 2389 | } |
2379 | 2390 | ||
2380 | static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, | 2391 | static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, |
2381 | int nodeid) | 2392 | void *objp, int nodeid) |
2382 | { | 2393 | { |
2383 | unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; | 2394 | unsigned int objnr = obj_to_index(cachep, slabp, objp); |
2384 | 2395 | ||
2385 | #if DEBUG | 2396 | #if DEBUG |
2386 | /* Verify that the slab belongs to the intended node */ | 2397 | /* Verify that the slab belongs to the intended node */ |
@@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob | |||
2388 | 2399 | ||
2389 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2400 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { |
2390 | printk(KERN_ERR "slab: double free detected in cache " | 2401 | printk(KERN_ERR "slab: double free detected in cache " |
2391 | "'%s', objp %p\n", cachep->name, objp); | 2402 | "'%s', objp %p\n", cachep->name, objp); |
2392 | BUG(); | 2403 | BUG(); |
2393 | } | 2404 | } |
2394 | #endif | 2405 | #endif |
@@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob | |||
2397 | slabp->inuse--; | 2408 | slabp->inuse--; |
2398 | } | 2409 | } |
2399 | 2410 | ||
2400 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) | 2411 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, |
2412 | void *objp) | ||
2401 | { | 2413 | { |
2402 | int i; | 2414 | int i; |
2403 | struct page *page; | 2415 | struct page *page; |
2404 | 2416 | ||
2405 | /* Nasty!!!!!! I hope this is OK. */ | 2417 | /* Nasty!!!!!! I hope this is OK. */ |
2406 | i = 1 << cachep->gfporder; | ||
2407 | page = virt_to_page(objp); | 2418 | page = virt_to_page(objp); |
2419 | |||
2420 | i = 1; | ||
2421 | if (likely(!PageCompound(page))) | ||
2422 | i <<= cachep->gfporder; | ||
2408 | do { | 2423 | do { |
2409 | page_set_cache(page, cachep); | 2424 | page_set_cache(page, cachep); |
2410 | page_set_slab(page, slabp); | 2425 | page_set_slab(page, slabp); |
@@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2425 | unsigned long ctor_flags; | 2440 | unsigned long ctor_flags; |
2426 | struct kmem_list3 *l3; | 2441 | struct kmem_list3 *l3; |
2427 | 2442 | ||
2428 | /* Be lazy and only check for valid flags here, | 2443 | /* |
2429 | * keeping it out of the critical path in kmem_cache_alloc(). | 2444 | * Be lazy and only check for valid flags here, keeping it out of the |
2445 | * critical path in kmem_cache_alloc(). | ||
2430 | */ | 2446 | */ |
2431 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) | 2447 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) |
2432 | BUG(); | 2448 | BUG(); |
@@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2467 | */ | 2483 | */ |
2468 | kmem_flagcheck(cachep, flags); | 2484 | kmem_flagcheck(cachep, flags); |
2469 | 2485 | ||
2470 | /* Get mem for the objs. | 2486 | /* |
2471 | * Attempt to allocate a physical page from 'nodeid', | 2487 | * Get mem for the objs. Attempt to allocate a physical page from |
2488 | * 'nodeid'. | ||
2472 | */ | 2489 | */ |
2473 | if (!(objp = kmem_getpages(cachep, flags, nodeid))) | 2490 | objp = kmem_getpages(cachep, flags, nodeid); |
2491 | if (!objp) | ||
2474 | goto failed; | 2492 | goto failed; |
2475 | 2493 | ||
2476 | /* Get slab management. */ | 2494 | /* Get slab management. */ |
2477 | if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) | 2495 | slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); |
2496 | if (!slabp) | ||
2478 | goto opps1; | 2497 | goto opps1; |
2479 | 2498 | ||
2480 | slabp->nodeid = nodeid; | 2499 | slabp->nodeid = nodeid; |
@@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2493 | l3->free_objects += cachep->num; | 2512 | l3->free_objects += cachep->num; |
2494 | spin_unlock(&l3->list_lock); | 2513 | spin_unlock(&l3->list_lock); |
2495 | return 1; | 2514 | return 1; |
2496 | opps1: | 2515 | opps1: |
2497 | kmem_freepages(cachep, objp); | 2516 | kmem_freepages(cachep, objp); |
2498 | failed: | 2517 | failed: |
2499 | if (local_flags & __GFP_WAIT) | 2518 | if (local_flags & __GFP_WAIT) |
2500 | local_irq_disable(); | 2519 | local_irq_disable(); |
2501 | return 0; | 2520 | return 0; |
@@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2538 | page = virt_to_page(objp); | 2557 | page = virt_to_page(objp); |
2539 | 2558 | ||
2540 | if (page_get_cache(page) != cachep) { | 2559 | if (page_get_cache(page) != cachep) { |
2541 | printk(KERN_ERR | 2560 | printk(KERN_ERR "mismatch in kmem_cache_free: expected " |
2542 | "mismatch in kmem_cache_free: expected cache %p, got %p\n", | 2561 | "cache %p, got %p\n", |
2543 | page_get_cache(page), cachep); | 2562 | page_get_cache(page), cachep); |
2544 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | 2563 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); |
2545 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), | 2564 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), |
@@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2549 | slabp = page_get_slab(page); | 2568 | slabp = page_get_slab(page); |
2550 | 2569 | ||
2551 | if (cachep->flags & SLAB_RED_ZONE) { | 2570 | if (cachep->flags & SLAB_RED_ZONE) { |
2552 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE | 2571 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || |
2553 | || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | 2572 | *dbg_redzone2(cachep, objp) != RED_ACTIVE) { |
2554 | slab_error(cachep, | 2573 | slab_error(cachep, "double free, or memory outside" |
2555 | "double free, or memory outside" | 2574 | " object was overwritten"); |
2556 | " object was overwritten"); | 2575 | printk(KERN_ERR "%p: redzone 1:0x%lx, " |
2557 | printk(KERN_ERR | 2576 | "redzone 2:0x%lx.\n", |
2558 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
2559 | objp, *dbg_redzone1(cachep, objp), | 2577 | objp, *dbg_redzone1(cachep, objp), |
2560 | *dbg_redzone2(cachep, objp)); | 2578 | *dbg_redzone2(cachep, objp)); |
2561 | } | 2579 | } |
@@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2565 | if (cachep->flags & SLAB_STORE_USER) | 2583 | if (cachep->flags & SLAB_STORE_USER) |
2566 | *dbg_userword(cachep, objp) = caller; | 2584 | *dbg_userword(cachep, objp) = caller; |
2567 | 2585 | ||
2568 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 2586 | objnr = obj_to_index(cachep, slabp, objp); |
2569 | 2587 | ||
2570 | BUG_ON(objnr >= cachep->num); | 2588 | BUG_ON(objnr >= cachep->num); |
2571 | BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); | 2589 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); |
2572 | 2590 | ||
2573 | if (cachep->flags & SLAB_DEBUG_INITIAL) { | 2591 | if (cachep->flags & SLAB_DEBUG_INITIAL) { |
2574 | /* Need to call the slab's constructor so the | 2592 | /* |
2575 | * caller can perform a verify of its state (debugging). | 2593 | * Need to call the slab's constructor so the caller can |
2576 | * Called without the cache-lock held. | 2594 | * perform a verify of its state (debugging). Called without |
2595 | * the cache-lock held. | ||
2577 | */ | 2596 | */ |
2578 | cachep->ctor(objp + obj_offset(cachep), | 2597 | cachep->ctor(objp + obj_offset(cachep), |
2579 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); | 2598 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); |
@@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2586 | } | 2605 | } |
2587 | if (cachep->flags & SLAB_POISON) { | 2606 | if (cachep->flags & SLAB_POISON) { |
2588 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2607 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2589 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { | 2608 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
2590 | store_stackinfo(cachep, objp, (unsigned long)caller); | 2609 | store_stackinfo(cachep, objp, (unsigned long)caller); |
2591 | kernel_map_pages(virt_to_page(objp), | 2610 | kernel_map_pages(virt_to_page(objp), |
2592 | cachep->buffer_size / PAGE_SIZE, 0); | 2611 | cachep->buffer_size / PAGE_SIZE, 0); |
@@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) | |||
2612 | goto bad; | 2631 | goto bad; |
2613 | } | 2632 | } |
2614 | if (entries != cachep->num - slabp->inuse) { | 2633 | if (entries != cachep->num - slabp->inuse) { |
2615 | bad: | 2634 | bad: |
2616 | printk(KERN_ERR | 2635 | printk(KERN_ERR "slab: Internal list corruption detected in " |
2617 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 2636 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
2618 | cachep->name, cachep->num, slabp, slabp->inuse); | 2637 | cachep->name, cachep->num, slabp, slabp->inuse); |
2619 | for (i = 0; | 2638 | for (i = 0; |
2620 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); | 2639 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); |
2621 | i++) { | 2640 | i++) { |
2622 | if ((i % 16) == 0) | 2641 | if (i % 16 == 0) |
2623 | printk("\n%03x:", i); | 2642 | printk("\n%03x:", i); |
2624 | printk(" %02x", ((unsigned char *)slabp)[i]); | 2643 | printk(" %02x", ((unsigned char *)slabp)[i]); |
2625 | } | 2644 | } |
@@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
2641 | 2660 | ||
2642 | check_irq_off(); | 2661 | check_irq_off(); |
2643 | ac = cpu_cache_get(cachep); | 2662 | ac = cpu_cache_get(cachep); |
2644 | retry: | 2663 | retry: |
2645 | batchcount = ac->batchcount; | 2664 | batchcount = ac->batchcount; |
2646 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 2665 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
2647 | /* if there was little recent activity on this | 2666 | /* |
2648 | * cache, then perform only a partial refill. | 2667 | * If there was little recent activity on this cache, then |
2649 | * Otherwise we could generate refill bouncing. | 2668 | * perform only a partial refill. Otherwise we could generate |
2669 | * refill bouncing. | ||
2650 | */ | 2670 | */ |
2651 | batchcount = BATCHREFILL_LIMIT; | 2671 | batchcount = BATCHREFILL_LIMIT; |
2652 | } | 2672 | } |
@@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
2702 | list_add(&slabp->list, &l3->slabs_partial); | 2722 | list_add(&slabp->list, &l3->slabs_partial); |
2703 | } | 2723 | } |
2704 | 2724 | ||
2705 | must_grow: | 2725 | must_grow: |
2706 | l3->free_objects -= ac->avail; | 2726 | l3->free_objects -= ac->avail; |
2707 | alloc_done: | 2727 | alloc_done: |
2708 | spin_unlock(&l3->list_lock); | 2728 | spin_unlock(&l3->list_lock); |
2709 | 2729 | ||
2710 | if (unlikely(!ac->avail)) { | 2730 | if (unlikely(!ac->avail)) { |
2711 | int x; | 2731 | int x; |
2712 | x = cache_grow(cachep, flags, numa_node_id()); | 2732 | x = cache_grow(cachep, flags, numa_node_id()); |
2713 | 2733 | ||
2714 | // cache_grow can reenable interrupts, then ac could change. | 2734 | /* cache_grow can reenable interrupts, then ac could change. */ |
2715 | ac = cpu_cache_get(cachep); | 2735 | ac = cpu_cache_get(cachep); |
2716 | if (!x && ac->avail == 0) // no objects in sight? abort | 2736 | if (!x && ac->avail == 0) /* no objects in sight? abort */ |
2717 | return NULL; | 2737 | return NULL; |
2718 | 2738 | ||
2719 | if (!ac->avail) // objects refilled by interrupt? | 2739 | if (!ac->avail) /* objects refilled by interrupt? */ |
2720 | goto retry; | 2740 | goto retry; |
2721 | } | 2741 | } |
2722 | ac->touched = 1; | 2742 | ac->touched = 1; |
2723 | return ac->entry[--ac->avail]; | 2743 | return ac->entry[--ac->avail]; |
2724 | } | 2744 | } |
2725 | 2745 | ||
2726 | static inline void | 2746 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
2727 | cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) | 2747 | gfp_t flags) |
2728 | { | 2748 | { |
2729 | might_sleep_if(flags & __GFP_WAIT); | 2749 | might_sleep_if(flags & __GFP_WAIT); |
2730 | #if DEBUG | 2750 | #if DEBUG |
@@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) | |||
2733 | } | 2753 | } |
2734 | 2754 | ||
2735 | #if DEBUG | 2755 | #if DEBUG |
2736 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, | 2756 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
2737 | void *objp, void *caller) | 2757 | gfp_t flags, void *objp, void *caller) |
2738 | { | 2758 | { |
2739 | if (!objp) | 2759 | if (!objp) |
2740 | return objp; | 2760 | return objp; |
@@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags | |||
2754 | *dbg_userword(cachep, objp) = caller; | 2774 | *dbg_userword(cachep, objp) = caller; |
2755 | 2775 | ||
2756 | if (cachep->flags & SLAB_RED_ZONE) { | 2776 | if (cachep->flags & SLAB_RED_ZONE) { |
2757 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE | 2777 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || |
2758 | || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | 2778 | *dbg_redzone2(cachep, objp) != RED_INACTIVE) { |
2759 | slab_error(cachep, | 2779 | slab_error(cachep, "double free, or memory outside" |
2760 | "double free, or memory outside" | 2780 | " object was overwritten"); |
2761 | " object was overwritten"); | ||
2762 | printk(KERN_ERR | 2781 | printk(KERN_ERR |
2763 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2782 | "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", |
2764 | objp, *dbg_redzone1(cachep, objp), | 2783 | objp, *dbg_redzone1(cachep, objp), |
2765 | *dbg_redzone2(cachep, objp)); | 2784 | *dbg_redzone2(cachep, objp)); |
2766 | } | 2785 | } |
2767 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2786 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
2768 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2787 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
@@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
2809 | return objp; | 2828 | return objp; |
2810 | } | 2829 | } |
2811 | 2830 | ||
2812 | static __always_inline void * | 2831 | static __always_inline void *__cache_alloc(struct kmem_cache *cachep, |
2813 | __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | 2832 | gfp_t flags, void *caller) |
2814 | { | 2833 | { |
2815 | unsigned long save_flags; | 2834 | unsigned long save_flags; |
2816 | void *objp; | 2835 | void *objp; |
@@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
2830 | /* | 2849 | /* |
2831 | * A interface to enable slab creation on nodeid | 2850 | * A interface to enable slab creation on nodeid |
2832 | */ | 2851 | */ |
2833 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 2852 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
2853 | int nodeid) | ||
2834 | { | 2854 | { |
2835 | struct list_head *entry; | 2855 | struct list_head *entry; |
2836 | struct slab *slabp; | 2856 | struct slab *slabp; |
@@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
2841 | l3 = cachep->nodelists[nodeid]; | 2861 | l3 = cachep->nodelists[nodeid]; |
2842 | BUG_ON(!l3); | 2862 | BUG_ON(!l3); |
2843 | 2863 | ||
2844 | retry: | 2864 | retry: |
2845 | check_irq_off(); | 2865 | check_irq_off(); |
2846 | spin_lock(&l3->list_lock); | 2866 | spin_lock(&l3->list_lock); |
2847 | entry = l3->slabs_partial.next; | 2867 | entry = l3->slabs_partial.next; |
@@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
2868 | /* move slabp to correct slabp list: */ | 2888 | /* move slabp to correct slabp list: */ |
2869 | list_del(&slabp->list); | 2889 | list_del(&slabp->list); |
2870 | 2890 | ||
2871 | if (slabp->free == BUFCTL_END) { | 2891 | if (slabp->free == BUFCTL_END) |
2872 | list_add(&slabp->list, &l3->slabs_full); | 2892 | list_add(&slabp->list, &l3->slabs_full); |
2873 | } else { | 2893 | else |
2874 | list_add(&slabp->list, &l3->slabs_partial); | 2894 | list_add(&slabp->list, &l3->slabs_partial); |
2875 | } | ||
2876 | 2895 | ||
2877 | spin_unlock(&l3->list_lock); | 2896 | spin_unlock(&l3->list_lock); |
2878 | goto done; | 2897 | goto done; |
2879 | 2898 | ||
2880 | must_grow: | 2899 | must_grow: |
2881 | spin_unlock(&l3->list_lock); | 2900 | spin_unlock(&l3->list_lock); |
2882 | x = cache_grow(cachep, flags, nodeid); | 2901 | x = cache_grow(cachep, flags, nodeid); |
2883 | 2902 | ||
@@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
2885 | return NULL; | 2904 | return NULL; |
2886 | 2905 | ||
2887 | goto retry; | 2906 | goto retry; |
2888 | done: | 2907 | done: |
2889 | return obj; | 2908 | return obj; |
2890 | } | 2909 | } |
2891 | #endif | 2910 | #endif |
@@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
2958 | } | 2977 | } |
2959 | 2978 | ||
2960 | free_block(cachep, ac->entry, batchcount, node); | 2979 | free_block(cachep, ac->entry, batchcount, node); |
2961 | free_done: | 2980 | free_done: |
2962 | #if STATS | 2981 | #if STATS |
2963 | { | 2982 | { |
2964 | int i = 0; | 2983 | int i = 0; |
@@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
2979 | #endif | 2998 | #endif |
2980 | spin_unlock(&l3->list_lock); | 2999 | spin_unlock(&l3->list_lock); |
2981 | ac->avail -= batchcount; | 3000 | ac->avail -= batchcount; |
2982 | memmove(ac->entry, &(ac->entry[batchcount]), | 3001 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); |
2983 | sizeof(void *) * ac->avail); | ||
2984 | } | 3002 | } |
2985 | 3003 | ||
2986 | /* | 3004 | /* |
2987 | * __cache_free | 3005 | * Release an obj back to its cache. If the obj has a constructed state, it must |
2988 | * Release an obj back to its cache. If the obj has a constructed | 3006 | * be in this state _before_ it is released. Called with disabled ints. |
2989 | * state, it must be in this state _before_ it is released. | ||
2990 | * | ||
2991 | * Called with disabled ints. | ||
2992 | */ | 3007 | */ |
2993 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) | 3008 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) |
2994 | { | 3009 | { |
@@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3007 | if (unlikely(slabp->nodeid != numa_node_id())) { | 3022 | if (unlikely(slabp->nodeid != numa_node_id())) { |
3008 | struct array_cache *alien = NULL; | 3023 | struct array_cache *alien = NULL; |
3009 | int nodeid = slabp->nodeid; | 3024 | int nodeid = slabp->nodeid; |
3010 | struct kmem_list3 *l3 = | 3025 | struct kmem_list3 *l3; |
3011 | cachep->nodelists[numa_node_id()]; | ||
3012 | 3026 | ||
3027 | l3 = cachep->nodelists[numa_node_id()]; | ||
3013 | STATS_INC_NODEFREES(cachep); | 3028 | STATS_INC_NODEFREES(cachep); |
3014 | if (l3->alien && l3->alien[nodeid]) { | 3029 | if (l3->alien && l3->alien[nodeid]) { |
3015 | alien = l3->alien[nodeid]; | 3030 | alien = l3->alien[nodeid]; |
@@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) | |||
3093 | if (unlikely(page_get_cache(page) != cachep)) | 3108 | if (unlikely(page_get_cache(page) != cachep)) |
3094 | goto out; | 3109 | goto out; |
3095 | return 1; | 3110 | return 1; |
3096 | out: | 3111 | out: |
3097 | return 0; | 3112 | return 0; |
3098 | } | 3113 | } |
3099 | 3114 | ||
@@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3119 | local_irq_save(save_flags); | 3134 | local_irq_save(save_flags); |
3120 | 3135 | ||
3121 | if (nodeid == -1 || nodeid == numa_node_id() || | 3136 | if (nodeid == -1 || nodeid == numa_node_id() || |
3122 | !cachep->nodelists[nodeid]) | 3137 | !cachep->nodelists[nodeid]) |
3123 | ptr = ____cache_alloc(cachep, flags); | 3138 | ptr = ____cache_alloc(cachep, flags); |
3124 | else | 3139 | else |
3125 | ptr = __cache_alloc_node(cachep, flags, nodeid); | 3140 | ptr = __cache_alloc_node(cachep, flags, nodeid); |
@@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node); | |||
3148 | * kmalloc - allocate memory | 3163 | * kmalloc - allocate memory |
3149 | * @size: how many bytes of memory are required. | 3164 | * @size: how many bytes of memory are required. |
3150 | * @flags: the type of memory to allocate. | 3165 | * @flags: the type of memory to allocate. |
3166 | * @caller: function caller for debug tracking of the caller | ||
3151 | * | 3167 | * |
3152 | * kmalloc is the normal method of allocating memory | 3168 | * kmalloc is the normal method of allocating memory |
3153 | * in the kernel. | 3169 | * in the kernel. |
@@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size) | |||
3236 | /* Catch derefs w/o wrappers */ | 3252 | /* Catch derefs w/o wrappers */ |
3237 | return (void *)(~(unsigned long)pdata); | 3253 | return (void *)(~(unsigned long)pdata); |
3238 | 3254 | ||
3239 | unwind_oom: | 3255 | unwind_oom: |
3240 | while (--i >= 0) { | 3256 | while (--i >= 0) { |
3241 | if (!cpu_possible(i)) | 3257 | if (!cpu_possible(i)) |
3242 | continue; | 3258 | continue; |
@@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3339 | struct array_cache *nc = NULL, *new; | 3355 | struct array_cache *nc = NULL, *new; |
3340 | struct array_cache **new_alien = NULL; | 3356 | struct array_cache **new_alien = NULL; |
3341 | #ifdef CONFIG_NUMA | 3357 | #ifdef CONFIG_NUMA |
3342 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) | 3358 | new_alien = alloc_alien_cache(node, cachep->limit); |
3359 | if (!new_alien) | ||
3343 | goto fail; | 3360 | goto fail; |
3344 | #endif | 3361 | #endif |
3345 | if (!(new = alloc_arraycache(node, (cachep->shared * | 3362 | new = alloc_arraycache(node, cachep->shared*cachep->batchcount, |
3346 | cachep->batchcount), | 3363 | 0xbaadf00d); |
3347 | 0xbaadf00d))) | 3364 | if (!new) |
3348 | goto fail; | 3365 | goto fail; |
3349 | if ((l3 = cachep->nodelists[node])) { | 3366 | l3 = cachep->nodelists[node]; |
3350 | 3367 | if (l3) { | |
3351 | spin_lock_irq(&l3->list_lock); | 3368 | spin_lock_irq(&l3->list_lock); |
3352 | 3369 | ||
3353 | if ((nc = cachep->nodelists[node]->shared)) | 3370 | nc = cachep->nodelists[node]->shared; |
3371 | if (nc) | ||
3354 | free_block(cachep, nc->entry, nc->avail, node); | 3372 | free_block(cachep, nc->entry, nc->avail, node); |
3355 | 3373 | ||
3356 | l3->shared = new; | 3374 | l3->shared = new; |
@@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
3359 | new_alien = NULL; | 3377 | new_alien = NULL; |
3360 | } | 3378 | } |
3361 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3379 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3362 | cachep->batchcount + cachep->num; | 3380 | cachep->batchcount + cachep->num; |
3363 | spin_unlock_irq(&l3->list_lock); | 3381 | spin_unlock_irq(&l3->list_lock); |
3364 | kfree(nc); | 3382 | kfree(nc); |
3365 | free_alien_cache(new_alien); | 3383 | free_alien_cache(new_alien); |
3366 | continue; | 3384 | continue; |
3367 | } | 3385 | } |
3368 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), | 3386 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); |
3369 | GFP_KERNEL, node))) | 3387 | if (!l3) |
3370 | goto fail; | 3388 | goto fail; |
3371 | 3389 | ||
3372 | kmem_list3_init(l3); | 3390 | kmem_list3_init(l3); |
3373 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 3391 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
3374 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | 3392 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
3375 | l3->shared = new; | 3393 | l3->shared = new; |
3376 | l3->alien = new_alien; | 3394 | l3->alien = new_alien; |
3377 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3395 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3378 | cachep->batchcount + cachep->num; | 3396 | cachep->batchcount + cachep->num; |
3379 | cachep->nodelists[node] = l3; | 3397 | cachep->nodelists[node] = l3; |
3380 | } | 3398 | } |
3381 | return err; | 3399 | return err; |
3382 | fail: | 3400 | fail: |
3383 | err = -ENOMEM; | 3401 | err = -ENOMEM; |
3384 | return err; | 3402 | return err; |
3385 | } | 3403 | } |
@@ -3391,7 +3409,7 @@ struct ccupdate_struct { | |||
3391 | 3409 | ||
3392 | static void do_ccupdate_local(void *info) | 3410 | static void do_ccupdate_local(void *info) |
3393 | { | 3411 | { |
3394 | struct ccupdate_struct *new = (struct ccupdate_struct *)info; | 3412 | struct ccupdate_struct *new = info; |
3395 | struct array_cache *old; | 3413 | struct array_cache *old; |
3396 | 3414 | ||
3397 | check_irq_off(); | 3415 | check_irq_off(); |
@@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info) | |||
3401 | new->new[smp_processor_id()] = old; | 3419 | new->new[smp_processor_id()] = old; |
3402 | } | 3420 | } |
3403 | 3421 | ||
3404 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, | 3422 | /* Always called with the cache_chain_mutex held */ |
3405 | int shared) | 3423 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3424 | int batchcount, int shared) | ||
3406 | { | 3425 | { |
3407 | struct ccupdate_struct new; | 3426 | struct ccupdate_struct new; |
3408 | int i, err; | 3427 | int i, err; |
3409 | 3428 | ||
3410 | memset(&new.new, 0, sizeof(new.new)); | 3429 | memset(&new.new, 0, sizeof(new.new)); |
3411 | for_each_online_cpu(i) { | 3430 | for_each_online_cpu(i) { |
3412 | new.new[i] = | 3431 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, |
3413 | alloc_arraycache(cpu_to_node(i), limit, batchcount); | 3432 | batchcount); |
3414 | if (!new.new[i]) { | 3433 | if (!new.new[i]) { |
3415 | for (i--; i >= 0; i--) | 3434 | for (i--; i >= 0; i--) |
3416 | kfree(new.new[i]); | 3435 | kfree(new.new[i]); |
@@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount | |||
3419 | } | 3438 | } |
3420 | new.cachep = cachep; | 3439 | new.cachep = cachep; |
3421 | 3440 | ||
3422 | smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); | 3441 | on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); |
3423 | 3442 | ||
3424 | check_irq_on(); | 3443 | check_irq_on(); |
3425 | spin_lock(&cachep->spinlock); | ||
3426 | cachep->batchcount = batchcount; | 3444 | cachep->batchcount = batchcount; |
3427 | cachep->limit = limit; | 3445 | cachep->limit = limit; |
3428 | cachep->shared = shared; | 3446 | cachep->shared = shared; |
3429 | spin_unlock(&cachep->spinlock); | ||
3430 | 3447 | ||
3431 | for_each_online_cpu(i) { | 3448 | for_each_online_cpu(i) { |
3432 | struct array_cache *ccold = new.new[i]; | 3449 | struct array_cache *ccold = new.new[i]; |
@@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount | |||
3447 | return 0; | 3464 | return 0; |
3448 | } | 3465 | } |
3449 | 3466 | ||
3467 | /* Called with cache_chain_mutex held always */ | ||
3450 | static void enable_cpucache(struct kmem_cache *cachep) | 3468 | static void enable_cpucache(struct kmem_cache *cachep) |
3451 | { | 3469 | { |
3452 | int err; | 3470 | int err; |
3453 | int limit, shared; | 3471 | int limit, shared; |
3454 | 3472 | ||
3455 | /* The head array serves three purposes: | 3473 | /* |
3474 | * The head array serves three purposes: | ||
3456 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 3475 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
3457 | * - reduce the number of spinlock operations. | 3476 | * - reduce the number of spinlock operations. |
3458 | * - reduce the number of linked list operations on the slab and | 3477 | * - reduce the number of linked list operations on the slab and |
3459 | * bufctl chains: array operations are cheaper. | 3478 | * bufctl chains: array operations are cheaper. |
3460 | * The numbers are guessed, we should auto-tune as described by | 3479 | * The numbers are guessed, we should auto-tune as described by |
3461 | * Bonwick. | 3480 | * Bonwick. |
@@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3471 | else | 3490 | else |
3472 | limit = 120; | 3491 | limit = 120; |
3473 | 3492 | ||
3474 | /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound | 3493 | /* |
3494 | * CPU bound tasks (e.g. network routing) can exhibit cpu bound | ||
3475 | * allocation behaviour: Most allocs on one cpu, most free operations | 3495 | * allocation behaviour: Most allocs on one cpu, most free operations |
3476 | * on another cpu. For these cases, an efficient object passing between | 3496 | * on another cpu. For these cases, an efficient object passing between |
3477 | * cpus is necessary. This is provided by a shared array. The array | 3497 | * cpus is necessary. This is provided by a shared array. The array |
@@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3486 | #endif | 3506 | #endif |
3487 | 3507 | ||
3488 | #if DEBUG | 3508 | #if DEBUG |
3489 | /* With debugging enabled, large batchcount lead to excessively | 3509 | /* |
3490 | * long periods with disabled local interrupts. Limit the | 3510 | * With debugging enabled, large batchcount lead to excessively long |
3491 | * batchcount | 3511 | * periods with disabled local interrupts. Limit the batchcount |
3492 | */ | 3512 | */ |
3493 | if (limit > 32) | 3513 | if (limit > 32) |
3494 | limit = 32; | 3514 | limit = 32; |
@@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3499 | cachep->name, -err); | 3519 | cachep->name, -err); |
3500 | } | 3520 | } |
3501 | 3521 | ||
3502 | static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, | 3522 | /* |
3503 | int force, int node) | 3523 | * Drain an array if it contains any elements taking the l3 lock only if |
3524 | * necessary. Note that the l3 listlock also protects the array_cache | ||
3525 | * if drain_array() is used on the shared array. | ||
3526 | */ | ||
3527 | void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | ||
3528 | struct array_cache *ac, int force, int node) | ||
3504 | { | 3529 | { |
3505 | int tofree; | 3530 | int tofree; |
3506 | 3531 | ||
3507 | check_spinlock_acquired_node(cachep, node); | 3532 | if (!ac || !ac->avail) |
3533 | return; | ||
3508 | if (ac->touched && !force) { | 3534 | if (ac->touched && !force) { |
3509 | ac->touched = 0; | 3535 | ac->touched = 0; |
3510 | } else if (ac->avail) { | 3536 | } else { |
3511 | tofree = force ? ac->avail : (ac->limit + 4) / 5; | 3537 | spin_lock_irq(&l3->list_lock); |
3512 | if (tofree > ac->avail) { | 3538 | if (ac->avail) { |
3513 | tofree = (ac->avail + 1) / 2; | 3539 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
3540 | if (tofree > ac->avail) | ||
3541 | tofree = (ac->avail + 1) / 2; | ||
3542 | free_block(cachep, ac->entry, tofree, node); | ||
3543 | ac->avail -= tofree; | ||
3544 | memmove(ac->entry, &(ac->entry[tofree]), | ||
3545 | sizeof(void *) * ac->avail); | ||
3514 | } | 3546 | } |
3515 | free_block(cachep, ac->entry, tofree, node); | 3547 | spin_unlock_irq(&l3->list_lock); |
3516 | ac->avail -= tofree; | ||
3517 | memmove(ac->entry, &(ac->entry[tofree]), | ||
3518 | sizeof(void *) * ac->avail); | ||
3519 | } | 3548 | } |
3520 | } | 3549 | } |
3521 | 3550 | ||
@@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac | |||
3528 | * - clear the per-cpu caches for this CPU. | 3557 | * - clear the per-cpu caches for this CPU. |
3529 | * - return freeable pages to the main free memory pool. | 3558 | * - return freeable pages to the main free memory pool. |
3530 | * | 3559 | * |
3531 | * If we cannot acquire the cache chain mutex then just give up - we'll | 3560 | * If we cannot acquire the cache chain mutex then just give up - we'll try |
3532 | * try again on the next iteration. | 3561 | * again on the next iteration. |
3533 | */ | 3562 | */ |
3534 | static void cache_reap(void *unused) | 3563 | static void cache_reap(void *unused) |
3535 | { | 3564 | { |
3536 | struct list_head *walk; | 3565 | struct list_head *walk; |
3537 | struct kmem_list3 *l3; | 3566 | struct kmem_list3 *l3; |
3567 | int node = numa_node_id(); | ||
3538 | 3568 | ||
3539 | if (!mutex_trylock(&cache_chain_mutex)) { | 3569 | if (!mutex_trylock(&cache_chain_mutex)) { |
3540 | /* Give up. Setup the next iteration. */ | 3570 | /* Give up. Setup the next iteration. */ |
@@ -3550,65 +3580,72 @@ static void cache_reap(void *unused) | |||
3550 | struct slab *slabp; | 3580 | struct slab *slabp; |
3551 | 3581 | ||
3552 | searchp = list_entry(walk, struct kmem_cache, next); | 3582 | searchp = list_entry(walk, struct kmem_cache, next); |
3553 | |||
3554 | if (searchp->flags & SLAB_NO_REAP) | ||
3555 | goto next; | ||
3556 | |||
3557 | check_irq_on(); | 3583 | check_irq_on(); |
3558 | 3584 | ||
3559 | l3 = searchp->nodelists[numa_node_id()]; | 3585 | /* |
3586 | * We only take the l3 lock if absolutely necessary and we | ||
3587 | * have established with reasonable certainty that | ||
3588 | * we can do some work if the lock was obtained. | ||
3589 | */ | ||
3590 | l3 = searchp->nodelists[node]; | ||
3591 | |||
3560 | reap_alien(searchp, l3); | 3592 | reap_alien(searchp, l3); |
3561 | spin_lock_irq(&l3->list_lock); | ||
3562 | 3593 | ||
3563 | drain_array_locked(searchp, cpu_cache_get(searchp), 0, | 3594 | drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); |
3564 | numa_node_id()); | ||
3565 | 3595 | ||
3596 | /* | ||
3597 | * These are racy checks but it does not matter | ||
3598 | * if we skip one check or scan twice. | ||
3599 | */ | ||
3566 | if (time_after(l3->next_reap, jiffies)) | 3600 | if (time_after(l3->next_reap, jiffies)) |
3567 | goto next_unlock; | 3601 | goto next; |
3568 | 3602 | ||
3569 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3; | 3603 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3; |
3570 | 3604 | ||
3571 | if (l3->shared) | 3605 | drain_array(searchp, l3, l3->shared, 0, node); |
3572 | drain_array_locked(searchp, l3->shared, 0, | ||
3573 | numa_node_id()); | ||
3574 | 3606 | ||
3575 | if (l3->free_touched) { | 3607 | if (l3->free_touched) { |
3576 | l3->free_touched = 0; | 3608 | l3->free_touched = 0; |
3577 | goto next_unlock; | 3609 | goto next; |
3578 | } | 3610 | } |
3579 | 3611 | ||
3580 | tofree = | 3612 | tofree = (l3->free_limit + 5 * searchp->num - 1) / |
3581 | (l3->free_limit + 5 * searchp->num - | 3613 | (5 * searchp->num); |
3582 | 1) / (5 * searchp->num); | ||
3583 | do { | 3614 | do { |
3615 | /* | ||
3616 | * Do not lock if there are no free blocks. | ||
3617 | */ | ||
3618 | if (list_empty(&l3->slabs_free)) | ||
3619 | break; | ||
3620 | |||
3621 | spin_lock_irq(&l3->list_lock); | ||
3584 | p = l3->slabs_free.next; | 3622 | p = l3->slabs_free.next; |
3585 | if (p == &(l3->slabs_free)) | 3623 | if (p == &(l3->slabs_free)) { |
3624 | spin_unlock_irq(&l3->list_lock); | ||
3586 | break; | 3625 | break; |
3626 | } | ||
3587 | 3627 | ||
3588 | slabp = list_entry(p, struct slab, list); | 3628 | slabp = list_entry(p, struct slab, list); |
3589 | BUG_ON(slabp->inuse); | 3629 | BUG_ON(slabp->inuse); |
3590 | list_del(&slabp->list); | 3630 | list_del(&slabp->list); |
3591 | STATS_INC_REAPED(searchp); | 3631 | STATS_INC_REAPED(searchp); |
3592 | 3632 | ||
3593 | /* Safe to drop the lock. The slab is no longer | 3633 | /* |
3594 | * linked to the cache. | 3634 | * Safe to drop the lock. The slab is no longer linked |
3595 | * searchp cannot disappear, we hold | 3635 | * to the cache. searchp cannot disappear, we hold |
3596 | * cache_chain_lock | 3636 | * cache_chain_lock |
3597 | */ | 3637 | */ |
3598 | l3->free_objects -= searchp->num; | 3638 | l3->free_objects -= searchp->num; |
3599 | spin_unlock_irq(&l3->list_lock); | 3639 | spin_unlock_irq(&l3->list_lock); |
3600 | slab_destroy(searchp, slabp); | 3640 | slab_destroy(searchp, slabp); |
3601 | spin_lock_irq(&l3->list_lock); | ||
3602 | } while (--tofree > 0); | 3641 | } while (--tofree > 0); |
3603 | next_unlock: | 3642 | next: |
3604 | spin_unlock_irq(&l3->list_lock); | ||
3605 | next: | ||
3606 | cond_resched(); | 3643 | cond_resched(); |
3607 | } | 3644 | } |
3608 | check_irq_on(); | 3645 | check_irq_on(); |
3609 | mutex_unlock(&cache_chain_mutex); | 3646 | mutex_unlock(&cache_chain_mutex); |
3610 | next_reap_node(); | 3647 | next_reap_node(); |
3611 | /* Setup the next iteration */ | 3648 | /* Set up the next iteration */ |
3612 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3649 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
3613 | } | 3650 | } |
3614 | 3651 | ||
@@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
3658 | { | 3695 | { |
3659 | struct kmem_cache *cachep = p; | 3696 | struct kmem_cache *cachep = p; |
3660 | ++*pos; | 3697 | ++*pos; |
3661 | return cachep->next.next == &cache_chain ? NULL | 3698 | return cachep->next.next == &cache_chain ? |
3662 | : list_entry(cachep->next.next, struct kmem_cache, next); | 3699 | NULL : list_entry(cachep->next.next, struct kmem_cache, next); |
3663 | } | 3700 | } |
3664 | 3701 | ||
3665 | static void s_stop(struct seq_file *m, void *p) | 3702 | static void s_stop(struct seq_file *m, void *p) |
@@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p) | |||
3681 | int node; | 3718 | int node; |
3682 | struct kmem_list3 *l3; | 3719 | struct kmem_list3 *l3; |
3683 | 3720 | ||
3684 | spin_lock(&cachep->spinlock); | ||
3685 | active_objs = 0; | 3721 | active_objs = 0; |
3686 | num_slabs = 0; | 3722 | num_slabs = 0; |
3687 | for_each_online_node(node) { | 3723 | for_each_online_node(node) { |
@@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p) | |||
3748 | unsigned long node_frees = cachep->node_frees; | 3784 | unsigned long node_frees = cachep->node_frees; |
3749 | 3785 | ||
3750 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ | 3786 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ |
3751 | %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); | 3787 | %4lu %4lu %4lu %4lu", allocs, high, grown, |
3788 | reaped, errors, max_freeable, node_allocs, | ||
3789 | node_frees); | ||
3752 | } | 3790 | } |
3753 | /* cpu stats */ | 3791 | /* cpu stats */ |
3754 | { | 3792 | { |
@@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p) | |||
3762 | } | 3800 | } |
3763 | #endif | 3801 | #endif |
3764 | seq_putc(m, '\n'); | 3802 | seq_putc(m, '\n'); |
3765 | spin_unlock(&cachep->spinlock); | ||
3766 | return 0; | 3803 | return 0; |
3767 | } | 3804 | } |
3768 | 3805 | ||
@@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3820 | mutex_lock(&cache_chain_mutex); | 3857 | mutex_lock(&cache_chain_mutex); |
3821 | res = -EINVAL; | 3858 | res = -EINVAL; |
3822 | list_for_each(p, &cache_chain) { | 3859 | list_for_each(p, &cache_chain) { |
3823 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, | 3860 | struct kmem_cache *cachep; |
3824 | next); | ||
3825 | 3861 | ||
3862 | cachep = list_entry(p, struct kmem_cache, next); | ||
3826 | if (!strcmp(cachep->name, kbuf)) { | 3863 | if (!strcmp(cachep->name, kbuf)) { |
3827 | if (limit < 1 || | 3864 | if (limit < 1 || batchcount < 1 || |
3828 | batchcount < 1 || | 3865 | batchcount > limit || shared < 0) { |
3829 | batchcount > limit || shared < 0) { | ||
3830 | res = 0; | 3866 | res = 0; |
3831 | } else { | 3867 | } else { |
3832 | res = do_tune_cpucache(cachep, limit, | 3868 | res = do_tune_cpucache(cachep, limit, |
@@ -209,19 +209,18 @@ int lru_add_drain_all(void) | |||
209 | */ | 209 | */ |
210 | void fastcall __page_cache_release(struct page *page) | 210 | void fastcall __page_cache_release(struct page *page) |
211 | { | 211 | { |
212 | unsigned long flags; | 212 | if (PageLRU(page)) { |
213 | struct zone *zone = page_zone(page); | 213 | unsigned long flags; |
214 | struct zone *zone = page_zone(page); | ||
214 | 215 | ||
215 | spin_lock_irqsave(&zone->lru_lock, flags); | 216 | spin_lock_irqsave(&zone->lru_lock, flags); |
216 | if (TestClearPageLRU(page)) | 217 | BUG_ON(!PageLRU(page)); |
218 | __ClearPageLRU(page); | ||
217 | del_page_from_lru(zone, page); | 219 | del_page_from_lru(zone, page); |
218 | if (page_count(page) != 0) | 220 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
219 | page = NULL; | 221 | } |
220 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 222 | free_hot_page(page); |
221 | if (page) | ||
222 | free_hot_page(page); | ||
223 | } | 223 | } |
224 | |||
225 | EXPORT_SYMBOL(__page_cache_release); | 224 | EXPORT_SYMBOL(__page_cache_release); |
226 | 225 | ||
227 | /* | 226 | /* |
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) | |||
245 | pagevec_init(&pages_to_free, cold); | 244 | pagevec_init(&pages_to_free, cold); |
246 | for (i = 0; i < nr; i++) { | 245 | for (i = 0; i < nr; i++) { |
247 | struct page *page = pages[i]; | 246 | struct page *page = pages[i]; |
248 | struct zone *pagezone; | ||
249 | 247 | ||
250 | if (unlikely(PageCompound(page))) { | 248 | if (unlikely(PageCompound(page))) { |
251 | if (zone) { | 249 | if (zone) { |
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) | |||
259 | if (!put_page_testzero(page)) | 257 | if (!put_page_testzero(page)) |
260 | continue; | 258 | continue; |
261 | 259 | ||
262 | pagezone = page_zone(page); | 260 | if (PageLRU(page)) { |
263 | if (pagezone != zone) { | 261 | struct zone *pagezone = page_zone(page); |
264 | if (zone) | 262 | if (pagezone != zone) { |
265 | spin_unlock_irq(&zone->lru_lock); | 263 | if (zone) |
266 | zone = pagezone; | 264 | spin_unlock_irq(&zone->lru_lock); |
267 | spin_lock_irq(&zone->lru_lock); | 265 | zone = pagezone; |
268 | } | 266 | spin_lock_irq(&zone->lru_lock); |
269 | if (TestClearPageLRU(page)) | 267 | } |
268 | BUG_ON(!PageLRU(page)); | ||
269 | __ClearPageLRU(page); | ||
270 | del_page_from_lru(zone, page); | 270 | del_page_from_lru(zone, page); |
271 | if (page_count(page) == 0) { | 271 | } |
272 | if (!pagevec_add(&pages_to_free, page)) { | 272 | |
273 | if (!pagevec_add(&pages_to_free, page)) { | ||
274 | if (zone) { | ||
273 | spin_unlock_irq(&zone->lru_lock); | 275 | spin_unlock_irq(&zone->lru_lock); |
274 | __pagevec_free(&pages_to_free); | 276 | zone = NULL; |
275 | pagevec_reinit(&pages_to_free); | ||
276 | zone = NULL; /* No lock is held */ | ||
277 | } | 277 | } |
278 | } | 278 | __pagevec_free(&pages_to_free); |
279 | pagevec_reinit(&pages_to_free); | ||
280 | } | ||
279 | } | 281 | } |
280 | if (zone) | 282 | if (zone) |
281 | spin_unlock_irq(&zone->lru_lock); | 283 | spin_unlock_irq(&zone->lru_lock); |
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
343 | zone = pagezone; | 345 | zone = pagezone; |
344 | spin_lock_irq(&zone->lru_lock); | 346 | spin_lock_irq(&zone->lru_lock); |
345 | } | 347 | } |
346 | if (TestSetPageLRU(page)) | 348 | BUG_ON(PageLRU(page)); |
347 | BUG(); | 349 | SetPageLRU(page); |
348 | add_page_to_inactive_list(zone, page); | 350 | add_page_to_inactive_list(zone, page); |
349 | } | 351 | } |
350 | if (zone) | 352 | if (zone) |
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec) | |||
370 | zone = pagezone; | 372 | zone = pagezone; |
371 | spin_lock_irq(&zone->lru_lock); | 373 | spin_lock_irq(&zone->lru_lock); |
372 | } | 374 | } |
373 | if (TestSetPageLRU(page)) | 375 | BUG_ON(PageLRU(page)); |
374 | BUG(); | 376 | SetPageLRU(page); |
375 | if (TestSetPageActive(page)) | 377 | BUG_ON(PageActive(page)); |
376 | BUG(); | 378 | SetPageActive(page); |
377 | add_page_to_active_list(zone, page); | 379 | add_page_to_active_list(zone, page); |
378 | } | 380 | } |
379 | if (zone) | 381 | if (zone) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e16..d7af296833 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/pagevec.h> | 17 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | ||
18 | 19 | ||
19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
20 | 21 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d073..365ed6ff18 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
116 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 116 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
117 | else if (offset == last_in_cluster) { | 117 | else if (offset == last_in_cluster) { |
118 | spin_lock(&swap_lock); | 118 | spin_lock(&swap_lock); |
119 | si->cluster_next = offset-SWAPFILE_CLUSTER-1; | 119 | si->cluster_next = offset-SWAPFILE_CLUSTER+1; |
120 | goto cluster; | 120 | goto cluster; |
121 | } | 121 | } |
122 | if (unlikely(--latency_ration < 0)) { | 122 | if (unlikely(--latency_ration < 0)) { |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3aa02..fd572bbdc9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -33,39 +33,21 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/rwsem.h> | 35 | #include <linux/rwsem.h> |
36 | #include <linux/delay.h> | ||
36 | 37 | ||
37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
38 | #include <asm/div64.h> | 39 | #include <asm/div64.h> |
39 | 40 | ||
40 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
41 | 42 | ||
42 | /* possible outcome of pageout() */ | 43 | #include "internal.h" |
43 | typedef enum { | ||
44 | /* failed to write page out, page is locked */ | ||
45 | PAGE_KEEP, | ||
46 | /* move page to the active list, page is locked */ | ||
47 | PAGE_ACTIVATE, | ||
48 | /* page has been sent to the disk successfully, page is unlocked */ | ||
49 | PAGE_SUCCESS, | ||
50 | /* page is clean and locked */ | ||
51 | PAGE_CLEAN, | ||
52 | } pageout_t; | ||
53 | 44 | ||
54 | struct scan_control { | 45 | struct scan_control { |
55 | /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ | ||
56 | unsigned long nr_to_scan; | ||
57 | |||
58 | /* Incremented by the number of inactive pages that were scanned */ | 46 | /* Incremented by the number of inactive pages that were scanned */ |
59 | unsigned long nr_scanned; | 47 | unsigned long nr_scanned; |
60 | 48 | ||
61 | /* Incremented by the number of pages reclaimed */ | ||
62 | unsigned long nr_reclaimed; | ||
63 | |||
64 | unsigned long nr_mapped; /* From page_state */ | 49 | unsigned long nr_mapped; /* From page_state */ |
65 | 50 | ||
66 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ | ||
67 | unsigned int priority; | ||
68 | |||
69 | /* This context's GFP mask */ | 51 | /* This context's GFP mask */ |
70 | gfp_t gfp_mask; | 52 | gfp_t gfp_mask; |
71 | 53 | ||
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); | |||
183 | * | 165 | * |
184 | * Returns the number of slab objects which we shrunk. | 166 | * Returns the number of slab objects which we shrunk. |
185 | */ | 167 | */ |
186 | int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) | 168 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, |
169 | unsigned long lru_pages) | ||
187 | { | 170 | { |
188 | struct shrinker *shrinker; | 171 | struct shrinker *shrinker; |
189 | int ret = 0; | 172 | unsigned long ret = 0; |
190 | 173 | ||
191 | if (scanned == 0) | 174 | if (scanned == 0) |
192 | scanned = SWAP_CLUSTER_MAX; | 175 | scanned = SWAP_CLUSTER_MAX; |
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, | |||
306 | } | 289 | } |
307 | 290 | ||
308 | /* | 291 | /* |
309 | * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). | 292 | * pageout is called by shrink_page_list() for each dirty page. |
293 | * Calls ->writepage(). | ||
310 | */ | 294 | */ |
311 | static pageout_t pageout(struct page *page, struct address_space *mapping) | 295 | pageout_t pageout(struct page *page, struct address_space *mapping) |
312 | { | 296 | { |
313 | /* | 297 | /* |
314 | * If the page is dirty, only perform writeback if that write | 298 | * If the page is dirty, only perform writeback if that write |
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
376 | return PAGE_CLEAN; | 360 | return PAGE_CLEAN; |
377 | } | 361 | } |
378 | 362 | ||
379 | static int remove_mapping(struct address_space *mapping, struct page *page) | 363 | int remove_mapping(struct address_space *mapping, struct page *page) |
380 | { | 364 | { |
381 | if (!mapping) | 365 | if (!mapping) |
382 | return 0; /* truncate got there first */ | 366 | return 0; /* truncate got there first */ |
@@ -414,14 +398,15 @@ cannot_free: | |||
414 | } | 398 | } |
415 | 399 | ||
416 | /* | 400 | /* |
417 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed | 401 | * shrink_page_list() returns the number of reclaimed pages |
418 | */ | 402 | */ |
419 | static int shrink_list(struct list_head *page_list, struct scan_control *sc) | 403 | static unsigned long shrink_page_list(struct list_head *page_list, |
404 | struct scan_control *sc) | ||
420 | { | 405 | { |
421 | LIST_HEAD(ret_pages); | 406 | LIST_HEAD(ret_pages); |
422 | struct pagevec freed_pvec; | 407 | struct pagevec freed_pvec; |
423 | int pgactivate = 0; | 408 | int pgactivate = 0; |
424 | int reclaimed = 0; | 409 | unsigned long nr_reclaimed = 0; |
425 | 410 | ||
426 | cond_resched(); | 411 | cond_resched(); |
427 | 412 | ||
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
464 | * Anonymous process memory has backing store? | 449 | * Anonymous process memory has backing store? |
465 | * Try to allocate it some swap space here. | 450 | * Try to allocate it some swap space here. |
466 | */ | 451 | */ |
467 | if (PageAnon(page) && !PageSwapCache(page)) { | 452 | if (PageAnon(page) && !PageSwapCache(page)) |
468 | if (!sc->may_swap) | ||
469 | goto keep_locked; | ||
470 | if (!add_to_swap(page, GFP_ATOMIC)) | 453 | if (!add_to_swap(page, GFP_ATOMIC)) |
471 | goto activate_locked; | 454 | goto activate_locked; |
472 | } | ||
473 | #endif /* CONFIG_SWAP */ | 455 | #endif /* CONFIG_SWAP */ |
474 | 456 | ||
475 | mapping = page_mapping(page); | 457 | mapping = page_mapping(page); |
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
481 | * processes. Try to unmap it here. | 463 | * processes. Try to unmap it here. |
482 | */ | 464 | */ |
483 | if (page_mapped(page) && mapping) { | 465 | if (page_mapped(page) && mapping) { |
484 | /* | ||
485 | * No unmapping if we do not swap | ||
486 | */ | ||
487 | if (!sc->may_swap) | ||
488 | goto keep_locked; | ||
489 | |||
490 | switch (try_to_unmap(page, 0)) { | 466 | switch (try_to_unmap(page, 0)) { |
491 | case SWAP_FAIL: | 467 | case SWAP_FAIL: |
492 | goto activate_locked; | 468 | goto activate_locked; |
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
561 | 537 | ||
562 | free_it: | 538 | free_it: |
563 | unlock_page(page); | 539 | unlock_page(page); |
564 | reclaimed++; | 540 | nr_reclaimed++; |
565 | if (!pagevec_add(&freed_pvec, page)) | 541 | if (!pagevec_add(&freed_pvec, page)) |
566 | __pagevec_release_nonlru(&freed_pvec); | 542 | __pagevec_release_nonlru(&freed_pvec); |
567 | continue; | 543 | continue; |
@@ -579,483 +555,8 @@ keep: | |||
579 | if (pagevec_count(&freed_pvec)) | 555 | if (pagevec_count(&freed_pvec)) |
580 | __pagevec_release_nonlru(&freed_pvec); | 556 | __pagevec_release_nonlru(&freed_pvec); |
581 | mod_page_state(pgactivate, pgactivate); | 557 | mod_page_state(pgactivate, pgactivate); |
582 | sc->nr_reclaimed += reclaimed; | 558 | return nr_reclaimed; |
583 | return reclaimed; | ||
584 | } | ||
585 | |||
586 | #ifdef CONFIG_MIGRATION | ||
587 | static inline void move_to_lru(struct page *page) | ||
588 | { | ||
589 | list_del(&page->lru); | ||
590 | if (PageActive(page)) { | ||
591 | /* | ||
592 | * lru_cache_add_active checks that | ||
593 | * the PG_active bit is off. | ||
594 | */ | ||
595 | ClearPageActive(page); | ||
596 | lru_cache_add_active(page); | ||
597 | } else { | ||
598 | lru_cache_add(page); | ||
599 | } | ||
600 | put_page(page); | ||
601 | } | ||
602 | |||
603 | /* | ||
604 | * Add isolated pages on the list back to the LRU. | ||
605 | * | ||
606 | * returns the number of pages put back. | ||
607 | */ | ||
608 | int putback_lru_pages(struct list_head *l) | ||
609 | { | ||
610 | struct page *page; | ||
611 | struct page *page2; | ||
612 | int count = 0; | ||
613 | |||
614 | list_for_each_entry_safe(page, page2, l, lru) { | ||
615 | move_to_lru(page); | ||
616 | count++; | ||
617 | } | ||
618 | return count; | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Non migratable page | ||
623 | */ | ||
624 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
625 | { | ||
626 | return -EIO; | ||
627 | } | ||
628 | EXPORT_SYMBOL(fail_migrate_page); | ||
629 | |||
630 | /* | ||
631 | * swapout a single page | ||
632 | * page is locked upon entry, unlocked on exit | ||
633 | */ | ||
634 | static int swap_page(struct page *page) | ||
635 | { | ||
636 | struct address_space *mapping = page_mapping(page); | ||
637 | |||
638 | if (page_mapped(page) && mapping) | ||
639 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | ||
640 | goto unlock_retry; | ||
641 | |||
642 | if (PageDirty(page)) { | ||
643 | /* Page is dirty, try to write it out here */ | ||
644 | switch(pageout(page, mapping)) { | ||
645 | case PAGE_KEEP: | ||
646 | case PAGE_ACTIVATE: | ||
647 | goto unlock_retry; | ||
648 | |||
649 | case PAGE_SUCCESS: | ||
650 | goto retry; | ||
651 | |||
652 | case PAGE_CLEAN: | ||
653 | ; /* try to free the page below */ | ||
654 | } | ||
655 | } | ||
656 | |||
657 | if (PagePrivate(page)) { | ||
658 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
659 | (!mapping && page_count(page) == 1)) | ||
660 | goto unlock_retry; | ||
661 | } | ||
662 | |||
663 | if (remove_mapping(mapping, page)) { | ||
664 | /* Success */ | ||
665 | unlock_page(page); | ||
666 | return 0; | ||
667 | } | ||
668 | |||
669 | unlock_retry: | ||
670 | unlock_page(page); | ||
671 | |||
672 | retry: | ||
673 | return -EAGAIN; | ||
674 | } | ||
675 | EXPORT_SYMBOL(swap_page); | ||
676 | |||
677 | /* | ||
678 | * Page migration was first developed in the context of the memory hotplug | ||
679 | * project. The main authors of the migration code are: | ||
680 | * | ||
681 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | ||
682 | * Hirokazu Takahashi <taka@valinux.co.jp> | ||
683 | * Dave Hansen <haveblue@us.ibm.com> | ||
684 | * Christoph Lameter <clameter@sgi.com> | ||
685 | */ | ||
686 | |||
687 | /* | ||
688 | * Remove references for a page and establish the new page with the correct | ||
689 | * basic settings to be able to stop accesses to the page. | ||
690 | */ | ||
691 | int migrate_page_remove_references(struct page *newpage, | ||
692 | struct page *page, int nr_refs) | ||
693 | { | ||
694 | struct address_space *mapping = page_mapping(page); | ||
695 | struct page **radix_pointer; | ||
696 | |||
697 | /* | ||
698 | * Avoid doing any of the following work if the page count | ||
699 | * indicates that the page is in use or truncate has removed | ||
700 | * the page. | ||
701 | */ | ||
702 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
703 | return -EAGAIN; | ||
704 | |||
705 | /* | ||
706 | * Establish swap ptes for anonymous pages or destroy pte | ||
707 | * maps for files. | ||
708 | * | ||
709 | * In order to reestablish file backed mappings the fault handlers | ||
710 | * will take the radix tree_lock which may then be used to stop | ||
711 | * processses from accessing this page until the new page is ready. | ||
712 | * | ||
713 | * A process accessing via a swap pte (an anonymous page) will take a | ||
714 | * page_lock on the old page which will block the process until the | ||
715 | * migration attempt is complete. At that time the PageSwapCache bit | ||
716 | * will be examined. If the page was migrated then the PageSwapCache | ||
717 | * bit will be clear and the operation to retrieve the page will be | ||
718 | * retried which will find the new page in the radix tree. Then a new | ||
719 | * direct mapping may be generated based on the radix tree contents. | ||
720 | * | ||
721 | * If the page was not migrated then the PageSwapCache bit | ||
722 | * is still set and the operation may continue. | ||
723 | */ | ||
724 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
725 | /* A vma has VM_LOCKED set -> Permanent failure */ | ||
726 | return -EPERM; | ||
727 | |||
728 | /* | ||
729 | * Give up if we were unable to remove all mappings. | ||
730 | */ | ||
731 | if (page_mapcount(page)) | ||
732 | return -EAGAIN; | ||
733 | |||
734 | write_lock_irq(&mapping->tree_lock); | ||
735 | |||
736 | radix_pointer = (struct page **)radix_tree_lookup_slot( | ||
737 | &mapping->page_tree, | ||
738 | page_index(page)); | ||
739 | |||
740 | if (!page_mapping(page) || page_count(page) != nr_refs || | ||
741 | *radix_pointer != page) { | ||
742 | write_unlock_irq(&mapping->tree_lock); | ||
743 | return -EAGAIN; | ||
744 | } | ||
745 | |||
746 | /* | ||
747 | * Now we know that no one else is looking at the page. | ||
748 | * | ||
749 | * Certain minimal information about a page must be available | ||
750 | * in order for other subsystems to properly handle the page if they | ||
751 | * find it through the radix tree update before we are finished | ||
752 | * copying the page. | ||
753 | */ | ||
754 | get_page(newpage); | ||
755 | newpage->index = page->index; | ||
756 | newpage->mapping = page->mapping; | ||
757 | if (PageSwapCache(page)) { | ||
758 | SetPageSwapCache(newpage); | ||
759 | set_page_private(newpage, page_private(page)); | ||
760 | } | ||
761 | |||
762 | *radix_pointer = newpage; | ||
763 | __put_page(page); | ||
764 | write_unlock_irq(&mapping->tree_lock); | ||
765 | |||
766 | return 0; | ||
767 | } | ||
768 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
769 | |||
770 | /* | ||
771 | * Copy the page to its new location | ||
772 | */ | ||
773 | void migrate_page_copy(struct page *newpage, struct page *page) | ||
774 | { | ||
775 | copy_highpage(newpage, page); | ||
776 | |||
777 | if (PageError(page)) | ||
778 | SetPageError(newpage); | ||
779 | if (PageReferenced(page)) | ||
780 | SetPageReferenced(newpage); | ||
781 | if (PageUptodate(page)) | ||
782 | SetPageUptodate(newpage); | ||
783 | if (PageActive(page)) | ||
784 | SetPageActive(newpage); | ||
785 | if (PageChecked(page)) | ||
786 | SetPageChecked(newpage); | ||
787 | if (PageMappedToDisk(page)) | ||
788 | SetPageMappedToDisk(newpage); | ||
789 | |||
790 | if (PageDirty(page)) { | ||
791 | clear_page_dirty_for_io(page); | ||
792 | set_page_dirty(newpage); | ||
793 | } | ||
794 | |||
795 | ClearPageSwapCache(page); | ||
796 | ClearPageActive(page); | ||
797 | ClearPagePrivate(page); | ||
798 | set_page_private(page, 0); | ||
799 | page->mapping = NULL; | ||
800 | |||
801 | /* | ||
802 | * If any waiters have accumulated on the new page then | ||
803 | * wake them up. | ||
804 | */ | ||
805 | if (PageWriteback(newpage)) | ||
806 | end_page_writeback(newpage); | ||
807 | } | ||
808 | EXPORT_SYMBOL(migrate_page_copy); | ||
809 | |||
810 | /* | ||
811 | * Common logic to directly migrate a single page suitable for | ||
812 | * pages that do not use PagePrivate. | ||
813 | * | ||
814 | * Pages are locked upon entry and exit. | ||
815 | */ | ||
816 | int migrate_page(struct page *newpage, struct page *page) | ||
817 | { | ||
818 | int rc; | ||
819 | |||
820 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | ||
821 | |||
822 | rc = migrate_page_remove_references(newpage, page, 2); | ||
823 | |||
824 | if (rc) | ||
825 | return rc; | ||
826 | |||
827 | migrate_page_copy(newpage, page); | ||
828 | |||
829 | /* | ||
830 | * Remove auxiliary swap entries and replace | ||
831 | * them with real ptes. | ||
832 | * | ||
833 | * Note that a real pte entry will allow processes that are not | ||
834 | * waiting on the page lock to use the new page via the page tables | ||
835 | * before the new page is unlocked. | ||
836 | */ | ||
837 | remove_from_swap(newpage); | ||
838 | return 0; | ||
839 | } | 559 | } |
840 | EXPORT_SYMBOL(migrate_page); | ||
841 | |||
842 | /* | ||
843 | * migrate_pages | ||
844 | * | ||
845 | * Two lists are passed to this function. The first list | ||
846 | * contains the pages isolated from the LRU to be migrated. | ||
847 | * The second list contains new pages that the pages isolated | ||
848 | * can be moved to. If the second list is NULL then all | ||
849 | * pages are swapped out. | ||
850 | * | ||
851 | * The function returns after 10 attempts or if no pages | ||
852 | * are movable anymore because to has become empty | ||
853 | * or no retryable pages exist anymore. | ||
854 | * | ||
855 | * Return: Number of pages not migrated when "to" ran empty. | ||
856 | */ | ||
857 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
858 | struct list_head *moved, struct list_head *failed) | ||
859 | { | ||
860 | int retry; | ||
861 | int nr_failed = 0; | ||
862 | int pass = 0; | ||
863 | struct page *page; | ||
864 | struct page *page2; | ||
865 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
866 | int rc; | ||
867 | |||
868 | if (!swapwrite) | ||
869 | current->flags |= PF_SWAPWRITE; | ||
870 | |||
871 | redo: | ||
872 | retry = 0; | ||
873 | |||
874 | list_for_each_entry_safe(page, page2, from, lru) { | ||
875 | struct page *newpage = NULL; | ||
876 | struct address_space *mapping; | ||
877 | |||
878 | cond_resched(); | ||
879 | |||
880 | rc = 0; | ||
881 | if (page_count(page) == 1) | ||
882 | /* page was freed from under us. So we are done. */ | ||
883 | goto next; | ||
884 | |||
885 | if (to && list_empty(to)) | ||
886 | break; | ||
887 | |||
888 | /* | ||
889 | * Skip locked pages during the first two passes to give the | ||
890 | * functions holding the lock time to release the page. Later we | ||
891 | * use lock_page() to have a higher chance of acquiring the | ||
892 | * lock. | ||
893 | */ | ||
894 | rc = -EAGAIN; | ||
895 | if (pass > 2) | ||
896 | lock_page(page); | ||
897 | else | ||
898 | if (TestSetPageLocked(page)) | ||
899 | goto next; | ||
900 | |||
901 | /* | ||
902 | * Only wait on writeback if we have already done a pass where | ||
903 | * we we may have triggered writeouts for lots of pages. | ||
904 | */ | ||
905 | if (pass > 0) { | ||
906 | wait_on_page_writeback(page); | ||
907 | } else { | ||
908 | if (PageWriteback(page)) | ||
909 | goto unlock_page; | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Anonymous pages must have swap cache references otherwise | ||
914 | * the information contained in the page maps cannot be | ||
915 | * preserved. | ||
916 | */ | ||
917 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
918 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
919 | rc = -ENOMEM; | ||
920 | goto unlock_page; | ||
921 | } | ||
922 | } | ||
923 | |||
924 | if (!to) { | ||
925 | rc = swap_page(page); | ||
926 | goto next; | ||
927 | } | ||
928 | |||
929 | newpage = lru_to_page(to); | ||
930 | lock_page(newpage); | ||
931 | |||
932 | /* | ||
933 | * Pages are properly locked and writeback is complete. | ||
934 | * Try to migrate the page. | ||
935 | */ | ||
936 | mapping = page_mapping(page); | ||
937 | if (!mapping) | ||
938 | goto unlock_both; | ||
939 | |||
940 | if (mapping->a_ops->migratepage) { | ||
941 | /* | ||
942 | * Most pages have a mapping and most filesystems | ||
943 | * should provide a migration function. Anonymous | ||
944 | * pages are part of swap space which also has its | ||
945 | * own migration function. This is the most common | ||
946 | * path for page migration. | ||
947 | */ | ||
948 | rc = mapping->a_ops->migratepage(newpage, page); | ||
949 | goto unlock_both; | ||
950 | } | ||
951 | |||
952 | /* | ||
953 | * Default handling if a filesystem does not provide | ||
954 | * a migration function. We can only migrate clean | ||
955 | * pages so try to write out any dirty pages first. | ||
956 | */ | ||
957 | if (PageDirty(page)) { | ||
958 | switch (pageout(page, mapping)) { | ||
959 | case PAGE_KEEP: | ||
960 | case PAGE_ACTIVATE: | ||
961 | goto unlock_both; | ||
962 | |||
963 | case PAGE_SUCCESS: | ||
964 | unlock_page(newpage); | ||
965 | goto next; | ||
966 | |||
967 | case PAGE_CLEAN: | ||
968 | ; /* try to migrate the page below */ | ||
969 | } | ||
970 | } | ||
971 | |||
972 | /* | ||
973 | * Buffers are managed in a filesystem specific way. | ||
974 | * We must have no buffers or drop them. | ||
975 | */ | ||
976 | if (!page_has_buffers(page) || | ||
977 | try_to_release_page(page, GFP_KERNEL)) { | ||
978 | rc = migrate_page(newpage, page); | ||
979 | goto unlock_both; | ||
980 | } | ||
981 | |||
982 | /* | ||
983 | * On early passes with mapped pages simply | ||
984 | * retry. There may be a lock held for some | ||
985 | * buffers that may go away. Later | ||
986 | * swap them out. | ||
987 | */ | ||
988 | if (pass > 4) { | ||
989 | /* | ||
990 | * Persistently unable to drop buffers..... As a | ||
991 | * measure of last resort we fall back to | ||
992 | * swap_page(). | ||
993 | */ | ||
994 | unlock_page(newpage); | ||
995 | newpage = NULL; | ||
996 | rc = swap_page(page); | ||
997 | goto next; | ||
998 | } | ||
999 | |||
1000 | unlock_both: | ||
1001 | unlock_page(newpage); | ||
1002 | |||
1003 | unlock_page: | ||
1004 | unlock_page(page); | ||
1005 | |||
1006 | next: | ||
1007 | if (rc == -EAGAIN) { | ||
1008 | retry++; | ||
1009 | } else if (rc) { | ||
1010 | /* Permanent failure */ | ||
1011 | list_move(&page->lru, failed); | ||
1012 | nr_failed++; | ||
1013 | } else { | ||
1014 | if (newpage) { | ||
1015 | /* Successful migration. Return page to LRU */ | ||
1016 | move_to_lru(newpage); | ||
1017 | } | ||
1018 | list_move(&page->lru, moved); | ||
1019 | } | ||
1020 | } | ||
1021 | if (retry && pass++ < 10) | ||
1022 | goto redo; | ||
1023 | |||
1024 | if (!swapwrite) | ||
1025 | current->flags &= ~PF_SWAPWRITE; | ||
1026 | |||
1027 | return nr_failed + retry; | ||
1028 | } | ||
1029 | |||
1030 | /* | ||
1031 | * Isolate one page from the LRU lists and put it on the | ||
1032 | * indicated list with elevated refcount. | ||
1033 | * | ||
1034 | * Result: | ||
1035 | * 0 = page not on LRU list | ||
1036 | * 1 = page removed from LRU list and added to the specified list. | ||
1037 | */ | ||
1038 | int isolate_lru_page(struct page *page) | ||
1039 | { | ||
1040 | int ret = 0; | ||
1041 | |||
1042 | if (PageLRU(page)) { | ||
1043 | struct zone *zone = page_zone(page); | ||
1044 | spin_lock_irq(&zone->lru_lock); | ||
1045 | if (TestClearPageLRU(page)) { | ||
1046 | ret = 1; | ||
1047 | get_page(page); | ||
1048 | if (PageActive(page)) | ||
1049 | del_page_from_active_list(zone, page); | ||
1050 | else | ||
1051 | del_page_from_inactive_list(zone, page); | ||
1052 | } | ||
1053 | spin_unlock_irq(&zone->lru_lock); | ||
1054 | } | ||
1055 | |||
1056 | return ret; | ||
1057 | } | ||
1058 | #endif | ||
1059 | 560 | ||
1060 | /* | 561 | /* |
1061 | * zone->lru_lock is heavily contended. Some of the functions that | 562 | * zone->lru_lock is heavily contended. Some of the functions that |
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page) | |||
1074 | * | 575 | * |
1075 | * returns how many pages were moved onto *@dst. | 576 | * returns how many pages were moved onto *@dst. |
1076 | */ | 577 | */ |
1077 | static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | 578 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1078 | struct list_head *dst, int *scanned) | 579 | struct list_head *src, struct list_head *dst, |
580 | unsigned long *scanned) | ||
1079 | { | 581 | { |
1080 | int nr_taken = 0; | 582 | unsigned long nr_taken = 0; |
1081 | struct page *page; | 583 | struct page *page; |
1082 | int scan = 0; | 584 | unsigned long scan; |
1083 | 585 | ||
1084 | while (scan++ < nr_to_scan && !list_empty(src)) { | 586 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
587 | struct list_head *target; | ||
1085 | page = lru_to_page(src); | 588 | page = lru_to_page(src); |
1086 | prefetchw_prev_lru_page(page, src, flags); | 589 | prefetchw_prev_lru_page(page, src, flags); |
1087 | 590 | ||
1088 | if (!TestClearPageLRU(page)) | 591 | BUG_ON(!PageLRU(page)); |
1089 | BUG(); | 592 | |
1090 | list_del(&page->lru); | 593 | list_del(&page->lru); |
1091 | if (get_page_testone(page)) { | 594 | target = src; |
595 | if (likely(get_page_unless_zero(page))) { | ||
1092 | /* | 596 | /* |
1093 | * It is being freed elsewhere | 597 | * Be careful not to clear PageLRU until after we're |
598 | * sure the page is not being freed elsewhere -- the | ||
599 | * page release code relies on it. | ||
1094 | */ | 600 | */ |
1095 | __put_page(page); | 601 | ClearPageLRU(page); |
1096 | SetPageLRU(page); | 602 | target = dst; |
1097 | list_add(&page->lru, src); | ||
1098 | continue; | ||
1099 | } else { | ||
1100 | list_add(&page->lru, dst); | ||
1101 | nr_taken++; | 603 | nr_taken++; |
1102 | } | 604 | } /* else it is being freed elsewhere */ |
605 | |||
606 | list_add(&page->lru, target); | ||
1103 | } | 607 | } |
1104 | 608 | ||
1105 | *scanned = scan; | 609 | *scanned = scan; |
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | |||
1107 | } | 611 | } |
1108 | 612 | ||
1109 | /* | 613 | /* |
1110 | * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed | 614 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
615 | * of reclaimed pages | ||
1111 | */ | 616 | */ |
1112 | static void shrink_cache(struct zone *zone, struct scan_control *sc) | 617 | static unsigned long shrink_inactive_list(unsigned long max_scan, |
618 | struct zone *zone, struct scan_control *sc) | ||
1113 | { | 619 | { |
1114 | LIST_HEAD(page_list); | 620 | LIST_HEAD(page_list); |
1115 | struct pagevec pvec; | 621 | struct pagevec pvec; |
1116 | int max_scan = sc->nr_to_scan; | 622 | unsigned long nr_scanned = 0; |
623 | unsigned long nr_reclaimed = 0; | ||
1117 | 624 | ||
1118 | pagevec_init(&pvec, 1); | 625 | pagevec_init(&pvec, 1); |
1119 | 626 | ||
1120 | lru_add_drain(); | 627 | lru_add_drain(); |
1121 | spin_lock_irq(&zone->lru_lock); | 628 | spin_lock_irq(&zone->lru_lock); |
1122 | while (max_scan > 0) { | 629 | do { |
1123 | struct page *page; | 630 | struct page *page; |
1124 | int nr_taken; | 631 | unsigned long nr_taken; |
1125 | int nr_scan; | 632 | unsigned long nr_scan; |
1126 | int nr_freed; | 633 | unsigned long nr_freed; |
1127 | 634 | ||
1128 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, | 635 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, |
1129 | &zone->inactive_list, | 636 | &zone->inactive_list, |
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
1132 | zone->pages_scanned += nr_scan; | 639 | zone->pages_scanned += nr_scan; |
1133 | spin_unlock_irq(&zone->lru_lock); | 640 | spin_unlock_irq(&zone->lru_lock); |
1134 | 641 | ||
1135 | if (nr_taken == 0) | 642 | nr_scanned += nr_scan; |
1136 | goto done; | 643 | nr_freed = shrink_page_list(&page_list, sc); |
1137 | 644 | nr_reclaimed += nr_freed; | |
1138 | max_scan -= nr_scan; | ||
1139 | nr_freed = shrink_list(&page_list, sc); | ||
1140 | |||
1141 | local_irq_disable(); | 645 | local_irq_disable(); |
1142 | if (current_is_kswapd()) { | 646 | if (current_is_kswapd()) { |
1143 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 647 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); |
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
1146 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 650 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); |
1147 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 651 | __mod_page_state_zone(zone, pgsteal, nr_freed); |
1148 | 652 | ||
653 | if (nr_taken == 0) | ||
654 | goto done; | ||
655 | |||
1149 | spin_lock(&zone->lru_lock); | 656 | spin_lock(&zone->lru_lock); |
1150 | /* | 657 | /* |
1151 | * Put back any unfreeable pages. | 658 | * Put back any unfreeable pages. |
1152 | */ | 659 | */ |
1153 | while (!list_empty(&page_list)) { | 660 | while (!list_empty(&page_list)) { |
1154 | page = lru_to_page(&page_list); | 661 | page = lru_to_page(&page_list); |
1155 | if (TestSetPageLRU(page)) | 662 | BUG_ON(PageLRU(page)); |
1156 | BUG(); | 663 | SetPageLRU(page); |
1157 | list_del(&page->lru); | 664 | list_del(&page->lru); |
1158 | if (PageActive(page)) | 665 | if (PageActive(page)) |
1159 | add_page_to_active_list(zone, page); | 666 | add_page_to_active_list(zone, page); |
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
1165 | spin_lock_irq(&zone->lru_lock); | 672 | spin_lock_irq(&zone->lru_lock); |
1166 | } | 673 | } |
1167 | } | 674 | } |
1168 | } | 675 | } while (nr_scanned < max_scan); |
1169 | spin_unlock_irq(&zone->lru_lock); | 676 | spin_unlock(&zone->lru_lock); |
1170 | done: | 677 | done: |
678 | local_irq_enable(); | ||
1171 | pagevec_release(&pvec); | 679 | pagevec_release(&pvec); |
680 | return nr_reclaimed; | ||
1172 | } | 681 | } |
1173 | 682 | ||
1174 | /* | 683 | /* |
@@ -1188,13 +697,12 @@ done: | |||
1188 | * The downside is that we have to touch page->_count against each page. | 697 | * The downside is that we have to touch page->_count against each page. |
1189 | * But we had to alter page->flags anyway. | 698 | * But we had to alter page->flags anyway. |
1190 | */ | 699 | */ |
1191 | static void | 700 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
1192 | refill_inactive_zone(struct zone *zone, struct scan_control *sc) | 701 | struct scan_control *sc) |
1193 | { | 702 | { |
1194 | int pgmoved; | 703 | unsigned long pgmoved; |
1195 | int pgdeactivate = 0; | 704 | int pgdeactivate = 0; |
1196 | int pgscanned; | 705 | unsigned long pgscanned; |
1197 | int nr_pages = sc->nr_to_scan; | ||
1198 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 706 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1199 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | 707 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ |
1200 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | 708 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ |
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1202 | struct pagevec pvec; | 710 | struct pagevec pvec; |
1203 | int reclaim_mapped = 0; | 711 | int reclaim_mapped = 0; |
1204 | 712 | ||
1205 | if (unlikely(sc->may_swap)) { | 713 | if (sc->may_swap) { |
1206 | long mapped_ratio; | 714 | long mapped_ratio; |
1207 | long distress; | 715 | long distress; |
1208 | long swap_tendency; | 716 | long swap_tendency; |
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1272 | while (!list_empty(&l_inactive)) { | 780 | while (!list_empty(&l_inactive)) { |
1273 | page = lru_to_page(&l_inactive); | 781 | page = lru_to_page(&l_inactive); |
1274 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 782 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
1275 | if (TestSetPageLRU(page)) | 783 | BUG_ON(PageLRU(page)); |
1276 | BUG(); | 784 | SetPageLRU(page); |
1277 | if (!TestClearPageActive(page)) | 785 | BUG_ON(!PageActive(page)); |
1278 | BUG(); | 786 | ClearPageActive(page); |
787 | |||
1279 | list_move(&page->lru, &zone->inactive_list); | 788 | list_move(&page->lru, &zone->inactive_list); |
1280 | pgmoved++; | 789 | pgmoved++; |
1281 | if (!pagevec_add(&pvec, page)) { | 790 | if (!pagevec_add(&pvec, page)) { |
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1301 | while (!list_empty(&l_active)) { | 810 | while (!list_empty(&l_active)) { |
1302 | page = lru_to_page(&l_active); | 811 | page = lru_to_page(&l_active); |
1303 | prefetchw_prev_lru_page(page, &l_active, flags); | 812 | prefetchw_prev_lru_page(page, &l_active, flags); |
1304 | if (TestSetPageLRU(page)) | 813 | BUG_ON(PageLRU(page)); |
1305 | BUG(); | 814 | SetPageLRU(page); |
1306 | BUG_ON(!PageActive(page)); | 815 | BUG_ON(!PageActive(page)); |
1307 | list_move(&page->lru, &zone->active_list); | 816 | list_move(&page->lru, &zone->active_list); |
1308 | pgmoved++; | 817 | pgmoved++; |
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1327 | /* | 836 | /* |
1328 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 837 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1329 | */ | 838 | */ |
1330 | static void | 839 | static unsigned long shrink_zone(int priority, struct zone *zone, |
1331 | shrink_zone(struct zone *zone, struct scan_control *sc) | 840 | struct scan_control *sc) |
1332 | { | 841 | { |
1333 | unsigned long nr_active; | 842 | unsigned long nr_active; |
1334 | unsigned long nr_inactive; | 843 | unsigned long nr_inactive; |
844 | unsigned long nr_to_scan; | ||
845 | unsigned long nr_reclaimed = 0; | ||
1335 | 846 | ||
1336 | atomic_inc(&zone->reclaim_in_progress); | 847 | atomic_inc(&zone->reclaim_in_progress); |
1337 | 848 | ||
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1339 | * Add one to `nr_to_scan' just to make sure that the kernel will | 850 | * Add one to `nr_to_scan' just to make sure that the kernel will |
1340 | * slowly sift through the active list. | 851 | * slowly sift through the active list. |
1341 | */ | 852 | */ |
1342 | zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; | 853 | zone->nr_scan_active += (zone->nr_active >> priority) + 1; |
1343 | nr_active = zone->nr_scan_active; | 854 | nr_active = zone->nr_scan_active; |
1344 | if (nr_active >= sc->swap_cluster_max) | 855 | if (nr_active >= sc->swap_cluster_max) |
1345 | zone->nr_scan_active = 0; | 856 | zone->nr_scan_active = 0; |
1346 | else | 857 | else |
1347 | nr_active = 0; | 858 | nr_active = 0; |
1348 | 859 | ||
1349 | zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; | 860 | zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; |
1350 | nr_inactive = zone->nr_scan_inactive; | 861 | nr_inactive = zone->nr_scan_inactive; |
1351 | if (nr_inactive >= sc->swap_cluster_max) | 862 | if (nr_inactive >= sc->swap_cluster_max) |
1352 | zone->nr_scan_inactive = 0; | 863 | zone->nr_scan_inactive = 0; |
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1355 | 866 | ||
1356 | while (nr_active || nr_inactive) { | 867 | while (nr_active || nr_inactive) { |
1357 | if (nr_active) { | 868 | if (nr_active) { |
1358 | sc->nr_to_scan = min(nr_active, | 869 | nr_to_scan = min(nr_active, |
1359 | (unsigned long)sc->swap_cluster_max); | 870 | (unsigned long)sc->swap_cluster_max); |
1360 | nr_active -= sc->nr_to_scan; | 871 | nr_active -= nr_to_scan; |
1361 | refill_inactive_zone(zone, sc); | 872 | shrink_active_list(nr_to_scan, zone, sc); |
1362 | } | 873 | } |
1363 | 874 | ||
1364 | if (nr_inactive) { | 875 | if (nr_inactive) { |
1365 | sc->nr_to_scan = min(nr_inactive, | 876 | nr_to_scan = min(nr_inactive, |
1366 | (unsigned long)sc->swap_cluster_max); | 877 | (unsigned long)sc->swap_cluster_max); |
1367 | nr_inactive -= sc->nr_to_scan; | 878 | nr_inactive -= nr_to_scan; |
1368 | shrink_cache(zone, sc); | 879 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, |
880 | sc); | ||
1369 | } | 881 | } |
1370 | } | 882 | } |
1371 | 883 | ||
1372 | throttle_vm_writeout(); | 884 | throttle_vm_writeout(); |
1373 | 885 | ||
1374 | atomic_dec(&zone->reclaim_in_progress); | 886 | atomic_dec(&zone->reclaim_in_progress); |
887 | return nr_reclaimed; | ||
1375 | } | 888 | } |
1376 | 889 | ||
1377 | /* | 890 | /* |
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1390 | * If a zone is deemed to be full of pinned pages then just give it a light | 903 | * If a zone is deemed to be full of pinned pages then just give it a light |
1391 | * scan then give up on it. | 904 | * scan then give up on it. |
1392 | */ | 905 | */ |
1393 | static void | 906 | static unsigned long shrink_zones(int priority, struct zone **zones, |
1394 | shrink_caches(struct zone **zones, struct scan_control *sc) | 907 | struct scan_control *sc) |
1395 | { | 908 | { |
909 | unsigned long nr_reclaimed = 0; | ||
1396 | int i; | 910 | int i; |
1397 | 911 | ||
1398 | for (i = 0; zones[i] != NULL; i++) { | 912 | for (i = 0; zones[i] != NULL; i++) { |
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
1404 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 918 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
1405 | continue; | 919 | continue; |
1406 | 920 | ||
1407 | zone->temp_priority = sc->priority; | 921 | zone->temp_priority = priority; |
1408 | if (zone->prev_priority > sc->priority) | 922 | if (zone->prev_priority > priority) |
1409 | zone->prev_priority = sc->priority; | 923 | zone->prev_priority = priority; |
1410 | 924 | ||
1411 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) | 925 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1412 | continue; /* Let kswapd poll it */ | 926 | continue; /* Let kswapd poll it */ |
1413 | 927 | ||
1414 | shrink_zone(zone, sc); | 928 | nr_reclaimed += shrink_zone(priority, zone, sc); |
1415 | } | 929 | } |
930 | return nr_reclaimed; | ||
1416 | } | 931 | } |
1417 | 932 | ||
1418 | /* | 933 | /* |
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
1428 | * holds filesystem locks which prevent writeout this might not work, and the | 943 | * holds filesystem locks which prevent writeout this might not work, and the |
1429 | * allocation attempt will fail. | 944 | * allocation attempt will fail. |
1430 | */ | 945 | */ |
1431 | int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | 946 | unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) |
1432 | { | 947 | { |
1433 | int priority; | 948 | int priority; |
1434 | int ret = 0; | 949 | int ret = 0; |
1435 | int total_scanned = 0, total_reclaimed = 0; | 950 | unsigned long total_scanned = 0; |
951 | unsigned long nr_reclaimed = 0; | ||
1436 | struct reclaim_state *reclaim_state = current->reclaim_state; | 952 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1437 | struct scan_control sc; | ||
1438 | unsigned long lru_pages = 0; | 953 | unsigned long lru_pages = 0; |
1439 | int i; | 954 | int i; |
1440 | 955 | struct scan_control sc = { | |
1441 | sc.gfp_mask = gfp_mask; | 956 | .gfp_mask = gfp_mask, |
1442 | sc.may_writepage = !laptop_mode; | 957 | .may_writepage = !laptop_mode, |
1443 | sc.may_swap = 1; | 958 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
959 | .may_swap = 1, | ||
960 | }; | ||
1444 | 961 | ||
1445 | inc_page_state(allocstall); | 962 | inc_page_state(allocstall); |
1446 | 963 | ||
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1457 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 974 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1458 | sc.nr_mapped = read_page_state(nr_mapped); | 975 | sc.nr_mapped = read_page_state(nr_mapped); |
1459 | sc.nr_scanned = 0; | 976 | sc.nr_scanned = 0; |
1460 | sc.nr_reclaimed = 0; | ||
1461 | sc.priority = priority; | ||
1462 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1463 | if (!priority) | 977 | if (!priority) |
1464 | disable_swap_token(); | 978 | disable_swap_token(); |
1465 | shrink_caches(zones, &sc); | 979 | nr_reclaimed += shrink_zones(priority, zones, &sc); |
1466 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | 980 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); |
1467 | if (reclaim_state) { | 981 | if (reclaim_state) { |
1468 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 982 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1469 | reclaim_state->reclaimed_slab = 0; | 983 | reclaim_state->reclaimed_slab = 0; |
1470 | } | 984 | } |
1471 | total_scanned += sc.nr_scanned; | 985 | total_scanned += sc.nr_scanned; |
1472 | total_reclaimed += sc.nr_reclaimed; | 986 | if (nr_reclaimed >= sc.swap_cluster_max) { |
1473 | if (total_reclaimed >= sc.swap_cluster_max) { | ||
1474 | ret = 1; | 987 | ret = 1; |
1475 | goto out; | 988 | goto out; |
1476 | } | 989 | } |
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1482 | * that's undesirable in laptop mode, where we *want* lumpy | 995 | * that's undesirable in laptop mode, where we *want* lumpy |
1483 | * writeout. So in laptop mode, write out the whole world. | 996 | * writeout. So in laptop mode, write out the whole world. |
1484 | */ | 997 | */ |
1485 | if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { | 998 | if (total_scanned > sc.swap_cluster_max + |
999 | sc.swap_cluster_max / 2) { | ||
1486 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1000 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
1487 | sc.may_writepage = 1; | 1001 | sc.may_writepage = 1; |
1488 | } | 1002 | } |
@@ -1528,22 +1042,26 @@ out: | |||
1528 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1042 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
1529 | * across the zones. | 1043 | * across the zones. |
1530 | */ | 1044 | */ |
1531 | static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) | 1045 | static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, |
1046 | int order) | ||
1532 | { | 1047 | { |
1533 | int to_free = nr_pages; | 1048 | unsigned long to_free = nr_pages; |
1534 | int all_zones_ok; | 1049 | int all_zones_ok; |
1535 | int priority; | 1050 | int priority; |
1536 | int i; | 1051 | int i; |
1537 | int total_scanned, total_reclaimed; | 1052 | unsigned long total_scanned; |
1053 | unsigned long nr_reclaimed; | ||
1538 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1054 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1539 | struct scan_control sc; | 1055 | struct scan_control sc = { |
1056 | .gfp_mask = GFP_KERNEL, | ||
1057 | .may_swap = 1, | ||
1058 | .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, | ||
1059 | }; | ||
1540 | 1060 | ||
1541 | loop_again: | 1061 | loop_again: |
1542 | total_scanned = 0; | 1062 | total_scanned = 0; |
1543 | total_reclaimed = 0; | 1063 | nr_reclaimed = 0; |
1544 | sc.gfp_mask = GFP_KERNEL; | 1064 | sc.may_writepage = !laptop_mode, |
1545 | sc.may_writepage = !laptop_mode; | ||
1546 | sc.may_swap = 1; | ||
1547 | sc.nr_mapped = read_page_state(nr_mapped); | 1065 | sc.nr_mapped = read_page_state(nr_mapped); |
1548 | 1066 | ||
1549 | inc_page_state(pageoutrun); | 1067 | inc_page_state(pageoutrun); |
@@ -1624,15 +1142,11 @@ scan: | |||
1624 | if (zone->prev_priority > priority) | 1142 | if (zone->prev_priority > priority) |
1625 | zone->prev_priority = priority; | 1143 | zone->prev_priority = priority; |
1626 | sc.nr_scanned = 0; | 1144 | sc.nr_scanned = 0; |
1627 | sc.nr_reclaimed = 0; | 1145 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1628 | sc.priority = priority; | ||
1629 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; | ||
1630 | shrink_zone(zone, &sc); | ||
1631 | reclaim_state->reclaimed_slab = 0; | 1146 | reclaim_state->reclaimed_slab = 0; |
1632 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1147 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1633 | lru_pages); | 1148 | lru_pages); |
1634 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 1149 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1635 | total_reclaimed += sc.nr_reclaimed; | ||
1636 | total_scanned += sc.nr_scanned; | 1150 | total_scanned += sc.nr_scanned; |
1637 | if (zone->all_unreclaimable) | 1151 | if (zone->all_unreclaimable) |
1638 | continue; | 1152 | continue; |
@@ -1645,10 +1159,10 @@ scan: | |||
1645 | * even in laptop mode | 1159 | * even in laptop mode |
1646 | */ | 1160 | */ |
1647 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1161 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
1648 | total_scanned > total_reclaimed+total_reclaimed/2) | 1162 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
1649 | sc.may_writepage = 1; | 1163 | sc.may_writepage = 1; |
1650 | } | 1164 | } |
1651 | if (nr_pages && to_free > total_reclaimed) | 1165 | if (nr_pages && to_free > nr_reclaimed) |
1652 | continue; /* swsusp: need to do more work */ | 1166 | continue; /* swsusp: need to do more work */ |
1653 | if (all_zones_ok) | 1167 | if (all_zones_ok) |
1654 | break; /* kswapd: all done */ | 1168 | break; /* kswapd: all done */ |
@@ -1665,7 +1179,7 @@ scan: | |||
1665 | * matches the direct reclaim path behaviour in terms of impact | 1179 | * matches the direct reclaim path behaviour in terms of impact |
1666 | * on zone->*_priority. | 1180 | * on zone->*_priority. |
1667 | */ | 1181 | */ |
1668 | if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) | 1182 | if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) |
1669 | break; | 1183 | break; |
1670 | } | 1184 | } |
1671 | out: | 1185 | out: |
@@ -1679,7 +1193,7 @@ out: | |||
1679 | goto loop_again; | 1193 | goto loop_again; |
1680 | } | 1194 | } |
1681 | 1195 | ||
1682 | return total_reclaimed; | 1196 | return nr_reclaimed; |
1683 | } | 1197 | } |
1684 | 1198 | ||
1685 | /* | 1199 | /* |
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1779 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | 1293 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed |
1780 | * pages. | 1294 | * pages. |
1781 | */ | 1295 | */ |
1782 | int shrink_all_memory(int nr_pages) | 1296 | unsigned long shrink_all_memory(unsigned long nr_pages) |
1783 | { | 1297 | { |
1784 | pg_data_t *pgdat; | 1298 | pg_data_t *pgdat; |
1785 | int nr_to_free = nr_pages; | 1299 | unsigned long nr_to_free = nr_pages; |
1786 | int ret = 0; | 1300 | unsigned long ret = 0; |
1301 | unsigned retry = 2; | ||
1787 | struct reclaim_state reclaim_state = { | 1302 | struct reclaim_state reclaim_state = { |
1788 | .reclaimed_slab = 0, | 1303 | .reclaimed_slab = 0, |
1789 | }; | 1304 | }; |
1790 | 1305 | ||
1791 | current->reclaim_state = &reclaim_state; | 1306 | current->reclaim_state = &reclaim_state; |
1307 | repeat: | ||
1792 | for_each_pgdat(pgdat) { | 1308 | for_each_pgdat(pgdat) { |
1793 | int freed; | 1309 | unsigned long freed; |
1310 | |||
1794 | freed = balance_pgdat(pgdat, nr_to_free, 0); | 1311 | freed = balance_pgdat(pgdat, nr_to_free, 0); |
1795 | ret += freed; | 1312 | ret += freed; |
1796 | nr_to_free -= freed; | 1313 | nr_to_free -= freed; |
1797 | if (nr_to_free <= 0) | 1314 | if ((long)nr_to_free <= 0) |
1798 | break; | 1315 | break; |
1799 | } | 1316 | } |
1317 | if (retry-- && ret < nr_pages) { | ||
1318 | blk_congestion_wait(WRITE, HZ/5); | ||
1319 | goto repeat; | ||
1320 | } | ||
1800 | current->reclaim_state = NULL; | 1321 | current->reclaim_state = NULL; |
1801 | return ret; | 1322 | return ret; |
1802 | } | 1323 | } |
@@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages) | |||
1808 | away, we get changed to run anywhere: as the first one comes back, | 1329 | away, we get changed to run anywhere: as the first one comes back, |
1809 | restore their cpu bindings. */ | 1330 | restore their cpu bindings. */ |
1810 | static int __devinit cpu_callback(struct notifier_block *nfb, | 1331 | static int __devinit cpu_callback(struct notifier_block *nfb, |
1811 | unsigned long action, | 1332 | unsigned long action, void *hcpu) |
1812 | void *hcpu) | ||
1813 | { | 1333 | { |
1814 | pg_data_t *pgdat; | 1334 | pg_data_t *pgdat; |
1815 | cpumask_t mask; | 1335 | cpumask_t mask; |
@@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1829 | static int __init kswapd_init(void) | 1349 | static int __init kswapd_init(void) |
1830 | { | 1350 | { |
1831 | pg_data_t *pgdat; | 1351 | pg_data_t *pgdat; |
1352 | |||
1832 | swap_setup(); | 1353 | swap_setup(); |
1833 | for_each_pgdat(pgdat) | 1354 | for_each_pgdat(pgdat) { |
1834 | pgdat->kswapd | 1355 | pid_t pid; |
1835 | = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); | 1356 | |
1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | ||
1358 | BUG_ON(pid < 0); | ||
1359 | pgdat->kswapd = find_task_by_pid(pid); | ||
1360 | } | ||
1836 | total_memory = nr_free_pagecache_pages(); | 1361 | total_memory = nr_free_pagecache_pages(); |
1837 | hotcpu_notifier(cpu_callback, 0); | 1362 | hotcpu_notifier(cpu_callback, 0); |
1838 | return 0; | 1363 | return 0; |
@@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; | |||
1874 | /* | 1399 | /* |
1875 | * Try to free up some pages from this zone through reclaim. | 1400 | * Try to free up some pages from this zone through reclaim. |
1876 | */ | 1401 | */ |
1877 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1402 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1878 | { | 1403 | { |
1879 | int nr_pages; | 1404 | /* Minimum pages needed in order to stay on node */ |
1405 | const unsigned long nr_pages = 1 << order; | ||
1880 | struct task_struct *p = current; | 1406 | struct task_struct *p = current; |
1881 | struct reclaim_state reclaim_state; | 1407 | struct reclaim_state reclaim_state; |
1882 | struct scan_control sc; | 1408 | int priority; |
1883 | cpumask_t mask; | 1409 | unsigned long nr_reclaimed = 0; |
1884 | int node_id; | 1410 | struct scan_control sc = { |
1885 | 1411 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | |
1886 | if (time_before(jiffies, | 1412 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
1887 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1413 | .nr_mapped = read_page_state(nr_mapped), |
1888 | return 0; | 1414 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
1889 | 1415 | SWAP_CLUSTER_MAX), | |
1890 | if (!(gfp_mask & __GFP_WAIT) || | 1416 | .gfp_mask = gfp_mask, |
1891 | zone->all_unreclaimable || | 1417 | }; |
1892 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1893 | (p->flags & PF_MEMALLOC)) | ||
1894 | return 0; | ||
1895 | |||
1896 | node_id = zone->zone_pgdat->node_id; | ||
1897 | mask = node_to_cpumask(node_id); | ||
1898 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
1899 | return 0; | ||
1900 | |||
1901 | sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); | ||
1902 | sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); | ||
1903 | sc.nr_scanned = 0; | ||
1904 | sc.nr_reclaimed = 0; | ||
1905 | sc.priority = ZONE_RECLAIM_PRIORITY + 1; | ||
1906 | sc.nr_mapped = read_page_state(nr_mapped); | ||
1907 | sc.gfp_mask = gfp_mask; | ||
1908 | 1418 | ||
1909 | disable_swap_token(); | 1419 | disable_swap_token(); |
1910 | |||
1911 | nr_pages = 1 << order; | ||
1912 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
1913 | sc.swap_cluster_max = nr_pages; | ||
1914 | else | ||
1915 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1916 | |||
1917 | cond_resched(); | 1420 | cond_resched(); |
1918 | /* | 1421 | /* |
1919 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | 1422 | * We need to be able to allocate from the reserves for RECLAIM_SWAP |
@@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1928 | * Free memory by calling shrink zone with increasing priorities | 1431 | * Free memory by calling shrink zone with increasing priorities |
1929 | * until we have enough memory freed. | 1432 | * until we have enough memory freed. |
1930 | */ | 1433 | */ |
1434 | priority = ZONE_RECLAIM_PRIORITY; | ||
1931 | do { | 1435 | do { |
1932 | sc.priority--; | 1436 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1933 | shrink_zone(zone, &sc); | 1437 | priority--; |
1438 | } while (priority >= 0 && nr_reclaimed < nr_pages); | ||
1934 | 1439 | ||
1935 | } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); | 1440 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { |
1936 | |||
1937 | if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { | ||
1938 | /* | 1441 | /* |
1939 | * shrink_slab does not currently allow us to determine | 1442 | * shrink_slab() does not currently allow us to determine how |
1940 | * how many pages were freed in the zone. So we just | 1443 | * many pages were freed in this zone. So we just shake the slab |
1941 | * shake the slab and then go offnode for a single allocation. | 1444 | * a bit and then go off node for this particular allocation |
1445 | * despite possibly having freed enough memory to allocate in | ||
1446 | * this zone. If we freed local memory then the next | ||
1447 | * allocations will be local again. | ||
1942 | * | 1448 | * |
1943 | * shrink_slab will free memory on all zones and may take | 1449 | * shrink_slab will free memory on all zones and may take |
1944 | * a long time. | 1450 | * a long time. |
@@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1949 | p->reclaim_state = NULL; | 1455 | p->reclaim_state = NULL; |
1950 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 1456 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
1951 | 1457 | ||
1952 | if (sc.nr_reclaimed == 0) | 1458 | if (nr_reclaimed == 0) { |
1459 | /* | ||
1460 | * We were unable to reclaim enough pages to stay on node. We | ||
1461 | * now allow off node accesses for a certain time period before | ||
1462 | * trying again to reclaim pages from the local zone. | ||
1463 | */ | ||
1953 | zone->last_unsuccessful_zone_reclaim = jiffies; | 1464 | zone->last_unsuccessful_zone_reclaim = jiffies; |
1465 | } | ||
1954 | 1466 | ||
1955 | return sc.nr_reclaimed >= nr_pages; | 1467 | return nr_reclaimed >= nr_pages; |
1956 | } | 1468 | } |
1957 | #endif | ||
1958 | 1469 | ||
1470 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | ||
1471 | { | ||
1472 | cpumask_t mask; | ||
1473 | int node_id; | ||
1474 | |||
1475 | /* | ||
1476 | * Do not reclaim if there was a recent unsuccessful attempt at zone | ||
1477 | * reclaim. In that case we let allocations go off node for the | ||
1478 | * zone_reclaim_interval. Otherwise we would scan for each off-node | ||
1479 | * page allocation. | ||
1480 | */ | ||
1481 | if (time_before(jiffies, | ||
1482 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | ||
1483 | return 0; | ||
1484 | |||
1485 | /* | ||
1486 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | ||
1487 | * not have reclaimable pages and if we should not delay the allocation | ||
1488 | * then do not scan. | ||
1489 | */ | ||
1490 | if (!(gfp_mask & __GFP_WAIT) || | ||
1491 | zone->all_unreclaimable || | ||
1492 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1493 | (current->flags & PF_MEMALLOC)) | ||
1494 | return 0; | ||
1495 | |||
1496 | /* | ||
1497 | * Only run zone reclaim on the local zone or on zones that do not | ||
1498 | * have associated processors. This will favor the local processor | ||
1499 | * over remote processors and spread off node memory allocations | ||
1500 | * as wide as possible. | ||
1501 | */ | ||
1502 | node_id = zone->zone_pgdat->node_id; | ||
1503 | mask = node_to_cpumask(node_id); | ||
1504 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
1505 | return 0; | ||
1506 | return __zone_reclaim(zone, gfp_mask, order); | ||
1507 | } | ||
1508 | #endif | ||