| Field | Value | Date |
|---|---|---|
| author | Steven Whitehouse <swhiteho@redhat.com> | 2006-07-03 10:25:08 -0400 |
| committer | Steven Whitehouse <swhiteho@redhat.com> | 2006-07-03 10:25:08 -0400 |
| commit | 0a1340c185734a57fbf4775927966ad4a1347b02 | |
| tree | d9ed8f0dd809a7c542a3356601125ea5b5aaa804 /mm | |
| parent | af18ddb8864b096e3ed4732e2d4b21c956dcfe3a | |
| parent | 29454dde27d8e340bb1987bad9aa504af7081eba | |
Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
include/linux/kernel.h
Diffstat (limited to 'mm')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | mm/Kconfig | 13 |
| -rw-r--r-- | mm/Makefile | 2 |
| -rw-r--r-- | mm/filemap.c | 257 |
| -rw-r--r-- | mm/filemap.h | 36 |
| -rw-r--r-- | mm/filemap_xip.c | 2 |
| -rw-r--r-- | mm/fremap.c | 9 |
| -rw-r--r-- | mm/highmem.c | 6 |
| -rw-r--r-- | mm/hugetlb.c | 282 |
| -rw-r--r-- | mm/memory.c | 133 |
| -rw-r--r-- | mm/memory_hotplug.c | 154 |
| -rw-r--r-- | mm/mempolicy.c | 54 |
| -rw-r--r-- | mm/migrate.c | 1076 |
| -rw-r--r-- | mm/mmap.c | 14 |
| -rw-r--r-- | mm/mmzone.c | 1 |
| -rw-r--r-- | mm/mprotect.c | 37 |
| -rw-r--r-- | mm/msync.c | 3 |
| -rw-r--r-- | mm/nommu.c | 2 |
| -rw-r--r-- | mm/oom_kill.c | 9 |
| -rw-r--r-- | mm/page-writeback.c | 100 |
| -rw-r--r-- | mm/page_alloc.c | 670 |
| -rw-r--r-- | mm/page_io.c | 4 |
| -rw-r--r-- | mm/pdflush.c | 18 |
| -rw-r--r-- | mm/readahead.c | 20 |
| -rw-r--r-- | mm/rmap.c | 121 |
| -rw-r--r-- | mm/shmem.c | 34 |
| -rw-r--r-- | mm/slab.c | 392 |
| -rw-r--r-- | mm/slob.c | 1 |
| -rw-r--r-- | mm/sparse.c | 25 |
| -rw-r--r-- | mm/swap.c | 49 |
| -rw-r--r-- | mm/swap_state.c | 6 |
| -rw-r--r-- | mm/swapfile.c | 44 |
| -rw-r--r-- | mm/tiny-shmem.c | 4 |
| -rw-r--r-- | mm/truncate.c | 22 |
| -rw-r--r-- | mm/vmalloc.c | 122 |
| -rw-r--r-- | mm/vmscan.c | 341 |
| -rw-r--r-- | mm/vmstat.c | 614 |
36 files changed, 2879 insertions, 1798 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53a..8f5b45615f7b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,7 +115,8 @@ config SPARSEMEM_EXTREME | |||
115 | # eventually, we can have this option just 'select SPARSEMEM' | 115 | # eventually, we can have this option just 'select SPARSEMEM' |
116 | config MEMORY_HOTPLUG | 116 | config MEMORY_HOTPLUG |
117 | bool "Allow for memory hot-add" | 117 | bool "Allow for memory hot-add" |
118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND | 118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG |
119 | depends on (IA64 || X86 || PPC64) | ||
119 | 120 | ||
120 | comment "Memory hotplug is currently incompatible with Software Suspend" | 121 | comment "Memory hotplug is currently incompatible with Software Suspend" |
121 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | 122 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND |
@@ -138,10 +139,16 @@ config SPLIT_PTLOCK_CPUS | |||
138 | # | 139 | # |
139 | config MIGRATION | 140 | config MIGRATION |
140 | bool "Page migration" | 141 | bool "Page migration" |
141 | def_bool y if NUMA | 142 | def_bool y |
142 | depends on SWAP && NUMA | 143 | depends on NUMA |
143 | help | 144 | help |
144 | Allows the migration of the physical location of pages of processes | 145 | Allows the migration of the physical location of pages of processes |
145 | while the virtual addresses are not changed. This is useful for | 146 | while the virtual addresses are not changed. This is useful for |
146 | example on NUMA systems to put pages nearer to the processors accessing | 147 | example on NUMA systems to put pages nearer to the processors accessing |
147 | the page. | 148 | the page. |
149 | |||
150 | config RESOURCES_64BIT | ||
151 | bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) | ||
152 | default 64BIT | ||
153 | help | ||
154 | This option allows memory and IO resources to be 64 bit. | ||
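The MIGRATION help text above describes moving the physical backing of pages while their virtual addresses stay fixed. As a rough userspace illustration of the feature on a NUMA machine — assuming a current libnuma that provides numa_migrate_pages(), which wraps the migrate_pages() system call that this config option backs — a process's pages can be shifted between nodes like this:

```c
/*
 * Illustrative sketch only: move every page of a process from NUMA
 * node 0 to node 1.  Build with -lnuma; needs NUMA and page migration
 * support in the running kernel.
 */
#include <numa.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	struct bitmask *from, *to;
	int pid = (argc > 1) ? atoi(argv[1]) : 0;   /* 0 means "the calling process" */

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support on this system\n");
		return 1;
	}

	from = numa_parse_nodestring("0");
	to   = numa_parse_nodestring("1");

	/* Virtual addresses in the target process are left unchanged. */
	if (numa_migrate_pages(pid, from, to) < 0)
		perror("numa_migrate_pages");

	numa_free_nodemask(from);
	numa_free_nodemask(to);
	return 0;
}
```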
diff --git a/mm/Makefile b/mm/Makefile
index 0b8f73f2ed16..9dd824c11eeb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o $(mmu-y) | 13 | prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) |
14 | 14 | ||
15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
diff --git a/mm/filemap.c b/mm/filemap.c
index a02a0b2c986b..b9c91ab7f0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,11 +9,11 @@ | |||
9 | * most "normal" filesystems (but you don't /have/ to use this: | 9 | * most "normal" filesystems (but you don't /have/ to use this: |
10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) |
11 | */ | 11 | */ |
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | 12 | #include <linux/module.h> |
14 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
15 | #include <linux/compiler.h> | 14 | #include <linux/compiler.h> |
16 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/uaccess.h> | ||
17 | #include <linux/aio.h> | 17 | #include <linux/aio.h> |
18 | #include <linux/capability.h> | 18 | #include <linux/capability.h> |
19 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
@@ -38,7 +38,6 @@ | |||
38 | */ | 38 | */ |
39 | #include <linux/buffer_head.h> /* for generic_osync_inode */ | 39 | #include <linux/buffer_head.h> /* for generic_osync_inode */ |
40 | 40 | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/mman.h> | 41 | #include <asm/mman.h> |
43 | 42 | ||
44 | static ssize_t | 43 | static ssize_t |
@@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page) | |||
120 | radix_tree_delete(&mapping->page_tree, page->index); | 119 | radix_tree_delete(&mapping->page_tree, page->index); |
121 | page->mapping = NULL; | 120 | page->mapping = NULL; |
122 | mapping->nrpages--; | 121 | mapping->nrpages--; |
123 | pagecache_acct(-1); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
124 | } | 123 | } |
125 | 124 | ||
126 | void remove_from_page_cache(struct page *page) | 125 | void remove_from_page_cache(struct page *page) |
@@ -171,15 +170,17 @@ static int sync_page(void *word) | |||
171 | } | 170 | } |
172 | 171 | ||
173 | /** | 172 | /** |
174 | * filemap_fdatawrite_range - start writeback against all of a mapping's | 173 | * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range |
175 | * dirty pages that lie within the byte offsets <start, end> | ||
176 | * @mapping: address space structure to write | 174 | * @mapping: address space structure to write |
177 | * @start: offset in bytes where the range starts | 175 | * @start: offset in bytes where the range starts |
178 | * @end: offset in bytes where the range ends (inclusive) | 176 | * @end: offset in bytes where the range ends (inclusive) |
179 | * @sync_mode: enable synchronous operation | 177 | * @sync_mode: enable synchronous operation |
180 | * | 178 | * |
179 | * Start writeback against all of a mapping's dirty pages that lie | ||
180 | * within the byte offsets <start, end> inclusive. | ||
181 | * | ||
181 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 182 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
182 | * opposed to a regular memory * cleansing writeback. The difference between | 183 | * opposed to a regular memory cleansing writeback. The difference between |
183 | * these two operations is that if a dirty page/buffer is encountered, it must | 184 | * these two operations is that if a dirty page/buffer is encountered, it must |
184 | * be waited upon, and not just skipped over. | 185 | * be waited upon, and not just skipped over. |
185 | */ | 186 | */ |
@@ -190,8 +191,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
190 | struct writeback_control wbc = { | 191 | struct writeback_control wbc = { |
191 | .sync_mode = sync_mode, | 192 | .sync_mode = sync_mode, |
192 | .nr_to_write = mapping->nrpages * 2, | 193 | .nr_to_write = mapping->nrpages * 2, |
193 | .start = start, | 194 | .range_start = start, |
194 | .end = end, | 195 | .range_end = end, |
195 | }; | 196 | }; |
196 | 197 | ||
197 | if (!mapping_cap_writeback_dirty(mapping)) | 198 | if (!mapping_cap_writeback_dirty(mapping)) |
@@ -204,7 +205,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
204 | static inline int __filemap_fdatawrite(struct address_space *mapping, | 205 | static inline int __filemap_fdatawrite(struct address_space *mapping, |
205 | int sync_mode) | 206 | int sync_mode) |
206 | { | 207 | { |
207 | return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); | 208 | return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); |
208 | } | 209 | } |
209 | 210 | ||
210 | int filemap_fdatawrite(struct address_space *mapping) | 211 | int filemap_fdatawrite(struct address_space *mapping) |
@@ -219,7 +220,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
219 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 220 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
220 | } | 221 | } |
221 | 222 | ||
222 | /* | 223 | /** |
224 | * filemap_flush - mostly a non-blocking flush | ||
225 | * @mapping: target address_space | ||
226 | * | ||
223 | * This is a mostly non-blocking flush. Not suitable for data-integrity | 227 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
224 | * purposes - I/O may not be started against all dirty pages. | 228 | * purposes - I/O may not be started against all dirty pages. |
225 | */ | 229 | */ |
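The comment block above distinguishes a best-effort flush from a data-integrity writeback, and the two exported entry points touched by this patch map directly onto that split. A minimal sketch of a caller choosing between them (the wrapper function is illustrative, not part of the patch):

```c
#include <linux/fs.h>

/* Illustrative only: pick a writeback flavour for an address_space. */
static int example_writeback(struct address_space *mapping, int for_integrity)
{
	if (!for_integrity)
		/* WB_SYNC_NONE: start I/O where convenient, pages may be skipped */
		return filemap_flush(mapping);

	/* WB_SYNC_ALL: write every dirty page and wait for the I/O to finish */
	return filemap_write_and_wait(mapping);
}
```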
@@ -229,7 +233,12 @@ int filemap_flush(struct address_space *mapping) | |||
229 | } | 233 | } |
230 | EXPORT_SYMBOL(filemap_flush); | 234 | EXPORT_SYMBOL(filemap_flush); |
231 | 235 | ||
232 | /* | 236 | /** |
237 | * wait_on_page_writeback_range - wait for writeback to complete | ||
238 | * @mapping: target address_space | ||
239 | * @start: beginning page index | ||
240 | * @end: ending page index | ||
241 | * | ||
233 | * Wait for writeback to complete against pages indexed by start->end | 242 | * Wait for writeback to complete against pages indexed by start->end |
234 | * inclusive | 243 | * inclusive |
235 | */ | 244 | */ |
@@ -276,7 +285,13 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
276 | return ret; | 285 | return ret; |
277 | } | 286 | } |
278 | 287 | ||
279 | /* | 288 | /** |
289 | * sync_page_range - write and wait on all pages in the passed range | ||
290 | * @inode: target inode | ||
291 | * @mapping: target address_space | ||
292 | * @pos: beginning offset in pages to write | ||
293 | * @count: number of bytes to write | ||
294 | * | ||
280 | * Write and wait upon all the pages in the passed range. This is a "data | 295 | * Write and wait upon all the pages in the passed range. This is a "data |
281 | * integrity" operation. It waits upon in-flight writeout before starting and | 296 | * integrity" operation. It waits upon in-flight writeout before starting and |
282 | * waiting upon new writeout. If there was an IO error, return it. | 297 | * waiting upon new writeout. If there was an IO error, return it. |
@@ -305,7 +320,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, | |||
305 | } | 320 | } |
306 | EXPORT_SYMBOL(sync_page_range); | 321 | EXPORT_SYMBOL(sync_page_range); |
307 | 322 | ||
308 | /* | 323 | /** |
324 | * sync_page_range_nolock | ||
325 | * @inode: target inode | ||
326 | * @mapping: target address_space | ||
327 | * @pos: beginning offset in pages to write | ||
328 | * @count: number of bytes to write | ||
329 | * | ||
309 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea | 330 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea |
310 | * as it forces O_SYNC writers to different parts of the same file | 331 | * as it forces O_SYNC writers to different parts of the same file |
311 | * to be serialised right until io completion. | 332 | * to be serialised right until io completion. |
@@ -329,10 +350,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, | |||
329 | EXPORT_SYMBOL(sync_page_range_nolock); | 350 | EXPORT_SYMBOL(sync_page_range_nolock); |
330 | 351 | ||
331 | /** | 352 | /** |
332 | * filemap_fdatawait - walk the list of under-writeback pages of the given | 353 | * filemap_fdatawait - wait for all under-writeback pages to complete |
333 | * address space and wait for all of them. | ||
334 | * | ||
335 | * @mapping: address space structure to wait for | 354 | * @mapping: address space structure to wait for |
355 | * | ||
356 | * Walk the list of under-writeback pages of the given address space | ||
357 | * and wait for all of them. | ||
336 | */ | 358 | */ |
337 | int filemap_fdatawait(struct address_space *mapping) | 359 | int filemap_fdatawait(struct address_space *mapping) |
338 | { | 360 | { |
@@ -368,7 +390,12 @@ int filemap_write_and_wait(struct address_space *mapping) | |||
368 | } | 390 | } |
369 | EXPORT_SYMBOL(filemap_write_and_wait); | 391 | EXPORT_SYMBOL(filemap_write_and_wait); |
370 | 392 | ||
371 | /* | 393 | /** |
394 | * filemap_write_and_wait_range - write out & wait on a file range | ||
395 | * @mapping: the address_space for the pages | ||
396 | * @lstart: offset in bytes where the range starts | ||
397 | * @lend: offset in bytes where the range ends (inclusive) | ||
398 | * | ||
372 | * Write out and wait upon file offsets lstart->lend, inclusive. | 399 | * Write out and wait upon file offsets lstart->lend, inclusive. |
373 | * | 400 | * |
374 | * Note that `lend' is inclusive (describes the last byte to be written) so | 401 | * Note that `lend' is inclusive (describes the last byte to be written) so |
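As the kernel-doc added above spells out, @lend names the last byte of the range rather than an exclusive end, so a caller flushing `count` bytes starting at `pos` passes `pos + count - 1`. A minimal sketch of that calling pattern (the helper name is an illustration, assuming filemap_write_and_wait_range() is visible to the caller):

```c
#include <linux/fs.h>

/* Illustrative helper: write out and wait on one byte range of a file. */
static int example_sync_byte_range(struct address_space *mapping,
				   loff_t pos, size_t count)
{
	if (count == 0)
		return 0;
	/* lend is inclusive: the last byte to be written, hence the "- 1" */
	return filemap_write_and_wait_range(mapping, pos,
					    pos + (loff_t)count - 1);
}
```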
@@ -394,8 +421,14 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
394 | return err; | 421 | return err; |
395 | } | 422 | } |
396 | 423 | ||
397 | /* | 424 | /** |
398 | * This function is used to add newly allocated pagecache pages: | 425 | * add_to_page_cache - add newly allocated pagecache pages |
426 | * @page: page to add | ||
427 | * @mapping: the page's address_space | ||
428 | * @offset: page index | ||
429 | * @gfp_mask: page allocation mode | ||
430 | * | ||
431 | * This function is used to add newly allocated pagecache pages; | ||
399 | * the page is new, so we can just run SetPageLocked() against it. | 432 | * the page is new, so we can just run SetPageLocked() against it. |
400 | * The other page state flags were set by rmqueue(). | 433 | * The other page state flags were set by rmqueue(). |
401 | * | 434 | * |
@@ -415,14 +448,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, | |||
415 | page->mapping = mapping; | 448 | page->mapping = mapping; |
416 | page->index = offset; | 449 | page->index = offset; |
417 | mapping->nrpages++; | 450 | mapping->nrpages++; |
418 | pagecache_acct(1); | 451 | __inc_zone_page_state(page, NR_FILE_PAGES); |
419 | } | 452 | } |
420 | write_unlock_irq(&mapping->tree_lock); | 453 | write_unlock_irq(&mapping->tree_lock); |
421 | radix_tree_preload_end(); | 454 | radix_tree_preload_end(); |
422 | } | 455 | } |
423 | return error; | 456 | return error; |
424 | } | 457 | } |
425 | |||
426 | EXPORT_SYMBOL(add_to_page_cache); | 458 | EXPORT_SYMBOL(add_to_page_cache); |
427 | 459 | ||
428 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 460 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
@@ -489,8 +521,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr) | |||
489 | EXPORT_SYMBOL(wait_on_page_bit); | 521 | EXPORT_SYMBOL(wait_on_page_bit); |
490 | 522 | ||
491 | /** | 523 | /** |
492 | * unlock_page() - unlock a locked page | 524 | * unlock_page - unlock a locked page |
493 | * | ||
494 | * @page: the page | 525 | * @page: the page |
495 | * | 526 | * |
496 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). | 527 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). |
@@ -513,8 +544,9 @@ void fastcall unlock_page(struct page *page) | |||
513 | } | 544 | } |
514 | EXPORT_SYMBOL(unlock_page); | 545 | EXPORT_SYMBOL(unlock_page); |
515 | 546 | ||
516 | /* | 547 | /** |
517 | * End writeback against a page. | 548 | * end_page_writeback - end writeback against a page |
549 | * @page: the page | ||
518 | */ | 550 | */ |
519 | void end_page_writeback(struct page *page) | 551 | void end_page_writeback(struct page *page) |
520 | { | 552 | { |
@@ -527,10 +559,11 @@ void end_page_writeback(struct page *page) | |||
527 | } | 559 | } |
528 | EXPORT_SYMBOL(end_page_writeback); | 560 | EXPORT_SYMBOL(end_page_writeback); |
529 | 561 | ||
530 | /* | 562 | /** |
531 | * Get a lock on the page, assuming we need to sleep to get it. | 563 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
564 | * @page: the page to lock | ||
532 | * | 565 | * |
533 | * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some | 566 | * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some |
534 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However | 567 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However |
535 | * chances are that on the second loop, the block layer's plug list is empty, | 568 | * chances are that on the second loop, the block layer's plug list is empty, |
536 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. | 569 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. |
@@ -544,8 +577,12 @@ void fastcall __lock_page(struct page *page) | |||
544 | } | 577 | } |
545 | EXPORT_SYMBOL(__lock_page); | 578 | EXPORT_SYMBOL(__lock_page); |
546 | 579 | ||
547 | /* | 580 | /** |
548 | * a rather lightweight function, finding and getting a reference to a | 581 | * find_get_page - find and get a page reference |
582 | * @mapping: the address_space to search | ||
583 | * @offset: the page index | ||
584 | * | ||
585 | * A rather lightweight function, finding and getting a reference to a | ||
549 | * hashed page atomically. | 586 | * hashed page atomically. |
550 | */ | 587 | */ |
551 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 588 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) |
@@ -559,11 +596,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) | |||
559 | read_unlock_irq(&mapping->tree_lock); | 596 | read_unlock_irq(&mapping->tree_lock); |
560 | return page; | 597 | return page; |
561 | } | 598 | } |
562 | |||
563 | EXPORT_SYMBOL(find_get_page); | 599 | EXPORT_SYMBOL(find_get_page); |
564 | 600 | ||
565 | /* | 601 | /** |
566 | * Same as above, but trylock it instead of incrementing the count. | 602 | * find_trylock_page - find and lock a page |
603 | * @mapping: the address_space to search | ||
604 | * @offset: the page index | ||
605 | * | ||
606 | * Same as find_get_page(), but trylock it instead of incrementing the count. | ||
567 | */ | 607 | */ |
568 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) | 608 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) |
569 | { | 609 | { |
@@ -576,12 +616,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs | |||
576 | read_unlock_irq(&mapping->tree_lock); | 616 | read_unlock_irq(&mapping->tree_lock); |
577 | return page; | 617 | return page; |
578 | } | 618 | } |
579 | |||
580 | EXPORT_SYMBOL(find_trylock_page); | 619 | EXPORT_SYMBOL(find_trylock_page); |
581 | 620 | ||
582 | /** | 621 | /** |
583 | * find_lock_page - locate, pin and lock a pagecache page | 622 | * find_lock_page - locate, pin and lock a pagecache page |
584 | * | ||
585 | * @mapping: the address_space to search | 623 | * @mapping: the address_space to search |
586 | * @offset: the page index | 624 | * @offset: the page index |
587 | * | 625 | * |
@@ -617,12 +655,10 @@ repeat: | |||
617 | read_unlock_irq(&mapping->tree_lock); | 655 | read_unlock_irq(&mapping->tree_lock); |
618 | return page; | 656 | return page; |
619 | } | 657 | } |
620 | |||
621 | EXPORT_SYMBOL(find_lock_page); | 658 | EXPORT_SYMBOL(find_lock_page); |
622 | 659 | ||
623 | /** | 660 | /** |
624 | * find_or_create_page - locate or add a pagecache page | 661 | * find_or_create_page - locate or add a pagecache page |
625 | * | ||
626 | * @mapping: the page's address_space | 662 | * @mapping: the page's address_space |
627 | * @index: the page's index into the mapping | 663 | * @index: the page's index into the mapping |
628 | * @gfp_mask: page allocation mode | 664 | * @gfp_mask: page allocation mode |
@@ -663,7 +699,6 @@ repeat: | |||
663 | page_cache_release(cached_page); | 699 | page_cache_release(cached_page); |
664 | return page; | 700 | return page; |
665 | } | 701 | } |
666 | |||
667 | EXPORT_SYMBOL(find_or_create_page); | 702 | EXPORT_SYMBOL(find_or_create_page); |
668 | 703 | ||
669 | /** | 704 | /** |
@@ -729,9 +764,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
729 | return i; | 764 | return i; |
730 | } | 765 | } |
731 | 766 | ||
732 | /* | 767 | /** |
768 | * find_get_pages_tag - find and return pages that match @tag | ||
769 | * @mapping: the address_space to search | ||
770 | * @index: the starting page index | ||
771 | * @tag: the tag index | ||
772 | * @nr_pages: the maximum number of pages | ||
773 | * @pages: where the resulting pages are placed | ||
774 | * | ||
733 | * Like find_get_pages, except we only return pages which are tagged with | 775 | * Like find_get_pages, except we only return pages which are tagged with |
734 | * `tag'. We update *index to index the next page for the traversal. | 776 | * @tag. We update @index to index the next page for the traversal. |
735 | */ | 777 | */ |
736 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | 778 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, |
737 | int tag, unsigned int nr_pages, struct page **pages) | 779 | int tag, unsigned int nr_pages, struct page **pages) |
@@ -750,7 +792,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
750 | return ret; | 792 | return ret; |
751 | } | 793 | } |
752 | 794 | ||
753 | /* | 795 | /** |
796 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
797 | * @mapping: target address_space | ||
798 | * @index: the page index | ||
799 | * | ||
754 | * Same as grab_cache_page, but do not wait if the page is unavailable. | 800 | * Same as grab_cache_page, but do not wait if the page is unavailable. |
755 | * This is intended for speculative data generators, where the data can | 801 | * This is intended for speculative data generators, where the data can |
756 | * be regenerated if the page couldn't be grabbed. This routine should | 802 | * be regenerated if the page couldn't be grabbed. This routine should |
@@ -779,19 +825,51 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | |||
779 | } | 825 | } |
780 | return page; | 826 | return page; |
781 | } | 827 | } |
782 | |||
783 | EXPORT_SYMBOL(grab_cache_page_nowait); | 828 | EXPORT_SYMBOL(grab_cache_page_nowait); |
784 | 829 | ||
785 | /* | 830 | /* |
831 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | ||
832 | * a _large_ part of the i/o request. Imagine the worst scenario: | ||
833 | * | ||
834 | * ---R__________________________________________B__________ | ||
835 | * ^ reading here ^ bad block(assume 4k) | ||
836 | * | ||
837 | * read(R) => miss => readahead(R...B) => media error => frustrating retries | ||
838 | * => failing the whole request => read(R) => read(R+1) => | ||
839 | * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => | ||
840 | * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => | ||
841 | * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... | ||
842 | * | ||
843 | * It is going insane. Fix it by quickly scaling down the readahead size. | ||
844 | */ | ||
845 | static void shrink_readahead_size_eio(struct file *filp, | ||
846 | struct file_ra_state *ra) | ||
847 | { | ||
848 | if (!ra->ra_pages) | ||
849 | return; | ||
850 | |||
851 | ra->ra_pages /= 4; | ||
852 | printk(KERN_WARNING "Reducing readahead size to %luK\n", | ||
853 | ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); | ||
854 | } | ||
855 | |||
856 | /** | ||
857 | * do_generic_mapping_read - generic file read routine | ||
858 | * @mapping: address_space to be read | ||
859 | * @_ra: file's readahead state | ||
860 | * @filp: the file to read | ||
861 | * @ppos: current file position | ||
862 | * @desc: read_descriptor | ||
863 | * @actor: read method | ||
864 | * | ||
786 | * This is a generic file read routine, and uses the | 865 | * This is a generic file read routine, and uses the |
787 | * mapping->a_ops->readpage() function for the actual low-level | 866 | * mapping->a_ops->readpage() function for the actual low-level stuff. |
788 | * stuff. | ||
789 | * | 867 | * |
790 | * This is really ugly. But the goto's actually try to clarify some | 868 | * This is really ugly. But the goto's actually try to clarify some |
791 | * of the logic when it comes to error handling etc. | 869 | * of the logic when it comes to error handling etc. |
792 | * | 870 | * |
793 | * Note the struct file* is only passed for the use of readpage. It may be | 871 | * Note the struct file* is only passed for the use of readpage. |
794 | * NULL. | 872 | * It may be NULL. |
795 | */ | 873 | */ |
796 | void do_generic_mapping_read(struct address_space *mapping, | 874 | void do_generic_mapping_read(struct address_space *mapping, |
797 | struct file_ra_state *_ra, | 875 | struct file_ra_state *_ra, |
@@ -932,6 +1010,7 @@ readpage: | |||
932 | } | 1010 | } |
933 | unlock_page(page); | 1011 | unlock_page(page); |
934 | error = -EIO; | 1012 | error = -EIO; |
1013 | shrink_readahead_size_eio(filp, &ra); | ||
935 | goto readpage_error; | 1014 | goto readpage_error; |
936 | } | 1015 | } |
937 | unlock_page(page); | 1016 | unlock_page(page); |
@@ -1004,7 +1083,6 @@ out: | |||
1004 | if (filp) | 1083 | if (filp) |
1005 | file_accessed(filp); | 1084 | file_accessed(filp); |
1006 | } | 1085 | } |
1007 | |||
1008 | EXPORT_SYMBOL(do_generic_mapping_read); | 1086 | EXPORT_SYMBOL(do_generic_mapping_read); |
1009 | 1087 | ||
1010 | int file_read_actor(read_descriptor_t *desc, struct page *page, | 1088 | int file_read_actor(read_descriptor_t *desc, struct page *page, |
@@ -1046,7 +1124,13 @@ success: | |||
1046 | } | 1124 | } |
1047 | EXPORT_SYMBOL_GPL(file_read_actor); | 1125 | EXPORT_SYMBOL_GPL(file_read_actor); |
1048 | 1126 | ||
1049 | /* | 1127 | /** |
1128 | * __generic_file_aio_read - generic filesystem read routine | ||
1129 | * @iocb: kernel I/O control block | ||
1130 | * @iov: io vector request | ||
1131 | * @nr_segs: number of segments in the iovec | ||
1132 | * @ppos: current file position | ||
1133 | * | ||
1050 | * This is the "read()" routine for all filesystems | 1134 | * This is the "read()" routine for all filesystems |
1051 | * that can use the page cache directly. | 1135 | * that can use the page cache directly. |
1052 | */ | 1136 | */ |
@@ -1125,7 +1209,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1125 | out: | 1209 | out: |
1126 | return retval; | 1210 | return retval; |
1127 | } | 1211 | } |
1128 | |||
1129 | EXPORT_SYMBOL(__generic_file_aio_read); | 1212 | EXPORT_SYMBOL(__generic_file_aio_read); |
1130 | 1213 | ||
1131 | ssize_t | 1214 | ssize_t |
@@ -1136,7 +1219,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t | |||
1136 | BUG_ON(iocb->ki_pos != pos); | 1219 | BUG_ON(iocb->ki_pos != pos); |
1137 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); | 1220 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); |
1138 | } | 1221 | } |
1139 | |||
1140 | EXPORT_SYMBOL(generic_file_aio_read); | 1222 | EXPORT_SYMBOL(generic_file_aio_read); |
1141 | 1223 | ||
1142 | ssize_t | 1224 | ssize_t |
@@ -1152,7 +1234,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo | |||
1152 | ret = wait_on_sync_kiocb(&kiocb); | 1234 | ret = wait_on_sync_kiocb(&kiocb); |
1153 | return ret; | 1235 | return ret; |
1154 | } | 1236 | } |
1155 | |||
1156 | EXPORT_SYMBOL(generic_file_read); | 1237 | EXPORT_SYMBOL(generic_file_read); |
1157 | 1238 | ||
1158 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) | 1239 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) |
@@ -1193,7 +1274,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, | |||
1193 | return desc.written; | 1274 | return desc.written; |
1194 | return desc.error; | 1275 | return desc.error; |
1195 | } | 1276 | } |
1196 | |||
1197 | EXPORT_SYMBOL(generic_file_sendfile); | 1277 | EXPORT_SYMBOL(generic_file_sendfile); |
1198 | 1278 | ||
1199 | static ssize_t | 1279 | static ssize_t |
@@ -1229,11 +1309,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1229 | } | 1309 | } |
1230 | 1310 | ||
1231 | #ifdef CONFIG_MMU | 1311 | #ifdef CONFIG_MMU |
1232 | /* | 1312 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); |
1313 | /** | ||
1314 | * page_cache_read - adds requested page to the page cache if not already there | ||
1315 | * @file: file to read | ||
1316 | * @offset: page index | ||
1317 | * | ||
1233 | * This adds the requested page to the page cache if it isn't already there, | 1318 | * This adds the requested page to the page cache if it isn't already there, |
1234 | * and schedules an I/O to read in its contents from disk. | 1319 | * and schedules an I/O to read in its contents from disk. |
1235 | */ | 1320 | */ |
1236 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
1237 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1321 | static int fastcall page_cache_read(struct file * file, unsigned long offset) |
1238 | { | 1322 | { |
1239 | struct address_space *mapping = file->f_mapping; | 1323 | struct address_space *mapping = file->f_mapping; |
@@ -1260,7 +1344,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) | |||
1260 | 1344 | ||
1261 | #define MMAP_LOTSAMISS (100) | 1345 | #define MMAP_LOTSAMISS (100) |
1262 | 1346 | ||
1263 | /* | 1347 | /** |
1348 | * filemap_nopage - read in file data for page fault handling | ||
1349 | * @area: the applicable vm_area | ||
1350 | * @address: target address to read in | ||
1351 | * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL | ||
1352 | * | ||
1264 | * filemap_nopage() is invoked via the vma operations vector for a | 1353 | * filemap_nopage() is invoked via the vma operations vector for a |
1265 | * mapped memory region to read in file data during a page fault. | 1354 | * mapped memory region to read in file data during a page fault. |
1266 | * | 1355 | * |
@@ -1327,7 +1416,7 @@ retry_find: | |||
1327 | */ | 1416 | */ |
1328 | if (!did_readaround) { | 1417 | if (!did_readaround) { |
1329 | majmin = VM_FAULT_MAJOR; | 1418 | majmin = VM_FAULT_MAJOR; |
1330 | inc_page_state(pgmajfault); | 1419 | count_vm_event(PGMAJFAULT); |
1331 | } | 1420 | } |
1332 | did_readaround = 1; | 1421 | did_readaround = 1; |
1333 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | 1422 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); |
@@ -1398,7 +1487,7 @@ no_cached_page: | |||
1398 | page_not_uptodate: | 1487 | page_not_uptodate: |
1399 | if (!did_readaround) { | 1488 | if (!did_readaround) { |
1400 | majmin = VM_FAULT_MAJOR; | 1489 | majmin = VM_FAULT_MAJOR; |
1401 | inc_page_state(pgmajfault); | 1490 | count_vm_event(PGMAJFAULT); |
1402 | } | 1491 | } |
1403 | lock_page(page); | 1492 | lock_page(page); |
1404 | 1493 | ||
@@ -1460,10 +1549,10 @@ page_not_uptodate: | |||
1460 | * Things didn't work out. Return zero to tell the | 1549 | * Things didn't work out. Return zero to tell the |
1461 | * mm layer so, possibly freeing the page cache page first. | 1550 | * mm layer so, possibly freeing the page cache page first. |
1462 | */ | 1551 | */ |
1552 | shrink_readahead_size_eio(file, ra); | ||
1463 | page_cache_release(page); | 1553 | page_cache_release(page); |
1464 | return NULL; | 1554 | return NULL; |
1465 | } | 1555 | } |
1466 | |||
1467 | EXPORT_SYMBOL(filemap_nopage); | 1556 | EXPORT_SYMBOL(filemap_nopage); |
1468 | 1557 | ||
1469 | static struct page * filemap_getpage(struct file *file, unsigned long pgoff, | 1558 | static struct page * filemap_getpage(struct file *file, unsigned long pgoff, |
@@ -1717,7 +1806,13 @@ repeat: | |||
1717 | return page; | 1806 | return page; |
1718 | } | 1807 | } |
1719 | 1808 | ||
1720 | /* | 1809 | /** |
1810 | * read_cache_page - read into page cache, fill it if needed | ||
1811 | * @mapping: the page's address_space | ||
1812 | * @index: the page index | ||
1813 | * @filler: function to perform the read | ||
1814 | * @data: destination for read data | ||
1815 | * | ||
1721 | * Read into the page cache. If a page already exists, | 1816 | * Read into the page cache. If a page already exists, |
1722 | * and PageUptodate() is not set, try to fill the page. | 1817 | * and PageUptodate() is not set, try to fill the page. |
1723 | */ | 1818 | */ |
@@ -1755,7 +1850,6 @@ retry: | |||
1755 | out: | 1850 | out: |
1756 | return page; | 1851 | return page; |
1757 | } | 1852 | } |
1758 | |||
1759 | EXPORT_SYMBOL(read_cache_page); | 1853 | EXPORT_SYMBOL(read_cache_page); |
1760 | 1854 | ||
1761 | /* | 1855 | /* |
@@ -1826,7 +1920,7 @@ int remove_suid(struct dentry *dentry) | |||
1826 | EXPORT_SYMBOL(remove_suid); | 1920 | EXPORT_SYMBOL(remove_suid); |
1827 | 1921 | ||
1828 | size_t | 1922 | size_t |
1829 | __filemap_copy_from_user_iovec(char *vaddr, | 1923 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, |
1830 | const struct iovec *iov, size_t base, size_t bytes) | 1924 | const struct iovec *iov, size_t base, size_t bytes) |
1831 | { | 1925 | { |
1832 | size_t copied = 0, left = 0; | 1926 | size_t copied = 0, left = 0; |
@@ -1836,18 +1930,14 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
1836 | int copy = min(bytes, iov->iov_len - base); | 1930 | int copy = min(bytes, iov->iov_len - base); |
1837 | 1931 | ||
1838 | base = 0; | 1932 | base = 0; |
1839 | left = __copy_from_user_inatomic(vaddr, buf, copy); | 1933 | left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); |
1840 | copied += copy; | 1934 | copied += copy; |
1841 | bytes -= copy; | 1935 | bytes -= copy; |
1842 | vaddr += copy; | 1936 | vaddr += copy; |
1843 | iov++; | 1937 | iov++; |
1844 | 1938 | ||
1845 | if (unlikely(left)) { | 1939 | if (unlikely(left)) |
1846 | /* zero the rest of the target like __copy_from_user */ | ||
1847 | if (bytes) | ||
1848 | memset(vaddr, 0, bytes); | ||
1849 | break; | 1940 | break; |
1850 | } | ||
1851 | } | 1941 | } |
1852 | return copied - left; | 1942 | return copied - left; |
1853 | } | 1943 | } |
@@ -1855,7 +1945,7 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
1855 | /* | 1945 | /* |
1856 | * Performs necessary checks before doing a write | 1946 | * Performs necessary checks before doing a write |
1857 | * | 1947 | * |
1858 | * Can adjust writing position aor amount of bytes to write. | 1948 | * Can adjust writing position or amount of bytes to write. |
1859 | * Returns appropriate error code that caller should return or | 1949 | * Returns appropriate error code that caller should return or |
1860 | * zero in case that write should be allowed. | 1950 | * zero in case that write should be allowed. |
1861 | */ | 1951 | */ |
@@ -1979,7 +2069,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1979 | { | 2069 | { |
1980 | struct file *file = iocb->ki_filp; | 2070 | struct file *file = iocb->ki_filp; |
1981 | struct address_space * mapping = file->f_mapping; | 2071 | struct address_space * mapping = file->f_mapping; |
1982 | struct address_space_operations *a_ops = mapping->a_ops; | 2072 | const struct address_space_operations *a_ops = mapping->a_ops; |
1983 | struct inode *inode = mapping->host; | 2073 | struct inode *inode = mapping->host; |
1984 | long status = 0; | 2074 | long status = 0; |
1985 | struct page *page; | 2075 | struct page *page; |
@@ -2005,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2005 | do { | 2095 | do { |
2006 | unsigned long index; | 2096 | unsigned long index; |
2007 | unsigned long offset; | 2097 | unsigned long offset; |
2008 | unsigned long maxlen; | ||
2009 | size_t copied; | 2098 | size_t copied; |
2010 | 2099 | ||
2011 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2100 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
2012 | index = pos >> PAGE_CACHE_SHIFT; | 2101 | index = pos >> PAGE_CACHE_SHIFT; |
2013 | bytes = PAGE_CACHE_SIZE - offset; | 2102 | bytes = PAGE_CACHE_SIZE - offset; |
2014 | if (bytes > count) | 2103 | |
2015 | bytes = count; | 2104 | /* Limit the size of the copy to the caller's write size */ |
2105 | bytes = min(bytes, count); | ||
2106 | |||
2107 | /* | ||
2108 | * Limit the size of the copy to that of the current segment, | ||
2109 | * because fault_in_pages_readable() doesn't know how to walk | ||
2110 | * segments. | ||
2111 | */ | ||
2112 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
2016 | 2113 | ||
2017 | /* | 2114 | /* |
2018 | * Bring in the user page that we will copy from _first_. | 2115 | * Bring in the user page that we will copy from _first_. |
@@ -2020,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2020 | * same page as we're writing to, without it being marked | 2117 | * same page as we're writing to, without it being marked |
2021 | * up-to-date. | 2118 | * up-to-date. |
2022 | */ | 2119 | */ |
2023 | maxlen = cur_iov->iov_len - iov_base; | 2120 | fault_in_pages_readable(buf, bytes); |
2024 | if (maxlen > bytes) | ||
2025 | maxlen = bytes; | ||
2026 | fault_in_pages_readable(buf, maxlen); | ||
2027 | 2121 | ||
2028 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2122 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); |
2029 | if (!page) { | 2123 | if (!page) { |
@@ -2031,6 +2125,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2031 | break; | 2125 | break; |
2032 | } | 2126 | } |
2033 | 2127 | ||
2128 | if (unlikely(bytes == 0)) { | ||
2129 | status = 0; | ||
2130 | copied = 0; | ||
2131 | goto zero_length_segment; | ||
2132 | } | ||
2133 | |||
2034 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2134 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
2035 | if (unlikely(status)) { | 2135 | if (unlikely(status)) { |
2036 | loff_t isize = i_size_read(inode); | 2136 | loff_t isize = i_size_read(inode); |
@@ -2060,7 +2160,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2060 | page_cache_release(page); | 2160 | page_cache_release(page); |
2061 | continue; | 2161 | continue; |
2062 | } | 2162 | } |
2063 | if (likely(copied > 0)) { | 2163 | zero_length_segment: |
2164 | if (likely(copied >= 0)) { | ||
2064 | if (!status) | 2165 | if (!status) |
2065 | status = copied; | 2166 | status = copied; |
2066 | 2167 | ||
@@ -2125,7 +2226,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2125 | unsigned long nr_segs, loff_t *ppos) | 2226 | unsigned long nr_segs, loff_t *ppos) |
2126 | { | 2227 | { |
2127 | struct file *file = iocb->ki_filp; | 2228 | struct file *file = iocb->ki_filp; |
2128 | struct address_space * mapping = file->f_mapping; | 2229 | const struct address_space * mapping = file->f_mapping; |
2129 | size_t ocount; /* original count */ | 2230 | size_t ocount; /* original count */ |
2130 | size_t count; /* after file limit checks */ | 2231 | size_t count; /* after file limit checks */ |
2131 | struct inode *inode = mapping->host; | 2232 | struct inode *inode = mapping->host; |
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..3f2a343c6015 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,18 +13,26 @@ | |||
13 | #include <linux/highmem.h> | 13 | #include <linux/highmem.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/config.h> | 15 | #include <linux/config.h> |
16 | #include <asm/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | 17 | ||
18 | size_t | 18 | size_t |
19 | __filemap_copy_from_user_iovec(char *vaddr, | 19 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, |
20 | const struct iovec *iov, | 20 | const struct iovec *iov, |
21 | size_t base, | 21 | size_t base, |
22 | size_t bytes); | 22 | size_t bytes); |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Copy as much as we can into the page and return the number of bytes which | 25 | * Copy as much as we can into the page and return the number of bytes which |
26 | * were sucessfully copied. If a fault is encountered then clear the page | 26 | * were sucessfully copied. If a fault is encountered then clear the page |
27 | * out to (offset+bytes) and return the number of bytes which were copied. | 27 | * out to (offset+bytes) and return the number of bytes which were copied. |
28 | * | ||
29 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
30 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
31 | * and if the following non-atomic copy succeeds, then there is a small window | ||
32 | * where the target page contains neither the data before the write, nor the | ||
33 | * data after the write (it contains zero). A read at this time will see | ||
34 | * data that is inconsistent with any ordering of the read and the write. | ||
35 | * (This has been detected in practice). | ||
28 | */ | 36 | */ |
29 | static inline size_t | 37 | static inline size_t |
30 | filemap_copy_from_user(struct page *page, unsigned long offset, | 38 | filemap_copy_from_user(struct page *page, unsigned long offset, |
@@ -34,13 +42,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset, | |||
34 | int left; | 42 | int left; |
35 | 43 | ||
36 | kaddr = kmap_atomic(page, KM_USER0); | 44 | kaddr = kmap_atomic(page, KM_USER0); |
37 | left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); | 45 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); |
38 | kunmap_atomic(kaddr, KM_USER0); | 46 | kunmap_atomic(kaddr, KM_USER0); |
39 | 47 | ||
40 | if (left != 0) { | 48 | if (left != 0) { |
41 | /* Do it the slow way */ | 49 | /* Do it the slow way */ |
42 | kaddr = kmap(page); | 50 | kaddr = kmap(page); |
43 | left = __copy_from_user(kaddr + offset, buf, bytes); | 51 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); |
44 | kunmap(page); | 52 | kunmap(page); |
45 | } | 53 | } |
46 | return bytes - left; | 54 | return bytes - left; |
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | |||
60 | size_t copied; | 68 | size_t copied; |
61 | 69 | ||
62 | kaddr = kmap_atomic(page, KM_USER0); | 70 | kaddr = kmap_atomic(page, KM_USER0); |
63 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | 71 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, |
64 | base, bytes); | 72 | base, bytes); |
65 | kunmap_atomic(kaddr, KM_USER0); | 73 | kunmap_atomic(kaddr, KM_USER0); |
66 | if (copied != bytes) { | 74 | if (copied != bytes) { |
67 | kaddr = kmap(page); | 75 | kaddr = kmap(page); |
68 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | 76 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, |
69 | base, bytes); | 77 | base, bytes); |
78 | if (bytes - copied) | ||
79 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
70 | kunmap(page); | 80 | kunmap(page); |
71 | } | 81 | } |
72 | return copied; | 82 | return copied; |
@@ -78,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
78 | const struct iovec *iov = *iovp; | 88 | const struct iovec *iov = *iovp; |
79 | size_t base = *basep; | 89 | size_t base = *basep; |
80 | 90 | ||
81 | while (bytes) { | 91 | do { |
82 | int copy = min(bytes, iov->iov_len - base); | 92 | int copy = min(bytes, iov->iov_len - base); |
83 | 93 | ||
84 | bytes -= copy; | 94 | bytes -= copy; |
@@ -87,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
87 | iov++; | 97 | iov++; |
88 | base = 0; | 98 | base = 0; |
89 | } | 99 | } |
90 | } | 100 | } while (bytes); |
91 | *iovp = iov; | 101 | *iovp = iov; |
92 | *basep = base; | 102 | *basep = base; |
93 | } | 103 | } |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e5918..b4fd0d7c9bfb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
273 | size_t count, loff_t pos, loff_t *ppos) | 273 | size_t count, loff_t pos, loff_t *ppos) |
274 | { | 274 | { |
275 | struct address_space * mapping = filp->f_mapping; | 275 | struct address_space * mapping = filp->f_mapping; |
276 | struct address_space_operations *a_ops = mapping->a_ops; | 276 | const struct address_space_operations *a_ops = mapping->a_ops; |
277 | struct inode *inode = mapping->host; | 277 | struct inode *inode = mapping->host; |
278 | long status = 0; | 278 | long status = 0; |
279 | struct page *page; | 279 | struct page *page; |
diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf44..21b7d0cbc98c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
83 | page_add_file_rmap(page); | 83 | page_add_file_rmap(page); |
84 | pte_val = *pte; | 84 | pte_val = *pte; |
85 | update_mmu_cache(vma, addr, pte_val); | 85 | update_mmu_cache(vma, addr, pte_val); |
86 | lazy_mmu_prot_update(pte_val); | ||
86 | err = 0; | 87 | err = 0; |
87 | unlock: | 88 | unlock: |
88 | pte_unmap_unlock(pte, ptl); | 89 | pte_unmap_unlock(pte, ptl); |
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
114 | 115 | ||
115 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 116 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
116 | pte_val = *pte; | 117 | pte_val = *pte; |
117 | update_mmu_cache(vma, addr, pte_val); | 118 | /* |
119 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
120 | * being installed by install_file_pte() is not a real pte - it's a | ||
121 | * non-present entry (like a swap entry), noting what file offset should | ||
122 | * be mapped there when there's a fault (in a non-linear vma where | ||
123 | * that's not obvious). | ||
124 | */ | ||
118 | pte_unmap_unlock(pte, ptl); | 125 | pte_unmap_unlock(pte, ptl); |
119 | err = 0; | 126 | err = 0; |
120 | out: | 127 | out: |
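The comment added above refers to non-linear vmas; these are created from userspace with remap_file_pages(2), and install_file_pte() is the kernel path that records the rearranged file offsets as non-present "file ptes". A rough userspace sketch of the feature (the file name and sizes are placeholders, and error handling is omitted for brevity):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);	/* placeholder: at least 4 pages long */

	/* A shared mapping of the first four pages of the file. */
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);

	/*
	 * Rearrange the mapping without changing virtual addresses: the
	 * first page of the window now shows file page 3.  Until it is
	 * faulted in, that offset lives in a non-present "file pte".
	 */
	remap_file_pages(p, psz, 0, 3, 0);

	munmap(p, 4 * psz);
	close(fd);
	return 0;
}
```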
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b274fdf9d08..9b2a5403c447 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -315,8 +315,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | |||
315 | if (bvec->bv_page == org_vec->bv_page) | 315 | if (bvec->bv_page == org_vec->bv_page) |
316 | continue; | 316 | continue; |
317 | 317 | ||
318 | mempool_free(bvec->bv_page, pool); | 318 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); |
319 | dec_page_state(nr_bounce); | 319 | mempool_free(bvec->bv_page, pool); |
320 | } | 320 | } |
321 | 321 | ||
322 | bio_endio(bio_orig, bio_orig->bi_size, err); | 322 | bio_endio(bio_orig, bio_orig->bi_size, err); |
@@ -397,7 +397,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
397 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | 397 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); |
398 | to->bv_len = from->bv_len; | 398 | to->bv_len = from->bv_len; |
399 | to->bv_offset = from->bv_offset; | 399 | to->bv_offset = from->bv_offset; |
400 | inc_page_state(nr_bounce); | 400 | inc_zone_page_state(to->bv_page, NR_BOUNCE); |
401 | 401 | ||
402 | if (rw == WRITE) { | 402 | if (rw == WRITE) { |
403 | char *vto, *vfrom; | 403 | char *vto, *vfrom; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) | |||
123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
124 | unsigned long addr) | 124 | unsigned long addr) |
125 | { | 125 | { |
126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
127 | struct page *page; | 126 | struct page *page; |
128 | int use_reserve = 0; | ||
129 | unsigned long idx; | ||
130 | 127 | ||
131 | spin_lock(&hugetlb_lock); | 128 | spin_lock(&hugetlb_lock); |
132 | 129 | if (vma->vm_flags & VM_MAYSHARE) | |
133 | if (vma->vm_flags & VM_MAYSHARE) { | 130 | resv_huge_pages--; |
134 | 131 | else if (free_huge_pages <= resv_huge_pages) | |
135 | /* idx = radix tree index, i.e. offset into file in | 132 | goto fail; |
136 | * HPAGE_SIZE units */ | ||
137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
139 | |||
140 | /* The hugetlbfs specific inode info stores the number | ||
141 | * of "guaranteed available" (huge) pages. That is, | ||
142 | * the first 'prereserved_hpages' pages of the inode | ||
143 | * are either already instantiated, or have been | ||
144 | * pre-reserved (by hugetlb_reserve_for_inode()). Here | ||
145 | * we're in the process of instantiating the page, so | ||
146 | * we use this to determine whether to draw from the | ||
147 | * pre-reserved pool or the truly free pool. */ | ||
148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
149 | use_reserve = 1; | ||
150 | } | ||
151 | |||
152 | if (!use_reserve) { | ||
153 | if (free_huge_pages <= reserved_huge_pages) | ||
154 | goto fail; | ||
155 | } else { | ||
156 | BUG_ON(reserved_huge_pages == 0); | ||
157 | reserved_huge_pages--; | ||
158 | } | ||
159 | 133 | ||
160 | page = dequeue_huge_page(vma, addr); | 134 | page = dequeue_huge_page(vma, addr); |
161 | if (!page) | 135 | if (!page) |
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
165 | set_page_refcounted(page); | 139 | set_page_refcounted(page); |
166 | return page; | 140 | return page; |
167 | 141 | ||
168 | fail: | 142 | fail: |
169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
170 | spin_unlock(&hugetlb_lock); | 143 | spin_unlock(&hugetlb_lock); |
171 | return NULL; | 144 | return NULL; |
172 | } | 145 | } |
173 | 146 | ||
174 | /* hugetlb_extend_reservation() | ||
175 | * | ||
176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
177 | * available to instantiate the first 'atleast' pages of the given | ||
178 | * inode. If the inode doesn't already have this many pages reserved | ||
179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
181 | * than getting the SIGBUS later). | ||
182 | */ | ||
183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
184 | unsigned long atleast) | ||
185 | { | ||
186 | struct inode *inode = &info->vfs_inode; | ||
187 | unsigned long change_in_reserve = 0; | ||
188 | int ret = 0; | ||
189 | |||
190 | spin_lock(&hugetlb_lock); | ||
191 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
192 | |||
193 | if (info->prereserved_hpages >= atleast) | ||
194 | goto out; | ||
195 | |||
196 | /* Because we always call this on shared mappings, none of the | ||
197 | * pages beyond info->prereserved_hpages can have been | ||
198 | * instantiated, so we need to reserve all of them now. */ | ||
199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
200 | |||
201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
202 | ret = -ENOMEM; | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | reserved_huge_pages += change_in_reserve; | ||
207 | info->prereserved_hpages = atleast; | ||
208 | |||
209 | out: | ||
210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
211 | spin_unlock(&hugetlb_lock); | ||
212 | |||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* hugetlb_truncate_reservation() | ||
217 | * | ||
218 | * This returns pages reserved for the given inode to the general free | ||
219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
220 | * instantiated, beyond offset (atmost << HPAGE_SIZE), then release | ||
221 | * them. | ||
222 | */ | ||
223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
224 | unsigned long atmost) | ||
225 | { | ||
226 | struct inode *inode = &info->vfs_inode; | ||
227 | struct address_space *mapping = inode->i_mapping; | ||
228 | unsigned long idx; | ||
229 | unsigned long change_in_reserve = 0; | ||
230 | struct page *page; | ||
231 | |||
232 | spin_lock(&hugetlb_lock); | ||
233 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
234 | |||
235 | if (info->prereserved_hpages <= atmost) | ||
236 | goto out; | ||
237 | |||
238 | /* Count pages which were reserved, but not instantiated, and | ||
239 | * which we can now release. */ | ||
240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
242 | if (!page) | ||
243 | /* Pages which are already instantiated can't | ||
244 | * be unreserved (and in fact have already | ||
245 | * been removed from the reserved pool) */ | ||
246 | change_in_reserve++; | ||
247 | } | ||
248 | |||
249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
250 | reserved_huge_pages -= change_in_reserve; | ||
251 | info->prereserved_hpages = atmost; | ||
252 | |||
253 | out: | ||
254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
255 | spin_unlock(&hugetlb_lock); | ||
256 | } | ||
257 | |||
258 | static int __init hugetlb_init(void) | 147 | static int __init hugetlb_init(void) |
259 | { | 148 | { |
260 | unsigned long i; | 149 | unsigned long i; |
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
334 | return nr_huge_pages; | 223 | return nr_huge_pages; |
335 | 224 | ||
336 | spin_lock(&hugetlb_lock); | 225 | spin_lock(&hugetlb_lock); |
337 | count = max(count, reserved_huge_pages); | 226 | count = max(count, resv_huge_pages); |
338 | try_to_free_low(count); | 227 | try_to_free_low(count); |
339 | while (count < nr_huge_pages) { | 228 | while (count < nr_huge_pages) { |
340 | struct page *page = dequeue_huge_page(NULL, 0); | 229 | struct page *page = dequeue_huge_page(NULL, 0); |
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) | |||
361 | return sprintf(buf, | 250 | return sprintf(buf, |
362 | "HugePages_Total: %5lu\n" | 251 | "HugePages_Total: %5lu\n" |
363 | "HugePages_Free: %5lu\n" | 252 | "HugePages_Free: %5lu\n" |
364 | "HugePages_Rsvd: %5lu\n" | 253 | "HugePages_Rsvd: %5lu\n" |
365 | "Hugepagesize: %5lu kB\n", | 254 | "Hugepagesize: %5lu kB\n", |
366 | nr_huge_pages, | 255 | nr_huge_pages, |
367 | free_huge_pages, | 256 | free_huge_pages, |
368 | reserved_huge_pages, | 257 | resv_huge_pages, |
369 | HPAGE_SIZE/1024); | 258 | HPAGE_SIZE/1024); |
370 | } | 259 | } |
371 | 260 | ||
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
754 | flush_tlb_range(vma, start, end); | 643 | flush_tlb_range(vma, start, end); |
755 | } | 644 | } |
756 | 645 | ||
646 | struct file_region { | ||
647 | struct list_head link; | ||
648 | long from; | ||
649 | long to; | ||
650 | }; | ||
651 | |||
652 | static long region_add(struct list_head *head, long f, long t) | ||
653 | { | ||
654 | struct file_region *rg, *nrg, *trg; | ||
655 | |||
656 | /* Locate the region we are either in or before. */ | ||
657 | list_for_each_entry(rg, head, link) | ||
658 | if (f <= rg->to) | ||
659 | break; | ||
660 | |||
661 | /* Round our left edge to the current segment if it encloses us. */ | ||
662 | if (f > rg->from) | ||
663 | f = rg->from; | ||
664 | |||
665 | /* Check for and consume any regions we now overlap with. */ | ||
666 | nrg = rg; | ||
667 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
668 | if (&rg->link == head) | ||
669 | break; | ||
670 | if (rg->from > t) | ||
671 | break; | ||
672 | |||
673 | /* If this area reaches higher then extend our area to | ||
674 | * include it completely. If this is not the first area | ||
675 | * which we intend to reuse, free it. */ | ||
676 | if (rg->to > t) | ||
677 | t = rg->to; | ||
678 | if (rg != nrg) { | ||
679 | list_del(&rg->link); | ||
680 | kfree(rg); | ||
681 | } | ||
682 | } | ||
683 | nrg->from = f; | ||
684 | nrg->to = t; | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | static long region_chg(struct list_head *head, long f, long t) | ||
689 | { | ||
690 | struct file_region *rg, *nrg; | ||
691 | long chg = 0; | ||
692 | |||
693 | /* Locate the region we are before or in. */ | ||
694 | list_for_each_entry(rg, head, link) | ||
695 | if (f <= rg->to) | ||
696 | break; | ||
697 | |||
698 | /* If we are below the current region then a new region is required. | ||
699 | * Subtle: allocate a new region at the position, but make it zero | ||
700 | * size so that we can guarantee to record the reservation. */ | ||
701 | if (&rg->link == head || t < rg->from) { | ||
702 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
703 | if (nrg == 0) | ||
704 | return -ENOMEM; | ||
705 | nrg->from = f; | ||
706 | nrg->to = f; | ||
707 | INIT_LIST_HEAD(&nrg->link); | ||
708 | list_add(&nrg->link, rg->link.prev); | ||
709 | |||
710 | return t - f; | ||
711 | } | ||
712 | |||
713 | /* Round our left edge to the current segment if it encloses us. */ | ||
714 | if (f > rg->from) | ||
715 | f = rg->from; | ||
716 | chg = t - f; | ||
717 | |||
718 | /* Check for and consume any regions we now overlap with. */ | ||
719 | list_for_each_entry(rg, rg->link.prev, link) { | ||
720 | if (&rg->link == head) | ||
721 | break; | ||
722 | if (rg->from > t) | ||
723 | return chg; | ||
724 | |||
725 | /* We overlap with this area; if it extends further than | ||
726 | * us then we must extend ourselves. Account for its | ||
727 | * existing reservation. */ | ||
728 | if (rg->to > t) { | ||
729 | chg += rg->to - t; | ||
730 | t = rg->to; | ||
731 | } | ||
732 | chg -= rg->to - rg->from; | ||
733 | } | ||
734 | return chg; | ||
735 | } | ||
736 | |||
737 | static long region_truncate(struct list_head *head, long end) | ||
738 | { | ||
739 | struct file_region *rg, *trg; | ||
740 | long chg = 0; | ||
741 | |||
742 | /* Locate the region we are either in or before. */ | ||
743 | list_for_each_entry(rg, head, link) | ||
744 | if (end <= rg->to) | ||
745 | break; | ||
746 | if (&rg->link == head) | ||
747 | return 0; | ||
748 | |||
749 | /* If we are in the middle of a region then adjust it. */ | ||
750 | if (end > rg->from) { | ||
751 | chg = rg->to - end; | ||
752 | rg->to = end; | ||
753 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
754 | } | ||
755 | |||
756 | /* Drop any remaining regions. */ | ||
757 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
758 | if (&rg->link == head) | ||
759 | break; | ||
760 | chg += rg->to - rg->from; | ||
761 | list_del(&rg->link); | ||
762 | kfree(rg); | ||
763 | } | ||
764 | return chg; | ||
765 | } | ||
766 | |||
767 | static int hugetlb_acct_memory(long delta) | ||
768 | { | ||
769 | int ret = -ENOMEM; | ||
770 | |||
771 | spin_lock(&hugetlb_lock); | ||
772 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
773 | resv_huge_pages += delta; | ||
774 | ret = 0; | ||
775 | } | ||
776 | spin_unlock(&hugetlb_lock); | ||
777 | return ret; | ||
778 | } | ||
779 | |||
780 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
781 | { | ||
782 | long ret, chg; | ||
783 | |||
784 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
785 | if (chg < 0) | ||
786 | return chg; | ||
787 | ret = hugetlb_acct_memory(chg); | ||
788 | if (ret < 0) | ||
789 | return ret; | ||
790 | region_add(&inode->i_mapping->private_list, from, to); | ||
791 | return 0; | ||
792 | } | ||
793 | |||
794 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | ||
795 | { | ||
796 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | ||
797 | hugetlb_acct_memory(freed - chg); | ||
798 | } | ||
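The file_region list added above tracks hugepage reservations as half-open [from, to) spans hung off inode->i_mapping->private_list: region_chg() computes how much new reservation a range would need before anything is committed, and region_add() records it once the pages have been accounted. A rough user-space sketch of the same charge/commit bookkeeping, with hypothetical names and none of the kernel list helpers, might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical model of the kernel's file_region list: a sorted list of
 * non-overlapping half-open [from, to) spans. */
struct region {
	struct region *next;
	long from, to;
};

/* How many pages in [f, t) are not yet covered (what region_chg() charges). */
static long region_charge(const struct region *head, long f, long t)
{
	const struct region *rg;
	long chg = t - f;

	for (rg = head; rg; rg = rg->next) {
		long lo = rg->from > f ? rg->from : f;
		long hi = rg->to < t ? rg->to : t;

		if (hi > lo)
			chg -= hi - lo;	/* already reserved, don't charge again */
	}
	return chg;
}

/* Record [f, t) as reserved (what region_add() commits), merging overlaps. */
static struct region *region_commit(struct region *head, long f, long t)
{
	struct region **pp = &head;
	struct region *rg;

	while ((rg = *pp) && rg->to < f)
		pp = &rg->next;			/* skip regions entirely below [f, t) */

	while ((rg = *pp) && rg->from <= t) {	/* absorb all overlapping regions */
		if (rg->from < f)
			f = rg->from;
		if (rg->to > t)
			t = rg->to;
		*pp = rg->next;
		free(rg);
	}

	rg = malloc(sizeof(*rg));
	if (!rg)
		abort();
	rg->from = f;
	rg->to = t;
	rg->next = *pp;
	*pp = rg;
	return head;
}

int main(void)
{
	struct region *head = NULL;

	head = region_commit(head, 0, 4);		/* reserve pages 0..3          */
	printf("%ld\n", region_charge(head, 2, 8));	/* 4: only pages 4..7 are new  */
	head = region_commit(head, 2, 8);
	printf("%ld\n", region_charge(head, 0, 8));	/* 0: everything is reserved   */
	return 0;
}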
diff --git a/mm/memory.c b/mm/memory.c index 0ec7bc644271..7e2a4b1580e3 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -126,7 +126,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | |||
126 | pmd_clear(pmd); | 126 | pmd_clear(pmd); |
127 | pte_lock_deinit(page); | 127 | pte_lock_deinit(page); |
128 | pte_free_tlb(tlb, page); | 128 | pte_free_tlb(tlb, page); |
129 | dec_page_state(nr_page_table_pages); | 129 | dec_zone_page_state(page, NR_PAGETABLE); |
130 | tlb->mm->nr_ptes--; | 130 | tlb->mm->nr_ptes--; |
131 | } | 131 | } |
132 | 132 | ||
@@ -311,7 +311,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
311 | pte_free(new); | 311 | pte_free(new); |
312 | } else { | 312 | } else { |
313 | mm->nr_ptes++; | 313 | mm->nr_ptes++; |
314 | inc_page_state(nr_page_table_pages); | 314 | inc_zone_page_state(new, NR_PAGETABLE); |
315 | pmd_populate(mm, pmd, new); | 315 | pmd_populate(mm, pmd, new); |
316 | } | 316 | } |
317 | spin_unlock(&mm->page_table_lock); | 317 | spin_unlock(&mm->page_table_lock); |
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
434 | /* pte contains position in swap or file, so copy. */ | 434 | /* pte contains position in swap or file, so copy. */ |
435 | if (unlikely(!pte_present(pte))) { | 435 | if (unlikely(!pte_present(pte))) { |
436 | if (!pte_file(pte)) { | 436 | if (!pte_file(pte)) { |
437 | swap_duplicate(pte_to_swp_entry(pte)); | 437 | swp_entry_t entry = pte_to_swp_entry(pte); |
438 | |||
439 | swap_duplicate(entry); | ||
438 | /* make sure dst_mm is on swapoff's mmlist. */ | 440 | /* make sure dst_mm is on swapoff's mmlist. */ |
439 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 441 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
440 | spin_lock(&mmlist_lock); | 442 | spin_lock(&mmlist_lock); |
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
443 | &src_mm->mmlist); | 445 | &src_mm->mmlist); |
444 | spin_unlock(&mmlist_lock); | 446 | spin_unlock(&mmlist_lock); |
445 | } | 447 | } |
448 | if (is_write_migration_entry(entry) && | ||
449 | is_cow_mapping(vm_flags)) { | ||
450 | /* | ||
451 | * COW mappings require pages in both parent | ||
452 | * and child to be set to read. | ||
453 | */ | ||
454 | make_migration_entry_read(&entry); | ||
455 | pte = swp_entry_to_pte(entry); | ||
456 | set_pte_at(src_mm, addr, src_pte, pte); | ||
457 | } | ||
446 | } | 458 | } |
447 | goto out_set_pte; | 459 | goto out_set_pte; |
448 | } | 460 | } |
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1445 | { | 1457 | { |
1446 | struct page *old_page, *new_page; | 1458 | struct page *old_page, *new_page; |
1447 | pte_t entry; | 1459 | pte_t entry; |
1448 | int ret = VM_FAULT_MINOR; | 1460 | int reuse, ret = VM_FAULT_MINOR; |
1449 | 1461 | ||
1450 | old_page = vm_normal_page(vma, address, orig_pte); | 1462 | old_page = vm_normal_page(vma, address, orig_pte); |
1451 | if (!old_page) | 1463 | if (!old_page) |
1452 | goto gotten; | 1464 | goto gotten; |
1453 | 1465 | ||
1454 | if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | 1466 | if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == |
1455 | int reuse = can_share_swap_page(old_page); | 1467 | (VM_SHARED|VM_WRITE))) { |
1456 | unlock_page(old_page); | 1468 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
1457 | if (reuse) { | 1469 | /* |
1458 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1470 | * Notify the address space that the page is about to |
1459 | entry = pte_mkyoung(orig_pte); | 1471 | * become writable so that it can prohibit this or wait |
1460 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1472 | * for the page to get into an appropriate state. |
1461 | ptep_set_access_flags(vma, address, page_table, entry, 1); | 1473 | * |
1462 | update_mmu_cache(vma, address, entry); | 1474 | * We do this without the lock held, so that it can |
1463 | lazy_mmu_prot_update(entry); | 1475 | * sleep if it needs to. |
1464 | ret |= VM_FAULT_WRITE; | 1476 | */ |
1465 | goto unlock; | 1477 | page_cache_get(old_page); |
1478 | pte_unmap_unlock(page_table, ptl); | ||
1479 | |||
1480 | if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) | ||
1481 | goto unwritable_page; | ||
1482 | |||
1483 | page_cache_release(old_page); | ||
1484 | |||
1485 | /* | ||
1486 | * Since we dropped the lock we need to revalidate | ||
1487 | * the PTE as someone else may have changed it. If | ||
1488 | * they did, we just return, as we can count on the | ||
1489 | * MMU to tell us if they didn't also make it writable. | ||
1490 | */ | ||
1491 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
1492 | &ptl); | ||
1493 | if (!pte_same(*page_table, orig_pte)) | ||
1494 | goto unlock; | ||
1466 | } | 1495 | } |
1496 | |||
1497 | reuse = 1; | ||
1498 | } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | ||
1499 | reuse = can_share_swap_page(old_page); | ||
1500 | unlock_page(old_page); | ||
1501 | } else { | ||
1502 | reuse = 0; | ||
1503 | } | ||
1504 | |||
1505 | if (reuse) { | ||
1506 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | ||
1507 | entry = pte_mkyoung(orig_pte); | ||
1508 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1509 | ptep_set_access_flags(vma, address, page_table, entry, 1); | ||
1510 | update_mmu_cache(vma, address, entry); | ||
1511 | lazy_mmu_prot_update(entry); | ||
1512 | ret |= VM_FAULT_WRITE; | ||
1513 | goto unlock; | ||
1467 | } | 1514 | } |
1468 | 1515 | ||
1469 | /* | 1516 | /* |
@@ -1523,6 +1570,10 @@ oom: | |||
1523 | if (old_page) | 1570 | if (old_page) |
1524 | page_cache_release(old_page); | 1571 | page_cache_release(old_page); |
1525 | return VM_FAULT_OOM; | 1572 | return VM_FAULT_OOM; |
1573 | |||
1574 | unwritable_page: | ||
1575 | page_cache_release(old_page); | ||
1576 | return VM_FAULT_SIGBUS; | ||
1526 | } | 1577 | } |
1527 | 1578 | ||
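The do_wp_page() change above (and the matching do_no_page() change later in this file) introduces an optional vm_ops->page_mkwrite() hook: it is called without the page-table lock held before a page in a shared, writable mapping actually becomes writable, and a negative return value turns the fault into VM_FAULT_SIGBUS. A driver-side handler might look roughly like the sketch below; the example_* names and the reservation helper are assumptions, not part of this patch:

/* Sketch of a ->page_mkwrite() implementation for a shared mapping.
 * The hook may sleep; returning a negative value refuses the write. */
static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	/* e.g. reserve backing store for this file offset (assumed helper) */
	if (!example_reserve_backing_space(vma->vm_file, page->index))
		return -ENOSPC;		/* do_wp_page() only checks for < 0 */
	return 0;
}

static struct vm_operations_struct example_vm_ops = {
	.nopage		= example_nopage,	/* assumed existing fault handler */
	.page_mkwrite	= example_page_mkwrite,
};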
1528 | /* | 1579 | /* |
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1879 | goto out; | 1930 | goto out; |
1880 | 1931 | ||
1881 | entry = pte_to_swp_entry(orig_pte); | 1932 | entry = pte_to_swp_entry(orig_pte); |
1882 | again: | 1933 | if (is_migration_entry(entry)) { |
1934 | migration_entry_wait(mm, pmd, address); | ||
1935 | goto out; | ||
1936 | } | ||
1883 | page = lookup_swap_cache(entry); | 1937 | page = lookup_swap_cache(entry); |
1884 | if (!page) { | 1938 | if (!page) { |
1885 | swapin_readahead(entry, address, vma); | 1939 | swapin_readahead(entry, address, vma); |
@@ -1897,18 +1951,12 @@ again: | |||
1897 | 1951 | ||
1898 | /* Had to read the page from swap area: Major fault */ | 1952 | /* Had to read the page from swap area: Major fault */ |
1899 | ret = VM_FAULT_MAJOR; | 1953 | ret = VM_FAULT_MAJOR; |
1900 | inc_page_state(pgmajfault); | 1954 | count_vm_event(PGMAJFAULT); |
1901 | grab_swap_token(); | 1955 | grab_swap_token(); |
1902 | } | 1956 | } |
1903 | 1957 | ||
1904 | mark_page_accessed(page); | 1958 | mark_page_accessed(page); |
1905 | lock_page(page); | 1959 | lock_page(page); |
1906 | if (!PageSwapCache(page)) { | ||
1907 | /* Page migration has occured */ | ||
1908 | unlock_page(page); | ||
1909 | page_cache_release(page); | ||
1910 | goto again; | ||
1911 | } | ||
1912 | 1960 | ||
1913 | /* | 1961 | /* |
1914 | * Back out if somebody else already faulted in this pte. | 1962 | * Back out if somebody else already faulted in this pte. |
@@ -2074,18 +2122,31 @@ retry: | |||
2074 | /* | 2122 | /* |
2075 | * Should we do an early C-O-W break? | 2123 | * Should we do an early C-O-W break? |
2076 | */ | 2124 | */ |
2077 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 2125 | if (write_access) { |
2078 | struct page *page; | 2126 | if (!(vma->vm_flags & VM_SHARED)) { |
2127 | struct page *page; | ||
2079 | 2128 | ||
2080 | if (unlikely(anon_vma_prepare(vma))) | 2129 | if (unlikely(anon_vma_prepare(vma))) |
2081 | goto oom; | 2130 | goto oom; |
2082 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 2131 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
2083 | if (!page) | 2132 | if (!page) |
2084 | goto oom; | 2133 | goto oom; |
2085 | copy_user_highpage(page, new_page, address); | 2134 | copy_user_highpage(page, new_page, address); |
2086 | page_cache_release(new_page); | 2135 | page_cache_release(new_page); |
2087 | new_page = page; | 2136 | new_page = page; |
2088 | anon = 1; | 2137 | anon = 1; |
2138 | |||
2139 | } else { | ||
2140 | /* if the page will be shareable, see if the backing | ||
2141 | * address space wants to know that the page is about | ||
2142 | * to become writable */ | ||
2143 | if (vma->vm_ops->page_mkwrite && | ||
2144 | vma->vm_ops->page_mkwrite(vma, new_page) < 0 | ||
2145 | ) { | ||
2146 | page_cache_release(new_page); | ||
2147 | return VM_FAULT_SIGBUS; | ||
2148 | } | ||
2149 | } | ||
2089 | } | 2150 | } |
2090 | 2151 | ||
2091 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2152 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
@@ -2263,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2263 | 2324 | ||
2264 | __set_current_state(TASK_RUNNING); | 2325 | __set_current_state(TASK_RUNNING); |
2265 | 2326 | ||
2266 | inc_page_state(pgfault); | 2327 | count_vm_event(PGFAULT); |
2267 | 2328 | ||
2268 | if (unlikely(is_vm_hugetlb_page(vma))) | 2329 | if (unlikely(is_vm_hugetlb_page(vma))) |
2269 | return hugetlb_fault(mm, vma, address, write_access); | 2330 | return hugetlb_fault(mm, vma, address, write_access); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70df5c0d957e..01c9fb97c619 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) | 4 | * Copyright (C) |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/config.h> | ||
8 | #include <linux/stddef.h> | 7 | #include <linux/stddef.h> |
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
@@ -21,12 +20,13 @@ | |||
21 | #include <linux/memory_hotplug.h> | 20 | #include <linux/memory_hotplug.h> |
22 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
23 | #include <linux/vmalloc.h> | 22 | #include <linux/vmalloc.h> |
23 | #include <linux/ioport.h> | ||
24 | 24 | ||
25 | #include <asm/tlbflush.h> | 25 | #include <asm/tlbflush.h> |
26 | 26 | ||
27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, |
28 | unsigned long size); | 28 | unsigned long size); |
29 | static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | 29 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) |
30 | { | 30 | { |
31 | struct pglist_data *pgdat = zone->zone_pgdat; | 31 | struct pglist_data *pgdat = zone->zone_pgdat; |
32 | int nr_pages = PAGES_PER_SECTION; | 32 | int nr_pages = PAGES_PER_SECTION; |
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
34 | int zone_type; | 34 | int zone_type; |
35 | 35 | ||
36 | zone_type = zone - pgdat->node_zones; | 36 | zone_type = zone - pgdat->node_zones; |
37 | if (!populated_zone(zone)) { | ||
38 | int ret = 0; | ||
39 | ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); | ||
40 | if (ret < 0) | ||
41 | return ret; | ||
42 | } | ||
37 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 43 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); |
38 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | 44 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); |
45 | return 0; | ||
39 | } | 46 | } |
40 | 47 | ||
41 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 48 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | |||
50 | if (ret < 0) | 57 | if (ret < 0) |
51 | return ret; | 58 | return ret; |
52 | 59 | ||
53 | __add_zone(zone, phys_start_pfn); | 60 | ret = __add_zone(zone, phys_start_pfn); |
61 | |||
62 | if (ret < 0) | ||
63 | return ret; | ||
64 | |||
54 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 65 | return register_new_memory(__pfn_to_section(phys_start_pfn)); |
55 | } | 66 | } |
56 | 67 | ||
@@ -115,7 +126,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
115 | unsigned long i; | 126 | unsigned long i; |
116 | unsigned long flags; | 127 | unsigned long flags; |
117 | unsigned long onlined_pages = 0; | 128 | unsigned long onlined_pages = 0; |
129 | struct resource res; | ||
130 | u64 section_end; | ||
131 | unsigned long start_pfn; | ||
118 | struct zone *zone; | 132 | struct zone *zone; |
133 | int need_zonelists_rebuild = 0; | ||
119 | 134 | ||
120 | /* | 135 | /* |
121 | * This doesn't need a lock to do pfn_to_page(). | 136 | * This doesn't need a lock to do pfn_to_page(). |
@@ -128,15 +143,140 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
128 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); | 143 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); |
129 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | 144 | pgdat_resize_unlock(zone->zone_pgdat, &flags); |
130 | 145 | ||
131 | for (i = 0; i < nr_pages; i++) { | 146 | /* |
132 | struct page *page = pfn_to_page(pfn + i); | 147 | * If this zone is not yet populated it is not in the zonelists, so |
133 | online_page(page); | 148 | * the page allocator ignores it. The zonelists must therefore be |
134 | onlined_pages++; | 149 | * rebuilt after these pages come online. |
150 | */ | ||
151 | if (!populated_zone(zone)) | ||
152 | need_zonelists_rebuild = 1; | ||
153 | |||
154 | res.start = (u64)pfn << PAGE_SHIFT; | ||
155 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | ||
156 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | ||
157 | section_end = res.end; | ||
158 | |||
159 | while (find_next_system_ram(&res) >= 0) { | ||
160 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
161 | nr_pages = (unsigned long) | ||
162 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
163 | |||
164 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
165 | /* this region's pages are not online yet */ | ||
166 | for (i = 0; i < nr_pages; i++) { | ||
167 | struct page *page = pfn_to_page(start_pfn + i); | ||
168 | online_page(page); | ||
169 | onlined_pages++; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | res.start = res.end + 1; | ||
174 | res.end = section_end; | ||
135 | } | 175 | } |
136 | zone->present_pages += onlined_pages; | 176 | zone->present_pages += onlined_pages; |
137 | zone->zone_pgdat->node_present_pages += onlined_pages; | 177 | zone->zone_pgdat->node_present_pages += onlined_pages; |
138 | 178 | ||
139 | setup_per_zone_pages_min(); | 179 | setup_per_zone_pages_min(); |
140 | 180 | ||
181 | if (need_zonelists_rebuild) | ||
182 | build_all_zonelists(); | ||
183 | vm_total_pages = nr_free_pagecache_pages(); | ||
141 | return 0; | 184 | return 0; |
142 | } | 185 | } |
186 | |||
187 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | ||
188 | { | ||
189 | struct pglist_data *pgdat; | ||
190 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | ||
191 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | ||
192 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
193 | |||
194 | pgdat = arch_alloc_nodedata(nid); | ||
195 | if (!pgdat) | ||
196 | return NULL; | ||
197 | |||
198 | arch_refresh_nodedata(nid, pgdat); | ||
199 | |||
200 | /* we can use NODE_DATA(nid) from here */ | ||
201 | |||
202 | /* init node's zones as empty zones, we don't have any present pages.*/ | ||
203 | free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); | ||
204 | |||
205 | return pgdat; | ||
206 | } | ||
207 | |||
208 | static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | ||
209 | { | ||
210 | arch_refresh_nodedata(nid, NULL); | ||
211 | arch_free_nodedata(pgdat); | ||
212 | return; | ||
213 | } | ||
214 | |||
215 | /* add this memory to iomem resource */ | ||
216 | static void register_memory_resource(u64 start, u64 size) | ||
217 | { | ||
218 | struct resource *res; | ||
219 | |||
220 | res = kzalloc(sizeof(struct resource), GFP_KERNEL); | ||
221 | BUG_ON(!res); | ||
222 | |||
223 | res->name = "System RAM"; | ||
224 | res->start = start; | ||
225 | res->end = start + size - 1; | ||
226 | res->flags = IORESOURCE_MEM; | ||
227 | if (request_resource(&iomem_resource, res) < 0) { | ||
228 | printk("System RAM resource %llx - %llx cannot be added\n", | ||
229 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
230 | kfree(res); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | |||
235 | |||
236 | int add_memory(int nid, u64 start, u64 size) | ||
237 | { | ||
238 | pg_data_t *pgdat = NULL; | ||
239 | int new_pgdat = 0; | ||
240 | int ret; | ||
241 | |||
242 | if (!node_online(nid)) { | ||
243 | pgdat = hotadd_new_pgdat(nid, start); | ||
244 | if (!pgdat) | ||
245 | return -ENOMEM; | ||
246 | new_pgdat = 1; | ||
247 | ret = kswapd_run(nid); | ||
248 | if (ret) | ||
249 | goto error; | ||
250 | } | ||
251 | |||
252 | /* call arch's memory hotadd */ | ||
253 | ret = arch_add_memory(nid, start, size); | ||
254 | |||
255 | if (ret < 0) | ||
256 | goto error; | ||
257 | |||
258 | /* we online node here. we can't roll back from here. */ | ||
259 | node_set_online(nid); | ||
260 | |||
261 | if (new_pgdat) { | ||
262 | ret = register_one_node(nid); | ||
263 | /* | ||
264 | * If the sysfs file for the new node cannot be created, CPUs on | ||
265 | * that node cannot be hot-added. There is no rollback path at | ||
266 | * this point, so catch the failure with BUG_ON() for now. | ||
267 | */ | ||
268 | BUG_ON(ret); | ||
269 | } | ||
270 | |||
271 | /* register this memory as resource */ | ||
272 | register_memory_resource(start, size); | ||
273 | |||
274 | return ret; | ||
275 | error: | ||
276 | /* rollback pgdat allocation and others */ | ||
277 | if (new_pgdat) | ||
278 | rollback_node_hotadd(nid, pgdat); | ||
279 | |||
280 | return ret; | ||
281 | } | ||
282 | EXPORT_SYMBOL_GPL(add_memory); | ||
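add_memory() is the new arch-independent entry point for memory hot-add; a platform driver (for instance an ACPI memory-device driver) would call it roughly as sketched below and then leave the new sections offline until they are onlined through the sysfs entries created by register_new_memory(). Only add_memory(nid, start, size) comes from this patch; the example_* name is an assumption:

/* Rough caller sketch: hot-add a discovered physical range. */
static int example_hotadd_range(int nid, u64 start, u64 size)
{
	int ret;

	ret = add_memory(nid, start, size);	/* allocates a pgdat if needed */
	if (ret)
		printk(KERN_ERR "example: hot-add of %llx-%llx failed (%d)\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + size - 1), ret);
	return ret;
}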
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8778f58880c4..e07e27e846a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -87,6 +87,8 @@ | |||
87 | #include <linux/seq_file.h> | 87 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 88 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | 89 | #include <linux/migrate.h> |
90 | #include <linux/rmap.h> | ||
91 | #include <linux/security.h> | ||
90 | 92 | ||
91 | #include <asm/tlbflush.h> | 93 | #include <asm/tlbflush.h> |
92 | #include <asm/uaccess.h> | 94 | #include <asm/uaccess.h> |
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
587 | isolate_lru_page(page, pagelist); | 589 | isolate_lru_page(page, pagelist); |
588 | } | 590 | } |
589 | 591 | ||
592 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | ||
593 | { | ||
594 | return alloc_pages_node(node, GFP_HIGHUSER, 0); | ||
595 | } | ||
596 | |||
590 | /* | 597 | /* |
591 | * Migrate pages from one node to a target node. | 598 | * Migrate pages from one node to a target node. |
592 | * Returns error or the number of pages not migrated. | 599 | * Returns error or the number of pages not migrated. |
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | |||
603 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, | 610 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, |
604 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 611 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
605 | 612 | ||
606 | if (!list_empty(&pagelist)) { | 613 | if (!list_empty(&pagelist)) |
607 | err = migrate_pages_to(&pagelist, NULL, dest); | 614 | err = migrate_pages(&pagelist, new_node_page, dest); |
608 | if (!list_empty(&pagelist)) | 615 | |
609 | putback_lru_pages(&pagelist); | ||
610 | } | ||
611 | return err; | 616 | return err; |
612 | } | 617 | } |
613 | 618 | ||
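migrate_pages() now takes a new_page_t allocation callback and an opaque private value instead of a list of preallocated target pages; new_node_page() above is the simplest possible callback. A caller that also wants per-page status back could pass something along these lines, where the example_* names and the bookkeeping struct are assumptions and only the callback signature is taken from this patch:

struct example_target {			/* assumed caller-side bookkeeping */
	int node;			/* destination node */
	int status;			/* written by unmap_and_move() via *result */
};

static struct page *example_new_page(struct page *page, unsigned long private,
				     int **result)
{
	struct example_target *t = (struct example_target *)private;

	*result = &t->status;		/* optional: collect per-page outcome */
	return alloc_pages_node(t->node, GFP_HIGHUSER, 0);
}

/* usage (assumed caller):
 *	err = migrate_pages(&pagelist, example_new_page, (unsigned long)&target);
 */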
@@ -627,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm, | |||
627 | 632 | ||
628 | down_read(&mm->mmap_sem); | 633 | down_read(&mm->mmap_sem); |
629 | 634 | ||
635 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | ||
636 | if (err) | ||
637 | goto out; | ||
638 | |||
630 | /* | 639 | /* |
631 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 640 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
632 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 641 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
@@ -686,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
686 | if (err < 0) | 695 | if (err < 0) |
687 | break; | 696 | break; |
688 | } | 697 | } |
689 | 698 | out: | |
690 | up_read(&mm->mmap_sem); | 699 | up_read(&mm->mmap_sem); |
691 | if (err < 0) | 700 | if (err < 0) |
692 | return err; | 701 | return err; |
@@ -694,6 +703,12 @@ int do_migrate_pages(struct mm_struct *mm, | |||
694 | 703 | ||
695 | } | 704 | } |
696 | 705 | ||
706 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | ||
707 | { | ||
708 | struct vm_area_struct *vma = (struct vm_area_struct *)private; | ||
709 | |||
710 | return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); | ||
711 | } | ||
697 | #else | 712 | #else |
698 | 713 | ||
699 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 714 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
@@ -706,6 +721,11 @@ int do_migrate_pages(struct mm_struct *mm, | |||
706 | { | 721 | { |
707 | return -ENOSYS; | 722 | return -ENOSYS; |
708 | } | 723 | } |
724 | |||
725 | static struct page *new_vma_page(struct page *page, unsigned long private) | ||
726 | { | ||
727 | return NULL; | ||
728 | } | ||
709 | #endif | 729 | #endif |
710 | 730 | ||
711 | long do_mbind(unsigned long start, unsigned long len, | 731 | long do_mbind(unsigned long start, unsigned long len, |
@@ -767,15 +787,13 @@ long do_mbind(unsigned long start, unsigned long len, | |||
767 | err = mbind_range(vma, start, end, new); | 787 | err = mbind_range(vma, start, end, new); |
768 | 788 | ||
769 | if (!list_empty(&pagelist)) | 789 | if (!list_empty(&pagelist)) |
770 | nr_failed = migrate_pages_to(&pagelist, vma, -1); | 790 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
791 | (unsigned long)vma); | ||
771 | 792 | ||
772 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 793 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
773 | err = -EIO; | 794 | err = -EIO; |
774 | } | 795 | } |
775 | 796 | ||
776 | if (!list_empty(&pagelist)) | ||
777 | putback_lru_pages(&pagelist); | ||
778 | |||
779 | up_write(&mm->mmap_sem); | 797 | up_write(&mm->mmap_sem); |
780 | mpol_free(new); | 798 | mpol_free(new); |
781 | return err; | 799 | return err; |
@@ -929,6 +947,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
929 | goto out; | 947 | goto out; |
930 | } | 948 | } |
931 | 949 | ||
950 | err = security_task_movememory(task); | ||
951 | if (err) | ||
952 | goto out; | ||
953 | |||
932 | err = do_migrate_pages(mm, &old, &new, | 954 | err = do_migrate_pages(mm, &old, &new, |
933 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 955 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); |
934 | out: | 956 | out: |
@@ -1187,10 +1209,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1187 | 1209 | ||
1188 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 1210 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); |
1189 | page = __alloc_pages(gfp, order, zl); | 1211 | page = __alloc_pages(gfp, order, zl); |
1190 | if (page && page_zone(page) == zl->zones[0]) { | 1212 | if (page && page_zone(page) == zl->zones[0]) |
1191 | zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; | 1213 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); |
1192 | put_cpu(); | ||
1193 | } | ||
1194 | return page; | 1214 | return page; |
1195 | } | 1215 | } |
1196 | 1216 | ||
@@ -1799,7 +1819,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, | |||
1799 | 1819 | ||
1800 | int show_numa_map(struct seq_file *m, void *v) | 1820 | int show_numa_map(struct seq_file *m, void *v) |
1801 | { | 1821 | { |
1802 | struct task_struct *task = m->private; | 1822 | struct proc_maps_private *priv = m->private; |
1803 | struct vm_area_struct *vma = v; | 1823 | struct vm_area_struct *vma = v; |
1804 | struct numa_maps *md; | 1824 | struct numa_maps *md; |
1805 | struct file *file = vma->vm_file; | 1825 | struct file *file = vma->vm_file; |
@@ -1815,7 +1835,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1815 | return 0; | 1835 | return 0; |
1816 | 1836 | ||
1817 | mpol_to_str(buffer, sizeof(buffer), | 1837 | mpol_to_str(buffer, sizeof(buffer), |
1818 | get_vma_policy(task, vma, vma->vm_start)); | 1838 | get_vma_policy(priv->task, vma, vma->vm_start)); |
1819 | 1839 | ||
1820 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 1840 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
1821 | 1841 | ||
@@ -1869,7 +1889,7 @@ out: | |||
1869 | kfree(md); | 1889 | kfree(md); |
1870 | 1890 | ||
1871 | if (m->count < m->size) | 1891 | if (m->count < m->size) |
1872 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | 1892 | m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; |
1873 | return 0; | 1893 | return 0; |
1874 | } | 1894 | } |
1875 | 1895 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index 1c25040693d2..3f1e0c2c942c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/migrate.h> | 15 | #include <linux/migrate.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/swapops.h> | ||
18 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
19 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
20 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
@@ -23,13 +24,13 @@ | |||
23 | #include <linux/topology.h> | 24 | #include <linux/topology.h> |
24 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
25 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
26 | #include <linux/swapops.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/mempolicy.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | #include <linux/security.h> | ||
27 | 31 | ||
28 | #include "internal.h" | 32 | #include "internal.h" |
29 | 33 | ||
30 | /* The maximum number of pages to take off the LRU for migration */ | ||
31 | #define MIGRATE_CHUNK_SIZE 256 | ||
32 | |||
33 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 34 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
34 | 35 | ||
35 | /* | 36 | /* |
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) | |||
64 | } | 65 | } |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * migrate_prep() needs to be called after we have compiled the list of pages | 68 | * migrate_prep() needs to be called before we start compiling a list of pages |
68 | * to be migrated using isolate_lru_page() but before we begin a series of calls | 69 | * to be migrated using isolate_lru_page(). |
69 | * to migrate_pages(). | ||
70 | */ | 70 | */ |
71 | int migrate_prep(void) | 71 | int migrate_prep(void) |
72 | { | 72 | { |
73 | /* Must have swap device for migration */ | ||
74 | if (nr_swap_pages <= 0) | ||
75 | return -ENODEV; | ||
76 | |||
77 | /* | 73 | /* |
78 | * Clear the LRU lists so pages can be isolated. | 74 | * Clear the LRU lists so pages can be isolated. |
79 | * Note that pages may be moved off the LRU after we have | 75 | * Note that pages may be moved off the LRU after we have |
@@ -87,7 +83,6 @@ int migrate_prep(void) | |||
87 | 83 | ||
88 | static inline void move_to_lru(struct page *page) | 84 | static inline void move_to_lru(struct page *page) |
89 | { | 85 | { |
90 | list_del(&page->lru); | ||
91 | if (PageActive(page)) { | 86 | if (PageActive(page)) { |
92 | /* | 87 | /* |
93 | * lru_cache_add_active checks that | 88 | * lru_cache_add_active checks that |
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l) | |||
113 | int count = 0; | 108 | int count = 0; |
114 | 109 | ||
115 | list_for_each_entry_safe(page, page2, l, lru) { | 110 | list_for_each_entry_safe(page, page2, l, lru) { |
111 | list_del(&page->lru); | ||
116 | move_to_lru(page); | 112 | move_to_lru(page); |
117 | count++; | 113 | count++; |
118 | } | 114 | } |
119 | return count; | 115 | return count; |
120 | } | 116 | } |
121 | 117 | ||
122 | /* | 118 | static inline int is_swap_pte(pte_t pte) |
123 | * Non migratable page | ||
124 | */ | ||
125 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
126 | { | 119 | { |
127 | return -EIO; | 120 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); |
128 | } | 121 | } |
129 | EXPORT_SYMBOL(fail_migrate_page); | ||
130 | 122 | ||
131 | /* | 123 | /* |
132 | * swapout a single page | 124 | * Restore a potential migration pte to a working pte entry |
133 | * page is locked upon entry, unlocked on exit | ||
134 | */ | 125 | */ |
135 | static int swap_page(struct page *page) | 126 | static void remove_migration_pte(struct vm_area_struct *vma, |
127 | struct page *old, struct page *new) | ||
136 | { | 128 | { |
137 | struct address_space *mapping = page_mapping(page); | 129 | struct mm_struct *mm = vma->vm_mm; |
130 | swp_entry_t entry; | ||
131 | pgd_t *pgd; | ||
132 | pud_t *pud; | ||
133 | pmd_t *pmd; | ||
134 | pte_t *ptep, pte; | ||
135 | spinlock_t *ptl; | ||
136 | unsigned long addr = page_address_in_vma(new, vma); | ||
137 | |||
138 | if (addr == -EFAULT) | ||
139 | return; | ||
140 | |||
141 | pgd = pgd_offset(mm, addr); | ||
142 | if (!pgd_present(*pgd)) | ||
143 | return; | ||
144 | |||
145 | pud = pud_offset(pgd, addr); | ||
146 | if (!pud_present(*pud)) | ||
147 | return; | ||
148 | |||
149 | pmd = pmd_offset(pud, addr); | ||
150 | if (!pmd_present(*pmd)) | ||
151 | return; | ||
152 | |||
153 | ptep = pte_offset_map(pmd, addr); | ||
154 | |||
155 | if (!is_swap_pte(*ptep)) { | ||
156 | pte_unmap(ptep); | ||
157 | return; | ||
158 | } | ||
138 | 159 | ||
139 | if (page_mapped(page) && mapping) | 160 | ptl = pte_lockptr(mm, pmd); |
140 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | 161 | spin_lock(ptl); |
141 | goto unlock_retry; | 162 | pte = *ptep; |
163 | if (!is_swap_pte(pte)) | ||
164 | goto out; | ||
142 | 165 | ||
143 | if (PageDirty(page)) { | 166 | entry = pte_to_swp_entry(pte); |
144 | /* Page is dirty, try to write it out here */ | ||
145 | switch(pageout(page, mapping)) { | ||
146 | case PAGE_KEEP: | ||
147 | case PAGE_ACTIVATE: | ||
148 | goto unlock_retry; | ||
149 | 167 | ||
150 | case PAGE_SUCCESS: | 168 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) |
151 | goto retry; | 169 | goto out; |
152 | 170 | ||
153 | case PAGE_CLEAN: | 171 | get_page(new); |
154 | ; /* try to free the page below */ | 172 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
155 | } | 173 | if (is_write_migration_entry(entry)) |
156 | } | 174 | pte = pte_mkwrite(pte); |
175 | set_pte_at(mm, addr, ptep, pte); | ||
157 | 176 | ||
158 | if (PagePrivate(page)) { | 177 | if (PageAnon(new)) |
159 | if (!try_to_release_page(page, GFP_KERNEL) || | 178 | page_add_anon_rmap(new, vma, addr); |
160 | (!mapping && page_count(page) == 1)) | 179 | else |
161 | goto unlock_retry; | 180 | page_add_file_rmap(new); |
162 | } | ||
163 | 181 | ||
164 | if (remove_mapping(mapping, page)) { | 182 | /* No need to invalidate - it was non-present before */ |
165 | /* Success */ | 183 | update_mmu_cache(vma, addr, pte); |
166 | unlock_page(page); | 184 | lazy_mmu_prot_update(pte); |
167 | return 0; | ||
168 | } | ||
169 | 185 | ||
170 | unlock_retry: | 186 | out: |
171 | unlock_page(page); | 187 | pte_unmap_unlock(ptep, ptl); |
188 | } | ||
172 | 189 | ||
173 | retry: | 190 | /* |
174 | return -EAGAIN; | 191 | * Note that remove_file_migration_ptes will only work on regular mappings, |
192 | * Nonlinear mappings do not use migration entries. | ||
193 | */ | ||
194 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
195 | { | ||
196 | struct vm_area_struct *vma; | ||
197 | struct address_space *mapping = page_mapping(new); | ||
198 | struct prio_tree_iter iter; | ||
199 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
200 | |||
201 | if (!mapping) | ||
202 | return; | ||
203 | |||
204 | spin_lock(&mapping->i_mmap_lock); | ||
205 | |||
206 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
207 | remove_migration_pte(vma, old, new); | ||
208 | |||
209 | spin_unlock(&mapping->i_mmap_lock); | ||
175 | } | 210 | } |
176 | 211 | ||
177 | /* | 212 | /* |
178 | * Remove references for a page and establish the new page with the correct | 213 | * Must hold mmap_sem lock on at least one of the vmas containing |
179 | * basic settings to be able to stop accesses to the page. | 214 | * the page so that the anon_vma cannot vanish. |
180 | */ | 215 | */ |
181 | int migrate_page_remove_references(struct page *newpage, | 216 | static void remove_anon_migration_ptes(struct page *old, struct page *new) |
182 | struct page *page, int nr_refs) | ||
183 | { | 217 | { |
184 | struct address_space *mapping = page_mapping(page); | 218 | struct anon_vma *anon_vma; |
185 | struct page **radix_pointer; | 219 | struct vm_area_struct *vma; |
220 | unsigned long mapping; | ||
186 | 221 | ||
187 | /* | 222 | mapping = (unsigned long)new->mapping; |
188 | * Avoid doing any of the following work if the page count | ||
189 | * indicates that the page is in use or truncate has removed | ||
190 | * the page. | ||
191 | */ | ||
192 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
193 | return -EAGAIN; | ||
194 | 223 | ||
195 | /* | 224 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) |
196 | * Establish swap ptes for anonymous pages or destroy pte | 225 | return; |
197 | * maps for files. | ||
198 | * | ||
199 | * In order to reestablish file backed mappings the fault handlers | ||
200 | * will take the radix tree_lock which may then be used to stop | ||
201 | * processses from accessing this page until the new page is ready. | ||
202 | * | ||
203 | * A process accessing via a swap pte (an anonymous page) will take a | ||
204 | * page_lock on the old page which will block the process until the | ||
205 | * migration attempt is complete. At that time the PageSwapCache bit | ||
206 | * will be examined. If the page was migrated then the PageSwapCache | ||
207 | * bit will be clear and the operation to retrieve the page will be | ||
208 | * retried which will find the new page in the radix tree. Then a new | ||
209 | * direct mapping may be generated based on the radix tree contents. | ||
210 | * | ||
211 | * If the page was not migrated then the PageSwapCache bit | ||
212 | * is still set and the operation may continue. | ||
213 | */ | ||
214 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
215 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
216 | return -EPERM; | ||
217 | 226 | ||
218 | /* | 227 | /* |
219 | * Give up if we were unable to remove all mappings. | 228 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. |
220 | */ | 229 | */ |
221 | if (page_mapcount(page)) | 230 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); |
222 | return -EAGAIN; | 231 | spin_lock(&anon_vma->lock); |
232 | |||
233 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
234 | remove_migration_pte(vma, old, new); | ||
235 | |||
236 | spin_unlock(&anon_vma->lock); | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Get rid of all migration entries and replace them by | ||
241 | * references to the indicated page. | ||
242 | */ | ||
243 | static void remove_migration_ptes(struct page *old, struct page *new) | ||
244 | { | ||
245 | if (PageAnon(new)) | ||
246 | remove_anon_migration_ptes(old, new); | ||
247 | else | ||
248 | remove_file_migration_ptes(old, new); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Something used the pte of a page under migration. We need to | ||
253 | * get to the page and wait until migration is finished. | ||
254 | * When we return from this function the fault will be retried. | ||
255 | * | ||
256 | * This function is called from do_swap_page(). | ||
257 | */ | ||
258 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | ||
259 | unsigned long address) | ||
260 | { | ||
261 | pte_t *ptep, pte; | ||
262 | spinlock_t *ptl; | ||
263 | swp_entry_t entry; | ||
264 | struct page *page; | ||
265 | |||
266 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
267 | pte = *ptep; | ||
268 | if (!is_swap_pte(pte)) | ||
269 | goto out; | ||
270 | |||
271 | entry = pte_to_swp_entry(pte); | ||
272 | if (!is_migration_entry(entry)) | ||
273 | goto out; | ||
274 | |||
275 | page = migration_entry_to_page(entry); | ||
276 | |||
277 | get_page(page); | ||
278 | pte_unmap_unlock(ptep, ptl); | ||
279 | wait_on_page_locked(page); | ||
280 | put_page(page); | ||
281 | return; | ||
282 | out: | ||
283 | pte_unmap_unlock(ptep, ptl); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Replace the page in the mapping. | ||
288 | * | ||
289 | * The number of remaining references must be: | ||
290 | * 1 for anonymous pages without a mapping | ||
291 | * 2 for pages with a mapping | ||
292 | * 3 for pages with a mapping and PagePrivate set. | ||
293 | */ | ||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | ||
295 | struct page *newpage, struct page *page) | ||
296 | { | ||
297 | struct page **radix_pointer; | ||
298 | |||
299 | if (!mapping) { | ||
300 | /* Anonymous page */ | ||
301 | if (page_count(page) != 1) | ||
302 | return -EAGAIN; | ||
303 | return 0; | ||
304 | } | ||
223 | 305 | ||
224 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
225 | 307 | ||
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage, | |||
227 | &mapping->page_tree, | 309 | &mapping->page_tree, |
228 | page_index(page)); | 310 | page_index(page)); |
229 | 311 | ||
230 | if (!page_mapping(page) || page_count(page) != nr_refs || | 312 | if (page_count(page) != 2 + !!PagePrivate(page) || |
231 | *radix_pointer != page) { | 313 | *radix_pointer != page) { |
232 | write_unlock_irq(&mapping->tree_lock); | 314 | write_unlock_irq(&mapping->tree_lock); |
233 | return -EAGAIN; | 315 | return -EAGAIN; |
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage, | |||
235 | 317 | ||
236 | /* | 318 | /* |
237 | * Now we know that no one else is looking at the page. | 319 | * Now we know that no one else is looking at the page. |
238 | * | ||
239 | * Certain minimal information about a page must be available | ||
240 | * in order for other subsystems to properly handle the page if they | ||
241 | * find it through the radix tree update before we are finished | ||
242 | * copying the page. | ||
243 | */ | 320 | */ |
244 | get_page(newpage); | 321 | get_page(newpage); |
245 | newpage->index = page->index; | 322 | #ifdef CONFIG_SWAP |
246 | newpage->mapping = page->mapping; | ||
247 | if (PageSwapCache(page)) { | 323 | if (PageSwapCache(page)) { |
248 | SetPageSwapCache(newpage); | 324 | SetPageSwapCache(newpage); |
249 | set_page_private(newpage, page_private(page)); | 325 | set_page_private(newpage, page_private(page)); |
250 | } | 326 | } |
327 | #endif | ||
251 | 328 | ||
252 | *radix_pointer = newpage; | 329 | *radix_pointer = newpage; |
253 | __put_page(page); | 330 | __put_page(page); |
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage, | |||
255 | 332 | ||
256 | return 0; | 333 | return 0; |
257 | } | 334 | } |
258 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
259 | 335 | ||
260 | /* | 336 | /* |
261 | * Copy the page to its new location | 337 | * Copy the page to its new location |
262 | */ | 338 | */ |
263 | void migrate_page_copy(struct page *newpage, struct page *page) | 339 | static void migrate_page_copy(struct page *newpage, struct page *page) |
264 | { | 340 | { |
265 | copy_highpage(newpage, page); | 341 | copy_highpage(newpage, page); |
266 | 342 | ||
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
282 | set_page_dirty(newpage); | 358 | set_page_dirty(newpage); |
283 | } | 359 | } |
284 | 360 | ||
361 | #ifdef CONFIG_SWAP | ||
285 | ClearPageSwapCache(page); | 362 | ClearPageSwapCache(page); |
363 | #endif | ||
286 | ClearPageActive(page); | 364 | ClearPageActive(page); |
287 | ClearPagePrivate(page); | 365 | ClearPagePrivate(page); |
288 | set_page_private(page, 0); | 366 | set_page_private(page, 0); |
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
295 | if (PageWriteback(newpage)) | 373 | if (PageWriteback(newpage)) |
296 | end_page_writeback(newpage); | 374 | end_page_writeback(newpage); |
297 | } | 375 | } |
298 | EXPORT_SYMBOL(migrate_page_copy); | 376 | |
377 | /************************************************************ | ||
378 | * Migration functions | ||
379 | ***********************************************************/ | ||
380 | |||
381 | /* Always fail migration. Used for mappings that are not movable */ | ||
382 | int fail_migrate_page(struct address_space *mapping, | ||
383 | struct page *newpage, struct page *page) | ||
384 | { | ||
385 | return -EIO; | ||
386 | } | ||
387 | EXPORT_SYMBOL(fail_migrate_page); | ||
299 | 388 | ||
300 | /* | 389 | /* |
301 | * Common logic to directly migrate a single page suitable for | 390 | * Common logic to directly migrate a single page suitable for |
@@ -303,51 +392,284 @@ EXPORT_SYMBOL(migrate_page_copy); | |||
303 | * | 392 | * |
304 | * Pages are locked upon entry and exit. | 393 | * Pages are locked upon entry and exit. |
305 | */ | 394 | */ |
306 | int migrate_page(struct page *newpage, struct page *page) | 395 | int migrate_page(struct address_space *mapping, |
396 | struct page *newpage, struct page *page) | ||
307 | { | 397 | { |
308 | int rc; | 398 | int rc; |
309 | 399 | ||
310 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 400 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
311 | 401 | ||
312 | rc = migrate_page_remove_references(newpage, page, 2); | 402 | rc = migrate_page_move_mapping(mapping, newpage, page); |
403 | |||
404 | if (rc) | ||
405 | return rc; | ||
406 | |||
407 | migrate_page_copy(newpage, page); | ||
408 | return 0; | ||
409 | } | ||
410 | EXPORT_SYMBOL(migrate_page); | ||
411 | |||
412 | /* | ||
413 | * Migration function for pages with buffers. This function can only be used | ||
414 | * if the underlying filesystem guarantees that no other references to "page" | ||
415 | * exist. | ||
416 | */ | ||
417 | int buffer_migrate_page(struct address_space *mapping, | ||
418 | struct page *newpage, struct page *page) | ||
419 | { | ||
420 | struct buffer_head *bh, *head; | ||
421 | int rc; | ||
422 | |||
423 | if (!page_has_buffers(page)) | ||
424 | return migrate_page(mapping, newpage, page); | ||
425 | |||
426 | head = page_buffers(page); | ||
427 | |||
428 | rc = migrate_page_move_mapping(mapping, newpage, page); | ||
313 | 429 | ||
314 | if (rc) | 430 | if (rc) |
315 | return rc; | 431 | return rc; |
316 | 432 | ||
433 | bh = head; | ||
434 | do { | ||
435 | get_bh(bh); | ||
436 | lock_buffer(bh); | ||
437 | bh = bh->b_this_page; | ||
438 | |||
439 | } while (bh != head); | ||
440 | |||
441 | ClearPagePrivate(page); | ||
442 | set_page_private(newpage, page_private(page)); | ||
443 | set_page_private(page, 0); | ||
444 | put_page(page); | ||
445 | get_page(newpage); | ||
446 | |||
447 | bh = head; | ||
448 | do { | ||
449 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
450 | bh = bh->b_this_page; | ||
451 | |||
452 | } while (bh != head); | ||
453 | |||
454 | SetPagePrivate(newpage); | ||
455 | |||
317 | migrate_page_copy(newpage, page); | 456 | migrate_page_copy(newpage, page); |
318 | 457 | ||
458 | bh = head; | ||
459 | do { | ||
460 | unlock_buffer(bh); | ||
461 | put_bh(bh); | ||
462 | bh = bh->b_this_page; | ||
463 | |||
464 | } while (bh != head); | ||
465 | |||
466 | return 0; | ||
467 | } | ||
468 | EXPORT_SYMBOL(buffer_migrate_page); | ||
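Filesystems opt into this path through the ->migratepage() hook in their address_space_operations; move_to_new_page() further down calls it when present and otherwise uses fallback_migrate_page(). A block-based filesystem would typically wire it up roughly as follows. example_aops and the read/write methods are assumptions; buffer_migrate_page() and migrate_page() are the helpers exported above:

/* Sketch: exporting a migration method from a filesystem's aops. */
static struct address_space_operations example_aops = {
	.readpage	= example_readpage,	/* assumed */
	.writepage	= example_writepage,	/* assumed */
	/* pages backed by buffer_heads can use the generic helper */
	.migratepage	= buffer_migrate_page,
	/* a filesystem with no private page state could instead use:
	 *	.migratepage	= migrate_page,
	 */
};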
469 | |||
470 | /* | ||
471 | * Writeback a page to clean the dirty state | ||
472 | */ | ||
473 | static int writeout(struct address_space *mapping, struct page *page) | ||
474 | { | ||
475 | struct writeback_control wbc = { | ||
476 | .sync_mode = WB_SYNC_NONE, | ||
477 | .nr_to_write = 1, | ||
478 | .range_start = 0, | ||
479 | .range_end = LLONG_MAX, | ||
480 | .nonblocking = 1, | ||
481 | .for_reclaim = 1 | ||
482 | }; | ||
483 | int rc; | ||
484 | |||
485 | if (!mapping->a_ops->writepage) | ||
486 | /* No write method for the address space */ | ||
487 | return -EINVAL; | ||
488 | |||
489 | if (!clear_page_dirty_for_io(page)) | ||
490 | /* Someone else already triggered a write */ | ||
491 | return -EAGAIN; | ||
492 | |||
319 | /* | 493 | /* |
320 | * Remove auxiliary swap entries and replace | 494 | * A dirty page may imply that the underlying filesystem has |
321 | * them with real ptes. | 495 | * the page on some queue. So the page must be clean for |
322 | * | 496 | * migration. Writeout may mean we loose the lock and the |
323 | * Note that a real pte entry will allow processes that are not | 497 | * page state is no longer what we checked for earlier. |
324 | * waiting on the page lock to use the new page via the page tables | 498 | * At this point we know that the migration attempt cannot |
325 | * before the new page is unlocked. | 499 | * be successful. |
326 | */ | 500 | */ |
327 | remove_from_swap(newpage); | 501 | remove_migration_ptes(page, page); |
328 | return 0; | 502 | |
503 | rc = mapping->a_ops->writepage(page, &wbc); | ||
504 | if (rc < 0) | ||
505 | /* I/O Error writing */ | ||
506 | return -EIO; | ||
507 | |||
508 | if (rc != AOP_WRITEPAGE_ACTIVATE) | ||
509 | /* unlocked. Relock */ | ||
510 | lock_page(page); | ||
511 | |||
512 | return -EAGAIN; | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * Default handling if a filesystem does not provide a migration function. | ||
517 | */ | ||
518 | static int fallback_migrate_page(struct address_space *mapping, | ||
519 | struct page *newpage, struct page *page) | ||
520 | { | ||
521 | if (PageDirty(page)) | ||
522 | return writeout(mapping, page); | ||
523 | |||
524 | /* | ||
525 | * Buffers may be managed in a filesystem specific way. | ||
526 | * We must have no buffers or drop them. | ||
527 | */ | ||
528 | if (page_has_buffers(page) && | ||
529 | !try_to_release_page(page, GFP_KERNEL)) | ||
530 | return -EAGAIN; | ||
531 | |||
532 | return migrate_page(mapping, newpage, page); | ||
533 | } | ||
534 | |||
535 | /* | ||
536 | * Move a page to a newly allocated page | ||
537 | * The page is locked and all ptes have been successfully removed. | ||
538 | * | ||
539 | * The new page will have replaced the old page if this function | ||
540 | * is successful. | ||
541 | */ | ||
542 | static int move_to_new_page(struct page *newpage, struct page *page) | ||
543 | { | ||
544 | struct address_space *mapping; | ||
545 | int rc; | ||
546 | |||
547 | /* | ||
548 | * Block others from accessing the page when we get around to | ||
549 | * establishing additional references. We are the only one | ||
550 | * holding a reference to the new page at this point. | ||
551 | */ | ||
552 | if (TestSetPageLocked(newpage)) | ||
553 | BUG(); | ||
554 | |||
555 | /* Prepare mapping for the new page.*/ | ||
556 | newpage->index = page->index; | ||
557 | newpage->mapping = page->mapping; | ||
558 | |||
559 | mapping = page_mapping(page); | ||
560 | if (!mapping) | ||
561 | rc = migrate_page(mapping, newpage, page); | ||
562 | else if (mapping->a_ops->migratepage) | ||
563 | /* | ||
564 | * Most pages have a mapping and most filesystems | ||
565 | * should provide a migration function. Anonymous | ||
566 | * pages are part of swap space which also has its | ||
567 | * own migration function. This is the most common | ||
568 | * path for page migration. | ||
569 | */ | ||
570 | rc = mapping->a_ops->migratepage(mapping, | ||
571 | newpage, page); | ||
572 | else | ||
573 | rc = fallback_migrate_page(mapping, newpage, page); | ||
574 | |||
575 | if (!rc) | ||
576 | remove_migration_ptes(page, newpage); | ||
577 | else | ||
578 | newpage->mapping = NULL; | ||
579 | |||
580 | unlock_page(newpage); | ||
581 | |||
582 | return rc; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Obtain the lock on page, remove all ptes and migrate the page | ||
587 | * to the newly allocated page in newpage. | ||
588 | */ | ||
589 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
590 | struct page *page, int force) | ||
591 | { | ||
592 | int rc = 0; | ||
593 | int *result = NULL; | ||
594 | struct page *newpage = get_new_page(page, private, &result); | ||
595 | |||
596 | if (!newpage) | ||
597 | return -ENOMEM; | ||
598 | |||
599 | if (page_count(page) == 1) | ||
600 | /* page was freed from under us. So we are done. */ | ||
601 | goto move_newpage; | ||
602 | |||
603 | rc = -EAGAIN; | ||
604 | if (TestSetPageLocked(page)) { | ||
605 | if (!force) | ||
606 | goto move_newpage; | ||
607 | lock_page(page); | ||
608 | } | ||
609 | |||
610 | if (PageWriteback(page)) { | ||
611 | if (!force) | ||
612 | goto unlock; | ||
613 | wait_on_page_writeback(page); | ||
614 | } | ||
615 | |||
616 | /* | ||
617 | * Establish migration ptes or remove ptes | ||
618 | */ | ||
619 | try_to_unmap(page, 1); | ||
620 | if (!page_mapped(page)) | ||
621 | rc = move_to_new_page(newpage, page); | ||
622 | |||
623 | if (rc) | ||
624 | remove_migration_ptes(page, page); | ||
625 | |||
626 | unlock: | ||
627 | unlock_page(page); | ||
628 | |||
629 | if (rc != -EAGAIN) { | ||
630 | /* | ||
631 | * A page that has been migrated has all references | ||
632 | * removed and will be freed. A page that has not been | ||
633 | * migrated will have kept its references and be | ||
634 | * restored. | ||
635 | */ | ||
636 | list_del(&page->lru); | ||
637 | move_to_lru(page); | ||
638 | } | ||
639 | |||
640 | move_newpage: | ||
641 | /* | ||
642 | * Move the new page to the LRU. If migration was not successful | ||
643 | * then this will free the page. | ||
644 | */ | ||
645 | move_to_lru(newpage); | ||
646 | if (result) { | ||
647 | if (rc) | ||
648 | *result = rc; | ||
649 | else | ||
650 | *result = page_to_nid(newpage); | ||
651 | } | ||
652 | return rc; | ||
329 | } | 653 | } |
330 | EXPORT_SYMBOL(migrate_page); | ||
331 | 654 | ||
332 | /* | 655 | /* |
333 | * migrate_pages | 656 | * migrate_pages |
334 | * | 657 | * |
335 | * Two lists are passed to this function. The first list | 658 | * The function takes one list of pages to migrate and an allocation |
336 | * contains the pages isolated from the LRU to be migrated. | 659 | * callback which, given a page to be migrated and the private data, |
337 | * The second list contains new pages that the pages isolated | 660 | * picks the target of the move and allocates the new page. |
338 | * can be moved to. If the second list is NULL then all | ||
339 | * pages are swapped out. | ||
340 | * | 661 | * |
341 | * The function returns after 10 attempts or if no pages | 662 | * The function returns after 10 attempts or if no pages |
342 | * are movable anymore because to has become empty | 663 | * are movable anymore because to has become empty |
343 | * or no retryable pages exist anymore. | 664 | * or no retryable pages exist anymore. All pages will be |
665 | * returned to the LRU or freed. | ||
344 | * | 666 | * |
345 | * Return: Number of pages not migrated when "to" ran empty. | 667 | * Return: Number of pages not migrated or error code. |
346 | */ | 668 | */ |
347 | int migrate_pages(struct list_head *from, struct list_head *to, | 669 | int migrate_pages(struct list_head *from, |
348 | struct list_head *moved, struct list_head *failed) | 670 | new_page_t get_new_page, unsigned long private) |
349 | { | 671 | { |
350 | int retry; | 672 | int retry = 1; |
351 | int nr_failed = 0; | 673 | int nr_failed = 0; |
352 | int pass = 0; | 674 | int pass = 0; |
353 | struct page *page; | 675 | struct page *page; |
@@ -358,305 +680,317 @@ int migrate_pages(struct list_head *from, struct list_head *to, | |||
358 | if (!swapwrite) | 680 | if (!swapwrite) |
359 | current->flags |= PF_SWAPWRITE; | 681 | current->flags |= PF_SWAPWRITE; |
360 | 682 | ||
361 | redo: | 683 | for(pass = 0; pass < 10 && retry; pass++) { |
362 | retry = 0; | 684 | retry = 0; |
685 | |||
686 | list_for_each_entry_safe(page, page2, from, lru) { | ||
687 | cond_resched(); | ||
688 | |||
689 | rc = unmap_and_move(get_new_page, private, | ||
690 | page, pass > 2); | ||
691 | |||
692 | switch(rc) { | ||
693 | case -ENOMEM: | ||
694 | goto out; | ||
695 | case -EAGAIN: | ||
696 | retry++; | ||
697 | break; | ||
698 | case 0: | ||
699 | break; | ||
700 | default: | ||
701 | /* Permanent failure */ | ||
702 | nr_failed++; | ||
703 | break; | ||
704 | } | ||
705 | } | ||
706 | } | ||
707 | rc = 0; | ||
708 | out: | ||
709 | if (!swapwrite) | ||
710 | current->flags &= ~PF_SWAPWRITE; | ||
711 | |||
712 | putback_lru_pages(from); | ||
713 | |||
714 | if (rc) | ||
715 | return rc; | ||
363 | 716 | ||
364 | list_for_each_entry_safe(page, page2, from, lru) { | 717 | return nr_failed + retry; |
365 | struct page *newpage = NULL; | 718 | } |
366 | struct address_space *mapping; | ||
367 | 719 | ||
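A sketch of how a caller might drive the reworked migrate_pages() interface: pages are first isolated onto a private list, then an allocator matching the new_page_t callback shape (as in new_page_node() below) decides where each replacement page comes from. The names and the fixed target node are purely illustrative:

/* allocate each replacement page on the node passed in "private" */
static struct page *new_page_on_node(struct page *page, unsigned long private,
				     int **result)
{
	return alloc_pages_node((int)private, GFP_HIGHUSER, 0);
}

static int move_list_to_node(struct list_head *pagelist, int target_node)
{
	/* pages on *pagelist must already be isolated from the LRU;
	 * migrate_pages() returns the number not migrated (or an errno)
	 * and puts every page back on the LRU or frees it */
	return migrate_pages(pagelist, new_page_on_node,
			     (unsigned long)target_node);
}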
368 | cond_resched(); | 720 | #ifdef CONFIG_NUMA |
721 | /* | ||
722 | * Move a list of individual pages | ||
723 | */ | ||
724 | struct page_to_node { | ||
725 | unsigned long addr; | ||
726 | struct page *page; | ||
727 | int node; | ||
728 | int status; | ||
729 | }; | ||
369 | 730 | ||
370 | rc = 0; | 731 | static struct page *new_page_node(struct page *p, unsigned long private, |
371 | if (page_count(page) == 1) | 732 | int **result) |
372 | /* page was freed from under us. So we are done. */ | 733 | { |
373 | goto next; | 734 | struct page_to_node *pm = (struct page_to_node *)private; |
374 | 735 | ||
375 | if (to && list_empty(to)) | 736 | while (pm->node != MAX_NUMNODES && pm->page != p) |
376 | break; | 737 | pm++; |
377 | 738 | ||
378 | /* | 739 | if (pm->node == MAX_NUMNODES) |
379 | * Skip locked pages during the first two passes to give the | 740 | return NULL; |
380 | * functions holding the lock time to release the page. Later we | ||
381 | * use lock_page() to have a higher chance of acquiring the | ||
382 | * lock. | ||
383 | */ | ||
384 | rc = -EAGAIN; | ||
385 | if (pass > 2) | ||
386 | lock_page(page); | ||
387 | else | ||
388 | if (TestSetPageLocked(page)) | ||
389 | goto next; | ||
390 | 741 | ||
391 | /* | 742 | *result = &pm->status; |
392 | * Only wait on writeback if we have already done a pass where | ||
393 | * we may have triggered writeouts for lots of pages. | ||
394 | */ | ||
395 | if (pass > 0) { | ||
396 | wait_on_page_writeback(page); | ||
397 | } else { | ||
398 | if (PageWriteback(page)) | ||
399 | goto unlock_page; | ||
400 | } | ||
401 | 743 | ||
402 | /* | 744 | return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); |
403 | * Anonymous pages must have swap cache references otherwise | 745 | } |
404 | * the information contained in the page maps cannot be | ||
405 | * preserved. | ||
406 | */ | ||
407 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
408 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
409 | rc = -ENOMEM; | ||
410 | goto unlock_page; | ||
411 | } | ||
412 | } | ||
413 | 746 | ||
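The page_to_node array that new_page_node() walks is built by the caller with one entry per page and a terminating entry whose node is MAX_NUMNODES, as sys_move_pages() does further down. A small sketch of that convention (addresses invented for illustration):

	struct page_to_node pm[3];

	pm[0].addr = 0x400000;	pm[0].node = 1;	/* move this page to node 1 */
	pm[1].addr = 0x401000;	pm[1].node = 0;	/* and this one to node 0 */
	pm[2].node = MAX_NUMNODES;		/* end marker, no page */

	/* do_move_pages() below fills in .page, isolates each page and
	 * records the per-page outcome in .status */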
414 | if (!to) { | 747 | /* |
415 | rc = swap_page(page); | 748 | * Move a set of pages as indicated in the pm array. The addr |
416 | goto next; | 749 | * field must be set to the virtual address of the page to be moved |
417 | } | 750 | * and the node number must contain a valid target node. |
751 | */ | ||
752 | static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, | ||
753 | int migrate_all) | ||
754 | { | ||
755 | int err; | ||
756 | struct page_to_node *pp; | ||
757 | LIST_HEAD(pagelist); | ||
758 | |||
759 | down_read(&mm->mmap_sem); | ||
418 | 760 | ||
419 | newpage = lru_to_page(to); | 761 | /* |
420 | lock_page(newpage); | 762 | * Build a list of pages to migrate |
763 | */ | ||
764 | migrate_prep(); | ||
765 | for (pp = pm; pp->node != MAX_NUMNODES; pp++) { | ||
766 | struct vm_area_struct *vma; | ||
767 | struct page *page; | ||
421 | 768 | ||
422 | /* | 769 | /* |
423 | * Pages are properly locked and writeback is complete. | 770 | * A valid page pointer that will not match any of the |
424 | * Try to migrate the page. | 771 | * pages that will be moved. |
425 | */ | 772 | */ |
426 | mapping = page_mapping(page); | 773 | pp->page = ZERO_PAGE(0); |
427 | if (!mapping) | ||
428 | goto unlock_both; | ||
429 | 774 | ||
430 | if (mapping->a_ops->migratepage) { | 775 | err = -EFAULT; |
431 | /* | 776 | vma = find_vma(mm, pp->addr); |
432 | * Most pages have a mapping and most filesystems | 777 | if (!vma) |
433 | * should provide a migration function. Anonymous | 778 | goto set_status; |
434 | * pages are part of swap space which also has its | ||
435 | * own migration function. This is the most common | ||
436 | * path for page migration. | ||
437 | */ | ||
438 | rc = mapping->a_ops->migratepage(newpage, page); | ||
439 | goto unlock_both; | ||
440 | } | ||
441 | |||
442 | /* Make sure the dirty bit is up to date */ | ||
443 | if (try_to_unmap(page, 1) == SWAP_FAIL) { | ||
444 | rc = -EPERM; | ||
445 | goto unlock_both; | ||
446 | } | ||
447 | 779 | ||
448 | if (page_mapcount(page)) { | 780 | page = follow_page(vma, pp->addr, FOLL_GET); |
449 | rc = -EAGAIN; | 781 | err = -ENOENT; |
450 | goto unlock_both; | 782 | if (!page) |
451 | } | 783 | goto set_status; |
452 | 784 | ||
453 | /* | 785 | if (PageReserved(page)) /* Check for zero page */ |
454 | * Default handling if a filesystem does not provide | 786 | goto put_and_set; |
455 | * a migration function. We can only migrate clean | ||
456 | * pages so try to write out any dirty pages first. | ||
457 | */ | ||
458 | if (PageDirty(page)) { | ||
459 | switch (pageout(page, mapping)) { | ||
460 | case PAGE_KEEP: | ||
461 | case PAGE_ACTIVATE: | ||
462 | goto unlock_both; | ||
463 | |||
464 | case PAGE_SUCCESS: | ||
465 | unlock_page(newpage); | ||
466 | goto next; | ||
467 | |||
468 | case PAGE_CLEAN: | ||
469 | ; /* try to migrate the page below */ | ||
470 | } | ||
471 | } | ||
472 | 787 | ||
473 | /* | 788 | pp->page = page; |
474 | * Buffers are managed in a filesystem specific way. | 789 | err = page_to_nid(page); |
475 | * We must have no buffers or drop them. | ||
476 | */ | ||
477 | if (!page_has_buffers(page) || | ||
478 | try_to_release_page(page, GFP_KERNEL)) { | ||
479 | rc = migrate_page(newpage, page); | ||
480 | goto unlock_both; | ||
481 | } | ||
482 | 790 | ||
483 | /* | 791 | if (err == pp->node) |
484 | * On early passes with mapped pages simply | ||
485 | * retry. There may be a lock held for some | ||
486 | * buffers that may go away. Later | ||
487 | * swap them out. | ||
488 | */ | ||
489 | if (pass > 4) { | ||
490 | /* | 792 | /* |
491 | * Persistently unable to drop buffers..... As a | 793 | * Node already in the right place |
492 | * measure of last resort we fall back to | ||
493 | * swap_page(). | ||
494 | */ | 794 | */ |
495 | unlock_page(newpage); | 795 | goto put_and_set; |
496 | newpage = NULL; | ||
497 | rc = swap_page(page); | ||
498 | goto next; | ||
499 | } | ||
500 | 796 | ||
501 | unlock_both: | 797 | err = -EACCES; |
502 | unlock_page(newpage); | 798 | if (page_mapcount(page) > 1 && |
503 | 799 | !migrate_all) | |
504 | unlock_page: | 800 | goto put_and_set; |
505 | unlock_page(page); | 801 | |
506 | 802 | err = isolate_lru_page(page, &pagelist); | |
507 | next: | 803 | put_and_set: |
508 | if (rc == -EAGAIN) { | 804 | /* |
509 | retry++; | 805 | * Either remove the duplicate refcount from |
510 | } else if (rc) { | 806 | * isolate_lru_page() or drop the page ref if it was |
511 | /* Permanent failure */ | 807 | * not isolated. |
512 | list_move(&page->lru, failed); | 808 | */ |
513 | nr_failed++; | 809 | put_page(page); |
514 | } else { | 810 | set_status: |
515 | if (newpage) { | 811 | pp->status = err; |
516 | /* Successful migration. Return page to LRU */ | ||
517 | move_to_lru(newpage); | ||
518 | } | ||
519 | list_move(&page->lru, moved); | ||
520 | } | ||
521 | } | 812 | } |
522 | if (retry && pass++ < 10) | ||
523 | goto redo; | ||
524 | 813 | ||
525 | if (!swapwrite) | 814 | if (!list_empty(&pagelist)) |
526 | current->flags &= ~PF_SWAPWRITE; | 815 | err = migrate_pages(&pagelist, new_page_node, |
816 | (unsigned long)pm); | ||
817 | else | ||
818 | err = -ENOENT; | ||
527 | 819 | ||
528 | return nr_failed + retry; | 820 | up_read(&mm->mmap_sem); |
821 | return err; | ||
529 | } | 822 | } |
530 | 823 | ||
531 | /* | 824 | /* |
532 | * Migration function for pages with buffers. This function can only be used | 825 | * Determine the nodes of a list of pages. The addr in the pm array |
533 | * if the underlying filesystem guarantees that no other references to "page" | 826 | * must have been set to the virtual address of which we want to determine |
534 | * exist. | 827 | * the node number. |
535 | */ | 828 | */ |
536 | int buffer_migrate_page(struct page *newpage, struct page *page) | 829 | static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) |
537 | { | 830 | { |
538 | struct address_space *mapping = page->mapping; | 831 | down_read(&mm->mmap_sem); |
539 | struct buffer_head *bh, *head; | 832 | |
540 | int rc; | 833 | for ( ; pm->node != MAX_NUMNODES; pm++) { |
834 | struct vm_area_struct *vma; | ||
835 | struct page *page; | ||
836 | int err; | ||
837 | |||
838 | err = -EFAULT; | ||
839 | vma = find_vma(mm, pm->addr); | ||
840 | if (!vma) | ||
841 | goto set_status; | ||
842 | |||
843 | page = follow_page(vma, pm->addr, 0); | ||
844 | err = -ENOENT; | ||
845 | /* Use PageReserved to check for zero page */ | ||
846 | if (!page || PageReserved(page)) | ||
847 | goto set_status; | ||
848 | |||
849 | err = page_to_nid(page); | ||
850 | set_status: | ||
851 | pm->status = err; | ||
852 | } | ||
541 | 853 | ||
542 | if (!mapping) | 854 | up_read(&mm->mmap_sem); |
543 | return -EAGAIN; | 855 | return 0; |
856 | } | ||
544 | 857 | ||
545 | if (!page_has_buffers(page)) | 858 | /* |
546 | return migrate_page(newpage, page); | 859 | * Move a list of pages in the address space of the currently executing |
860 | * process. | ||
861 | */ | ||
862 | asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | ||
863 | const void __user * __user *pages, | ||
864 | const int __user *nodes, | ||
865 | int __user *status, int flags) | ||
866 | { | ||
867 | int err = 0; | ||
868 | int i; | ||
869 | struct task_struct *task; | ||
870 | nodemask_t task_nodes; | ||
871 | struct mm_struct *mm; | ||
872 | struct page_to_node *pm = NULL; | ||
547 | 873 | ||
548 | head = page_buffers(page); | 874 | /* Check flags */ |
875 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | ||
876 | return -EINVAL; | ||
549 | 877 | ||
550 | rc = migrate_page_remove_references(newpage, page, 3); | 878 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
879 | return -EPERM; | ||
551 | 880 | ||
552 | if (rc) | 881 | /* Find the mm_struct */ |
553 | return rc; | 882 | read_lock(&tasklist_lock); |
883 | task = pid ? find_task_by_pid(pid) : current; | ||
884 | if (!task) { | ||
885 | read_unlock(&tasklist_lock); | ||
886 | return -ESRCH; | ||
887 | } | ||
888 | mm = get_task_mm(task); | ||
889 | read_unlock(&tasklist_lock); | ||
554 | 890 | ||
555 | bh = head; | 891 | if (!mm) |
556 | do { | 892 | return -EINVAL; |
557 | get_bh(bh); | ||
558 | lock_buffer(bh); | ||
559 | bh = bh->b_this_page; | ||
560 | 893 | ||
561 | } while (bh != head); | 894 | /* |
895 | * Check if this process has the right to modify the specified | ||
896 | * process. The right exists if the process has administrative | ||
897 | * capabilities, superuser privileges or the same | ||
898 | * userid as the target process. | ||
899 | */ | ||
900 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
901 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
902 | !capable(CAP_SYS_NICE)) { | ||
903 | err = -EPERM; | ||
904 | goto out2; | ||
905 | } | ||
562 | 906 | ||
563 | ClearPagePrivate(page); | 907 | err = security_task_movememory(task); |
564 | set_page_private(newpage, page_private(page)); | 908 | if (err) |
565 | set_page_private(page, 0); | 909 | goto out2; |
566 | put_page(page); | ||
567 | get_page(newpage); | ||
568 | 910 | ||
569 | bh = head; | ||
570 | do { | ||
571 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
572 | bh = bh->b_this_page; | ||
573 | 911 | ||
574 | } while (bh != head); | 912 | task_nodes = cpuset_mems_allowed(task); |
575 | 913 | ||
576 | SetPagePrivate(newpage); | 914 | /* Limit nr_pages so that the multiplication may not overflow */ |
915 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
916 | err = -E2BIG; | ||
917 | goto out2; | ||
918 | } | ||
577 | 919 | ||
578 | migrate_page_copy(newpage, page); | 920 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); |
921 | if (!pm) { | ||
922 | err = -ENOMEM; | ||
923 | goto out2; | ||
924 | } | ||
579 | 925 | ||
580 | bh = head; | 926 | /* |
581 | do { | 927 | * Get parameters from user space and initialize the pm |
582 | unlock_buffer(bh); | 928 | * array. Return various errors if the user did something wrong. |
583 | put_bh(bh); | 929 | */ |
584 | bh = bh->b_this_page; | 930 | for (i = 0; i < nr_pages; i++) { |
931 | const void *p; | ||
585 | 932 | ||
586 | } while (bh != head); | 933 | err = -EFAULT; |
934 | if (get_user(p, pages + i)) | ||
935 | goto out; | ||
587 | 936 | ||
588 | return 0; | 937 | pm[i].addr = (unsigned long)p; |
589 | } | 938 | if (nodes) { |
590 | EXPORT_SYMBOL(buffer_migrate_page); | 939 | int node; |
591 | 940 | ||
592 | /* | 941 | if (get_user(node, nodes + i)) |
593 | * Migrate the list 'pagelist' of pages to a certain destination. | 942 | goto out; |
594 | * | ||
595 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
596 | * Return the number of pages not migrated or error code | ||
597 | */ | ||
598 | int migrate_pages_to(struct list_head *pagelist, | ||
599 | struct vm_area_struct *vma, int dest) | ||
600 | { | ||
601 | LIST_HEAD(newlist); | ||
602 | LIST_HEAD(moved); | ||
603 | LIST_HEAD(failed); | ||
604 | int err = 0; | ||
605 | unsigned long offset = 0; | ||
606 | int nr_pages; | ||
607 | struct page *page; | ||
608 | struct list_head *p; | ||
609 | 943 | ||
610 | redo: | 944 | err = -ENODEV; |
611 | nr_pages = 0; | 945 | if (!node_online(node)) |
612 | list_for_each(p, pagelist) { | 946 | goto out; |
613 | if (vma) { | ||
614 | /* | ||
615 | * The address passed to alloc_page_vma is used to | ||
616 | * generate the proper interleave behavior. We fake | ||
617 | * the address here by an increasing offset in order | ||
618 | * to get the proper distribution of pages. | ||
619 | * | ||
620 | * No decision has been made as to which page | ||
621 | * a certain old page is moved to so we cannot | ||
622 | * specify the correct address. | ||
623 | */ | ||
624 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
625 | offset + vma->vm_start); | ||
626 | offset += PAGE_SIZE; | ||
627 | } | ||
628 | else | ||
629 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
630 | 947 | ||
631 | if (!page) { | 948 | err = -EACCES; |
632 | err = -ENOMEM; | 949 | if (!node_isset(node, task_nodes)) |
633 | goto out; | 950 | goto out; |
951 | |||
952 | pm[i].node = node; | ||
634 | } | 953 | } |
635 | list_add_tail(&page->lru, &newlist); | ||
636 | nr_pages++; | ||
637 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
638 | break; | ||
639 | } | 954 | } |
640 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | 955 | /* End marker */ |
956 | pm[nr_pages].node = MAX_NUMNODES; | ||
957 | |||
958 | if (nodes) | ||
959 | err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
960 | else | ||
961 | err = do_pages_stat(mm, pm); | ||
641 | 962 | ||
642 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | 963 | if (err >= 0) |
964 | /* Return status information */ | ||
965 | for (i = 0; i < nr_pages; i++) | ||
966 | if (put_user(pm[i].status, status + i)) | ||
967 | err = -EFAULT; | ||
643 | 968 | ||
644 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
645 | goto redo; | ||
646 | out: | 969 | out: |
647 | /* Return leftover allocated pages */ | 970 | vfree(pm); |
648 | while (!list_empty(&newlist)) { | 971 | out2: |
649 | page = list_entry(newlist.next, struct page, lru); | 972 | mmput(mm); |
650 | list_del(&page->lru); | 973 | return err; |
651 | __free_page(page); | 974 | } |
652 | } | 975 | #endif |
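From user space the new syscall can be exercised directly through syscall(2); a hedged sketch, assuming the architecture already wires up __NR_move_pages and that a header defining MPOL_MF_MOVE (for example numaif.h from libnuma) is available:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <numaif.h>		/* MPOL_MF_MOVE */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = malloc(2 * psz);
	void *pages[2] = { buf, buf + psz };
	int nodes[2] = { 1, 1 };	/* requested target nodes */
	int status[2] = { -1, -1 };

	buf[0] = buf[psz] = 1;		/* make sure the pages exist */

	/* pid 0 selects the calling process, as in sys_move_pages() above */
	if (syscall(__NR_move_pages, 0, 2UL, pages, nodes, status,
		    MPOL_MF_MOVE) < 0)
		perror("move_pages");

	/* status[i] ends up holding the node each page landed on, or a
	 * negative error; passing nodes == NULL instead only queries
	 * placement via do_pages_stat() */
	printf("page 0: %d, page 1: %d\n", status[0], status[1]);
	return 0;
}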
653 | list_splice(&failed, pagelist); | 976 | |
654 | if (err < 0) | 977 | /* |
655 | return err; | 978 | * Call migration functions in the vma_ops that may prepare |
656 | 979 | * memory in a vm for migration. migration functions may perform | |
657 | /* Calculate number of leftover pages */ | 980 | * the migration for vmas that do not have an underlying page struct. |
658 | nr_pages = 0; | 981 | */ |
659 | list_for_each(p, pagelist) | 982 | int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, |
660 | nr_pages++; | 983 | const nodemask_t *from, unsigned long flags) |
661 | return nr_pages; | 984 | { |
985 | struct vm_area_struct *vma; | ||
986 | int err = 0; | ||
987 | |||
988 | for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { | ||
989 | if (vma->vm_ops && vma->vm_ops->migrate) { | ||
990 | err = vma->vm_ops->migrate(vma, to, from, flags); | ||
991 | if (err) | ||
992 | break; | ||
993 | } | ||
994 | } | ||
995 | return err; | ||
662 | } | 996 | } |
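migrate_vmas() lets a driver whose mappings have no backing page structs take part in migration through a vm_ops hook. A sketch of what registering such a hook might look like, with the signature taken from the call above (the example names and the empty body are invented):

static int example_vma_migrate(struct vm_area_struct *vma,
			       const nodemask_t *to, const nodemask_t *from,
			       unsigned long flags)
{
	/* driver-specific work: rebind or re-register the underlying
	 * memory so that it is served from the nodes in "to" */
	return 0;
}

static struct vm_operations_struct example_vm_ops = {
	.migrate = example_vma_migrate,
	/* .open/.close/.nopage as the driver already defines them */
};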
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -96,7 +96,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
96 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 96 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
97 | unsigned long n; | 97 | unsigned long n; |
98 | 98 | ||
99 | free = get_page_cache_size(); | 99 | free = global_page_state(NR_FILE_PAGES); |
100 | free += nr_swap_pages; | 100 | free += nr_swap_pages; |
101 | 101 | ||
102 | /* | 102 | /* |
@@ -1065,7 +1065,8 @@ munmap_back: | |||
1065 | vma->vm_start = addr; | 1065 | vma->vm_start = addr; |
1066 | vma->vm_end = addr + len; | 1066 | vma->vm_end = addr + len; |
1067 | vma->vm_flags = vm_flags; | 1067 | vma->vm_flags = vm_flags; |
1068 | vma->vm_page_prot = protection_map[vm_flags & 0x0f]; | 1068 | vma->vm_page_prot = protection_map[vm_flags & |
1069 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
1069 | vma->vm_pgoff = pgoff; | 1070 | vma->vm_pgoff = pgoff; |
1070 | 1071 | ||
1071 | if (file) { | 1072 | if (file) { |
@@ -1089,6 +1090,12 @@ munmap_back: | |||
1089 | goto free_vma; | 1090 | goto free_vma; |
1090 | } | 1091 | } |
1091 | 1092 | ||
1093 | /* Don't make the VMA automatically writable if it's shared, but the | ||
1094 | * backer wishes to know when pages are first written to */ | ||
1095 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
1096 | vma->vm_page_prot = | ||
1097 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
1098 | |||
1092 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | 1099 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform |
1093 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | 1100 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) |
1094 | * that memory reservation must be checked; but that reservation | 1101 | * that memory reservation must be checked; but that reservation |
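Both protection_map changes in this file only matter to mappings whose backer defines page_mkwrite: leaving VM_SHARED out of the lookup keeps the default pte protection read-only, so the first store faults and the backer gets notified. A sketch of the hook a filesystem might register, assuming a page_mkwrite(vma, page) prototype (names and body are illustrative, not from this patch):

static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	/* runs on the first write fault to a page of this shared mapping;
	 * a filesystem would typically reserve blocks or set up dirty
	 * accounting for the page here */
	return 0;	/* a non-negative return lets the write proceed */
}

static struct vm_operations_struct example_file_vm_ops = {
	.nopage		= filemap_nopage,
	.page_mkwrite	= example_page_mkwrite,
};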
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1921 | vma->vm_end = addr + len; | 1928 | vma->vm_end = addr + len; |
1922 | vma->vm_pgoff = pgoff; | 1929 | vma->vm_pgoff = pgoff; |
1923 | vma->vm_flags = flags; | 1930 | vma->vm_flags = flags; |
1924 | vma->vm_page_prot = protection_map[flags & 0x0f]; | 1931 | vma->vm_page_prot = protection_map[flags & |
1932 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
1925 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1933 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1926 | out: | 1934 | out: |
1927 | mm->total_vm += len >> PAGE_SHIFT; | 1935 | mm->total_vm += len >> PAGE_SHIFT; |
diff --git a/mm/mmzone.c b/mm/mmzone.c index b022370e612e..0959ee1a4795 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -5,7 +5,6 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | 7 | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/stddef.h> | 8 | #include <linux/stddef.h> |
10 | #include <linux/mmzone.h> | 9 | #include <linux/mmzone.h> |
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c14d4289b61..638edabaff71 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -19,7 +19,8 @@ | |||
19 | #include <linux/mempolicy.h> | 19 | #include <linux/mempolicy.h> |
20 | #include <linux/personality.h> | 20 | #include <linux/personality.h> |
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | 22 | #include <linux/swap.h> | |
23 | #include <linux/swapops.h> | ||
23 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
24 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
25 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
@@ -28,12 +29,13 @@ | |||
28 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, |
29 | unsigned long addr, unsigned long end, pgprot_t newprot) | 30 | unsigned long addr, unsigned long end, pgprot_t newprot) |
30 | { | 31 | { |
31 | pte_t *pte; | 32 | pte_t *pte, oldpte; |
32 | spinlock_t *ptl; | 33 | spinlock_t *ptl; |
33 | 34 | ||
34 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 35 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
35 | do { | 36 | do { |
36 | if (pte_present(*pte)) { | 37 | oldpte = *pte; |
38 | if (pte_present(oldpte)) { | ||
37 | pte_t ptent; | 39 | pte_t ptent; |
38 | 40 | ||
39 | /* Avoid an SMP race with hardware updated dirty/clean | 41 | /* Avoid an SMP race with hardware updated dirty/clean |
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
43 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); | 45 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); |
44 | set_pte_at(mm, addr, pte, ptent); | 46 | set_pte_at(mm, addr, pte, ptent); |
45 | lazy_mmu_prot_update(ptent); | 47 | lazy_mmu_prot_update(ptent); |
48 | #ifdef CONFIG_MIGRATION | ||
49 | } else if (!pte_file(oldpte)) { | ||
50 | swp_entry_t entry = pte_to_swp_entry(oldpte); | ||
51 | |||
52 | if (is_write_migration_entry(entry)) { | ||
53 | /* | ||
54 | * A protection check is difficult so | ||
55 | * just be safe and disable write | ||
56 | */ | ||
57 | make_migration_entry_read(&entry); | ||
58 | set_pte_at(mm, addr, pte, | ||
59 | swp_entry_to_pte(entry)); | ||
60 | } | ||
61 | #endif | ||
46 | } | 62 | } |
63 | |||
47 | } while (pte++, addr += PAGE_SIZE, addr != end); | 64 | } while (pte++, addr += PAGE_SIZE, addr != end); |
48 | pte_unmap_unlock(pte - 1, ptl); | 65 | pte_unmap_unlock(pte - 1, ptl); |
49 | } | 66 | } |
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
106 | unsigned long oldflags = vma->vm_flags; | 123 | unsigned long oldflags = vma->vm_flags; |
107 | long nrpages = (end - start) >> PAGE_SHIFT; | 124 | long nrpages = (end - start) >> PAGE_SHIFT; |
108 | unsigned long charged = 0; | 125 | unsigned long charged = 0; |
126 | unsigned int mask; | ||
109 | pgprot_t newprot; | 127 | pgprot_t newprot; |
110 | pgoff_t pgoff; | 128 | pgoff_t pgoff; |
111 | int error; | 129 | int error; |
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
132 | } | 150 | } |
133 | } | 151 | } |
134 | 152 | ||
135 | newprot = protection_map[newflags & 0xf]; | ||
136 | |||
137 | /* | 153 | /* |
138 | * First try to merge with previous and/or next vma. | 154 | * First try to merge with previous and/or next vma. |
139 | */ | 155 | */ |
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
160 | } | 176 | } |
161 | 177 | ||
162 | success: | 178 | success: |
179 | /* Don't make the VMA automatically writable if it's shared, but the | ||
180 | * backer wishes to know when pages are first written to */ | ||
181 | mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; | ||
182 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
183 | mask &= ~VM_SHARED; | ||
184 | |||
185 | newprot = protection_map[newflags & mask]; | ||
186 | |||
163 | /* | 187 | /* |
164 | * vm_flags and vm_page_prot are protected by the mmap_sem | 188 | * vm_flags and vm_page_prot are protected by the mmap_sem |
165 | * held in write mode. | 189 | * held in write mode. |
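To make the masking concrete (flag values as defined in mm.h at the time: VM_READ = 0x1, VM_WRITE = 0x2, VM_EXEC = 0x4, VM_SHARED = 0x8): a shared read/write mapping indexes protection_map[0xb] when there is no page_mkwrite handler, but only protection_map[0x3] when there is one. That is the private read/write entry, which on most architectures is not hardware-writable, so the first store faults and the handler can run before the pte is made writable.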
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) | |||
205 | /* | 229 | /* |
206 | * Does the application expect PROT_READ to imply PROT_EXEC: | 230 | * Does the application expect PROT_READ to imply PROT_EXEC: |
207 | */ | 231 | */ |
208 | if (unlikely((prot & PROT_READ) && | 232 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
209 | (current->personality & READ_IMPLIES_EXEC))) | ||
210 | prot |= PROT_EXEC; | 233 | prot |= PROT_EXEC; |
211 | 234 | ||
212 | vm_flags = calc_vm_prot_bits(prot); | 235 | vm_flags = calc_vm_prot_bits(prot); |
diff --git a/mm/msync.c b/mm/msync.c index bc6c95376366..d083544df21b 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
170 | * just ignore them, but return -ENOMEM at the end. | 170 | * just ignore them, but return -ENOMEM at the end. |
171 | */ | 171 | */ |
172 | down_read(¤t->mm->mmap_sem); | 172 | down_read(¤t->mm->mmap_sem); |
173 | if (flags & MS_SYNC) | ||
174 | current->flags |= PF_SYNCWRITE; | ||
175 | vma = find_vma(current->mm, start); | 173 | vma = find_vma(current->mm, start); |
176 | if (!vma) { | 174 | if (!vma) { |
177 | error = -ENOMEM; | 175 | error = -ENOMEM; |
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
228 | } | 226 | } |
229 | } while (vma && !done); | 227 | } while (vma && !done); |
230 | out_unlock: | 228 | out_unlock: |
231 | current->flags &= ~PF_SYNCWRITE; | ||
232 | up_read(¤t->mm->mmap_sem); | 229 | up_read(¤t->mm->mmap_sem); |
233 | out: | 230 | out: |
234 | return error; | 231 | return error; |
diff --git a/mm/nommu.c b/mm/nommu.c index 029fadac0fb5..5151c44a8257 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1122,7 +1122,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
1122 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1122 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
1123 | unsigned long n; | 1123 | unsigned long n; |
1124 | 1124 | ||
1125 | free = get_page_cache_size(); | 1125 | free = global_page_state(NR_FILE_PAGES); |
1126 | free += nr_swap_pages; | 1126 | free += nr_swap_pages; |
1127 | 1127 | ||
1128 | /* | 1128 | /* |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 042e6436c3ee..d46ed0f1dc06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -22,10 +22,11 @@ | |||
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | 24 | ||
25 | int sysctl_panic_on_oom; | ||
25 | /* #define DEBUG */ | 26 | /* #define DEBUG */ |
26 | 27 | ||
27 | /** | 28 | /** |
28 | * oom_badness - calculate a numeric value for how bad this task has been | 29 | * badness - calculate a numeric value for how bad this task has been |
29 | * @p: task struct of which task we should calculate | 30 | * @p: task struct of which task we should calculate |
30 | * @uptime: current uptime in seconds | 31 | * @uptime: current uptime in seconds |
31 | * | 32 | * |
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
200 | continue; | 201 | continue; |
201 | 202 | ||
202 | /* | 203 | /* |
203 | * This is in the process of releasing memory so for wait it | 204 | * This is in the process of releasing memory so wait for it |
204 | * to finish before killing some other task by mistake. | 205 | * to finish before killing some other task by mistake. |
205 | */ | 206 | */ |
206 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || | 207 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || |
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
306 | } | 307 | } |
307 | 308 | ||
308 | /** | 309 | /** |
309 | * oom_kill - kill the "best" process when we run out of memory | 310 | * out_of_memory - kill the "best" process when we run out of memory |
310 | * | 311 | * |
311 | * If we run out of memory, we have the choice between either | 312 | * If we run out of memory, we have the choice between either |
312 | * killing a random task (bad), letting the system crash (worse) | 313 | * killing a random task (bad), letting the system crash (worse) |
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
344 | break; | 345 | break; |
345 | 346 | ||
346 | case CONSTRAINT_NONE: | 347 | case CONSTRAINT_NONE: |
348 | if (sysctl_panic_on_oom) | ||
349 | panic("out of memory. panic_on_oom is selected\n"); | ||
347 | retry: | 350 | retry: |
348 | /* | 351 | /* |
349 | * Rambo mode: Shoot down a process and hope it solves whatever | 352 | * Rambo mode: Shoot down a process and hope it solves whatever |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 75d7f48b79bb..e630188ccc40 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -99,22 +99,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
99 | 99 | ||
100 | static void background_writeout(unsigned long _min_pages); | 100 | static void background_writeout(unsigned long _min_pages); |
101 | 101 | ||
102 | struct writeback_state | ||
103 | { | ||
104 | unsigned long nr_dirty; | ||
105 | unsigned long nr_unstable; | ||
106 | unsigned long nr_mapped; | ||
107 | unsigned long nr_writeback; | ||
108 | }; | ||
109 | |||
110 | static void get_writeback_state(struct writeback_state *wbs) | ||
111 | { | ||
112 | wbs->nr_dirty = read_page_state(nr_dirty); | ||
113 | wbs->nr_unstable = read_page_state(nr_unstable); | ||
114 | wbs->nr_mapped = read_page_state(nr_mapped); | ||
115 | wbs->nr_writeback = read_page_state(nr_writeback); | ||
116 | } | ||
117 | |||
118 | /* | 102 | /* |
119 | * Work out the current dirty-memory clamping and background writeout | 103 | * Work out the current dirty-memory clamping and background writeout |
120 | * thresholds. | 104 | * thresholds. |
@@ -133,8 +117,8 @@ static void get_writeback_state(struct writeback_state *wbs) | |||
133 | * clamping level. | 117 | * clamping level. |
134 | */ | 118 | */ |
135 | static void | 119 | static void |
136 | get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | 120 | get_dirty_limits(long *pbackground, long *pdirty, |
137 | struct address_space *mapping) | 121 | struct address_space *mapping) |
138 | { | 122 | { |
139 | int background_ratio; /* Percentages */ | 123 | int background_ratio; /* Percentages */ |
140 | int dirty_ratio; | 124 | int dirty_ratio; |
@@ -144,8 +128,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
144 | unsigned long available_memory = total_pages; | 128 | unsigned long available_memory = total_pages; |
145 | struct task_struct *tsk; | 129 | struct task_struct *tsk; |
146 | 130 | ||
147 | get_writeback_state(wbs); | ||
148 | |||
149 | #ifdef CONFIG_HIGHMEM | 131 | #ifdef CONFIG_HIGHMEM |
150 | /* | 132 | /* |
151 | * If this mapping can only allocate from low memory, | 133 | * If this mapping can only allocate from low memory, |
@@ -156,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
156 | #endif | 138 | #endif |
157 | 139 | ||
158 | 140 | ||
159 | unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; | 141 | unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + |
142 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
143 | total_pages; | ||
160 | 144 | ||
161 | dirty_ratio = vm_dirty_ratio; | 145 | dirty_ratio = vm_dirty_ratio; |
162 | if (dirty_ratio > unmapped_ratio / 2) | 146 | if (dirty_ratio > unmapped_ratio / 2) |
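A quick worked example of the new expression (numbers invented): with total_pages = 1,000,000 and NR_FILE_MAPPED + NR_ANON_PAGES summing to 600,000, unmapped_ratio is 100 - 60 = 40, so the clamp that follows caps the effective dirty_ratio at 40 / 2 = 20 even if vm_dirty_ratio is configured higher.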
@@ -189,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
189 | */ | 173 | */ |
190 | static void balance_dirty_pages(struct address_space *mapping) | 174 | static void balance_dirty_pages(struct address_space *mapping) |
191 | { | 175 | { |
192 | struct writeback_state wbs; | ||
193 | long nr_reclaimable; | 176 | long nr_reclaimable; |
194 | long background_thresh; | 177 | long background_thresh; |
195 | long dirty_thresh; | 178 | long dirty_thresh; |
@@ -204,13 +187,15 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
204 | .sync_mode = WB_SYNC_NONE, | 187 | .sync_mode = WB_SYNC_NONE, |
205 | .older_than_this = NULL, | 188 | .older_than_this = NULL, |
206 | .nr_to_write = write_chunk, | 189 | .nr_to_write = write_chunk, |
190 | .range_cyclic = 1, | ||
207 | }; | 191 | }; |
208 | 192 | ||
209 | get_dirty_limits(&wbs, &background_thresh, | 193 | get_dirty_limits(&background_thresh, &dirty_thresh, mapping); |
210 | &dirty_thresh, mapping); | 194 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
211 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | 195 | global_page_state(NR_UNSTABLE_NFS); |
212 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 196 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= |
213 | break; | 197 | dirty_thresh) |
198 | break; | ||
214 | 199 | ||
215 | if (!dirty_exceeded) | 200 | if (!dirty_exceeded) |
216 | dirty_exceeded = 1; | 201 | dirty_exceeded = 1; |
@@ -223,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
223 | */ | 208 | */ |
224 | if (nr_reclaimable) { | 209 | if (nr_reclaimable) { |
225 | writeback_inodes(&wbc); | 210 | writeback_inodes(&wbc); |
226 | get_dirty_limits(&wbs, &background_thresh, | 211 | get_dirty_limits(&background_thresh, |
227 | &dirty_thresh, mapping); | 212 | &dirty_thresh, mapping); |
228 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | 213 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
229 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 214 | global_page_state(NR_UNSTABLE_NFS); |
230 | break; | 215 | if (nr_reclaimable + |
216 | global_page_state(NR_WRITEBACK) | ||
217 | <= dirty_thresh) | ||
218 | break; | ||
231 | pages_written += write_chunk - wbc.nr_to_write; | 219 | pages_written += write_chunk - wbc.nr_to_write; |
232 | if (pages_written >= write_chunk) | 220 | if (pages_written >= write_chunk) |
233 | break; /* We've done our duty */ | 221 | break; /* We've done our duty */ |
@@ -235,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
235 | blk_congestion_wait(WRITE, HZ/10); | 223 | blk_congestion_wait(WRITE, HZ/10); |
236 | } | 224 | } |
237 | 225 | ||
238 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) | 226 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) |
239 | dirty_exceeded = 0; | 227 | <= dirty_thresh && dirty_exceeded) |
228 | dirty_exceeded = 0; | ||
240 | 229 | ||
241 | if (writeback_in_progress(bdi)) | 230 | if (writeback_in_progress(bdi)) |
242 | return; /* pdflush is already working this queue */ | 231 | return; /* pdflush is already working this queue */ |
@@ -298,12 +287,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | |||
298 | 287 | ||
299 | void throttle_vm_writeout(void) | 288 | void throttle_vm_writeout(void) |
300 | { | 289 | { |
301 | struct writeback_state wbs; | ||
302 | long background_thresh; | 290 | long background_thresh; |
303 | long dirty_thresh; | 291 | long dirty_thresh; |
304 | 292 | ||
305 | for ( ; ; ) { | 293 | for ( ; ; ) { |
306 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | 294 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); |
307 | 295 | ||
308 | /* | 296 | /* |
309 | * Boost the allowable dirty threshold a bit for page | 297 | * Boost the allowable dirty threshold a bit for page |
@@ -311,8 +299,9 @@ void throttle_vm_writeout(void) | |||
311 | */ | 299 | */ |
312 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | 300 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ |
313 | 301 | ||
314 | if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) | 302 | if (global_page_state(NR_UNSTABLE_NFS) + |
315 | break; | 303 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
304 | break; | ||
316 | blk_congestion_wait(WRITE, HZ/10); | 305 | blk_congestion_wait(WRITE, HZ/10); |
317 | } | 306 | } |
318 | } | 307 | } |
@@ -331,15 +320,16 @@ static void background_writeout(unsigned long _min_pages) | |||
331 | .older_than_this = NULL, | 320 | .older_than_this = NULL, |
332 | .nr_to_write = 0, | 321 | .nr_to_write = 0, |
333 | .nonblocking = 1, | 322 | .nonblocking = 1, |
323 | .range_cyclic = 1, | ||
334 | }; | 324 | }; |
335 | 325 | ||
336 | for ( ; ; ) { | 326 | for ( ; ; ) { |
337 | struct writeback_state wbs; | ||
338 | long background_thresh; | 327 | long background_thresh; |
339 | long dirty_thresh; | 328 | long dirty_thresh; |
340 | 329 | ||
341 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | 330 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); |
342 | if (wbs.nr_dirty + wbs.nr_unstable < background_thresh | 331 | if (global_page_state(NR_FILE_DIRTY) + |
332 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | ||
343 | && min_pages <= 0) | 333 | && min_pages <= 0) |
344 | break; | 334 | break; |
345 | wbc.encountered_congestion = 0; | 335 | wbc.encountered_congestion = 0; |
@@ -363,12 +353,9 @@ static void background_writeout(unsigned long _min_pages) | |||
363 | */ | 353 | */ |
364 | int wakeup_pdflush(long nr_pages) | 354 | int wakeup_pdflush(long nr_pages) |
365 | { | 355 | { |
366 | if (nr_pages == 0) { | 356 | if (nr_pages == 0) |
367 | struct writeback_state wbs; | 357 | nr_pages = global_page_state(NR_FILE_DIRTY) + |
368 | 358 | global_page_state(NR_UNSTABLE_NFS); | |
369 | get_writeback_state(&wbs); | ||
370 | nr_pages = wbs.nr_dirty + wbs.nr_unstable; | ||
371 | } | ||
372 | return pdflush_operation(background_writeout, nr_pages); | 359 | return pdflush_operation(background_writeout, nr_pages); |
373 | } | 360 | } |
374 | 361 | ||
@@ -399,7 +386,6 @@ static void wb_kupdate(unsigned long arg) | |||
399 | unsigned long start_jif; | 386 | unsigned long start_jif; |
400 | unsigned long next_jif; | 387 | unsigned long next_jif; |
401 | long nr_to_write; | 388 | long nr_to_write; |
402 | struct writeback_state wbs; | ||
403 | struct writeback_control wbc = { | 389 | struct writeback_control wbc = { |
404 | .bdi = NULL, | 390 | .bdi = NULL, |
405 | .sync_mode = WB_SYNC_NONE, | 391 | .sync_mode = WB_SYNC_NONE, |
@@ -407,15 +393,16 @@ static void wb_kupdate(unsigned long arg) | |||
407 | .nr_to_write = 0, | 393 | .nr_to_write = 0, |
408 | .nonblocking = 1, | 394 | .nonblocking = 1, |
409 | .for_kupdate = 1, | 395 | .for_kupdate = 1, |
396 | .range_cyclic = 1, | ||
410 | }; | 397 | }; |
411 | 398 | ||
412 | sync_supers(); | 399 | sync_supers(); |
413 | 400 | ||
414 | get_writeback_state(&wbs); | ||
415 | oldest_jif = jiffies - dirty_expire_interval; | 401 | oldest_jif = jiffies - dirty_expire_interval; |
416 | start_jif = jiffies; | 402 | start_jif = jiffies; |
417 | next_jif = start_jif + dirty_writeback_interval; | 403 | next_jif = start_jif + dirty_writeback_interval; |
418 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + | 404 | nr_to_write = global_page_state(NR_FILE_DIRTY) + |
405 | global_page_state(NR_UNSTABLE_NFS) + | ||
419 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 406 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
420 | while (nr_to_write > 0) { | 407 | while (nr_to_write > 0) { |
421 | wbc.encountered_congestion = 0; | 408 | wbc.encountered_congestion = 0; |
@@ -513,14 +500,14 @@ static void set_ratelimit(void) | |||
513 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | 500 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; |
514 | } | 501 | } |
515 | 502 | ||
516 | static int | 503 | static int __cpuinit |
517 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 504 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) |
518 | { | 505 | { |
519 | set_ratelimit(); | 506 | set_ratelimit(); |
520 | return 0; | 507 | return 0; |
521 | } | 508 | } |
522 | 509 | ||
523 | static struct notifier_block ratelimit_nb = { | 510 | static struct notifier_block __cpuinitdata ratelimit_nb = { |
524 | .notifier_call = ratelimit_handler, | 511 | .notifier_call = ratelimit_handler, |
525 | .next = NULL, | 512 | .next = NULL, |
526 | }; | 513 | }; |
@@ -637,7 +624,8 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
637 | if (mapping2) { /* Race with truncate? */ | 624 | if (mapping2) { /* Race with truncate? */ |
638 | BUG_ON(mapping2 != mapping); | 625 | BUG_ON(mapping2 != mapping); |
639 | if (mapping_cap_account_dirty(mapping)) | 626 | if (mapping_cap_account_dirty(mapping)) |
640 | inc_page_state(nr_dirty); | 627 | __inc_zone_page_state(page, |
628 | NR_FILE_DIRTY); | ||
641 | radix_tree_tag_set(&mapping->page_tree, | 629 | radix_tree_tag_set(&mapping->page_tree, |
642 | page_index(page), PAGECACHE_TAG_DIRTY); | 630 | page_index(page), PAGECACHE_TAG_DIRTY); |
643 | } | 631 | } |
@@ -724,9 +712,9 @@ int test_clear_page_dirty(struct page *page) | |||
724 | radix_tree_tag_clear(&mapping->page_tree, | 712 | radix_tree_tag_clear(&mapping->page_tree, |
725 | page_index(page), | 713 | page_index(page), |
726 | PAGECACHE_TAG_DIRTY); | 714 | PAGECACHE_TAG_DIRTY); |
727 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
728 | if (mapping_cap_account_dirty(mapping)) | 715 | if (mapping_cap_account_dirty(mapping)) |
729 | dec_page_state(nr_dirty); | 716 | __dec_zone_page_state(page, NR_FILE_DIRTY); |
717 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
730 | return 1; | 718 | return 1; |
731 | } | 719 | } |
732 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 720 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
@@ -757,7 +745,7 @@ int clear_page_dirty_for_io(struct page *page) | |||
757 | if (mapping) { | 745 | if (mapping) { |
758 | if (TestClearPageDirty(page)) { | 746 | if (TestClearPageDirty(page)) { |
759 | if (mapping_cap_account_dirty(mapping)) | 747 | if (mapping_cap_account_dirty(mapping)) |
760 | dec_page_state(nr_dirty); | 748 | dec_zone_page_state(page, NR_FILE_DIRTY); |
761 | return 1; | 749 | return 1; |
762 | } | 750 | } |
763 | return 0; | 751 | return 0; |
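The conversions in this file all follow the same pattern: the old global page_state fields become per-zone counters. A minimal sketch of the calls as they are used here, assuming the vmstat interface of this tree (the double-underscore forms expect the caller to have interrupts already disabled, as under tree_lock above; the plain forms are the irq-safe variants):

	unsigned long nr_dirty;

	__inc_zone_page_state(page, NR_FILE_DIRTY);	/* irqs already off */
	dec_zone_page_state(page, NR_FILE_DIRTY);	/* disables irqs itself */

	/* system-wide totals replace read_page_state()/get_writeback_state() */
	nr_dirty = global_page_state(NR_FILE_DIRTY) +
		   global_page_state(NR_UNSTABLE_NFS);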
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 253a450c400d..3e792a583f3b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -14,7 +14,6 @@ | |||
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
19 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
20 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
@@ -37,6 +36,7 @@ | |||
37 | #include <linux/nodemask.h> | 36 | #include <linux/nodemask.h> |
38 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
39 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
39 | #include <linux/stop_machine.h> | ||
40 | 40 | ||
41 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 42 | #include <asm/div64.h> |
@@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table); | |||
83 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; | 83 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; |
84 | int min_free_kbytes = 1024; | 84 | int min_free_kbytes = 1024; |
85 | 85 | ||
86 | unsigned long __initdata nr_kernel_pages; | 86 | unsigned long __meminitdata nr_kernel_pages; |
87 | unsigned long __initdata nr_all_pages; | 87 | unsigned long __meminitdata nr_all_pages; |
88 | 88 | ||
89 | #ifdef CONFIG_DEBUG_VM | 89 | #ifdef CONFIG_DEBUG_VM |
90 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 90 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
@@ -265,7 +265,7 @@ static inline void rmv_page_order(struct page *page) | |||
265 | * satisfies the following equation: | 265 | * satisfies the following equation: |
266 | * P = B & ~(1 << O) | 266 | * P = B & ~(1 << O) |
267 | * | 267 | * |
268 | * Assumption: *_mem_map is contigious at least up to MAX_ORDER | 268 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
269 | */ | 269 | */ |
270 | static inline struct page * | 270 | static inline struct page * |
271 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 271 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) |
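Worked example of the relation quoted above: for a buddy at index B = 12 (binary 1100) at order O = 2, the combined page starts at P = 12 & ~(1 << 2) = 8; its order-2 buddy at index 8 maps to the same P, which is what lets __free_one_page() coalesce the pair.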
@@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
286 | * we can do coalesce a page and its buddy if | 286 | * we can do coalesce a page and its buddy if |
287 | * (a) the buddy is not in a hole && | 287 | * (a) the buddy is not in a hole && |
288 | * (b) the buddy is in the buddy system && | 288 | * (b) the buddy is in the buddy system && |
289 | * (c) a page and its buddy have the same order. | 289 | * (c) a page and its buddy have the same order && |
290 | * (d) a page and its buddy are in the same zone. | ||
290 | * | 291 | * |
291 | * For recording whether a page is in the buddy system, we use PG_buddy. | 292 | * For recording whether a page is in the buddy system, we use PG_buddy. |
292 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 293 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. |
293 | * | 294 | * |
294 | * For recording page's order, we use page_private(page). | 295 | * For recording page's order, we use page_private(page). |
295 | */ | 296 | */ |
296 | static inline int page_is_buddy(struct page *page, int order) | 297 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
298 | int order) | ||
297 | { | 299 | { |
298 | #ifdef CONFIG_HOLES_IN_ZONE | 300 | #ifdef CONFIG_HOLES_IN_ZONE |
299 | if (!pfn_valid(page_to_pfn(page))) | 301 | if (!pfn_valid(page_to_pfn(buddy))) |
300 | return 0; | 302 | return 0; |
301 | #endif | 303 | #endif |
302 | 304 | ||
303 | if (PageBuddy(page) && page_order(page) == order) { | 305 | if (page_zone_id(page) != page_zone_id(buddy)) |
304 | BUG_ON(page_count(page) != 0); | 306 | return 0; |
307 | |||
308 | if (PageBuddy(buddy) && page_order(buddy) == order) { | ||
309 | BUG_ON(page_count(buddy) != 0); | ||
305 | return 1; | 310 | return 1; |
306 | } | 311 | } |
307 | return 0; | 312 | return 0; |
@@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page, | |||
352 | struct page *buddy; | 357 | struct page *buddy; |
353 | 358 | ||
354 | buddy = __page_find_buddy(page, page_idx, order); | 359 | buddy = __page_find_buddy(page, page_idx, order); |
355 | if (!page_is_buddy(buddy, order)) | 360 | if (!page_is_buddy(page, buddy, order)) |
356 | break; /* Move the buddy up one level. */ | 361 | break; /* Move the buddy up one level. */ |
357 | 362 | ||
358 | list_del(&buddy->lru); | 363 | list_del(&buddy->lru); |
@@ -440,8 +445,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
440 | 445 | ||
441 | arch_free_page(page, order); | 446 | arch_free_page(page, order); |
442 | if (!PageHighMem(page)) | 447 | if (!PageHighMem(page)) |
443 | mutex_debug_check_no_locks_freed(page_address(page), | 448 | debug_check_no_locks_freed(page_address(page), |
444 | PAGE_SIZE<<order); | 449 | PAGE_SIZE<<order); |
445 | 450 | ||
446 | for (i = 0 ; i < (1 << order) ; ++i) | 451 | for (i = 0 ; i < (1 << order) ; ++i) |
447 | reserved += free_pages_check(page + i); | 452 | reserved += free_pages_check(page + i); |
@@ -450,7 +455,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
450 | 455 | ||
451 | kernel_map_pages(page, 1 << order, 0); | 456 | kernel_map_pages(page, 1 << order, 0); |
452 | local_irq_save(flags); | 457 | local_irq_save(flags); |
453 | __mod_page_state(pgfree, 1 << order); | 458 | __count_vm_events(PGFREE, 1 << order); |
454 | free_one_page(page_zone(page), page, order); | 459 | free_one_page(page_zone(page), page, order); |
455 | local_irq_restore(flags); | 460 | local_irq_restore(flags); |
456 | } | 461 | } |
@@ -703,27 +708,6 @@ void drain_local_pages(void) | |||
703 | } | 708 | } |
704 | #endif /* CONFIG_PM */ | 709 | #endif /* CONFIG_PM */ |
705 | 710 | ||
706 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) | ||
707 | { | ||
708 | #ifdef CONFIG_NUMA | ||
709 | pg_data_t *pg = z->zone_pgdat; | ||
710 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | ||
711 | struct per_cpu_pageset *p; | ||
712 | |||
713 | p = zone_pcp(z, cpu); | ||
714 | if (pg == orig) { | ||
715 | p->numa_hit++; | ||
716 | } else { | ||
717 | p->numa_miss++; | ||
718 | zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; | ||
719 | } | ||
720 | if (pg == NODE_DATA(numa_node_id())) | ||
721 | p->local_node++; | ||
722 | else | ||
723 | p->other_node++; | ||
724 | #endif | ||
725 | } | ||
726 | |||
727 | /* | 711 | /* |
728 | * Free a 0-order page | 712 | * Free a 0-order page |
729 | */ | 713 | */ |
@@ -744,7 +728,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
744 | 728 | ||
745 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 729 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
746 | local_irq_save(flags); | 730 | local_irq_save(flags); |
747 | __inc_page_state(pgfree); | 731 | __count_vm_event(PGFREE); |
748 | list_add(&page->lru, &pcp->list); | 732 | list_add(&page->lru, &pcp->list); |
749 | pcp->count++; | 733 | pcp->count++; |
750 | if (pcp->count >= pcp->high) { | 734 | if (pcp->count >= pcp->high) { |
@@ -820,8 +804,8 @@ again: | |||
820 | goto failed; | 804 | goto failed; |
821 | } | 805 | } |
822 | 806 | ||
823 | __mod_page_state_zone(zone, pgalloc, 1 << order); | 807 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
824 | zone_statistics(zonelist, zone, cpu); | 808 | zone_statistics(zonelist, zone); |
825 | local_irq_restore(flags); | 809 | local_irq_restore(flags); |
826 | put_cpu(); | 810 | put_cpu(); |
827 | 811 | ||
@@ -951,8 +935,7 @@ restart: | |||
951 | goto got_pg; | 935 | goto got_pg; |
952 | 936 | ||
953 | do { | 937 | do { |
954 | if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) | 938 | wakeup_kswapd(*z, order); |
955 | wakeup_kswapd(*z, order); | ||
956 | } while (*(++z)); | 939 | } while (*(++z)); |
957 | 940 | ||
958 | /* | 941 | /* |
@@ -1226,141 +1209,6 @@ static void show_node(struct zone *zone) | |||
1226 | #define show_node(zone) do { } while (0) | 1209 | #define show_node(zone) do { } while (0) |
1227 | #endif | 1210 | #endif |
1228 | 1211 | ||
1229 | /* | ||
1230 | * Accumulate the page_state information across all CPUs. | ||
1231 | * The result is unavoidably approximate - it can change | ||
1232 | * during and after execution of this function. | ||
1233 | */ | ||
1234 | static DEFINE_PER_CPU(struct page_state, page_states) = {0}; | ||
1235 | |||
1236 | atomic_t nr_pagecache = ATOMIC_INIT(0); | ||
1237 | EXPORT_SYMBOL(nr_pagecache); | ||
1238 | #ifdef CONFIG_SMP | ||
1239 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | ||
1240 | #endif | ||
1241 | |||
1242 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | ||
1243 | { | ||
1244 | unsigned cpu; | ||
1245 | |||
1246 | memset(ret, 0, nr * sizeof(unsigned long)); | ||
1247 | cpus_and(*cpumask, *cpumask, cpu_online_map); | ||
1248 | |||
1249 | for_each_cpu_mask(cpu, *cpumask) { | ||
1250 | unsigned long *in; | ||
1251 | unsigned long *out; | ||
1252 | unsigned off; | ||
1253 | unsigned next_cpu; | ||
1254 | |||
1255 | in = (unsigned long *)&per_cpu(page_states, cpu); | ||
1256 | |||
1257 | next_cpu = next_cpu(cpu, *cpumask); | ||
1258 | if (likely(next_cpu < NR_CPUS)) | ||
1259 | prefetch(&per_cpu(page_states, next_cpu)); | ||
1260 | |||
1261 | out = (unsigned long *)ret; | ||
1262 | for (off = 0; off < nr; off++) | ||
1263 | *out++ += *in++; | ||
1264 | } | ||
1265 | } | ||
1266 | |||
1267 | void get_page_state_node(struct page_state *ret, int node) | ||
1268 | { | ||
1269 | int nr; | ||
1270 | cpumask_t mask = node_to_cpumask(node); | ||
1271 | |||
1272 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
1273 | nr /= sizeof(unsigned long); | ||
1274 | |||
1275 | __get_page_state(ret, nr+1, &mask); | ||
1276 | } | ||
1277 | |||
1278 | void get_page_state(struct page_state *ret) | ||
1279 | { | ||
1280 | int nr; | ||
1281 | cpumask_t mask = CPU_MASK_ALL; | ||
1282 | |||
1283 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
1284 | nr /= sizeof(unsigned long); | ||
1285 | |||
1286 | __get_page_state(ret, nr + 1, &mask); | ||
1287 | } | ||
1288 | |||
1289 | void get_full_page_state(struct page_state *ret) | ||
1290 | { | ||
1291 | cpumask_t mask = CPU_MASK_ALL; | ||
1292 | |||
1293 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | ||
1294 | } | ||
1295 | |||
1296 | unsigned long read_page_state_offset(unsigned long offset) | ||
1297 | { | ||
1298 | unsigned long ret = 0; | ||
1299 | int cpu; | ||
1300 | |||
1301 | for_each_online_cpu(cpu) { | ||
1302 | unsigned long in; | ||
1303 | |||
1304 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | ||
1305 | ret += *((unsigned long *)in); | ||
1306 | } | ||
1307 | return ret; | ||
1308 | } | ||
1309 | |||
1310 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
1311 | { | ||
1312 | void *ptr; | ||
1313 | |||
1314 | ptr = &__get_cpu_var(page_states); | ||
1315 | *(unsigned long *)(ptr + offset) += delta; | ||
1316 | } | ||
1317 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
1318 | |||
1319 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
1320 | { | ||
1321 | unsigned long flags; | ||
1322 | void *ptr; | ||
1323 | |||
1324 | local_irq_save(flags); | ||
1325 | ptr = &__get_cpu_var(page_states); | ||
1326 | *(unsigned long *)(ptr + offset) += delta; | ||
1327 | local_irq_restore(flags); | ||
1328 | } | ||
1329 | EXPORT_SYMBOL(mod_page_state_offset); | ||
1330 | |||
1331 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
1332 | unsigned long *free, struct pglist_data *pgdat) | ||
1333 | { | ||
1334 | struct zone *zones = pgdat->node_zones; | ||
1335 | int i; | ||
1336 | |||
1337 | *active = 0; | ||
1338 | *inactive = 0; | ||
1339 | *free = 0; | ||
1340 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1341 | *active += zones[i].nr_active; | ||
1342 | *inactive += zones[i].nr_inactive; | ||
1343 | *free += zones[i].free_pages; | ||
1344 | } | ||
1345 | } | ||
1346 | |||
1347 | void get_zone_counts(unsigned long *active, | ||
1348 | unsigned long *inactive, unsigned long *free) | ||
1349 | { | ||
1350 | struct pglist_data *pgdat; | ||
1351 | |||
1352 | *active = 0; | ||
1353 | *inactive = 0; | ||
1354 | *free = 0; | ||
1355 | for_each_online_pgdat(pgdat) { | ||
1356 | unsigned long l, m, n; | ||
1357 | __get_zone_counts(&l, &m, &n, pgdat); | ||
1358 | *active += l; | ||
1359 | *inactive += m; | ||
1360 | *free += n; | ||
1361 | } | ||
1362 | } | ||
1363 | |||
1364 | void si_meminfo(struct sysinfo *val) | 1212 | void si_meminfo(struct sysinfo *val) |
1365 | { | 1213 | { |
1366 | val->totalram = totalram_pages; | 1214 | val->totalram = totalram_pages; |
@@ -1401,7 +1249,6 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
1401 | */ | 1249 | */ |
1402 | void show_free_areas(void) | 1250 | void show_free_areas(void) |
1403 | { | 1251 | { |
1404 | struct page_state ps; | ||
1405 | int cpu, temperature; | 1252 | int cpu, temperature; |
1406 | unsigned long active; | 1253 | unsigned long active; |
1407 | unsigned long inactive; | 1254 | unsigned long inactive; |
@@ -1433,7 +1280,6 @@ void show_free_areas(void) | |||
1433 | } | 1280 | } |
1434 | } | 1281 | } |
1435 | 1282 | ||
1436 | get_page_state(&ps); | ||
1437 | get_zone_counts(&active, &inactive, &free); | 1283 | get_zone_counts(&active, &inactive, &free); |
1438 | 1284 | ||
1439 | printk("Free pages: %11ukB (%ukB HighMem)\n", | 1285 | printk("Free pages: %11ukB (%ukB HighMem)\n", |
@@ -1444,13 +1290,13 @@ void show_free_areas(void) | |||
1444 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 1290 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", |
1445 | active, | 1291 | active, |
1446 | inactive, | 1292 | inactive, |
1447 | ps.nr_dirty, | 1293 | global_page_state(NR_FILE_DIRTY), |
1448 | ps.nr_writeback, | 1294 | global_page_state(NR_WRITEBACK), |
1449 | ps.nr_unstable, | 1295 | global_page_state(NR_UNSTABLE_NFS), |
1450 | nr_free_pages(), | 1296 | nr_free_pages(), |
1451 | ps.nr_slab, | 1297 | global_page_state(NR_SLAB), |
1452 | ps.nr_mapped, | 1298 | global_page_state(NR_FILE_MAPPED), |
1453 | ps.nr_page_table_pages); | 1299 | global_page_state(NR_PAGETABLE)); |
1454 | 1300 | ||
1455 | for_each_zone(zone) { | 1301 | for_each_zone(zone) { |
1456 | int i; | 1302 | int i; |
@@ -1485,7 +1331,7 @@ void show_free_areas(void) | |||
1485 | } | 1331 | } |
1486 | 1332 | ||
1487 | for_each_zone(zone) { | 1333 | for_each_zone(zone) { |
1488 | unsigned long nr, flags, order, total = 0; | 1334 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1489 | 1335 | ||
1490 | show_node(zone); | 1336 | show_node(zone); |
1491 | printk("%s: ", zone->name); | 1337 | printk("%s: ", zone->name); |
@@ -1496,11 +1342,12 @@ void show_free_areas(void) | |||
1496 | 1342 | ||
1497 | spin_lock_irqsave(&zone->lock, flags); | 1343 | spin_lock_irqsave(&zone->lock, flags); |
1498 | for (order = 0; order < MAX_ORDER; order++) { | 1344 | for (order = 0; order < MAX_ORDER; order++) { |
1499 | nr = zone->free_area[order].nr_free; | 1345 | nr[order] = zone->free_area[order].nr_free; |
1500 | total += nr << order; | 1346 | total += nr[order] << order; |
1501 | printk("%lu*%lukB ", nr, K(1UL) << order); | ||
1502 | } | 1347 | } |
1503 | spin_unlock_irqrestore(&zone->lock, flags); | 1348 | spin_unlock_irqrestore(&zone->lock, flags); |
1349 | for (order = 0; order < MAX_ORDER; order++) | ||
1350 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | ||
1504 | printk("= %lukB\n", K(total)); | 1351 | printk("= %lukB\n", K(total)); |
1505 | } | 1352 | } |
1506 | 1353 | ||
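The rework above snapshots the per-order free counts into nr[] under zone->lock and defers the printk() calls until the lock is dropped, since console output with interrupts disabled can hold the zone lock for a long time. The total follows the usual buddy arithmetic of nr[order] << order pages per order; a hedged example with made-up counts:

	/* Assumed counts for a zone with 4 KiB pages; not from any real system. */
	unsigned long nr[3] = { 5, 3, 2 };	/* free blocks at order 0, 1, 2 */
	unsigned long order, total = 0;

	for (order = 0; order < 3; order++)
		total += nr[order] << order;	/* 5*1 + 3*2 + 2*4 = 19 pages */
	/* K(total) would then report 19 * 4 kB = 76 kB free in this zone. */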
@@ -1512,7 +1359,7 @@ void show_free_areas(void) | |||
1512 | * | 1359 | * |
1513 | * Add all populated zones of a node to the zonelist. | 1360 | * Add all populated zones of a node to the zonelist. |
1514 | */ | 1361 | */ |
1515 | static int __init build_zonelists_node(pg_data_t *pgdat, | 1362 | static int __meminit build_zonelists_node(pg_data_t *pgdat, |
1516 | struct zonelist *zonelist, int nr_zones, int zone_type) | 1363 | struct zonelist *zonelist, int nr_zones, int zone_type) |
1517 | { | 1364 | { |
1518 | struct zone *zone; | 1365 | struct zone *zone; |
@@ -1548,7 +1395,7 @@ static inline int highest_zone(int zone_bits) | |||
1548 | 1395 | ||
1549 | #ifdef CONFIG_NUMA | 1396 | #ifdef CONFIG_NUMA |
1550 | #define MAX_NODE_LOAD (num_online_nodes()) | 1397 | #define MAX_NODE_LOAD (num_online_nodes()) |
1551 | static int __initdata node_load[MAX_NUMNODES]; | 1398 | static int __meminitdata node_load[MAX_NUMNODES]; |
1552 | /** | 1399 | /** |
1553 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1400 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1554 | * @node: node whose fallback list we're appending | 1401 | * @node: node whose fallback list we're appending |
@@ -1563,7 +1410,7 @@ static int __initdata node_load[MAX_NUMNODES]; | |||
1563 | * on them otherwise. | 1410 | * on them otherwise. |
1564 | * It returns -1 if no node is found. | 1411 | * It returns -1 if no node is found. |
1565 | */ | 1412 | */ |
1566 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | 1413 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) |
1567 | { | 1414 | { |
1568 | int n, val; | 1415 | int n, val; |
1569 | int min_val = INT_MAX; | 1416 | int min_val = INT_MAX; |
@@ -1609,7 +1456,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1609 | return best_node; | 1456 | return best_node; |
1610 | } | 1457 | } |
1611 | 1458 | ||
1612 | static void __init build_zonelists(pg_data_t *pgdat) | 1459 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1613 | { | 1460 | { |
1614 | int i, j, k, node, local_node; | 1461 | int i, j, k, node, local_node; |
1615 | int prev_node, load; | 1462 | int prev_node, load; |
@@ -1661,7 +1508,7 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1661 | 1508 | ||
1662 | #else /* CONFIG_NUMA */ | 1509 | #else /* CONFIG_NUMA */ |
1663 | 1510 | ||
1664 | static void __init build_zonelists(pg_data_t *pgdat) | 1511 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1665 | { | 1512 | { |
1666 | int i, j, k, node, local_node; | 1513 | int i, j, k, node, local_node; |
1667 | 1514 | ||
@@ -1699,14 +1546,29 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1699 | 1546 | ||
1700 | #endif /* CONFIG_NUMA */ | 1547 | #endif /* CONFIG_NUMA */ |
1701 | 1548 | ||
1702 | void __init build_all_zonelists(void) | 1549 | /* return value is int just for stop_machine_run() */ |
1550 | static int __meminit __build_all_zonelists(void *dummy) | ||
1703 | { | 1551 | { |
1704 | int i; | 1552 | int nid; |
1553 | for_each_online_node(nid) | ||
1554 | build_zonelists(NODE_DATA(nid)); | ||
1555 | return 0; | ||
1556 | } | ||
1705 | 1557 | ||
1706 | for_each_online_node(i) | 1558 | void __meminit build_all_zonelists(void) |
1707 | build_zonelists(NODE_DATA(i)); | 1559 | { |
1708 | printk("Built %i zonelists\n", num_online_nodes()); | 1560 | if (system_state == SYSTEM_BOOTING) { |
1709 | cpuset_init_current_mems_allowed(); | 1561 | __build_all_zonelists(0); |
1562 | cpuset_init_current_mems_allowed(); | ||
1563 | } else { | ||
1564 | /* we have to stop all cpus to guarantee there is no user | ||
1565 | of zonelist */ | ||
1566 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | ||
1567 | /* cpuset refresh routine should be here */ | ||
1568 | } | ||
1569 | vm_total_pages = nr_free_pagecache_pages(); | ||
1570 | printk("Built %i zonelists. Total pages: %ld\n", | ||
1571 | num_online_nodes(), vm_total_pages); | ||
1710 | } | 1572 | } |
1711 | 1573 | ||
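Once the system is past boot, any CPU may be walking the zonelists at any time, which is why the hot-add path above rebuilds them inside stop_machine_run(), parking every other CPU for the duration. A hedged sketch of the same pattern for an arbitrary read-mostly table (rebuild_my_table() and my_table_refresh() are invented names, not allocator code):

	static int rebuild_my_table(void *unused)
	{
		/* rewrite the read-mostly structure here */
		return 0;	/* int return only to satisfy stop_machine_run() */
	}

	static void my_table_refresh(void)
	{
		if (system_state == SYSTEM_BOOTING)
			rebuild_my_table(NULL);		/* no other users exist yet */
		else
			stop_machine_run(rebuild_my_table, NULL, NR_CPUS);
	}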
1712 | /* | 1574 | /* |
@@ -1722,7 +1584,8 @@ void __init build_all_zonelists(void) | |||
1722 | */ | 1584 | */ |
1723 | #define PAGES_PER_WAITQUEUE 256 | 1585 | #define PAGES_PER_WAITQUEUE 256 |
1724 | 1586 | ||
1725 | static inline unsigned long wait_table_size(unsigned long pages) | 1587 | #ifndef CONFIG_MEMORY_HOTPLUG |
1588 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | ||
1726 | { | 1589 | { |
1727 | unsigned long size = 1; | 1590 | unsigned long size = 1; |
1728 | 1591 | ||
@@ -1740,6 +1603,29 @@ static inline unsigned long wait_table_size(unsigned long pages) | |||
1740 | 1603 | ||
1741 | return max(size, 4UL); | 1604 | return max(size, 4UL); |
1742 | } | 1605 | } |
1606 | #else | ||
1607 | /* | ||
1608 | * A zone's size might be changed by hot-add, so it is not possible to determine | ||
1609 | * a suitable size for its wait_table. So we use the maximum size now. | ||
1610 | * | ||
1611 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | ||
1612 | * | ||
1613 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | ||
1614 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | ||
1615 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | ||
1616 | * | ||
1617 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages | ||
1618 | * or more by the traditional way. (See above). It equals: | ||
1619 | * | ||
1620 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | ||
1621 | * ia64(16K page size) : = ( 8G + 4M)byte. | ||
1622 | * powerpc (64K page size) : = (32G +16M)byte. | ||
1623 | */ | ||
1624 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | ||
1625 | { | ||
1626 | return 4096UL; | ||
1627 | } | ||
1628 | #endif | ||
1743 | 1629 | ||
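Because a hot-addable zone's final size is unknown, the hunk above simply sizes the hash for the worst case (4096 entries); the non-hotplug helper instead rounds pages / PAGES_PER_WAITQUEUE up to a power of two and clamps it to the range [4, 4096]. A worked example under assumed numbers:

	/*
	 * Assumed: a 1 GiB zone of 4 KiB pages = 262144 pages.
	 *   262144 / PAGES_PER_WAITQUEUE (256) = 1024
	 *   rounded up to a power of two        = 1024
	 *   clamped to [4, 4096]                = 1024 entries
	 *   wait_table_bits(1024)               = 10 bits of hash
	 * With CONFIG_MEMORY_HOTPLUG the same zone gets 4096 entries regardless,
	 * costing roughly 4096 * sizeof(wait_queue_head_t) of memory up front.
	 */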
1744 | /* | 1630 | /* |
1745 | * This is an integer logarithm so that shifts can be used later | 1631 | * This is an integer logarithm so that shifts can be used later |
@@ -1964,7 +1850,7 @@ static inline void free_zone_pagesets(int cpu) | |||
1964 | } | 1850 | } |
1965 | } | 1851 | } |
1966 | 1852 | ||
1967 | static int pageset_cpuup_callback(struct notifier_block *nfb, | 1853 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, |
1968 | unsigned long action, | 1854 | unsigned long action, |
1969 | void *hcpu) | 1855 | void *hcpu) |
1970 | { | 1856 | { |
@@ -1986,7 +1872,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb, | |||
1986 | return ret; | 1872 | return ret; |
1987 | } | 1873 | } |
1988 | 1874 | ||
1989 | static struct notifier_block pageset_notifier = | 1875 | static struct notifier_block __cpuinitdata pageset_notifier = |
1990 | { &pageset_cpuup_callback, NULL, 0 }; | 1876 | { &pageset_cpuup_callback, NULL, 0 }; |
1991 | 1877 | ||
1992 | void __init setup_per_cpu_pageset(void) | 1878 | void __init setup_per_cpu_pageset(void) |
@@ -2005,23 +1891,46 @@ void __init setup_per_cpu_pageset(void) | |||
2005 | #endif | 1891 | #endif |
2006 | 1892 | ||
2007 | static __meminit | 1893 | static __meminit |
2008 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 1894 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2009 | { | 1895 | { |
2010 | int i; | 1896 | int i; |
2011 | struct pglist_data *pgdat = zone->zone_pgdat; | 1897 | struct pglist_data *pgdat = zone->zone_pgdat; |
1898 | size_t alloc_size; | ||
2012 | 1899 | ||
2013 | /* | 1900 | /* |
2014 | * The per-page waitqueue mechanism uses hashed waitqueues | 1901 | * The per-page waitqueue mechanism uses hashed waitqueues |
2015 | * per zone. | 1902 | * per zone. |
2016 | */ | 1903 | */ |
2017 | zone->wait_table_size = wait_table_size(zone_size_pages); | 1904 | zone->wait_table_hash_nr_entries = |
2018 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); | 1905 | wait_table_hash_nr_entries(zone_size_pages); |
2019 | zone->wait_table = (wait_queue_head_t *) | 1906 | zone->wait_table_bits = |
2020 | alloc_bootmem_node(pgdat, zone->wait_table_size | 1907 | wait_table_bits(zone->wait_table_hash_nr_entries); |
2021 | * sizeof(wait_queue_head_t)); | 1908 | alloc_size = zone->wait_table_hash_nr_entries |
1909 | * sizeof(wait_queue_head_t); | ||
1910 | |||
1911 | if (system_state == SYSTEM_BOOTING) { | ||
1912 | zone->wait_table = (wait_queue_head_t *) | ||
1913 | alloc_bootmem_node(pgdat, alloc_size); | ||
1914 | } else { | ||
1915 | /* | ||
1916 | * This case means that a zone whose size was 0 gets new memory | ||
1917 | * via memory hot-add. | ||
1918 | * But it may be the case that a new node was hot-added. In | ||
1919 | * this case vmalloc() will not be able to use this new node's | ||
1920 | * memory - this wait_table must be initialized to use this new | ||
1921 | * node itself as well. | ||
1922 | * To use this new node's memory, further consideration will be | ||
1923 | * necessary. | ||
1924 | */ | ||
1925 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | ||
1926 | } | ||
1927 | if (!zone->wait_table) | ||
1928 | return -ENOMEM; | ||
2022 | 1929 | ||
2023 | for(i = 0; i < zone->wait_table_size; ++i) | 1930 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
2024 | init_waitqueue_head(zone->wait_table + i); | 1931 | init_waitqueue_head(zone->wait_table + i); |
1932 | |||
1933 | return 0; | ||
2025 | } | 1934 | } |
2026 | 1935 | ||
2027 | static __meminit void zone_pcp_init(struct zone *zone) | 1936 | static __meminit void zone_pcp_init(struct zone *zone) |
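For context on how the table initialised above is consumed: a sleeping page waiter hashes its struct page pointer into one of the wait_table_hash_nr_entries heads, roughly what the mm/filemap.c lookup does (sketch, not quoted verbatim; hash_ptr() comes from linux/hash.h):

	/* Map a page to its hashed wait queue head in the owning zone. */
	static wait_queue_head_t *page_waitqueue(struct page *page)
	{
		const struct zone *zone = page_zone(page);

		return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
	}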
@@ -2043,12 +1952,15 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
2043 | zone->name, zone->present_pages, batch); | 1952 | zone->name, zone->present_pages, batch); |
2044 | } | 1953 | } |
2045 | 1954 | ||
2046 | static __meminit void init_currently_empty_zone(struct zone *zone, | 1955 | __meminit int init_currently_empty_zone(struct zone *zone, |
2047 | unsigned long zone_start_pfn, unsigned long size) | 1956 | unsigned long zone_start_pfn, |
1957 | unsigned long size) | ||
2048 | { | 1958 | { |
2049 | struct pglist_data *pgdat = zone->zone_pgdat; | 1959 | struct pglist_data *pgdat = zone->zone_pgdat; |
2050 | 1960 | int ret; | |
2051 | zone_wait_table_init(zone, size); | 1961 | ret = zone_wait_table_init(zone, size); |
1962 | if (ret) | ||
1963 | return ret; | ||
2052 | pgdat->nr_zones = zone_idx(zone) + 1; | 1964 | pgdat->nr_zones = zone_idx(zone) + 1; |
2053 | 1965 | ||
2054 | zone->zone_start_pfn = zone_start_pfn; | 1966 | zone->zone_start_pfn = zone_start_pfn; |
@@ -2056,6 +1968,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2056 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 1968 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
2057 | 1969 | ||
2058 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | 1970 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); |
1971 | |||
1972 | return 0; | ||
2059 | } | 1973 | } |
2060 | 1974 | ||
2061 | /* | 1975 | /* |
@@ -2064,12 +1978,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2064 | * - mark all memory queues empty | 1978 | * - mark all memory queues empty |
2065 | * - clear the memory bitmaps | 1979 | * - clear the memory bitmaps |
2066 | */ | 1980 | */ |
2067 | static void __init free_area_init_core(struct pglist_data *pgdat, | 1981 | static void __meminit free_area_init_core(struct pglist_data *pgdat, |
2068 | unsigned long *zones_size, unsigned long *zholes_size) | 1982 | unsigned long *zones_size, unsigned long *zholes_size) |
2069 | { | 1983 | { |
2070 | unsigned long j; | 1984 | unsigned long j; |
2071 | int nid = pgdat->node_id; | 1985 | int nid = pgdat->node_id; |
2072 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 1986 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
1987 | int ret; | ||
2073 | 1988 | ||
2074 | pgdat_resize_init(pgdat); | 1989 | pgdat_resize_init(pgdat); |
2075 | pgdat->nr_zones = 0; | 1990 | pgdat->nr_zones = 0; |
@@ -2106,12 +2021,14 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
2106 | zone->nr_scan_inactive = 0; | 2021 | zone->nr_scan_inactive = 0; |
2107 | zone->nr_active = 0; | 2022 | zone->nr_active = 0; |
2108 | zone->nr_inactive = 0; | 2023 | zone->nr_inactive = 0; |
2024 | zap_zone_vm_stats(zone); | ||
2109 | atomic_set(&zone->reclaim_in_progress, 0); | 2025 | atomic_set(&zone->reclaim_in_progress, 0); |
2110 | if (!size) | 2026 | if (!size) |
2111 | continue; | 2027 | continue; |
2112 | 2028 | ||
2113 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 2029 | zonetable_add(zone, nid, j, zone_start_pfn, size); |
2114 | init_currently_empty_zone(zone, zone_start_pfn, size); | 2030 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2031 | BUG_ON(ret); | ||
2115 | zone_start_pfn += size; | 2032 | zone_start_pfn += size; |
2116 | } | 2033 | } |
2117 | } | 2034 | } |
@@ -2152,7 +2069,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2152 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2069 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2153 | } | 2070 | } |
2154 | 2071 | ||
2155 | void __init free_area_init_node(int nid, struct pglist_data *pgdat, | 2072 | void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, |
2156 | unsigned long *zones_size, unsigned long node_start_pfn, | 2073 | unsigned long *zones_size, unsigned long node_start_pfn, |
2157 | unsigned long *zholes_size) | 2074 | unsigned long *zholes_size) |
2158 | { | 2075 | { |
@@ -2178,307 +2095,18 @@ void __init free_area_init(unsigned long *zones_size) | |||
2178 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2095 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2179 | } | 2096 | } |
2180 | 2097 | ||
2181 | #ifdef CONFIG_PROC_FS | ||
2182 | |||
2183 | #include <linux/seq_file.h> | ||
2184 | |||
2185 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
2186 | { | ||
2187 | pg_data_t *pgdat; | ||
2188 | loff_t node = *pos; | ||
2189 | for (pgdat = first_online_pgdat(); | ||
2190 | pgdat && node; | ||
2191 | pgdat = next_online_pgdat(pgdat)) | ||
2192 | --node; | ||
2193 | |||
2194 | return pgdat; | ||
2195 | } | ||
2196 | |||
2197 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
2198 | { | ||
2199 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
2200 | |||
2201 | (*pos)++; | ||
2202 | return next_online_pgdat(pgdat); | ||
2203 | } | ||
2204 | |||
2205 | static void frag_stop(struct seq_file *m, void *arg) | ||
2206 | { | ||
2207 | } | ||
2208 | |||
2209 | /* | ||
2210 | * This walks the free areas for each zone. | ||
2211 | */ | ||
2212 | static int frag_show(struct seq_file *m, void *arg) | ||
2213 | { | ||
2214 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
2215 | struct zone *zone; | ||
2216 | struct zone *node_zones = pgdat->node_zones; | ||
2217 | unsigned long flags; | ||
2218 | int order; | ||
2219 | |||
2220 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
2221 | if (!populated_zone(zone)) | ||
2222 | continue; | ||
2223 | |||
2224 | spin_lock_irqsave(&zone->lock, flags); | ||
2225 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
2226 | for (order = 0; order < MAX_ORDER; ++order) | ||
2227 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
2228 | spin_unlock_irqrestore(&zone->lock, flags); | ||
2229 | seq_putc(m, '\n'); | ||
2230 | } | ||
2231 | return 0; | ||
2232 | } | ||
2233 | |||
2234 | struct seq_operations fragmentation_op = { | ||
2235 | .start = frag_start, | ||
2236 | .next = frag_next, | ||
2237 | .stop = frag_stop, | ||
2238 | .show = frag_show, | ||
2239 | }; | ||
2240 | |||
2241 | /* | ||
2242 | * Output information about zones in @pgdat. | ||
2243 | */ | ||
2244 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
2245 | { | ||
2246 | pg_data_t *pgdat = arg; | ||
2247 | struct zone *zone; | ||
2248 | struct zone *node_zones = pgdat->node_zones; | ||
2249 | unsigned long flags; | ||
2250 | |||
2251 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
2252 | int i; | ||
2253 | |||
2254 | if (!populated_zone(zone)) | ||
2255 | continue; | ||
2256 | |||
2257 | spin_lock_irqsave(&zone->lock, flags); | ||
2258 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | ||
2259 | seq_printf(m, | ||
2260 | "\n pages free %lu" | ||
2261 | "\n min %lu" | ||
2262 | "\n low %lu" | ||
2263 | "\n high %lu" | ||
2264 | "\n active %lu" | ||
2265 | "\n inactive %lu" | ||
2266 | "\n scanned %lu (a: %lu i: %lu)" | ||
2267 | "\n spanned %lu" | ||
2268 | "\n present %lu", | ||
2269 | zone->free_pages, | ||
2270 | zone->pages_min, | ||
2271 | zone->pages_low, | ||
2272 | zone->pages_high, | ||
2273 | zone->nr_active, | ||
2274 | zone->nr_inactive, | ||
2275 | zone->pages_scanned, | ||
2276 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
2277 | zone->spanned_pages, | ||
2278 | zone->present_pages); | ||
2279 | seq_printf(m, | ||
2280 | "\n protection: (%lu", | ||
2281 | zone->lowmem_reserve[0]); | ||
2282 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
2283 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
2284 | seq_printf(m, | ||
2285 | ")" | ||
2286 | "\n pagesets"); | ||
2287 | for_each_online_cpu(i) { | ||
2288 | struct per_cpu_pageset *pageset; | ||
2289 | int j; | ||
2290 | |||
2291 | pageset = zone_pcp(zone, i); | ||
2292 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
2293 | if (pageset->pcp[j].count) | ||
2294 | break; | ||
2295 | } | ||
2296 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
2297 | continue; | ||
2298 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
2299 | seq_printf(m, | ||
2300 | "\n cpu: %i pcp: %i" | ||
2301 | "\n count: %i" | ||
2302 | "\n high: %i" | ||
2303 | "\n batch: %i", | ||
2304 | i, j, | ||
2305 | pageset->pcp[j].count, | ||
2306 | pageset->pcp[j].high, | ||
2307 | pageset->pcp[j].batch); | ||
2308 | } | ||
2309 | #ifdef CONFIG_NUMA | ||
2310 | seq_printf(m, | ||
2311 | "\n numa_hit: %lu" | ||
2312 | "\n numa_miss: %lu" | ||
2313 | "\n numa_foreign: %lu" | ||
2314 | "\n interleave_hit: %lu" | ||
2315 | "\n local_node: %lu" | ||
2316 | "\n other_node: %lu", | ||
2317 | pageset->numa_hit, | ||
2318 | pageset->numa_miss, | ||
2319 | pageset->numa_foreign, | ||
2320 | pageset->interleave_hit, | ||
2321 | pageset->local_node, | ||
2322 | pageset->other_node); | ||
2323 | #endif | ||
2324 | } | ||
2325 | seq_printf(m, | ||
2326 | "\n all_unreclaimable: %u" | ||
2327 | "\n prev_priority: %i" | ||
2328 | "\n temp_priority: %i" | ||
2329 | "\n start_pfn: %lu", | ||
2330 | zone->all_unreclaimable, | ||
2331 | zone->prev_priority, | ||
2332 | zone->temp_priority, | ||
2333 | zone->zone_start_pfn); | ||
2334 | spin_unlock_irqrestore(&zone->lock, flags); | ||
2335 | seq_putc(m, '\n'); | ||
2336 | } | ||
2337 | return 0; | ||
2338 | } | ||
2339 | |||
2340 | struct seq_operations zoneinfo_op = { | ||
2341 | .start = frag_start, /* iterate over all zones. The same as in | ||
2342 | * fragmentation. */ | ||
2343 | .next = frag_next, | ||
2344 | .stop = frag_stop, | ||
2345 | .show = zoneinfo_show, | ||
2346 | }; | ||
2347 | |||
2348 | static char *vmstat_text[] = { | ||
2349 | "nr_dirty", | ||
2350 | "nr_writeback", | ||
2351 | "nr_unstable", | ||
2352 | "nr_page_table_pages", | ||
2353 | "nr_mapped", | ||
2354 | "nr_slab", | ||
2355 | |||
2356 | "pgpgin", | ||
2357 | "pgpgout", | ||
2358 | "pswpin", | ||
2359 | "pswpout", | ||
2360 | |||
2361 | "pgalloc_high", | ||
2362 | "pgalloc_normal", | ||
2363 | "pgalloc_dma32", | ||
2364 | "pgalloc_dma", | ||
2365 | |||
2366 | "pgfree", | ||
2367 | "pgactivate", | ||
2368 | "pgdeactivate", | ||
2369 | |||
2370 | "pgfault", | ||
2371 | "pgmajfault", | ||
2372 | |||
2373 | "pgrefill_high", | ||
2374 | "pgrefill_normal", | ||
2375 | "pgrefill_dma32", | ||
2376 | "pgrefill_dma", | ||
2377 | |||
2378 | "pgsteal_high", | ||
2379 | "pgsteal_normal", | ||
2380 | "pgsteal_dma32", | ||
2381 | "pgsteal_dma", | ||
2382 | |||
2383 | "pgscan_kswapd_high", | ||
2384 | "pgscan_kswapd_normal", | ||
2385 | "pgscan_kswapd_dma32", | ||
2386 | "pgscan_kswapd_dma", | ||
2387 | |||
2388 | "pgscan_direct_high", | ||
2389 | "pgscan_direct_normal", | ||
2390 | "pgscan_direct_dma32", | ||
2391 | "pgscan_direct_dma", | ||
2392 | |||
2393 | "pginodesteal", | ||
2394 | "slabs_scanned", | ||
2395 | "kswapd_steal", | ||
2396 | "kswapd_inodesteal", | ||
2397 | "pageoutrun", | ||
2398 | "allocstall", | ||
2399 | |||
2400 | "pgrotated", | ||
2401 | "nr_bounce", | ||
2402 | }; | ||
2403 | |||
2404 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | ||
2405 | { | ||
2406 | struct page_state *ps; | ||
2407 | |||
2408 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
2409 | return NULL; | ||
2410 | |||
2411 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | ||
2412 | m->private = ps; | ||
2413 | if (!ps) | ||
2414 | return ERR_PTR(-ENOMEM); | ||
2415 | get_full_page_state(ps); | ||
2416 | ps->pgpgin /= 2; /* sectors -> kbytes */ | ||
2417 | ps->pgpgout /= 2; | ||
2418 | return (unsigned long *)ps + *pos; | ||
2419 | } | ||
2420 | |||
2421 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | ||
2422 | { | ||
2423 | (*pos)++; | ||
2424 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
2425 | return NULL; | ||
2426 | return (unsigned long *)m->private + *pos; | ||
2427 | } | ||
2428 | |||
2429 | static int vmstat_show(struct seq_file *m, void *arg) | ||
2430 | { | ||
2431 | unsigned long *l = arg; | ||
2432 | unsigned long off = l - (unsigned long *)m->private; | ||
2433 | |||
2434 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | ||
2435 | return 0; | ||
2436 | } | ||
2437 | |||
2438 | static void vmstat_stop(struct seq_file *m, void *arg) | ||
2439 | { | ||
2440 | kfree(m->private); | ||
2441 | m->private = NULL; | ||
2442 | } | ||
2443 | |||
2444 | struct seq_operations vmstat_op = { | ||
2445 | .start = vmstat_start, | ||
2446 | .next = vmstat_next, | ||
2447 | .stop = vmstat_stop, | ||
2448 | .show = vmstat_show, | ||
2449 | }; | ||
2450 | |||
2451 | #endif /* CONFIG_PROC_FS */ | ||
2452 | |||
2453 | #ifdef CONFIG_HOTPLUG_CPU | 2098 | #ifdef CONFIG_HOTPLUG_CPU |
2454 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2099 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2455 | unsigned long action, void *hcpu) | 2100 | unsigned long action, void *hcpu) |
2456 | { | 2101 | { |
2457 | int cpu = (unsigned long)hcpu; | 2102 | int cpu = (unsigned long)hcpu; |
2458 | long *count; | ||
2459 | unsigned long *src, *dest; | ||
2460 | 2103 | ||
2461 | if (action == CPU_DEAD) { | 2104 | if (action == CPU_DEAD) { |
2462 | int i; | ||
2463 | |||
2464 | /* Drain local pagecache count. */ | ||
2465 | count = &per_cpu(nr_pagecache_local, cpu); | ||
2466 | atomic_add(*count, &nr_pagecache); | ||
2467 | *count = 0; | ||
2468 | local_irq_disable(); | 2105 | local_irq_disable(); |
2469 | __drain_pages(cpu); | 2106 | __drain_pages(cpu); |
2470 | 2107 | vm_events_fold_cpu(cpu); | |
2471 | /* Add dead cpu's page_states to our own. */ | ||
2472 | dest = (unsigned long *)&__get_cpu_var(page_states); | ||
2473 | src = (unsigned long *)&per_cpu(page_states, cpu); | ||
2474 | |||
2475 | for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); | ||
2476 | i++) { | ||
2477 | dest[i] += src[i]; | ||
2478 | src[i] = 0; | ||
2479 | } | ||
2480 | |||
2481 | local_irq_enable(); | 2108 | local_irq_enable(); |
2109 | refresh_cpu_vm_stats(cpu); | ||
2482 | } | 2110 | } |
2483 | return NOTIFY_OK; | 2111 | return NOTIFY_OK; |
2484 | } | 2112 | } |
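The open-coded folding of the dead CPU's page_states is replaced above by calls into the new vmstat machinery (vm_events_fold_cpu() and refresh_cpu_vm_stats()), but the underlying pattern is unchanged. A hedged sketch with invented names (my_counters, my_global, NR_MY_COUNTERS), not the real vmstat interfaces:

	#define NR_MY_COUNTERS 4	/* assumed size, for the sketch only */
	static DEFINE_PER_CPU(unsigned long[NR_MY_COUNTERS], my_counters);
	static atomic_long_t my_global[NR_MY_COUNTERS];

	static int my_cpu_notify(struct notifier_block *nb,
				 unsigned long action, void *hcpu)
	{
		int cpu = (unsigned long)hcpu;
		int i;

		if (action == CPU_DEAD) {
			unsigned long *dead = per_cpu(my_counters, cpu);

			for (i = 0; i < NR_MY_COUNTERS; i++) {
				atomic_long_add(dead[i], &my_global[i]);
				dead[i] = 0;	/* the dead CPU's slice is now global */
			}
		}
		return NOTIFY_OK;
	}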
@@ -2804,42 +2432,14 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2804 | } | 2432 | } |
2805 | 2433 | ||
2806 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | 2434 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE |
2807 | /* | ||
2808 | * pfn <-> page translation. out-of-line version. | ||
2809 | * (see asm-generic/memory_model.h) | ||
2810 | */ | ||
2811 | #if defined(CONFIG_FLATMEM) | ||
2812 | struct page *pfn_to_page(unsigned long pfn) | ||
2813 | { | ||
2814 | return mem_map + (pfn - ARCH_PFN_OFFSET); | ||
2815 | } | ||
2816 | unsigned long page_to_pfn(struct page *page) | ||
2817 | { | ||
2818 | return (page - mem_map) + ARCH_PFN_OFFSET; | ||
2819 | } | ||
2820 | #elif defined(CONFIG_DISCONTIGMEM) | ||
2821 | struct page *pfn_to_page(unsigned long pfn) | ||
2822 | { | ||
2823 | int nid = arch_pfn_to_nid(pfn); | ||
2824 | return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); | ||
2825 | } | ||
2826 | unsigned long page_to_pfn(struct page *page) | ||
2827 | { | ||
2828 | struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); | ||
2829 | return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; | ||
2830 | } | ||
2831 | #elif defined(CONFIG_SPARSEMEM) | ||
2832 | struct page *pfn_to_page(unsigned long pfn) | 2435 | struct page *pfn_to_page(unsigned long pfn) |
2833 | { | 2436 | { |
2834 | return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; | 2437 | return __pfn_to_page(pfn); |
2835 | } | 2438 | } |
2836 | |||
2837 | unsigned long page_to_pfn(struct page *page) | 2439 | unsigned long page_to_pfn(struct page *page) |
2838 | { | 2440 | { |
2839 | long section_id = page_to_section(page); | 2441 | return __page_to_pfn(page); |
2840 | return page - __section_mem_map_addr(__nr_to_section(section_id)); | ||
2841 | } | 2442 | } |
2842 | #endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ | ||
2843 | EXPORT_SYMBOL(pfn_to_page); | 2443 | EXPORT_SYMBOL(pfn_to_page); |
2844 | EXPORT_SYMBOL(page_to_pfn); | 2444 | EXPORT_SYMBOL(page_to_pfn); |
2845 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 2445 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
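The out-of-line helpers above now just wrap the generic __pfn_to_page()/__page_to_pfn() macros from asm-generic/memory_model.h, so the per-model arithmetic lives in one place. For the FLATMEM case the mapping reduces to the mem_map arithmetic visible in the removed lines:

	/* FLATMEM only; the other memory models index through nodes or sections.
	 *	page = mem_map + (pfn - ARCH_PFN_OFFSET);
	 *	pfn  = (page - mem_map) + ARCH_PFN_OFFSET;
	 */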
diff --git a/mm/page_io.c b/mm/page_io.c index bb2b0d53889c..88029948d00a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
101 | } | 101 | } |
102 | if (wbc->sync_mode == WB_SYNC_ALL) | 102 | if (wbc->sync_mode == WB_SYNC_ALL) |
103 | rw |= (1 << BIO_RW_SYNC); | 103 | rw |= (1 << BIO_RW_SYNC); |
104 | inc_page_state(pswpout); | 104 | count_vm_event(PSWPOUT); |
105 | set_page_writeback(page); | 105 | set_page_writeback(page); |
106 | unlock_page(page); | 106 | unlock_page(page); |
107 | submit_bio(rw, bio); | 107 | submit_bio(rw, bio); |
@@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page) | |||
123 | ret = -ENOMEM; | 123 | ret = -ENOMEM; |
124 | goto out; | 124 | goto out; |
125 | } | 125 | } |
126 | inc_page_state(pswpin); | 126 | count_vm_event(PSWPIN); |
127 | submit_bio(READ, bio); | 127 | submit_bio(READ, bio); |
128 | out: | 128 | out: |
129 | return ret; | 129 | return ret; |
diff --git a/mm/pdflush.c b/mm/pdflush.c index c4b6d0afd736..b02102feeb4b 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work) | |||
104 | list_move(&my_work->list, &pdflush_list); | 104 | list_move(&my_work->list, &pdflush_list); |
105 | my_work->when_i_went_to_sleep = jiffies; | 105 | my_work->when_i_went_to_sleep = jiffies; |
106 | spin_unlock_irq(&pdflush_lock); | 106 | spin_unlock_irq(&pdflush_lock); |
107 | |||
108 | schedule(); | 107 | schedule(); |
109 | if (try_to_freeze()) { | 108 | try_to_freeze(); |
110 | spin_lock_irq(&pdflush_lock); | ||
111 | continue; | ||
112 | } | ||
113 | |||
114 | spin_lock_irq(&pdflush_lock); | 109 | spin_lock_irq(&pdflush_lock); |
115 | if (!list_empty(&my_work->list)) { | 110 | if (!list_empty(&my_work->list)) { |
116 | printk("pdflush: bogus wakeup!\n"); | 111 | /* |
112 | * Someone woke us up, but without removing our control | ||
113 | * structure from the global list. swsusp will do this | ||
114 | * in try_to_freeze()->refrigerator(). Handle it. | ||
115 | */ | ||
117 | my_work->fn = NULL; | 116 | my_work->fn = NULL; |
118 | continue; | 117 | continue; |
119 | } | 118 | } |
120 | if (my_work->fn == NULL) { | 119 | if (my_work->fn == NULL) { |
121 | printk("pdflush: NULL work function\n"); | 120 | printk("pdflush: bogus wakeup\n"); |
122 | continue; | 121 | continue; |
123 | } | 122 | } |
124 | spin_unlock_irq(&pdflush_lock); | 123 | spin_unlock_irq(&pdflush_lock); |
@@ -202,8 +201,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) | |||
202 | unsigned long flags; | 201 | unsigned long flags; |
203 | int ret = 0; | 202 | int ret = 0; |
204 | 203 | ||
205 | if (fn == NULL) | 204 | BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ |
206 | BUG(); /* Hard to diagnose if it's deferred */ | ||
207 | 205 | ||
208 | spin_lock_irqsave(&pdflush_lock, flags); | 206 | spin_lock_irqsave(&pdflush_lock, flags); |
209 | if (list_empty(&pdflush_list)) { | 207 | if (list_empty(&pdflush_list)) { |
diff --git a/mm/readahead.c b/mm/readahead.c index ba7db816f4c8..1ba736ac0367 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -119,8 +119,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra) | |||
119 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) | 119 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) |
120 | 120 | ||
121 | /** | 121 | /** |
122 | * read_cache_pages - populate an address space with some pages, and | 122 | * read_cache_pages - populate an address space with some pages & start reads against them |
123 | * start reads against them. | ||
124 | * @mapping: the address_space | 123 | * @mapping: the address_space |
125 | * @pages: The address of a list_head which contains the target pages. These | 124 | * @pages: The address of a list_head which contains the target pages. These |
126 | * pages have their ->index populated and are otherwise uninitialised. | 125 | * pages have their ->index populated and are otherwise uninitialised. |
@@ -183,14 +182,11 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
183 | list_del(&page->lru); | 182 | list_del(&page->lru); |
184 | if (!add_to_page_cache(page, mapping, | 183 | if (!add_to_page_cache(page, mapping, |
185 | page->index, GFP_KERNEL)) { | 184 | page->index, GFP_KERNEL)) { |
186 | ret = mapping->a_ops->readpage(filp, page); | 185 | mapping->a_ops->readpage(filp, page); |
187 | if (ret != AOP_TRUNCATED_PAGE) { | 186 | if (!pagevec_add(&lru_pvec, page)) |
188 | if (!pagevec_add(&lru_pvec, page)) | 187 | __pagevec_lru_add(&lru_pvec); |
189 | __pagevec_lru_add(&lru_pvec); | 188 | } else |
190 | continue; | 189 | page_cache_release(page); |
191 | } /* else fall through to release */ | ||
192 | } | ||
193 | page_cache_release(page); | ||
194 | } | 190 | } |
195 | pagevec_lru_add(&lru_pvec); | 191 | pagevec_lru_add(&lru_pvec); |
196 | ret = 0; | 192 | ret = 0; |
@@ -395,8 +391,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
395 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | 391 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' |
396 | * is set wait till the read completes. Otherwise attempt to read without | 392 | * is set wait till the read completes. Otherwise attempt to read without |
397 | * blocking. | 393 | * blocking. |
398 | * Returns 1 meaning 'success' if read is succesfull without switching off | 394 | * Returns 1 meaning 'success' if read is successful without switching off |
399 | * readhaead mode. Otherwise return failure. | 395 | * readahead mode. Otherwise return failure. |
400 | */ | 396 | */ |
401 | static int | 397 | static int |
402 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | 398 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
103 | spin_lock(&mm->page_table_lock); | 103 | spin_lock(&mm->page_table_lock); |
104 | if (likely(!vma->anon_vma)) { | 104 | if (likely(!vma->anon_vma)) { |
105 | vma->anon_vma = anon_vma; | 105 | vma->anon_vma = anon_vma; |
106 | list_add(&vma->anon_vma_node, &anon_vma->head); | 106 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
107 | allocated = NULL; | 107 | allocated = NULL; |
108 | } | 108 | } |
109 | spin_unlock(&mm->page_table_lock); | 109 | spin_unlock(&mm->page_table_lock); |
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) | |||
127 | struct anon_vma *anon_vma = vma->anon_vma; | 127 | struct anon_vma *anon_vma = vma->anon_vma; |
128 | 128 | ||
129 | if (anon_vma) { | 129 | if (anon_vma) { |
130 | list_add(&vma->anon_vma_node, &anon_vma->head); | 130 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
131 | validate_anon_vma(vma); | 131 | validate_anon_vma(vma); |
132 | } | 132 | } |
133 | } | 133 | } |
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) | |||
138 | 138 | ||
139 | if (anon_vma) { | 139 | if (anon_vma) { |
140 | spin_lock(&anon_vma->lock); | 140 | spin_lock(&anon_vma->lock); |
141 | list_add(&vma->anon_vma_node, &anon_vma->head); | 141 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
142 | validate_anon_vma(vma); | 142 | validate_anon_vma(vma); |
143 | spin_unlock(&anon_vma->lock); | 143 | spin_unlock(&anon_vma->lock); |
144 | } | 144 | } |
@@ -205,44 +205,6 @@ out: | |||
205 | return anon_vma; | 205 | return anon_vma; |
206 | } | 206 | } |
207 | 207 | ||
208 | #ifdef CONFIG_MIGRATION | ||
209 | /* | ||
210 | * Remove an anonymous page from swap replacing the swap pte's | ||
211 | * through real pte's pointing to valid pages and then releasing | ||
212 | * the page from the swap cache. | ||
213 | * | ||
214 | * Must hold page lock on page and mmap_sem of one vma that contains | ||
215 | * the page. | ||
216 | */ | ||
217 | void remove_from_swap(struct page *page) | ||
218 | { | ||
219 | struct anon_vma *anon_vma; | ||
220 | struct vm_area_struct *vma; | ||
221 | unsigned long mapping; | ||
222 | |||
223 | if (!PageSwapCache(page)) | ||
224 | return; | ||
225 | |||
226 | mapping = (unsigned long)page->mapping; | ||
227 | |||
228 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
229 | return; | ||
230 | |||
231 | /* | ||
232 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
233 | */ | ||
234 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
235 | spin_lock(&anon_vma->lock); | ||
236 | |||
237 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
238 | remove_vma_swap(vma, page); | ||
239 | |||
240 | spin_unlock(&anon_vma->lock); | ||
241 | delete_from_swap_cache(page); | ||
242 | } | ||
243 | EXPORT_SYMBOL(remove_from_swap); | ||
244 | #endif | ||
245 | |||
246 | /* | 208 | /* |
247 | * At what user virtual address is page expected in vma? | 209 | * At what user virtual address is page expected in vma? |
248 | */ | 210 | */ |
@@ -493,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
493 | * nr_mapped state can be updated without turning off | 455 | * nr_mapped state can be updated without turning off |
494 | * interrupts because it is not modified via interrupt. | 456 | * interrupts because it is not modified via interrupt. |
495 | */ | 457 | */ |
496 | __inc_page_state(nr_mapped); | 458 | __inc_zone_page_state(page, NR_ANON_PAGES); |
497 | } | 459 | } |
498 | 460 | ||
499 | /** | 461 | /** |
@@ -537,7 +499,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
537 | void page_add_file_rmap(struct page *page) | 499 | void page_add_file_rmap(struct page *page) |
538 | { | 500 | { |
539 | if (atomic_inc_and_test(&page->_mapcount)) | 501 | if (atomic_inc_and_test(&page->_mapcount)) |
540 | __inc_page_state(nr_mapped); | 502 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
541 | } | 503 | } |
542 | 504 | ||
543 | /** | 505 | /** |
@@ -569,7 +531,8 @@ void page_remove_rmap(struct page *page) | |||
569 | */ | 531 | */ |
570 | if (page_test_and_clear_dirty(page)) | 532 | if (page_test_and_clear_dirty(page)) |
571 | set_page_dirty(page); | 533 | set_page_dirty(page); |
572 | __dec_page_state(nr_mapped); | 534 | __dec_zone_page_state(page, |
535 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
573 | } | 536 | } |
574 | } | 537 | } |
575 | 538 | ||
@@ -578,7 +541,7 @@ void page_remove_rmap(struct page *page) | |||
578 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 541 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
579 | */ | 542 | */ |
580 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 543 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
581 | int ignore_refs) | 544 | int migration) |
582 | { | 545 | { |
583 | struct mm_struct *mm = vma->vm_mm; | 546 | struct mm_struct *mm = vma->vm_mm; |
584 | unsigned long address; | 547 | unsigned long address; |
@@ -600,9 +563,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
600 | * If it's recently referenced (perhaps page_referenced | 563 | * If it's recently referenced (perhaps page_referenced |
601 | * skipped over this mm) then we should reactivate it. | 564 | * skipped over this mm) then we should reactivate it. |
602 | */ | 565 | */ |
603 | if ((vma->vm_flags & VM_LOCKED) || | 566 | if (!migration && ((vma->vm_flags & VM_LOCKED) || |
604 | (ptep_clear_flush_young(vma, address, pte) | 567 | (ptep_clear_flush_young(vma, address, pte)))) { |
605 | && !ignore_refs)) { | ||
606 | ret = SWAP_FAIL; | 568 | ret = SWAP_FAIL; |
607 | goto out_unmap; | 569 | goto out_unmap; |
608 | } | 570 | } |
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
620 | 582 | ||
621 | if (PageAnon(page)) { | 583 | if (PageAnon(page)) { |
622 | swp_entry_t entry = { .val = page_private(page) }; | 584 | swp_entry_t entry = { .val = page_private(page) }; |
623 | /* | 585 | |
624 | * Store the swap location in the pte. | 586 | if (PageSwapCache(page)) { |
625 | * See handle_pte_fault() ... | 587 | /* |
626 | */ | 588 | * Store the swap location in the pte. |
627 | BUG_ON(!PageSwapCache(page)); | 589 | * See handle_pte_fault() ... |
628 | swap_duplicate(entry); | 590 | */ |
629 | if (list_empty(&mm->mmlist)) { | 591 | swap_duplicate(entry); |
630 | spin_lock(&mmlist_lock); | 592 | if (list_empty(&mm->mmlist)) { |
631 | if (list_empty(&mm->mmlist)) | 593 | spin_lock(&mmlist_lock); |
632 | list_add(&mm->mmlist, &init_mm.mmlist); | 594 | if (list_empty(&mm->mmlist)) |
633 | spin_unlock(&mmlist_lock); | 595 | list_add(&mm->mmlist, &init_mm.mmlist); |
596 | spin_unlock(&mmlist_lock); | ||
597 | } | ||
598 | dec_mm_counter(mm, anon_rss); | ||
599 | #ifdef CONFIG_MIGRATION | ||
600 | } else { | ||
601 | /* | ||
602 | * Store the pfn of the page in a special migration | ||
603 | * pte. do_swap_page() will wait until the migration | ||
604 | * pte is removed and then restart fault handling. | ||
605 | */ | ||
606 | BUG_ON(!migration); | ||
607 | entry = make_migration_entry(page, pte_write(pteval)); | ||
608 | #endif | ||
634 | } | 609 | } |
635 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 610 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
636 | BUG_ON(pte_file(*pte)); | 611 | BUG_ON(pte_file(*pte)); |
637 | dec_mm_counter(mm, anon_rss); | ||
638 | } else | 612 | } else |
613 | #ifdef CONFIG_MIGRATION | ||
614 | if (migration) { | ||
615 | /* Establish migration entry for a file page */ | ||
616 | swp_entry_t entry; | ||
617 | entry = make_migration_entry(page, pte_write(pteval)); | ||
618 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | ||
619 | } else | ||
620 | #endif | ||
639 | dec_mm_counter(mm, file_rss); | 621 | dec_mm_counter(mm, file_rss); |
640 | 622 | ||
623 | |||
641 | page_remove_rmap(page); | 624 | page_remove_rmap(page); |
642 | page_cache_release(page); | 625 | page_cache_release(page); |
643 | 626 | ||
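The new branch above parks a migration entry in the pte instead of a swap entry. On the consumer side, a fault that hits such an entry waits for the migration to finish and then retries, roughly as the fault path does via the swapops helpers (sketch, not the literal mm/memory.c code; entry, orig_pte, mm, pmd and address come from the fault context):

	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry)) {
		migration_entry_wait(mm, pmd, address);	/* sleep until the pte is restored */
		goto out;				/* then let the fault be retried */
	}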
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
736 | pte_unmap_unlock(pte - 1, ptl); | 719 | pte_unmap_unlock(pte - 1, ptl); |
737 | } | 720 | } |
738 | 721 | ||
739 | static int try_to_unmap_anon(struct page *page, int ignore_refs) | 722 | static int try_to_unmap_anon(struct page *page, int migration) |
740 | { | 723 | { |
741 | struct anon_vma *anon_vma; | 724 | struct anon_vma *anon_vma; |
742 | struct vm_area_struct *vma; | 725 | struct vm_area_struct *vma; |
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) | |||
747 | return ret; | 730 | return ret; |
748 | 731 | ||
749 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 732 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
750 | ret = try_to_unmap_one(page, vma, ignore_refs); | 733 | ret = try_to_unmap_one(page, vma, migration); |
751 | if (ret == SWAP_FAIL || !page_mapped(page)) | 734 | if (ret == SWAP_FAIL || !page_mapped(page)) |
752 | break; | 735 | break; |
753 | } | 736 | } |
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) | |||
764 | * | 747 | * |
765 | * This function is only called from try_to_unmap for object-based pages. | 748 | * This function is only called from try_to_unmap for object-based pages. |
766 | */ | 749 | */ |
767 | static int try_to_unmap_file(struct page *page, int ignore_refs) | 750 | static int try_to_unmap_file(struct page *page, int migration) |
768 | { | 751 | { |
769 | struct address_space *mapping = page->mapping; | 752 | struct address_space *mapping = page->mapping; |
770 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 753 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
778 | 761 | ||
779 | spin_lock(&mapping->i_mmap_lock); | 762 | spin_lock(&mapping->i_mmap_lock); |
780 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 763 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
781 | ret = try_to_unmap_one(page, vma, ignore_refs); | 764 | ret = try_to_unmap_one(page, vma, migration); |
782 | if (ret == SWAP_FAIL || !page_mapped(page)) | 765 | if (ret == SWAP_FAIL || !page_mapped(page)) |
783 | goto out; | 766 | goto out; |
784 | } | 767 | } |
@@ -788,7 +771,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
788 | 771 | ||
789 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 772 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
790 | shared.vm_set.list) { | 773 | shared.vm_set.list) { |
791 | if (vma->vm_flags & VM_LOCKED) | 774 | if ((vma->vm_flags & VM_LOCKED) && !migration) |
792 | continue; | 775 | continue; |
793 | cursor = (unsigned long) vma->vm_private_data; | 776 | cursor = (unsigned long) vma->vm_private_data; |
794 | if (cursor > max_nl_cursor) | 777 | if (cursor > max_nl_cursor) |
@@ -822,7 +805,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
822 | do { | 805 | do { |
823 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 806 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
824 | shared.vm_set.list) { | 807 | shared.vm_set.list) { |
825 | if (vma->vm_flags & VM_LOCKED) | 808 | if ((vma->vm_flags & VM_LOCKED) && !migration) |
826 | continue; | 809 | continue; |
827 | cursor = (unsigned long) vma->vm_private_data; | 810 | cursor = (unsigned long) vma->vm_private_data; |
828 | while ( cursor < max_nl_cursor && | 811 | while ( cursor < max_nl_cursor && |
@@ -863,16 +846,16 @@ out: | |||
863 | * SWAP_AGAIN - we missed a mapping, try again later | 846 | * SWAP_AGAIN - we missed a mapping, try again later |
864 | * SWAP_FAIL - the page is unswappable | 847 | * SWAP_FAIL - the page is unswappable |
865 | */ | 848 | */ |
866 | int try_to_unmap(struct page *page, int ignore_refs) | 849 | int try_to_unmap(struct page *page, int migration) |
867 | { | 850 | { |
868 | int ret; | 851 | int ret; |
869 | 852 | ||
870 | BUG_ON(!PageLocked(page)); | 853 | BUG_ON(!PageLocked(page)); |
871 | 854 | ||
872 | if (PageAnon(page)) | 855 | if (PageAnon(page)) |
873 | ret = try_to_unmap_anon(page, ignore_refs); | 856 | ret = try_to_unmap_anon(page, migration); |
874 | else | 857 | else |
875 | ret = try_to_unmap_file(page, ignore_refs); | 858 | ret = try_to_unmap_file(page, migration); |
876 | 859 | ||
877 | if (!page_mapped(page)) | 860 | if (!page_mapped(page)) |
878 | ret = SWAP_SUCCESS; | 861 | ret = SWAP_SUCCESS; |
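The comment above documents the try_to_unmap() return codes; a reclaim-style caller typically branches on them as in this hedged sketch (the labels are placeholders, not the literal mm/vmscan.c code):

	if (page_mapped(page) && page->mapping) {
		switch (try_to_unmap(page, 0)) {	/* 0: ordinary reclaim, not migration */
		case SWAP_FAIL:
			goto activate_locked;		/* keep the page; it is busy or mlocked */
		case SWAP_AGAIN:
			goto keep_locked;		/* a mapping was missed, retry later */
		case SWAP_SUCCESS:
			break;				/* fully unmapped, try to free it below */
		}
	}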
diff --git a/mm/shmem.c b/mm/shmem.c index 1e43c8a865ba..db21c51531ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -23,10 +23,8 @@ | |||
23 | * which makes it a completely usable filesystem. | 23 | * which makes it a completely usable filesystem. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/config.h> | ||
27 | #include <linux/module.h> | 26 | #include <linux/module.h> |
28 | #include <linux/init.h> | 27 | #include <linux/init.h> |
29 | #include <linux/devfs_fs_kernel.h> | ||
30 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
31 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
32 | #include <linux/mman.h> | 30 | #include <linux/mman.h> |
@@ -174,7 +172,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
174 | } | 172 | } |
175 | 173 | ||
176 | static struct super_operations shmem_ops; | 174 | static struct super_operations shmem_ops; |
177 | static struct address_space_operations shmem_aops; | 175 | static const struct address_space_operations shmem_aops; |
178 | static struct file_operations shmem_file_operations; | 176 | static struct file_operations shmem_file_operations; |
179 | static struct inode_operations shmem_inode_operations; | 177 | static struct inode_operations shmem_inode_operations; |
180 | static struct inode_operations shmem_dir_inode_operations; | 178 | static struct inode_operations shmem_dir_inode_operations; |
@@ -1046,12 +1044,12 @@ repeat: | |||
1046 | swappage = lookup_swap_cache(swap); | 1044 | swappage = lookup_swap_cache(swap); |
1047 | if (!swappage) { | 1045 | if (!swappage) { |
1048 | shmem_swp_unmap(entry); | 1046 | shmem_swp_unmap(entry); |
1049 | spin_unlock(&info->lock); | ||
1050 | /* here we actually do the io */ | 1047 | /* here we actually do the io */ |
1051 | if (type && *type == VM_FAULT_MINOR) { | 1048 | if (type && *type == VM_FAULT_MINOR) { |
1052 | inc_page_state(pgmajfault); | 1049 | __count_vm_event(PGMAJFAULT); |
1053 | *type = VM_FAULT_MAJOR; | 1050 | *type = VM_FAULT_MAJOR; |
1054 | } | 1051 | } |
1052 | spin_unlock(&info->lock); | ||
1055 | swappage = shmem_swapin(info, swap, idx); | 1053 | swappage = shmem_swapin(info, swap, idx); |
1056 | if (!swappage) { | 1054 | if (!swappage) { |
1057 | spin_lock(&info->lock); | 1055 | spin_lock(&info->lock); |
@@ -1081,14 +1079,6 @@ repeat: | |||
1081 | page_cache_release(swappage); | 1079 | page_cache_release(swappage); |
1082 | goto repeat; | 1080 | goto repeat; |
1083 | } | 1081 | } |
1084 | if (!PageSwapCache(swappage)) { | ||
1085 | /* Page migration has occured */ | ||
1086 | shmem_swp_unmap(entry); | ||
1087 | spin_unlock(&info->lock); | ||
1088 | unlock_page(swappage); | ||
1089 | page_cache_release(swappage); | ||
1090 | goto repeat; | ||
1091 | } | ||
1092 | if (PageWriteback(swappage)) { | 1082 | if (PageWriteback(swappage)) { |
1093 | shmem_swp_unmap(entry); | 1083 | shmem_swp_unmap(entry); |
1094 | spin_unlock(&info->lock); | 1084 | spin_unlock(&info->lock); |
@@ -1654,9 +1644,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, | |||
1654 | return desc.error; | 1644 | return desc.error; |
1655 | } | 1645 | } |
1656 | 1646 | ||
1657 | static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) | 1647 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1658 | { | 1648 | { |
1659 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 1649 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
1660 | 1650 | ||
1661 | buf->f_type = TMPFS_MAGIC; | 1651 | buf->f_type = TMPFS_MAGIC; |
1662 | buf->f_bsize = PAGE_CACHE_SIZE; | 1652 | buf->f_bsize = PAGE_CACHE_SIZE; |
@@ -2170,7 +2160,7 @@ static void destroy_inodecache(void) | |||
2170 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); | 2160 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); |
2171 | } | 2161 | } |
2172 | 2162 | ||
2173 | static struct address_space_operations shmem_aops = { | 2163 | static const struct address_space_operations shmem_aops = { |
2174 | .writepage = shmem_writepage, | 2164 | .writepage = shmem_writepage, |
2175 | .set_page_dirty = __set_page_dirty_nobuffers, | 2165 | .set_page_dirty = __set_page_dirty_nobuffers, |
2176 | #ifdef CONFIG_TMPFS | 2166 | #ifdef CONFIG_TMPFS |
@@ -2233,10 +2223,10 @@ static struct vm_operations_struct shmem_vm_ops = { | |||
2233 | }; | 2223 | }; |
2234 | 2224 | ||
2235 | 2225 | ||
2236 | static struct super_block *shmem_get_sb(struct file_system_type *fs_type, | 2226 | static int shmem_get_sb(struct file_system_type *fs_type, |
2237 | int flags, const char *dev_name, void *data) | 2227 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
2238 | { | 2228 | { |
2239 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super); | 2229 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); |
2240 | } | 2230 | } |
2241 | 2231 | ||
2242 | static struct file_system_type tmpfs_fs_type = { | 2232 | static struct file_system_type tmpfs_fs_type = { |
@@ -2260,10 +2250,8 @@ static int __init init_tmpfs(void) | |||
2260 | printk(KERN_ERR "Could not register tmpfs\n"); | 2250 | printk(KERN_ERR "Could not register tmpfs\n"); |
2261 | goto out2; | 2251 | goto out2; |
2262 | } | 2252 | } |
2263 | #ifdef CONFIG_TMPFS | 2253 | |
2264 | devfs_mk_dir("shm"); | 2254 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, |
2265 | #endif | ||
2266 | shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER, | ||
2267 | tmpfs_fs_type.name, NULL); | 2255 | tmpfs_fs_type.name, NULL); |
2268 | if (IS_ERR(shm_mnt)) { | 2256 | if (IS_ERR(shm_mnt)) { |
2269 | error = PTR_ERR(shm_mnt); | 2257 | error = PTR_ERR(shm_mnt); |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -89,6 +89,7 @@ | |||
89 | #include <linux/config.h> | 89 | #include <linux/config.h> |
90 | #include <linux/slab.h> | 90 | #include <linux/slab.h> |
91 | #include <linux/mm.h> | 91 | #include <linux/mm.h> |
92 | #include <linux/poison.h> | ||
92 | #include <linux/swap.h> | 93 | #include <linux/swap.h> |
93 | #include <linux/cache.h> | 94 | #include <linux/cache.h> |
94 | #include <linux/interrupt.h> | 95 | #include <linux/interrupt.h> |
@@ -106,6 +107,7 @@ | |||
106 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> |
107 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> |
108 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> |
110 | #include <linux/rtmutex.h> | ||
109 | 111 | ||
110 | #include <asm/uaccess.h> | 112 | #include <asm/uaccess.h> |
111 | #include <asm/cacheflush.h> | 113 | #include <asm/cacheflush.h> |
@@ -307,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | |||
307 | #define SIZE_AC 1 | 309 | #define SIZE_AC 1 |
308 | #define SIZE_L3 (1 + MAX_NUMNODES) | 310 | #define SIZE_L3 (1 + MAX_NUMNODES) |
309 | 311 | ||
312 | static int drain_freelist(struct kmem_cache *cache, | ||
313 | struct kmem_list3 *l3, int tofree); | ||
314 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | ||
315 | int node); | ||
316 | static void enable_cpucache(struct kmem_cache *cachep); | ||
317 | static void cache_reap(void *unused); | ||
318 | |||
310 | /* | 319 | /* |
311 | * This function must be completely optimized away if a constant is passed to | 320 | * This function must be completely optimized away if a constant is passed to |
312 | * it. Mostly the same as what is in linux/slab.h except it returns an index. | 321 | * it. Mostly the same as what is in linux/slab.h except it returns an index. |
@@ -331,6 +340,8 @@ static __always_inline int index_of(const size_t size) | |||
331 | return 0; | 340 | return 0; |
332 | } | 341 | } |
333 | 342 | ||
343 | static int slab_early_init = 1; | ||
344 | |||
334 | #define INDEX_AC index_of(sizeof(struct arraycache_init)) | 345 | #define INDEX_AC index_of(sizeof(struct arraycache_init)) |
335 | #define INDEX_L3 index_of(sizeof(struct kmem_list3)) | 346 | #define INDEX_L3 index_of(sizeof(struct kmem_list3)) |
336 | 347 | ||
@@ -452,7 +463,7 @@ struct kmem_cache { | |||
452 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) | 463 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) |
453 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 464 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) |
454 | #define STATS_INC_GROWN(x) ((x)->grown++) | 465 | #define STATS_INC_GROWN(x) ((x)->grown++) |
455 | #define STATS_INC_REAPED(x) ((x)->reaped++) | 466 | #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) |
456 | #define STATS_SET_HIGH(x) \ | 467 | #define STATS_SET_HIGH(x) \ |
457 | do { \ | 468 | do { \ |
458 | if ((x)->num_active > (x)->high_mark) \ | 469 | if ((x)->num_active > (x)->high_mark) \ |
@@ -476,7 +487,7 @@ struct kmem_cache { | |||
476 | #define STATS_DEC_ACTIVE(x) do { } while (0) | 487 | #define STATS_DEC_ACTIVE(x) do { } while (0) |
477 | #define STATS_INC_ALLOCED(x) do { } while (0) | 488 | #define STATS_INC_ALLOCED(x) do { } while (0) |
478 | #define STATS_INC_GROWN(x) do { } while (0) | 489 | #define STATS_INC_GROWN(x) do { } while (0) |
479 | #define STATS_INC_REAPED(x) do { } while (0) | 490 | #define STATS_ADD_REAPED(x,y) do { } while (0) |
480 | #define STATS_SET_HIGH(x) do { } while (0) | 491 | #define STATS_SET_HIGH(x) do { } while (0) |
481 | #define STATS_INC_ERR(x) do { } while (0) | 492 | #define STATS_INC_ERR(x) do { } while (0) |
482 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 493 | #define STATS_INC_NODEALLOCS(x) do { } while (0) |
@@ -490,17 +501,6 @@ struct kmem_cache { | |||
490 | #endif | 501 | #endif |
491 | 502 | ||
492 | #if DEBUG | 503 | #if DEBUG |
493 | /* | ||
494 | * Magic nums for obj red zoning. | ||
495 | * Placed in the first word before and the first word after an obj. | ||
496 | */ | ||
497 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ | ||
498 | #define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ | ||
499 | |||
500 | /* ...and for poisoning */ | ||
501 | #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ | ||
502 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ | ||
503 | #define POISON_END 0xa5 /* end-byte of poisoning */ | ||
504 | 504 | ||
505 | /* | 505 | /* |
506 | * memory layout of objects: | 506 | * memory layout of objects: |
@@ -592,6 +592,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page) | |||
592 | { | 592 | { |
593 | if (unlikely(PageCompound(page))) | 593 | if (unlikely(PageCompound(page))) |
594 | page = (struct page *)page_private(page); | 594 | page = (struct page *)page_private(page); |
595 | BUG_ON(!PageSlab(page)); | ||
595 | return (struct kmem_cache *)page->lru.next; | 596 | return (struct kmem_cache *)page->lru.next; |
596 | } | 597 | } |
597 | 598 | ||
@@ -604,6 +605,7 @@ static inline struct slab *page_get_slab(struct page *page) | |||
604 | { | 605 | { |
605 | if (unlikely(PageCompound(page))) | 606 | if (unlikely(PageCompound(page))) |
606 | page = (struct page *)page_private(page); | 607 | page = (struct page *)page_private(page); |
608 | BUG_ON(!PageSlab(page)); | ||
607 | return (struct slab *)page->lru.prev; | 609 | return (struct slab *)page->lru.prev; |
608 | } | 610 | } |
609 | 611 | ||
@@ -705,12 +707,6 @@ int slab_is_available(void) | |||
705 | 707 | ||
706 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 708 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
707 | 709 | ||
708 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | ||
709 | int node); | ||
710 | static void enable_cpucache(struct kmem_cache *cachep); | ||
711 | static void cache_reap(void *unused); | ||
712 | static int __node_shrink(struct kmem_cache *cachep, int node); | ||
713 | |||
714 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 710 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
715 | { | 711 | { |
716 | return cachep->array[smp_processor_id()]; | 712 | return cachep->array[smp_processor_id()]; |
@@ -1024,6 +1020,40 @@ static void drain_alien_cache(struct kmem_cache *cachep, | |||
1024 | } | 1020 | } |
1025 | } | 1021 | } |
1026 | } | 1022 | } |
1023 | |||
1024 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1025 | { | ||
1026 | struct slab *slabp = virt_to_slab(objp); | ||
1027 | int nodeid = slabp->nodeid; | ||
1028 | struct kmem_list3 *l3; | ||
1029 | struct array_cache *alien = NULL; | ||
1030 | |||
1031 | /* | ||
1032 | * Make sure we are not freeing an object from another node to the array | ||
1033 | * cache on this cpu. | ||
1034 | */ | ||
1035 | if (likely(slabp->nodeid == numa_node_id())) | ||
1036 | return 0; | ||
1037 | |||
1038 | l3 = cachep->nodelists[numa_node_id()]; | ||
1039 | STATS_INC_NODEFREES(cachep); | ||
1040 | if (l3->alien && l3->alien[nodeid]) { | ||
1041 | alien = l3->alien[nodeid]; | ||
1042 | spin_lock(&alien->lock); | ||
1043 | if (unlikely(alien->avail == alien->limit)) { | ||
1044 | STATS_INC_ACOVERFLOW(cachep); | ||
1045 | __drain_alien_cache(cachep, alien, nodeid); | ||
1046 | } | ||
1047 | alien->entry[alien->avail++] = objp; | ||
1048 | spin_unlock(&alien->lock); | ||
1049 | } else { | ||
1050 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | ||
1051 | free_block(cachep, &objp, 1, nodeid); | ||
1052 | spin_unlock(&(cachep->nodelists[nodeid])->list_lock); | ||
1053 | } | ||
1054 | return 1; | ||
1055 | } | ||
1056 | |||
1027 | #else | 1057 | #else |
1028 | 1058 | ||
1029 | #define drain_alien_cache(cachep, alien) do { } while (0) | 1059 | #define drain_alien_cache(cachep, alien) do { } while (0) |
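For readers less familiar with the NUMA slab path, the cache_free_alien() helper consolidated above decides between three outcomes: a local fast-path free, batching the object into a per-node "alien" array that is flushed when it fills, or a direct locked free on the remote node when no alien cache exists. The stand-alone sketch below models only that control flow; the structure, the four-entry limit, and the printouts are illustrative stand-ins, not the kernel's types.

```c
/*
 * Simplified, user-space model of the cache_free_alien() decision above.
 * The alien_cache structure, its size, and the printf "drains" are
 * stand-ins; only the control flow mirrors the kernel code.
 */
#include <stdio.h>

#define ALIEN_LIMIT 4                   /* illustrative batch size */

struct alien_cache {
	int avail;                      /* objects currently batched */
	void *entry[ALIEN_LIMIT];       /* remote objects awaiting a flush */
};

/* Flush batched objects back to their home node (stubbed out here). */
static void drain_alien(struct alien_cache *alien, int nodeid)
{
	printf("draining %d object(s) to node %d\n", alien->avail, nodeid);
	alien->avail = 0;
}

/*
 * Returns 0 when the object is local (caller keeps the fast path),
 * 1 when the free was handled as a cross-node free.
 */
static int model_cache_free_alien(void *obj, int obj_node, int this_node,
				  struct alien_cache *alien)
{
	if (obj_node == this_node)
		return 0;                       /* local: fast path */

	if (alien) {
		if (alien->avail == ALIEN_LIMIT)
			drain_alien(alien, obj_node);   /* batch is full */
		alien->entry[alien->avail++] = obj;     /* batch the free */
	} else {
		/* no alien cache set up: free directly on the remote node */
		printf("direct remote free of %p on node %d\n", obj, obj_node);
	}
	return 1;
}

int main(void)
{
	struct alien_cache alien = { 0 };
	int dummy[6];

	for (int i = 0; i < 6; i++)
		model_cache_free_alien(&dummy[i], 1 /* obj node */,
				       0 /* this node */, &alien);
	return 0;
}
```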
@@ -1038,9 +1068,14 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) | |||
1038 | { | 1068 | { |
1039 | } | 1069 | } |
1040 | 1070 | ||
1071 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1072 | { | ||
1073 | return 0; | ||
1074 | } | ||
1075 | |||
1041 | #endif | 1076 | #endif |
1042 | 1077 | ||
1043 | static int cpuup_callback(struct notifier_block *nfb, | 1078 | static int __devinit cpuup_callback(struct notifier_block *nfb, |
1044 | unsigned long action, void *hcpu) | 1079 | unsigned long action, void *hcpu) |
1045 | { | 1080 | { |
1046 | long cpu = (long)hcpu; | 1081 | long cpu = (long)hcpu; |
@@ -1207,10 +1242,7 @@ free_array_cache: | |||
1207 | l3 = cachep->nodelists[node]; | 1242 | l3 = cachep->nodelists[node]; |
1208 | if (!l3) | 1243 | if (!l3) |
1209 | continue; | 1244 | continue; |
1210 | spin_lock_irq(&l3->list_lock); | 1245 | drain_freelist(cachep, l3, l3->free_objects); |
1211 | /* free slabs belonging to this node */ | ||
1212 | __node_shrink(cachep, node); | ||
1213 | spin_unlock_irq(&l3->list_lock); | ||
1214 | } | 1246 | } |
1215 | mutex_unlock(&cache_chain_mutex); | 1247 | mutex_unlock(&cache_chain_mutex); |
1216 | break; | 1248 | break; |
@@ -1222,7 +1254,9 @@ bad: | |||
1222 | return NOTIFY_BAD; | 1254 | return NOTIFY_BAD; |
1223 | } | 1255 | } |
1224 | 1256 | ||
1225 | static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | 1257 | static struct notifier_block __cpuinitdata cpucache_notifier = { |
1258 | &cpuup_callback, NULL, 0 | ||
1259 | }; | ||
1226 | 1260 | ||
1227 | /* | 1261 | /* |
1228 | * swap the static kmem_list3 with kmalloced memory | 1262 | * swap the static kmem_list3 with kmalloced memory |
@@ -1335,6 +1369,8 @@ void __init kmem_cache_init(void) | |||
1335 | NULL, NULL); | 1369 | NULL, NULL); |
1336 | } | 1370 | } |
1337 | 1371 | ||
1372 | slab_early_init = 0; | ||
1373 | |||
1338 | while (sizes->cs_size != ULONG_MAX) { | 1374 | while (sizes->cs_size != ULONG_MAX) { |
1339 | /* | 1375 | /* |
1340 | * For performance, all the general caches are L1 aligned. | 1376 | * For performance, all the general caches are L1 aligned. |
@@ -1450,31 +1486,29 @@ __initcall(cpucache_init); | |||
1450 | static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 1486 | static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
1451 | { | 1487 | { |
1452 | struct page *page; | 1488 | struct page *page; |
1453 | void *addr; | 1489 | int nr_pages; |
1454 | int i; | 1490 | int i; |
1455 | 1491 | ||
1456 | flags |= cachep->gfpflags; | ||
1457 | #ifndef CONFIG_MMU | 1492 | #ifndef CONFIG_MMU |
1458 | /* nommu uses slab's for process anonymous memory allocations, so | 1493 | /* |
1459 | * requires __GFP_COMP to properly refcount higher order allocations" | 1494 | * Nommu uses slabs for process anonymous memory allocations, and thus |
1495 | * requires __GFP_COMP to properly refcount higher order allocations | ||
1460 | */ | 1496 | */ |
1461 | page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); | 1497 | flags |= __GFP_COMP; |
1462 | #else | ||
1463 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
1464 | #endif | 1498 | #endif |
1499 | flags |= cachep->gfpflags; | ||
1500 | |||
1501 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
1465 | if (!page) | 1502 | if (!page) |
1466 | return NULL; | 1503 | return NULL; |
1467 | addr = page_address(page); | ||
1468 | 1504 | ||
1469 | i = (1 << cachep->gfporder); | 1505 | nr_pages = (1 << cachep->gfporder); |
1470 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1506 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1471 | atomic_add(i, &slab_reclaim_pages); | 1507 | atomic_add(nr_pages, &slab_reclaim_pages); |
1472 | add_page_state(nr_slab, i); | 1508 | add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); |
1473 | while (i--) { | 1509 | for (i = 0; i < nr_pages; i++) |
1474 | __SetPageSlab(page); | 1510 | __SetPageSlab(page + i); |
1475 | page++; | 1511 | return page_address(page); |
1476 | } | ||
1477 | return addr; | ||
1478 | } | 1512 | } |
1479 | 1513 | ||
1480 | /* | 1514 | /* |
@@ -1486,12 +1520,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1486 | struct page *page = virt_to_page(addr); | 1520 | struct page *page = virt_to_page(addr); |
1487 | const unsigned long nr_freed = i; | 1521 | const unsigned long nr_freed = i; |
1488 | 1522 | ||
1523 | sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); | ||
1489 | while (i--) { | 1524 | while (i--) { |
1490 | BUG_ON(!PageSlab(page)); | 1525 | BUG_ON(!PageSlab(page)); |
1491 | __ClearPageSlab(page); | 1526 | __ClearPageSlab(page); |
1492 | page++; | 1527 | page++; |
1493 | } | 1528 | } |
1494 | sub_page_state(nr_slab, nr_freed); | ||
1495 | if (current->reclaim_state) | 1529 | if (current->reclaim_state) |
1496 | current->reclaim_state->reclaimed_slab += nr_freed; | 1530 | current->reclaim_state->reclaimed_slab += nr_freed; |
1497 | free_pages((unsigned long)addr, cachep->gfporder); | 1531 | free_pages((unsigned long)addr, cachep->gfporder); |
@@ -1913,8 +1947,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1913 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | 1947 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) |
1914 | { | 1948 | { |
1915 | size_t left_over, slab_size, ralign; | 1949 | size_t left_over, slab_size, ralign; |
1916 | struct kmem_cache *cachep = NULL; | 1950 | struct kmem_cache *cachep = NULL, *pc; |
1917 | struct list_head *p; | ||
1918 | 1951 | ||
1919 | /* | 1952 | /* |
1920 | * Sanity checks... these are all serious usage bugs. | 1953 | * Sanity checks... these are all serious usage bugs. |
@@ -1934,8 +1967,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1934 | 1967 | ||
1935 | mutex_lock(&cache_chain_mutex); | 1968 | mutex_lock(&cache_chain_mutex); |
1936 | 1969 | ||
1937 | list_for_each(p, &cache_chain) { | 1970 | list_for_each_entry(pc, &cache_chain, next) { |
1938 | struct kmem_cache *pc = list_entry(p, struct kmem_cache, next); | ||
1939 | mm_segment_t old_fs = get_fs(); | 1971 | mm_segment_t old_fs = get_fs(); |
1940 | char tmp; | 1972 | char tmp; |
1941 | int res; | 1973 | int res; |
@@ -2069,8 +2101,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2069 | #endif | 2101 | #endif |
2070 | #endif | 2102 | #endif |
2071 | 2103 | ||
2072 | /* Determine if the slab management is 'on' or 'off' slab. */ | 2104 | /* |
2073 | if (size >= (PAGE_SIZE >> 3)) | 2105 | * Determine if the slab management is 'on' or 'off' slab. |
2106 | * (bootstrapping cannot cope with offslab caches so don't do | ||
2107 | * it too early on.) | ||
2108 | */ | ||
2109 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | ||
2074 | /* | 2110 | /* |
2075 | * Size is large, assume best to place the slab management obj | 2111 | * Size is large, assume best to place the slab management obj |
2076 | * off-slab (should allow better packing of objs). | 2112 | * off-slab (should allow better packing of objs). |
@@ -2210,32 +2246,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
2210 | } | 2246 | } |
2211 | } | 2247 | } |
2212 | 2248 | ||
2213 | static int __node_shrink(struct kmem_cache *cachep, int node) | 2249 | /* |
2250 | * Remove slabs from the list of free slabs. | ||
2251 | * Specify the number of slabs to drain in tofree. | ||
2252 | * | ||
2253 | * Returns the actual number of slabs released. | ||
2254 | */ | ||
2255 | static int drain_freelist(struct kmem_cache *cache, | ||
2256 | struct kmem_list3 *l3, int tofree) | ||
2214 | { | 2257 | { |
2258 | struct list_head *p; | ||
2259 | int nr_freed; | ||
2215 | struct slab *slabp; | 2260 | struct slab *slabp; |
2216 | struct kmem_list3 *l3 = cachep->nodelists[node]; | ||
2217 | int ret; | ||
2218 | 2261 | ||
2219 | for (;;) { | 2262 | nr_freed = 0; |
2220 | struct list_head *p; | 2263 | while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { |
2221 | 2264 | ||
2265 | spin_lock_irq(&l3->list_lock); | ||
2222 | p = l3->slabs_free.prev; | 2266 | p = l3->slabs_free.prev; |
2223 | if (p == &l3->slabs_free) | 2267 | if (p == &l3->slabs_free) { |
2224 | break; | 2268 | spin_unlock_irq(&l3->list_lock); |
2269 | goto out; | ||
2270 | } | ||
2225 | 2271 | ||
2226 | slabp = list_entry(l3->slabs_free.prev, struct slab, list); | 2272 | slabp = list_entry(p, struct slab, list); |
2227 | #if DEBUG | 2273 | #if DEBUG |
2228 | BUG_ON(slabp->inuse); | 2274 | BUG_ON(slabp->inuse); |
2229 | #endif | 2275 | #endif |
2230 | list_del(&slabp->list); | 2276 | list_del(&slabp->list); |
2231 | 2277 | /* | |
2232 | l3->free_objects -= cachep->num; | 2278 | * Safe to drop the lock. The slab is no longer linked |
2279 | * to the cache. | ||
2280 | */ | ||
2281 | l3->free_objects -= cache->num; | ||
2233 | spin_unlock_irq(&l3->list_lock); | 2282 | spin_unlock_irq(&l3->list_lock); |
2234 | slab_destroy(cachep, slabp); | 2283 | slab_destroy(cache, slabp); |
2235 | spin_lock_irq(&l3->list_lock); | 2284 | nr_freed++; |
2236 | } | 2285 | } |
2237 | ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); | 2286 | out: |
2238 | return ret; | 2287 | return nr_freed; |
2239 | } | 2288 | } |
2240 | 2289 | ||
2241 | static int __cache_shrink(struct kmem_cache *cachep) | 2290 | static int __cache_shrink(struct kmem_cache *cachep) |
@@ -2248,11 +2297,13 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2248 | check_irq_on(); | 2297 | check_irq_on(); |
2249 | for_each_online_node(i) { | 2298 | for_each_online_node(i) { |
2250 | l3 = cachep->nodelists[i]; | 2299 | l3 = cachep->nodelists[i]; |
2251 | if (l3) { | 2300 | if (!l3) |
2252 | spin_lock_irq(&l3->list_lock); | 2301 | continue; |
2253 | ret += __node_shrink(cachep, i); | 2302 | |
2254 | spin_unlock_irq(&l3->list_lock); | 2303 | drain_freelist(cachep, l3, l3->free_objects); |
2255 | } | 2304 | |
2305 | ret += !list_empty(&l3->slabs_full) || | ||
2306 | !list_empty(&l3->slabs_partial); | ||
2256 | } | 2307 | } |
2257 | return (ret ? 1 : 0); | 2308 | return (ret ? 1 : 0); |
2258 | } | 2309 | } |
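The drain_freelist() rework above, and its new caller in __cache_shrink(), follow one pattern: hold the list lock only long enough to unlink a single free slab, destroy that slab with the lock dropped, and count how many slabs were actually released. A rough user-space model of that loop, with a plain singly linked list and a pthread mutex standing in for the kernel structures (build with cc -pthread):

```c
/* User-space sketch of the drain_freelist() locking pattern. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_slab {
	struct fake_slab *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fake_slab *slabs_free;    /* stand-in for l3->slabs_free */

static int drain_freelist_model(int tofree)
{
	int nr_freed = 0;

	while (nr_freed < tofree) {
		struct fake_slab *slabp;

		pthread_mutex_lock(&list_lock);
		slabp = slabs_free;
		if (!slabp) {                   /* list emptied under us */
			pthread_mutex_unlock(&list_lock);
			break;
		}
		slabs_free = slabp->next;       /* unlink while locked */
		pthread_mutex_unlock(&list_lock);

		free(slabp);                    /* "destroy" without the lock */
		nr_freed++;
	}
	return nr_freed;                        /* slabs actually released */
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		struct fake_slab *s = malloc(sizeof(*s));
		s->next = slabs_free;
		slabs_free = s;
	}
	printf("freed %d slab(s)\n", drain_freelist_model(3));  /* 3 */
	printf("freed %d more\n", drain_freelist_model(10));    /* 2 */
	return 0;
}
```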
@@ -2460,23 +2511,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
2460 | slabp->inuse--; | 2511 | slabp->inuse--; |
2461 | } | 2512 | } |
2462 | 2513 | ||
2463 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, | 2514 | /* |
2464 | void *objp) | 2515 | * Map pages beginning at addr to the given cache and slab. This is required |
2516 | * for the slab allocator to be able to lookup the cache and slab of a | ||
2517 | * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. | ||
2518 | */ | ||
2519 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | ||
2520 | void *addr) | ||
2465 | { | 2521 | { |
2466 | int i; | 2522 | int nr_pages; |
2467 | struct page *page; | 2523 | struct page *page; |
2468 | 2524 | ||
2469 | /* Nasty!!!!!! I hope this is OK. */ | 2525 | page = virt_to_page(addr); |
2470 | page = virt_to_page(objp); | ||
2471 | 2526 | ||
2472 | i = 1; | 2527 | nr_pages = 1; |
2473 | if (likely(!PageCompound(page))) | 2528 | if (likely(!PageCompound(page))) |
2474 | i <<= cachep->gfporder; | 2529 | nr_pages <<= cache->gfporder; |
2530 | |||
2475 | do { | 2531 | do { |
2476 | page_set_cache(page, cachep); | 2532 | page_set_cache(page, cache); |
2477 | page_set_slab(page, slabp); | 2533 | page_set_slab(page, slab); |
2478 | page++; | 2534 | page++; |
2479 | } while (--i); | 2535 | } while (--nr_pages); |
2480 | } | 2536 | } |
2481 | 2537 | ||
2482 | /* | 2538 | /* |
@@ -2548,7 +2604,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2548 | goto opps1; | 2604 | goto opps1; |
2549 | 2605 | ||
2550 | slabp->nodeid = nodeid; | 2606 | slabp->nodeid = nodeid; |
2551 | set_slab_attr(cachep, slabp, objp); | 2607 | slab_map_pages(cachep, slabp, objp); |
2552 | 2608 | ||
2553 | cache_init_objs(cachep, slabp, ctor_flags); | 2609 | cache_init_objs(cachep, slabp, ctor_flags); |
2554 | 2610 | ||
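The renamed slab_map_pages() above, together with the page_get_cache()/page_get_slab() helpers earlier in the file, forms a reverse mapping: every backing page records which cache and slab it belongs to, so a later kfree() or ksize() can recover both from nothing more than the object's address. The toy model below shows the idea with a flat array standing in for struct page; the shift and array size are made up.

```c
/* Toy model of slab_map_pages() and page_get_cache(). */
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12             /* 4K "pages", illustrative */
#define MODEL_NR_PAGES   16

struct model_page {
	void *cache;                    /* stands in for page->lru.next */
	void *slab;                     /* stands in for page->lru.prev */
};

static struct model_page pages[MODEL_NR_PAGES];

/* Record the owning cache/slab in every page backing the allocation. */
static void model_map_pages(void *cache, void *slab,
			    unsigned long addr, int order)
{
	unsigned long pfn = addr >> MODEL_PAGE_SHIFT;
	int nr_pages = 1 << order;

	for (int i = 0; i < nr_pages; i++) {
		pages[pfn + i].cache = cache;
		pages[pfn + i].slab = slab;
	}
}

/* Any address inside the allocation maps back to the same cache. */
static void *model_get_cache(unsigned long obj_addr)
{
	return pages[obj_addr >> MODEL_PAGE_SHIFT].cache;
}

int main(void)
{
	int my_cache, my_slab;                          /* dummy markers */
	unsigned long base = 4ul << MODEL_PAGE_SHIFT;   /* "pages" 4..7 */

	model_map_pages(&my_cache, &my_slab, base, 2);  /* order-2: 4 pages */

	printf("object in page 7 finds its cache: %d\n",
	       model_get_cache(base + 3 * (1ul << MODEL_PAGE_SHIFT) + 100)
	       == (void *)&my_cache);
	return 0;
}
```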
@@ -2596,6 +2652,28 @@ static void kfree_debugcheck(const void *objp) | |||
2596 | } | 2652 | } |
2597 | } | 2653 | } |
2598 | 2654 | ||
2655 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | ||
2656 | { | ||
2657 | unsigned long redzone1, redzone2; | ||
2658 | |||
2659 | redzone1 = *dbg_redzone1(cache, obj); | ||
2660 | redzone2 = *dbg_redzone2(cache, obj); | ||
2661 | |||
2662 | /* | ||
2663 | * Redzone is ok. | ||
2664 | */ | ||
2665 | if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) | ||
2666 | return; | ||
2667 | |||
2668 | if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) | ||
2669 | slab_error(cache, "double free detected"); | ||
2670 | else | ||
2671 | slab_error(cache, "memory outside object was overwritten"); | ||
2672 | |||
2673 | printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", | ||
2674 | obj, redzone1, redzone2); | ||
2675 | } | ||
2676 | |||
2599 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | 2677 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, |
2600 | void *caller) | 2678 | void *caller) |
2601 | { | 2679 | { |
@@ -2607,27 +2685,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2607 | kfree_debugcheck(objp); | 2685 | kfree_debugcheck(objp); |
2608 | page = virt_to_page(objp); | 2686 | page = virt_to_page(objp); |
2609 | 2687 | ||
2610 | if (page_get_cache(page) != cachep) { | ||
2611 | printk(KERN_ERR "mismatch in kmem_cache_free: expected " | ||
2612 | "cache %p, got %p\n", | ||
2613 | page_get_cache(page), cachep); | ||
2614 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | ||
2615 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), | ||
2616 | page_get_cache(page)->name); | ||
2617 | WARN_ON(1); | ||
2618 | } | ||
2619 | slabp = page_get_slab(page); | 2688 | slabp = page_get_slab(page); |
2620 | 2689 | ||
2621 | if (cachep->flags & SLAB_RED_ZONE) { | 2690 | if (cachep->flags & SLAB_RED_ZONE) { |
2622 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || | 2691 | verify_redzone_free(cachep, objp); |
2623 | *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | ||
2624 | slab_error(cachep, "double free, or memory outside" | ||
2625 | " object was overwritten"); | ||
2626 | printk(KERN_ERR "%p: redzone 1:0x%lx, " | ||
2627 | "redzone 2:0x%lx.\n", | ||
2628 | objp, *dbg_redzone1(cachep, objp), | ||
2629 | *dbg_redzone2(cachep, objp)); | ||
2630 | } | ||
2631 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2692 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2632 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2693 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2633 | } | 2694 | } |
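The verify_redzone_free() helper pulled out above also sharpens the diagnostic: both guard words already RED_INACTIVE means a double free, anything else that is not RED_ACTIVE/RED_ACTIVE means the memory outside the object was overwritten. Using the magic values documented in the comment block this patch relocates to linux/poison.h, the check can be rehearsed in isolation:

```c
/* Stand-alone rehearsal of the redzone-free check. */
#include <stdio.h>

#define RED_INACTIVE 0x5A2CF071UL       /* object is free */
#define RED_ACTIVE   0x170FC2A5UL       /* object is live */

static void check_redzones(unsigned long z1, unsigned long z2)
{
	if (z1 == RED_ACTIVE && z2 == RED_ACTIVE)
		return;                                 /* healthy free */
	if (z1 == RED_INACTIVE && z2 == RED_INACTIVE)
		printf("double free detected\n");
	else
		printf("memory outside object was overwritten\n");
}

int main(void)
{
	check_redzones(RED_ACTIVE, RED_ACTIVE);         /* normal free: quiet */
	check_redzones(RED_INACTIVE, RED_INACTIVE);     /* freed twice */
	check_redzones(RED_ACTIVE, 0x41414141UL);       /* overflow past object */
	return 0;
}
```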
@@ -3087,41 +3148,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3087 | check_irq_off(); | 3148 | check_irq_off(); |
3088 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3149 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3089 | 3150 | ||
3090 | /* Make sure we are not freeing a object from another | 3151 | if (cache_free_alien(cachep, objp)) |
3091 | * node to the array cache on this cpu. | 3152 | return; |
3092 | */ | 3153 | |
3093 | #ifdef CONFIG_NUMA | ||
3094 | { | ||
3095 | struct slab *slabp; | ||
3096 | slabp = virt_to_slab(objp); | ||
3097 | if (unlikely(slabp->nodeid != numa_node_id())) { | ||
3098 | struct array_cache *alien = NULL; | ||
3099 | int nodeid = slabp->nodeid; | ||
3100 | struct kmem_list3 *l3; | ||
3101 | |||
3102 | l3 = cachep->nodelists[numa_node_id()]; | ||
3103 | STATS_INC_NODEFREES(cachep); | ||
3104 | if (l3->alien && l3->alien[nodeid]) { | ||
3105 | alien = l3->alien[nodeid]; | ||
3106 | spin_lock(&alien->lock); | ||
3107 | if (unlikely(alien->avail == alien->limit)) { | ||
3108 | STATS_INC_ACOVERFLOW(cachep); | ||
3109 | __drain_alien_cache(cachep, | ||
3110 | alien, nodeid); | ||
3111 | } | ||
3112 | alien->entry[alien->avail++] = objp; | ||
3113 | spin_unlock(&alien->lock); | ||
3114 | } else { | ||
3115 | spin_lock(&(cachep->nodelists[nodeid])-> | ||
3116 | list_lock); | ||
3117 | free_block(cachep, &objp, 1, nodeid); | ||
3118 | spin_unlock(&(cachep->nodelists[nodeid])-> | ||
3119 | list_lock); | ||
3120 | } | ||
3121 | return; | ||
3122 | } | ||
3123 | } | ||
3124 | #endif | ||
3125 | if (likely(ac->avail < ac->limit)) { | 3154 | if (likely(ac->avail < ac->limit)) { |
3126 | STATS_INC_FREEHIT(cachep); | 3155 | STATS_INC_FREEHIT(cachep); |
3127 | ac->entry[ac->avail++] = objp; | 3156 | ac->entry[ac->avail++] = objp; |
@@ -3254,26 +3283,10 @@ EXPORT_SYMBOL(kmalloc_node); | |||
3254 | #endif | 3283 | #endif |
3255 | 3284 | ||
3256 | /** | 3285 | /** |
3257 | * kmalloc - allocate memory | 3286 | * __do_kmalloc - allocate memory |
3258 | * @size: how many bytes of memory are required. | 3287 | * @size: how many bytes of memory are required. |
3259 | * @flags: the type of memory to allocate. | 3288 | * @flags: the type of memory to allocate (see kmalloc). |
3260 | * @caller: function caller for debug tracking of the caller | 3289 | * @caller: function caller for debug tracking of the caller |
3261 | * | ||
3262 | * kmalloc is the normal method of allocating memory | ||
3263 | * in the kernel. | ||
3264 | * | ||
3265 | * The @flags argument may be one of: | ||
3266 | * | ||
3267 | * %GFP_USER - Allocate memory on behalf of user. May sleep. | ||
3268 | * | ||
3269 | * %GFP_KERNEL - Allocate normal kernel ram. May sleep. | ||
3270 | * | ||
3271 | * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. | ||
3272 | * | ||
3273 | * Additionally, the %GFP_DMA flag may be set to indicate the memory | ||
3274 | * must be suitable for DMA. This can mean different things on different | ||
3275 | * platforms. For example, on i386, it means that the memory must come | ||
3276 | * from the first 16MB. | ||
3277 | */ | 3290 | */ |
3278 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3291 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3279 | void *caller) | 3292 | void *caller) |
@@ -3371,6 +3384,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3371 | { | 3384 | { |
3372 | unsigned long flags; | 3385 | unsigned long flags; |
3373 | 3386 | ||
3387 | BUG_ON(virt_to_cache(objp) != cachep); | ||
3388 | |||
3374 | local_irq_save(flags); | 3389 | local_irq_save(flags); |
3375 | __cache_free(cachep, objp); | 3390 | __cache_free(cachep, objp); |
3376 | local_irq_restore(flags); | 3391 | local_irq_restore(flags); |
@@ -3396,7 +3411,7 @@ void kfree(const void *objp) | |||
3396 | local_irq_save(flags); | 3411 | local_irq_save(flags); |
3397 | kfree_debugcheck(objp); | 3412 | kfree_debugcheck(objp); |
3398 | c = virt_to_cache(objp); | 3413 | c = virt_to_cache(objp); |
3399 | mutex_debug_check_no_locks_freed(objp, obj_size(c)); | 3414 | debug_check_no_locks_freed(objp, obj_size(c)); |
3400 | __cache_free(c, (void *)objp); | 3415 | __cache_free(c, (void *)objp); |
3401 | local_irq_restore(flags); | 3416 | local_irq_restore(flags); |
3402 | } | 3417 | } |
@@ -3680,7 +3695,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | |||
3680 | */ | 3695 | */ |
3681 | static void cache_reap(void *unused) | 3696 | static void cache_reap(void *unused) |
3682 | { | 3697 | { |
3683 | struct list_head *walk; | 3698 | struct kmem_cache *searchp; |
3684 | struct kmem_list3 *l3; | 3699 | struct kmem_list3 *l3; |
3685 | int node = numa_node_id(); | 3700 | int node = numa_node_id(); |
3686 | 3701 | ||
@@ -3691,13 +3706,7 @@ static void cache_reap(void *unused) | |||
3691 | return; | 3706 | return; |
3692 | } | 3707 | } |
3693 | 3708 | ||
3694 | list_for_each(walk, &cache_chain) { | 3709 | list_for_each_entry(searchp, &cache_chain, next) { |
3695 | struct kmem_cache *searchp; | ||
3696 | struct list_head *p; | ||
3697 | int tofree; | ||
3698 | struct slab *slabp; | ||
3699 | |||
3700 | searchp = list_entry(walk, struct kmem_cache, next); | ||
3701 | check_irq_on(); | 3710 | check_irq_on(); |
3702 | 3711 | ||
3703 | /* | 3712 | /* |
@@ -3722,47 +3731,22 @@ static void cache_reap(void *unused) | |||
3722 | 3731 | ||
3723 | drain_array(searchp, l3, l3->shared, 0, node); | 3732 | drain_array(searchp, l3, l3->shared, 0, node); |
3724 | 3733 | ||
3725 | if (l3->free_touched) { | 3734 | if (l3->free_touched) |
3726 | l3->free_touched = 0; | 3735 | l3->free_touched = 0; |
3727 | goto next; | 3736 | else { |
3728 | } | 3737 | int freed; |
3729 | 3738 | ||
3730 | tofree = (l3->free_limit + 5 * searchp->num - 1) / | 3739 | freed = drain_freelist(searchp, l3, (l3->free_limit + |
3731 | (5 * searchp->num); | 3740 | 5 * searchp->num - 1) / (5 * searchp->num)); |
3732 | do { | 3741 | STATS_ADD_REAPED(searchp, freed); |
3733 | /* | 3742 | } |
3734 | * Do not lock if there are no free blocks. | ||
3735 | */ | ||
3736 | if (list_empty(&l3->slabs_free)) | ||
3737 | break; | ||
3738 | |||
3739 | spin_lock_irq(&l3->list_lock); | ||
3740 | p = l3->slabs_free.next; | ||
3741 | if (p == &(l3->slabs_free)) { | ||
3742 | spin_unlock_irq(&l3->list_lock); | ||
3743 | break; | ||
3744 | } | ||
3745 | |||
3746 | slabp = list_entry(p, struct slab, list); | ||
3747 | BUG_ON(slabp->inuse); | ||
3748 | list_del(&slabp->list); | ||
3749 | STATS_INC_REAPED(searchp); | ||
3750 | |||
3751 | /* | ||
3752 | * Safe to drop the lock. The slab is no longer linked | ||
3753 | * to the cache. searchp cannot disappear, we hold | ||
3754 | * cache_chain_lock | ||
3755 | */ | ||
3756 | l3->free_objects -= searchp->num; | ||
3757 | spin_unlock_irq(&l3->list_lock); | ||
3758 | slab_destroy(searchp, slabp); | ||
3759 | } while (--tofree > 0); | ||
3760 | next: | 3743 | next: |
3761 | cond_resched(); | 3744 | cond_resched(); |
3762 | } | 3745 | } |
3763 | check_irq_on(); | 3746 | check_irq_on(); |
3764 | mutex_unlock(&cache_chain_mutex); | 3747 | mutex_unlock(&cache_chain_mutex); |
3765 | next_reap_node(); | 3748 | next_reap_node(); |
3749 | refresh_cpu_vm_stats(smp_processor_id()); | ||
3766 | /* Set up the next iteration */ | 3750 | /* Set up the next iteration */ |
3767 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3751 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
3768 | } | 3752 | } |
@@ -3825,7 +3809,6 @@ static void s_stop(struct seq_file *m, void *p) | |||
3825 | static int s_show(struct seq_file *m, void *p) | 3809 | static int s_show(struct seq_file *m, void *p) |
3826 | { | 3810 | { |
3827 | struct kmem_cache *cachep = p; | 3811 | struct kmem_cache *cachep = p; |
3828 | struct list_head *q; | ||
3829 | struct slab *slabp; | 3812 | struct slab *slabp; |
3830 | unsigned long active_objs; | 3813 | unsigned long active_objs; |
3831 | unsigned long num_objs; | 3814 | unsigned long num_objs; |
@@ -3846,15 +3829,13 @@ static int s_show(struct seq_file *m, void *p) | |||
3846 | check_irq_on(); | 3829 | check_irq_on(); |
3847 | spin_lock_irq(&l3->list_lock); | 3830 | spin_lock_irq(&l3->list_lock); |
3848 | 3831 | ||
3849 | list_for_each(q, &l3->slabs_full) { | 3832 | list_for_each_entry(slabp, &l3->slabs_full, list) { |
3850 | slabp = list_entry(q, struct slab, list); | ||
3851 | if (slabp->inuse != cachep->num && !error) | 3833 | if (slabp->inuse != cachep->num && !error) |
3852 | error = "slabs_full accounting error"; | 3834 | error = "slabs_full accounting error"; |
3853 | active_objs += cachep->num; | 3835 | active_objs += cachep->num; |
3854 | active_slabs++; | 3836 | active_slabs++; |
3855 | } | 3837 | } |
3856 | list_for_each(q, &l3->slabs_partial) { | 3838 | list_for_each_entry(slabp, &l3->slabs_partial, list) { |
3857 | slabp = list_entry(q, struct slab, list); | ||
3858 | if (slabp->inuse == cachep->num && !error) | 3839 | if (slabp->inuse == cachep->num && !error) |
3859 | error = "slabs_partial inuse accounting error"; | 3840 | error = "slabs_partial inuse accounting error"; |
3860 | if (!slabp->inuse && !error) | 3841 | if (!slabp->inuse && !error) |
@@ -3862,8 +3843,7 @@ static int s_show(struct seq_file *m, void *p) | |||
3862 | active_objs += slabp->inuse; | 3843 | active_objs += slabp->inuse; |
3863 | active_slabs++; | 3844 | active_slabs++; |
3864 | } | 3845 | } |
3865 | list_for_each(q, &l3->slabs_free) { | 3846 | list_for_each_entry(slabp, &l3->slabs_free, list) { |
3866 | slabp = list_entry(q, struct slab, list); | ||
3867 | if (slabp->inuse && !error) | 3847 | if (slabp->inuse && !error) |
3868 | error = "slabs_free/inuse accounting error"; | 3848 | error = "slabs_free/inuse accounting error"; |
3869 | num_slabs++; | 3849 | num_slabs++; |
@@ -3956,7 +3936,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3956 | { | 3936 | { |
3957 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 3937 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
3958 | int limit, batchcount, shared, res; | 3938 | int limit, batchcount, shared, res; |
3959 | struct list_head *p; | 3939 | struct kmem_cache *cachep; |
3960 | 3940 | ||
3961 | if (count > MAX_SLABINFO_WRITE) | 3941 | if (count > MAX_SLABINFO_WRITE) |
3962 | return -EINVAL; | 3942 | return -EINVAL; |
@@ -3975,10 +3955,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3975 | /* Find the cache in the chain of caches. */ | 3955 | /* Find the cache in the chain of caches. */ |
3976 | mutex_lock(&cache_chain_mutex); | 3956 | mutex_lock(&cache_chain_mutex); |
3977 | res = -EINVAL; | 3957 | res = -EINVAL; |
3978 | list_for_each(p, &cache_chain) { | 3958 | list_for_each_entry(cachep, &cache_chain, next) { |
3979 | struct kmem_cache *cachep; | ||
3980 | |||
3981 | cachep = list_entry(p, struct kmem_cache, next); | ||
3982 | if (!strcmp(cachep->name, kbuf)) { | 3959 | if (!strcmp(cachep->name, kbuf)) { |
3983 | if (limit < 1 || batchcount < 1 || | 3960 | if (limit < 1 || batchcount < 1 || |
3984 | batchcount > limit || shared < 0) { | 3961 | batchcount > limit || shared < 0) { |
@@ -4080,7 +4057,6 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4080 | static int leaks_show(struct seq_file *m, void *p) | 4057 | static int leaks_show(struct seq_file *m, void *p) |
4081 | { | 4058 | { |
4082 | struct kmem_cache *cachep = p; | 4059 | struct kmem_cache *cachep = p; |
4083 | struct list_head *q; | ||
4084 | struct slab *slabp; | 4060 | struct slab *slabp; |
4085 | struct kmem_list3 *l3; | 4061 | struct kmem_list3 *l3; |
4086 | const char *name; | 4062 | const char *name; |
@@ -4105,14 +4081,10 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4105 | check_irq_on(); | 4081 | check_irq_on(); |
4106 | spin_lock_irq(&l3->list_lock); | 4082 | spin_lock_irq(&l3->list_lock); |
4107 | 4083 | ||
4108 | list_for_each(q, &l3->slabs_full) { | 4084 | list_for_each_entry(slabp, &l3->slabs_full, list) |
4109 | slabp = list_entry(q, struct slab, list); | ||
4110 | handle_slab(n, cachep, slabp); | 4085 | handle_slab(n, cachep, slabp); |
4111 | } | 4086 | list_for_each_entry(slabp, &l3->slabs_partial, list) |
4112 | list_for_each(q, &l3->slabs_partial) { | ||
4113 | slabp = list_entry(q, struct slab, list); | ||
4114 | handle_slab(n, cachep, slabp); | 4087 | handle_slab(n, cachep, slabp); |
4115 | } | ||
4116 | spin_unlock_irq(&l3->list_lock); | 4088 | spin_unlock_irq(&l3->list_lock); |
4117 | } | 4089 | } |
4118 | name = cachep->name; | 4090 | name = cachep->name; |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -29,7 +29,6 @@ | |||
29 | * essentially no allocation space overhead. | 29 | * essentially no allocation space overhead. |
30 | */ | 30 | */ |
31 | 31 | ||
32 | #include <linux/config.h> | ||
33 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
34 | #include <linux/mm.h> | 33 | #include <linux/mm.h> |
35 | #include <linux/cache.h> | 34 | #include <linux/cache.h> |
diff --git a/mm/sparse.c b/mm/sparse.c index 100040c0dfb6..86c52ab80878 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -1,7 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * sparse memory mappings. | 2 | * sparse memory mappings. |
3 | */ | 3 | */ |
4 | #include <linux/config.h> | ||
5 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
6 | #include <linux/mmzone.h> | 5 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> |
@@ -45,7 +44,7 @@ static struct mem_section *sparse_index_alloc(int nid) | |||
45 | 44 | ||
46 | static int sparse_index_init(unsigned long section_nr, int nid) | 45 | static int sparse_index_init(unsigned long section_nr, int nid) |
47 | { | 46 | { |
48 | static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; | 47 | static DEFINE_SPINLOCK(index_init_lock); |
49 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 48 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
50 | struct mem_section *section; | 49 | struct mem_section *section; |
51 | int ret = 0; | 50 | int ret = 0; |
@@ -99,6 +98,22 @@ int __section_nr(struct mem_section* ms) | |||
99 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 98 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
100 | } | 99 | } |
101 | 100 | ||
101 | /* | ||
102 | * During early boot, before section_mem_map is used for an actual | ||
103 | * mem_map, we use section_mem_map to store the section's NUMA | ||
104 | * node. This keeps us from having to use another data structure. The | ||
105 | * node information is cleared just before we store the real mem_map. | ||
106 | */ | ||
107 | static inline unsigned long sparse_encode_early_nid(int nid) | ||
108 | { | ||
109 | return (nid << SECTION_NID_SHIFT); | ||
110 | } | ||
111 | |||
112 | static inline int sparse_early_nid(struct mem_section *section) | ||
113 | { | ||
114 | return (section->section_mem_map >> SECTION_NID_SHIFT); | ||
115 | } | ||
116 | |||
102 | /* Record a memory area against a node. */ | 117 | /* Record a memory area against a node. */ |
103 | void memory_present(int nid, unsigned long start, unsigned long end) | 118 | void memory_present(int nid, unsigned long start, unsigned long end) |
104 | { | 119 | { |
@@ -113,7 +128,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) | |||
113 | 128 | ||
114 | ms = __nr_to_section(section); | 129 | ms = __nr_to_section(section); |
115 | if (!ms->section_mem_map) | 130 | if (!ms->section_mem_map) |
116 | ms->section_mem_map = SECTION_MARKED_PRESENT; | 131 | ms->section_mem_map = sparse_encode_early_nid(nid) | |
132 | SECTION_MARKED_PRESENT; | ||
117 | } | 133 | } |
118 | } | 134 | } |
119 | 135 | ||
@@ -164,6 +180,7 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
164 | if (!valid_section(ms)) | 180 | if (!valid_section(ms)) |
165 | return -EINVAL; | 181 | return -EINVAL; |
166 | 182 | ||
183 | ms->section_mem_map &= ~SECTION_MAP_MASK; | ||
167 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 184 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); |
168 | 185 | ||
169 | return 1; | 186 | return 1; |
@@ -172,8 +189,8 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
172 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | 189 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) |
173 | { | 190 | { |
174 | struct page *map; | 191 | struct page *map; |
175 | int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); | ||
176 | struct mem_section *ms = __nr_to_section(pnum); | 192 | struct mem_section *ms = __nr_to_section(pnum); |
193 | int nid = sparse_early_nid(ms); | ||
177 | 194 | ||
178 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 195 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
179 | if (map) | 196 | if (map) |
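The sparse-memory hunks above add a small bootstrapping trick: until sparse_init_one_section() installs a real mem_map, the section's NUMA node id is stashed in the upper bits of section_mem_map, read back by sparse_early_mem_map_alloc(), and cleared just before the map pointer is stored. The sketch below walks through that lifecycle; the shift, the present bit, and the mask are placeholder values, not the kernel's SECTION_* definitions.

```c
/* Sketch of the early-nid encode/clear lifecycle. */
#include <stdio.h>

#define MODEL_NID_SHIFT      2          /* placeholder for SECTION_NID_SHIFT */
#define MODEL_MARKED_PRESENT 1ul        /* placeholder "present" flag bit */
#define MODEL_MAP_MASK       (~MODEL_MARKED_PRESENT)    /* non-flag bits */

static unsigned long encode_early_nid(int nid)
{
	return (unsigned long)nid << MODEL_NID_SHIFT;
}

static int early_nid(unsigned long section_mem_map)
{
	return (int)(section_mem_map >> MODEL_NID_SHIFT);
}

int main(void)
{
	unsigned long section_mem_map;

	/* memory_present(): remember the node and mark the section present */
	section_mem_map = encode_early_nid(3) | MODEL_MARKED_PRESENT;
	printf("early nid = %d\n", early_nid(section_mem_map));        /* 3 */

	/* sparse_init_one_section(): drop the stashed nid, keep the flag, */
	/* then store the real mem_map encoding in its place              */
	section_mem_map &= ~MODEL_MAP_MASK;
	section_mem_map |= 0xdeadb000ul;        /* pretend mem_map pointer */
	printf("still marked present: %lu\n",
	       section_mem_map & MODEL_MARKED_PRESENT);                 /* 1 */
	return 0;
}
```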
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -86,9 +86,8 @@ int rotate_reclaimable_page(struct page *page) | |||
86 | zone = page_zone(page); | 86 | zone = page_zone(page); |
87 | spin_lock_irqsave(&zone->lru_lock, flags); | 87 | spin_lock_irqsave(&zone->lru_lock, flags); |
88 | if (PageLRU(page) && !PageActive(page)) { | 88 | if (PageLRU(page) && !PageActive(page)) { |
89 | list_del(&page->lru); | 89 | list_move_tail(&page->lru, &zone->inactive_list); |
90 | list_add_tail(&page->lru, &zone->inactive_list); | 90 | __count_vm_event(PGROTATED); |
91 | inc_page_state(pgrotated); | ||
92 | } | 91 | } |
93 | if (!test_clear_page_writeback(page)) | 92 | if (!test_clear_page_writeback(page)) |
94 | BUG(); | 93 | BUG(); |
@@ -108,7 +107,7 @@ void fastcall activate_page(struct page *page) | |||
108 | del_page_from_inactive_list(zone, page); | 107 | del_page_from_inactive_list(zone, page); |
109 | SetPageActive(page); | 108 | SetPageActive(page); |
110 | add_page_to_active_list(zone, page); | 109 | add_page_to_active_list(zone, page); |
111 | inc_page_state(pgactivate); | 110 | __count_vm_event(PGACTIVATE); |
112 | } | 111 | } |
113 | spin_unlock_irq(&zone->lru_lock); | 112 | spin_unlock_irq(&zone->lru_lock); |
114 | } | 113 | } |
@@ -480,48 +479,6 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
480 | #endif /* CONFIG_HOTPLUG_CPU */ | 479 | #endif /* CONFIG_HOTPLUG_CPU */ |
481 | #endif /* CONFIG_SMP */ | 480 | #endif /* CONFIG_SMP */ |
482 | 481 | ||
483 | #ifdef CONFIG_SMP | ||
484 | void percpu_counter_mod(struct percpu_counter *fbc, long amount) | ||
485 | { | ||
486 | long count; | ||
487 | long *pcount; | ||
488 | int cpu = get_cpu(); | ||
489 | |||
490 | pcount = per_cpu_ptr(fbc->counters, cpu); | ||
491 | count = *pcount + amount; | ||
492 | if (count >= FBC_BATCH || count <= -FBC_BATCH) { | ||
493 | spin_lock(&fbc->lock); | ||
494 | fbc->count += count; | ||
495 | *pcount = 0; | ||
496 | spin_unlock(&fbc->lock); | ||
497 | } else { | ||
498 | *pcount = count; | ||
499 | } | ||
500 | put_cpu(); | ||
501 | } | ||
502 | EXPORT_SYMBOL(percpu_counter_mod); | ||
503 | |||
504 | /* | ||
505 | * Add up all the per-cpu counts, return the result. This is a more accurate | ||
506 | * but much slower version of percpu_counter_read_positive() | ||
507 | */ | ||
508 | long percpu_counter_sum(struct percpu_counter *fbc) | ||
509 | { | ||
510 | long ret; | ||
511 | int cpu; | ||
512 | |||
513 | spin_lock(&fbc->lock); | ||
514 | ret = fbc->count; | ||
515 | for_each_possible_cpu(cpu) { | ||
516 | long *pcount = per_cpu_ptr(fbc->counters, cpu); | ||
517 | ret += *pcount; | ||
518 | } | ||
519 | spin_unlock(&fbc->lock); | ||
520 | return ret < 0 ? 0 : ret; | ||
521 | } | ||
522 | EXPORT_SYMBOL(percpu_counter_sum); | ||
523 | #endif | ||
524 | |||
525 | /* | 482 | /* |
526 | * Perform any setup for the swap system | 483 | * Perform any setup for the swap system |
527 | */ | 484 | */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e0e1583f32c2..fccbd9bba77b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -24,7 +24,7 @@ | |||
24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | 24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow |
25 | * future use of radix_tree tags in the swap cache. | 25 | * future use of radix_tree tags in the swap cache. |
26 | */ | 26 | */ |
27 | static struct address_space_operations swap_aops = { | 27 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 28 | .writepage = swap_writepage, |
29 | .sync_page = block_sync_page, | 29 | .sync_page = block_sync_page, |
30 | .set_page_dirty = __set_page_dirty_nobuffers, | 30 | .set_page_dirty = __set_page_dirty_nobuffers, |
@@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); |
88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); |
89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
90 | pagecache_acct(1); | 90 | __inc_zone_page_state(page, NR_FILE_PAGES); |
91 | } | 91 | } |
92 | write_unlock_irq(&swapper_space.tree_lock); | 92 | write_unlock_irq(&swapper_space.tree_lock); |
93 | radix_tree_preload_end(); | 93 | radix_tree_preload_end(); |
@@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page) | |||
132 | set_page_private(page, 0); | 132 | set_page_private(page, 0); |
133 | ClearPageSwapCache(page); | 133 | ClearPageSwapCache(page); |
134 | total_swapcache_pages--; | 134 | total_swapcache_pages--; |
135 | pagecache_acct(-1); | 135 | __dec_zone_page_state(page, NR_FILE_PAGES); |
136 | INC_CACHE_INFO(del_total); | 136 | INC_CACHE_INFO(del_total); |
137 | } | 137 | } |
138 | 138 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index e5fd5385f0cc..e70d6c6d6fee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -5,7 +5,6 @@ | |||
5 | * Swap reorganised 29.12.95, Stephen Tweedie | 5 | * Swap reorganised 29.12.95, Stephen Tweedie |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
10 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> |
11 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
@@ -395,6 +394,9 @@ void free_swap_and_cache(swp_entry_t entry) | |||
395 | struct swap_info_struct * p; | 394 | struct swap_info_struct * p; |
396 | struct page *page = NULL; | 395 | struct page *page = NULL; |
397 | 396 | ||
397 | if (is_migration_entry(entry)) | ||
398 | return; | ||
399 | |||
398 | p = swap_info_get(entry); | 400 | p = swap_info_get(entry); |
399 | if (p) { | 401 | if (p) { |
400 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 402 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
@@ -615,15 +617,6 @@ static int unuse_mm(struct mm_struct *mm, | |||
615 | return 0; | 617 | return 0; |
616 | } | 618 | } |
617 | 619 | ||
618 | #ifdef CONFIG_MIGRATION | ||
619 | int remove_vma_swap(struct vm_area_struct *vma, struct page *page) | ||
620 | { | ||
621 | swp_entry_t entry = { .val = page_private(page) }; | ||
622 | |||
623 | return unuse_vma(vma, entry, page); | ||
624 | } | ||
625 | #endif | ||
626 | |||
627 | /* | 620 | /* |
628 | * Scan swap_map from current position to next entry still in use. | 621 | * Scan swap_map from current position to next entry still in use. |
629 | * Recycle to start on reaching the end, returning 0 when empty. | 622 | * Recycle to start on reaching the end, returning 0 when empty. |
@@ -716,7 +709,6 @@ static int try_to_unuse(unsigned int type) | |||
716 | */ | 709 | */ |
717 | swap_map = &si->swap_map[i]; | 710 | swap_map = &si->swap_map[i]; |
718 | entry = swp_entry(type, i); | 711 | entry = swp_entry(type, i); |
719 | again: | ||
720 | page = read_swap_cache_async(entry, NULL, 0); | 712 | page = read_swap_cache_async(entry, NULL, 0); |
721 | if (!page) { | 713 | if (!page) { |
722 | /* | 714 | /* |
@@ -751,12 +743,6 @@ again: | |||
751 | wait_on_page_locked(page); | 743 | wait_on_page_locked(page); |
752 | wait_on_page_writeback(page); | 744 | wait_on_page_writeback(page); |
753 | lock_page(page); | 745 | lock_page(page); |
754 | if (!PageSwapCache(page)) { | ||
755 | /* Page migration has occured */ | ||
756 | unlock_page(page); | ||
757 | page_cache_release(page); | ||
758 | goto again; | ||
759 | } | ||
760 | wait_on_page_writeback(page); | 746 | wait_on_page_writeback(page); |
761 | 747 | ||
762 | /* | 748 | /* |
@@ -785,10 +771,8 @@ again: | |||
785 | while (*swap_map > 1 && !retval && | 771 | while (*swap_map > 1 && !retval && |
786 | (p = p->next) != &start_mm->mmlist) { | 772 | (p = p->next) != &start_mm->mmlist) { |
787 | mm = list_entry(p, struct mm_struct, mmlist); | 773 | mm = list_entry(p, struct mm_struct, mmlist); |
788 | if (atomic_inc_return(&mm->mm_users) == 1) { | 774 | if (!atomic_inc_not_zero(&mm->mm_users)) |
789 | atomic_dec(&mm->mm_users); | ||
790 | continue; | 775 | continue; |
791 | } | ||
792 | spin_unlock(&mmlist_lock); | 776 | spin_unlock(&mmlist_lock); |
793 | mmput(prev_mm); | 777 | mmput(prev_mm); |
794 | prev_mm = mm; | 778 | prev_mm = mm; |
@@ -1407,19 +1391,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1407 | if (!(p->flags & SWP_USED)) | 1391 | if (!(p->flags & SWP_USED)) |
1408 | break; | 1392 | break; |
1409 | error = -EPERM; | 1393 | error = -EPERM; |
1410 | /* | 1394 | if (type >= MAX_SWAPFILES) { |
1411 | * Test if adding another swap device is possible. There are | ||
1412 | * two limiting factors: 1) the number of bits for the swap | ||
1413 | * type swp_entry_t definition and 2) the number of bits for | ||
1414 | * the swap type in the swap ptes as defined by the different | ||
1415 | * architectures. To honor both limitations a swap entry | ||
1416 | * with swap offset 0 and swap type ~0UL is created, encoded | ||
1417 | * to a swap pte, decoded to a swp_entry_t again and finally | ||
1418 | * the swap type part is extracted. This will mask all bits | ||
1419 | * from the initial ~0UL that can't be encoded in either the | ||
1420 | * swp_entry_t or the architecture definition of a swap pte. | ||
1421 | */ | ||
1422 | if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { | ||
1423 | spin_unlock(&swap_lock); | 1395 | spin_unlock(&swap_lock); |
1424 | goto out; | 1396 | goto out; |
1425 | } | 1397 | } |
@@ -1504,8 +1476,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1504 | error = -EINVAL; | 1476 | error = -EINVAL; |
1505 | goto bad_swap; | 1477 | goto bad_swap; |
1506 | } | 1478 | } |
1507 | page = read_cache_page(mapping, 0, | 1479 | page = read_mapping_page(mapping, 0, swap_file); |
1508 | (filler_t *)mapping->a_ops->readpage, swap_file); | ||
1509 | if (IS_ERR(page)) { | 1480 | if (IS_ERR(page)) { |
1510 | error = PTR_ERR(page); | 1481 | error = PTR_ERR(page); |
1511 | goto bad_swap; | 1482 | goto bad_swap; |
@@ -1709,6 +1680,9 @@ int swap_duplicate(swp_entry_t entry) | |||
1709 | unsigned long offset, type; | 1680 | unsigned long offset, type; |
1710 | int result = 0; | 1681 | int result = 0; |
1711 | 1682 | ||
1683 | if (is_migration_entry(entry)) | ||
1684 | return 1; | ||
1685 | |||
1712 | type = swp_type(entry); | 1686 | type = swp_type(entry); |
1713 | if (type >= nr_swapfiles) | 1687 | if (type >= nr_swapfiles) |
1714 | goto bad_file; | 1688 | goto bad_file; |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index f9d6a9cc91c4..5f2cbf0f153c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -12,7 +12,6 @@ | |||
12 | 12 | ||
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/devfs_fs_kernel.h> | ||
16 | #include <linux/vfs.h> | 15 | #include <linux/vfs.h> |
17 | #include <linux/mount.h> | 16 | #include <linux/mount.h> |
18 | #include <linux/file.h> | 17 | #include <linux/file.h> |
@@ -33,9 +32,6 @@ static int __init init_tmpfs(void) | |||
33 | { | 32 | { |
34 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 33 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); |
35 | 34 | ||
36 | #ifdef CONFIG_TMPFS | ||
37 | devfs_mk_dir("shm"); | ||
38 | #endif | ||
39 | shm_mnt = kern_mount(&tmpfs_fs_type); | 35 | shm_mnt = kern_mount(&tmpfs_fs_type); |
40 | BUG_ON(IS_ERR(shm_mnt)); | 36 | BUG_ON(IS_ERR(shm_mnt)); |
41 | 37 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 6cb3fff25f67..cf1b015df4a7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
230 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 230 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
231 | for (i = 0; i < pagevec_count(&pvec); i++) { | 231 | for (i = 0; i < pagevec_count(&pvec); i++) { |
232 | struct page *page = pvec.pages[i]; | 232 | struct page *page = pvec.pages[i]; |
233 | pgoff_t index; | ||
234 | int lock_failed; | ||
233 | 235 | ||
234 | if (TestSetPageLocked(page)) { | 236 | lock_failed = TestSetPageLocked(page); |
235 | next++; | 237 | |
236 | continue; | 238 | /* |
237 | } | 239 | * We really shouldn't be looking at the ->index of an |
238 | if (page->index > next) | 240 | * unlocked page. But we're not allowed to lock these |
239 | next = page->index; | 241 | * pages. So we rely upon nobody altering the ->index |
242 | * of this (pinned-by-us) page. | ||
243 | */ | ||
244 | index = page->index; | ||
245 | if (index > next) | ||
246 | next = index; | ||
240 | next++; | 247 | next++; |
248 | if (lock_failed) | ||
249 | continue; | ||
250 | |||
241 | if (PageDirty(page) || PageWriteback(page)) | 251 | if (PageDirty(page) || PageWriteback(page)) |
242 | goto unlock; | 252 | goto unlock; |
243 | if (page_mapped(page)) | 253 | if (page_mapped(page)) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c0504f1e34eb..35f8553f893a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int | |||
257 | } | 257 | } |
258 | 258 | ||
259 | /* Caller must hold vmlist_lock */ | 259 | /* Caller must hold vmlist_lock */ |
260 | static struct vm_struct *__find_vm_area(void *addr) | ||
261 | { | ||
262 | struct vm_struct *tmp; | ||
263 | |||
264 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | ||
265 | if (tmp->addr == addr) | ||
266 | break; | ||
267 | } | ||
268 | |||
269 | return tmp; | ||
270 | } | ||
271 | |||
272 | /* Caller must hold vmlist_lock */ | ||
260 | struct vm_struct *__remove_vm_area(void *addr) | 273 | struct vm_struct *__remove_vm_area(void *addr) |
261 | { | 274 | { |
262 | struct vm_struct **p, *tmp; | 275 | struct vm_struct **p, *tmp; |
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc); | |||
498 | */ | 511 | */ |
499 | void *vmalloc(unsigned long size) | 512 | void *vmalloc(unsigned long size) |
500 | { | 513 | { |
501 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 514 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
502 | } | 515 | } |
503 | EXPORT_SYMBOL(vmalloc); | 516 | EXPORT_SYMBOL(vmalloc); |
504 | 517 | ||
505 | /** | 518 | /** |
519 | * vmalloc_user - allocate virtually contiguous memory which has | ||
520 | * been zeroed so it can be mapped to userspace without | ||
521 | * leaking data. | ||
522 | * | ||
523 | * @size: allocation size | ||
524 | */ | ||
525 | void *vmalloc_user(unsigned long size) | ||
526 | { | ||
527 | struct vm_struct *area; | ||
528 | void *ret; | ||
529 | |||
530 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | ||
531 | write_lock(&vmlist_lock); | ||
532 | area = __find_vm_area(ret); | ||
533 | area->flags |= VM_USERMAP; | ||
534 | write_unlock(&vmlist_lock); | ||
535 | |||
536 | return ret; | ||
537 | } | ||
538 | EXPORT_SYMBOL(vmalloc_user); | ||
539 | |||
540 | /** | ||
506 | * vmalloc_node - allocate memory on a specific node | 541 | * vmalloc_node - allocate memory on a specific node |
507 | * | 542 | * |
508 | * @size: allocation size | 543 | * @size: allocation size |
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc); | |||
516 | */ | 551 | */ |
517 | void *vmalloc_node(unsigned long size, int node) | 552 | void *vmalloc_node(unsigned long size, int node) |
518 | { | 553 | { |
519 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); | 554 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); |
520 | } | 555 | } |
521 | EXPORT_SYMBOL(vmalloc_node); | 556 | EXPORT_SYMBOL(vmalloc_node); |
522 | 557 | ||
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size) | |||
556 | } | 591 | } |
557 | EXPORT_SYMBOL(vmalloc_32); | 592 | EXPORT_SYMBOL(vmalloc_32); |
558 | 593 | ||
594 | /** | ||
595 | * vmalloc_32_user - allocate virtually contiguous memory (32bit | ||
596 | * addressable) which is zeroed so it can be | ||
597 | * mapped to userspace without leaking data. | ||
598 | * | ||
599 | * @size: allocation size | ||
600 | */ | ||
601 | void *vmalloc_32_user(unsigned long size) | ||
602 | { | ||
603 | struct vm_struct *area; | ||
604 | void *ret; | ||
605 | |||
606 | ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); | ||
607 | write_lock(&vmlist_lock); | ||
608 | area = __find_vm_area(ret); | ||
609 | area->flags |= VM_USERMAP; | ||
610 | write_unlock(&vmlist_lock); | ||
611 | |||
612 | return ret; | ||
613 | } | ||
614 | EXPORT_SYMBOL(vmalloc_32_user); | ||
615 | |||
559 | long vread(char *buf, char *addr, unsigned long count) | 616 | long vread(char *buf, char *addr, unsigned long count) |
560 | { | 617 | { |
561 | struct vm_struct *tmp; | 618 | struct vm_struct *tmp; |
@@ -630,3 +687,64 @@ finished: | |||
630 | read_unlock(&vmlist_lock); | 687 | read_unlock(&vmlist_lock); |
631 | return buf - buf_start; | 688 | return buf - buf_start; |
632 | } | 689 | } |
690 | |||
691 | /** | ||
692 | * remap_vmalloc_range - map vmalloc pages to userspace | ||
693 | * | ||
694 | * @vma: vma to cover (map full range of vma) | ||
695 | * @addr: vmalloc memory | ||
696 | * @pgoff: number of pages into addr before first page to map | ||
697 | * @returns: 0 for success, -Exxx on failure | ||
698 | * | ||
699 | * This function checks that addr is a valid vmalloc'ed area, and | ||
700 | * that it is big enough to cover the vma. Will return failure if | ||
701 | * those criteria aren't met. | ||
702 | * | ||
703 | * Similar to remap_pfn_range (see mm/memory.c) | ||
704 | */ | ||
705 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | ||
706 | unsigned long pgoff) | ||
707 | { | ||
708 | struct vm_struct *area; | ||
709 | unsigned long uaddr = vma->vm_start; | ||
710 | unsigned long usize = vma->vm_end - vma->vm_start; | ||
711 | int ret; | ||
712 | |||
713 | if ((PAGE_SIZE-1) & (unsigned long)addr) | ||
714 | return -EINVAL; | ||
715 | |||
716 | read_lock(&vmlist_lock); | ||
717 | area = __find_vm_area(addr); | ||
718 | if (!area) | ||
719 | goto out_einval_locked; | ||
720 | |||
721 | if (!(area->flags & VM_USERMAP)) | ||
722 | goto out_einval_locked; | ||
723 | |||
724 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | ||
725 | goto out_einval_locked; | ||
726 | read_unlock(&vmlist_lock); | ||
727 | |||
728 | addr += pgoff << PAGE_SHIFT; | ||
729 | do { | ||
730 | struct page *page = vmalloc_to_page(addr); | ||
731 | ret = vm_insert_page(vma, uaddr, page); | ||
732 | if (ret) | ||
733 | return ret; | ||
734 | |||
735 | uaddr += PAGE_SIZE; | ||
736 | addr += PAGE_SIZE; | ||
737 | usize -= PAGE_SIZE; | ||
738 | } while (usize > 0); | ||
739 | |||
740 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | ||
741 | vma->vm_flags |= VM_RESERVED; | ||
742 | |||
743 | return ret; | ||
744 | |||
745 | out_einval_locked: | ||
746 | read_unlock(&vmlist_lock); | ||
747 | return -EINVAL; | ||
748 | } | ||
749 | EXPORT_SYMBOL(remap_vmalloc_range); | ||
750 | |||
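Taken together, the new vmalloc_user()/vmalloc_32_user() allocators (which zero the memory and tag the area VM_USERMAP) and remap_vmalloc_range() (which refuses any area not so tagged) are aimed at drivers that want to hand a vmalloc'ed buffer to user space from their mmap handler. Below is a kernel-context sketch of that pairing for a hypothetical driver; the names, the buffer size, and the surrounding file_operations wiring are invented for illustration and the snippet only compiles inside a kernel tree.

```c
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define MYDRV_BUF_SIZE (64 * 1024)      /* illustrative size */

static void *mydrv_buf;                 /* allocated at init time */

static int mydrv_alloc_buffer(void)
{
	/* zeroed, and flagged VM_USERMAP so it may be mapped to user space */
	mydrv_buf = vmalloc_user(MYDRV_BUF_SIZE);
	return mydrv_buf ? 0 : -ENOMEM;
}

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* fails with -EINVAL unless the area came from vmalloc_*_user() */
	return remap_vmalloc_range(vma, mydrv_buf, vma->vm_pgoff);
}

static void mydrv_free_buffer(void)
{
	vfree(mydrv_buf);
}
```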
diff --git a/mm/vmscan.c b/mm/vmscan.c index 440a733fe2e9..ff2ebe9458a3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/rwsem.h> | 35 | #include <linux/rwsem.h> |
36 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
37 | #include <linux/kthread.h> | ||
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
39 | #include <asm/div64.h> | 40 | #include <asm/div64.h> |
@@ -46,8 +47,6 @@ struct scan_control { | |||
46 | /* Incremented by the number of inactive pages that were scanned */ | 47 | /* Incremented by the number of inactive pages that were scanned */ |
47 | unsigned long nr_scanned; | 48 | unsigned long nr_scanned; |
48 | 49 | ||
49 | unsigned long nr_mapped; /* From page_state */ | ||
50 | |||
51 | /* This context's GFP mask */ | 50 | /* This context's GFP mask */ |
52 | gfp_t gfp_mask; | 51 | gfp_t gfp_mask; |
53 | 52 | ||
@@ -61,6 +60,8 @@ struct scan_control { | |||
61 | * In this context, it doesn't matter that we scan the | 60 | * In this context, it doesn't matter that we scan the |
62 | * whole list at once. */ | 61 | * whole list at once. */ |
63 | int swap_cluster_max; | 62 | int swap_cluster_max; |
63 | |||
64 | int swappiness; | ||
64 | }; | 65 | }; |
65 | 66 | ||
66 | /* | 67 | /* |
@@ -108,7 +109,7 @@ struct shrinker { | |||
108 | * From 0 .. 100. Higher means more swappy. | 109 | * From 0 .. 100. Higher means more swappy. |
109 | */ | 110 | */ |
110 | int vm_swappiness = 60; | 111 | int vm_swappiness = 60; |
111 | static long total_memory; | 112 | long vm_total_pages; /* The total number of pages which the VM controls */ |
112 | 113 | ||
113 | static LIST_HEAD(shrinker_list); | 114 | static LIST_HEAD(shrinker_list); |
114 | static DECLARE_RWSEM(shrinker_rwsem); | 115 | static DECLARE_RWSEM(shrinker_rwsem); |
@@ -214,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
214 | break; | 215 | break; |
215 | if (shrink_ret < nr_before) | 216 | if (shrink_ret < nr_before) |
216 | ret += nr_before - shrink_ret; | 217 | ret += nr_before - shrink_ret; |
217 | mod_page_state(slabs_scanned, this_scan); | 218 | count_vm_events(SLABS_SCANNED, this_scan); |
218 | total_scan -= this_scan; | 219 | total_scan -= this_scan; |
219 | 220 | ||
220 | cond_resched(); | 221 | cond_resched(); |
@@ -288,11 +289,23 @@ static void handle_write_error(struct address_space *mapping, | |||
288 | unlock_page(page); | 289 | unlock_page(page); |
289 | } | 290 | } |
290 | 291 | ||
292 | /* possible outcome of pageout() */ | ||
293 | typedef enum { | ||
294 | /* failed to write page out, page is locked */ | ||
295 | PAGE_KEEP, | ||
296 | /* move page to the active list, page is locked */ | ||
297 | PAGE_ACTIVATE, | ||
298 | /* page has been sent to the disk successfully, page is unlocked */ | ||
299 | PAGE_SUCCESS, | ||
300 | /* page is clean and locked */ | ||
301 | PAGE_CLEAN, | ||
302 | } pageout_t; | ||
303 | |||
291 | /* | 304 | /* |
292 | * pageout is called by shrink_page_list() for each dirty page. | 305 | * pageout is called by shrink_page_list() for each dirty page. |
293 | * Calls ->writepage(). | 306 | * Calls ->writepage(). |
294 | */ | 307 | */ |
295 | pageout_t pageout(struct page *page, struct address_space *mapping) | 308 | static pageout_t pageout(struct page *page, struct address_space *mapping) |
296 | { | 309 | { |
297 | /* | 310 | /* |
298 | * If the page is dirty, only perform writeback if that write | 311 | * If the page is dirty, only perform writeback if that write |
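Note: each pageout_t value introduced above tells the caller what to do next and, importantly, whether the page is still locked. The fragment below is only an illustration of how a reclaim loop can dispatch on the result; the goto labels are placeholders for whatever the caller's "keep page", "reactivate page" and "try to free" paths are called, and this is not a quote of shrink_page_list().

	/* Illustrative only: reacting to pageout() in a reclaim path. */
	switch (pageout(page, mapping)) {
	case PAGE_KEEP:
		/* write failed, page still locked: keep it as-is */
		goto keep_locked;
	case PAGE_ACTIVATE:
		/* not worth writing now, page still locked: reactivate it */
		goto activate_locked;
	case PAGE_SUCCESS:
		/* writeback was started and the page was unlocked for us */
		goto keep;
	case PAGE_CLEAN:
		/* page is clean and locked: fall through and try to free it */
		break;
	}
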
@@ -337,6 +350,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping) | |||
337 | struct writeback_control wbc = { | 350 | struct writeback_control wbc = { |
338 | .sync_mode = WB_SYNC_NONE, | 351 | .sync_mode = WB_SYNC_NONE, |
339 | .nr_to_write = SWAP_CLUSTER_MAX, | 352 | .nr_to_write = SWAP_CLUSTER_MAX, |
353 | .range_start = 0, | ||
354 | .range_end = LLONG_MAX, | ||
340 | .nonblocking = 1, | 355 | .nonblocking = 1, |
341 | .for_reclaim = 1, | 356 | .for_reclaim = 1, |
342 | }; | 357 | }; |
@@ -554,7 +569,7 @@ keep: | |||
554 | list_splice(&ret_pages, page_list); | 569 | list_splice(&ret_pages, page_list); |
555 | if (pagevec_count(&freed_pvec)) | 570 | if (pagevec_count(&freed_pvec)) |
556 | __pagevec_release_nonlru(&freed_pvec); | 571 | __pagevec_release_nonlru(&freed_pvec); |
557 | mod_page_state(pgactivate, pgactivate); | 572 | count_vm_events(PGACTIVATE, pgactivate); |
558 | return nr_reclaimed; | 573 | return nr_reclaimed; |
559 | } | 574 | } |
560 | 575 | ||
@@ -644,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
644 | nr_reclaimed += nr_freed; | 659 | nr_reclaimed += nr_freed; |
645 | local_irq_disable(); | 660 | local_irq_disable(); |
646 | if (current_is_kswapd()) { | 661 | if (current_is_kswapd()) { |
647 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 662 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); |
648 | __mod_page_state(kswapd_steal, nr_freed); | 663 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
649 | } else | 664 | } else |
650 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 665 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
651 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 666 | __count_vm_events(PGACTIVATE, nr_freed); |
652 | 667 | ||
653 | if (nr_taken == 0) | 668 | if (nr_taken == 0) |
654 | goto done; | 669 | goto done; |
@@ -727,7 +742,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
727 | * how much memory | 742 | * how much memory |
728 | * is mapped. | 743 | * is mapped. |
729 | */ | 744 | */ |
730 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | 745 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + |
746 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
747 | vm_total_pages; | ||
731 | 748 | ||
732 | /* | 749 | /* |
733 | * Now decide how much we really want to unmap some pages. The | 750 | * Now decide how much we really want to unmap some pages. The |
@@ -741,7 +758,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
741 | * A 100% value of vm_swappiness overrides this algorithm | 758 | * A 100% value of vm_swappiness overrides this algorithm |
742 | * altogether. | 759 | * altogether. |
743 | */ | 760 | */ |
744 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; | 761 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
745 | 762 | ||
746 | /* | 763 | /* |
747 | * Now use this metric to decide whether to start moving mapped | 764 | * Now use this metric to decide whether to start moving mapped |
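Note: the heuristic above now derives mapped_ratio from the zoned NR_FILE_MAPPED and NR_ANON_PAGES counters (instead of the removed sc->nr_mapped) and adds the per-scan_control swappiness; mapped pages are only reclaimed once swap_tendency reaches 100. A small standalone arithmetic sketch with made-up numbers:

	#include <stdio.h>

	/* Illustrative numbers only; in the kernel these come from the zoned
	 * VM counters, the zone's distress level and sc->swappiness. */
	int main(void)
	{
		long mapped = 300000, anon = 200000, total_pages = 1000000;
		int distress = 0;             /* low when reclaim is going well */
		int swappiness = 60;          /* default vm_swappiness */

		long mapped_ratio = (mapped + anon) * 100 / total_pages;   /* 50 */
		long swap_tendency = mapped_ratio / 2 + distress + swappiness;

		/* 25 + 0 + 60 = 85 < 100: mapped pages are left alone.
		 * With swappiness = 100 the threshold is always reached. */
		printf("swap_tendency = %ld, reclaim_mapped = %d\n",
		       swap_tendency, swap_tendency >= 100);
		return 0;
	}
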
@@ -824,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
824 | } | 841 | } |
825 | } | 842 | } |
826 | zone->nr_active += pgmoved; | 843 | zone->nr_active += pgmoved; |
827 | spin_unlock(&zone->lru_lock); | ||
828 | 844 | ||
829 | __mod_page_state_zone(zone, pgrefill, pgscanned); | 845 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
830 | __mod_page_state(pgdeactivate, pgdeactivate); | 846 | __count_vm_events(PGDEACTIVATE, pgdeactivate); |
831 | local_irq_enable(); | 847 | spin_unlock_irq(&zone->lru_lock); |
832 | 848 | ||
833 | pagevec_release(&pvec); | 849 | pagevec_release(&pvec); |
834 | } | 850 | } |
@@ -957,9 +973,10 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
957 | .may_writepage = !laptop_mode, | 973 | .may_writepage = !laptop_mode, |
958 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 974 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
959 | .may_swap = 1, | 975 | .may_swap = 1, |
976 | .swappiness = vm_swappiness, | ||
960 | }; | 977 | }; |
961 | 978 | ||
962 | inc_page_state(allocstall); | 979 | count_vm_event(ALLOCSTALL); |
963 | 980 | ||
964 | for (i = 0; zones[i] != NULL; i++) { | 981 | for (i = 0; zones[i] != NULL; i++) { |
965 | struct zone *zone = zones[i]; | 982 | struct zone *zone = zones[i]; |
@@ -972,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
972 | } | 989 | } |
973 | 990 | ||
974 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
975 | sc.nr_mapped = read_page_state(nr_mapped); | ||
976 | sc.nr_scanned = 0; | 992 | sc.nr_scanned = 0; |
977 | if (!priority) | 993 | if (!priority) |
978 | disable_swap_token(); | 994 | disable_swap_token(); |
@@ -1021,10 +1037,6 @@ out: | |||
1021 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1037 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1022 | * they are all at pages_high. | 1038 | * they are all at pages_high. |
1023 | * | 1039 | * |
1024 | * If `nr_pages' is non-zero then it is the number of pages which are to be | ||
1025 | * reclaimed, regardless of the zone occupancies. This is a software suspend | ||
1026 | * special. | ||
1027 | * | ||
1028 | * Returns the number of pages which were actually freed. | 1040 | * Returns the number of pages which were actually freed. |
1029 | * | 1041 | * |
1030 | * There is special handling here for zones which are full of pinned pages. | 1042 | * There is special handling here for zones which are full of pinned pages. |
@@ -1042,10 +1054,8 @@ out: | |||
1042 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1054 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
1043 | * across the zones. | 1055 | * across the zones. |
1044 | */ | 1056 | */ |
1045 | static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, | 1057 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1046 | int order) | ||
1047 | { | 1058 | { |
1048 | unsigned long to_free = nr_pages; | ||
1049 | int all_zones_ok; | 1059 | int all_zones_ok; |
1050 | int priority; | 1060 | int priority; |
1051 | int i; | 1061 | int i; |
@@ -1055,16 +1065,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, | |||
1055 | struct scan_control sc = { | 1065 | struct scan_control sc = { |
1056 | .gfp_mask = GFP_KERNEL, | 1066 | .gfp_mask = GFP_KERNEL, |
1057 | .may_swap = 1, | 1067 | .may_swap = 1, |
1058 | .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, | 1068 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1069 | .swappiness = vm_swappiness, | ||
1059 | }; | 1070 | }; |
1060 | 1071 | ||
1061 | loop_again: | 1072 | loop_again: |
1062 | total_scanned = 0; | 1073 | total_scanned = 0; |
1063 | nr_reclaimed = 0; | 1074 | nr_reclaimed = 0; |
1064 | sc.may_writepage = !laptop_mode; | 1075 | sc.may_writepage = !laptop_mode; |
1065 | sc.nr_mapped = read_page_state(nr_mapped); | 1076 | count_vm_event(PAGEOUTRUN); |
1066 | |||
1067 | inc_page_state(pageoutrun); | ||
1068 | 1077 | ||
1069 | for (i = 0; i < pgdat->nr_zones; i++) { | 1078 | for (i = 0; i < pgdat->nr_zones; i++) { |
1070 | struct zone *zone = pgdat->node_zones + i; | 1079 | struct zone *zone = pgdat->node_zones + i; |
@@ -1082,31 +1091,26 @@ loop_again: | |||
1082 | 1091 | ||
1083 | all_zones_ok = 1; | 1092 | all_zones_ok = 1; |
1084 | 1093 | ||
1085 | if (nr_pages == 0) { | 1094 | /* |
1086 | /* | 1095 | * Scan in the highmem->dma direction for the highest |
1087 | * Scan in the highmem->dma direction for the highest | 1096 | * zone which needs scanning |
1088 | * zone which needs scanning | 1097 | */ |
1089 | */ | 1098 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
1090 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 1099 | struct zone *zone = pgdat->node_zones + i; |
1091 | struct zone *zone = pgdat->node_zones + i; | ||
1092 | 1100 | ||
1093 | if (!populated_zone(zone)) | 1101 | if (!populated_zone(zone)) |
1094 | continue; | 1102 | continue; |
1095 | 1103 | ||
1096 | if (zone->all_unreclaimable && | 1104 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1097 | priority != DEF_PRIORITY) | 1105 | continue; |
1098 | continue; | ||
1099 | 1106 | ||
1100 | if (!zone_watermark_ok(zone, order, | 1107 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1101 | zone->pages_high, 0, 0)) { | 1108 | 0, 0)) { |
1102 | end_zone = i; | 1109 | end_zone = i; |
1103 | goto scan; | 1110 | goto scan; |
1104 | } | ||
1105 | } | 1111 | } |
1106 | goto out; | ||
1107 | } else { | ||
1108 | end_zone = pgdat->nr_zones - 1; | ||
1109 | } | 1112 | } |
1113 | goto out; | ||
1110 | scan: | 1114 | scan: |
1111 | for (i = 0; i <= end_zone; i++) { | 1115 | for (i = 0; i <= end_zone; i++) { |
1112 | struct zone *zone = pgdat->node_zones + i; | 1116 | struct zone *zone = pgdat->node_zones + i; |
@@ -1133,11 +1137,9 @@ scan: | |||
1133 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1137 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1134 | continue; | 1138 | continue; |
1135 | 1139 | ||
1136 | if (nr_pages == 0) { /* Not software suspend */ | 1140 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1137 | if (!zone_watermark_ok(zone, order, | 1141 | end_zone, 0)) |
1138 | zone->pages_high, end_zone, 0)) | 1142 | all_zones_ok = 0; |
1139 | all_zones_ok = 0; | ||
1140 | } | ||
1141 | zone->temp_priority = priority; | 1143 | zone->temp_priority = priority; |
1142 | if (zone->prev_priority > priority) | 1144 | if (zone->prev_priority > priority) |
1143 | zone->prev_priority = priority; | 1145 | zone->prev_priority = priority; |
@@ -1162,8 +1164,6 @@ scan: | |||
1162 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1164 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
1163 | sc.may_writepage = 1; | 1165 | sc.may_writepage = 1; |
1164 | } | 1166 | } |
1165 | if (nr_pages && to_free > nr_reclaimed) | ||
1166 | continue; /* swsusp: need to do more work */ | ||
1167 | if (all_zones_ok) | 1167 | if (all_zones_ok) |
1168 | break; /* kswapd: all done */ | 1168 | break; /* kswapd: all done */ |
1169 | /* | 1169 | /* |
@@ -1179,7 +1179,7 @@ scan: | |||
1179 | * matches the direct reclaim path behaviour in terms of impact | 1179 | * matches the direct reclaim path behaviour in terms of impact |
1180 | * on zone->*_priority. | 1180 | * on zone->*_priority. |
1181 | */ | 1181 | */ |
1182 | if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) | 1182 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) |
1183 | break; | 1183 | break; |
1184 | } | 1184 | } |
1185 | out: | 1185 | out: |
@@ -1220,7 +1220,6 @@ static int kswapd(void *p) | |||
1220 | }; | 1220 | }; |
1221 | cpumask_t cpumask; | 1221 | cpumask_t cpumask; |
1222 | 1222 | ||
1223 | daemonize("kswapd%d", pgdat->node_id); | ||
1224 | cpumask = node_to_cpumask(pgdat->node_id); | 1223 | cpumask = node_to_cpumask(pgdat->node_id); |
1225 | if (!cpus_empty(cpumask)) | 1224 | if (!cpus_empty(cpumask)) |
1226 | set_cpus_allowed(tsk, cpumask); | 1225 | set_cpus_allowed(tsk, cpumask); |
@@ -1261,7 +1260,7 @@ static int kswapd(void *p) | |||
1261 | } | 1260 | } |
1262 | finish_wait(&pgdat->kswapd_wait, &wait); | 1261 | finish_wait(&pgdat->kswapd_wait, &wait); |
1263 | 1262 | ||
1264 | balance_pgdat(pgdat, 0, order); | 1263 | balance_pgdat(pgdat, order); |
1265 | } | 1264 | } |
1266 | return 0; | 1265 | return 0; |
1267 | } | 1266 | } |
@@ -1290,35 +1289,152 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1290 | 1289 | ||
1291 | #ifdef CONFIG_PM | 1290 | #ifdef CONFIG_PM |
1292 | /* | 1291 | /* |
1293 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | 1292 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
1294 | * pages. | 1293 | * from LRU lists system-wide, for given pass and priority, and returns the |
1294 | * number of reclaimed pages | ||
1295 | * | ||
1296 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
1297 | */ | ||
1298 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | ||
1299 | int prio, struct scan_control *sc) | ||
1300 | { | ||
1301 | struct zone *zone; | ||
1302 | unsigned long nr_to_scan, ret = 0; | ||
1303 | |||
1304 | for_each_zone(zone) { | ||
1305 | |||
1306 | if (!populated_zone(zone)) | ||
1307 | continue; | ||
1308 | |||
1309 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | ||
1310 | continue; | ||
1311 | |||
1312 | /* For pass = 0 we don't shrink the active list */ | ||
1313 | if (pass > 0) { | ||
1314 | zone->nr_scan_active += (zone->nr_active >> prio) + 1; | ||
1315 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | ||
1316 | zone->nr_scan_active = 0; | ||
1317 | nr_to_scan = min(nr_pages, zone->nr_active); | ||
1318 | shrink_active_list(nr_to_scan, zone, sc); | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; | ||
1323 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { | ||
1324 | zone->nr_scan_inactive = 0; | ||
1325 | nr_to_scan = min(nr_pages, zone->nr_inactive); | ||
1326 | ret += shrink_inactive_list(nr_to_scan, zone, sc); | ||
1327 | if (ret >= nr_pages) | ||
1328 | return ret; | ||
1329 | } | ||
1330 | } | ||
1331 | |||
1332 | return ret; | ||
1333 | } | ||
1334 | |||
1335 | /* | ||
1336 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
1337 | * freed pages. | ||
1338 | * | ||
1339 | * Rather than trying to age LRUs the aim is to preserve the overall | ||
1340 | * LRU order by reclaiming preferentially | ||
1341 | * inactive > active > active referenced > active mapped | ||
1295 | */ | 1342 | */ |
1296 | unsigned long shrink_all_memory(unsigned long nr_pages) | 1343 | unsigned long shrink_all_memory(unsigned long nr_pages) |
1297 | { | 1344 | { |
1298 | pg_data_t *pgdat; | 1345 | unsigned long lru_pages, nr_slab; |
1299 | unsigned long nr_to_free = nr_pages; | ||
1300 | unsigned long ret = 0; | 1346 | unsigned long ret = 0; |
1301 | unsigned retry = 2; | 1347 | int pass; |
1302 | struct reclaim_state reclaim_state = { | 1348 | struct reclaim_state reclaim_state; |
1303 | .reclaimed_slab = 0, | 1349 | struct zone *zone; |
1350 | struct scan_control sc = { | ||
1351 | .gfp_mask = GFP_KERNEL, | ||
1352 | .may_swap = 0, | ||
1353 | .swap_cluster_max = nr_pages, | ||
1354 | .may_writepage = 1, | ||
1355 | .swappiness = vm_swappiness, | ||
1304 | }; | 1356 | }; |
1305 | 1357 | ||
1306 | current->reclaim_state = &reclaim_state; | 1358 | current->reclaim_state = &reclaim_state; |
1307 | repeat: | 1359 | |
1308 | for_each_online_pgdat(pgdat) { | 1360 | lru_pages = 0; |
1309 | unsigned long freed; | 1361 | for_each_zone(zone) |
1310 | 1362 | lru_pages += zone->nr_active + zone->nr_inactive; | |
1311 | freed = balance_pgdat(pgdat, nr_to_free, 0); | 1363 | |
1312 | ret += freed; | 1364 | nr_slab = global_page_state(NR_SLAB); |
1313 | nr_to_free -= freed; | 1365 | /* If slab caches are huge, it's better to hit them first */ |
1314 | if ((long)nr_to_free <= 0) | 1366 | while (nr_slab >= lru_pages) { |
1367 | reclaim_state.reclaimed_slab = 0; | ||
1368 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
1369 | if (!reclaim_state.reclaimed_slab) | ||
1315 | break; | 1370 | break; |
1371 | |||
1372 | ret += reclaim_state.reclaimed_slab; | ||
1373 | if (ret >= nr_pages) | ||
1374 | goto out; | ||
1375 | |||
1376 | nr_slab -= reclaim_state.reclaimed_slab; | ||
1316 | } | 1377 | } |
1317 | if (retry-- && ret < nr_pages) { | 1378 | |
1318 | blk_congestion_wait(WRITE, HZ/5); | 1379 | /* |
1319 | goto repeat; | 1380 | * We try to shrink LRUs in 5 passes: |
1381 | * 0 = Reclaim from inactive_list only | ||
1382 | * 1 = Reclaim from active list but don't reclaim mapped | ||
1383 | * 2 = 2nd pass of type 1 | ||
1384 | * 3 = Reclaim mapped (normal reclaim) | ||
1385 | * 4 = 2nd pass of type 3 | ||
1386 | */ | ||
1387 | for (pass = 0; pass < 5; pass++) { | ||
1388 | int prio; | ||
1389 | |||
1390 | /* Needed for shrinking slab caches later on */ | ||
1391 | if (!lru_pages) | ||
1392 | for_each_zone(zone) { | ||
1393 | lru_pages += zone->nr_active; | ||
1394 | lru_pages += zone->nr_inactive; | ||
1395 | } | ||
1396 | |||
1397 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
1398 | if (pass > 2) { | ||
1399 | sc.may_swap = 1; | ||
1400 | sc.swappiness = 100; | ||
1401 | } | ||
1402 | |||
1403 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
1404 | unsigned long nr_to_scan = nr_pages - ret; | ||
1405 | |||
1406 | sc.nr_scanned = 0; | ||
1407 | ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
1408 | if (ret >= nr_pages) | ||
1409 | goto out; | ||
1410 | |||
1411 | reclaim_state.reclaimed_slab = 0; | ||
1412 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); | ||
1413 | ret += reclaim_state.reclaimed_slab; | ||
1414 | if (ret >= nr_pages) | ||
1415 | goto out; | ||
1416 | |||
1417 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
1418 | blk_congestion_wait(WRITE, HZ / 10); | ||
1419 | } | ||
1420 | |||
1421 | lru_pages = 0; | ||
1320 | } | 1422 | } |
1423 | |||
1424 | /* | ||
1425 | * If ret = 0, we could not shrink LRUs, but there may be something | ||
1426 | * in slab caches | ||
1427 | */ | ||
1428 | if (!ret) | ||
1429 | do { | ||
1430 | reclaim_state.reclaimed_slab = 0; | ||
1431 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
1432 | ret += reclaim_state.reclaimed_slab; | ||
1433 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | ||
1434 | |||
1435 | out: | ||
1321 | current->reclaim_state = NULL; | 1436 | current->reclaim_state = NULL; |
1437 | |||
1322 | return ret; | 1438 | return ret; |
1323 | } | 1439 | } |
1324 | #endif | 1440 | #endif |
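Note: the rewritten shrink_all_memory() above no longer funnels through balance_pgdat(); it first knocks down oversized slab caches, then makes five passes over the LRU lists, each pass walking priorities from DEF_PRIORITY down to 0 and shrinking slab in step with LRU scanning, with mapped pages only reclaimed in passes 3 and 4. The standalone sketch below shows just that control-flow shape; reclaim_lru() and reclaim_slab() are hypothetical stand-ins for shrink_all_zones() and shrink_slab().

	#include <stdio.h>

	#define DEF_PRIORITY 12

	/* Hypothetical stand-ins for shrink_all_zones() and shrink_slab(). */
	static unsigned long reclaim_lru(unsigned long want, int pass, int prio)
	{
		return want / (prio + 2);
	}

	static unsigned long reclaim_slab(void)
	{
		return 16;
	}

	int main(void)
	{
		unsigned long nr_pages = 1024, ret = 0;
		int pass, prio;

		for (pass = 0; pass < 5 && ret < nr_pages; pass++) {
			/* Passes 0-2 leave mapped pages alone; passes 3-4 force
			 * them out too (sc.may_swap = 1, sc.swappiness = 100). */
			for (prio = DEF_PRIORITY; prio >= 0 && ret < nr_pages; prio--) {
				ret += reclaim_lru(nr_pages - ret, pass, prio);
				ret += reclaim_slab();
			}
		}
		printf("reclaimed %lu of %lu pages\n", ret, nr_pages);
		return 0;
	}
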
@@ -1328,7 +1444,7 @@ repeat: | |||
1328 | not required for correctness. So if the last cpu in a node goes | 1444 | not required for correctness. So if the last cpu in a node goes |
1329 | away, we get changed to run anywhere: as the first one comes back, | 1445 | away, we get changed to run anywhere: as the first one comes back, |
1330 | restore their cpu bindings. */ | 1446 | restore their cpu bindings. */ |
1331 | static int cpu_callback(struct notifier_block *nfb, | 1447 | static int __devinit cpu_callback(struct notifier_block *nfb, |
1332 | unsigned long action, void *hcpu) | 1448 | unsigned long action, void *hcpu) |
1333 | { | 1449 | { |
1334 | pg_data_t *pgdat; | 1450 | pg_data_t *pgdat; |
@@ -1346,21 +1462,35 @@ static int cpu_callback(struct notifier_block *nfb, | |||
1346 | } | 1462 | } |
1347 | #endif /* CONFIG_HOTPLUG_CPU */ | 1463 | #endif /* CONFIG_HOTPLUG_CPU */ |
1348 | 1464 | ||
1465 | /* | ||
1466 | * This kswapd start function will be called by init and node-hot-add. | ||
1467 | * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added. | ||
1468 | */ | ||
1469 | int kswapd_run(int nid) | ||
1470 | { | ||
1471 | pg_data_t *pgdat = NODE_DATA(nid); | ||
1472 | int ret = 0; | ||
1473 | |||
1474 | if (pgdat->kswapd) | ||
1475 | return 0; | ||
1476 | |||
1477 | pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); | ||
1478 | if (IS_ERR(pgdat->kswapd)) { | ||
1479 | /* failure at boot is fatal */ | ||
1480 | BUG_ON(system_state == SYSTEM_BOOTING); | ||
1481 | printk("Failed to start kswapd on node %d\n",nid); | ||
1482 | ret = -1; | ||
1483 | } | ||
1484 | return ret; | ||
1485 | } | ||
1486 | |||
1349 | static int __init kswapd_init(void) | 1487 | static int __init kswapd_init(void) |
1350 | { | 1488 | { |
1351 | pg_data_t *pgdat; | 1489 | int nid; |
1352 | 1490 | ||
1353 | swap_setup(); | 1491 | swap_setup(); |
1354 | for_each_online_pgdat(pgdat) { | 1492 | for_each_online_node(nid) |
1355 | pid_t pid; | 1493 | kswapd_run(nid); |
1356 | |||
1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | ||
1358 | BUG_ON(pid < 0); | ||
1359 | read_lock(&tasklist_lock); | ||
1360 | pgdat->kswapd = find_task_by_pid(pid); | ||
1361 | read_unlock(&tasklist_lock); | ||
1362 | } | ||
1363 | total_memory = nr_free_pagecache_pages(); | ||
1364 | hotcpu_notifier(cpu_callback, 0); | 1494 | hotcpu_notifier(cpu_callback, 0); |
1365 | return 0; | 1495 | return 0; |
1366 | } | 1496 | } |
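Note: kswapd is now started via the kthread API (kthread_run() in kswapd_run(), callable from both boot and node hot-add) instead of kernel_thread() plus daemonize() and a tasklist walk to look up the pid. The sketch below shows the general kthread_run()/kthread_should_stop() pattern for a hypothetical worker thread; it is not the kswapd code itself.

	#include <linux/kthread.h>
	#include <linux/err.h>
	#include <linux/delay.h>

	static struct task_struct *my_task;    /* hypothetical example thread */

	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			/* ... periodic work ... */
			msleep(1000);
		}
		return 0;
	}

	static int __init my_start(void)
	{
		my_task = kthread_run(my_thread_fn, NULL, "mythread/%d", 0);
		if (IS_ERR(my_task))
			return PTR_ERR(my_task);
		return 0;
	}
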
@@ -1387,11 +1517,6 @@ int zone_reclaim_mode __read_mostly; | |||
1387 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ | 1517 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ |
1388 | 1518 | ||
1389 | /* | 1519 | /* |
1390 | * Mininum time between zone reclaim scans | ||
1391 | */ | ||
1392 | int zone_reclaim_interval __read_mostly = 30*HZ; | ||
1393 | |||
1394 | /* | ||
1395 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 1520 | * Priority for ZONE_RECLAIM. This determines the fraction of pages |
1396 | * of a node considered for each zone_reclaim. 4 scans 1/16th of | 1521 | * of a node considered for each zone_reclaim. 4 scans 1/16th of |
1397 | * a zone. | 1522 | * a zone. |
@@ -1412,10 +1537,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1412 | struct scan_control sc = { | 1537 | struct scan_control sc = { |
1413 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 1538 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
1414 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 1539 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
1415 | .nr_mapped = read_page_state(nr_mapped), | ||
1416 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 1540 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
1417 | SWAP_CLUSTER_MAX), | 1541 | SWAP_CLUSTER_MAX), |
1418 | .gfp_mask = gfp_mask, | 1542 | .gfp_mask = gfp_mask, |
1543 | .swappiness = vm_swappiness, | ||
1419 | }; | 1544 | }; |
1420 | 1545 | ||
1421 | disable_swap_token(); | 1546 | disable_swap_token(); |
@@ -1456,16 +1581,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1456 | 1581 | ||
1457 | p->reclaim_state = NULL; | 1582 | p->reclaim_state = NULL; |
1458 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 1583 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
1459 | |||
1460 | if (nr_reclaimed == 0) { | ||
1461 | /* | ||
1462 | * We were unable to reclaim enough pages to stay on node. We | ||
1463 | * now allow off node accesses for a certain time period before | ||
1464 | * trying again to reclaim pages from the local zone. | ||
1465 | */ | ||
1466 | zone->last_unsuccessful_zone_reclaim = jiffies; | ||
1467 | } | ||
1468 | |||
1469 | return nr_reclaimed >= nr_pages; | 1584 | return nr_reclaimed >= nr_pages; |
1470 | } | 1585 | } |
1471 | 1586 | ||
@@ -1475,13 +1590,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1475 | int node_id; | 1590 | int node_id; |
1476 | 1591 | ||
1477 | /* | 1592 | /* |
1478 | * Do not reclaim if there was a recent unsuccessful attempt at zone | 1593 | * Do not reclaim if there are not enough reclaimable pages in this |
1479 | * reclaim. In that case we let allocations go off node for the | 1594 | * zone that would satify this allocations. |
1480 | * zone_reclaim_interval. Otherwise we would scan for each off-node | 1595 | * |
1481 | * page allocation. | 1596 | * All unmapped pagecache pages are reclaimable. |
1597 | * | ||
1598 | * Both counters may be temporarily off a bit so we use | ||
1599 | * SWAP_CLUSTER_MAX as the boundary. It may also be good to | ||
1600 | * leave a few frequently used unmapped pagecache pages around. | ||
1482 | */ | 1601 | */ |
1483 | if (time_before(jiffies, | 1602 | if (zone_page_state(zone, NR_FILE_PAGES) - |
1484 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1603 | zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) |
1485 | return 0; | 1604 | return 0; |
1486 | 1605 | ||
1487 | /* | 1606 | /* |
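Note: the time-based back-off (zone_reclaim_interval, removed above) is replaced by a direct estimate of what zone_reclaim() could actually free: unmapped pagecache, approximated as NR_FILE_PAGES minus NR_FILE_MAPPED, with SWAP_CLUSTER_MAX as slack for per-CPU counter drift. A worked example with made-up counter values:

	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32

	int main(void)
	{
		/* Hypothetical per-zone counters, in pages. */
		long nr_file_pages  = 5000;   /* all pagecache in the zone      */
		long nr_file_mapped = 4990;   /* pagecache mapped into ptes     */

		long unmapped = nr_file_pages - nr_file_mapped;   /* 10 */

		/* 10 < 32: zone_reclaim() gives up immediately and the
		 * allocation falls back to another zone/node instead. */
		printf("unmapped=%ld -> %s\n", unmapped,
		       unmapped < SWAP_CLUSTER_MAX ? "skip zone_reclaim" : "reclaim");
		return 0;
	}
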
diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 000000000000..73b83d67bab6 --- /dev/null +++ b/mm/vmstat.c | |||
@@ -0,0 +1,614 @@ | |||
1 | /* | ||
2 | * linux/mm/vmstat.c | ||
3 | * | ||
4 | * Manages VM statistics | ||
5 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
6 | * | ||
7 | * zoned VM statistics | ||
8 | * Copyright (C) 2006 Silicon Graphics, Inc., | ||
9 | * Christoph Lameter <christoph@lameter.com> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/module.h> | ||
15 | |||
16 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
17 | unsigned long *free, struct pglist_data *pgdat) | ||
18 | { | ||
19 | struct zone *zones = pgdat->node_zones; | ||
20 | int i; | ||
21 | |||
22 | *active = 0; | ||
23 | *inactive = 0; | ||
24 | *free = 0; | ||
25 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
26 | *active += zones[i].nr_active; | ||
27 | *inactive += zones[i].nr_inactive; | ||
28 | *free += zones[i].free_pages; | ||
29 | } | ||
30 | } | ||
31 | |||
32 | void get_zone_counts(unsigned long *active, | ||
33 | unsigned long *inactive, unsigned long *free) | ||
34 | { | ||
35 | struct pglist_data *pgdat; | ||
36 | |||
37 | *active = 0; | ||
38 | *inactive = 0; | ||
39 | *free = 0; | ||
40 | for_each_online_pgdat(pgdat) { | ||
41 | unsigned long l, m, n; | ||
42 | __get_zone_counts(&l, &m, &n, pgdat); | ||
43 | *active += l; | ||
44 | *inactive += m; | ||
45 | *free += n; | ||
46 | } | ||
47 | } | ||
48 | |||
49 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
50 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | ||
51 | EXPORT_PER_CPU_SYMBOL(vm_event_states); | ||
52 | |||
53 | static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | ||
54 | { | ||
55 | int cpu = 0; | ||
56 | int i; | ||
57 | |||
58 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); | ||
59 | |||
60 | cpu = first_cpu(*cpumask); | ||
61 | while (cpu < NR_CPUS) { | ||
62 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); | ||
63 | |||
64 | cpu = next_cpu(cpu, *cpumask); | ||
65 | |||
66 | if (cpu < NR_CPUS) | ||
67 | prefetch(&per_cpu(vm_event_states, cpu)); | ||
68 | |||
69 | |||
70 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) | ||
71 | ret[i] += this->event[i]; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Accumulate the vm event counters across all CPUs. | ||
77 | * The result is unavoidably approximate - it can change | ||
78 | * during and after execution of this function. | ||
79 | */ | ||
80 | void all_vm_events(unsigned long *ret) | ||
81 | { | ||
82 | sum_vm_events(ret, &cpu_online_map); | ||
83 | } | ||
84 | |||
85 | #ifdef CONFIG_HOTPLUG | ||
86 | /* | ||
87 | * Fold the foreign cpu events into our own. | ||
88 | * | ||
89 | * This is adding to the events on one processor | ||
90 | * but keeps the global counts constant. | ||
91 | */ | ||
92 | void vm_events_fold_cpu(int cpu) | ||
93 | { | ||
94 | struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); | ||
95 | int i; | ||
96 | |||
97 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { | ||
98 | count_vm_events(i, fold_state->event[i]); | ||
99 | fold_state->event[i] = 0; | ||
100 | } | ||
101 | } | ||
102 | #endif /* CONFIG_HOTPLUG */ | ||
103 | |||
104 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | ||
105 | |||
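Note: sum_vm_events() above walks the online-CPU mask, prefetching the next CPU's vm_event_states while accumulating the current one, and vm_events_fold_cpu() moves a dead CPU's counts onto the running CPU so the global sums stay constant across hotplug. A minimal standalone sketch of the summation idea, with plain arrays standing in for per-CPU data:

	#include <stdio.h>
	#include <string.h>

	#define NR_CPUS   4
	#define NR_EVENTS 3

	/* Stand-in for the per-CPU vm_event_states. */
	static unsigned long events[NR_CPUS][NR_EVENTS] = {
		{ 5, 1, 0 }, { 7, 0, 2 }, { 3, 4, 1 }, { 0, 0, 0 },
	};

	static void sum_events(unsigned long *ret)
	{
		int cpu, i;

		memset(ret, 0, NR_EVENTS * sizeof(*ret));
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			for (i = 0; i < NR_EVENTS; i++)
				ret[i] += events[cpu][i];
	}

	int main(void)
	{
		unsigned long total[NR_EVENTS];

		sum_events(total);
		printf("%lu %lu %lu\n", total[0], total[1], total[2]);
		return 0;
	}
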
106 | /* | ||
107 | * Manage combined zone based / global counters | ||
108 | * | ||
109 | * vm_stat contains the global counters | ||
110 | */ | ||
111 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | ||
112 | EXPORT_SYMBOL(vm_stat); | ||
113 | |||
114 | #ifdef CONFIG_SMP | ||
115 | |||
116 | #define STAT_THRESHOLD 32 | ||
117 | |||
118 | /* | ||
119 | * Determine pointer to currently valid differential byte given a zone and | ||
120 | * the item number. | ||
121 | * | ||
122 | * Preemption must be off | ||
123 | */ | ||
124 | static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) | ||
125 | { | ||
126 | return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * For use when we know that interrupts are disabled. | ||
131 | */ | ||
132 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
133 | int delta) | ||
134 | { | ||
135 | s8 *p; | ||
136 | long x; | ||
137 | |||
138 | p = diff_pointer(zone, item); | ||
139 | x = delta + *p; | ||
140 | |||
141 | if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { | ||
142 | zone_page_state_add(x, zone, item); | ||
143 | x = 0; | ||
144 | } | ||
145 | |||
146 | *p = x; | ||
147 | } | ||
148 | EXPORT_SYMBOL(__mod_zone_page_state); | ||
149 | |||
150 | /* | ||
151 | * For an unknown interrupt state | ||
152 | */ | ||
153 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
154 | int delta) | ||
155 | { | ||
156 | unsigned long flags; | ||
157 | |||
158 | local_irq_save(flags); | ||
159 | __mod_zone_page_state(zone, item, delta); | ||
160 | local_irq_restore(flags); | ||
161 | } | ||
162 | EXPORT_SYMBOL(mod_zone_page_state); | ||
163 | |||
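Note: the zoned counters keep a small signed per-CPU delta (vm_stat_diff) next to each counter and only fold it into the global atomic once it crosses STAT_THRESHOLD, so the common case is a cheap per-CPU byte update with interrupts (or preemption) off. Below is a standalone sketch of that batching scheme, with a plain long standing in for the atomic vm_stat[] entry; it is an illustration, not the kernel code.

	#include <stdio.h>

	#define STAT_THRESHOLD 32

	static long global_counter;    /* stands in for the atomic vm_stat[] entry */
	static signed char cpu_diff;   /* stands in for the per-CPU vm_stat_diff[] */

	static void mod_counter(int delta)
	{
		long x = cpu_diff + delta;

		if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
			global_counter += x;   /* fold the batched delta */
			x = 0;
		}
		cpu_diff = x;
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 100; i++)
			mod_counter(1);
		/* The global value alone lags slightly; exact = global + pending. */
		printf("global=%ld pending=%d exact=%ld\n",
		       global_counter, cpu_diff, global_counter + cpu_diff);
		return 0;
	}
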
164 | /* | ||
165 | * Optimized increment and decrement functions. | ||
166 | * | ||
167 | * These are only for a single page and therefore can take a struct page * | ||
168 | * argument instead of struct zone *. This allows the inclusion of the code | ||
169 | * generated for page_zone(page) into the optimized functions. | ||
170 | * | ||
171 | * No overflow check is necessary and therefore the differential can be | ||
172 | * incremented or decremented in place which may allow the compilers to | ||
173 | * generate better code. | ||
174 | * | ||
175 | * The increment or decrement is known and therefore one boundary check can | ||
176 | * be omitted. | ||
177 | * | ||
178 | * Some processors have inc/dec instructions that are atomic vs an interrupt. | ||
179 | * However, the code must first determine the differential location in a zone | ||
180 | * based on the processor number and then inc/dec the counter. There is no | ||
181 | * guarantee without disabling preemption that the processor will not change | ||
182 | * in between and therefore the atomicity vs. interrupt cannot be exploited | ||
183 | * in a useful way here. | ||
184 | */ | ||
185 | static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
186 | { | ||
187 | s8 *p = diff_pointer(zone, item); | ||
188 | |||
189 | (*p)++; | ||
190 | |||
191 | if (unlikely(*p > STAT_THRESHOLD)) { | ||
192 | zone_page_state_add(*p, zone, item); | ||
193 | *p = 0; | ||
194 | } | ||
195 | } | ||
196 | |||
197 | void __inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
198 | { | ||
199 | __inc_zone_state(page_zone(page), item); | ||
200 | } | ||
201 | EXPORT_SYMBOL(__inc_zone_page_state); | ||
202 | |||
203 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
204 | { | ||
205 | struct zone *zone = page_zone(page); | ||
206 | s8 *p = diff_pointer(zone, item); | ||
207 | |||
208 | (*p)--; | ||
209 | |||
210 | if (unlikely(*p < -STAT_THRESHOLD)) { | ||
211 | zone_page_state_add(*p, zone, item); | ||
212 | *p = 0; | ||
213 | } | ||
214 | } | ||
215 | EXPORT_SYMBOL(__dec_zone_page_state); | ||
216 | |||
217 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | |||
221 | local_irq_save(flags); | ||
222 | __inc_zone_state(zone, item); | ||
223 | local_irq_restore(flags); | ||
224 | } | ||
225 | |||
226 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
227 | { | ||
228 | unsigned long flags; | ||
229 | struct zone *zone; | ||
230 | |||
231 | zone = page_zone(page); | ||
232 | local_irq_save(flags); | ||
233 | __inc_zone_state(zone, item); | ||
234 | local_irq_restore(flags); | ||
235 | } | ||
236 | EXPORT_SYMBOL(inc_zone_page_state); | ||
237 | |||
238 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
239 | { | ||
240 | unsigned long flags; | ||
241 | struct zone *zone; | ||
242 | s8 *p; | ||
243 | |||
244 | zone = page_zone(page); | ||
245 | local_irq_save(flags); | ||
246 | p = diff_pointer(zone, item); | ||
247 | |||
248 | (*p)--; | ||
249 | |||
250 | if (unlikely(*p < -STAT_THRESHOLD)) { | ||
251 | zone_page_state_add(*p, zone, item); | ||
252 | *p = 0; | ||
253 | } | ||
254 | local_irq_restore(flags); | ||
255 | } | ||
256 | EXPORT_SYMBOL(dec_zone_page_state); | ||
257 | |||
258 | /* | ||
259 | * Update the zone counters for one cpu. | ||
260 | */ | ||
261 | void refresh_cpu_vm_stats(int cpu) | ||
262 | { | ||
263 | struct zone *zone; | ||
264 | int i; | ||
265 | unsigned long flags; | ||
266 | |||
267 | for_each_zone(zone) { | ||
268 | struct per_cpu_pageset *pcp; | ||
269 | |||
270 | pcp = zone_pcp(zone, cpu); | ||
271 | |||
272 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
273 | if (pcp->vm_stat_diff[i]) { | ||
274 | local_irq_save(flags); | ||
275 | zone_page_state_add(pcp->vm_stat_diff[i], | ||
276 | zone, i); | ||
277 | pcp->vm_stat_diff[i] = 0; | ||
278 | local_irq_restore(flags); | ||
279 | } | ||
280 | } | ||
281 | } | ||
282 | |||
283 | static void __refresh_cpu_vm_stats(void *dummy) | ||
284 | { | ||
285 | refresh_cpu_vm_stats(smp_processor_id()); | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Consolidate all counters. | ||
290 | * | ||
291 | * Note that the result is less inaccurate but still inaccurate | ||
292 | * if concurrent processes are allowed to run. | ||
293 | */ | ||
294 | void refresh_vm_stats(void) | ||
295 | { | ||
296 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
297 | } | ||
298 | EXPORT_SYMBOL(refresh_vm_stats); | ||
299 | |||
300 | #endif | ||
301 | |||
302 | #ifdef CONFIG_NUMA | ||
303 | /* | ||
304 | * zonelist = the list of zones passed to the allocator | ||
305 | * z = the zone from which the allocation occurred. | ||
306 | * | ||
307 | * Must be called with interrupts disabled. | ||
308 | */ | ||
309 | void zone_statistics(struct zonelist *zonelist, struct zone *z) | ||
310 | { | ||
311 | if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { | ||
312 | __inc_zone_state(z, NUMA_HIT); | ||
313 | } else { | ||
314 | __inc_zone_state(z, NUMA_MISS); | ||
315 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); | ||
316 | } | ||
317 | if (z->zone_pgdat == NODE_DATA(numa_node_id())) | ||
318 | __inc_zone_state(z, NUMA_LOCAL); | ||
319 | else | ||
320 | __inc_zone_state(z, NUMA_OTHER); | ||
321 | } | ||
322 | #endif | ||
323 | |||
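Note: zone_statistics() above classifies every allocation twice: NUMA_HIT vs NUMA_MISS (with NUMA_FOREIGN charged to the preferred zone) depending on whether the page came from the node the zonelist preferred, and NUMA_LOCAL vs NUMA_OTHER depending on whether it came from the node the allocating CPU sits on. A tiny standalone sketch of that classification, with node numbers as stand-ins for the pgdat comparisons:

	#include <stdio.h>

	/* Illustrative classification mirroring zone_statistics() above. */
	static void classify(int preferred_node, int alloc_node, int cpu_node)
	{
		printf("pref=%d alloc=%d cpu=%d: %s, %s\n",
		       preferred_node, alloc_node, cpu_node,
		       alloc_node == preferred_node ? "NUMA_HIT"
						    : "NUMA_MISS (+NUMA_FOREIGN on pref)",
		       alloc_node == cpu_node ? "NUMA_LOCAL" : "NUMA_OTHER");
	}

	int main(void)
	{
		classify(0, 0, 0);   /* hit, local            */
		classify(0, 1, 0);   /* miss + foreign, other */
		classify(1, 1, 0);   /* hit, other            */
		return 0;
	}
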
324 | #ifdef CONFIG_PROC_FS | ||
325 | |||
326 | #include <linux/seq_file.h> | ||
327 | |||
328 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
329 | { | ||
330 | pg_data_t *pgdat; | ||
331 | loff_t node = *pos; | ||
332 | for (pgdat = first_online_pgdat(); | ||
333 | pgdat && node; | ||
334 | pgdat = next_online_pgdat(pgdat)) | ||
335 | --node; | ||
336 | |||
337 | return pgdat; | ||
338 | } | ||
339 | |||
340 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
341 | { | ||
342 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
343 | |||
344 | (*pos)++; | ||
345 | return next_online_pgdat(pgdat); | ||
346 | } | ||
347 | |||
348 | static void frag_stop(struct seq_file *m, void *arg) | ||
349 | { | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * This walks the free areas for each zone. | ||
354 | */ | ||
355 | static int frag_show(struct seq_file *m, void *arg) | ||
356 | { | ||
357 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
358 | struct zone *zone; | ||
359 | struct zone *node_zones = pgdat->node_zones; | ||
360 | unsigned long flags; | ||
361 | int order; | ||
362 | |||
363 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
364 | if (!populated_zone(zone)) | ||
365 | continue; | ||
366 | |||
367 | spin_lock_irqsave(&zone->lock, flags); | ||
368 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
369 | for (order = 0; order < MAX_ORDER; ++order) | ||
370 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
371 | spin_unlock_irqrestore(&zone->lock, flags); | ||
372 | seq_putc(m, '\n'); | ||
373 | } | ||
374 | return 0; | ||
375 | } | ||
376 | |||
377 | struct seq_operations fragmentation_op = { | ||
378 | .start = frag_start, | ||
379 | .next = frag_next, | ||
380 | .stop = frag_stop, | ||
381 | .show = frag_show, | ||
382 | }; | ||
383 | |||
384 | static char *vmstat_text[] = { | ||
385 | /* Zoned VM counters */ | ||
386 | "nr_anon_pages", | ||
387 | "nr_mapped", | ||
388 | "nr_file_pages", | ||
389 | "nr_slab", | ||
390 | "nr_page_table_pages", | ||
391 | "nr_dirty", | ||
392 | "nr_writeback", | ||
393 | "nr_unstable", | ||
394 | "nr_bounce", | ||
395 | |||
396 | #ifdef CONFIG_NUMA | ||
397 | "numa_hit", | ||
398 | "numa_miss", | ||
399 | "numa_foreign", | ||
400 | "numa_interleave", | ||
401 | "numa_local", | ||
402 | "numa_other", | ||
403 | #endif | ||
404 | |||
405 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
406 | "pgpgin", | ||
407 | "pgpgout", | ||
408 | "pswpin", | ||
409 | "pswpout", | ||
410 | |||
411 | "pgalloc_dma", | ||
412 | "pgalloc_dma32", | ||
413 | "pgalloc_normal", | ||
414 | "pgalloc_high", | ||
415 | |||
416 | "pgfree", | ||
417 | "pgactivate", | ||
418 | "pgdeactivate", | ||
419 | |||
420 | "pgfault", | ||
421 | "pgmajfault", | ||
422 | |||
423 | "pgrefill_dma", | ||
424 | "pgrefill_dma32", | ||
425 | "pgrefill_normal", | ||
426 | "pgrefill_high", | ||
427 | |||
428 | "pgsteal_dma", | ||
429 | "pgsteal_dma32", | ||
430 | "pgsteal_normal", | ||
431 | "pgsteal_high", | ||
432 | |||
433 | "pgscan_kswapd_dma", | ||
434 | "pgscan_kswapd_dma32", | ||
435 | "pgscan_kswapd_normal", | ||
436 | "pgscan_kswapd_high", | ||
437 | |||
438 | "pgscan_direct_dma", | ||
439 | "pgscan_direct_dma32", | ||
440 | "pgscan_direct_normal", | ||
441 | "pgscan_direct_high", | ||
442 | |||
443 | "pginodesteal", | ||
444 | "slabs_scanned", | ||
445 | "kswapd_steal", | ||
446 | "kswapd_inodesteal", | ||
447 | "pageoutrun", | ||
448 | "allocstall", | ||
449 | |||
450 | "pgrotated", | ||
451 | #endif | ||
452 | }; | ||
453 | |||
454 | /* | ||
455 | * Output information about zones in @pgdat. | ||
456 | */ | ||
457 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
458 | { | ||
459 | pg_data_t *pgdat = arg; | ||
460 | struct zone *zone; | ||
461 | struct zone *node_zones = pgdat->node_zones; | ||
462 | unsigned long flags; | ||
463 | |||
464 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
465 | int i; | ||
466 | |||
467 | if (!populated_zone(zone)) | ||
468 | continue; | ||
469 | |||
470 | spin_lock_irqsave(&zone->lock, flags); | ||
471 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | ||
472 | seq_printf(m, | ||
473 | "\n pages free %lu" | ||
474 | "\n min %lu" | ||
475 | "\n low %lu" | ||
476 | "\n high %lu" | ||
477 | "\n active %lu" | ||
478 | "\n inactive %lu" | ||
479 | "\n scanned %lu (a: %lu i: %lu)" | ||
480 | "\n spanned %lu" | ||
481 | "\n present %lu", | ||
482 | zone->free_pages, | ||
483 | zone->pages_min, | ||
484 | zone->pages_low, | ||
485 | zone->pages_high, | ||
486 | zone->nr_active, | ||
487 | zone->nr_inactive, | ||
488 | zone->pages_scanned, | ||
489 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
490 | zone->spanned_pages, | ||
491 | zone->present_pages); | ||
492 | |||
493 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
494 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | ||
495 | zone_page_state(zone, i)); | ||
496 | |||
497 | seq_printf(m, | ||
498 | "\n protection: (%lu", | ||
499 | zone->lowmem_reserve[0]); | ||
500 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
501 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
502 | seq_printf(m, | ||
503 | ")" | ||
504 | "\n pagesets"); | ||
505 | for_each_online_cpu(i) { | ||
506 | struct per_cpu_pageset *pageset; | ||
507 | int j; | ||
508 | |||
509 | pageset = zone_pcp(zone, i); | ||
510 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
511 | if (pageset->pcp[j].count) | ||
512 | break; | ||
513 | } | ||
514 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
515 | continue; | ||
516 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
517 | seq_printf(m, | ||
518 | "\n cpu: %i pcp: %i" | ||
519 | "\n count: %i" | ||
520 | "\n high: %i" | ||
521 | "\n batch: %i", | ||
522 | i, j, | ||
523 | pageset->pcp[j].count, | ||
524 | pageset->pcp[j].high, | ||
525 | pageset->pcp[j].batch); | ||
526 | } | ||
527 | } | ||
528 | seq_printf(m, | ||
529 | "\n all_unreclaimable: %u" | ||
530 | "\n prev_priority: %i" | ||
531 | "\n temp_priority: %i" | ||
532 | "\n start_pfn: %lu", | ||
533 | zone->all_unreclaimable, | ||
534 | zone->prev_priority, | ||
535 | zone->temp_priority, | ||
536 | zone->zone_start_pfn); | ||
537 | spin_unlock_irqrestore(&zone->lock, flags); | ||
538 | seq_putc(m, '\n'); | ||
539 | } | ||
540 | return 0; | ||
541 | } | ||
542 | |||
543 | struct seq_operations zoneinfo_op = { | ||
544 | .start = frag_start, /* iterate over all zones. The same as in | ||
545 | * fragmentation. */ | ||
546 | .next = frag_next, | ||
547 | .stop = frag_stop, | ||
548 | .show = zoneinfo_show, | ||
549 | }; | ||
550 | |||
551 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | ||
552 | { | ||
553 | unsigned long *v; | ||
554 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
555 | unsigned long *e; | ||
556 | #endif | ||
557 | int i; | ||
558 | |||
559 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
560 | return NULL; | ||
561 | |||
562 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
563 | v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) | ||
564 | + sizeof(struct vm_event_state), GFP_KERNEL); | ||
565 | #else | ||
566 | v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), | ||
567 | GFP_KERNEL); | ||
568 | #endif | ||
569 | m->private = v; | ||
570 | if (!v) | ||
571 | return ERR_PTR(-ENOMEM); | ||
572 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
573 | v[i] = global_page_state(i); | ||
574 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
575 | e = v + NR_VM_ZONE_STAT_ITEMS; | ||
576 | all_vm_events(e); | ||
577 | e[PGPGIN] /= 2; /* sectors -> kbytes */ | ||
578 | e[PGPGOUT] /= 2; | ||
579 | #endif | ||
580 | return v + *pos; | ||
581 | } | ||
582 | |||
583 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | ||
584 | { | ||
585 | (*pos)++; | ||
586 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
587 | return NULL; | ||
588 | return (unsigned long *)m->private + *pos; | ||
589 | } | ||
590 | |||
591 | static int vmstat_show(struct seq_file *m, void *arg) | ||
592 | { | ||
593 | unsigned long *l = arg; | ||
594 | unsigned long off = l - (unsigned long *)m->private; | ||
595 | |||
596 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | ||
597 | return 0; | ||
598 | } | ||
599 | |||
600 | static void vmstat_stop(struct seq_file *m, void *arg) | ||
601 | { | ||
602 | kfree(m->private); | ||
603 | m->private = NULL; | ||
604 | } | ||
605 | |||
606 | struct seq_operations vmstat_op = { | ||
607 | .start = vmstat_start, | ||
608 | .next = vmstat_next, | ||
609 | .stop = vmstat_stop, | ||
610 | .show = vmstat_show, | ||
611 | }; | ||
612 | |||
613 | #endif /* CONFIG_PROC_FS */ | ||
614 | |||
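Note: the vmstat_op seq_file above is what backs /proc/vmstat: vmstat_start() snapshots the zoned counters (and, with CONFIG_VM_EVENT_COUNTERS, the accumulated events), and vmstat_show() prints one "name value" line per entry of vmstat_text[]. A trivial userspace reader, just to show the output shape, might look like this sketch:

	#include <stdio.h>

	/* Print every "name value" line emitted by vmstat_show(). */
	int main(void)
	{
		char name[64];
		unsigned long value;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("/proc/vmstat");
			return 1;
		}
		while (fscanf(f, "%63s %lu", name, &value) == 2)
			printf("%-20s %lu\n", name, value);
		fclose(f);
		return 0;
	}
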